Diffstat (limited to 'fs')
-rw-r--r--   fs/bcachefs/alloc_background.c          3
-rw-r--r--   fs/bcachefs/bcachefs.h                  1
-rw-r--r--   fs/bcachefs/bcachefs_format.h           3
-rw-r--r--   fs/bcachefs/btree_io.c                 11
-rw-r--r--   fs/bcachefs/buckets.c                  16
-rw-r--r--   fs/bcachefs/data_update.c               8
-rw-r--r--   fs/bcachefs/disk_accounting.c         185
-rw-r--r--   fs/bcachefs/disk_accounting_format.h   10
-rw-r--r--   fs/bcachefs/error.c                     4
-rw-r--r--   fs/bcachefs/inode.c                    30
-rw-r--r--   fs/bcachefs/inode.h                     2
-rw-r--r--   fs/bcachefs/io_misc.c                  12
-rw-r--r--   fs/bcachefs/io_write.c                 50
-rw-r--r--   fs/bcachefs/io_write.h                  2
-rw-r--r--   fs/bcachefs/lru.c                      45
-rw-r--r--   fs/bcachefs/lru.h                       5
-rw-r--r--   fs/bcachefs/move.c                    162
-rw-r--r--   fs/bcachefs/move.h                     30
-rw-r--r--   fs/bcachefs/opts.c                      2
-rw-r--r--   fs/bcachefs/opts.h                      2
-rw-r--r--   fs/bcachefs/progress.c                 39
-rw-r--r--   fs/bcachefs/progress.h                 12
-rw-r--r--   fs/bcachefs/rebalance.c               213
-rw-r--r--   fs/bcachefs/rebalance.h                50
-rw-r--r--   fs/bcachefs/recovery.c                 14
-rw-r--r--   fs/bcachefs/reflink.c                  16
-rw-r--r--   fs/bcachefs/sb-downgrade.c             11
-rw-r--r--   fs/bcachefs/super.c                     9
28 files changed, 483 insertions, 464 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 22e689436316..cab4d6798dd7 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -2384,8 +2384,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
 	 * We clear the LRU and need_discard btrees first so that we don't race
 	 * with bch2_do_invalidates() and bch2_do_discards()
 	 */
-	ret =   bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
-					BTREE_TRIGGER_norun, NULL) ?:
+	ret =   bch2_dev_remove_lrus(c, ca) ?:
 		bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
 					BTREE_TRIGGER_norun, NULL) ?:
 		bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 6f25e2687cd2..553031a3b06a 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -675,6 +675,7 @@ struct bch_dev {
 	x(error)			\
 	x(topology_error)		\
 	x(errors_fixed)			\
+	x(errors_fixed_silent)		\
 	x(errors_not_fixed)		\
 	x(no_invalid_checks)		\
 	x(discard_mount_opt_set)	\
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 76a2ae7f8d2d..0839397105a9 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -706,8 +706,7 @@ struct bch_sb_field_ext {
 	x(fast_device_removal,		BCH_VERSION(1, 27))	\
 	x(inode_has_case_insensitive,	BCH_VERSION(1, 28))	\
 	x(extent_snapshot_whiteouts,	BCH_VERSION(1, 29))	\
-	x(31bit_dirent_offset,		BCH_VERSION(1, 30))	\
-	x(btree_node_accounting,	BCH_VERSION(1, 31))
+	x(31bit_dirent_offset,		BCH_VERSION(1, 30))

 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 34ec1a90980d..52d21259ed6f 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -27,10 +27,15 @@
 #include <linux/moduleparam.h>
 #include <linux/sched/mm.h>

+static __maybe_unused unsigned bch2_btree_read_corrupt_ratio;
+static __maybe_unused int bch2_btree_read_corrupt_device;
+
 #ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_btree_read_corrupt_ratio;
 module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
 MODULE_PARM_DESC(btree_read_corrupt_ratio, "");
+
+module_param_named(btree_read_corrupt_device, bch2_btree_read_corrupt_device, int, 0644);
+MODULE_PARM_DESC(btree_read_corrupt_device, "");
 #endif

 static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
@@ -1438,7 +1443,9 @@ start:
 		memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
 		bio->bi_iter.bi_size = btree_buf_bytes(b);

-		bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
+		if (bch2_btree_read_corrupt_device == rb->pick.ptr.dev ||
+		    bch2_btree_read_corrupt_device < 0)
+			bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);

 		ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
 		if (ret != -BCH_ERR_btree_node_read_err_want_retry &&
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 99e928f77999..021f5cb7998d 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -749,7 +749,6 @@ static int __trigger_extent(struct btree_trans *trans,
 			    enum btree_iter_update_trigger_flags flags)
 {
 	bool gc = flags & BTREE_TRIGGER_gc;
-	bool insert = !(flags & BTREE_TRIGGER_overwrite);
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
@@ -803,7 +802,7 @@ static int __trigger_extent(struct btree_trans *trans,

 			if (cur_compression_type &&
 			    cur_compression_type != p.crc.compression_type) {
-				if (!insert)
+				if (flags & BTREE_TRIGGER_overwrite)
 					bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));

 				ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
@@ -836,7 +835,7 @@ static int __trigger_extent(struct btree_trans *trans,
 	}

 	if (cur_compression_type) {
-		if (!insert)
+		if (flags & BTREE_TRIGGER_overwrite)
 			bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));

 		ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
@@ -846,17 +845,12 @@ static int __trigger_extent(struct btree_trans *trans,
 	}

 	if (level) {
-		const bool leaf_node = level == 1;
-		s64 v[3] = {
-			replicas_sectors,
-			insert ? 1 : -1,
-			!leaf_node ? (insert ? 1 : -1) : 0,
-		};
-
-		ret = bch2_disk_accounting_mod2(trans, gc, v, btree, btree_id);
+		ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
 		if (ret)
 			return ret;
 	} else {
+		bool insert = !(flags & BTREE_TRIGGER_overwrite);
+
 		s64 v[3] = {
 			insert ? 1 : -1,
 			insert ? k.k->size : -((s64) k.k->size),
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 5d2f536986c8..7a0da6cdf78c 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -11,6 +11,7 @@
 #include "ec.h"
 #include "error.h"
 #include "extents.h"
+#include "inode.h"
 #include "io_write.h"
 #include "keylist.h"
 #include "move.h"
@@ -428,13 +429,18 @@ restart_drop_extra_replicas:
 		goto out;
 	}

+	struct bch_inode_opts opts;
+
 	ret =   bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
 		bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
 		bch2_insert_snapshot_whiteouts(trans, m->btree_id,
 					       k.k->p, bkey_start_pos(&insert->k)) ?:
 		bch2_insert_snapshot_whiteouts(trans, m->btree_id,
 					       k.k->p, insert->k.p) ?:
-		bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
+		bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?:
+		bch2_bkey_set_needs_rebalance(c, &opts, insert,
+					      SET_NEEDS_REBALANCE_foreground,
+					      m->op.opts.change_cookie) ?:
 		bch2_trans_update(trans, &iter, insert,
 				  BTREE_UPDATE_internal_snapshot_node);
 	if (ret)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 831b4c10b856..a99f821c6a1c 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -883,113 +883,118 @@ int bch2_accounting_read(struct bch_fs *c)
 			*dst++ = *i;
 	keys->gap = keys->nr = dst - keys->data;

-	guard(percpu_write)(&c->mark_lock);
-
-	darray_for_each_reverse(acc->k, i) {
-		struct disk_accounting_pos acc_k;
-		bpos_to_disk_accounting_pos(&acc_k, i->pos);
-
-		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-		memset(v, 0, sizeof(v));
-
-		for (unsigned j = 0; j < i->nr_counters; j++)
-			v[j] = percpu_u64_get(i->v[0] + j);
+	CLASS(printbuf, underflow_err)();

-		/*
-		 * If the entry counters are zeroed, it should be treated as
-		 * nonexistent - it might point to an invalid device.
-		 *
-		 * Remove it, so that if it's re-added it gets re-marked in the
-		 * superblock:
-		 */
-		ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
-			? -BCH_ERR_remove_disk_accounting_entry
-			: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
-
-		if (ret == -BCH_ERR_remove_disk_accounting_entry) {
-			free_percpu(i->v[0]);
-			free_percpu(i->v[1]);
-			darray_remove_item(&acc->k, i);
-			ret = 0;
-			continue;
-		}
+	scoped_guard(percpu_write, &c->mark_lock) {
+		darray_for_each_reverse(acc->k, i) {
+			struct disk_accounting_pos acc_k;
+			bpos_to_disk_accounting_pos(&acc_k, i->pos);

-		if (ret)
-			return ret;
-	}
+			u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+			memset(v, 0, sizeof(v));

-	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-			accounting_pos_cmp, NULL);
+			for (unsigned j = 0; j < i->nr_counters; j++)
+				v[j] = percpu_u64_get(i->v[0] + j);

-	for (unsigned i = 0; i < acc->k.nr; i++) {
-		struct disk_accounting_pos k;
-		bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
+			/*
+			 * If the entry counters are zeroed, it should be treated as
+			 * nonexistent - it might point to an invalid device.
+			 *
+			 * Remove it, so that if it's re-added it gets re-marked in the
+			 * superblock:
+			 */
+			ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+				? -BCH_ERR_remove_disk_accounting_entry
+				: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
+
+			if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+				free_percpu(i->v[0]);
+				free_percpu(i->v[1]);
+				darray_remove_item(&acc->k, i);
+				ret = 0;
+				continue;
+			}

-		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-		bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+			if (ret)
+				return ret;
+		}

-		/*
-		 * Check for underflow, schedule check_allocations
-		 * necessary:
-		 *
-		 * XXX - see if we can factor this out to run on a bkey
-		 * so we can check everything lazily, right now we don't
-		 * check the non in-mem counters at all
-		 */
-		bool underflow = false;
-		for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
-			underflow |= (s64) v[j] < 0;
+		eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+				accounting_pos_cmp, NULL);

-		if (underflow) {
-			CLASS(printbuf, buf)();
-			bch2_log_msg_start(c, &buf);
+		for (unsigned i = 0; i < acc->k.nr; i++) {
+			struct disk_accounting_pos k;
+			bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);

-			prt_printf(&buf, "Accounting underflow for\n");
-			bch2_accounting_key_to_text(&buf, &k);
+			u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+			bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);

+			/*
+			 * Check for underflow, schedule check_allocations
+			 * necessary:
+			 *
+			 * XXX - see if we can factor this out to run on a bkey
+			 * so we can check everything lazily, right now we don't
+			 * check the non in-mem counters at all
+			 */
+			bool underflow = false;
 			for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
-				prt_printf(&buf, " %lli", v[j]);
-
-			bool print = bch2_count_fsck_err(c, accounting_key_underflow, &buf);
-			unsigned pos = buf.pos;
-			ret = bch2_run_explicit_recovery_pass(c, &buf,
-					BCH_RECOVERY_PASS_check_allocations, 0);
-			print |= buf.pos != pos;
+				underflow |= (s64) v[j] < 0;

-			if (print)
-				bch2_print_str(c, KERN_ERR, buf.buf);
-			if (ret)
-				return ret;
-		}
+			if (underflow) {
+				if (!underflow_err.pos) {
+					bch2_log_msg_start(c, &underflow_err);
+					prt_printf(&underflow_err, "Accounting underflow for\n");
+				}
+				bch2_accounting_key_to_text(&underflow_err, &k);

-		guard(preempt)();
-		struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
+				for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
+					prt_printf(&underflow_err, " %lli", v[j]);
+				prt_newline(&underflow_err);
+			}

-		switch (k.type) {
-		case BCH_DISK_ACCOUNTING_persistent_reserved:
-			usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
-			break;
-		case BCH_DISK_ACCOUNTING_replicas:
-			fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
-			break;
-		case BCH_DISK_ACCOUNTING_dev_data_type: {
-			guard(rcu)();
-			struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
-			if (ca) {
-				struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
-				percpu_u64_set(&d->buckets, v[0]);
-				percpu_u64_set(&d->sectors, v[1]);
-				percpu_u64_set(&d->fragmented, v[2]);
-
-				if (k.dev_data_type.data_type == BCH_DATA_sb ||
-				    k.dev_data_type.data_type == BCH_DATA_journal)
-					usage->hidden += v[0] * ca->mi.bucket_size;
+			guard(preempt)();
+			struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
+
+			switch (k.type) {
+			case BCH_DISK_ACCOUNTING_persistent_reserved:
+				usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
+				break;
+			case BCH_DISK_ACCOUNTING_replicas:
+				fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
+				break;
+			case BCH_DISK_ACCOUNTING_dev_data_type: {
+				guard(rcu)();
+				struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
+				if (ca) {
+					struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
+					percpu_u64_set(&d->buckets, v[0]);
+					percpu_u64_set(&d->sectors, v[1]);
+					percpu_u64_set(&d->fragmented, v[2]);
+
+					if (k.dev_data_type.data_type == BCH_DATA_sb ||
+					    k.dev_data_type.data_type == BCH_DATA_journal)
+						usage->hidden += v[0] * ca->mi.bucket_size;
+				}
+				break;
+			}
 			}
-			break;
-		}
 		}
 	}

+	if (underflow_err.pos) {
+		bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
+		unsigned pos = underflow_err.pos;
+		ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
+					BCH_RECOVERY_PASS_check_allocations, 0);
+		print |= underflow_err.pos != pos;
+
+		if (print)
+			bch2_print_str(c, KERN_ERR, underflow_err.buf);
+		if (ret)
+			return ret;
+	}
+
 	return ret;
 }
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index 730a17ea4243..8269af1dbe2a 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -108,7 +108,7 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
 	x(dev_data_type,	3,	3)	\
 	x(compression,		4,	3)	\
 	x(snapshot,		5,	1)	\
-	x(btree,		6,	3)	\
+	x(btree,		6,	1)	\
 	x(rebalance_work,	7,	1)	\
 	x(inum,			8,	3)

@@ -174,14 +174,6 @@ struct bch_acct_snapshot {
 	__u32			id;
 } __packed;

-/*
- * Metadata accounting per btree id:
- * [
- *   total btree disk usage in sectors
- *   total number of btree nodes
- *   number of non-leaf btree nodes
- * ]
- */
 struct bch_acct_btree {
 	__u32			id;
 } __packed;
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 9e69263eb796..a16f55d98d97 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -468,10 +468,10 @@ int __bch2_fsck_err(struct bch_fs *c,

 	if ((flags & FSCK_ERR_SILENT) ||
 	    test_bit(err, c->sb.errors_silent)) {
-		ret = flags & FSCK_CAN_FIX
+		set_bit(BCH_FS_errors_fixed_silent, &c->flags);
+		return flags & FSCK_CAN_FIX
 			? bch_err_throw(c, fsck_fix)
 			: bch_err_throw(c, fsck_ignore);
-		goto err;
 	}

 	printbuf_indent_add_nextline(out, 2);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 193c8ec0bdcd..655ed90b2a39 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -369,9 +369,9 @@ err:
 }

 int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans,
-				   u64 inode_nr, u32 snapshot,
-				   struct bch_inode_unpacked *inode,
-				   unsigned flags)
+				     u64 inode_nr, u32 snapshot,
+				     struct bch_inode_unpacked *inode,
+				     unsigned flags)
 {
 	CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, inode_nr, snapshot), flags);
 	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
@@ -1238,20 +1238,30 @@ void bch2_inode_opts_get_inode(struct bch_fs *c,
 	BCH_INODE_OPTS()
 #undef x

-	ret->opt_change_cookie = atomic_read(&c->opt_change_cookie);
+	ret->change_cookie = atomic_read(&c->opt_change_cookie);
 	bch2_io_opts_fixups(ret);
 }

-int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_inode_opts *opts)
+int bch2_inum_snapshot_opts_get(struct btree_trans *trans,
+				u64 inum, u32 snapshot,
+				struct bch_inode_opts *opts)
 {
-	struct bch_inode_unpacked inode;
-	int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+	if (inum) {
+		struct bch_inode_unpacked inode;
+		int ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
+		if (ret)
+			return ret;

-	if (ret)
-		return ret;
+		bch2_inode_opts_get_inode(trans->c, &inode, opts);
+	} else {
+		/*
+		 * data_update_index_update may call us for reflink btree extent
+		 * updates, inum will be 0
+		 */

-	bch2_inode_opts_get_inode(trans->c, &inode, opts);
+		bch2_inode_opts_get(trans->c, opts);
+	}
 	return 0;
 }
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 12e0a104c196..63b7088811fb 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -290,7 +290,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
 struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
 void bch2_inode_opts_get_inode(struct bch_fs *, struct bch_inode_unpacked *,
 			       struct bch_inode_opts *);
-int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_inode_opts *);
+int bch2_inum_snapshot_opts_get(struct btree_trans *, u64, u32, struct bch_inode_opts *);
 int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
 			    struct bch_inode_unpacked *, unsigned);
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 5e03574059e0..04eb5ecd102b 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -109,7 +109,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
 	}

 	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
-				 0, i_sectors_delta, true);
+				 0, i_sectors_delta, true, 0);
 err:
 	if (!ret && sectors_allocated)
 		bch2_increment_clock(c, sectors_allocated, WRITE);
@@ -211,7 +211,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
 		bch2_cut_back(end_pos, &delete);

 		ret = bch2_extent_update(trans, inum, iter, &delete,
-					 &disk_res, 0, i_sectors_delta, false);
+					 &disk_res, 0, i_sectors_delta, false, 0);
 		bch2_disk_reservation_put(c, &disk_res);
 	}

@@ -373,7 +373,6 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
 	struct btree_iter iter;
 	struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
 	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
-	struct bch_inode_opts opts;
 	u64 dst_offset = le64_to_cpu(op->v.dst_offset);
 	u64 src_offset = le64_to_cpu(op->v.src_offset);
 	s64 shift = dst_offset - src_offset;
@@ -384,10 +383,6 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
 	bool warn_errors = i_sectors_delta != NULL;
 	int ret = 0;

-	ret = bch2_inum_opts_get(trans, inum, &opts);
-	if (ret)
-		return ret;
-
 	/*
 	 * check for missing subvolume before fpunch, as in resume we don't want
 	 * it to be a fatal error
@@ -476,8 +471,7 @@ case LOGGED_OP_FINSERT_shift_extents:
 		op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);

-		ret =   bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
-			bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+		ret =   bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
 			bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
 			bch2_logged_op_update(trans, &op->k_i) ?:
 			bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index aed22fc7759b..6a5da02ce266 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -205,7 +205,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
 static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
 						    struct btree_iter *extent_iter,
 						    u64 new_i_size,
-						    s64 i_sectors_delta)
+						    s64 i_sectors_delta,
+						    struct bch_inode_unpacked *inode_u)
 {
 	/*
 	 * Crazy performance optimization:
@@ -227,7 +228,13 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
 			     BTREE_ITER_intent|
 			     BTREE_ITER_cached);
 	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
-	int ret = bkey_err(k);
+
+	/*
+	 * XXX: we currently need to unpack the inode on every write because we
+	 * need the current io_opts, for transactional consistency - inode_v4?
+	 */
+	int ret = bkey_err(k) ?:
+		bch2_inode_unpack(k, inode_u);
 	if (unlikely(ret))
 		return ret;

@@ -303,8 +310,10 @@ int bch2_extent_update(struct btree_trans *trans,
 		       struct disk_reservation *disk_res,
 		       u64 new_i_size,
 		       s64 *i_sectors_delta_total,
-		       bool check_enospc)
+		       bool check_enospc,
+		       u32 change_cookie)
 {
+	struct bch_fs *c = trans->c;
 	struct bpos next_pos;
 	bool usage_increasing;
 	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
@@ -335,7 +344,7 @@ int bch2_extent_update(struct btree_trans *trans,

 	if (disk_res &&
 	    disk_sectors_delta > (s64) disk_res->sectors) {
-		ret = bch2_disk_reservation_add(trans->c, disk_res,
+		ret = bch2_disk_reservation_add(c, disk_res,
 					disk_sectors_delta - disk_res->sectors,
 					!check_enospc || !usage_increasing
 					? BCH_DISK_RESERVATION_NOFAIL : 0);
@@ -349,9 +358,16 @@ int bch2_extent_update(struct btree_trans *trans,
 	 * aren't changing - for fsync to work properly; fsync relies on
 	 * inode->bi_journal_seq which is updated by the trigger code:
 	 */
+	struct bch_inode_unpacked inode;
+	struct bch_inode_opts opts;
+
 	ret =   bch2_extent_update_i_size_sectors(trans, iter,
 						  min(k->k.p.offset << 9, new_i_size),
-						  i_sectors_delta) ?:
+						  i_sectors_delta, &inode) ?:
+		(bch2_inode_opts_get_inode(c, &inode, &opts),
+		 bch2_bkey_set_needs_rebalance(c, &opts, k,
+					       SET_NEEDS_REBALANCE_foreground,
+					       change_cookie)) ?:
 		bch2_trans_update(trans, iter, k, 0) ?:
 		bch2_trans_commit(trans, disk_res, NULL,
 				BCH_TRANS_COMMIT_no_check_rw|
@@ -402,7 +418,8 @@ static int bch2_write_index_default(struct bch_write_op *op)
 		ret = bch2_extent_update(trans, inum, &iter, sk.k,
 					 &op->res,
 					 op->new_i_size, &op->i_sectors_delta,
-					 op->flags & BCH_WRITE_check_enospc);
+					 op->flags & BCH_WRITE_check_enospc,
+					 op->opts.change_cookie);
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 			continue;

@@ -792,10 +809,6 @@ static void init_append_extent(struct bch_write_op *op,

 	bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
 				       op->flags & BCH_WRITE_cached);
-
-	if (!(op->flags & BCH_WRITE_move))
-		bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
-
 	bch2_keylist_push(&op->insert_keys);
 }

@@ -1225,6 +1238,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 		return 0;
 	}

+	struct bch_fs *c = trans->c;
 	struct bkey_i *new = bch2_trans_kmalloc_nomemzero(trans,
 				bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance));
 	int ret = PTR_ERR_OR_ZERO(new);
@@ -1239,8 +1253,6 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 	bkey_for_each_ptr(ptrs, ptr)
 		ptr->unwritten = 0;

-	bch2_bkey_set_needs_rebalance(op->c, &op->opts, new);
-
 	/*
 	 * Note that we're not calling bch2_subvol_get_snapshot() in this path -
 	 * that was done when we kicked off the write, and here it's important
@@ -1248,8 +1260,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 	 * since been created. The write is still outstanding, so we're ok
 	 * w.r.t. snapshot atomicity:
 	 */
+
+	/*
+	 * For transactional consistency, set_needs_rebalance() has to be called
+	 * with the io_opts from the btree in the same transaction:
+	 */
+	struct bch_inode_unpacked inode;
+	struct bch_inode_opts opts;
+
 	return  bch2_extent_update_i_size_sectors(trans, iter,
-						  min(new->k.p.offset << 9, new_i_size), 0) ?:
+						  min(new->k.p.offset << 9, new_i_size), 0, &inode) ?:
+		(bch2_inode_opts_get_inode(c, &inode, &opts),
+		 bch2_bkey_set_needs_rebalance(c, &opts, new,
+					       SET_NEEDS_REBALANCE_foreground,
+					       op->opts.change_cookie)) ?:
 		bch2_trans_update(trans, iter, new,
 				  BTREE_UPDATE_internal_snapshot_node);
 }
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index 6c05ba6e15d6..692529bf401d 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -28,7 +28,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
 			       struct bkey_i *, bool *, s64 *, s64 *);
 int bch2_extent_update(struct btree_trans *, subvol_inum,
 		       struct btree_iter *, struct bkey_i *,
-		       struct disk_reservation *, u64, s64 *, bool);
+		       struct disk_reservation *, u64, s64 *, bool, u32);

 static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
 				      struct bch_inode_opts opts)
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index b9c0834498dd..c533b60706bf 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -51,25 +51,17 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
 		: 0;
 }

-int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+static int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
 {
-	return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
-}
-
-int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
-	return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
+	return __bch2_lru_set(trans, lru_id, dev_bucket, time, true);
 }

 int __bch2_lru_change(struct btree_trans *trans,
 		      u16 lru_id, u64 dev_bucket,
 		      u64 old_time, u64 new_time)
 {
-	if (old_time == new_time)
-		return 0;
-
-	return  bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
-		bch2_lru_set(trans, lru_id, dev_bucket, new_time);
+	return  __bch2_lru_set(trans, lru_id, dev_bucket, old_time, false) ?:
+		__bch2_lru_set(trans, lru_id, dev_bucket, new_time, true);
 }

 static const char * const bch2_lru_types[] = {
@@ -87,7 +79,6 @@ int bch2_lru_check_set(struct btree_trans *trans,
 		       struct bkey_buf *last_flushed)
 {
 	struct bch_fs *c = trans->c;
-	CLASS(printbuf, buf)();
 	CLASS(btree_iter, lru_iter)(trans, BTREE_ID_lru,
 				    lru_pos(lru_id, dev_bucket, time), 0);
 	struct bkey_s_c lru_k = bch2_btree_iter_peek_slot(&lru_iter);
 	int ret = bkey_err(lru_k);
@@ -99,10 +90,13 @@ int bch2_lru_check_set(struct btree_trans *trans,
 	if (ret)
 		return ret;

-	if (fsck_err(trans, alloc_key_to_missing_lru_entry,
-		     "missing %s lru entry\n%s",
-		     bch2_lru_types[lru_type(lru_k)],
-		     (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
+	CLASS(printbuf, buf)();
+	prt_printf(&buf, "missing %s lru entry at pos ", bch2_lru_types[lru_type(lru_k)]);
+	bch2_bpos_to_text(&buf, lru_iter.pos);
+	prt_newline(&buf);
+	bch2_bkey_val_to_text(&buf, c, referring_k);
+
+	if (fsck_err(trans, alloc_key_to_missing_lru_entry, "%s", buf.buf)) {
 		ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
 		if (ret)
 			return ret;
@@ -127,6 +121,23 @@ static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
 	}
 }

+int bch2_dev_remove_lrus(struct bch_fs *c, struct bch_dev *ca)
+{
+	CLASS(btree_trans, trans)(c);
+	int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+		for_each_btree_key(trans, iter,
+				   BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, ({
+			struct bbpos bp = lru_pos_to_bp(k);
+
+			bp.btree == BTREE_ID_alloc && bp.pos.inode == ca->dev_idx
+				? (bch2_btree_delete_at(trans, &iter, 0) ?:
+				   bch2_trans_commit(trans, NULL, NULL, 0))
+				: 0;
+		}));
+	bch_err_fn(c, ret);
+	return ret;
+}
+
 static u64 bkey_lru_type_idx(struct bch_fs *c,
 			     enum bch_lru_type type,
 			     struct bkey_s_c k)
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index 6f1e0a7b5db5..d5a2620f2507 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -59,8 +59,6 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
 	.min_val_size	= 8,		\
 })

-int bch2_lru_del(struct btree_trans *, u16, u64, u64);
-int bch2_lru_set(struct btree_trans *, u16, u64, u64);
 int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);

 static inline int bch2_lru_change(struct btree_trans *trans,
@@ -72,9 +70,10 @@ static inline int bch2_lru_change(struct btree_trans *trans,
 		: 0;
 }

+int bch2_dev_remove_lrus(struct bch_fs *, struct bch_dev *);
+
 struct bkey_buf;
 int bch2_lru_check_set(struct btree_trans *, u16, u64, u64,
 		       struct bkey_s_c, struct bkey_buf *);
-
 int bch2_check_lrus(struct bch_fs *);

 #endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 56e9ba4ed6a8..9a440d3f7180 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -451,93 +451,6 @@ err:
 	return ret;
 }

-struct bch_inode_opts *bch2_move_get_io_opts(struct btree_trans *trans,
-			  struct per_snapshot_io_opts *io_opts,
-			  struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
-			  struct btree_iter *extent_iter,
-			  struct bkey_s_c extent_k)
-{
-	struct bch_fs *c = trans->c;
-	u32 restart_count = trans->restart_count;
-	struct bch_inode_opts *opts_ret = &io_opts->fs_io_opts;
-	int ret = 0;
-
-	if (btree_iter_path(trans, extent_iter)->level)
-		return opts_ret;
-
-	if (extent_k.k->type == KEY_TYPE_reflink_v)
-		goto out;
-
-	if (io_opts->cur_inum != extent_pos.inode) {
-		io_opts->d.nr = 0;
-
-		ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
-					 BTREE_ITER_all_snapshots, k, ({
-			if (k.k->p.offset != extent_pos.inode)
-				break;
-
-			if (!bkey_is_inode(k.k))
-				continue;
-
-			struct bch_inode_unpacked inode;
-			_ret3 = bch2_inode_unpack(k, &inode);
-			if (_ret3)
-				break;
-
-			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
-			bch2_inode_opts_get_inode(trans->c, &inode, &e.io_opts);
-
-			darray_push(&io_opts->d, e);
-		}));
-		io_opts->cur_inum = extent_pos.inode;
-	}
-
-	ret = ret ?: trans_was_restarted(trans, restart_count);
-	if (ret)
-		return ERR_PTR(ret);
-
-	if (extent_k.k->p.snapshot)
-		darray_for_each(io_opts->d, i)
-			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
-				opts_ret = &i->io_opts;
-				break;
-			}
-out:
-	ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
-	if (ret)
-		return ERR_PTR(ret);
-	return opts_ret;
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *trans,
-			      struct bch_inode_opts *io_opts,
-			      struct btree_iter *extent_iter,
-			      struct bkey_s_c extent_k)
-{
-	struct bch_fs *c = trans->c;
-
-	bch2_inode_opts_get(c, io_opts);
-
-	/* reflink btree? */
-	if (extent_k.k->p.inode) {
-		CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes,
-			       SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
-			       BTREE_ITER_cached);
-		struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter);
-		int ret = bkey_err(inode_k);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			return ret;
-
-		if (!ret && bkey_is_inode(inode_k.k)) {
-			struct bch_inode_unpacked inode;
-			bch2_inode_unpack(inode_k, &inode);
-			bch2_inode_opts_get_inode(c, &inode, io_opts);
-		}
-	}
-
-	return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
-}
-
 int bch2_move_ratelimit(struct moving_context *ctxt)
 {
 	struct bch_fs *c = ctxt->trans->c;
@@ -582,37 +495,6 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
 	return 0;
 }

-/*
- * Move requires non extents iterators, and there's also no need for it to
- * signal indirect_extent_missing_error:
- */
-static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
-							    struct btree_iter *iter,
-							    struct bkey_s_c_reflink_p p)
-{
-	if (unlikely(REFLINK_P_ERROR(p.v)))
-		return bkey_s_c_null;
-
-	struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
-
-	bch2_trans_iter_init(trans, iter,
-			     BTREE_ID_reflink, reflink_pos,
-			     BTREE_ITER_not_extents);
-
-	struct bkey_s_c k = bch2_btree_iter_peek(iter);
-	if (!k.k || bkey_err(k)) {
-		bch2_trans_iter_exit(iter);
-		return k;
-	}
-
-	if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
-		bch2_trans_iter_exit(iter);
-		return bkey_s_c_null;
-	}
-
-	return k;
-}
-
 int bch2_move_data_btree(struct moving_context *ctxt,
 			 struct bpos start,
 			 struct bpos end,
@@ -627,12 +509,6 @@ int bch2_move_data_btree(struct moving_context *ctxt,
 	struct btree_iter iter, reflink_iter = {};
 	struct bkey_s_c k;
 	struct data_update_opts data_opts;
-	/*
-	 * If we're moving a single file, also process reflinked data it points
-	 * to (this includes propagating changed io_opts from the inode to the
-	 * extent):
-	 */
-	bool walk_indirect = start.inode == end.inode;
 	int ret = 0, ret2;

 	per_snapshot_io_opts_init(&snapshot_io_opts, c);
@@ -697,8 +573,6 @@ root_err:
 		bch2_ratelimit_reset(ctxt->rate);

 	while (!bch2_move_ratelimit(ctxt)) {
-		struct btree_iter *extent_iter = &iter;
-
 		bch2_trans_begin(trans);

 		k = bch2_btree_iter_peek(&iter);
@@ -717,41 +591,18 @@ root_err:
 		if (ctxt->stats)
 			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

-		if (walk_indirect &&
-		    k.k->type == KEY_TYPE_reflink_p &&
-		    REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
-			struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
-			bch2_trans_iter_exit(&reflink_iter);
-			k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
-			ret = bkey_err(k);
-			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				continue;
-			if (ret)
-				break;
-
-			if (!k.k)
-				goto next_nondata;
-
-			/*
-			 * XXX: reflink pointers may point to multiple indirect
-			 * extents, so don't advance past the entire reflink
-			 * pointer - need to fixup iter->k
-			 */
-			extent_iter = &reflink_iter;
-		}
-
 		if (!bkey_extent_is_direct_data(k.k))
 			goto next_nondata;

-		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
-						iter.pos, extent_iter, k);
+		io_opts = bch2_extent_get_apply_io_opts(trans, &snapshot_io_opts,
+							iter.pos, &iter, k,
+							SET_NEEDS_REBALANCE_other);
 		ret = PTR_ERR_OR_ZERO(io_opts);
 		if (ret)
 			continue;

 		memset(&data_opts, 0, sizeof(data_opts));
-		if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
+		if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
 			goto next;

 		/*
@@ -762,7 +613,7 @@ root_err:
 		k = bkey_i_to_s_c(sk.k);

 		if (!level)
-			ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
+			ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
 		else if (!data_opts.scrub)
 			ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
 							   k.k->p, data_opts.target, 0);
@@ -944,7 +795,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 			goto next;

 		if (!bp.v->level) {
-			ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
+			ret = bch2_extent_get_apply_io_opts_one(trans, &io_opts, &iter, k,
+							SET_NEEDS_REBALANCE_other);
 			if (ret) {
 				bch2_trans_iter_exit(&iter);
 				continue;
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 18021d2c51d0..754b0ad45950 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -87,32 +87,6 @@ void bch2_moving_ctxt_flush_all(struct moving_context *);
 void bch2_move_ctxt_wait_for_io(struct moving_context *);
 int bch2_move_ratelimit(struct moving_context *);

-/* Inodes in different snapshots may have different IO options: */
-struct snapshot_io_opts_entry {
-	u32			snapshot;
-	struct bch_inode_opts	io_opts;
-};
-
-struct per_snapshot_io_opts {
-	u64			cur_inum;
-	struct bch_inode_opts	fs_io_opts;
-	DARRAY(struct snapshot_io_opts_entry) d;
-};
-
-static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
-{
-	memset(io_opts, 0, sizeof(*io_opts));
-	bch2_inode_opts_get(c, &io_opts->fs_io_opts);
-}
-
-static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
-{
-	darray_exit(&io_opts->d);
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_inode_opts *,
-			      struct btree_iter *, struct bkey_s_c);
-
 int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);

 int bch2_move_extent(struct moving_context *,
@@ -122,10 +96,6 @@ int bch2_move_extent(struct moving_context *,
 		     struct bch_inode_opts,
 		     struct data_update_opts);

-struct bch_inode_opts *bch2_move_get_io_opts(struct btree_trans *,
-			struct per_snapshot_io_opts *, struct bpos,
-			struct btree_iter *, struct bkey_s_c);
-
 int bch2_move_data_btree(struct moving_context *,
 			 struct bpos, struct bpos,
 			 move_pred_fn, void *,
 			 enum btree_id, unsigned);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index c4faa66b55ce..122bc98e4cbb 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -803,7 +803,7 @@ void bch2_inode_opts_get(struct bch_fs *c, struct bch_inode_opts *ret)
 	BCH_INODE_OPTS()
 #undef x

-	ret->opt_change_cookie = atomic_read(&c->opt_change_cookie);
+	ret->change_cookie = atomic_read(&c->opt_change_cookie);
 	bch2_io_opts_fixups(ret);
 }
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index a5779f8943cf..22cf109fb9c9 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -678,7 +678,7 @@ struct bch_inode_opts {
 	BCH_INODE_OPTS()
 #undef x

-	u32		opt_change_cookie;
+	u32		change_cookie;
 };

 static inline void bch2_io_opts_fixups(struct bch_inode_opts *opts)
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
index 7cc16490ffa9..541ee951d1c9 100644
--- a/fs/bcachefs/progress.c
+++ b/fs/bcachefs/progress.c
@@ -4,21 +4,14 @@
 #include "disk_accounting.h"
 #include "progress.h"

-void bch2_progress_init_inner(struct progress_indicator_state *s,
-			      struct bch_fs *c,
-			      u64 leaf_btree_id_mask,
-			      u64 inner_btree_id_mask)
+void bch2_progress_init(struct progress_indicator_state *s,
+			struct bch_fs *c,
+			u64 btree_id_mask)
 {
 	memset(s, 0, sizeof(*s));

 	s->next_print = jiffies + HZ * 10;

-	/* This is only an estimation: nodes can have different replica counts */
-	const u32 expected_node_disk_sectors =
-		READ_ONCE(c->opts.metadata_replicas) * btree_sectors(c);
-
-	const u64 btree_id_mask = leaf_btree_id_mask | inner_btree_id_mask;
-
 	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
 		if (!(btree_id_mask & BIT_ULL(i)))
 			continue;
@@ -26,29 +19,9 @@ void bch2_progress_init_inner(struct progress_indicator_state *s,
 		struct disk_accounting_pos acc;
 		disk_accounting_key_init(acc, btree, .id = i);

-		struct {
-			u64 disk_sectors;
-			u64 total_nodes;
-			u64 inner_nodes;
-		} v = {0};
-		bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc),
-					 (u64 *)&v, sizeof(v) / sizeof(u64));
-
-		/* Better to estimate as 0 than the total node count */
-		if (inner_btree_id_mask & BIT_ULL(i))
-			s->nodes_total += v.inner_nodes;
-
-		if (!(leaf_btree_id_mask & BIT_ULL(i)))
-			continue;
-
-		/*
-		 * We check for zeros to degrade gracefully when run
-		 * with un-upgraded accounting info (missing some counters).
-		 */
-		if (v.total_nodes != 0)
-			s->nodes_total += v.total_nodes - v.inner_nodes;
-		else
-			s->nodes_total += div_u64(v.disk_sectors, expected_node_disk_sectors);
+		u64 v;
+		bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+		s->nodes_total += div64_ul(v, btree_sectors(c));
 	}
 }
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
index 91f345337709..972a73087ffe 100644
--- a/fs/bcachefs/progress.h
+++ b/fs/bcachefs/progress.h
@@ -20,17 +20,7 @@ struct progress_indicator_state {
 	struct btree		*last_node;
 };

-void bch2_progress_init_inner(struct progress_indicator_state *s,
-			      struct bch_fs *c,
-			      u64 leaf_btree_id_mask,
-			      u64 inner_btree_id_mask);
-
-static inline void bch2_progress_init(struct progress_indicator_state *s,
-				      struct bch_fs *c, u64 btree_id_mask)
-{
-	bch2_progress_init_inner(s, c, btree_id_mask, 0);
-}
-
+void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
 void bch2_progress_update_iter(struct btree_trans *,
 			       struct progress_indicator_state *,
 			       struct btree_iter *,
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 901cff84aab5..fa73de7890da 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -211,7 +211,9 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_inode_
 }

 int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
-				  struct bkey_i *_k)
+				  struct bkey_i *_k,
+				  enum set_needs_rebalance_ctx ctx,
+				  u32 change_cookie)
 {
 	if (!bkey_extent_is_direct_data(&_k->k))
 		return 0;
@@ -235,10 +237,11 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
 	return 0;
 }

-int bch2_get_update_rebalance_opts(struct btree_trans *trans,
-				   struct bch_inode_opts *io_opts,
-				   struct btree_iter *iter,
-				   struct bkey_s_c k)
+static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
+					  struct bch_inode_opts *io_opts,
+					  struct btree_iter *iter,
+					  struct bkey_s_c k,
+					  enum set_needs_rebalance_ctx ctx)
 {
 	BUG_ON(iter->flags & BTREE_ITER_is_extents);
 	BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
@@ -267,10 +270,121 @@ int bch2_get_update_rebalance_opts(struct btree_trans *trans,

 	/* On successful transaction commit, @k was invalidated: */

-	return  bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
+	return  bch2_bkey_set_needs_rebalance(trans->c, io_opts, n, ctx, 0) ?:
 		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
 		bch2_trans_commit(trans, NULL, NULL, 0) ?:
-		bch_err_throw(trans->c, transaction_restart_nested);
+		bch_err_throw(trans->c, transaction_restart_commit);
+}
+
+static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans,
+			  struct per_snapshot_io_opts *io_opts,
+			  struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
+			  struct btree_iter *extent_iter,
+			  struct bkey_s_c extent_k)
+{
+	struct bch_fs *c = trans->c;
+	u32 restart_count = trans->restart_count;
+	int ret = 0;
+
+	if (btree_iter_path(trans, extent_iter)->level)
+		return &io_opts->fs_io_opts;
+
+	if (extent_k.k->type == KEY_TYPE_reflink_v)
+		return &io_opts->fs_io_opts;
+
+	if (io_opts->cur_inum != extent_pos.inode) {
+		io_opts->d.nr = 0;
+
+		ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
+					 BTREE_ITER_all_snapshots, k, ({
+			if (k.k->p.offset != extent_pos.inode)
+				break;
+
+			if (!bkey_is_inode(k.k))
+				continue;
+
+			struct bch_inode_unpacked inode;
+			_ret3 = bch2_inode_unpack(k, &inode);
+			if (_ret3)
+				break;
+
+			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+			bch2_inode_opts_get_inode(c, &inode, &e.io_opts);
+
+			darray_push(&io_opts->d, e);
+		}));
+		io_opts->cur_inum = extent_pos.inode;
+	}
+
+	ret = ret ?: trans_was_restarted(trans, restart_count);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (extent_k.k->p.snapshot)
+		darray_for_each(io_opts->d, i)
+			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+				return &i->io_opts;
+
+	return &io_opts->fs_io_opts;
+}
+
+struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans,
+			  struct per_snapshot_io_opts *snapshot_io_opts,
+			  struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
+			  struct btree_iter *extent_iter,
+			  struct bkey_s_c extent_k,
+			  enum set_needs_rebalance_ctx ctx)
+{
+	struct bch_inode_opts *opts =
+		bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k);
+	if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level)
+		return opts;
+
+	int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx);
+	return ret ? ERR_PTR(ret) : opts;
+}
+
+int bch2_extent_get_io_opts_one(struct btree_trans *trans,
+				struct bch_inode_opts *io_opts,
+				struct btree_iter *extent_iter,
+				struct bkey_s_c extent_k,
+				enum set_needs_rebalance_ctx ctx)
+{
+	struct bch_fs *c = trans->c;
+
+	bch2_inode_opts_get(c, io_opts);
+
+	/* reflink btree? */
+	if (extent_k.k->p.inode) {
+		CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes,
+			       SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+			       BTREE_ITER_cached);
+		struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter);
+		int ret = bkey_err(inode_k);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			return ret;
+
+		if (!ret && bkey_is_inode(inode_k.k)) {
+			struct bch_inode_unpacked inode;
+			bch2_inode_unpack(inode_k, &inode);
+			bch2_inode_opts_get_inode(c, &inode, io_opts);
+		}
+	}
+
+	return 0;
+}
+
+int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans,
+				      struct bch_inode_opts *io_opts,
+				      struct btree_iter *extent_iter,
+				      struct bkey_s_c extent_k,
+				      enum set_needs_rebalance_ctx ctx)
+{
+	int ret = bch2_extent_get_io_opts_one(trans, io_opts, extent_iter, extent_k, ctx);
+	if (ret || btree_iter_path(trans, extent_iter)->level)
+		return ret;
+
+	return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx);
 }

 #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
@@ -403,9 +517,10 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
 }

 static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+					     struct per_snapshot_io_opts *snapshot_io_opts,
 					     struct bpos work_pos,
 					     struct btree_iter *extent_iter,
-					     struct bch_inode_opts *io_opts,
+					     struct bch_inode_opts **opts_ret,
 					     struct data_update_opts *data_opts)
 {
 	struct bch_fs *c = trans->c;
@@ -419,13 +534,19 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 	if (bkey_err(k))
 		return k;

-	int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
+	struct bch_inode_opts *opts =
+		bch2_extent_get_apply_io_opts(trans, snapshot_io_opts,
+					      extent_iter->pos, extent_iter, k,
+					      SET_NEEDS_REBALANCE_other);
+	int ret = PTR_ERR_OR_ZERO(opts);
 	if (ret)
 		return bkey_s_c_err(ret);

+	*opts_ret = opts;
+
 	memset(data_opts, 0, sizeof(*data_opts));
-	data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
-	data_opts->target = io_opts->background_target;
+	data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k);
+	data_opts->target = opts->background_target;
 	data_opts->write_flags |= BCH_WRITE_only_specified_devs;

 	if (!data_opts->rewrite_ptrs) {
@@ -450,19 +571,19 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,

 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

-		unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
+		unsigned p = bch2_bkey_ptrs_need_compress(c, opts, k, ptrs);
 		if (p) {
 			prt_str(&buf, "compression=");
-			bch2_compression_opt_to_text(&buf, io_opts->background_compression);
+			bch2_compression_opt_to_text(&buf, opts->background_compression);
 			prt_str(&buf, " ");
 			bch2_prt_u64_base2(&buf, p);
 			prt_newline(&buf);
 		}

-		p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
+		p = bch2_bkey_ptrs_need_move(c, opts, ptrs);
 		if (p) {
 			prt_str(&buf, "move=");
-			bch2_target_to_text(&buf, c, io_opts->background_target);
+			bch2_target_to_text(&buf, c, opts->background_target);
 			prt_str(&buf, " ");
 			bch2_prt_u64_base2(&buf, p);
 			prt_newline(&buf);
@@ -477,6 +598,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,

 noinline_for_stack
 static int do_rebalance_extent(struct moving_context *ctxt,
+			       struct per_snapshot_io_opts *snapshot_io_opts,
 			       struct bpos work_pos,
 			       struct btree_iter *extent_iter)
 {
@@ -484,7 +606,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
 	struct bch_fs *c = trans->c;
 	struct bch_fs_rebalance *r = &trans->c->rebalance;
 	struct data_update_opts data_opts;
-	struct bch_inode_opts io_opts;
+	struct bch_inode_opts *io_opts;
 	struct bkey_s_c k;
 	struct bkey_buf sk;
 	int ret;
@@ -495,8 +617,8 @@ static int do_rebalance_extent(struct moving_context *ctxt,
 	bch2_bkey_buf_init(&sk);

 	ret = lockrestart_do(trans,
-		bkey_err(k = next_rebalance_extent(trans, work_pos,
-				extent_iter, &io_opts, &data_opts)));
+		bkey_err(k = next_rebalance_extent(trans, snapshot_io_opts,
+				work_pos, extent_iter, &io_opts, &data_opts)));
 	if (ret || !k.k)
 		goto out;

@@ -509,7 +631,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
 	bch2_bkey_buf_reassemble(&sk, c, k);
 	k = bkey_i_to_s_c(sk.k);

-	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
 	if (ret) {
 		if (bch2_err_matches(ret, ENOMEM)) {
 			/* memory allocation failure, wait for some IO to finish */
@@ -528,7 +650,31 @@ out:
 	return ret;
 }

+static int do_rebalance_scan_indirect(struct btree_trans *trans,
+				      struct bkey_s_c_reflink_p p,
+				      struct bch_inode_opts *opts)
+{
+	u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
+	u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
+	u32 restart_count = trans->restart_count;
+
+	int ret = for_each_btree_key(trans, iter, BTREE_ID_reflink,
+				     POS(0, idx), BTREE_ITER_not_extents, k, ({
+		if (bpos_ge(bkey_start_pos(k.k), POS(0, end)))
+			break;
+		bch2_get_update_rebalance_opts(trans, opts, &iter, k,
+					SET_NEEDS_REBALANCE_opt_change_indirect);
+	}));
+	if (ret)
+		return ret;
+
+	/* suppress trans_was_restarted() check */
+	trans->restart_count = restart_count;
+	return 0;
+}
+
 static int do_rebalance_scan(struct moving_context *ctxt,
+			     struct per_snapshot_io_opts *snapshot_io_opts,
 			     u64 inum, u64 cookie, u64 *sectors_scanned)
 {
 	struct btree_trans *trans = ctxt->trans;
@@ -548,32 +694,33 @@ static int do_rebalance_scan(struct moving_context *ctxt,

 	r->state = BCH_REBALANCE_scanning;

-	struct per_snapshot_io_opts snapshot_io_opts;
-	per_snapshot_io_opts_init(&snapshot_io_opts, c);
-
 	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
 					 r->scan_start.pos, r->scan_end.pos,
 					 BTREE_ITER_all_snapshots|
 					 BTREE_ITER_prefetch, k, ({
 		ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

-		struct bch_inode_opts *io_opts = bch2_move_get_io_opts(trans,
-					&snapshot_io_opts, iter.pos, &iter, k);
-		PTR_ERR_OR_ZERO(io_opts);
+		struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans,
+					snapshot_io_opts, iter.pos, &iter, k,
+					SET_NEEDS_REBALANCE_opt_change);
+		PTR_ERR_OR_ZERO(opts) ?:
+			(inum &&
+			 k.k->type == KEY_TYPE_reflink_p &&
+			 REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)
+			 ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts)
+			 : 0);
 	})) ?:
 	commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 		  bch2_clear_rebalance_needs_scan(trans, inum, cookie));

-	per_snapshot_io_opts_exit(&snapshot_io_opts);
 	*sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen);
-	bch2_move_stats_exit(&r->scan_stats, c);
-
 	/*
 	 * Ensure that the rebalance_work entries we created are seen by the
 	 * next iteration of do_rebalance(), so we don't end up stuck in
 	 * rebalance_wait():
 	 */
 	*sectors_scanned += 1;
+	bch2_move_stats_exit(&r->scan_stats, c);

 	bch2_btree_write_buffer_flush_sync(trans);

@@ -625,6 +772,9 @@ static int do_rebalance(struct moving_context *ctxt)

 	bch2_move_stats_init(&r->work_stats, "rebalance_work");

+	struct per_snapshot_io_opts snapshot_io_opts;
+	per_snapshot_io_opts_init(&snapshot_io_opts, c);
+
 	while (!bch2_move_ratelimit(ctxt)) {
 		if (!bch2_rebalance_enabled(c)) {
 			bch2_moving_ctxt_flush_all(ctxt);
@@ -639,15 +789,18 @@ static int do_rebalance(struct moving_context *ctxt)
 			break;

 		ret = k->k.type == KEY_TYPE_cookie
-			? do_rebalance_scan(ctxt, k->k.p.inode,
+			? do_rebalance_scan(ctxt, &snapshot_io_opts,
+					    k->k.p.inode,
 					    le64_to_cpu(bkey_i_to_cookie(k)->v.cookie),
 					    &sectors_scanned)
-			: do_rebalance_extent(ctxt, k->k.p, &extent_iter);
+			: do_rebalance_extent(ctxt, &snapshot_io_opts,
+					      k->k.p, &extent_iter);
 		if (ret)
 			break;
 	}

 	bch2_trans_iter_exit(&extent_iter);
+	per_snapshot_io_opts_exit(&snapshot_io_opts);
 	bch2_move_stats_exit(&r->work_stats, c);

 	if (!ret &&
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index c5f49f480a79..bff91aa0102e 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -30,11 +30,51 @@ void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *,
 				   const struct bch_extent_rebalance *);

 u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);

-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *, struct bkey_i *);
-int bch2_get_update_rebalance_opts(struct btree_trans *,
-				   struct bch_inode_opts *,
-				   struct btree_iter *,
-				   struct bkey_s_c);
+
+enum set_needs_rebalance_ctx {
+	SET_NEEDS_REBALANCE_opt_change,
+	SET_NEEDS_REBALANCE_opt_change_indirect,
+	SET_NEEDS_REBALANCE_foreground,
+	SET_NEEDS_REBALANCE_other,
+};
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *,
+				  struct bkey_i *, enum set_needs_rebalance_ctx, u32);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+	u32			snapshot;
+	struct bch_inode_opts	io_opts;
+};
+
+struct per_snapshot_io_opts {
+	u64			cur_inum;
+	struct bch_inode_opts	fs_io_opts;
+	DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+	memset(io_opts, 0, sizeof(*io_opts));
+	bch2_inode_opts_get(c, &io_opts->fs_io_opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+	darray_exit(&io_opts->d);
+}
+
+struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *,
+			struct per_snapshot_io_opts *, struct bpos,
+			struct btree_iter *, struct bkey_s_c,
+			enum set_needs_rebalance_ctx);
+
+int bch2_extent_get_io_opts_one(struct btree_trans *, struct bch_inode_opts *,
+				struct btree_iter *, struct bkey_s_c,
+				enum set_needs_rebalance_ctx);
+int bch2_extent_get_apply_io_opts_one(struct btree_trans *, struct bch_inode_opts *,
+				      struct btree_iter *, struct bkey_s_c,
+				      enum set_needs_rebalance_ctx);

 int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
 int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 8679c8aad0e7..531c2ef128ae 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -837,33 +837,39 @@ use_clean:
 	bch2_async_btree_node_rewrites_flush(c);

 	/* fsync if we fixed errors */
-	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+	bool errors_fixed = test_bit(BCH_FS_errors_fixed, &c->flags) ||
+		test_bit(BCH_FS_errors_fixed_silent, &c->flags);
+
+	if (errors_fixed) {
 		bch2_journal_flush_all_pins(&c->journal);
 		bch2_journal_meta(&c->journal);
 	}

 	/* If we fixed errors, verify that fs is actually clean now: */
 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-	    test_bit(BCH_FS_errors_fixed, &c->flags) &&
+	    errors_fixed &&
 	    !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
 	    !test_bit(BCH_FS_error, &c->flags)) {
 		bch2_flush_fsck_errs(c);

 		bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
+		errors_fixed = test_bit(BCH_FS_errors_fixed, &c->flags);
 		clear_bit(BCH_FS_errors_fixed, &c->flags);
+		clear_bit(BCH_FS_errors_fixed_silent, &c->flags);

 		ret = bch2_run_recovery_passes(c, BCH_RECOVERY_PASS_check_alloc_info);
 		if (ret)
 			goto err;

-		if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
+		if (errors_fixed ||
 		    test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
 			bch_err(c, "Second fsck run was not clean");
 			set_bit(BCH_FS_errors_not_fixed, &c->flags);
 		}

-		set_bit(BCH_FS_errors_fixed, &c->flags);
+		if (errors_fixed)
+			set_bit(BCH_FS_errors_fixed, &c->flags);
 	}

 	if (enabled_qtypes(c)) {
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 55ad8ab7a148..d54468fdcb18 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -589,7 +589,6 @@ s64 bch2_remap_range(struct bch_fs *c,
 	struct bpos dst_start = POS(dst_inum.inum, dst_offset);
 	struct bpos src_start = POS(src_inum.inum, src_offset);
 	struct bpos dst_end = dst_start, src_end = src_start;
-	struct bch_inode_opts opts;
 	struct bpos src_want;
 	u64 dst_done = 0;
 	u32 dst_snapshot, src_snapshot;
@@ -609,10 +608,6 @@ s64 bch2_remap_range(struct bch_fs *c,
 	bch2_bkey_buf_init(&new_src);
 	CLASS(btree_trans, trans)(c);

-	ret = bch2_inum_opts_get(trans, src_inum, &opts);
-	if (ret)
-		goto err;
-
 	bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
 			     BTREE_ITER_intent);
 	bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
@@ -709,11 +704,10 @@ s64 bch2_remap_range(struct bch_fs *c,
 				min(src_k.k->p.offset - src_want.offset,
 				    dst_end.offset - dst_iter.pos.offset));

-			ret =   bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
-				bch2_extent_update(trans, dst_inum, &dst_iter,
-						new_dst.k, &disk_res,
-						new_i_size, i_sectors_delta,
-						true);
+			ret = bch2_extent_update(trans, dst_inum, &dst_iter,
+					new_dst.k, &disk_res,
+					new_i_size, i_sectors_delta,
+					true, 0);
 			bch2_disk_reservation_put(c, &disk_res);
 		}
 		bch2_trans_iter_exit(&dst_iter);
@@ -744,7 +738,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 		bch2_trans_iter_exit(&inode_iter);
 	} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-err:
+
 	bch2_bkey_buf_exit(&new_src, c);
 	bch2_bkey_buf_exit(&new_dst, c);
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index bfd06fd5d506..de56a1ee79db 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -104,10 +104,7 @@
 	x(inode_has_case_insensitive,					\
 	  BIT_ULL(BCH_RECOVERY_PASS_check_inodes),			\
 	  BCH_FSCK_ERR_inode_has_case_insensitive_not_set,		\
-	  BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)	\
-	x(btree_node_accounting,					\
-	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),			\
-	  BCH_FSCK_ERR_accounting_mismatch)
+	  BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)

 #define DOWNGRADE_TABLE()						\
 	x(bucket_stripe_sectors,					\
@@ -155,11 +152,7 @@
 	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),			\
 	  BCH_FSCK_ERR_accounting_mismatch,				\
 	  BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0,		\
-	  BCH_FSCK_ERR_accounting_key_junk_at_end)			\
-	x(btree_node_accounting,					\
-	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),			\
-	  BCH_FSCK_ERR_accounting_mismatch,				\
-	  BCH_FSCK_ERR_accounting_key_nr_counters_wrong)
+	  BCH_FSCK_ERR_accounting_key_junk_at_end)

 struct upgrade_downgrade_entry {
 	u64		recovery_passes;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 32b12311928e..de1e8912975c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -833,6 +833,8 @@ int bch2_fs_init_rw(struct bch_fs *c)
 	if (test_bit(BCH_FS_rw_init_done, &c->flags))
 		return 0;

+	bch_verbose(c, "doing rw allocations");
+
 	if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
 				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
 	    !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
@@ -1286,7 +1288,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
 	if (ret)
 		goto err;

-	if (go_rw_in_recovery(c)) {
+	/*
+	 * just make sure this is always allocated if we might need it - mount
+	 * failing due to kthread_create() failing is _very_ annoying
+	 */
+	if (!(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) ||
+	    go_rw_in_recovery(c)) {
 		/*
 		 * start workqueues/kworkers early - kthread creation checks for
 		 * pending signals, which is _very_ annoying
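
The per_snapshot_io_opts cache that this patch moves from move.c into rebalance.c follows a simple pattern: load the IO options for every snapshot version of an inode once, then resolve each extent against the cached entry whose snapshot is an ancestor of the extent's snapshot, falling back to the filesystem-wide options. Below is a minimal standalone C sketch of that lookup; the types here (io_opts, snapshot_opts_entry, the fixed-size array, and the toy snapshot_is_ancestor) are hypothetical stand-ins for illustration, not the bcachefs API.

	/*
	 * Sketch of the per-snapshot IO-options cache pattern.
	 * All names below are simplified stand-ins.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	struct io_opts { unsigned background_target; };

	struct snapshot_opts_entry {
		unsigned	snapshot;
		struct io_opts	io_opts;
	};

	struct per_snapshot_io_opts {
		unsigned long long	cur_inum;	/* inode whose entries are cached */
		struct io_opts		fs_io_opts;	/* filesystem-wide fallback */
		unsigned		nr;
		struct snapshot_opts_entry d[8];	/* darray stand-in */
	};

	/* Toy model: in bcachefs this walks the snapshot tree. */
	static bool snapshot_is_ancestor(unsigned id, unsigned ancestor)
	{
		return id >= ancestor;
	}

	static struct io_opts *extent_get_io_opts(struct per_snapshot_io_opts *cache,
						  unsigned long long inum, unsigned snapshot)
	{
		if (cache->cur_inum != inum) {
			/*
			 * Cache miss: reload entries for every snapshot version
			 * of this inode (in bcachefs, a BTREE_ID_inodes walk
			 * with BTREE_ITER_all_snapshots).
			 */
			cache->nr = 0;
			cache->cur_inum = inum;
		}

		/* Pick the entry whose snapshot is an ancestor of ours: */
		for (unsigned i = 0; i < cache->nr; i++)
			if (snapshot_is_ancestor(snapshot, cache->d[i].snapshot))
				return &cache->d[i].io_opts;

		return &cache->fs_io_opts;
	}

	int main(void)
	{
		struct per_snapshot_io_opts cache = {
			.fs_io_opts = { .background_target = 1 },
		};

		/* Empty cache: falls back to filesystem-wide options. */
		printf("target=%u\n",
		       extent_get_io_opts(&cache, 42, 3)->background_target);
		return 0;
	}

Hoisting the cache into do_rebalance(), as the patch does, means one cache instance serves both the scan path and the per-extent work path, so consecutive extents of the same inode avoid repeated inode lookups.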