-rw-r--r--  fs/bcachefs/disk_accounting.c | 203
-rw-r--r--  fs/bcachefs/opts.c            |   9
-rw-r--r--  fs/bcachefs/opts.h            |  23
-rw-r--r--  fs/bcachefs/rebalance.c       |  67
-rw-r--r--  fs/bcachefs/super.c           |   4
-rw-r--r--  fs/bcachefs/sysfs.c           |   4
6 files changed, 195 insertions(+), 115 deletions(-)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b20ea162bfa3..73f50e5489b4 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -789,102 +789,10 @@ static struct journal_key *accumulate_and_read_journal_accounting(struct btree_t
 	return ret ? ERR_PTR(ret) : next;
 }
 
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
+static int accounting_read_mem_fixups(struct btree_trans *trans)
 {
+	struct bch_fs *c = trans->c;
 	struct bch_accounting_mem *acc = &c->accounting;
-	CLASS(btree_trans, trans)(c);
-	CLASS(printbuf, buf)();
-
-	/*
-	 * We might run more than once if we rewind to start topology repair or
-	 * btree node scan - and those might cause us to get different results,
-	 * so we can't just skip if we've already run.
-	 *
-	 * Instead, zero out any accounting we have:
-	 */
-	scoped_guard(percpu_write, &c->mark_lock) {
-		darray_for_each(acc->k, e)
-			percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
-		for_each_member_device(c, ca)
-			percpu_memset(ca->usage, 0, sizeof(*ca->usage));
-		percpu_memset(c->usage, 0, sizeof(*c->usage));
-	}
-
-	struct journal_keys *keys = &c->journal_keys;
-	struct journal_key *jk = keys->data;
-
-	move_gap(keys, keys->nr);
-
-	while (jk < &darray_top(*keys) &&
-	       __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
-		jk++;
-
-	struct journal_key *end = jk;
-	while (end < &darray_top(*keys) &&
-	       __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
-		end++;
-
-	struct btree_iter iter;
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
-			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
-	iter.flags &= ~BTREE_ITER_with_journal;
-	int ret = for_each_btree_key_continue(trans, iter,
-				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
-		if (k.k->type != KEY_TYPE_accounting)
-			continue;
-
-		while (jk < end &&
-		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
-			jk = accumulate_and_read_journal_accounting(trans, jk);
-
-		while (jk < end &&
-		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
-		       bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
-			jk->overwritten = true;
-			jk++;
-		}
-
-		if (jk < end &&
-		    __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
-			jk = accumulate_and_read_journal_accounting(trans, jk);
-
-		struct disk_accounting_pos acc_k;
-		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
-		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
-			break;
-
-		if (!bch2_accounting_is_mem(&acc_k)) {
-			struct disk_accounting_pos next_acc;
-			memset(&next_acc, 0, sizeof(next_acc));
-			next_acc.type = acc_k.type + 1;
-			struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
-			if (jk < end)
-				next = bpos_min(next, journal_key_k(c, jk)->k.p);
-
-			bch2_btree_iter_set_pos(&iter, next);
-			continue;
-		}
-
-		accounting_read_key(trans, k);
-	}));
-	bch2_trans_iter_exit(&iter);
-	if (ret)
-		return ret;
-
-	while (jk < end)
-		jk = accumulate_and_read_journal_accounting(trans, jk);
-
-	struct journal_key *dst = keys->data;
-	darray_for_each(*keys, i)
-		if (!i->overwritten)
-			*dst++ = *i;
-	keys->gap = keys->nr = dst - keys->data;
-
 	CLASS(printbuf, underflow_err)();
 
 	scoped_guard(percpu_write, &c->mark_lock) {
@@ -905,7 +813,7 @@ int bch2_accounting_read(struct bch_fs *c)
 		 * Remove it, so that if it's re-added it gets re-marked in the
 		 * superblock:
 		 */
-		ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+		int ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
 			? -BCH_ERR_remove_disk_accounting_entry
 			: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
@@ -987,7 +895,7 @@ int bch2_accounting_read(struct bch_fs *c)
 	if (underflow_err.pos) {
 		bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
 		unsigned pos = underflow_err.pos;
-		ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
+		int ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
 					BCH_RECOVERY_PASS_check_allocations, 0);
 		print |= underflow_err.pos != pos;
@@ -997,7 +905,108 @@ int bch2_accounting_read(struct bch_fs *c)
 		return ret;
 	}
 
-	return ret;
+	return 0;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+	struct bch_accounting_mem *acc = &c->accounting;
+	CLASS(btree_trans, trans)(c);
+	CLASS(printbuf, buf)();
+
+	/*
+	 * We might run more than once if we rewind to start topology repair or
+	 * btree node scan - and those might cause us to get different results,
+	 * so we can't just skip if we've already run.
+	 *
+	 * Instead, zero out any accounting we have:
+	 */
+	scoped_guard(percpu_write, &c->mark_lock) {
+		darray_for_each(acc->k, e)
+			percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+		for_each_member_device(c, ca)
+			percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+		percpu_memset(c->usage, 0, sizeof(*c->usage));
+	}
+
+	struct journal_keys *keys = &c->journal_keys;
+	struct journal_key *jk = keys->data;
+
+	move_gap(keys, keys->nr);
+
+	while (jk < &darray_top(*keys) &&
+	       __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
+		jk++;
+
+	struct journal_key *end = jk;
+	while (end < &darray_top(*keys) &&
+	       __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
+		end++;
+
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+	iter.flags &= ~BTREE_ITER_with_journal;
+	int ret = for_each_btree_key_continue(trans, iter,
+				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+		if (k.k->type != KEY_TYPE_accounting)
+			continue;
+
+		while (jk < end &&
+		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
+			jk = accumulate_and_read_journal_accounting(trans, jk);
+
+		while (jk < end &&
+		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
+		       bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
+			jk->overwritten = true;
+			jk++;
+		}
+
+		if (jk < end &&
+		    __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
+			jk = accumulate_and_read_journal_accounting(trans, jk);
+
+		struct disk_accounting_pos acc_k;
+		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+			break;
+
+		if (!bch2_accounting_is_mem(&acc_k)) {
+			struct disk_accounting_pos next_acc;
+			memset(&next_acc, 0, sizeof(next_acc));
+			next_acc.type = acc_k.type + 1;
+			struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
+			if (jk < end)
+				next = bpos_min(next, journal_key_k(c, jk)->k.p);
+
+			bch2_btree_iter_set_pos(&iter, next);
+			continue;
+		}
+
+		accounting_read_key(trans, k);
+	}));
+	bch2_trans_iter_exit(&iter);
+	if (ret)
+		return ret;
+
+	while (jk < end)
+		jk = accumulate_and_read_journal_accounting(trans, jk);
+
+	bch2_trans_unlock(trans);
+
+	struct journal_key *dst = keys->data;
+	darray_for_each(*keys, i)
+		if (!i->overwritten)
+			*dst++ = *i;
+	keys->gap = keys->nr = dst - keys->data;
+
+	return accounting_read_mem_fixups(trans);
 }
 
 int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca)
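The disk_accounting.c change is mostly a mechanical split: the second half of bch2_accounting_read() becomes accounting_read_mem_fixups(), and the one behavioral addition is the bch2_trans_unlock(trans) before the fixups run, so the in-memory pass no longer holds btree locks. The journal-key compaction it performs on the way out is the classic two-cursor in-place filter; a minimal standalone sketch of that idiom (plain C with stand-in types, not bcachefs code):

	#include <stdbool.h>
	#include <stddef.h>

	struct key { bool overwritten; /* ... payload ... */ };

	/* Drop overwritten entries in place, preserving the order of survivors. */
	static size_t compact_keys(struct key *keys, size_t nr)
	{
		struct key *dst = keys;

		for (struct key *i = keys; i < keys + nr; i++)
			if (!i->overwritten)
				*dst++ = *i;	/* self-copy is harmless until the first drop */

		return dst - keys;	/* new length */
	}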
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 78175d659e0e..e01c808e7893 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -654,7 +654,8 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
 
 	val = bch2_opt_val_synonym_lookup(name, val);
 
-	if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+	if (!(bch2_opt_table[id].flags & OPT_MOUNT) &&
+	    !(bch2_opt_table[id].flags & OPT_MOUNT_OLD))
 		return -BCH_ERR_option_name;
 
 	if ((id == Opt_usrquota ||
@@ -677,6 +678,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
 	if (ret < 0)
 		return -BCH_ERR_option_value;
 
+	if (bch2_opt_table[id].flags & OPT_MOUNT_OLD) {
+		pr_err("option %s may no longer be specified at mount time; set via sysfs opts dir",
+		       bch2_opt_table[id].attr.name);
+		return 0;
+	}
+
 	if (opts)
 		bch2_opt_set_by_id(opts, id, v);
 
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index de1ac235e929..68982196b5dc 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -67,6 +67,7 @@ enum opt_flags {
 	OPT_SB_FIELD_ILOG2	= BIT(9),	/* Superblock field is ilog2 of actual value */
 	OPT_SB_FIELD_ONE_BIAS	= BIT(10),	/* 0 means default value */
 	OPT_HIDDEN		= BIT(11),
+	OPT_MOUNT_OLD		= BIT(12),	/* May not be specified at mount time, but don't fail the mount */
 };
 
 enum opt_type {
@@ -150,12 +151,12 @@ enum fsck_err_opts {
 	  BCH_SB_WRITE_ERROR_TIMEOUT,	30,				\
 	  NULL,		"Number of consecutive write errors allowed before kicking out a device")\
   x(metadata_replicas,		u8,					\
-	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,			\
 	  OPT_UINT(1, BCH_REPLICAS_MAX + 1),				\
 	  BCH_SB_META_REPLICAS_WANT,	1,				\
 	  "#",		"Number of metadata replicas")			\
   x(data_replicas,		u8,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_UINT(1, BCH_REPLICAS_MAX + 1),				\
 	  BCH_SB_DATA_REPLICAS_WANT,	1,				\
 	  "#",		"Number of data replicas")			\
@@ -176,12 +177,12 @@ enum fsck_err_opts {
 	  BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10,			\
 	  "size",	"Maximum size of checksummed/compressed extents")\
   x(metadata_checksum,		u8,					\
-	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,			\
 	  OPT_STR(__bch2_csum_opts),					\
 	  BCH_SB_META_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
 	  NULL,		NULL)						\
   x(data_checksum,		u8,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_STR(__bch2_csum_opts),					\
 	  BCH_SB_DATA_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
 	  NULL,		NULL)						\
@@ -191,12 +192,12 @@ enum fsck_err_opts {
 	  BCH_SB_CSUM_ERR_RETRY_NR,	3,				\
 	  NULL,		NULL)						\
   x(compression,		u8,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_compression),					\
 	  BCH_SB_COMPRESSION_TYPE,	BCH_COMPRESSION_OPT_none,	\
 	  NULL,		NULL)						\
   x(background_compression,	u8,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_compression),					\
 	  BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none,	\
 	  NULL,		NULL)						\
@@ -206,27 +207,27 @@ enum fsck_err_opts {
 	  BCH_SB_STR_HASH_TYPE,		BCH_STR_HASH_OPT_siphash,	\
 	  NULL,		"Hash function for directory entries and xattrs")\
   x(metadata_target,		u16,					\
-	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,			\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_METADATA_TARGET,	0,				\
 	  "(target)",	"Device or label for metadata writes")		\
   x(foreground_target,		u16,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_FOREGROUND_TARGET,	0,				\
 	  "(target)",	"Device or label for foreground writes")	\
   x(background_target,		u16,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_BACKGROUND_TARGET,	0,				\
 	  "(target)",	"Device or label to move data to in the background")\
   x(promote_target,		u16,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_PROMOTE_TARGET,	0,				\
 	  "(target)",	"Device or label to promote data to on read")	\
   x(erasure_code,		u16,					\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME,	\
 	  OPT_BOOL(),							\
 	  BCH_SB_ERASURE_CODE,		false,				\
 	  NULL,		"Enable erasure coding (DO NOT USE YET)")	\
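Together, the opts.c and opts.h hunks retire a set of options from mount-time use without breaking existing fstabs: options tagged OPT_MOUNT_OLD still parse and validate in bch2_parse_one_mount_opt(), but are then logged and ignored (return 0) rather than applied or treated as a mount failure. A condensed standalone sketch of that decision flow (the flag values and function here are illustrative, not bcachefs's):

	#include <stdio.h>

	#define OPT_MOUNT	(1U << 0)	/* allowed at mount time */
	#define OPT_MOUNT_OLD	(1U << 1)	/* parsed, but ignored at mount time */

	/* returns 0 if accepted or deliberately ignored, -1 if unknown */
	static int check_mount_opt(unsigned flags, const char *name)
	{
		if (!(flags & (OPT_MOUNT|OPT_MOUNT_OLD)))
			return -1;		/* not a mount option at all */

		if (flags & OPT_MOUNT_OLD) {
			fprintf(stderr, "option %s may no longer be set at mount time\n", name);
			return 0;		/* warn, but don't fail the mount */
		}

		return 0;			/* normal mount option */
	}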
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 27282616bcb0..0e40e7bd3441 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -11,6 +11,7 @@
 #include "clock.h"
 #include "compress.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "errcode.h"
 #include "error.h"
 #include "inode.h"
@@ -272,6 +273,8 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
 	if (durability < r.data_replicas ||
 	    durability >= r.data_replicas + min_durability)
 		r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
+	if (!unwritten && r.erasure_code != ec)
+		r.need_rb |= BIT(BCH_REBALANCE_erasure_code);
 
 	return r;
 }
@@ -321,6 +324,17 @@ static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bke
 	return 0;
 }
 
+static bool bkey_has_ec(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+
+	bkey_extent_entry_for_each(ptrs, entry)
+		if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr)
+			return true;
+	return false;
+}
+
 static int new_needs_rb_allowed(struct btree_trans *trans,
 				struct per_snapshot_io_opts *s,
 				struct bkey_s_c k,
@@ -350,6 +364,14 @@ static int new_needs_rb_allowed(struct btree_trans *trans,
 	    ctx == SET_NEEDS_REBALANCE_opt_change_indirect)
 		return 0;
 
+	if ((new_need_rb & BIT(BCH_REBALANCE_erasure_code)) &&
+	    !bkey_has_ec(k)) {
+		/* Foreground writes are not initially erasure coded - and we
+		 * may crash before a stripe is created
+		 */
+		new_need_rb &= ~BIT(BCH_REBALANCE_erasure_code);
+	}
+
 	if (ctx == SET_NEEDS_REBALANCE_foreground) {
 		new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)|
 				 BIT(BCH_REBALANCE_background_target));
@@ -759,6 +781,23 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
 	return &(&darray_pop(buf))->k_i;
 }
 
+static int extent_ec_pending(struct btree_trans *trans, struct bkey_ptrs_c ptrs)
+{
+	struct bch_fs *c = trans->c;
+
+	guard(rcu)();
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+		if (!ca)
+			continue;
+
+		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+		if (bch2_bucket_has_new_stripe(c, bucket_to_u64(bucket)))
+			return true;
+	}
+	return false;
+}
+
 static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 			struct per_snapshot_io_opts *snapshot_io_opts,
 			struct bpos work_pos,
@@ -801,9 +840,12 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 		data_opts->target	= opts->background_target;
 	data_opts->write_flags |= BCH_WRITE_only_specified_devs;
 
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
 	if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) {
 		unsigned durability = bch2_bkey_durability(c, k);
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 		unsigned ptr_bit = 1;
 
 		guard(rcu)();
@@ -817,9 +859,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 			data_opts->extra_replicas = opts->data_replicas - durability;
 		} else {
-			const union bch_extent_entry *entry;
-			struct extent_ptr_decoded p;
-
 			bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 				unsigned d = bch2_extent_ptr_durability(c, &p);
 
@@ -843,6 +882,26 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 		}
 	}
 
+	if (r->need_rb & BIT(BCH_REBALANCE_erasure_code)) {
+		if (opts->erasure_code) {
+			/* XXX: we'll need ratelimiting */
+			if (extent_ec_pending(trans, ptrs))
+				return bkey_s_c_null;
+
+			data_opts->extra_replicas = opts->data_replicas;
+		} else {
+			unsigned ptr_bit = 1;
+			bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+				if (p.has_ec) {
+					data_opts->kill_ec_ptrs |= ptr_bit;
+					data_opts->extra_replicas += p.ec.redundancy;
+				}
+
+				ptr_bit <<= 1;
+			}
+		}
+	}
+
 	if (!data_opts->rewrite_ptrs &&
 	    !data_opts->kill_ptrs &&
 	    !data_opts->kill_ec_ptrs &&
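The rebalance.c hunks make erasure coding a rebalance-managed property: bch2_bkey_needs_rebalance() now flags written extents whose stripe state disagrees with the erasure_code option; new_needs_rb_allowed() exempts keys that carry no stripe pointer yet (foreground writes are not initially erasure coded, and a crash may land before a stripe exists); and next_rebalance_extent() either defers extents whose buckets are queued for stripe creation (extent_ec_pending()) or marks stale stripe pointers for removal via kill_ec_ptrs. A condensed sketch of the flagging logic, folding the first two checks into one function (simplified stand-ins, not the real types):

	#include <stdbool.h>

	#define BIT(n)	(1U << (n))

	enum { REB_data_replicas, REB_erasure_code };

	static unsigned ec_needs_rebalance(bool opt_erasure_code, bool key_has_stripe,
					   bool unwritten, bool foreground_write)
	{
		unsigned need_rb = 0;

		/* option and on-disk stripe state disagree -> rewrite candidate */
		if (!unwritten && opt_erasure_code != key_has_stripe)
			need_rb |= BIT(REB_erasure_code);

		/* foreground writes get their stripe later; don't flag them yet */
		if (foreground_write && !key_has_stripe)
			need_rb &= ~BIT(REB_erasure_code);

		return need_rb;
	}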
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 3984f3cee929..534ba2c0f83a 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1974,6 +1974,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 
 	bch_notice(ca, "%s", bch2_member_states[new_state]);
 
+	if (new_state == BCH_MEMBER_STATE_failed)
+		bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
 	scoped_guard(mutex, &c->sb_lock) {
 		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 		SET_BCH_MEMBER_STATE(m, new_state);
@@ -1983,7 +1986,6 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 	if (new_state == BCH_MEMBER_STATE_rw)
 		__bch2_dev_read_write(c, ca);
 
-	/* XXX: add a superblock bit to make this transactional */
 	if (new_state == BCH_MEMBER_STATE_failed)
 		bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
 
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 1f60854ab3ef..85ccc084ecd9 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -793,6 +793,9 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
 	bool is_sb = opt->get_sb || opt->get_member;
 	bool changed = false;
 
+	if (id == Opt_durability)
+		bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
 	if (is_sb) {
 		changed = bch2_opt_set_sb(c, ca, opt, v);
 	} else if (!ca) {
@@ -806,7 +809,6 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
 	if (!ca)
 		bch2_opt_set_by_id(&c->opts, id, v);
 
-	/* XXX: add a superblock bit to make this transactional */
 	if (id == Opt_durability)
 		bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
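The super.c and sysfs.c hunks are the same fix applied in two places: the rebalance scan cookie is now set before the superblock write that records the new device state or durability, which is why the removed "XXX: add a superblock bit to make this transactional" comments are no longer needed. A crash between the two steps now leaves at worst a spurious rescan, never a changed option with no scan. The ordering pattern as a standalone sketch (function names illustrative, with stubs standing in for the real persistence calls):

	/* stubs: e.g. bch2_set_rebalance_needs_scan_device() / bch2_opt_set_sb() */
	static int persist_scan_cookie(void)   { return 0; }
	static int persist_option_change(void) { return 0; }

	static int change_option_durably(void)
	{
		int ret;

		ret = persist_scan_cookie();	/* 1: record that a rescan is owed */
		if (ret)
			return ret;

		ret = persist_option_change();	/* 2: the actual state change */
		if (ret)
			return ret;

		/* crash after 1, before 2: spurious but harmless rescan	*/
		/* crash after 2: cookie already on disk, rescan still runs	*/
		return 0;
	}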