 fs/bcachefs/disk_accounting.c | 203
 fs/bcachefs/opts.c            |   9
 fs/bcachefs/opts.h            |  23
 fs/bcachefs/rebalance.c       |  67
 fs/bcachefs/super.c           |   4
 fs/bcachefs/sysfs.c           |   4
 6 files changed, 195 insertions(+), 115 deletions(-)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b20ea162bfa3..73f50e5489b4 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -789,102 +789,10 @@ static struct journal_key *accumulate_and_read_journal_accounting(struct btree_t
return ret ? ERR_PTR(ret) : next;
}
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
+static int accounting_read_mem_fixups(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct bch_accounting_mem *acc = &c->accounting;
- CLASS(btree_trans, trans)(c);
- CLASS(printbuf, buf)();
-
- /*
- * We might run more than once if we rewind to start topology repair or
- * btree node scan - and those might cause us to get different results,
- * so we can't just skip if we've already run.
- *
- * Instead, zero out any accounting we have:
- */
- scoped_guard(percpu_write, &c->mark_lock) {
- darray_for_each(acc->k, e)
- percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
- for_each_member_device(c, ca)
- percpu_memset(ca->usage, 0, sizeof(*ca->usage));
- percpu_memset(c->usage, 0, sizeof(*c->usage));
- }
-
- struct journal_keys *keys = &c->journal_keys;
- struct journal_key *jk = keys->data;
-
- move_gap(keys, keys->nr);
-
- while (jk < &darray_top(*keys) &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
- jk++;
-
- struct journal_key *end = jk;
- while (end < &darray_top(*keys) &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
- end++;
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
- iter.flags &= ~BTREE_ITER_with_journal;
- int ret = for_each_btree_key_continue(trans, iter,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- if (k.k->type != KEY_TYPE_accounting)
- continue;
-
- while (jk < end &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
- jk = accumulate_and_read_journal_accounting(trans, jk);
-
- while (jk < end &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
- bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
- jk->overwritten = true;
- jk++;
- }
-
- if (jk < end &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
- jk = accumulate_and_read_journal_accounting(trans, jk);
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next_acc;
- memset(&next_acc, 0, sizeof(next_acc));
- next_acc.type = acc_k.type + 1;
- struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
- if (jk < end)
- next = bpos_min(next, journal_key_k(c, jk)->k.p);
-
- bch2_btree_iter_set_pos(&iter, next);
- continue;
- }
-
- accounting_read_key(trans, k);
- }));
- bch2_trans_iter_exit(&iter);
- if (ret)
- return ret;
-
- while (jk < end)
- jk = accumulate_and_read_journal_accounting(trans, jk);
-
- struct journal_key *dst = keys->data;
- darray_for_each(*keys, i)
- if (!i->overwritten)
- *dst++ = *i;
- keys->gap = keys->nr = dst - keys->data;
-
CLASS(printbuf, underflow_err)();
scoped_guard(percpu_write, &c->mark_lock) {
@@ -905,7 +813,7 @@ int bch2_accounting_read(struct bch_fs *c)
* Remove it, so that if it's re-added it gets re-marked in the
* superblock:
*/
- ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+ int ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
? -BCH_ERR_remove_disk_accounting_entry
: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
@@ -987,7 +895,7 @@ int bch2_accounting_read(struct bch_fs *c)
if (underflow_err.pos) {
bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
unsigned pos = underflow_err.pos;
- ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
+ int ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
BCH_RECOVERY_PASS_check_allocations, 0);
print |= underflow_err.pos != pos;
@@ -997,7 +905,108 @@ int bch2_accounting_read(struct bch_fs *c)
return ret;
}
- return ret;
+ return 0;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ CLASS(btree_trans, trans)(c);
+ CLASS(printbuf, buf)();
+
+ /*
+ * We might run more than once if we rewind to start topology repair or
+ * btree node scan - and those might cause us to get different results,
+ * so we can't just skip if we've already run.
+ *
+ * Instead, zero out any accounting we have:
+ */
+ scoped_guard(percpu_write, &c->mark_lock) {
+ darray_for_each(acc->k, e)
+ percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+ for_each_member_device(c, ca)
+ percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+ percpu_memset(c->usage, 0, sizeof(*c->usage));
+ }
+
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *jk = keys->data;
+
+ move_gap(keys, keys->nr);
+
+ while (jk < &darray_top(*keys) &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
+ jk++;
+
+ struct journal_key *end = jk;
+ while (end < &darray_top(*keys) &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
+ end++;
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+ iter.flags &= ~BTREE_ITER_with_journal;
+ int ret = for_each_btree_key_continue(trans, iter,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
+
+ while (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
+
+ while (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
+ bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
+ jk->overwritten = true;
+ jk++;
+ }
+
+ if (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ break;
+
+ if (!bch2_accounting_is_mem(&acc_k)) {
+ struct disk_accounting_pos next_acc;
+ memset(&next_acc, 0, sizeof(next_acc));
+ next_acc.type = acc_k.type + 1;
+ struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
+ if (jk < end)
+ next = bpos_min(next, journal_key_k(c, jk)->k.p);
+
+ bch2_btree_iter_set_pos(&iter, next);
+ continue;
+ }
+
+ accounting_read_key(trans, k);
+ }));
+ bch2_trans_iter_exit(&iter);
+ if (ret)
+ return ret;
+
+ while (jk < end)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
+
+ bch2_trans_unlock(trans);
+
+ struct journal_key *dst = keys->data;
+ darray_for_each(*keys, i)
+ if (!i->overwritten)
+ *dst++ = *i;
+ keys->gap = keys->nr = dst - keys->data;
+
+ return accounting_read_mem_fixups(trans);
}
int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca)
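
The disk_accounting.c refactor pulls the in-memory fixups out of bch2_accounting_read() into accounting_read_mem_fixups(), and unlocks the transaction (bch2_trans_unlock()) before running them; the btree/journal merge loop and the compaction of overwritten journal keys stay in the read path. Below is a minimal userspace sketch of that compaction idiom, keeping only entries not marked overwritten and packing them to the front; "struct jkey" and compact_keys() are invented names, and the real code also resets keys->gap:

#include <stddef.h>
#include <stdbool.h>

struct jkey {
	bool	overwritten;
	/* ... key payload ... */
};

static size_t compact_keys(struct jkey *keys, size_t nr)
{
	struct jkey *dst = keys;

	for (struct jkey *i = keys; i < keys + nr; i++)
		if (!i->overwritten)
			*dst++ = *i;	/* survivors stay in order, packed at the front */

	return dst - keys;		/* new array length */
}
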
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 78175d659e0e..e01c808e7893 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -654,7 +654,8 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
val = bch2_opt_val_synonym_lookup(name, val);
- if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+ if (!(bch2_opt_table[id].flags & OPT_MOUNT) &&
+ !(bch2_opt_table[id].flags & OPT_MOUNT_OLD))
return -BCH_ERR_option_name;
if ((id == Opt_usrquota ||
@@ -677,6 +678,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
if (ret < 0)
return -BCH_ERR_option_value;
+ if (bch2_opt_table[id].flags & OPT_MOUNT_OLD) {
+ pr_err("option %s may no longer be specified at mount time; set via sysfs opts dir",
+ bch2_opt_table[id].attr.name);
+ return 0;
+ }
+
if (opts)
bch2_opt_set_by_id(opts, id, v);
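
The new OPT_MOUNT_OLD path accepts the option name and value at parse time, prints a warning, and returns 0 instead of an error, so mounts with stale fstab entries still succeed. A userspace sketch of this soft-deprecation pattern follows; the message mirrors the patch, while the flag bit values and parse_one_opt() are invented for illustration:

#include <stdio.h>
#include <errno.h>

#define OPT_MOUNT	(1U << 3)	/* valid mount option */
#define OPT_MOUNT_OLD	(1U << 12)	/* recognized, warned about, ignored */

static int parse_one_opt(unsigned flags, const char *name)
{
	if (!(flags & (OPT_MOUNT | OPT_MOUNT_OLD)))
		return -EINVAL;		/* never was a mount option */

	if (flags & OPT_MOUNT_OLD) {
		/* warn but succeed: old fstabs keep mounting */
		fprintf(stderr, "option %s may no longer be specified at mount time; set via sysfs opts dir\n",
			name);
		return 0;
	}

	/* ... validate and apply the value ... */
	return 0;
}
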
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index de1ac235e929..68982196b5dc 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -67,6 +67,7 @@ enum opt_flags {
OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
OPT_HIDDEN = BIT(11),
+ OPT_MOUNT_OLD = BIT(12), /* May not be specified at mount time, but don't fail the mount */
};
enum opt_type {
@@ -150,12 +151,12 @@ enum fsck_err_opts {
BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
- OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX + 1), \
BCH_SB_META_REPLICAS_WANT, 1, \
"#", "Number of metadata replicas") \
x(data_replicas, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX + 1), \
BCH_SB_DATA_REPLICAS_WANT, 1, \
"#", "Number of data replicas") \
@@ -176,12 +177,12 @@ enum fsck_err_opts {
BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
"size", "Maximum size of checksummed/compressed extents")\
x(metadata_checksum, u8, \
- OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_STR(__bch2_csum_opts), \
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(data_checksum, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
@@ -191,12 +192,12 @@ enum fsck_err_opts {
BCH_SB_CSUM_ERR_RETRY_NR, 3, \
NULL, NULL) \
x(compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
x(background_compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
@@ -206,27 +207,27 @@ enum fsck_err_opts {
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
- OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or label for metadata writes") \
x(foreground_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_FOREGROUND_TARGET, 0, \
"(target)", "Device or label for foreground writes") \
x(background_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_BACKGROUND_TARGET, 0, \
"(target)", "Device or label to move data to in the background")\
x(promote_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0, \
"(target)", "Device or label to promote data to on read") \
x(erasure_code, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 27282616bcb0..0e40e7bd3441 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -11,6 +11,7 @@
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
+#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
@@ -272,6 +273,8 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
if (durability < r.data_replicas || durability >= r.data_replicas + min_durability)
r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
+ if (!unwritten && r.erasure_code != ec)
+ r.need_rb |= BIT(BCH_REBALANCE_erasure_code);
return r;
}
@@ -321,6 +324,17 @@ static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bke
return 0;
}
+static bool bkey_has_ec(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr)
+ return true;
+ return false;
+}
+
static int new_needs_rb_allowed(struct btree_trans *trans,
struct per_snapshot_io_opts *s,
struct bkey_s_c k,
@@ -350,6 +364,14 @@ static int new_needs_rb_allowed(struct btree_trans *trans,
ctx == SET_NEEDS_REBALANCE_opt_change_indirect)
return 0;
+ if ((new_need_rb & BIT(BCH_REBALANCE_erasure_code)) &&
+ !bkey_has_ec(k)) {
+ /* Foreground writes are not initially erasure coded - and we
+ * may crash before a stripe is created
+ */
+ new_need_rb &= ~BIT(BCH_REBALANCE_erasure_code);
+ }
+
if (ctx == SET_NEEDS_REBALANCE_foreground) {
new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)|
BIT(BCH_REBALANCE_background_target));
@@ -759,6 +781,23 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
return &(&darray_pop(buf))->k_i;
}
+static bool extent_ec_pending(struct btree_trans *trans, struct bkey_ptrs_c ptrs)
+{
+ struct bch_fs *c = trans->c;
+
+ guard(rcu)();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+ if (bch2_bucket_has_new_stripe(c, bucket_to_u64(bucket)))
+ return true;
+ }
+ return false;
+}
+
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct per_snapshot_io_opts *snapshot_io_opts,
struct bpos work_pos,
@@ -801,9 +840,12 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
data_opts->target = opts->background_target;
data_opts->write_flags |= BCH_WRITE_only_specified_devs;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) {
unsigned durability = bch2_bkey_durability(c, k);
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned ptr_bit = 1;
guard(rcu)();
@@ -817,9 +859,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
data_opts->extra_replicas = opts->data_replicas - durability;
} else {
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
@@ -843,6 +882,26 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
}
}
+ if (r->need_rb & BIT(BCH_REBALANCE_erasure_code)) {
+ if (opts->erasure_code) {
+ /* XXX: we'll need ratelimiting */
+ if (extent_ec_pending(trans, ptrs))
+ return bkey_s_c_null;
+
+ data_opts->extra_replicas = opts->data_replicas;
+ } else {
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.has_ec) {
+ data_opts->kill_ec_ptrs |= ptr_bit;
+ data_opts->extra_replicas += p.ec.redundancy;
+ }
+
+ ptr_bit <<= 1;
+ }
+ }
+ }
+
if (!data_opts->rewrite_ptrs &&
!data_opts->kill_ptrs &&
!data_opts->kill_ec_ptrs &&
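
bch2_bkey_needs_rebalance() accumulates a bitmask of reasons an extent needs work, and the worker above tests individual bits to pick an action; unwritten extents are never flagged for erasure coding, since they can't be erasure coded until written. A sketch of that bitmask computation with simplified inputs (enum names follow the patch; the helper and its arguments are invented):

#include <stdbool.h>

enum rb_reason {
	BCH_REBALANCE_data_replicas,
	BCH_REBALANCE_erasure_code,
};

#define BIT(nr)	(1U << (nr))

static unsigned extent_need_rb(unsigned durability, unsigned want_replicas,
			       unsigned min_durability,
			       bool unwritten, bool has_ec, bool want_ec)
{
	unsigned need_rb = 0;

	if (durability < want_replicas ||
	    durability >= want_replicas + min_durability)
		need_rb |= BIT(BCH_REBALANCE_data_replicas);

	/* unwritten extents aren't erasure coded yet; don't flag them */
	if (!unwritten && has_ec != want_ec)
		need_rb |= BIT(BCH_REBALANCE_erasure_code);

	return need_rb;
}
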
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 3984f3cee929..534ba2c0f83a 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1974,6 +1974,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch_notice(ca, "%s", bch2_member_states[new_state]);
+ if (new_state == BCH_MEMBER_STATE_failed)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
scoped_guard(mutex, &c->sb_lock) {
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_STATE(m, new_state);
@@ -1983,7 +1986,6 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
- /* XXX: add a superblock bit to make this transactional */
if (new_state == BCH_MEMBER_STATE_failed)
bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 1f60854ab3ef..85ccc084ecd9 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -793,6 +793,9 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
bool is_sb = opt->get_sb || opt->get_member;
bool changed = false;
+ if (id == Opt_durability)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
if (is_sb) {
changed = bch2_opt_set_sb(c, ca, opt, v);
} else if (!ca) {
@@ -806,7 +809,6 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
if (!ca)
bch2_opt_set_by_id(&c->opts, id, v);
- /* XXX: add a superblock bit to make this transactional */
if (id == Opt_durability)
bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
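
Both the super.c and sysfs.c hunks add a bch2_set_rebalance_needs_scan_device() call ahead of the persistent update (and drop the old "XXX: add a superblock bit to make this transactional" comments), presumably so a crash between the two steps leaves a spurious but harmless pending scan rather than a changed device with no scan queued. A stub sketch of that intent-first ordering (both functions are invented stand-ins):

static void queue_rebalance_scan(unsigned dev_idx)	{ /* persist scan cookie */ }
static void write_new_member_state(unsigned dev_idx)	{ /* superblock update */ }

static void set_device_failed(unsigned dev_idx)
{
	queue_rebalance_scan(dev_idx);		/* record intent first */
	write_new_member_state(dev_idx);	/* then the durable change */
}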