summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/bcachefs_format.h3
-rw-r--r--fs/bcachefs/checksum.h11
-rw-r--r--fs/bcachefs/data_update.c28
-rw-r--r--fs/bcachefs/disk_accounting.c3
-rw-r--r--fs/bcachefs/disk_accounting_format.h8
-rw-r--r--fs/bcachefs/ec.c2
-rw-r--r--fs/bcachefs/extents.c21
-rw-r--r--fs/bcachefs/io_write.c4
-rw-r--r--fs/bcachefs/migrate.c2
-rw-r--r--fs/bcachefs/opts.c8
-rw-r--r--fs/bcachefs/opts.h1
-rw-r--r--fs/bcachefs/rebalance.c665
-rw-r--r--fs/bcachefs/rebalance.h52
-rw-r--r--fs/bcachefs/rebalance_format.h63
-rw-r--r--fs/bcachefs/sb-downgrade.c5
-rw-r--r--fs/bcachefs/sb-errors_format.h6
-rw-r--r--fs/bcachefs/super.c4
-rw-r--r--fs/bcachefs/sysfs.c4
-rw-r--r--fs/bcachefs/trace.h5
19 files changed, 650 insertions, 245 deletions
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d29bd684b137..090f11e122ad 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -707,7 +707,8 @@ struct bch_sb_field_ext {
x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
x(31bit_dirent_offset, BCH_VERSION(1, 30)) \
- x(btree_node_accounting, BCH_VERSION(1, 31))
+ x(btree_node_accounting, BCH_VERSION(1, 31)) \
+ x(rebalance_v2, BCH_VERSION(1, 32))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 10bfadcde80a..362846d5bb87 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -143,6 +143,17 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
return bch2_csum_opt_to_type(opts.data_checksum, true);
}
+static inline enum bch_csum_type bch2_data_checksum_type_rb(struct bch_fs *c,
+ struct bch_extent_rebalance opts)
+{ /* Checksum type for data governed by the rebalance options @opts on filesystem @c. */
+ if (c->sb.encryption_type) /* encryption enabled: a chacha20_poly1305 type is returned and opts.data_checksum is not consulted */
+ return c->opts.wide_macs
+ ? BCH_CSUM_chacha20_poly1305_128
+ : BCH_CSUM_chacha20_poly1305_80;
+
+ return bch2_csum_opt_to_type(opts.data_checksum, true); /* otherwise map the data_checksum option to a checksum type */
+}
+
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{
if (c->sb.encryption_type)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 155c1ad42fc1..6333af6adbae 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -208,28 +208,6 @@ static void trace_data_update2(struct data_update *m,
}
noinline_for_stack
-static void trace_io_move_created_rebalance2(struct data_update *m,
- struct bkey_s_c old, struct bkey_s_c k,
- struct bkey_i *insert)
-{
- struct bch_fs *c = m->op.c;
- CLASS(printbuf, buf)();
-
- bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_io_move_created_rebalance(c, buf.buf);
-
- count_event(c, io_move_created_rebalance);
-}
-
-noinline_for_stack
static int data_update_invalid_bkey(struct data_update *m,
struct bkey_s_c old, struct bkey_s_c k,
struct bkey_i *insert)
@@ -438,7 +416,7 @@ restart_drop_extra_replicas:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?:
- bch2_bkey_set_needs_rebalance(c, &opts, insert,
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, insert,
SET_NEEDS_REBALANCE_foreground,
m->op.opts.change_cookie) ?:
bch2_trans_update(trans, &iter, insert,
@@ -449,10 +427,6 @@ restart_drop_extra_replicas:
if (trace_data_update_enabled())
trace_data_update2(m, old, k, insert);
- if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
- bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
- trace_io_move_created_rebalance2(m, old, k, insert);
-
ret = bch2_trans_commit(trans, &op->res,
NULL,
BCH_TRANS_COMMIT_no_check_rw|
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index a99f821c6a1c..9da26e11446b 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -282,6 +282,9 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po
prt_str(out, "btree=");
bch2_btree_id_to_text(out, k->btree.id);
break;
+ case BCH_DISK_ACCOUNTING_rebalance_work_v2:
+ bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type);
+ break;
}
}
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index 730a17ea4243..0b61d6100180 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -110,7 +110,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
x(snapshot, 5, 1) \
x(btree, 6, 3) \
x(rebalance_work, 7, 1) \
- x(inum, 8, 3)
+ x(inum, 8, 3) \
+ x(rebalance_work_v2, 9, 1) \
enum disk_accounting_type {
#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
@@ -210,6 +211,10 @@ struct bch_acct_inum {
struct bch_acct_rebalance_work {
};
+struct bch_acct_rebalance_work_v2 {
+ __u8 type; /* selects which rebalance accounting counter this key refers to; printed via bch2_prt_rebalance_accounting_type() */
+};
+
struct disk_accounting_pos {
union {
struct {
@@ -224,6 +229,7 @@ struct disk_accounting_pos {
struct bch_acct_btree btree;
struct bch_acct_rebalance_work rebalance_work;
struct bch_acct_inum inum;
+ struct bch_acct_rebalance_work_v2 rebalance_work_v2;
} __packed;
} __packed;
struct bpos _pad;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 89a95b6c4e51..103719a76c81 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1134,7 +1134,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
ret = bch2_extent_get_io_opts_one(trans, &opts, &iter, bkey_i_to_s_c(n),
SET_NEEDS_REBALANCE_other) ?:
- bch2_bkey_set_needs_rebalance(trans->c, &opts, n,
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n,
SET_NEEDS_REBALANCE_other, 0) ?:
bch2_trans_update(trans, &iter, n, 0);
out:
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 3274ba42c995..c534b009bf60 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1522,24 +1522,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
"redundant stripe entry");
have_ec = true;
break;
- case BCH_EXTENT_ENTRY_rebalance: {
- /*
- * this shouldn't be a fsck error, for forward
- * compatibility; the rebalance code should just refetch
- * the compression opt if it's unknown
- */
-#if 0
- const struct bch_extent_rebalance *r = &entry->rebalance;
-
- if (!bch2_compression_opt_valid(r->compression)) {
- union bch_compression_opt opt = { .value = r->compression };
- prt_printf(err, "invalid compression opt %u:%u",
- opt.type, opt.level);
- return bch_err_throw(c, invalid_bkey);
- }
-#endif
+ case BCH_EXTENT_ENTRY_rebalance:
+ ret = bch2_extent_rebalance_validate(c, k, from, &entry->rebalance);
+ if (ret)
+ return ret;
break;
- }
case BCH_EXTENT_ENTRY_flags:
bkey_fsck_err_on(entry != ptrs.start,
c, extent_flags_not_at_start,
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 6a5da02ce266..ccbca802db0b 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -365,7 +365,7 @@ int bch2_extent_update(struct btree_trans *trans,
min(k->k.p.offset << 9, new_i_size),
i_sectors_delta, &inode) ?:
(bch2_inode_opts_get_inode(c, &inode, &opts),
- bch2_bkey_set_needs_rebalance(c, &opts, k,
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, k,
SET_NEEDS_REBALANCE_foreground,
change_cookie)) ?:
bch2_trans_update(trans, iter, k, 0) ?:
@@ -1271,7 +1271,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return bch2_extent_update_i_size_sectors(trans, iter,
min(new->k.p.offset << 9, new_i_size), 0, &inode) ?:
(bch2_inode_opts_get_inode(c, &inode, &opts),
- bch2_bkey_set_needs_rebalance(c, &opts, new,
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new,
SET_NEEDS_REBALANCE_foreground,
op->opts.change_cookie)) ?:
bch2_trans_update(trans, iter, new,
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 139a6587a64e..9b172af4f8c8 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -84,7 +84,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
struct bch_inode_opts opts;
ret = bch2_extent_get_apply_io_opts_one(trans, &opts, iter, k, ctx) ?:
- bch2_bkey_set_needs_rebalance(c, &opts, n, ctx, 0) ?:
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, ctx, 0) ?:
drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false);
if (ret)
return ret;
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index bd5faafc9aa7..365cce4a6b49 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -103,6 +103,13 @@ static const char * const __bch2_fs_usage_types[] = {
#undef x
+static const char * const __bch2_rebalance_accounting_types[] = { /* NULL-terminated name table: one stringified name per BCH_REBALANCE_ACCOUNTING() entry */
+#define x(n) #n,
+ BCH_REBALANCE_ACCOUNTING()
+#undef x
+ NULL
+};
+
static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
unsigned nr, const char *type, unsigned idx)
{
@@ -125,6 +132,7 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
+PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type);
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 6b9f18839345..de1ac235e929 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -34,6 +34,7 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
+void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type);
static inline const char *bch2_d_type_str(unsigned d_type)
{
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 67d6a90e86ef..27282616bcb0 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
@@ -25,8 +26,29 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
+
/* bch_extent_rebalance: */
+int bch2_extent_rebalance_validate(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bkey_validate_context from,
+ const struct bch_extent_rebalance *r)
+{ /* Flag consistency checks for the rebalance entry @r; returns ret, which stays 0 unless a check below changes it (presumably bkey_fsck_err_on sets ret and jumps to fsck_err - confirm against the macro). */
+ int ret = 0;
+
+ bkey_fsck_err_on(r->pending && !(r->need_rb & BIT(BCH_REBALANCE_background_target)), /* pending is only accepted together with the background_target bit in need_rb */
+ c, extent_rebalance_bad_pending,
+ "pending incorrectly set");
+
+ bkey_fsck_err_on(r->hipri && !(r->need_rb & BIT(BCH_REBALANCE_data_replicas)), /* hipri is only accepted together with the data_replicas bit in need_rb */
+ c, extent_rebalance_bad_pending, /* NOTE(review): same error id as the pending check above although the message is about hipri - confirm whether a dedicated id was intended */
+ "hipri incorrectly set");
+
+fsck_err:
+ return ret;
+}
+
static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
const union bch_extent_entry *entry;
@@ -38,15 +60,30 @@ static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct b
return NULL;
}
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}
+static const char * const rebalance_opts[] = { /* NULL-terminated names of the BCH_REBALANCE_OPTS() entries; passed to prt_bitflags() to print the need_rb mask */
+#define x(n) #n,
+ BCH_REBALANCE_OPTS()
+#undef x
+ NULL
+};
+
void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
const struct bch_extent_rebalance *r)
{
- prt_printf(out, "replicas=%u", r->data_replicas);
+ prt_str(out, "need_rb=");
+ prt_bitflags(out, rebalance_opts, r->need_rb);
+
+ if (r->hipri)
+ prt_str(out, " hipri");
+ if (r->pending)
+ prt_str(out, " pending");
+
+ prt_printf(out, " replicas=%u", r->data_replicas);
if (r->data_replicas_from_inode)
prt_str(out, " (inode)");
@@ -92,32 +129,54 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-int bch2_trigger_extent_rebalance(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta[1] = { 0 };
+/*
+ * XXX: check in bkey_validate that if r->hipri or r->pending are set,
+ * r->data_replicas are also set
+ */
- s64 s = bch2_bkey_sectors_need_rebalance(c, old);
- need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta[0] -= s;
+static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance *r)
+{
+ if (!r)
+ return 0;
+ unsigned ret = r->need_rb;
- s = bch2_bkey_sectors_need_rebalance(c, new);
- need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta[0] += s;
+ if (r->hipri)
+ ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority);
+ if (r->pending) {
+ ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending);
+ ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_background_target);
+ }
+ return ret;
+}
- if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
+int __bch2_trigger_extent_rebalance(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned old_r, unsigned new_r,
+ enum btree_iter_update_trigger_flags flags)
+{
+ int delta = (int) !!new_r - (int) !!old_r;
+ if ((flags & BTREE_TRIGGER_transactional) && delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, need_rebalance_delta > 0);
+ new.k->p, delta > 0);
if (ret)
return ret;
}
- if (need_rebalance_sectors_delta[0]) {
+ delta = old.k->size == new.k->size
+ ? old_r ^ new_r
+ : old_r | new_r;
+ while (delta) {
+ unsigned c = __ffs(delta);
+ delta ^= BIT(c);
+
+ s64 v[1] = { 0 };
+ if (old_r & BIT(c))
+ v[0] -= (s64) old.k->size;
+ if (new_r & BIT(c))
+ v[0] += (s64) new.k->size;
+
int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- need_rebalance_sectors_delta, rebalance_work);
+ v, rebalance_work_v2, c);
if (ret)
return ret;
}
@@ -125,39 +184,48 @@ int bch2_trigger_extent_rebalance(struct btree_trans *trans,
return 0;
}
-static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
- struct bch_inode_opts *io_opts,
- unsigned *move_ptrs,
- unsigned *compress_ptrs,
- u64 *sectors)
+static struct bch_extent_rebalance
+bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_inode_opts *opts,
+ unsigned *move_ptrs,
+ unsigned *compress_ptrs,
+ unsigned *csum_ptrs,
+ bool may_update_indirect)
{
*move_ptrs = 0;
*compress_ptrs = 0;
- *sectors = 0;
+ *csum_ptrs = 0;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
- if (!io_opts && !rb_opts)
- return;
+ struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance) };
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return;
+ return r;
+
+ const struct bch_extent_rebalance *old_r = bch2_bkey_ptrs_rebalance_opts(ptrs);
+ if (old_r) {
+ r = *old_r;
+ r.need_rb = 0;
+ }
+
+#define x(_name) \
+ if (k.k->type != KEY_TYPE_reflink_v || \
+ may_update_indirect || \
+ (!opts->_name##_from_inode && !r._name##_from_inode)) { \
+ r._name = opts->_name; \
+ r._name##_from_inode = opts->_name##_from_inode; \
+ }
+ BCH_REBALANCE_OPTS()
+#undef x
+
+ unsigned compression_type = bch2_compression_opt_to_type(r.background_compression);
+ unsigned csum_type = bch2_data_checksum_type_rb(c, r);
- unsigned compression_type =
- bch2_compression_opt_to_type(io_opts
- ? io_opts->background_compression
- : rb_opts->background_compression);
- unsigned target = io_opts
- ? io_opts->background_target
- : rb_opts->background_target;
- if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target))
- target = 0;
+ bool incompressible = false, unwritten = false, ec = false;
+ unsigned durability = 0, min_durability = INT_MAX;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- bool incompressible = false, unwritten = false;
-
unsigned ptr_idx = 1;
guard(rcu)();
@@ -166,102 +234,264 @@ static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
unwritten |= p.ptr.unwritten;
if (!p.ptr.cached) {
- if (p.crc.compression_type != compression_type)
+ if (p.crc.compression_type != compression_type) {
*compress_ptrs |= ptr_idx;
+ r.need_rb |= BIT(BCH_REBALANCE_background_compression);
+ }
+
+ if (p.crc.csum_type != csum_type) {
+ *csum_ptrs |= ptr_idx;
+ r.need_rb |= BIT(BCH_REBALANCE_data_checksum);
+ }
- if (target && !bch2_dev_in_target(c, p.ptr.dev, target))
+ if (r.background_target &&
+ !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) {
*move_ptrs |= ptr_idx;
+ r.need_rb |= BIT(BCH_REBALANCE_background_target);
+ }
+
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+ durability += d;
+ min_durability = min(min_durability, d);
+
+ ec |= p.has_ec;
}
ptr_idx <<= 1;
}
- if (unwritten)
- *compress_ptrs = 0;
- if (incompressible)
+ if (unwritten || incompressible) {
*compress_ptrs = 0;
+ r.need_rb &= ~BIT(BCH_REBALANCE_background_compression);
+ }
+
+ if (unwritten) { /* unwritten extent: no checksum rewrite is requested */
+ *csum_ptrs = 0;
+ r.need_rb &= ~BIT(BCH_REBALANCE_data_checksum); /* bitwise ~, as for background_compression above: !BIT() is 0 and would clear every need_rb bit */
+ }
- unsigned rb_ptrs = *move_ptrs | *compress_ptrs;
+ if (durability < r.data_replicas || durability >= r.data_replicas + min_durability)
+ r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
+ return r;
+}
- if (!rb_ptrs)
- return;
+static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, bool *v)
+{
+ if (v && *v)
+ return 1;
- ptr_idx = 1;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (rb_ptrs & ptr_idx)
- *sectors += p.crc.compressed_size;
- ptr_idx <<= 1;
- }
+ /*
+ * If opts need to be propagated to the extent, a scan cookie should be
+ * present:
+ */
+ CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = k.k->type == KEY_TYPE_cookie;
+ if (v)
+ *v = ret;
+ return ret;
}
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bkey_s_c k,
+ struct bch_devs_mask *v)
{
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- u64 sectors = 0;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, &sectors);
- return sectors;
+ bkey_for_each_ptr(ptrs, ptr)
+ if (v && test_bit(ptr->dev, v->d))
+ return 1;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ int ret = check_rebalance_scan_cookie(trans, ptr->dev + 1, NULL);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+ if (v)
+ __set_bit(ptr->dev, v->d);
+ return ret;
+ }
+ }
+
+ return 0;
}
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
- struct bch_inode_opts *opts,
- struct bkey_s_c k)
+static int new_needs_rb_allowed(struct btree_trans *trans,
+ struct per_snapshot_io_opts *s,
+ struct bkey_s_c k,
+ enum set_needs_rebalance_ctx ctx,
+ unsigned opt_change_cookie,
+ const struct bch_extent_rebalance *old,
+ const struct bch_extent_rebalance *new,
+ unsigned new_need_rb)
{
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- u64 sectors = 0;
+ struct bch_fs *c = trans->c;
+ /*
+ * New need_rb - pointers that don't match the current io path options -
+ * are only allowed in certain situations:
+ *
+ * Propagating new options: from bch2_set_rebalance_needs_scan
+ *
+ * Foreground writes: background_compression and background_target are
+ * allowed
+ *
+ * Foreground writes: we may have raced with an option change:
+ * opt_change_cookie checks for this
+ *
+ * XXX: foreground writes should still match compression,
+ * foreground_target - figure out how to check for this
+ */
+ if (ctx == SET_NEEDS_REBALANCE_opt_change ||
+ ctx == SET_NEEDS_REBALANCE_opt_change_indirect)
+ return 0;
+
+ if (ctx == SET_NEEDS_REBALANCE_foreground) {
+ new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)|
+ BIT(BCH_REBALANCE_background_target));
+
+ /*
+ * Foreground writes might end up degraded when a device is
+ * getting yanked:
+ *
+ * XXX: this is something we need to fix, but adding retries to
+ * the write path is something we have to do carefully.
+ */
+ new_need_rb &= ~BIT(BCH_REBALANCE_data_replicas);
+ if (!new_need_rb)
+ return 0;
+
+ if (opt_change_cookie != atomic_read(&c->opt_change_cookie))
+ return 0;
+ }
- bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
- return move_ptrs|compress_ptrs;
+ /*
+ * Either the extent data or the extent io options (from
+ * bch_extent_rebalance) should match the io_opts from the
+ * inode/filesystem, unless
+ *
+ * - There's a scan pending to propagate new options
+ * - It's an indirect extent: it may be referenced by inodes
+ * with inconsistent options
+ *
+ * For efficiency (so that we can cache checking for scan
+ * cookies), only check option consistency when we're called
+ * with snapshot_io_opts - don't bother when we're called from
+ * move_data_phys() -> get_io_opts_one()
+ *
+ * Note that we can cache the existence of a cookie, but not the
+ * non-existence, to avoid spurious false positives.
+ */
+ int ret = check_rebalance_scan_cookie(trans, 0, s ? &s->fs_scan_cookie : NULL) ?:
+ check_rebalance_scan_cookie(trans, k.k->p.inode, s ? &s->inum_scan_cookie : NULL);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ if (new_need_rb == BIT(BCH_REBALANCE_data_replicas)) {
+ ret = check_dev_rebalance_scan_cookie(trans, k, s ? &s->dev_cookie : NULL);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+
+ CLASS(printbuf, buf)();
+
+ prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ const struct bch_extent_rebalance _old = {};
+ if (!old)
+ old = &_old;
+
+#define x(_name) \
+ if (new_need_rb & BIT(BCH_REBALANCE_##_name)) \
+ prt_printf(&buf, "\n" #_name " %u != %u", old->_name, new->_name);
+ BCH_REBALANCE_OPTS()
+#undef x
+
+ fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf);
+fsck_err:
+ return ret;
}
-static inline bool bkey_should_have_rb_opts(struct bch_fs *c,
- struct bch_inode_opts *opts,
- struct bkey_s_c k)
+static inline bool bkey_should_have_rb_opts(struct bkey_s_c k,
+ struct bch_extent_rebalance new)
{
if (k.k->type == KEY_TYPE_reflink_v) {
-#define x(n) if (opts->n##_from_inode) return true;
+#define x(n) if (new.n##_from_inode) return true;
BCH_REBALANCE_OPTS()
#undef x
}
- return bch2_bkey_ptrs_need_rebalance(c, opts, k);
+ return new.need_rb;
}
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
+int bch2_bkey_set_needs_rebalance(struct btree_trans *trans,
+ struct per_snapshot_io_opts *snapshot_io_opts,
+ struct bch_inode_opts *opts,
struct bkey_i *_k,
enum set_needs_rebalance_ctx ctx,
- u32 change_cookie)
+ u32 opt_change_cookie)
{
if (!bkey_extent_is_direct_data(&_k->k))
return 0;
+ struct bch_fs *c = trans->c;
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *old =
(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
- if (bkey_should_have_rb_opts(c, opts, k.s_c)) {
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ unsigned csum_ptrs = 0;
+ struct bch_extent_rebalance new =
+ bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, &compress_ptrs, &csum_ptrs,
+ ctx == SET_NEEDS_REBALANCE_opt_change_indirect);
+
+ bool should_have_rb = bkey_should_have_rb_opts(k.s_c, new);
+
+ if (should_have_rb == !!old &&
+ (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old))
+ return 0;
+
+ unsigned new_need_rb = new.need_rb & ~(old ? old->need_rb : 0);
+
+ if (unlikely(new_need_rb)) {
+ int ret = new_needs_rb_allowed(trans, snapshot_io_opts,
+ k.s_c, ctx, opt_change_cookie,
+ old, &new, new_need_rb);
+ if (ret)
+ return ret;
+ }
+
+ if (should_have_rb) {
if (!old) {
old = bkey_val_end(k);
k.k->u64s += sizeof(*old) / sizeof(u64);
}
- *old = io_opts_to_rebalance_opts(c, opts);
- } else {
- if (old)
- extent_entry_drop(k, (union bch_extent_entry *) old);
- }
+ *old = new;
+ } else if (old)
+ extent_entry_drop(k, (union bch_extent_entry *) old);
return 0;
}
static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *snapshot_io_opts,
struct bch_inode_opts *io_opts,
struct btree_iter *iter,
struct bkey_s_c k,
enum set_needs_rebalance_ctx ctx)
{
struct bch_fs *c = trans->c;
+ int ret = 0;
BUG_ON(iter->flags & BTREE_ITER_is_extents);
BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
@@ -269,36 +499,24 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
if (!bkey_extent_is_direct_data(k.k))
return 0;
- bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect;
+ struct bch_extent_rebalance *old =
+ (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k);
- /*
- * If it's an indirect extent, and we walked to it directly, we won't
- * have the options from the inode that were directly applied: options
- * from the extent take precedence - unless the io_opts option came from
- * the inode and may_update_indirect is true (walked from a
- * REFLINK_P_MAY_UPDATE_OPTIONS pointer).
- */
- const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
- if (old && k.k->type == KEY_TYPE_reflink_v) {
-#define x(_name) \
- if (old->_name##_from_inode && \
- !(may_update_indirect && io_opts->_name##_from_inode)) { \
- io_opts->_name = old->_name; \
- io_opts->_name##_from_inode = true; \
- }
- BCH_REBALANCE_OPTS()
-#undef x
- }
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ unsigned csum_ptrs = 0;
+ struct bch_extent_rebalance new =
+ bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, &compress_ptrs, &csum_ptrs,
+ ctx == SET_NEEDS_REBALANCE_opt_change_indirect);
- struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts);
+ bool should_have_rb = bkey_should_have_rb_opts(k, new);
- if (bkey_should_have_rb_opts(c, io_opts, k)
- ? old && !memcmp(old, &new, sizeof(new))
- : !old)
+ if (should_have_rb == !!old &&
+ (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old))
return 0;
struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
- int ret = PTR_ERR_OR_ZERO(n);
+ ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
@@ -306,7 +524,7 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
/* On successfull transaction commit, @k was invalidated: */
- return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?:
+ return bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, io_opts, n, ctx, 0) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, 0) ?:
bch_err_throw(c, transaction_restart_commit);
@@ -349,7 +567,8 @@ static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans,
darray_push(&io_opts->d, e);
}));
- io_opts->cur_inum = extent_pos.inode;
+ io_opts->cur_inum = extent_pos.inode;
+ io_opts->inum_scan_cookie = false;
}
ret = ret ?: trans_was_restarted(trans, restart_count);
@@ -372,11 +591,13 @@ struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans,
enum set_needs_rebalance_ctx ctx)
{
struct bch_inode_opts *opts =
- bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k);
+ bch2_extent_get_io_opts(trans, snapshot_io_opts,
+ extent_pos, extent_iter, extent_k);
if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level)
return opts;
- int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx);
+ int ret = bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts,
+ extent_iter, extent_k, ctx);
return ret ? ERR_PTR(ret) : opts;
}
@@ -420,11 +641,9 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans,
if (ret || btree_iter_path(trans, extent_iter)->level)
return ret;
- return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx);
+ return bch2_get_update_rebalance_opts(trans, NULL, io_opts, extent_iter, extent_k, ctx);
}
-#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
-
static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
BCH_REBALANCE_STATES()
@@ -467,6 +686,11 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
return ret;
}
+int bch2_set_rebalance_needs_scan_device(struct bch_fs *c, unsigned dev)
+{ /* Request a rebalance scan for device index @dev; returns the result of bch2_set_rebalance_needs_scan(). */
+ return bch2_set_rebalance_needs_scan(c, dev + 1); /* device scans are keyed at dev + 1; 0 is what bch2_set_fs_needs_rebalance() passes */
+}
+
int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
return bch2_set_rebalance_needs_scan(c, 0);
@@ -535,23 +759,6 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
return &(&darray_pop(buf))->k_i;
}
-static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
- return 0;
-
- struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- extent_entry_drop(bkey_i_to_s(n),
- (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct per_snapshot_io_opts *snapshot_io_opts,
struct bpos work_pos,
@@ -570,6 +777,10 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
if (bkey_err(k))
return k;
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+ if (!r || !r->need_rb) /* Write buffer race? */
+ return bkey_s_c_null;
+
struct bch_inode_opts *opts =
bch2_extent_get_apply_io_opts(trans, snapshot_io_opts,
extent_iter->pos, extent_iter, k,
@@ -580,22 +791,66 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
*opts_ret = opts;
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ unsigned csum_ptrs = 0;
+ bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, false);
+
memset(data_opts, 0, sizeof(*data_opts));
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k);
+ data_opts->rewrite_ptrs = move_ptrs|compress_ptrs|csum_ptrs;
data_opts->target = opts->background_target;
data_opts->write_flags |= BCH_WRITE_only_specified_devs;
- if (!data_opts->rewrite_ptrs) {
- /*
- * device we would want to write to offline? devices in target
- * changed?
- *
- * We'll now need a full scan before this extent is picked up
- * again:
- */
- int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
- if (ret)
- return bkey_s_c_err(ret);
+ if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) {
+ unsigned durability = bch2_bkey_durability(c, k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned ptr_bit = 1;
+
+ guard(rcu)();
+ if (durability <= opts->data_replicas) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (ca && !ptr->cached && !ca->mi.durability)
+ data_opts->kill_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+
+ data_opts->extra_replicas = opts->data_replicas - durability;
+ } else {
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+
+ if (d && durability - d >= opts->data_replicas) {
+ data_opts->kill_ptrs |= ptr_bit;
+ durability -= d;
+ }
+
+ ptr_bit <<= 1;
+ }
+
+ ptr_bit = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.has_ec && durability - p.ec.redundancy >= opts->data_replicas) {
+ data_opts->kill_ec_ptrs |= ptr_bit;
+ durability -= p.ec.redundancy;
+ }
+
+ ptr_bit <<= 1;
+ }
+ }
+ }
+
+ if (!data_opts->rewrite_ptrs &&
+ !data_opts->kill_ptrs &&
+ !data_opts->kill_ec_ptrs &&
+ !data_opts->extra_replicas) {
+ CLASS(printbuf, buf)();
+ prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "%s", buf.buf);
return bkey_s_c_null;
}
@@ -605,12 +860,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- u64 sectors = 0;
-
- bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
-
if (move_ptrs) {
prt_str(&buf, "move=");
bch2_target_to_text(&buf, c, opts->background_target);
@@ -627,6 +876,14 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
prt_newline(&buf);
}
+ if (csum_ptrs) {
+ prt_str(&buf, "csum=");
+ bch2_prt_csum_opt(&buf, opts->data_checksum);
+ prt_str(&buf, " ");
+ bch2_prt_u64_base2(&buf, csum_ptrs);
+ prt_newline(&buf);
+ }
+
trace_rebalance_extent(c, buf.buf);
}
count_event(c, rebalance_extent);
@@ -688,8 +945,75 @@ out:
return ret;
}
+static int do_rebalance_scan_bp(struct btree_trans *trans, /* handles one backpointer for do_rebalance_scan_device() */
+ struct bkey_s_c_backpointer bp,
+ struct bkey_buf *last_flushed)
+{
+ if (bp.v->level) /* metadata not supported yet */
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
+ last_flushed);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!k.k) /* lookup produced no key: treated as success, nothing to do */
+ return 0;
+
+ struct bch_inode_opts io_opts; /* filled by the call below but never read here; the call is made for its effects */
+ ret = bch2_extent_get_io_opts_one(trans, &io_opts, &iter, k,
+ SET_NEEDS_REBALANCE_opt_change); /* NOTE(review): presumably refreshes the extent's rebalance options - confirm in callee */
+ bch2_trans_iter_exit(&iter); /* runs whether or not the call above failed */
+ return ret;
+}
+
+static int do_rebalance_scan_device(struct moving_context *ctxt, /* walks every backpointer of one device, then clears that device's needs-scan entry */
+ unsigned dev, u64 cookie,
+ u64 *sectors_scanned)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ bch2_btree_write_buffer_flush_sync(trans); /* return value is not checked */
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
+ POS(dev, 0), POS(dev, U64_MAX),
+ BTREE_ITER_prefetch, k, ({
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); /* record the current scan position in the move stats */
+
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue; /* skip keys that are not backpointers */
+
+ do_rebalance_scan_bp(trans, bkey_s_c_to_backpointer(k), &last_flushed); /* last expression: its result is the value of the loop body */
+ })) ?: /* GNU ?: - the commit below only runs when the scan returned 0 */
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_rebalance_needs_scan(trans, dev + 1, cookie)); /* dev + 1 mirrors do_rebalance_scan(), which decodes dev as inum - 1 */
+
+ *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen); /* from here on everything runs even when ret != 0 */
+ /*
+ * Ensure that the rebalance_work entries we created are seen by the
+ * next iteration of do_rebalance(), so we don't end up stuck in
+ * rebalance_wait():
+ */
+ *sectors_scanned += 1;
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ bch2_btree_write_buffer_flush_sync(trans); /* return value is not checked */
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ return ret; /* result of the scan, or of the commit if the scan succeeded */
+}
+
static int do_rebalance_scan_indirect(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
+ struct per_snapshot_io_opts *snapshot_io_opts,
struct bch_inode_opts *opts)
{
u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
@@ -702,7 +1026,7 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans,
BTREE_ITER_not_extents, k, ({
if (bpos_ge(bkey_start_pos(k.k), POS(0, end)))
break;
- bch2_get_update_rebalance_opts(trans, opts, &iter, k,
+ bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, &iter, k,
SET_NEEDS_REBALANCE_opt_change_indirect);
}));
if (ret)
@@ -724,15 +1048,21 @@ static int do_rebalance_scan(struct moving_context *ctxt,
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
ctxt->stats = &r->scan_stats;
+ r->state = BCH_REBALANCE_scanning;
+
if (!inum) {
r->scan_start = BBPOS_MIN;
r->scan_end = BBPOS_MAX;
- } else {
+ } else if (inum >= BCACHEFS_ROOT_INO) {
r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
- }
+ } else {
+ unsigned dev = inum - 1;
+ r->scan_start = BBPOS(BTREE_ID_backpointers, POS(dev, 0));
+ r->scan_end = BBPOS(BTREE_ID_backpointers, POS(dev, U64_MAX));
- r->state = BCH_REBALANCE_scanning;
+ return do_rebalance_scan_device(ctxt, inum - 1, cookie, sectors_scanned);
+ }
int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
r->scan_start.pos, r->scan_end.pos,
@@ -750,7 +1080,8 @@ static int do_rebalance_scan(struct moving_context *ctxt,
(inum &&
k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)
- ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts)
+ ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k),
+ snapshot_io_opts, opts)
: 0);
}));
if (ret)
@@ -1049,6 +1380,7 @@ int bch2_fs_rebalance_init(struct bch_fs *c)
static int check_rebalance_work_one(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct btree_iter *rebalance_iter,
+ struct per_snapshot_io_opts *snapshot_io_opts,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
@@ -1089,8 +1421,7 @@ static int check_rebalance_work_one(struct btree_trans *trans,
extent_k.k = &deleted;
}
- bool should_have_rebalance =
- bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
+ bool should_have_rebalance = bch2_bkey_needs_rb(extent_k);
bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
if (should_have_rebalance != have_rebalance) {
@@ -1119,6 +1450,21 @@ static int check_rebalance_work_one(struct btree_trans *trans,
return ret;
}
+ struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans,
+ snapshot_io_opts, extent_iter->pos, extent_iter, extent_k,
+ SET_NEEDS_REBALANCE_other);
+ ret = PTR_ERR_OR_ZERO(opts);
+ if (ret == -BCH_ERR_transaction_restart_commit) {
+ /*
+ * If get_apply_io_opts() did work, just advance and check the
+ * next key; it may have updated the rebalance_work btree so
+ * we'd need a write buffer flush to check what it just did.
+ */
+ ret = 0;
+ }
+ if (ret)
+ return ret;
+
if (cmp <= 0)
bch2_btree_iter_advance(extent_iter);
if (cmp >= 0)
@@ -1131,10 +1477,14 @@ int bch2_check_rebalance_work(struct bch_fs *c)
{
CLASS(btree_trans, trans)(c);
CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_not_extents|
BTREE_ITER_prefetch);
CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN,
BTREE_ITER_prefetch);
+ struct per_snapshot_io_opts snapshot_io_opts;
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
+
struct bkey_buf last_flushed;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
@@ -1148,12 +1498,15 @@ int bch2_check_rebalance_work(struct bch_fs *c)
bch2_trans_begin(trans);
- ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
+ ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter,
+ &snapshot_io_opts, &last_flushed) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
}
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
bch2_bkey_buf_exit(&last_flushed, c);
return ret < 0 ? ret : 0;
}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 24bafa42f070..e6dd9a7db26c 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -7,10 +7,14 @@
#include "opts.h"
#include "rebalance_types.h"
+int bch2_extent_rebalance_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context,
+ const struct bch_extent_rebalance *);
+
static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
struct bch_inode_opts *opts)
{
- struct bch_extent_rebalance r = {
+ return (struct bch_extent_rebalance) {
.type = BIT(BCH_EXTENT_ENTRY_rebalance),
#define x(_name) \
._name = opts->_name, \
@@ -18,22 +22,36 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f
BCH_REBALANCE_OPTS()
#undef x
};
-
- if (r.background_target &&
- !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
- r.background_target = 0;
-
- return r;
};
void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *,
const struct bch_extent_rebalance *);
-int bch2_trigger_extent_rebalance(struct btree_trans *,
- struct bkey_s_c, struct bkey_s_c,
- enum btree_iter_update_trigger_flags);
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+
+static inline int bch2_bkey_needs_rb(struct bkey_s_c k) /* need_rb bits of k's rebalance entry; 0 when k has no such entry */
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); /* NULL is handled below as "no rebalance entry" */
+ return r ? r->need_rb : 0;
+}
+
+int __bch2_trigger_extent_rebalance(struct btree_trans *,
+ struct bkey_s_c, struct bkey_s_c,
+ unsigned, unsigned,
+ enum btree_iter_update_trigger_flags);
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
+static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans, /* inline filter in front of __bch2_trigger_extent_rebalance() */
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ unsigned old_r = bch2_bkey_needs_rb(old); /* need_rb bits before the update */
+ unsigned new_r = bch2_bkey_needs_rb(new); /* need_rb bits after the update */
+
+ return old_r != new_r || /* the whole || expression is the ternary condition: bits changed, or ... */
+ (old.k->size != new.k->size && (old_r|new_r)) /* ... size changed while either key has need_rb bits set */
+ ? __bch2_trigger_extent_rebalance(trans, old, new, old_r, new_r, flags)
+ : 0; /* otherwise return 0 without calling the out-of-line trigger */
+}
enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_opt_change,
@@ -42,9 +60,6 @@ enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_other,
};
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *,
- struct bkey_i *, enum set_needs_rebalance_ctx, u32);
-
/* Inodes in different snapshots may have different IO options: */
struct snapshot_io_opts_entry {
u32 snapshot;
@@ -53,6 +68,10 @@ struct snapshot_io_opts_entry {
struct per_snapshot_io_opts {
u64 cur_inum;
+ bool fs_scan_cookie;
+ bool inum_scan_cookie;
+ struct bch_devs_mask dev_cookie;
+
struct bch_inode_opts fs_io_opts;
DARRAY(struct snapshot_io_opts_entry) d;
};
@@ -68,6 +87,10 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt
darray_exit(&io_opts->d);
}
+int bch2_bkey_set_needs_rebalance(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bch_inode_opts *,
+ struct bkey_i *, enum set_needs_rebalance_ctx, u32);
+
struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *,
struct per_snapshot_io_opts *, struct bpos,
struct btree_iter *, struct bkey_s_c,
@@ -82,6 +105,7 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *, struct bch_inode_opt
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_rebalance_needs_scan_device(struct bch_fs *, unsigned);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void bch2_rebalance_wakeup(struct bch_fs *c)
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
index ff9a1342a22b..d7a5f899e789 100644
--- a/fs/bcachefs/rebalance_format.h
+++ b/fs/bcachefs/rebalance_format.h
@@ -5,49 +5,76 @@
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
- unused:3,
+ unused:5,
+ hipri:1,
+ pending:1,
+ need_rb:5, /* bitmask; tested elsewhere as need_rb & BIT(BCH_REBALANCE_data_replicas) */
- promote_target_from_inode:1,
- erasure_code_from_inode:1,
+ data_replicas_from_inode:1, /* the *_from_inode flags now follow BCH_REBALANCE_OPTS() order */
data_checksum_from_inode:1,
+ erasure_code_from_inode:1,
background_compression_from_inode:1,
- data_replicas_from_inode:1,
background_target_from_inode:1,
+ promote_target_from_inode:1,
- promote_target:16,
- erasure_code:1,
+ data_replicas:3, /* narrowed from 4 bits */
data_checksum:4,
- data_replicas:4,
+ erasure_code:1,
background_compression:8, /* enum bch_compression_opt */
- background_target:16;
+ background_target:12, /* narrowed from 16 bits */
+ promote_target:12; /* narrowed from 16 bits; widths of the new layout sum to 64 */
#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 background_target:16,
+ __u64 promote_target:12, /* same fields as the little-endian branch, listed in reverse order */
+ background_target:12,
background_compression:8,
- data_replicas:4,
- data_checksum:4,
erasure_code:1,
- promote_target:16,
+ data_checksum:4,
+ data_replicas:3,
+ promote_target_from_inode:1,
background_target_from_inode:1,
- data_replicas_from_inode:1,
background_compression_from_inode:1,
- data_checksum_from_inode:1,
erasure_code_from_inode:1,
- promote_target_from_inode:1,
+ data_checksum_from_inode:1,
+ data_replicas_from_inode:1,
- unused:3,
+ need_rb:5,
+ pending:1,
+ hipri:1,
+ unused:5,
type:6;
#endif
};
/* subset of BCH_INODE_OPTS */
#define BCH_REBALANCE_OPTS() \
+ x(data_replicas) \
x(data_checksum) \
+ x(erasure_code) \
x(background_compression) \
+ x(background_target) \
+ x(promote_target)
+
+enum bch_rebalance_opts { /* one BCH_REBALANCE_<opt> enumerator per BCH_REBALANCE_OPTS() entry, in list order starting at 0 */
+#define x(n) BCH_REBALANCE_##n,
+ BCH_REBALANCE_OPTS()
+#undef x
+}; /* NOTE(review): the list has 6 entries but need_rb is 5 bits wide - confirm promote_target is never used as a need_rb bit */
+
+#define BCH_REBALANCE_ACCOUNTING() \
x(data_replicas) \
- x(promote_target) \
+ x(data_checksum) \
+ x(erasure_code) \
+ x(background_compression) \
x(background_target) \
- x(erasure_code)
+ x(high_priority) \
+ x(pending) \
+
+enum bch_rebalance_accounting_type { /* one BCH_REBALANCE_ACCOUNTING_<name> enumerator per BCH_REBALANCE_ACCOUNTING() entry, in list order starting at 0 */
+#define x(n) BCH_REBALANCE_ACCOUNTING_##n,
+ BCH_REBALANCE_ACCOUNTING()
+#undef x
+};
#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index bfd06fd5d506..66b7f19f0437 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -107,7 +107,10 @@
BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\
x(btree_node_accounting, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch)
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(rebalance_v2, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \
+ BCH_FSCK_ERR_extent_io_opts_not_set)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 77e3fc92e39b..9ec2df6c8071 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -159,6 +159,8 @@ enum bch_fsck_flags {
x(extent_ptrs_redundant_stripe, 139, 0) \
x(extent_ptrs_unwritten, 140, 0) \
x(extent_ptrs_written_and_unwritten, 141, 0) \
+ x(extent_rebalance_bad_pending, 330, 0) \
+ x(extent_rebalance_bad_hipri, 331, 0) \
x(ptr_to_invalid_device, 142, 0) \
x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \
x(ptr_to_duplicate_device, 143, 0) \
@@ -339,7 +341,9 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 328, 0)
+ x(extent_io_opts_not_set, 328, FSCK_AUTOFIX) \
+ x(extent_io_opts_unneeded, 329, FSCK_AUTOFIX) \
+ x(MAX, 332, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 5cd308a68035..52c6823ae7a4 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1980,6 +1980,10 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
+ /* XXX: add a superblock bit to make this transactional */
+ if (new_state == BCH_MEMBER_STATE_failed)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
bch2_rebalance_wakeup(c);
return ret;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index ef6312c50f88..1f60854ab3ef 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -806,6 +806,10 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
if (!ca)
bch2_opt_set_by_id(&c->opts, id, v);
+ /* XXX: add a superblock bit to make this transactional */
+ if (id == Opt_durability)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
if (changed)
bch2_opt_hook_post_set(c, ca, 0, id, v);
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 6c312fd9a447..c5d7be2eba03 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1339,11 +1339,6 @@ DEFINE_EVENT(fs_str, io_move_pred,
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, io_move_created_rebalance,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)