Diffstat (limited to 'fs/bcachefs')
-rw-r--r--  fs/bcachefs/Kconfig                    |   7
-rw-r--r--  fs/bcachefs/Makefile                   |  16
-rw-r--r--  fs/bcachefs/acl.c                      |   4
-rw-r--r--  fs/bcachefs/acl.h                      |   4
-rw-r--r--  fs/bcachefs/bcachefs.h                 |   2
-rw-r--r--  fs/bcachefs/bcachefs_format.h          |   3
-rw-r--r--  fs/bcachefs/btree_gc.c                 |   6
-rw-r--r--  fs/bcachefs/btree_io.c                 |   1
-rw-r--r--  fs/bcachefs/btree_journal_iter.c       |   4
-rw-r--r--  fs/bcachefs/btree_node_scan.c          |   1
-rw-r--r--  fs/bcachefs/checksum.h                 |  11
-rw-r--r--  fs/bcachefs/clock_types.h              |   2
-rw-r--r--  fs/bcachefs/data_update.c              | 107
-rw-r--r--  fs/bcachefs/disk_accounting.c          | 207
-rw-r--r--  fs/bcachefs/disk_accounting_format.h   |   8
-rw-r--r--  fs/bcachefs/ec.c                       |  57
-rw-r--r--  fs/bcachefs/ec.h                       |   9
-rw-r--r--  fs/bcachefs/errcode.h                  |   6
-rw-r--r--  fs/bcachefs/extents.c                  |  21
-rw-r--r--  fs/bcachefs/fs.c                       |  12
-rw-r--r--  fs/bcachefs/inode.c                    |   4
-rw-r--r--  fs/bcachefs/io_write.c                 | 119
-rw-r--r--  fs/bcachefs/journal_reclaim.c          |   7
-rw-r--r--  fs/bcachefs/migrate.c                  |   2
-rw-r--r--  fs/bcachefs/nocow_locking.c            | 134
-rw-r--r--  fs/bcachefs/nocow_locking.h            |  26
-rw-r--r--  fs/bcachefs/nocow_locking_types.h      |   6
-rw-r--r--  fs/bcachefs/opts.c                     |  21
-rw-r--r--  fs/bcachefs/opts.h                     |  24
-rw-r--r--  fs/bcachefs/rebalance.c                | 712
-rw-r--r--  fs/bcachefs/rebalance.h                |  52
-rw-r--r--  fs/bcachefs/rebalance_format.h         |  63
-rw-r--r--  fs/bcachefs/replicas_format.h          |   3
-rw-r--r--  fs/bcachefs/sb-downgrade.c             |   5
-rw-r--r--  fs/bcachefs/sb-errors_format.h         |   7
-rw-r--r--  fs/bcachefs/super.c                    |  19
-rw-r--r--  fs/bcachefs/sysfs.c                    |   6
-rw-r--r--  fs/bcachefs/trace.h                    |   5
-rw-r--r--  fs/bcachefs/util.h                     |   1
-rw-r--r--  fs/bcachefs/vendor/min_heap.c          |  59
-rw-r--r--  fs/bcachefs/vendor/min_heap.h          | 477
41 files changed, 1685 insertions(+), 555 deletions(-)
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5455412b2b75..df383b97aacc 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -21,7 +21,6 @@ config BCACHEFS_FS
select XOR_BLOCKS
select XXHASH
select SYMBOLIC_ERRNAME
- select MIN_HEAP
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@@ -34,7 +33,6 @@ config BCACHEFS_QUOTA
config BCACHEFS_ERASURE_CODING
bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
depends on BCACHEFS_FS
- select QUOTACTL
help
This enables the "erasure_code" filesysystem and inode option, which
organizes data into reed-solomon stripes instead of ordinary
@@ -43,11 +41,6 @@ config BCACHEFS_ERASURE_CODING
WARNING: this feature is still undergoing on disk format changes, and
should only be enabled for testing purposes.
-config BCACHEFS_POSIX_ACL
- bool "bcachefs POSIX ACL support"
- depends on BCACHEFS_FS
- select FS_POSIX_ACL
-
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1e87eee962ec..98acb3dc66a3 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -1,4 +1,9 @@
+ifdef BCACHEFS_DKMS
+ CONFIG_BCACHEFS_FS := m
+ # Enable other features here?
+endif
+
obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
bcachefs-y := \
@@ -99,9 +104,16 @@ bcachefs-y := \
util.o \
varint.o \
xattr.o \
- vendor/closure.o
+ vendor/closure.o \
+ vendor/min_heap.o
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+ifndef BCACHEFS_DKMS
+ obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+endif
# Silence "note: xyz changed in GCC X.X" messages
subdir-ccflags-y += $(call cc-disable-warning, psabi)
+
+ifdef BCACHEFS_DKMS
+ subdir-ccflags-y += -I$(src)
+endif
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3befa1f36e72..a1df0ec2d511 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -57,7 +57,7 @@ void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
}
}
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
+#ifndef NO_BCACHEFS_FS
#include "fs.h"
@@ -437,4 +437,4 @@ err:
return ret;
}
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
index fe730a6bf0c1..01cf21c5a0b6 100644
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
@@ -26,7 +26,7 @@ typedef struct {
void bch2_acl_to_text(struct printbuf *, const void *, size_t);
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
+#ifndef NO_BCACHEFS_FS
struct posix_acl *bch2_get_acl(struct inode *, int, bool);
@@ -55,6 +55,6 @@ static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
return 0;
}
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 3ccca855f05e..933c5a68eff9 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -686,6 +686,7 @@ struct btree_debug {
unsigned id;
};
+#define BCH_LINK_MAX U32_MAX
#define BCH_TRANSACTIONS_NR 128
struct btree_transaction_stats {
@@ -1061,6 +1062,7 @@ struct bch_fs {
GENRADIX(struct gc_stripe) gc_stripes;
struct hlist_head ec_stripes_new[32];
+ struct hlist_head ec_stripes_new_buckets[64];
spinlock_t ec_stripes_new_lock;
/* ERASURE CODING */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d29bd684b137..090f11e122ad 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -707,7 +707,8 @@ struct bch_sb_field_ext {
x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
x(31bit_dirent_offset, BCH_VERSION(1, 30)) \
- x(btree_node_accounting, BCH_VERSION(1, 31))
+ x(btree_node_accounting, BCH_VERSION(1, 31)) \
+ x(rebalance_v2, BCH_VERSION(1, 32))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 63dc0836bf08..638c2a9268a9 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -204,7 +204,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
if (bpos_eq(expected_start, cur->data->min_key))
return 0;
- prt_printf(&buf, " at ");
+ prt_printf(&buf, " at ");
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
prt_printf(&buf, ":\nparent: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
@@ -229,8 +229,8 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
*pulled_from_scan = cur->data->min_key;
ret = bch_err_throw(c, topology_repair_did_fill_from_scan);
} else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
+ if (mustfix_fsck_err(trans, btree_node_topology_gap_between_nodes,
+ "gap between btree nodes%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
} else { /* overlap */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 52d21259ed6f..3808c41dda84 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1318,6 +1318,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset_end(b, b->set);
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_error(b);
+ ret = 0;
continue;
}
if (ret)
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index a6f344faf751..73fa8513ee61 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -838,8 +838,10 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
bpos_ge(k->k.p, start) &&
bpos_le(k->k.p, end)))
keys->data[dst++] = *i;
- else if (i->allocated)
+ else if (i->allocated) {
kfree(i->allocated_k);
+ i->allocated_k = NULL;
+ }
}
keys->nr = keys->gap = dst;
}
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index c0dff992ad60..433d498540fd 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -12,7 +12,6 @@
#include "recovery_passes.h"
#include <linux/kthread.h>
-#include <linux/min_heap.h>
#include <linux/sched/sysctl.h>
#include <linux/sort.h>
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 10bfadcde80a..362846d5bb87 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -143,6 +143,17 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
return bch2_csum_opt_to_type(opts.data_checksum, true);
}
+static inline enum bch_csum_type bch2_data_checksum_type_rb(struct bch_fs *c,
+ struct bch_extent_rebalance opts)
+{
+ if (c->sb.encryption_type)
+ return c->opts.wide_macs
+ ? BCH_CSUM_chacha20_poly1305_128
+ : BCH_CSUM_chacha20_poly1305_80;
+
+ return bch2_csum_opt_to_type(opts.data_checksum, true);
+}
+
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{
if (c->sb.encryption_type)
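A minimal sketch of how the new helper is consumed: rebalance (see the rebalance.c hunk further down) derives the wanted checksum type from an extent's bch_extent_rebalance entry and compares it against each pointer's crc. The wrapper function below is hypothetical; the decode loop is the existing bkey_for_each_ptr_decode() pattern.

static bool extent_csum_needs_rb(struct bch_fs *c, struct bkey_s_c k,
                                 struct bch_extent_rebalance r)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned csum_type = bch2_data_checksum_type_rb(c, r);

        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                if (!p.ptr.cached && p.crc.csum_type != csum_type)
                        return true;    /* checksum opt changed: extent needs rewriting */
        return false;
}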
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index 37554e4514fe..d12c11dc6601 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -2,7 +2,7 @@
#ifndef _BCACHEFS_CLOCK_TYPES_H
#define _BCACHEFS_CLOCK_TYPES_H
-#include "util.h"
+#include "vendor/min_heap.h"
#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 62d5d17d681e..02997b061ce5 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -55,63 +55,6 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
return true;
}
-static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
-}
-
-static noinline_for_stack
-bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
- const struct bch_extent_ptr *start)
-{
- if (!ctxt) {
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr == start)
- break;
-
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
- return false;
- }
-
- __bkey_for_each_ptr(start, ptrs.end, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bool locked;
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
- list_empty(&ctxt->ios));
- if (!locked) {
- bch2_trans_unlock(ctxt->trans);
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- }
- }
- return true;
-}
-
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
-{
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
- return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
- }
-
- return true;
-}
-
noinline_for_stack
static void trace_io_move_finish2(struct data_update *u,
struct bkey_i *new,
@@ -210,28 +153,6 @@ static void trace_data_update2(struct data_update *m,
}
noinline_for_stack
-static void trace_io_move_created_rebalance2(struct data_update *m,
- struct bkey_s_c old, struct bkey_s_c k,
- struct bkey_i *insert)
-{
- struct bch_fs *c = m->op.c;
- CLASS(printbuf, buf)();
-
- bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_io_move_created_rebalance(c, buf.buf);
-
- count_event(c, io_move_created_rebalance);
-}
-
-noinline_for_stack
static int data_update_invalid_bkey(struct data_update *m,
struct bkey_s_c old, struct bkey_s_c k,
struct bkey_i *insert)
@@ -440,7 +361,7 @@ restart_drop_extra_replicas:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?:
- bch2_bkey_set_needs_rebalance(c, &opts, insert,
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, insert,
SET_NEEDS_REBALANCE_foreground,
m->op.opts.change_cookie) ?:
bch2_trans_update(trans, &iter, insert,
@@ -451,10 +372,6 @@ restart_drop_extra_replicas:
if (trace_data_update_enabled())
trace_data_update2(m, old, k, insert);
- if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
- bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
- trace_io_move_created_rebalance2(m, old, k, insert);
-
ret = bch2_trans_commit(trans, &op->res,
NULL,
BCH_TRANS_COMMIT_no_check_rw|
@@ -538,7 +455,7 @@ void bch2_data_update_exit(struct data_update *update)
update->bvecs = NULL;
if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
+ bch2_bkey_nocow_unlock(c, k, 0);
bkey_put_dev_refs(c, k);
bch2_disk_reservation_put(c, &update->op.res);
bch2_bkey_buf_exit(&update->k, c);
@@ -1018,10 +935,19 @@ int bch2_data_update_init(struct btree_trans *trans,
goto out;
}
- if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, ptrs)) {
- ret = bch_err_throw(c, nocow_lock_blocked);
- goto out_put_dev_refs;
+ if (c->opts.nocow_enabled) {
+ if (!bch2_bkey_nocow_trylock(c, ptrs, 0)) {
+ bool locked = false;
+ if (ctxt)
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bkey_nocow_trylock(c, ptrs, 0)) ||
+ list_empty(&ctxt->ios));
+ if (!locked) {
+ if (ctxt)
+ bch2_trans_unlock(ctxt->trans);
+ bch2_bkey_nocow_lock(c, ptrs, 0);
+ }
+ }
}
if (unwritten) {
@@ -1039,8 +965,7 @@ int bch2_data_update_init(struct btree_trans *trans,
return 0;
out_nocow_unlock:
if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
-out_put_dev_refs:
+ bch2_bkey_nocow_unlock(c, k, 0);
bkey_put_dev_refs(c, k);
out:
bch2_disk_reservation_put(c, &m->op.res);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index a99f821c6a1c..73f50e5489b4 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -282,6 +282,9 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po
prt_str(out, "btree=");
bch2_btree_id_to_text(out, k->btree.id);
break;
+ case BCH_DISK_ACCOUNTING_rebalance_work_v2:
+ bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type);
+ break;
}
}
@@ -786,103 +789,10 @@ static struct journal_key *accumulate_and_read_journal_accounting(struct btree_t
return ret ? ERR_PTR(ret) : next;
}
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
+static int accounting_read_mem_fixups(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct bch_accounting_mem *acc = &c->accounting;
- CLASS(btree_trans, trans)(c);
- CLASS(printbuf, buf)();
-
- /*
- * We might run more than once if we rewind to start topology repair or
- * btree node scan - and those might cause us to get different results,
- * so we can't just skip if we've already run.
- *
- * Instead, zero out any accounting we have:
- */
- scoped_guard(percpu_write, &c->mark_lock) {
- darray_for_each(acc->k, e)
- percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
- for_each_member_device(c, ca)
- percpu_memset(ca->usage, 0, sizeof(*ca->usage));
- percpu_memset(c->usage, 0, sizeof(*c->usage));
- }
-
- struct journal_keys *keys = &c->journal_keys;
- struct journal_key *jk = keys->data;
-
- while (jk < &darray_top(*keys) &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
- jk++;
-
- struct journal_key *end = jk;
- while (end < &darray_top(*keys) &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
- end++;
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
- iter.flags &= ~BTREE_ITER_with_journal;
- int ret = for_each_btree_key_continue(trans, iter,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
-
- if (k.k->type != KEY_TYPE_accounting)
- continue;
-
- while (jk < end &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
- jk = accumulate_and_read_journal_accounting(trans, jk);
-
- while (jk < end &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
- bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
- jk->overwritten = true;
- jk++;
- }
-
- if (jk < end &&
- __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
- jk = accumulate_and_read_journal_accounting(trans, jk);
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next_acc;
- memset(&next_acc, 0, sizeof(next_acc));
- next_acc.type = acc_k.type + 1;
- struct bpos next = disk_accounting_pos_to_bpos(&next_acc);
- if (jk < end)
- next = bpos_min(next, journal_key_k(c, jk)->k.p);
-
- bch2_btree_iter_set_pos(&iter, next);
- continue;
- }
-
- accounting_read_key(trans, k);
- }));
- bch2_trans_iter_exit(&iter);
- if (ret)
- return ret;
-
- while (jk < end)
- jk = accumulate_and_read_journal_accounting(trans, jk);
-
- struct journal_key *dst = keys->data;
- darray_for_each(*keys, i)
- if (!i->overwritten)
- *dst++ = *i;
- keys->gap = keys->nr = dst - keys->data;
-
CLASS(printbuf, underflow_err)();
scoped_guard(percpu_write, &c->mark_lock) {
@@ -903,7 +813,7 @@ int bch2_accounting_read(struct bch_fs *c)
* Remove it, so that if it's re-added it gets re-marked in the
* superblock:
*/
- ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+ int ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
? -BCH_ERR_remove_disk_accounting_entry
: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
@@ -985,7 +895,7 @@ int bch2_accounting_read(struct bch_fs *c)
if (underflow_err.pos) {
bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
unsigned pos = underflow_err.pos;
- ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
+ int ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
BCH_RECOVERY_PASS_check_allocations, 0);
print |= underflow_err.pos != pos;
@@ -995,7 +905,108 @@ int bch2_accounting_read(struct bch_fs *c)
return ret;
}
- return ret;
+ return 0;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ CLASS(btree_trans, trans)(c);
+ CLASS(printbuf, buf)();
+
+ /*
+ * We might run more than once if we rewind to start topology repair or
+ * btree node scan - and those might cause us to get different results,
+ * so we can't just skip if we've already run.
+ *
+ * Instead, zero out any accounting we have:
+ */
+ scoped_guard(percpu_write, &c->mark_lock) {
+ darray_for_each(acc->k, e)
+ percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+ for_each_member_device(c, ca)
+ percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+ percpu_memset(c->usage, 0, sizeof(*c->usage));
+ }
+
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *jk = keys->data;
+
+ move_gap(keys, keys->nr);
+
+ while (jk < &darray_top(*keys) &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
+ jk++;
+
+ struct journal_key *end = jk;
+ while (end < &darray_top(*keys) &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
+ end++;
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+ iter.flags &= ~BTREE_ITER_with_journal;
+ int ret = for_each_btree_key_continue(trans, iter,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
+
+ while (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
+
+ while (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
+ bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
+ jk->overwritten = true;
+ jk++;
+ }
+
+ if (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ break;
+
+ if (!bch2_accounting_is_mem(&acc_k)) {
+ struct disk_accounting_pos next_acc;
+ memset(&next_acc, 0, sizeof(next_acc));
+ next_acc.type = acc_k.type + 1;
+ struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
+ if (jk < end)
+ next = bpos_min(next, journal_key_k(c, jk)->k.p);
+
+ bch2_btree_iter_set_pos(&iter, next);
+ continue;
+ }
+
+ accounting_read_key(trans, k);
+ }));
+ bch2_trans_iter_exit(&iter);
+ if (ret)
+ return ret;
+
+ while (jk < end)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
+
+ bch2_trans_unlock(trans);
+
+ struct journal_key *dst = keys->data;
+ darray_for_each(*keys, i)
+ if (!i->overwritten)
+ *dst++ = *i;
+ keys->gap = keys->nr = dst - keys->data;
+
+ return accounting_read_mem_fixups(trans);
}
int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca)
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index 730a17ea4243..0b61d6100180 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -110,7 +110,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
x(snapshot, 5, 1) \
x(btree, 6, 3) \
x(rebalance_work, 7, 1) \
- x(inum, 8, 3)
+ x(inum, 8, 3) \
+ x(rebalance_work_v2, 9, 1) \
enum disk_accounting_type {
#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
@@ -210,6 +211,10 @@ struct bch_acct_inum {
struct bch_acct_rebalance_work {
};
+struct bch_acct_rebalance_work_v2 {
+ __u8 type;
+};
+
struct disk_accounting_pos {
union {
struct {
@@ -224,6 +229,7 @@ struct disk_accounting_pos {
struct bch_acct_btree btree;
struct bch_acct_rebalance_work rebalance_work;
struct bch_acct_inum inum;
+ struct bch_acct_rebalance_work_v2 rebalance_work_v2;
} __packed;
} __packed;
struct bpos _pad;
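A hedged sketch of the reader side of the new accounting key (the function name and the bch2_accounting_mem_read() call are assumptions about how it would be consumed; the position is built the same way the rebalance trigger later in this diff builds it): each rebalance_work_v2 entry is keyed by a bch_rebalance_accounting_type and carries a single sector counter.

static u64 pending_hipri_rebalance_sectors(struct bch_fs *c)
{
        struct disk_accounting_pos acc;
        u64 sectors = 0;

        memset(&acc, 0, sizeof(acc));
        acc.type                   = BCH_DISK_ACCOUNTING_rebalance_work_v2;
        acc.rebalance_work_v2.type = BCH_REBALANCE_ACCOUNTING_high_priority;

        /* read the in-memory counter for this key, a single u64 of sectors */
        bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &sectors, 1);
        return sectors;
}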
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 89a95b6c4e51..78afd44a7a3f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -895,8 +895,60 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
* Hash table of open stripes:
* Stripes that are being created or modified are kept in a hash table, so that
* stripe deletion can skip them.
+ *
+ * Additionally, we have a hash table for buckets that have stripes being
+ * created, to avoid racing with rebalance:
*/
+static bool __bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket)
+{
+ unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets)));
+ struct ec_stripe_new_bucket *s;
+
+ hlist_for_each_entry(s, &c->ec_stripes_new_buckets[hash], hash)
+ if (s->dev_bucket == dev_bucket)
+ return true;
+ return false;
+}
+
+bool bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket)
+{
+ guard(spinlock)(&c->ec_stripes_new_lock);
+ return __bch2_bucket_has_new_stripe(c, dev_bucket);
+}
+
+static void stripe_new_bucket_add(struct bch_fs *c, struct ec_stripe_new_bucket *s, u64 dev_bucket)
+{
+ s->dev_bucket = dev_bucket;
+
+ unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets)));
+ hlist_add_head(&s->hash, &c->ec_stripes_new_buckets[hash]);
+}
+
+static void stripe_new_buckets_add(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ unsigned nr_blocks = s->nr_data + s->nr_parity;
+
+ guard(spinlock)(&c->ec_stripes_new_lock);
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ if (!s->blocks[i])
+ continue;
+
+ struct open_bucket *ob = c->open_buckets + s->blocks[i];
+ struct bpos bucket = POS(ob->dev, ob->bucket);
+
+ stripe_new_bucket_add(c, &s->buckets[i], bucket_to_u64(bucket));
+ }
+}
+
+static void stripe_new_buckets_del(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+
+ for (unsigned i = 0; i < v->nr_blocks; i++)
+ hlist_del_init(&s->buckets[i].hash);
+}
+
static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
@@ -937,6 +989,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
hlist_del_init(&s->hash);
s->idx = 0;
+
+ stripe_new_buckets_del(c, s);
}
/* stripe deletion */
@@ -1134,7 +1188,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
ret = bch2_extent_get_io_opts_one(trans, &opts, &iter, bkey_i_to_s_c(n),
SET_NEEDS_REBALANCE_other) ?:
- bch2_bkey_set_needs_rebalance(trans->c, &opts, n,
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n,
SET_NEEDS_REBALANCE_other, 0) ?:
bch2_trans_update(trans, &iter, n, 0);
out:
@@ -2027,6 +2081,7 @@ allocate_buf:
if (ret)
goto err;
+ stripe_new_buckets_add(c, s);
s->allocated = true;
allocated:
BUG_ON(!s->idx);
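The comment above notes that this second hash table exists so rebalance can avoid buckets whose stripe is still being created. A hedged sketch of a caller (the wrapper name is hypothetical; the PTR_BUCKET_POS()/bucket_to_u64() conversion is the same one used when populating the table):

static bool ptr_bucket_has_new_stripe(struct bch_fs *c,
                                      const struct bch_extent_ptr *ptr)
{
        struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);

        /* true while an ec_stripe_new still references this bucket */
        return bch2_bucket_has_new_stripe(c, bucket_to_u64(PTR_BUCKET_POS(ca, ptr)));
}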
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index cc778da99030..85598448c7e1 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -191,6 +191,11 @@ enum ec_stripe_ref {
STRIPE_REF_NR
};
+struct ec_stripe_new_bucket {
+ struct hlist_node hash;
+ u64 dev_bucket;
+};
+
struct ec_stripe_new {
struct bch_fs *c;
struct ec_stripe_head *h;
@@ -217,6 +222,8 @@ struct ec_stripe_new {
open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
+ struct ec_stripe_new_bucket buckets[BCH_BKEY_PTRS_MAX];
+
struct ec_stripe_buf new_stripe;
struct ec_stripe_buf existing_stripe;
};
@@ -248,6 +255,8 @@ struct ec_stripe_head {
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
+bool bch2_bucket_has_new_stripe(struct bch_fs *, u64);
+
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index cbf1eedddad7..2712a9754100 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -228,6 +228,7 @@
x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \
x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \
x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \
+ x(EMLINK, too_many_links) \
x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EOPNOTSUPP, no_casefolding_without_utf8) \
x(EOPNOTSUPP, casefolding_disabled) \
@@ -367,7 +368,10 @@
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, invalid_snapshot_node) \
x(0, option_needs_open_fs) \
- x(0, remove_disk_accounting_entry)
+ x(0, remove_disk_accounting_entry) \
+ x(0, nocow_trylock_fail) \
+ x(BCH_ERR_nocow_trylock_fail, nocow_trylock_contended) \
+ x(BCH_ERR_nocow_trylock_fail, nocow_trylock_bucket_full) \
enum bch_errcode {
BCH_ERR_START = 2048,
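The two new trylock errors are sub-codes of nocow_trylock_fail, so a caller can match the class with bch2_err_matches() while still branching on the specific reason. Purely illustrative sketch (the helper it calls is static to nocow_locking.c, and the function below is hypothetical):

static void nocow_trylock_fail_example(struct bch_fs *c, struct nocow_lock_bucket *l,
                                       u64 dev_bucket, int flags)
{
        int ret = __bch2_bucket_nocow_trylock(c, l, dev_bucket, flags);

        if (!bch2_err_matches(ret, BCH_ERR_nocow_trylock_fail))
                return;         /* 0: lock taken */

        if (ret == -BCH_ERR_nocow_trylock_contended) {
                /* a conflicting lock is held: wait on this lock bucket's waitlist */
        } else {
                /*
                 * -BCH_ERR_nocow_trylock_bucket_full: no free slot in the hash
                 * bucket; drop any locks already taken and wait for the bucket
                 * to drain before retrying (see nocow_locking.c below)
                 */
        }
}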
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 3274ba42c995..c534b009bf60 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1522,24 +1522,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
"redundant stripe entry");
have_ec = true;
break;
- case BCH_EXTENT_ENTRY_rebalance: {
- /*
- * this shouldn't be a fsck error, for forward
- * compatibility; the rebalance code should just refetch
- * the compression opt if it's unknown
- */
-#if 0
- const struct bch_extent_rebalance *r = &entry->rebalance;
-
- if (!bch2_compression_opt_valid(r->compression)) {
- union bch_compression_opt opt = { .value = r->compression };
- prt_printf(err, "invalid compression opt %u:%u",
- opt.type, opt.level);
- return bch_err_throw(c, invalid_bkey);
- }
-#endif
+ case BCH_EXTENT_ENTRY_rebalance:
+ ret = bch2_extent_rebalance_validate(c, k, from, &entry->rebalance);
+ if (ret)
+ return ret;
break;
- }
case BCH_EXTENT_ENTRY_flags:
bkey_fsck_err_on(entry != ptrs.start,
c, extent_flags_not_at_start,
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f1849eb8327d..414adf86efdc 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -541,11 +541,9 @@ __bch2_create(struct mnt_idmap *idmap,
* preallocate acls + vfs inode before btree transaction, so that
* nothing can fail after the transaction succeeds:
*/
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
if (ret)
return ERR_PTR(ret);
-#endif
inode = __bch2_new_inode(c, GFP_NOFS);
if (unlikely(!inode)) {
@@ -1751,10 +1749,8 @@ static const struct inode_operations bch_file_inode_operations = {
.setattr = bch2_setattr,
.fiemap = bch2_fiemap,
.listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
-#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
@@ -1773,10 +1769,8 @@ static const struct inode_operations bch_dir_inode_operations = {
.setattr = bch2_setattr,
.tmpfile = bch2_tmpfile,
.listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
-#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
@@ -1797,10 +1791,8 @@ static const struct inode_operations bch_symlink_inode_operations = {
.getattr = bch2_getattr,
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
-#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
@@ -1809,10 +1801,8 @@ static const struct inode_operations bch_special_inode_operations = {
.getattr = bch2_getattr,
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
-#endif
.fileattr_get = bch2_fileattr_get,
.fileattr_set = bch2_fileattr_set,
};
@@ -2546,10 +2536,8 @@ got_sb:
c->dev = sb->s_dev;
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
if (c->opts.acl)
sb->s_flags |= SB_POSIXACL;
-#endif
sb->s_shrink->seeks = 0;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 543627fb58be..fda4ca783848 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1184,8 +1184,8 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
if (bi->bi_flags & BCH_INODE_unlinked)
bi->bi_flags &= ~BCH_INODE_unlinked;
else {
- if (bi->bi_nlink == U32_MAX)
- return -EINVAL;
+ if (bi->bi_nlink == BCH_LINK_MAX - nlink_bias(bi->bi_mode))
+ return -BCH_ERR_too_many_links;
bi->bi_nlink++;
}
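For context on the new limit check (assuming nlink_bias() is the usual mode-dependent bias that bch2_inode_nlink_get() adds back): bi_nlink is stored with that bias subtracted, so the comparison caps the externally visible link count rather than the raw field, and the failure now maps to EMLINK via the new too_many_links code instead of -EINVAL.

/*
 * visible nlink = bi_nlink + nlink_bias(bi_mode), so refusing the increment
 * at bi_nlink == BCH_LINK_MAX - nlink_bias(bi_mode) keeps the visible count
 * at or below BCH_LINK_MAX (== U32_MAX).
 */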
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 6a5da02ce266..66ee07be149e 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -33,7 +33,6 @@
#include <linux/blkdev.h>
#include <linux/moduleparam.h>
-#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
@@ -365,7 +364,7 @@ int bch2_extent_update(struct btree_trans *trans,
min(k->k.p.offset << 9, new_i_size),
i_sectors_delta, &inode) ?:
(bch2_inode_opts_get_inode(c, &inode, &opts),
- bch2_bkey_set_needs_rebalance(c, &opts, k,
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, k,
SET_NEEDS_REBALANCE_foreground,
change_cookie)) ?:
bch2_trans_update(trans, iter, k, 0) ?:
@@ -1271,7 +1270,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return bch2_extent_update_i_size_sectors(trans, iter,
min(new->k.p.offset << 9, new_i_size), 0, &inode) ?:
(bch2_inode_opts_get_inode(c, &inode, &opts),
- bch2_bkey_set_needs_rebalance(c, &opts, new,
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new,
SET_NEEDS_REBALANCE_foreground,
op->opts.change_cookie)) ?:
bch2_trans_update(trans, iter, new,
@@ -1323,11 +1322,25 @@ static CLOSURE_CALLBACK(bch2_nocow_write_done)
bch2_write_done(cl);
}
-struct bucket_to_lock {
- struct bpos b;
- unsigned gen;
- struct nocow_lock_bucket *l;
-};
+static bool bkey_get_dev_iorefs(struct bch_fs *c, struct bkey_ptrs_c ptrs)
+{
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
+ BCH_DEV_WRITE_REF_io_write);
+ if (unlikely(!ca)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+ enumerated_ref_put(&bch2_dev_have_ref(c, ptr2->dev)->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
+ }
+
+ return false;
+ }
+ }
+
+ return true;
+}
static void bch2_nocow_write(struct bch_write_op *op)
{
@@ -1335,15 +1348,14 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
- DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+ struct bkey_ptrs_c ptrs;
u32 snapshot;
- struct bucket_to_lock *stale_at;
+ const struct bch_extent_ptr *stale_at;
int stale, ret;
if (op->flags & BCH_WRITE_move)
return;
- darray_init(&buckets);
trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
@@ -1358,8 +1370,6 @@ retry:
while (1) {
struct bio *bio = &op->wbio.bio;
- buckets.nr = 0;
-
ret = bch2_trans_relock(trans);
if (ret)
break;
@@ -1381,50 +1391,42 @@ retry:
break;
/* Get iorefs before dropping btree locks: */
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
- BCH_DEV_WRITE_REF_io_write);
- if (unlikely(!ca))
- goto err_get_ioref;
-
- struct bpos b = PTR_BUCKET_POS(ca, ptr);
- struct nocow_lock_bucket *l =
- bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
- prefetch(l);
-
- /* XXX allocating memory with btree locks held - rare */
- darray_push_gfp(&buckets, ((struct bucket_to_lock) {
- .b = b, .gen = ptr->gen, .l = l,
- }), GFP_KERNEL|__GFP_NOFAIL);
-
- if (ptr->unwritten)
- op->flags |= BCH_WRITE_convert_unwritten;
- }
+ ptrs = bch2_bkey_ptrs_c(k);
+ if (!bkey_get_dev_iorefs(c, ptrs))
+ goto out;
/* Unlock before taking nocow locks, doing IO: */
bkey_reassemble(op->insert_keys.top, k);
- bch2_trans_unlock(trans);
+ k = bkey_i_to_s_c(op->insert_keys.top);
+ ptrs = bch2_bkey_ptrs_c(k);
- bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_convert_unwritten)
- bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+ bch2_trans_unlock(trans);
- darray_for_each(buckets, i) {
- struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
+ bch2_bkey_nocow_lock(c, ptrs, BUCKET_NOCOW_LOCK_UPDATE);
- __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
- bucket_to_u64(i->b),
- BUCKET_NOCOW_LOCK_UPDATE);
+ /*
+ * This could be handled better: If we're able to trylock the
+ * nocow locks with btree locks held we know dirty pointers
+ * can't be stale
+ */
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- int gen = bucket_gen_get(ca, i->b.offset);
- stale = gen < 0 ? gen : gen_after(gen, i->gen);
+ int gen = bucket_gen_get(ca, PTR_BUCKET_NR(ca, ptr));
+ stale = gen < 0 ? gen : gen_after(gen, ptr->gen);
if (unlikely(stale)) {
- stale_at = i;
+ stale_at = ptr;
goto err_bucket_stale;
}
+
+ if (ptr->unwritten)
+ op->flags |= BCH_WRITE_convert_unwritten;
}
+ bch2_cut_front(op->pos, op->insert_keys.top);
+ if (op->flags & BCH_WRITE_convert_unwritten)
+ bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
bio = &op->wbio.bio;
if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
bio = bio_split(bio, k.k->p.offset - op->pos.offset,
@@ -1458,8 +1460,6 @@ err:
goto retry;
bch2_trans_put(trans);
- darray_exit(&buckets);
-
if (ret) {
bch2_write_op_error(op, op->pos.offset,
"%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
@@ -1484,24 +1484,11 @@ err:
continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
}
return;
-err_get_ioref:
- darray_for_each(buckets, i)
- enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
- BCH_DEV_WRITE_REF_io_write);
-
- /* Fall back to COW path: */
- goto out;
err_bucket_stale:
- darray_for_each(buckets, i) {
- bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
- if (i == stale_at)
- break;
- }
-
CLASS(printbuf, buf)();
if (bch2_fs_inconsistent_on(stale < 0, c,
- "pointer to invalid bucket in nocow path on device %llu\n %s",
- stale_at->b.inode,
+ "pointer to invalid bucket in nocow path on device %u\n %s",
+ stale_at->dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch_err_throw(c, data_write_invalid_ptr);
} else {
@@ -1509,7 +1496,13 @@ err_bucket_stale:
ret = bch_err_throw(c, transaction_restart);
}
- goto err_get_ioref;
+ bch2_bkey_nocow_unlock(c, k, BUCKET_NOCOW_LOCK_UPDATE);
+ bkey_for_each_ptr(ptrs, ptr)
+ enumerated_ref_put(&bch2_dev_have_ref(c, ptr->dev)->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
+
+ /* Fall back to COW path: */
+ goto out;
}
static void __bch2_write(struct bch_write_op *op)
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ae747c87fcf9..f7b0fdd99c75 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -766,6 +766,9 @@ static int bch2_journal_reclaim_thread(void *arg)
set_freezable();
+ kthread_wait_freezable(test_bit(BCH_FS_rw, &c->flags) ||
+ kthread_should_stop());
+
j->last_flushed = jiffies;
while (!ret && !kthread_should_stop()) {
@@ -826,8 +829,10 @@ int bch2_journal_reclaim_start(struct journal *j)
struct task_struct *p;
int ret;
- if (j->reclaim_thread)
+ if (j->reclaim_thread) {
+ wake_up_process(j->reclaim_thread);
return 0;
+ }
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 139a6587a64e..9b172af4f8c8 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -84,7 +84,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
struct bch_inode_opts opts;
ret = bch2_extent_get_apply_io_opts_one(trans, &opts, iter, k, ctx) ?:
- bch2_bkey_set_needs_rebalance(c, &opts, n, ctx, 0) ?:
+ bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, ctx, 0) ?:
drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false);
if (ret)
return ret;
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
index 71b17f18e90c..c8907070796f 100644
--- a/fs/bcachefs/nocow_locking.c
+++ b/fs/bcachefs/nocow_locking.c
@@ -6,6 +6,16 @@
#include "nocow_locking.h"
#include "util.h"
+#include <linux/prefetch.h>
+
+static bool nocow_bucket_empty(struct nocow_lock_bucket *l)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (atomic_read(&l->l[i]))
+ return false;
+ return true;
+}
+
bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
{
u64 dev_bucket = bucket_to_u64(bucket);
@@ -20,14 +30,12 @@ bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos
#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0)
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
+void __bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, u64 dev_bucket, int flags)
{
- u64 dev_bucket = bucket_to_u64(bucket);
struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
int lock_val = flags ? 1 : -1;
- unsigned i;
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(l->b); i++)
if (l->b[i] == dev_bucket) {
int v = atomic_sub_return(lock_val, &l->l[i]);
@@ -40,8 +48,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc
BUG();
}
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
+static int __bch2_bucket_nocow_trylock(struct bch_fs *c, struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
{
int v, lock_val = flags ? 1 : -1;
unsigned i;
@@ -58,32 +66,128 @@ bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
goto take_lock;
}
- return false;
+ return bch_err_throw(c, nocow_trylock_bucket_full);
got_entry:
v = atomic_read(&l->l[i]);
if (lock_val > 0 ? v < 0 : v > 0)
- return false;
+ return bch_err_throw(c, nocow_trylock_contended);
take_lock:
v = atomic_read(&l->l[i]);
/* Overflow? */
if (v && sign(v + lock_val) != sign(v))
- return false;
+ return bch_err_throw(c, nocow_trylock_contended);
atomic_add(lock_val, &l->l[i]);
+ return 0;
+}
+
+static inline bool bch2_bucket_nocow_trylock(struct bch_fs *c, struct bpos bucket, int flags)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ return !__bch2_bucket_nocow_trylock(c, l, dev_bucket, flags);
+}
+
+void bch2_bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k, int flags)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags);
+ }
+}
+
+bool bch2_bkey_nocow_trylock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
+{
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ if (unlikely(!bch2_bucket_nocow_trylock(c, bucket, flags))) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr2->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr2);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags);
+ }
+ return false;
+ }
+ }
+
return true;
}
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
+struct bucket_to_lock {
+ u64 b;
+ struct nocow_lock_bucket *l;
+};
+
+static inline int bucket_to_lock_cmp(struct bucket_to_lock l,
+ struct bucket_to_lock r)
{
- if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
- struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
+ return cmp_int(l.l, r.l);
+}
+
+void bch2_bkey_nocow_lock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
+{
+ if (bch2_bkey_nocow_trylock(c, ptrs, flags))
+ return;
+
+ DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+ darray_init(&buckets);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ u64 b = bucket_to_u64(PTR_BUCKET_POS(ca, ptr));
+ struct nocow_lock_bucket *l =
+ bucket_nocow_lock(&c->nocow_locks, b);
+ prefetch(l);
+
+ /* XXX allocating memory with btree locks held - rare */
+ darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .l = l, }),
+ GFP_KERNEL|__GFP_NOFAIL);
+ }
+
+ WARN_ON_ONCE(buckets.nr > NOCOW_LOCK_BUCKET_SIZE);
+
+ bubble_sort(buckets.data, buckets.nr, bucket_to_lock_cmp);
+retake_all:
+ darray_for_each(buckets, i) {
+ int ret = __bch2_bucket_nocow_trylock(c, i->l, i->b, flags);
+ if (!ret)
+ continue;
+
u64 start_time = local_clock();
- __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+ if (ret == -BCH_ERR_nocow_trylock_contended)
+ __closure_wait_event(&i->l->wait,
+ (ret = __bch2_bucket_nocow_trylock(c, i->l, i->b, flags)) != -BCH_ERR_nocow_trylock_contended);
+ if (!ret) {
+ bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ continue;
+ }
+
+ BUG_ON(ret != -BCH_ERR_nocow_trylock_bucket_full);
+
+ darray_for_each(buckets, i2) {
+ if (i2 == i)
+ break;
+ __bch2_bucket_nocow_unlock(&c->nocow_locks, i2->b, flags);
+ }
+
+ __closure_wait_event(&i->l->wait, nocow_bucket_empty(i->l));
bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ goto retake_all;
}
+
+ darray_exit(&buckets);
}
void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
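Putting the new bkey-level API together, a minimal caller looks like the sketch below (the wrapper is hypothetical); data_update.c above additionally calls bch2_bkey_nocow_trylock() first so it can wait on its moving_context between attempts.

static void nocow_update_one_extent(struct bch_fs *c, struct bkey_s_c k)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

        /*
         * Trylocks every bucket first; on failure it falls back to the
         * sorted, possibly blocking slow path above.
         */
        bch2_bkey_nocow_lock(c, ptrs, 0);

        /* ... perform the in-place update ... */

        bch2_bkey_nocow_unlock(c, k, 0);
}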
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
index 48b8a003c0d2..972c914780f9 100644
--- a/fs/bcachefs/nocow_locking.h
+++ b/fs/bcachefs/nocow_locking.h
@@ -19,29 +19,19 @@ static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lo
#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
- struct nocow_lock_bucket *, u64, int);
-static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
-}
+void __bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, u64, int);
-static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
+static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket,
+ int flags)
{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+ __bch2_bucket_nocow_unlock(t, bucket_to_u64(bucket), flags);
}
+void bch2_bkey_nocow_unlock(struct bch_fs *, struct bkey_s_c, int);
+bool bch2_bkey_nocow_trylock(struct bch_fs *, struct bkey_ptrs_c, int);
+void bch2_bkey_nocow_lock(struct bch_fs *, struct bkey_ptrs_c, int);
+
void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
void bch2_fs_nocow_locking_exit(struct bch_fs *);
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
index bd12bf677924..3fed8e957e20 100644
--- a/fs/bcachefs/nocow_locking_types.h
+++ b/fs/bcachefs/nocow_locking_types.h
@@ -5,11 +5,13 @@
#define BUCKET_NOCOW_LOCKS_BITS 10
#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
+#define NOCOW_LOCK_BUCKET_SIZE 6
+
struct nocow_lock_bucket {
struct closure_waitlist wait;
spinlock_t lock;
- u64 b[4];
- atomic_t l[4];
+ u64 b[NOCOW_LOCK_BUCKET_SIZE];
+ atomic_t l[NOCOW_LOCK_BUCKET_SIZE];
} __aligned(SMP_CACHE_BYTES);
struct bucket_nocow_lock_table {
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index bd5faafc9aa7..e01c808e7893 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -103,6 +103,13 @@ static const char * const __bch2_fs_usage_types[] = {
#undef x
+static const char * const __bch2_rebalance_accounting_types[] = {
+#define x(n) #n,
+ BCH_REBALANCE_ACCOUNTING()
+#undef x
+ NULL
+};
+
static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
unsigned nr, const char *type, unsigned idx)
{
@@ -125,6 +132,7 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
+PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type);
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
@@ -646,11 +654,8 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
val = bch2_opt_val_synonym_lookup(name, val);
- if (!(bch2_opt_table[id].flags & OPT_MOUNT))
- return -BCH_ERR_option_name;
-
- if (id == Opt_acl &&
- !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
+ if (!(bch2_opt_table[id].flags & OPT_MOUNT) &&
+ !(bch2_opt_table[id].flags & OPT_MOUNT_OLD))
return -BCH_ERR_option_name;
if ((id == Opt_usrquota ||
@@ -673,6 +678,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
if (ret < 0)
return -BCH_ERR_option_value;
+ if (bch2_opt_table[id].flags & OPT_MOUNT_OLD) {
+ pr_err("option %s may no longer be specified at mount time; set via sysfs opts dir",
+ bch2_opt_table[id].attr.name);
+ return 0;
+ }
+
if (opts)
bch2_opt_set_by_id(opts, id, v);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 6b9f18839345..68982196b5dc 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -34,6 +34,7 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
+void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type);
static inline const char *bch2_d_type_str(unsigned d_type)
{
@@ -66,6 +67,7 @@ enum opt_flags {
OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
OPT_HIDDEN = BIT(11),
+ OPT_MOUNT_OLD = BIT(12), /* May not be specified at mount time, but don't fail the mount */
};
enum opt_type {
@@ -149,12 +151,12 @@ enum fsck_err_opts {
BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
- OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX + 1), \
BCH_SB_META_REPLICAS_WANT, 1, \
"#", "Number of metadata replicas") \
x(data_replicas, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX + 1), \
BCH_SB_DATA_REPLICAS_WANT, 1, \
"#", "Number of data replicas") \
@@ -175,12 +177,12 @@ enum fsck_err_opts {
BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
"size", "Maximum size of checksummed/compressed extents")\
x(metadata_checksum, u8, \
- OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_STR(__bch2_csum_opts), \
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(data_checksum, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
@@ -190,12 +192,12 @@ enum fsck_err_opts {
BCH_SB_CSUM_ERR_RETRY_NR, 3, \
NULL, NULL) \
x(compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
x(background_compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
@@ -205,27 +207,27 @@ enum fsck_err_opts {
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
- OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or label for metadata writes") \
x(foreground_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_FOREGROUND_TARGET, 0, \
"(target)", "Device or label for foreground writes") \
x(background_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_BACKGROUND_TARGET, 0, \
"(target)", "Device or label to move data to in the background")\
x(promote_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0, \
"(target)", "Device or label to promote data to on read") \
x(erasure_code, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 67d6a90e86ef..0e40e7bd3441 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
@@ -10,6 +11,7 @@
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
+#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
@@ -25,8 +27,29 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
+
/* bch_extent_rebalance: */
+int bch2_extent_rebalance_validate(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bkey_validate_context from,
+ const struct bch_extent_rebalance *r)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(r->pending && !(r->need_rb & BIT(BCH_REBALANCE_background_target)),
+ c, extent_rebalance_bad_pending,
+ "pending incorrectly set");
+
+ bkey_fsck_err_on(r->hipri && !(r->need_rb & BIT(BCH_REBALANCE_data_replicas)),
+ c, extent_rebalance_bad_pending,
+ "hipri incorrectly set");
+
+fsck_err:
+ return ret;
+}
+
static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
const union bch_extent_entry *entry;
@@ -38,15 +61,30 @@ static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct b
return NULL;
}
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}
+static const char * const rebalance_opts[] = {
+#define x(n) #n,
+ BCH_REBALANCE_OPTS()
+#undef x
+ NULL
+};
+
void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
const struct bch_extent_rebalance *r)
{
- prt_printf(out, "replicas=%u", r->data_replicas);
+ prt_str(out, "need_rb=");
+ prt_bitflags(out, rebalance_opts, r->need_rb);
+
+ if (r->hipri)
+ prt_str(out, " hipri");
+ if (r->pending)
+ prt_str(out, " pending");
+
+ prt_printf(out, " replicas=%u", r->data_replicas);
if (r->data_replicas_from_inode)
prt_str(out, " (inode)");
@@ -92,32 +130,54 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-int bch2_trigger_extent_rebalance(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta[1] = { 0 };
+/*
+ * XXX: check in bkey_validate that if r->hipri or r->pending are set,
+ * r->data_replicas are also set
+ */
- s64 s = bch2_bkey_sectors_need_rebalance(c, old);
- need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta[0] -= s;
+static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance *r)
+{
+ if (!r)
+ return 0;
+ unsigned ret = r->need_rb;
- s = bch2_bkey_sectors_need_rebalance(c, new);
- need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta[0] += s;
+ if (r->hipri)
+ ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority);
+ if (r->pending) {
+ ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending);
+ ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_background_target);
+ }
+ return ret;
+}
- if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
+int __bch2_trigger_extent_rebalance(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned old_r, unsigned new_r,
+ enum btree_iter_update_trigger_flags flags)
+{
+ int delta = (int) !!new_r - (int) !!old_r;
+ if ((flags & BTREE_TRIGGER_transactional) && delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, need_rebalance_delta > 0);
+ new.k->p, delta > 0);
if (ret)
return ret;
}
- if (need_rebalance_sectors_delta[0]) {
+ delta = old.k->size == new.k->size
+ ? old_r ^ new_r
+ : old_r | new_r;
+ while (delta) {
+ unsigned c = __ffs(delta);
+ delta ^= BIT(c);
+
+ s64 v[1] = { 0 };
+ if (old_r & BIT(c))
+ v[0] -= (s64) old.k->size;
+ if (new_r & BIT(c))
+ v[0] += (s64) new.k->size;
+
int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- need_rebalance_sectors_delta, rebalance_work);
+ v, rebalance_work_v2, c);
if (ret)
return ret;
}
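A worked example of the per-counter delta logic above, with made-up bitmasks and sizes:

/*
 * Say old_r = 0b011 and new_r = 0b110 (bits = rebalance accounting counters):
 *
 *   same size (8 sectors):  delta = old_r ^ new_r = 0b101
 *     counter 0: -8 (work gone), counter 2: +8 (work added),
 *     counter 1: set on both sides with size unchanged, so skipped
 *
 *   size changes (8 -> 6):  delta = old_r | new_r = 0b111
 *     counter 0: -8, counter 1: -8 + 6 = -2, counter 2: +6
 */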
@@ -125,39 +185,48 @@ int bch2_trigger_extent_rebalance(struct btree_trans *trans,
return 0;
}
-static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
- struct bch_inode_opts *io_opts,
- unsigned *move_ptrs,
- unsigned *compress_ptrs,
- u64 *sectors)
+static struct bch_extent_rebalance
+bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_inode_opts *opts,
+ unsigned *move_ptrs,
+ unsigned *compress_ptrs,
+ unsigned *csum_ptrs,
+ bool may_update_indirect)
{
*move_ptrs = 0;
*compress_ptrs = 0;
- *sectors = 0;
+ *csum_ptrs = 0;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
- if (!io_opts && !rb_opts)
- return;
+ struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance) };
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return;
+ return r;
- unsigned compression_type =
- bch2_compression_opt_to_type(io_opts
- ? io_opts->background_compression
- : rb_opts->background_compression);
- unsigned target = io_opts
- ? io_opts->background_target
- : rb_opts->background_target;
- if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target))
- target = 0;
+ const struct bch_extent_rebalance *old_r = bch2_bkey_ptrs_rebalance_opts(ptrs);
+ if (old_r) {
+ r = *old_r;
+ r.need_rb = 0;
+ }
+
+#define x(_name) \
+ if (k.k->type != KEY_TYPE_reflink_v || \
+ may_update_indirect || \
+ (!opts->_name##_from_inode && !r._name##_from_inode)) { \
+ r._name = opts->_name; \
+ r._name##_from_inode = opts->_name##_from_inode; \
+ }
+ BCH_REBALANCE_OPTS()
+#undef x
+
+ unsigned compression_type = bch2_compression_opt_to_type(r.background_compression);
+ unsigned csum_type = bch2_data_checksum_type_rb(c, r);
+
+ bool incompressible = false, unwritten = false, ec = false;
+ unsigned durability = 0, min_durability = INT_MAX;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- bool incompressible = false, unwritten = false;
-
unsigned ptr_idx = 1;
guard(rcu)();
@@ -166,102 +235,285 @@ static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
unwritten |= p.ptr.unwritten;
if (!p.ptr.cached) {
- if (p.crc.compression_type != compression_type)
+ if (p.crc.compression_type != compression_type) {
*compress_ptrs |= ptr_idx;
+ r.need_rb |= BIT(BCH_REBALANCE_background_compression);
+ }
- if (target && !bch2_dev_in_target(c, p.ptr.dev, target))
+ if (p.crc.csum_type != csum_type) {
+ *csum_ptrs |= ptr_idx;
+ r.need_rb |= BIT(BCH_REBALANCE_data_checksum);
+ }
+
+ if (r.background_target &&
+ !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) {
*move_ptrs |= ptr_idx;
+ r.need_rb |= BIT(BCH_REBALANCE_background_target);
+ }
+
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+ durability += d;
+ min_durability = min(min_durability, d);
+
+ ec |= p.has_ec;
}
ptr_idx <<= 1;
}
- if (unwritten)
- *compress_ptrs = 0;
- if (incompressible)
+ if (unwritten || incompressible) {
*compress_ptrs = 0;
+ r.need_rb &= ~BIT(BCH_REBALANCE_background_compression);
+ }
- unsigned rb_ptrs = *move_ptrs | *compress_ptrs;
+ if (unwritten) {
+ *csum_ptrs = 0;
+		r.need_rb &= ~BIT(BCH_REBALANCE_data_checksum);
+ }
- if (!rb_ptrs)
- return;
+ if (durability < r.data_replicas || durability >= r.data_replicas + min_durability)
+ r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
+ if (!unwritten && r.erasure_code != ec)
+ r.need_rb |= BIT(BCH_REBALANCE_erasure_code);
+ return r;
+}
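
bch2_bkey_needs_rebalance() above sets the data_replicas bit from a durability window: rebalance is needed when total durability is below the target, or high enough that the least durable pointer could be dropped without going under it. A standalone sketch of just that check, with a plain array standing in for decoded pointers; replicas_need_rebalance() is a made-up helper:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static bool replicas_need_rebalance(const unsigned *durability, unsigned nr,
				    unsigned target_replicas)
{
	unsigned total = 0, min_d = UINT_MAX;

	for (unsigned i = 0; i < nr; i++) {
		total += durability[i];
		if (durability[i] < min_d)
			min_d = durability[i];
	}

	/* under-replicated, or over-replicated by at least one whole pointer */
	return total < target_replicas ||
	       total >= target_replicas + min_d;
}

int main(void)
{
	unsigned two_copies[]   = { 1, 1 };
	unsigned three_copies[] = { 1, 1, 1 };

	printf("%d\n", replicas_need_rebalance(two_copies, 2, 2));   /* 0: exactly right */
	printf("%d\n", replicas_need_rebalance(two_copies, 2, 3));   /* 1: under-replicated */
	printf("%d\n", replicas_need_rebalance(three_copies, 3, 2)); /* 1: one copy can go */
	return 0;
}
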
- ptr_idx = 1;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (rb_ptrs & ptr_idx)
- *sectors += p.crc.compressed_size;
- ptr_idx <<= 1;
+static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, bool *v)
+{
+ if (v && *v)
+ return 1;
+
+ /*
+ * If opts need to be propagated to the extent, a scan cookie should be
+ * present:
+ */
+ CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = k.k->type == KEY_TYPE_cookie;
+ if (v)
+ *v = ret;
+ return ret;
+}
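
check_rebalance_scan_cookie() caches a positive lookup in *v so later extents from the same inode skip the btree query, but a miss is never treated as final, since a cookie may be created afterwards. A toy version of that positive-only memoization, with slow_lookup() standing in for the btree iteration:

#include <stdbool.h>
#include <stdio.h>

static int lookups;

/* stand-in for the btree lookup; any slow, repeatable predicate works */
static bool slow_lookup(unsigned long inum)
{
	lookups++;
	return inum == 42;
}

static bool check_cookie_cached(unsigned long inum, bool *cache)
{
	/* only a cached positive short-circuits; false means "check again" */
	if (cache && *cache)
		return true;

	bool hit = slow_lookup(inum);
	if (cache)
		*cache = hit;
	return hit;
}

int main(void)
{
	bool cookie = false;

	check_cookie_cached(42, &cookie);
	check_cookie_cached(42, &cookie);	/* served from the cache */
	printf("slow lookups: %d\n", lookups);	/* prints 1 */
	return 0;
}
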
+
+static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bkey_s_c k,
+ struct bch_devs_mask *v)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (v && test_bit(ptr->dev, v->d))
+ return 1;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ int ret = check_rebalance_scan_cookie(trans, ptr->dev + 1, NULL);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+ if (v)
+ __set_bit(ptr->dev, v->d);
+ return ret;
+ }
}
+
+ return 0;
}
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+static bool bkey_has_ec(struct bkey_s_c k)
{
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- u64 sectors = 0;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
- bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, &sectors);
- return sectors;
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr)
+ return true;
+ return false;
}
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
- struct bch_inode_opts *opts,
- struct bkey_s_c k)
+static int new_needs_rb_allowed(struct btree_trans *trans,
+ struct per_snapshot_io_opts *s,
+ struct bkey_s_c k,
+ enum set_needs_rebalance_ctx ctx,
+ unsigned opt_change_cookie,
+ const struct bch_extent_rebalance *old,
+ const struct bch_extent_rebalance *new,
+ unsigned new_need_rb)
{
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- u64 sectors = 0;
+ struct bch_fs *c = trans->c;
+ /*
+ * New need_rb - pointers that don't match the current io path options -
+ * are only allowed in certain situations:
+ *
+ * Propagating new options: from bch2_set_rebalance_needs_scan
+ *
+ * Foreground writes: background_compression and background_target are
+ * allowed
+ *
+ * Foreground writes: we may have raced with an option change:
+ * opt_change_cookie checks for this
+ *
+ * XXX: foreground writes should still match compression,
+ * foreground_target - figure out how to check for this
+ */
+ if (ctx == SET_NEEDS_REBALANCE_opt_change ||
+ ctx == SET_NEEDS_REBALANCE_opt_change_indirect)
+ return 0;
+
+ if ((new_need_rb & BIT(BCH_REBALANCE_erasure_code)) &&
+ !bkey_has_ec(k)) {
+		/*
+		 * Foreground writes are not initially erasure coded - and we
+		 * may crash before a stripe is created
+		 */
+ new_need_rb &= ~BIT(BCH_REBALANCE_erasure_code);
+ }
+
+ if (ctx == SET_NEEDS_REBALANCE_foreground) {
+ new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)|
+ BIT(BCH_REBALANCE_background_target));
+
+ /*
+ * Foreground writes might end up degraded when a device is
+ * getting yanked:
+ *
+ * XXX: this is something we need to fix, but adding retries to
+ * the write path is something we have to do carefully.
+ */
+ new_need_rb &= ~BIT(BCH_REBALANCE_data_replicas);
+ if (!new_need_rb)
+ return 0;
+
+ if (opt_change_cookie != atomic_read(&c->opt_change_cookie))
+ return 0;
+ }
- bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
- return move_ptrs|compress_ptrs;
+ /*
+ * Either the extent data or the extent io options (from
+ * bch_extent_rebalance) should match the io_opts from the
+ * inode/filesystem, unless
+ *
+ * - There's a scan pending to propagate new options
+ * - It's an indirect extent: it may be referenced by inodes
+ * with inconsistent options
+ *
+ * For efficiency (so that we can cache checking for scan
+ * cookies), only check option consistency when we're called
+ * with snapshot_io_opts - don't bother when we're called from
+ * move_data_phys() -> get_io_opts_one()
+ *
+ * Note that we can cache the existence of a cookie, but not the
+ * non-existence, to avoid spurious false positives.
+ */
+ int ret = check_rebalance_scan_cookie(trans, 0, s ? &s->fs_scan_cookie : NULL) ?:
+ check_rebalance_scan_cookie(trans, k.k->p.inode, s ? &s->inum_scan_cookie : NULL);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ if (new_need_rb == BIT(BCH_REBALANCE_data_replicas)) {
+ ret = check_dev_rebalance_scan_cookie(trans, k, s ? &s->dev_cookie : NULL);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+
+ CLASS(printbuf, buf)();
+
+ prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ const struct bch_extent_rebalance _old = {};
+ if (!old)
+ old = &_old;
+
+#define x(_name) \
+ if (new_need_rb & BIT(BCH_REBALANCE_##_name)) \
+ prt_printf(&buf, "\n" #_name " %u != %u", old->_name, new->_name);
+ BCH_REBALANCE_OPTS()
+#undef x
+
+ fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf);
+fsck_err:
+ return ret;
}
-static inline bool bkey_should_have_rb_opts(struct bch_fs *c,
- struct bch_inode_opts *opts,
- struct bkey_s_c k)
+static inline bool bkey_should_have_rb_opts(struct bkey_s_c k,
+ struct bch_extent_rebalance new)
{
if (k.k->type == KEY_TYPE_reflink_v) {
-#define x(n) if (opts->n##_from_inode) return true;
+#define x(n) if (new.n##_from_inode) return true;
BCH_REBALANCE_OPTS()
#undef x
}
- return bch2_bkey_ptrs_need_rebalance(c, opts, k);
+ return new.need_rb;
}
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
+int bch2_bkey_set_needs_rebalance(struct btree_trans *trans,
+ struct per_snapshot_io_opts *snapshot_io_opts,
+ struct bch_inode_opts *opts,
struct bkey_i *_k,
enum set_needs_rebalance_ctx ctx,
- u32 change_cookie)
+ u32 opt_change_cookie)
{
if (!bkey_extent_is_direct_data(&_k->k))
return 0;
+ struct bch_fs *c = trans->c;
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *old =
(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
- if (bkey_should_have_rb_opts(c, opts, k.s_c)) {
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ unsigned csum_ptrs = 0;
+ struct bch_extent_rebalance new =
+ bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, &compress_ptrs, &csum_ptrs,
+ ctx == SET_NEEDS_REBALANCE_opt_change_indirect);
+
+ bool should_have_rb = bkey_should_have_rb_opts(k.s_c, new);
+
+ if (should_have_rb == !!old &&
+ (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old))
+ return 0;
+
+ unsigned new_need_rb = new.need_rb & ~(old ? old->need_rb : 0);
+
+ if (unlikely(new_need_rb)) {
+ int ret = new_needs_rb_allowed(trans, snapshot_io_opts,
+ k.s_c, ctx, opt_change_cookie,
+ old, &new, new_need_rb);
+ if (ret)
+ return ret;
+ }
+
+ if (should_have_rb) {
if (!old) {
old = bkey_val_end(k);
k.k->u64s += sizeof(*old) / sizeof(u64);
}
- *old = io_opts_to_rebalance_opts(c, opts);
- } else {
- if (old)
- extent_entry_drop(k, (union bch_extent_entry *) old);
- }
+ *old = new;
+ } else if (old)
+ extent_entry_drop(k, (union bch_extent_entry *) old);
return 0;
}
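
When the stored options differ, bch2_bkey_set_needs_rebalance() grows the value in place: bkey_val_end() points just past the current value, the new entry is written there, and k.k->u64s is bumped by its size in words. A simplified sketch of that append pattern on a toy key layout; struct toy_key is not the real bkey format:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* toy key: a u64s count followed by that many u64 words of value */
struct toy_key {
	uint8_t  u64s;
	uint64_t val[8];
};

struct toy_entry {
	uint64_t opts;		/* stand-in for bch_extent_rebalance */
};

/* append an entry at the end of the value, growing u64s to match */
static struct toy_entry *toy_key_append(struct toy_key *k, struct toy_entry e)
{
	struct toy_entry *end = (struct toy_entry *) &k->val[k->u64s];

	memcpy(end, &e, sizeof(e));
	k->u64s += sizeof(e) / sizeof(uint64_t);
	return end;
}

int main(void)
{
	struct toy_key k = { .u64s = 2, .val = { 0x11, 0x22 } };

	toy_key_append(&k, (struct toy_entry) { .opts = 0xabc });
	printf("u64s=%u last=%llx\n", k.u64s,
	       (unsigned long long) k.val[k.u64s - 1]);	/* u64s=3 last=abc */
	return 0;
}
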
static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *snapshot_io_opts,
struct bch_inode_opts *io_opts,
struct btree_iter *iter,
struct bkey_s_c k,
enum set_needs_rebalance_ctx ctx)
{
struct bch_fs *c = trans->c;
+ int ret = 0;
BUG_ON(iter->flags & BTREE_ITER_is_extents);
BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
@@ -269,36 +521,24 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
if (!bkey_extent_is_direct_data(k.k))
return 0;
- bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect;
+ struct bch_extent_rebalance *old =
+ (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k);
- /*
- * If it's an indirect extent, and we walked to it directly, we won't
- * have the options from the inode that were directly applied: options
- * from the extent take precedence - unless the io_opts option came from
- * the inode and may_update_indirect is true (walked from a
- * REFLINK_P_MAY_UPDATE_OPTIONS pointer).
- */
- const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
- if (old && k.k->type == KEY_TYPE_reflink_v) {
-#define x(_name) \
- if (old->_name##_from_inode && \
- !(may_update_indirect && io_opts->_name##_from_inode)) { \
- io_opts->_name = old->_name; \
- io_opts->_name##_from_inode = true; \
- }
- BCH_REBALANCE_OPTS()
-#undef x
- }
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ unsigned csum_ptrs = 0;
+ struct bch_extent_rebalance new =
+ bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, &compress_ptrs, &csum_ptrs,
+ ctx == SET_NEEDS_REBALANCE_opt_change_indirect);
- struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts);
+ bool should_have_rb = bkey_should_have_rb_opts(k, new);
- if (bkey_should_have_rb_opts(c, io_opts, k)
- ? old && !memcmp(old, &new, sizeof(new))
- : !old)
+ if (should_have_rb == !!old &&
+ (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old))
return 0;
struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
- int ret = PTR_ERR_OR_ZERO(n);
+ ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
@@ -306,7 +546,7 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
 	/* On successful transaction commit, @k was invalidated: */
- return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?:
+ return bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, io_opts, n, ctx, 0) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, 0) ?:
bch_err_throw(c, transaction_restart_commit);
@@ -349,7 +589,8 @@ static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans,
darray_push(&io_opts->d, e);
}));
- io_opts->cur_inum = extent_pos.inode;
+ io_opts->cur_inum = extent_pos.inode;
+ io_opts->inum_scan_cookie = false;
}
ret = ret ?: trans_was_restarted(trans, restart_count);
@@ -372,11 +613,13 @@ struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans,
enum set_needs_rebalance_ctx ctx)
{
struct bch_inode_opts *opts =
- bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k);
+ bch2_extent_get_io_opts(trans, snapshot_io_opts,
+ extent_pos, extent_iter, extent_k);
if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level)
return opts;
- int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx);
+ int ret = bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts,
+ extent_iter, extent_k, ctx);
return ret ? ERR_PTR(ret) : opts;
}
@@ -420,11 +663,9 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans,
if (ret || btree_iter_path(trans, extent_iter)->level)
return ret;
- return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx);
+ return bch2_get_update_rebalance_opts(trans, NULL, io_opts, extent_iter, extent_k, ctx);
}
-#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
-
static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
BCH_REBALANCE_STATES()
@@ -467,6 +708,11 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
return ret;
}
+int bch2_set_rebalance_needs_scan_device(struct bch_fs *c, unsigned dev)
+{
+ return bch2_set_rebalance_needs_scan(c, dev + 1);
+}
+
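
The device scan reuses the per-inode cookie space by encoding a device as inum = dev + 1: inum 0 stays the whole-filesystem scan, values below BCACHEFS_ROOT_INO mean a device, and real inodes start at BCACHEFS_ROOT_INO, as the dispatch in do_rebalance_scan() further down shows. A tiny sketch of that encoding; TOY_ROOT_INO assumes the usual BCACHEFS_ROOT_INO value of 4096:

#include <stdio.h>

#define TOY_ROOT_INO	4096	/* assumed value of BCACHEFS_ROOT_INO */

enum scan_kind { SCAN_FS, SCAN_DEVICE, SCAN_INODE };

static enum scan_kind classify_scan(unsigned long long inum, unsigned *dev)
{
	if (!inum)
		return SCAN_FS;
	if (inum < TOY_ROOT_INO) {
		*dev = inum - 1;
		return SCAN_DEVICE;
	}
	return SCAN_INODE;
}

int main(void)
{
	unsigned dev = 0;

	printf("%d\n", classify_scan(0, &dev));			/* SCAN_FS */
	printf("%d %u\n", classify_scan(3, &dev), dev);		/* SCAN_DEVICE, dev 2 */
	printf("%d\n", classify_scan(4096, &dev));		/* SCAN_INODE */
	return 0;
}
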
int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
return bch2_set_rebalance_needs_scan(c, 0);
@@ -535,21 +781,21 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
return &(&darray_pop(buf))->k_i;
}
-static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+static int extent_ec_pending(struct btree_trans *trans, struct bkey_ptrs_c ptrs)
{
- if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
- return 0;
+ struct bch_fs *c = trans->c;
- struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
+ guard(rcu)();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (!ca)
+ continue;
- extent_entry_drop(bkey_i_to_s(n),
- (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+ if (bch2_bucket_has_new_stripe(c, bucket_to_u64(bucket)))
+ return true;
+ }
+ return false;
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
@@ -570,6 +816,10 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
if (bkey_err(k))
return k;
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+ if (!r || !r->need_rb) /* Write buffer race? */
+ return bkey_s_c_null;
+
struct bch_inode_opts *opts =
bch2_extent_get_apply_io_opts(trans, snapshot_io_opts,
extent_iter->pos, extent_iter, k,
@@ -580,22 +830,86 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
*opts_ret = opts;
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ unsigned csum_ptrs = 0;
+ bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, false);
+
memset(data_opts, 0, sizeof(*data_opts));
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k);
+ data_opts->rewrite_ptrs = move_ptrs|compress_ptrs|csum_ptrs;
data_opts->target = opts->background_target;
data_opts->write_flags |= BCH_WRITE_only_specified_devs;
- if (!data_opts->rewrite_ptrs) {
- /*
- * device we would want to write to offline? devices in target
- * changed?
- *
- * We'll now need a full scan before this extent is picked up
- * again:
- */
- int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
- if (ret)
- return bkey_s_c_err(ret);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) {
+ unsigned durability = bch2_bkey_durability(c, k);
+ unsigned ptr_bit = 1;
+
+ guard(rcu)();
+ if (durability <= opts->data_replicas) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (ca && !ptr->cached && !ca->mi.durability)
+ data_opts->kill_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+
+ data_opts->extra_replicas = opts->data_replicas - durability;
+ } else {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+
+ if (d && durability - d >= opts->data_replicas) {
+ data_opts->kill_ptrs |= ptr_bit;
+ durability -= d;
+ }
+
+ ptr_bit <<= 1;
+ }
+
+ ptr_bit = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.has_ec && durability - p.ec.redundancy >= opts->data_replicas) {
+ data_opts->kill_ec_ptrs |= ptr_bit;
+ durability -= p.ec.redundancy;
+ }
+
+ ptr_bit <<= 1;
+ }
+ }
+ }
+
+ if (r->need_rb & BIT(BCH_REBALANCE_erasure_code)) {
+ if (opts->erasure_code) {
+ /* XXX: we'll need ratelimiting */
+ if (extent_ec_pending(trans, ptrs))
+ return bkey_s_c_null;
+
+ data_opts->extra_replicas = opts->data_replicas;
+ } else {
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.has_ec) {
+ data_opts->kill_ec_ptrs |= ptr_bit;
+ data_opts->extra_replicas += p.ec.redundancy;
+ }
+
+ ptr_bit <<= 1;
+ }
+ }
+ }
+
+ if (!data_opts->rewrite_ptrs &&
+ !data_opts->kill_ptrs &&
+ !data_opts->kill_ec_ptrs &&
+ !data_opts->extra_replicas) {
+ CLASS(printbuf, buf)();
+ prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "%s", buf.buf);
return bkey_s_c_null;
}
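
For over-replicated extents the code above drops pointers greedily, killing any pointer whose durability can be shed while staying at or above opts->data_replicas. A standalone sketch of that selection on plain durability arrays; pick_kill_ptrs() is illustrative, not bcachefs API:

#include <stdio.h>

/*
 * Pick pointers to drop, greedily, while total durability stays >= target.
 * Returns a bitmask of dropped pointers, mirroring data_opts->kill_ptrs.
 */
static unsigned pick_kill_ptrs(const unsigned *durability, unsigned nr,
			       unsigned target)
{
	unsigned total = 0, kill = 0;

	for (unsigned i = 0; i < nr; i++)
		total += durability[i];

	for (unsigned i = 0; i < nr; i++) {
		unsigned d = durability[i];

		if (d && total - d >= target) {
			kill |= 1U << i;
			total -= d;
		}
	}
	return kill;
}

int main(void)
{
	unsigned d[] = { 1, 2, 1 };	/* total durability 4 */

	/* target 2: drop ptr 0 (4 to 3) and ptr 2 (3 to 2), keep ptr 1 */
	printf("kill mask: %x\n", pick_kill_ptrs(d, 3, 2));	/* prints 5 */
	return 0;
}
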
@@ -605,12 +919,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- u64 sectors = 0;
-
- bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
-
if (move_ptrs) {
prt_str(&buf, "move=");
bch2_target_to_text(&buf, c, opts->background_target);
@@ -627,6 +935,14 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
prt_newline(&buf);
}
+ if (csum_ptrs) {
+ prt_str(&buf, "csum=");
+ bch2_prt_csum_opt(&buf, opts->data_checksum);
+ prt_str(&buf, " ");
+ bch2_prt_u64_base2(&buf, csum_ptrs);
+ prt_newline(&buf);
+ }
+
trace_rebalance_extent(c, buf.buf);
}
count_event(c, rebalance_extent);
@@ -688,8 +1004,75 @@ out:
return ret;
}
+static int do_rebalance_scan_bp(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct bkey_buf *last_flushed)
+{
+ if (bp.v->level) /* metadata not supported yet */
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
+ last_flushed);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!k.k)
+ return 0;
+
+ struct bch_inode_opts io_opts;
+ ret = bch2_extent_get_io_opts_one(trans, &io_opts, &iter, k,
+ SET_NEEDS_REBALANCE_opt_change);
+ bch2_trans_iter_exit(&iter);
+ return ret;
+}
+
+static int do_rebalance_scan_device(struct moving_context *ctxt,
+ unsigned dev, u64 cookie,
+ u64 *sectors_scanned)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ bch2_btree_write_buffer_flush_sync(trans);
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
+ POS(dev, 0), POS(dev, U64_MAX),
+ BTREE_ITER_prefetch, k, ({
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ do_rebalance_scan_bp(trans, bkey_s_c_to_backpointer(k), &last_flushed);
+ })) ?:
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_rebalance_needs_scan(trans, dev + 1, cookie));
+
+ *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen);
+ /*
+ * Ensure that the rebalance_work entries we created are seen by the
+ * next iteration of do_rebalance(), so we don't end up stuck in
+ * rebalance_wait():
+ */
+ *sectors_scanned += 1;
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ bch2_btree_write_buffer_flush_sync(trans);
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ return ret;
+}
+
static int do_rebalance_scan_indirect(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
+ struct per_snapshot_io_opts *snapshot_io_opts,
struct bch_inode_opts *opts)
{
u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
@@ -702,7 +1085,7 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans,
BTREE_ITER_not_extents, k, ({
if (bpos_ge(bkey_start_pos(k.k), POS(0, end)))
break;
- bch2_get_update_rebalance_opts(trans, opts, &iter, k,
+ bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, &iter, k,
SET_NEEDS_REBALANCE_opt_change_indirect);
}));
if (ret)
@@ -724,15 +1107,21 @@ static int do_rebalance_scan(struct moving_context *ctxt,
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
ctxt->stats = &r->scan_stats;
+ r->state = BCH_REBALANCE_scanning;
+
if (!inum) {
r->scan_start = BBPOS_MIN;
r->scan_end = BBPOS_MAX;
- } else {
+ } else if (inum >= BCACHEFS_ROOT_INO) {
r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
- }
+ } else {
+ unsigned dev = inum - 1;
+ r->scan_start = BBPOS(BTREE_ID_backpointers, POS(dev, 0));
+ r->scan_end = BBPOS(BTREE_ID_backpointers, POS(dev, U64_MAX));
- r->state = BCH_REBALANCE_scanning;
+ return do_rebalance_scan_device(ctxt, inum - 1, cookie, sectors_scanned);
+ }
int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
r->scan_start.pos, r->scan_end.pos,
@@ -750,7 +1139,8 @@ static int do_rebalance_scan(struct moving_context *ctxt,
(inum &&
k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)
- ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts)
+ ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k),
+ snapshot_io_opts, opts)
: 0);
}));
if (ret)
@@ -1049,6 +1439,7 @@ int bch2_fs_rebalance_init(struct bch_fs *c)
static int check_rebalance_work_one(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct btree_iter *rebalance_iter,
+ struct per_snapshot_io_opts *snapshot_io_opts,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
@@ -1089,8 +1480,7 @@ static int check_rebalance_work_one(struct btree_trans *trans,
extent_k.k = &deleted;
}
- bool should_have_rebalance =
- bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
+ bool should_have_rebalance = bch2_bkey_needs_rb(extent_k);
bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
if (should_have_rebalance != have_rebalance) {
@@ -1119,6 +1509,21 @@ static int check_rebalance_work_one(struct btree_trans *trans,
return ret;
}
+ struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans,
+ snapshot_io_opts, extent_iter->pos, extent_iter, extent_k,
+ SET_NEEDS_REBALANCE_other);
+ ret = PTR_ERR_OR_ZERO(opts);
+ if (ret == -BCH_ERR_transaction_restart_commit) {
+ /*
+ * If get_apply_io_opts() did work, just advance and check the
+ * next key; it may have updated the rebalance_work btree so
+ * we'd need a write buffer flush to check what it just did.
+ */
+ ret = 0;
+ }
+ if (ret)
+ return ret;
+
if (cmp <= 0)
bch2_btree_iter_advance(extent_iter);
if (cmp >= 0)
@@ -1131,10 +1536,14 @@ int bch2_check_rebalance_work(struct bch_fs *c)
{
CLASS(btree_trans, trans)(c);
CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_not_extents|
BTREE_ITER_prefetch);
CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN,
BTREE_ITER_prefetch);
+ struct per_snapshot_io_opts snapshot_io_opts;
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
+
struct bkey_buf last_flushed;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
@@ -1148,12 +1557,15 @@ int bch2_check_rebalance_work(struct bch_fs *c)
bch2_trans_begin(trans);
- ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
+ ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter,
+ &snapshot_io_opts, &last_flushed) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
}
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
bch2_bkey_buf_exit(&last_flushed, c);
return ret < 0 ? ret : 0;
}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 24bafa42f070..e6dd9a7db26c 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -7,10 +7,14 @@
#include "opts.h"
#include "rebalance_types.h"
+int bch2_extent_rebalance_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context,
+ const struct bch_extent_rebalance *);
+
static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
struct bch_inode_opts *opts)
{
- struct bch_extent_rebalance r = {
+ return (struct bch_extent_rebalance) {
.type = BIT(BCH_EXTENT_ENTRY_rebalance),
#define x(_name) \
._name = opts->_name, \
@@ -18,22 +22,36 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f
BCH_REBALANCE_OPTS()
#undef x
};
-
- if (r.background_target &&
- !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
- r.background_target = 0;
-
- return r;
};
void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *,
const struct bch_extent_rebalance *);
-int bch2_trigger_extent_rebalance(struct btree_trans *,
- struct bkey_s_c, struct bkey_s_c,
- enum btree_iter_update_trigger_flags);
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+
+static inline int bch2_bkey_needs_rb(struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+ return r ? r->need_rb : 0;
+}
+
+int __bch2_trigger_extent_rebalance(struct btree_trans *,
+ struct bkey_s_c, struct bkey_s_c,
+ unsigned, unsigned,
+ enum btree_iter_update_trigger_flags);
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
+static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
+{
+ unsigned old_r = bch2_bkey_needs_rb(old);
+ unsigned new_r = bch2_bkey_needs_rb(new);
+
+ return old_r != new_r ||
+ (old.k->size != new.k->size && (old_r|new_r))
+ ? __bch2_trigger_extent_rebalance(trans, old, new, old_r, new_r, flags)
+ : 0;
+}
enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_opt_change,
@@ -42,9 +60,6 @@ enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_other,
};
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *,
- struct bkey_i *, enum set_needs_rebalance_ctx, u32);
-
/* Inodes in different snapshots may have different IO options: */
struct snapshot_io_opts_entry {
u32 snapshot;
@@ -53,6 +68,10 @@ struct snapshot_io_opts_entry {
struct per_snapshot_io_opts {
u64 cur_inum;
+ bool fs_scan_cookie;
+ bool inum_scan_cookie;
+ struct bch_devs_mask dev_cookie;
+
struct bch_inode_opts fs_io_opts;
DARRAY(struct snapshot_io_opts_entry) d;
};
@@ -68,6 +87,10 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt
darray_exit(&io_opts->d);
}
+int bch2_bkey_set_needs_rebalance(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bch_inode_opts *,
+ struct bkey_i *, enum set_needs_rebalance_ctx, u32);
+
struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *,
struct per_snapshot_io_opts *, struct bpos,
struct btree_iter *, struct bkey_s_c,
@@ -82,6 +105,7 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *, struct bch_inode_opt
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_rebalance_needs_scan_device(struct bch_fs *, unsigned);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void bch2_rebalance_wakeup(struct bch_fs *c)
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
index ff9a1342a22b..d7a5f899e789 100644
--- a/fs/bcachefs/rebalance_format.h
+++ b/fs/bcachefs/rebalance_format.h
@@ -5,49 +5,76 @@
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
- unused:3,
+ unused:5,
+ hipri:1,
+ pending:1,
+ need_rb:5,
- promote_target_from_inode:1,
- erasure_code_from_inode:1,
+ data_replicas_from_inode:1,
data_checksum_from_inode:1,
+ erasure_code_from_inode:1,
background_compression_from_inode:1,
- data_replicas_from_inode:1,
background_target_from_inode:1,
+ promote_target_from_inode:1,
- promote_target:16,
- erasure_code:1,
+ data_replicas:3,
data_checksum:4,
- data_replicas:4,
+ erasure_code:1,
background_compression:8, /* enum bch_compression_opt */
- background_target:16;
+ background_target:12,
+ promote_target:12;
#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 background_target:16,
+ __u64 promote_target:12,
+ background_target:12,
background_compression:8,
- data_replicas:4,
- data_checksum:4,
erasure_code:1,
- promote_target:16,
+ data_checksum:4,
+ data_replicas:3,
+ promote_target_from_inode:1,
background_target_from_inode:1,
- data_replicas_from_inode:1,
background_compression_from_inode:1,
- data_checksum_from_inode:1,
erasure_code_from_inode:1,
- promote_target_from_inode:1,
+ data_checksum_from_inode:1,
+ data_replicas_from_inode:1,
- unused:3,
+ need_rb:5,
+ pending:1,
+ hipri:1,
+ unused:5,
type:6;
#endif
};
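
The reshuffled fields still have to pack into a single __u64. A throwaway compile-time check, not part of the patch, that the widths in the little-endian branch above sum to 64:

#include <assert.h>

/* widths copied from the little-endian branch of bch_extent_rebalance above */
enum {
	toy_rb_bits = 6 + 5 + 1 + 1 + 5 +	/* type, unused, hipri, pending, need_rb */
		      6 * 1 +			/* the six *_from_inode flags */
		      3 + 4 + 1 + 8 +		/* data_replicas, data_checksum, erasure_code, background_compression */
		      12 + 12,			/* background_target, promote_target */
};

static_assert(toy_rb_bits == 64, "bch_extent_rebalance fields must fill exactly one __u64");

int main(void) { return 0; }
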
/* subset of BCH_INODE_OPTS */
#define BCH_REBALANCE_OPTS() \
+ x(data_replicas) \
x(data_checksum) \
+ x(erasure_code) \
x(background_compression) \
+ x(background_target) \
+ x(promote_target)
+
+enum bch_rebalance_opts {
+#define x(n) BCH_REBALANCE_##n,
+ BCH_REBALANCE_OPTS()
+#undef x
+};
+
+#define BCH_REBALANCE_ACCOUNTING() \
x(data_replicas) \
- x(promote_target) \
+ x(data_checksum) \
+ x(erasure_code) \
+ x(background_compression) \
x(background_target) \
- x(erasure_code)
+ x(high_priority) \
+	x(pending)
+
+enum bch_rebalance_accounting_type {
+#define x(n) BCH_REBALANCE_ACCOUNTING_##n,
+ BCH_REBALANCE_ACCOUNTING()
+#undef x
+};
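
BCH_REBALANCE_OPTS() and BCH_REBALANCE_ACCOUNTING() are x-macro lists: the option names are written once and expanded into enums, bitfield copies and name tables elsewhere in the patch. A self-contained illustration of the pattern on a toy list:

#include <stdio.h>

#define TOY_OPTS()	\
	x(alpha)	\
	x(beta)		\
	x(gamma)

/* expand once into an enum ... */
enum toy_opt {
#define x(n) TOY_OPT_##n,
	TOY_OPTS()
#undef x
	TOY_OPT_NR,
};

/* ... and once into a matching name table */
static const char * const toy_opt_names[] = {
#define x(n) #n,
	TOY_OPTS()
#undef x
};

int main(void)
{
	for (unsigned i = 0; i < TOY_OPT_NR; i++)
		printf("%u: %s\n", i, toy_opt_names[i]);
	return 0;
}
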
#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
index b7eff904acdb..898caf943b34 100644
--- a/fs/bcachefs/replicas_format.h
+++ b/fs/bcachefs/replicas_format.h
@@ -17,7 +17,8 @@ struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
- __u8 devs[] __counted_by(nr_devs);
+ /* No counted_by: bch_replicas_cpu entries are all the size of the biggest entry */
+ __u8 devs[];
} __packed;
struct bch_sb_field_replicas {
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index bfd06fd5d506..66b7f19f0437 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -107,7 +107,10 @@
BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\
x(btree_node_accounting, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch)
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(rebalance_v2, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \
+ BCH_FSCK_ERR_extent_io_opts_not_set)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 77e3fc92e39b..010b2a86f1be 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -74,6 +74,7 @@ enum bch_fsck_flags {
x(btree_root_bad_min_key, 60, 0) \
x(btree_root_bad_max_key, 61, 0) \
x(btree_node_read_error, 62, FSCK_AUTOFIX) \
+ x(btree_node_topology_gap_between_nodes, 328, FSCK_AUTOFIX) \
x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
x(btree_node_topology_bad_root_min_key, 323, FSCK_AUTOFIX) \
@@ -159,6 +160,8 @@ enum bch_fsck_flags {
x(extent_ptrs_redundant_stripe, 139, 0) \
x(extent_ptrs_unwritten, 140, 0) \
x(extent_ptrs_written_and_unwritten, 141, 0) \
+ x(extent_rebalance_bad_pending, 331, 0) \
+ x(extent_rebalance_bad_hipri, 332, 0) \
x(ptr_to_invalid_device, 142, 0) \
x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \
x(ptr_to_duplicate_device, 143, 0) \
@@ -339,7 +342,9 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 328, 0)
+ x(extent_io_opts_not_set, 329, FSCK_AUTOFIX) \
+ x(extent_io_opts_unneeded, 330, FSCK_AUTOFIX) \
+ x(MAX, 333, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 5cd308a68035..534ba2c0f83a 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -564,15 +564,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
* successfully marked the filesystem dirty
*/
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
- goto err;
-
set_bit(BCH_FS_rw, &c->flags);
set_bit(BCH_FS_was_rw, &c->flags);
enumerated_ref_start(&c->writes);
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret) {
+ bch_err_msg(c, ret, "error starting journal reclaim thread");
+ goto err;
+ }
+
ret = bch2_copygc_start(c);
if (ret) {
bch_err_msg(c, ret, "error starting copygc thread");
@@ -852,7 +854,8 @@ int bch2_fs_init_rw(struct bch_fs *c)
bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_io_write_init(c) ?:
- bch2_fs_journal_init(&c->journal);
+ bch2_fs_journal_init(&c->journal) ?:
+ bch2_journal_reclaim_start(&c->journal);
if (ret)
return ret;
@@ -1971,6 +1974,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch_notice(ca, "%s", bch2_member_states[new_state]);
+ if (new_state == BCH_MEMBER_STATE_failed)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
scoped_guard(mutex, &c->sb_lock) {
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_STATE(m, new_state);
@@ -1980,6 +1986,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
+ if (new_state == BCH_MEMBER_STATE_failed)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
bch2_rebalance_wakeup(c);
return ret;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index ef6312c50f88..85ccc084ecd9 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -793,6 +793,9 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
bool is_sb = opt->get_sb || opt->get_member;
bool changed = false;
+ if (id == Opt_durability)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
if (is_sb) {
changed = bch2_opt_set_sb(c, ca, opt, v);
} else if (!ca) {
@@ -806,6 +809,9 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
if (!ca)
bch2_opt_set_by_id(&c->opts, id, v);
+ if (id == Opt_durability)
+ bch2_set_rebalance_needs_scan_device(c, ca->dev_idx);
+
if (changed)
bch2_opt_hook_post_set(c, ca, 0, id, v);
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 6c312fd9a447..c5d7be2eba03 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1339,11 +1339,6 @@ DEFINE_EVENT(fs_str, io_move_pred,
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, io_move_created_rebalance,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 555e0d8f3cf0..eb4222f0f34b 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -7,7 +7,6 @@
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
-#include <linux/min_heap.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
diff --git a/fs/bcachefs/vendor/min_heap.c b/fs/bcachefs/vendor/min_heap.c
new file mode 100644
index 000000000000..21f744549d66
--- /dev/null
+++ b/fs/bcachefs/vendor/min_heap.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "min_heap.h"
+
+void __bch2_min_heap_init(min_heap_char *heap, void *data, size_t size)
+{
+ __min_heap_init_inline(heap, data, size);
+}
+
+void *__bch2_min_heap_peek(struct min_heap_char *heap)
+{
+ return __min_heap_peek_inline(heap);
+}
+
+bool __bch2_min_heap_full(min_heap_char *heap)
+{
+ return __min_heap_full_inline(heap);
+}
+
+void __bch2_min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ __min_heap_sift_down_inline(heap, pos, elem_size, func, args);
+}
+
+void __bch2_min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ __min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+}
+
+void __bch2_min_heapify_all(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ __min_heapify_all_inline(heap, elem_size, func, args);
+}
+
+bool __bch2_min_heap_pop(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ return __min_heap_pop_inline(heap, elem_size, func, args);
+}
+
+void __bch2_min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ __min_heap_pop_push_inline(heap, element, elem_size, func, args);
+}
+
+bool __bch2_min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ return __min_heap_push_inline(heap, element, elem_size, func, args);
+}
+
+bool __bch2_min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ return __min_heap_del_inline(heap, elem_size, idx, func, args);
+}
diff --git a/fs/bcachefs/vendor/min_heap.h b/fs/bcachefs/vendor/min_heap.h
new file mode 100644
index 000000000000..865c8b33b11f
--- /dev/null
+++ b/fs/bcachefs/vendor/min_heap.h
@@ -0,0 +1,477 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MIN_HEAP_H
+#define _LINUX_MIN_HEAP_H
+
+#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+/*
+ * The Min Heap API provides utilities for managing min-heaps, a binary tree
+ * structure where each node's value is less than or equal to its children's
+ * values, ensuring the smallest element is at the root.
+ *
+ * Users should avoid directly calling functions prefixed with __min_heap_*().
+ * Instead, use the provided macro wrappers.
+ *
+ * For further details and examples, refer to Documentation/core-api/min_heap.rst.
+ */
+
+/**
+ * Data structure to hold a min-heap.
+ * @nr: Number of elements currently in the heap.
+ * @size: Maximum number of elements that can be held in current storage.
+ * @data: Pointer to the start of array holding the heap elements.
+ * @preallocated: Start of the static preallocated array holding the heap elements.
+ */
+#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \
+struct _name { \
+ size_t nr; \
+ size_t size; \
+ _type *data; \
+ _type preallocated[_nr]; \
+}
+
+#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0)
+
+typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char;
+
+#define __minheap_cast(_heap) (typeof((_heap)->data[0]) *)
+#define __minheap_obj_size(_heap) sizeof((_heap)->data[0])
+
+/**
+ * struct min_heap_callbacks - Data/functions to customise the min_heap.
+ * @less: Partial order function for this heap.
+ * @swp: Swap elements function.
+ */
+struct min_heap_callbacks {
+ bool (*less)(const void *lhs, const void *rhs, void *args);
+ void (*swp)(void *lhs, void *rhs, void *args);
+};
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be aligned too if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static __always_inline
+void swap_words_32(void *a, void *b, size_t n)
+{
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static __always_inline
+void swap_words_64(void *a, void *b, size_t n)
+{
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static __always_inline
+void swap_bytes(void *a, void *b, size_t n)
+{
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
+#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
+#define SWAP_BYTES ((void (*)(void *, void *, void *))2)
+
+/*
+ * Selects the appropriate swap function based on the element size.
+ */
+static __always_inline
+void *select_swap_func(const void *base, size_t size)
+{
+ if (is_aligned(base, size, 8))
+ return SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ return SWAP_WORDS_32;
+ else
+ return SWAP_BYTES;
+}
+
+static __always_inline
+void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args),
+ void *priv)
+{
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, priv);
+}
+
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought. Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2. But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit. That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+ i -= size;
+ i -= size & -(i & lsbit);
+ return i / 2;
+}
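
A quick userspace check that the branch-free byte-offset parent() above agrees with the plain index formula (j - 1) / 2, here with a deliberately non-power-of-two element size:

#include <stddef.h>
#include <stdio.h>

/* copy of the byte-offset parent computation above */
static size_t parent_off(size_t i, unsigned int lsbit, size_t size)
{
	i -= size;
	i -= size & -(i & lsbit);
	return i / 2;
}

int main(void)
{
	const size_t size = 12;
	const unsigned int lsbit = size & -size;

	for (size_t idx = 1; idx < 8; idx++) {
		size_t off = parent_off(idx * size, lsbit, size);

		printf("child %zu -> parent %zu (expect %zu)\n",
		       idx, off / size, (idx - 1) / 2);
	}
	return 0;
}
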
+
+/* Initialize a min-heap. */
+static __always_inline
+void __min_heap_init_inline(min_heap_char *heap, void *data, size_t size)
+{
+ heap->nr = 0;
+ heap->size = size;
+ if (data)
+ heap->data = data;
+ else
+ heap->data = heap->preallocated;
+}
+
+#define min_heap_init_inline(_heap, _data, _size) \
+ __min_heap_init_inline(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size)
+
+/* Get the minimum element from the heap. */
+static __always_inline
+void *__min_heap_peek_inline(struct min_heap_char *heap)
+{
+ return heap->nr ? heap->data : NULL;
+}
+
+#define min_heap_peek_inline(_heap) \
+ (__minheap_cast(_heap) \
+ __min_heap_peek_inline(container_of(&(_heap)->nr, min_heap_char, nr)))
+
+/* Check if the heap is full. */
+static __always_inline
+bool __min_heap_full_inline(min_heap_char *heap)
+{
+ return heap->nr == heap->size;
+}
+
+#define min_heap_full_inline(_heap) \
+ __min_heap_full_inline(container_of(&(_heap)->nr, min_heap_char, nr))
+
+/* Sift the element at pos down the heap. */
+static __always_inline
+void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ const unsigned long lsbit = elem_size & -elem_size;
+ void *data = heap->data;
+ void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+ /* pre-scale counters for performance */
+ size_t a = pos * elem_size;
+ size_t b, c, d;
+ size_t n = heap->nr * elem_size;
+
+ if (!swp)
+ swp = select_swap_func(data, elem_size);
+
+ /* Find the sift-down path all the way to the leaves. */
+ for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
+ b = func->less(data + c, data + d, args) ? c : d;
+
+ /* Special case for the last leaf with no sibling. */
+ if (d == n)
+ b = c;
+
+ /* Backtrack to the correct location. */
+ while (b != a && func->less(data + a, data + b, args))
+ b = parent(b, lsbit, elem_size);
+
+ /* Shift the element into its correct place. */
+ c = b;
+ while (b != a) {
+ b = parent(b, lsbit, elem_size);
+ do_swap(data + b, data + c, elem_size, swp, args);
+ }
+}
+
+#define min_heap_sift_down_inline(_heap, _pos, _func, _args) \
+ __min_heap_sift_down_inline(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \
+ __minheap_obj_size(_heap), _func, _args)
+
+/* Sift up ith element from the heap, O(log2(nr)). */
+static __always_inline
+void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ const unsigned long lsbit = elem_size & -elem_size;
+ void *data = heap->data;
+ void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+ /* pre-scale counters for performance */
+ size_t a = idx * elem_size, b;
+
+ if (!swp)
+ swp = select_swap_func(data, elem_size);
+
+ while (a) {
+ b = parent(a, lsbit, elem_size);
+ if (func->less(data + b, data + a, args))
+ break;
+ do_swap(data + a, data + b, elem_size, swp, args);
+ a = b;
+ }
+}
+
+#define min_heap_sift_up_inline(_heap, _idx, _func, _args) \
+ __min_heap_sift_up_inline(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _idx, _func, _args)
+
+/* Floyd's approach to heapification that is O(nr). */
+static __always_inline
+void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ ssize_t i;
+
+ for (i = heap->nr / 2 - 1; i >= 0; i--)
+ __min_heap_sift_down_inline(heap, i, elem_size, func, args);
+}
+
+#define min_heapify_all_inline(_heap, _func, _args) \
+ __min_heapify_all_inline(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _func, _args)
+
+/* Remove minimum element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+
+ if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+ return false;
+
+ /* Place last element at the root (position 0) and then sift down. */
+ heap->nr--;
+ memcpy(data, data + (heap->nr * elem_size), elem_size);
+ __min_heap_sift_down_inline(heap, 0, elem_size, func, args);
+
+ return true;
+}
+
+#define min_heap_pop_inline(_heap, _func, _args) \
+ __min_heap_pop_inline(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _func, _args)
+
+/*
+ * Remove the minimum element and then push the given element. The
+ * implementation performs 1 sift (O(log2(nr))) and is therefore more
+ * efficient than a pop followed by a push that does 2.
+ */
+static __always_inline
+void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ memcpy(heap->data, element, elem_size);
+ __min_heap_sift_down_inline(heap, 0, elem_size, func, args);
+}
+
+#define min_heap_pop_push_inline(_heap, _element, _func, _args) \
+ __min_heap_pop_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \
+ __minheap_obj_size(_heap), _func, _args)
+
+/* Push an element on to the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+ size_t pos;
+
+ if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap"))
+ return false;
+
+ /* Place at the end of data. */
+ pos = heap->nr;
+ memcpy(data + (pos * elem_size), element, elem_size);
+ heap->nr++;
+
+ /* Sift child at pos up. */
+ __min_heap_sift_up_inline(heap, elem_size, pos, func, args);
+
+ return true;
+}
+
+#define min_heap_push_inline(_heap, _element, _func, _args) \
+ __min_heap_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \
+ __minheap_obj_size(_heap), _func, _args)
+
+/* Remove ith element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args)
+{
+ void *data = heap->data;
+ void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+
+ if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+ return false;
+
+ if (!swp)
+ swp = select_swap_func(data, elem_size);
+
+ /* Place last element at the root (position 0) and then sift down. */
+ heap->nr--;
+ if (idx == heap->nr)
+ return true;
+ do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args);
+ __min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+ __min_heap_sift_down_inline(heap, idx, elem_size, func, args);
+
+ return true;
+}
+
+#define min_heap_del_inline(_heap, _idx, _func, _args) \
+ __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _idx, _func, _args)
+
+void __bch2_min_heap_init(min_heap_char *heap, void *data, size_t size);
+void *__bch2_min_heap_peek(struct min_heap_char *heap);
+bool __bch2_min_heap_full(min_heap_char *heap);
+void __bch2_min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heapify_all(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_pop(min_heap_char *heap, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+ const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+ const struct min_heap_callbacks *func, void *args);
+
+#define min_heap_init(_heap, _data, _size) \
+ __bch2_min_heap_init(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size)
+#define min_heap_peek(_heap) \
+ (__minheap_cast(_heap) __bch2_min_heap_peek(container_of(&(_heap)->nr, min_heap_char, nr)))
+#define min_heap_full(_heap) \
+ __bch2_min_heap_full(container_of(&(_heap)->nr, min_heap_char, nr))
+#define min_heap_sift_down(_heap, _pos, _func, _args) \
+ __bch2_min_heap_sift_down(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_sift_up(_heap, _idx, _func, _args) \
+ __bch2_min_heap_sift_up(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _idx, _func, _args)
+#define min_heapify_all(_heap, _func, _args) \
+ __bch2_min_heapify_all(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop(_heap, _func, _args) \
+ __bch2_min_heap_pop(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop_push(_heap, _element, _func, _args) \
+ __bch2_min_heap_pop_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_push(_heap, _element, _func, _args) \
+ __bch2_min_heap_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \
+ __minheap_obj_size(_heap), _func, _args)
+#define min_heap_del(_heap, _idx, _func, _args) \
+ __bch2_min_heap_del(container_of(&(_heap)->nr, min_heap_char, nr), \
+ __minheap_obj_size(_heap), _idx, _func, _args)
+
+#endif /* _LINUX_MIN_HEAP_H */
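
A rough usage sketch of the vendored API as declared above: define a heap type with DEFINE_MIN_HEAP(), supply a less() callback, and drive it with the *_inline() wrappers. The include path and pr_info() output are assumptions for illustration; this is not code from the patch:

#include "vendor/min_heap.h"	/* assumed path within fs/bcachefs, per the Makefile change */
#include <linux/printk.h>

DEFINE_MIN_HEAP(int, toy_int_heap);

static bool toy_less(const void *l, const void *r, void *args)
{
	return *(const int *) l < *(const int *) r;
}

static const struct min_heap_callbacks toy_cb = {
	.less	= toy_less,
	.swp	= NULL,		/* NULL: a word-wide swap is picked automatically */
};

static void toy_heap_demo(void)
{
	int buf[8];
	struct toy_int_heap heap;

	min_heap_init_inline(&heap, buf, 8);

	for (int i = 0; i < 5; i++) {
		int v = 5 - i;

		min_heap_push_inline(&heap, &v, &toy_cb, NULL);
	}

	while (heap.nr) {
		int *m = min_heap_peek_inline(&heap);

		pr_info("popped %d\n", *m);	/* 1, 2, 3, 4, 5 */
		min_heap_pop_inline(&heap, &toy_cb, NULL);
	}
}
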