-rw-r--r--  fs/bcachefs/Kconfig                   |   4
-rw-r--r--  fs/bcachefs/Makefile                  |   8
-rw-r--r--  fs/bcachefs/async_objs.c              |   4
-rw-r--r--  fs/bcachefs/bcachefs.h                |   2
-rw-r--r--  fs/bcachefs/bcachefs_format.h         |   3
-rw-r--r--  fs/bcachefs/closure.h                 |   5
-rw-r--r--  fs/bcachefs/data_update.c             |   8
-rw-r--r--  fs/bcachefs/disk_accounting.c         |   3
-rw-r--r--  fs/bcachefs/disk_accounting_format.h  |   8
-rw-r--r--  fs/bcachefs/errcode.h                 |   5
-rw-r--r--  fs/bcachefs/fast_list.c               |   3
-rw-r--r--  fs/bcachefs/fs.c                      |  14
-rw-r--r--  fs/bcachefs/movinggc.c                |  44
-rw-r--r--  fs/bcachefs/nocow_locking.c           |   3
-rw-r--r--  fs/bcachefs/opts.c                    |   8
-rw-r--r--  fs/bcachefs/opts.h                    |   1
-rw-r--r--  fs/bcachefs/rebalance.c               | 308
-rw-r--r--  fs/bcachefs/rebalance.h               |  36
-rw-r--r--  fs/bcachefs/rebalance_format.h        |  63
-rw-r--r--  fs/bcachefs/sb-downgrade.c            |   4
-rw-r--r--  fs/bcachefs/util.h                    |   2
-rw-r--r--  fs/bcachefs/vendor/closure.c          | 218
-rw-r--r--  fs/bcachefs/vendor/closure.h          | 490
23 files changed, 968 insertions(+), 276 deletions(-)
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 8cb2b9d5da96..5455412b2b75 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -3,14 +3,12 @@ config BCACHEFS_FS
tristate "bcachefs filesystem support (EXPERIMENTAL)"
depends on BLOCK
select EXPORTFS
- select CLOSURES
select CRC32
select CRC64
select FS_POSIX_ACL
select LZ4_COMPRESS
select LZ4_DECOMPRESS
select LZ4HC_COMPRESS
- select LZ4HC_DECOMPRESS
select ZLIB_DEFLATE
select ZLIB_INFLATE
select ZSTD_COMPRESS
@@ -22,10 +20,8 @@ config BCACHEFS_FS
select RAID6_PQ
select XOR_BLOCKS
select XXHASH
- select SRCU
select SYMBOLIC_ERRNAME
select MIN_HEAP
- select XARRAY_MULTI
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index bb2a80fba12b..1e87eee962ec 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -5,6 +5,7 @@ bcachefs-y := \
acl.o \
alloc_background.o \
alloc_foreground.o \
+ async_objs.o \
backpointers.o \
bkey.o \
bkey_methods.o \
@@ -41,6 +42,7 @@ bcachefs-y := \
extents.o \
extent_update.o \
eytzinger.o \
+ fast_list.o \
fs.o \
fs-ioctl.o \
fs-io.o \
@@ -96,10 +98,8 @@ bcachefs-y := \
two_state_shared_lock.o \
util.o \
varint.o \
- xattr.o
-
-bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += fast_list.o
-bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
+ xattr.o \
+ vendor/closure.o
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c
index ad04e5f0f056..bd935782c5f7 100644
--- a/fs/bcachefs/async_objs.c
+++ b/fs/bcachefs/async_objs.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+
/*
* Async obj debugging: keep asynchronous objects on (very fast) lists, make
* them visible in debugfs:
@@ -139,3 +141,5 @@ int bch2_fs_async_obj_init(struct bch_fs *c)
return 0;
}
+
+#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 83d6ab9c1a91..3ccca855f05e 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -196,7 +196,6 @@
#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
-#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/math64.h>
@@ -217,6 +216,7 @@
#include "bcachefs_format.h"
#include "btree_journal_iter_types.h"
+#include "closure.h"
#include "disk_accounting_types.h"
#include "errcode.h"
#include "fast_list.h"
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 090f11e122ad..d29bd684b137 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -707,8 +707,7 @@ struct bch_sb_field_ext {
x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
x(31bit_dirent_offset, BCH_VERSION(1, 30)) \
- x(btree_node_accounting, BCH_VERSION(1, 31)) \
- x(rebalance_v2, BCH_VERSION(1, 32))
+ x(btree_node_accounting, BCH_VERSION(1, 31))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/closure.h b/fs/bcachefs/closure.h
new file mode 100644
index 000000000000..d8d4c7093ce0
--- /dev/null
+++ b/fs/bcachefs/closure.h
@@ -0,0 +1,5 @@
+#include "vendor/closure.h"
+
+#define closure_wait bch2_closure_wait
+#define closure_return_sync bch2_closure_return_sync
+#define __closure_wake_up __bch2_closure_wake_up
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 894ba0944fea..62d5d17d681e 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -91,8 +91,10 @@ bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bke
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
list_empty(&ctxt->ios));
- if (!locked)
+ if (!locked) {
+ bch2_trans_unlock(ctxt->trans);
bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+ }
}
return true;
}
@@ -449,8 +451,8 @@ restart_drop_extra_replicas:
if (trace_data_update_enabled())
trace_data_update2(m, old, k, insert);
- if (bch2_bkey_needs_rb(bkey_i_to_s_c(insert)) &&
- !bch2_bkey_needs_rb(k))
+ if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
+ bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
trace_io_move_created_rebalance2(m, old, k, insert);
ret = bch2_trans_commit(trans, &op->res,
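
The cross-multiplied comparison in the hunk above tests whether the fraction of each key's sectors still needing rebalance grew, without doing a division. A worked example with assumed sizes (illustrative only, not taken from the patch):

/*
 * Assume insert is 64 sectors of which 32 still need rebalance, and k is
 * 128 sectors of which 32 still need rebalance:
 *
 *   bch2_bkey_sectors_need_rebalance(c, insert) * k.k->size      = 32 * 128 = 4096
 *   bch2_bkey_sectors_need_rebalance(c, k)      * insert->k.size = 32 * 64  = 2048
 *
 * 4096 > 2048 is equivalent to 32/64 > 32/128: the update raised the
 * proportion of rebalance work, so trace_io_move_created_rebalance2() fires.
 */
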
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 9da26e11446b..a99f821c6a1c 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -282,9 +282,6 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po
prt_str(out, "btree=");
bch2_btree_id_to_text(out, k->btree.id);
break;
- case BCH_DISK_ACCOUNTING_rebalance_work_v2:
- bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type);
- break;
}
}
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index 0b61d6100180..730a17ea4243 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -110,8 +110,7 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
x(snapshot, 5, 1) \
x(btree, 6, 3) \
x(rebalance_work, 7, 1) \
- x(inum, 8, 3) \
- x(rebalance_work_v2, 9, 1) \
+ x(inum, 8, 3)
enum disk_accounting_type {
#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
@@ -211,10 +210,6 @@ struct bch_acct_inum {
struct bch_acct_rebalance_work {
};
-struct bch_acct_rebalance_work_v2 {
- __u8 type;
-};
-
struct disk_accounting_pos {
union {
struct {
@@ -229,7 +224,6 @@ struct disk_accounting_pos {
struct bch_acct_btree btree;
struct bch_acct_rebalance_work rebalance_work;
struct bch_acct_inum inum;
- struct bch_acct_rebalance_work_v2 rebalance_work_v2;
} __packed;
} __packed;
struct bpos _pad;
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index c4344a1d6976..cbf1eedddad7 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -188,6 +188,11 @@
x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \
x(0, data_update_done) \
x(0, bkey_was_deleted) \
+ x(0, bucket_not_moveable) \
+ x(BCH_ERR_bucket_not_moveable, bucket_not_moveable_dev_not_rw) \
+ x(BCH_ERR_bucket_not_moveable, bucket_not_moveable_bucket_open) \
+ x(BCH_ERR_bucket_not_moveable, bucket_not_moveable_bp_mismatch) \
+ x(BCH_ERR_bucket_not_moveable, bucket_not_moveable_lru_race) \
x(BCH_ERR_data_update_done, data_update_done_would_block) \
x(BCH_ERR_data_update_done, data_update_done_unwritten) \
x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
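
The x(0, bucket_not_moveable) entry declares a new error class, and the four x(BCH_ERR_bucket_not_moveable, ...) entries declare sub-errors whose parent is that class. A minimal sketch of how such a class can be tested, assuming the existing bch2_err_matches() helper from this header (copygc_should_skip() is a hypothetical name, not part of the patch):

/* Hypothetical helper: any of the four sub-errors rolls up to the class. */
static inline bool copygc_should_skip(int ret)
{
	return bch2_err_matches(ret, BCH_ERR_bucket_not_moveable);
}
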
diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c
index 6be2a45be1dd..de2947cd2bcf 100644
--- a/fs/bcachefs/fast_list.c
+++ b/fs/bcachefs/fast_list.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
/*
* Fast, unordered lists
@@ -166,3 +167,5 @@ int fast_list_init(struct fast_list *l)
return -ENOMEM;
return 0;
}
+
+#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 958849c30071..f1849eb8327d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1521,6 +1521,7 @@ static const struct vm_operations_struct bch_vm_ops = {
.page_mkwrite = bch2_page_mkwrite,
};
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,17,0)
static int bch2_mmap_prepare(struct vm_area_desc *desc)
{
file_accessed(desc->file);
@@ -1528,6 +1529,15 @@ static int bch2_mmap_prepare(struct vm_area_desc *desc)
desc->vm_ops = &bch_vm_ops;
return 0;
}
+#else
+static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+
+ vma->vm_ops = &bch_vm_ops;
+ return 0;
+}
+#endif
/* Directories: */
@@ -1719,7 +1729,11 @@ static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,17,0)
.mmap_prepare = bch2_mmap_prepare,
+#else
+ .mmap = bch2_mmap,
+#endif
.get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync,
.splice_read = filemap_splice_read,
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index f36d60b8fb07..0f7e35684bc8 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -62,25 +62,38 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
- if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
+ /*
+ * Valid bucket?
+ *
+ * XXX: we should kill the LRU entry here if it's not
+ */
+ CLASS(bch2_dev_bucket_tryget, ca)(c, b->k.bucket);
+ if (!ca)
return 0;
- CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, b->k.bucket, BTREE_ITER_cached);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- CLASS(bch2_dev_bucket_tryget, ca)(c, k.k->p);
- if (!ca)
+ if (ca->mi.state != BCH_MEMBER_STATE_rw ||
+ !bch2_dev_is_online(ca)) {
+ bch_err_throw(c, bucket_not_moveable_dev_not_rw);
return 0;
+ }
- if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
+ /* Bucket still being written? */
+ if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) {
+ bch_err_throw(c, bucket_not_moveable_bucket_open);
return 0;
+ }
- if (ca->mi.state != BCH_MEMBER_STATE_rw ||
- !bch2_dev_is_online(ca))
+ /* We won't be able to evacuate it if there are missing backpointers */
+ if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) {
+ bch_err_throw(c, bucket_not_moveable_bp_mismatch);
return 0;
+ }
+
+ CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, b->k.bucket, BTREE_ITER_cached);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
@@ -88,7 +101,12 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
b->sectors = bch2_bucket_sectors_dirty(*a);
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
- return lru_idx && lru_idx <= time;
+ if (!lru_idx || lru_idx > time) {
+ bch_err_throw(c, bucket_not_moveable_lru_race);
+ return 0;
+ }
+
+ return true;
}
static void move_bucket_free(struct buckets_in_flight *list,
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
index 58cfd540c6d6..71b17f18e90c 100644
--- a/fs/bcachefs/nocow_locking.c
+++ b/fs/bcachefs/nocow_locking.c
@@ -2,11 +2,10 @@
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "closure.h"
#include "nocow_locking.h"
#include "util.h"
-#include <linux/closure.h>
-
bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
{
u64 dev_bucket = bucket_to_u64(bucket);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 365cce4a6b49..bd5faafc9aa7 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -103,13 +103,6 @@ static const char * const __bch2_fs_usage_types[] = {
#undef x
-static const char * const __bch2_rebalance_accounting_types[] = {
-#define x(n) #n,
- BCH_REBALANCE_ACCOUNTING()
-#undef x
- NULL
-};
-
static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
unsigned nr, const char *type, unsigned idx)
{
@@ -132,7 +125,6 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
-PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type);
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index de1ac235e929..6b9f18839345 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -34,7 +34,6 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
-void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type);
static inline const char *bch2_d_type_str(unsigned d_type)
{
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index f2d0040d043c..67d6a90e86ef 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -38,30 +38,15 @@ static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct b
return NULL;
}
-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}
-static const char * const rebalance_opts[] = {
-#define x(n) #n,
- BCH_REBALANCE_OPTS()
-#undef x
- NULL
-};
-
void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
const struct bch_extent_rebalance *r)
{
- prt_str(out, "need_rb=");
- prt_bitflags(out, rebalance_opts, r->need_rb);
-
- if (r->hipri)
- prt_str(out, " hipri");
- if (r->pending)
- prt_str(out, " pending");
-
- prt_printf(out, " replicas=%u", r->data_replicas);
+ prt_printf(out, "replicas=%u", r->data_replicas);
if (r->data_replicas_from_inode)
prt_str(out, " (inode)");
@@ -107,54 +92,32 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-/*
- * XXX: check in bkey_validate that if r->hipri or r->pending are set,
- * r->data_replicas are also set
- */
-
-static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance *r)
+int bch2_trigger_extent_rebalance(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
{
- if (!r)
- return 0;
- unsigned ret = r->need_rb;
+ struct bch_fs *c = trans->c;
+ int need_rebalance_delta = 0;
+ s64 need_rebalance_sectors_delta[1] = { 0 };
- if (r->hipri)
- ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority);
- if (r->pending) {
- ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending);
- ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_background_target);
- }
- return ret;
-}
+ s64 s = bch2_bkey_sectors_need_rebalance(c, old);
+ need_rebalance_delta -= s != 0;
+ need_rebalance_sectors_delta[0] -= s;
-int __bch2_trigger_extent_rebalance(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned old_r, unsigned new_r,
- enum btree_iter_update_trigger_flags flags)
-{
- int delta = (int) !!new_r - (int) !!old_r;
- if ((flags & BTREE_TRIGGER_transactional) && delta) {
+ s = bch2_bkey_sectors_need_rebalance(c, new);
+ need_rebalance_delta += s != 0;
+ need_rebalance_sectors_delta[0] += s;
+
+ if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, delta > 0);
+ new.k->p, need_rebalance_delta > 0);
if (ret)
return ret;
}
- delta = old.k->size == new.k->size
- ? old_r ^ new_r
- : old_r | new_r;
- while (delta) {
- unsigned c = __ffs(delta);
- delta ^= BIT(c);
-
- s64 v[1] = { 0 };
- if (old_r & BIT(c))
- v[0] -= (s64) old.k->size;
- if (new_r & BIT(c))
- v[0] += (s64) new.k->size;
-
+ if (need_rebalance_sectors_delta[0]) {
int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- v, rebalance_work_v2, c);
+ need_rebalance_sectors_delta, rebalance_work);
if (ret)
return ret;
}
@@ -162,45 +125,39 @@ int __bch2_trigger_extent_rebalance(struct btree_trans *trans,
return 0;
}
-static struct bch_extent_rebalance
-bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
- struct bch_inode_opts *opts,
- unsigned *move_ptrs,
- unsigned *compress_ptrs,
- bool may_update_indirect)
+static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_inode_opts *io_opts,
+ unsigned *move_ptrs,
+ unsigned *compress_ptrs,
+ u64 *sectors)
{
*move_ptrs = 0;
*compress_ptrs = 0;
+ *sectors = 0;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance) };
-
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return r;
-
- const struct bch_extent_rebalance *old_r = bch2_bkey_ptrs_rebalance_opts(ptrs);
- if (old_r) {
- r = *old_r;
- r.need_rb = 0;
- }
-#define x(_name) \
- if (k.k->type != KEY_TYPE_reflink_v || \
- may_update_indirect || \
- (!opts->_name##_from_inode && !r._name##_from_inode)) { \
- r._name = opts->_name; \
- r._name##_from_inode = opts->_name##_from_inode; \
- }
- BCH_REBALANCE_OPTS()
-#undef x
+ const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
+ if (!io_opts && !rb_opts)
+ return;
- unsigned compression_type = bch2_compression_opt_to_type(r.background_compression);
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return;
- bool incompressible = false, unwritten = false, ec = false;
- unsigned durability = 0, min_durability = INT_MAX;
+ unsigned compression_type =
+ bch2_compression_opt_to_type(io_opts
+ ? io_opts->background_compression
+ : rb_opts->background_compression);
+ unsigned target = io_opts
+ ? io_opts->background_target
+ : rb_opts->background_target;
+ if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target))
+ target = 0;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
+ bool incompressible = false, unwritten = false;
+
unsigned ptr_idx = 1;
guard(rcu)();
@@ -209,50 +166,72 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
unwritten |= p.ptr.unwritten;
if (!p.ptr.cached) {
- if (p.crc.compression_type != compression_type) {
+ if (p.crc.compression_type != compression_type)
*compress_ptrs |= ptr_idx;
- r.need_rb |= BIT(BCH_REBALANCE_background_compression);
- }
- if (r.background_target &&
- !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) {
+ if (target && !bch2_dev_in_target(c, p.ptr.dev, target))
*move_ptrs |= ptr_idx;
- r.need_rb |= BIT(BCH_REBALANCE_background_target);
- }
-
- unsigned d = bch2_extent_ptr_durability(c, &p);
- durability += d;
- min_durability = min(min_durability, d);
-
- ec |= p.has_ec;
}
ptr_idx <<= 1;
}
- if (unwritten || incompressible) {
+ if (unwritten)
*compress_ptrs = 0;
- r.need_rb &= ~BIT(BCH_REBALANCE_background_compression);
+ if (incompressible)
+ *compress_ptrs = 0;
+
+ unsigned rb_ptrs = *move_ptrs | *compress_ptrs;
+
+ if (!rb_ptrs)
+ return;
+
+ ptr_idx = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (rb_ptrs & ptr_idx)
+ *sectors += p.crc.compressed_size;
+ ptr_idx <<= 1;
}
- return r;
}
-static inline bool bkey_should_have_rb_opts(struct bkey_s_c k,
- struct bch_extent_rebalance new)
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ u64 sectors = 0;
+
+ bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, &sectors);
+ return sectors;
+}
+
+static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
+ struct bch_inode_opts *opts,
+ struct bkey_s_c k)
+{
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ u64 sectors = 0;
+
+ bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
+ return move_ptrs|compress_ptrs;
+}
+
+static inline bool bkey_should_have_rb_opts(struct bch_fs *c,
+ struct bch_inode_opts *opts,
+ struct bkey_s_c k)
{
if (k.k->type == KEY_TYPE_reflink_v) {
-#define x(n) if (new.n##_from_inode) return true;
+#define x(n) if (opts->n##_from_inode) return true;
BCH_REBALANCE_OPTS()
#undef x
}
- return new.need_rb;
+ return bch2_bkey_ptrs_need_rebalance(c, opts, k);
}
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c,
- struct bch_inode_opts *opts,
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
struct bkey_i *_k,
enum set_needs_rebalance_ctx ctx,
- u32 opt_change_cookie)
+ u32 change_cookie)
{
if (!bkey_extent_is_direct_data(&_k->k))
return 0;
@@ -261,27 +240,17 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c,
struct bch_extent_rebalance *old =
(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- struct bch_extent_rebalance new =
- bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, &compress_ptrs,
- ctx == SET_NEEDS_REBALANCE_opt_change_indirect);
-
- bool should_have_rb = bkey_should_have_rb_opts(k.s_c, new);
-
- if (should_have_rb == !!old &&
- (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old))
- return 0;
-
- if (should_have_rb) {
+ if (bkey_should_have_rb_opts(c, opts, k.s_c)) {
if (!old) {
old = bkey_val_end(k);
k.k->u64s += sizeof(*old) / sizeof(u64);
}
- *old = new;
- } else if (old)
- extent_entry_drop(k, (union bch_extent_entry *) old);
+ *old = io_opts_to_rebalance_opts(c, opts);
+ } else {
+ if (old)
+ extent_entry_drop(k, (union bch_extent_entry *) old);
+ }
return 0;
}
@@ -300,19 +269,32 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
if (!bkey_extent_is_direct_data(k.k))
return 0;
- struct bch_extent_rebalance *old =
- (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k);
+ bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect;
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- struct bch_extent_rebalance new =
- bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, &compress_ptrs,
- ctx == SET_NEEDS_REBALANCE_opt_change_indirect);
+ /*
+ * If it's an indirect extent, and we walked to it directly, we won't
+ * have the options from the inode that were directly applied: options
+ * from the extent take precedence - unless the io_opts option came from
+ * the inode and may_update_indirect is true (walked from a
+ * REFLINK_P_MAY_UPDATE_OPTIONS pointer).
+ */
+ const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
+ if (old && k.k->type == KEY_TYPE_reflink_v) {
+#define x(_name) \
+ if (old->_name##_from_inode && \
+ !(may_update_indirect && io_opts->_name##_from_inode)) { \
+ io_opts->_name = old->_name; \
+ io_opts->_name##_from_inode = true; \
+ }
+ BCH_REBALANCE_OPTS()
+#undef x
+ }
- bool should_have_rb = bkey_should_have_rb_opts(k, new);
+ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts);
- if (should_have_rb == !!old &&
- (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old))
+ if (bkey_should_have_rb_opts(c, io_opts, k)
+ ? old && !memcmp(old, &new, sizeof(new))
+ : !old)
return 0;
struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
@@ -324,7 +306,7 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
/* On successful transaction commit, @k was invalidated: */
- return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?:
+ return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, 0) ?:
bch_err_throw(c, transaction_restart_commit);
@@ -390,8 +372,7 @@ struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans,
enum set_needs_rebalance_ctx ctx)
{
struct bch_inode_opts *opts =
- bch2_extent_get_io_opts(trans, snapshot_io_opts,
- extent_pos, extent_iter, extent_k);
+ bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k);
if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level)
return opts;
@@ -554,6 +535,23 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
return &(&darray_pop(buf))->k_i;
}
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
+ return 0;
+
+ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ extent_entry_drop(bkey_i_to_s(n),
+ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+}
+
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct per_snapshot_io_opts *snapshot_io_opts,
struct bpos work_pos,
@@ -572,10 +570,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
if (bkey_err(k))
return k;
- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
- if (!r || !r->need_rb) /* Write buffer race? */
- return bkey_s_c_null;
-
struct bch_inode_opts *opts =
bch2_extent_get_apply_io_opts(trans, snapshot_io_opts,
extent_iter->pos, extent_iter, k,
@@ -586,23 +580,22 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
*opts_ret = opts;
- unsigned move_ptrs = 0;
- unsigned compress_ptrs = 0;
- bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, false);
-
memset(data_opts, 0, sizeof(*data_opts));
- data_opts->rewrite_ptrs = move_ptrs|compress_ptrs;
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k);
data_opts->target = opts->background_target;
data_opts->write_flags |= BCH_WRITE_only_specified_devs;
- if (!data_opts->rewrite_ptrs &&
- !data_opts->kill_ptrs &&
- !data_opts->kill_ec_ptrs &&
- !data_opts->extra_replicas) {
- CLASS(printbuf, buf)();
- prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n ");
- bch2_bkey_val_to_text(&buf, c, k);
- bch_err(c, "%s", buf.buf);
+ if (!data_opts->rewrite_ptrs) {
+ /*
+ * device we would want to write to offline? devices in target
+ * changed?
+ *
+ * We'll now need a full scan before this extent is picked up
+ * again:
+ */
+ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
return bkey_s_c_null;
}
@@ -612,6 +605,12 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ u64 sectors = 0;
+
+ bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
+
if (move_ptrs) {
prt_str(&buf, "move=");
bch2_target_to_text(&buf, c, opts->background_target);
@@ -1090,7 +1089,8 @@ static int check_rebalance_work_one(struct btree_trans *trans,
extent_k.k = &deleted;
}
- bool should_have_rebalance = bch2_bkey_needs_rb(extent_k);
+ bool should_have_rebalance =
+ bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
if (should_have_rebalance != have_rebalance) {
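
The rewritten bch2_trigger_extent_rebalance() earlier in this file derives both the rebalance_work btree bit and the accounting update from bch2_bkey_sectors_need_rebalance(). A worked example with assumed values (illustrative only):

/*
 * Suppose the old extent still had 128 sectors needing rebalance and the
 * new extent needs none:
 *
 *   s(old) = 128, s(new) = 0
 *   need_rebalance_delta            = (0 != 0) - (128 != 0) = -1
 *       -> the rebalance_work btree bit at new.k->p is cleared
 *   need_rebalance_sectors_delta[0] = 0 - 128 = -128
 *       -> the rebalance_work accounting counter drops by 128 sectors
 */
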
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index ae576d8af609..24bafa42f070 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -10,7 +10,7 @@
static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
struct bch_inode_opts *opts)
{
- return (struct bch_extent_rebalance) {
+ struct bch_extent_rebalance r = {
.type = BIT(BCH_EXTENT_ENTRY_rebalance),
#define x(_name) \
._name = opts->_name, \
@@ -18,36 +18,22 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f
BCH_REBALANCE_OPTS()
#undef x
};
+
+ if (r.background_target &&
+ !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
+ r.background_target = 0;
+
+ return r;
};
void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *,
const struct bch_extent_rebalance *);
-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
-
-static inline int bch2_bkey_needs_rb(struct bkey_s_c k)
-{
- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
- return r ? r->need_rb : 0;
-}
+int bch2_trigger_extent_rebalance(struct btree_trans *,
+ struct bkey_s_c, struct bkey_s_c,
+ enum btree_iter_update_trigger_flags);
-int __bch2_trigger_extent_rebalance(struct btree_trans *,
- struct bkey_s_c, struct bkey_s_c,
- unsigned, unsigned,
- enum btree_iter_update_trigger_flags);
-
-static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- unsigned old_r = bch2_bkey_needs_rb(old);
- unsigned new_r = bch2_bkey_needs_rb(new);
-
- return old_r != new_r ||
- (old.k->size != new.k->size && (old_r|new_r))
- ? __bch2_trigger_extent_rebalance(trans, old, new, old_r, new_r, flags)
- : 0;
-}
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_opt_change,
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
index d7a5f899e789..ff9a1342a22b 100644
--- a/fs/bcachefs/rebalance_format.h
+++ b/fs/bcachefs/rebalance_format.h
@@ -5,76 +5,49 @@
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
- unused:5,
- hipri:1,
- pending:1,
- need_rb:5,
+ unused:3,
- data_replicas_from_inode:1,
- data_checksum_from_inode:1,
+ promote_target_from_inode:1,
erasure_code_from_inode:1,
+ data_checksum_from_inode:1,
background_compression_from_inode:1,
+ data_replicas_from_inode:1,
background_target_from_inode:1,
- promote_target_from_inode:1,
- data_replicas:3,
- data_checksum:4,
+ promote_target:16,
erasure_code:1,
+ data_checksum:4,
+ data_replicas:4,
background_compression:8, /* enum bch_compression_opt */
- background_target:12,
- promote_target:12;
+ background_target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 promote_target:12,
- background_target:12,
+ __u64 background_target:16,
background_compression:8,
- erasure_code:1,
+ data_replicas:4,
data_checksum:4,
- data_replicas:3,
+ erasure_code:1,
+ promote_target:16,
- promote_target_from_inode:1,
background_target_from_inode:1,
+ data_replicas_from_inode:1,
background_compression_from_inode:1,
- erasure_code_from_inode:1,
data_checksum_from_inode:1,
- data_replicas_from_inode:1,
+ erasure_code_from_inode:1,
+ promote_target_from_inode:1,
- need_rb:5,
- pending:1,
- hipri:1,
- unused:5,
+ unused:3,
type:6;
#endif
};
/* subset of BCH_INODE_OPTS */
#define BCH_REBALANCE_OPTS() \
- x(data_replicas) \
x(data_checksum) \
- x(erasure_code) \
x(background_compression) \
- x(background_target) \
- x(promote_target)
-
-enum bch_rebalance_opts {
-#define x(n) BCH_REBALANCE_##n,
- BCH_REBALANCE_OPTS()
-#undef x
-};
-
-#define BCH_REBALANCE_ACCOUNTING() \
x(data_replicas) \
- x(data_checksum) \
- x(erasure_code) \
- x(background_compression) \
+ x(promote_target) \
x(background_target) \
- x(high_priority) \
- x(pending) \
-
-enum bch_rebalance_accounting_type {
-#define x(n) BCH_REBALANCE_ACCOUNTING_##n,
- BCH_REBALANCE_ACCOUNTING()
-#undef x
-};
+ x(erasure_code)
#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
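
The reshuffled bitfields still pack into exactly one 64-bit word: 6 + 3 + 6*1 + 16 + 1 + 4 + 4 + 8 + 16 = 64. A sanity check one could drop next to the struct (assumption: not part of the patch):

/* Illustrative only: the on-disk rebalance entry must stay exactly 64 bits. */
static_assert(sizeof(struct bch_extent_rebalance) == sizeof(__u64));
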
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 0d47e83c28a6..bfd06fd5d506 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -107,9 +107,7 @@
BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\
x(btree_node_accounting, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(rebalance_v2, \
- BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work))
+ BCH_FSCK_ERR_accounting_mismatch)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 52ac8230be9f..555e0d8f3cf0 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -4,7 +4,6 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/closure.h>
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
@@ -21,6 +20,7 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include "closure.h"
#include "mean_and_variance.h"
#include "darray.h"
diff --git a/fs/bcachefs/vendor/closure.c b/fs/bcachefs/vendor/closure.c
new file mode 100644
index 000000000000..bdafd3a57386
--- /dev/null
+++ b/fs/bcachefs/vendor/closure.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "closure.h"
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/sched/debug.h>
+
+static void closure_val_checks(struct closure *cl, unsigned new, int d)
+{
+ unsigned count = new & CLOSURE_REMAINING_MASK;
+
+ if (WARN(new & CLOSURE_GUARD_MASK,
+ "closure %ps has guard bits set: %x (%u), delta %i",
+ cl->fn,
+ new, (unsigned) __fls(new & CLOSURE_GUARD_MASK), d))
+ new &= ~CLOSURE_GUARD_MASK;
+
+ WARN(!count && (new & ~(CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING)),
+ "closure %ps ref hit 0 with incorrect flags set: %x (%u)",
+ cl->fn,
+ new, (unsigned) __fls(new));
+}
+
+enum new_closure_state {
+ CLOSURE_normal_put,
+ CLOSURE_requeue,
+ CLOSURE_done,
+};
+
+/* For clearing flags with the same atomic op as a put */
+void bch2_closure_sub(struct closure *cl, int v)
+{
+ enum new_closure_state s;
+ struct task_struct *sleeper;
+
+ /* rcu_read_lock, atomic_read_acquire() are both for cl->sleeper: */
+ guard(rcu)();
+
+ int old = atomic_read_acquire(&cl->remaining), new;
+ do {
+ new = old - v;
+
+ if (new & CLOSURE_REMAINING_MASK) {
+ s = CLOSURE_normal_put;
+ } else {
+ if ((cl->fn || (new & CLOSURE_SLEEPING)) &&
+ !(new & CLOSURE_DESTRUCTOR)) {
+ s = CLOSURE_requeue;
+ new += CLOSURE_REMAINING_INITIALIZER;
+ } else
+ s = CLOSURE_done;
+
+ sleeper = new & CLOSURE_SLEEPING ? cl->sleeper : NULL;
+ new &= ~CLOSURE_SLEEPING;
+ }
+
+ closure_val_checks(cl, new, -v);
+ } while (!atomic_try_cmpxchg_release(&cl->remaining, &old, new));
+
+ if (s == CLOSURE_normal_put)
+ return;
+
+ if (sleeper) {
+ smp_mb();
+ wake_up_process(sleeper);
+ return;
+ }
+
+ if (s == CLOSURE_requeue) {
+ closure_queue(cl);
+ } else {
+ struct closure *parent = cl->parent;
+ closure_fn *destructor = cl->fn;
+
+ closure_debug_destroy(cl);
+
+ if (destructor)
+ destructor(&cl->work);
+
+ if (parent)
+ closure_put(parent);
+ }
+}
+
+/*
+ * closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
+void __bch2_closure_wake_up(struct closure_waitlist *wait_list)
+{
+ struct llist_node *list;
+ struct closure *cl, *t;
+ struct llist_node *reverse = NULL;
+
+ list = llist_del_all(&wait_list->list);
+
+ /* We first reverse the list to preserve FIFO ordering and fairness */
+ reverse = llist_reverse_order(list);
+
+ /* Then do the wakeups */
+ llist_for_each_entry_safe(cl, t, reverse, list) {
+ closure_set_waiting(cl, 0);
+ bch2_closure_sub(cl, CLOSURE_WAITING + 1);
+ }
+}
+
+/**
+ * closure_wait - add a closure to a waitlist
+ * @waitlist: will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ * @cl: closure pointer.
+ *
+ */
+bool bch2_closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
+{
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+ return false;
+
+ closure_set_waiting(cl, _RET_IP_);
+ unsigned r = atomic_add_return(CLOSURE_WAITING + 1, &cl->remaining);
+ closure_val_checks(cl, r, CLOSURE_WAITING + 1);
+
+ llist_add(&cl->list, &waitlist->list);
+
+ return true;
+}
+
+void __sched __bch2_closure_sync(struct closure *cl)
+{
+ cl->sleeper = current;
+ bch2_closure_sub(cl,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_SLEEPING);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+ break;
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+}
+
+/*
+ * closure_return_sync - finish running a closure, synchronously (i.e. waiting
+ * for outstanding get()s to finish) and returning once closure refcount is 0.
+ *
+ * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent
+ * closure_get_not_zero() calls will fail.
+ */
+void __sched bch2_closure_return_sync(struct closure *cl)
+{
+ cl->sleeper = current;
+ bch2_closure_sub(cl,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_DESTRUCTOR -
+ CLOSURE_SLEEPING);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+ break;
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (cl->parent)
+ closure_put(cl->parent);
+}
+
+int __sched __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+ int ret = 0;
+
+ cl->sleeper = current;
+ bch2_closure_sub(cl,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_SLEEPING);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ /*
+ * Carefully undo the continue_at() - but only if it
+ * hasn't completed, i.e. the final closure_put() hasn't
+ * happened yet:
+ */
+ unsigned old = atomic_read(&cl->remaining), new;
+ if (!(old & CLOSURE_SLEEPING))
+ goto success;
+
+ if (!timeout) {
+ do {
+ if (!(old & CLOSURE_SLEEPING))
+ goto success;
+
+ new = old + CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING;
+ closure_val_checks(cl, new, CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING);
+ } while (!atomic_try_cmpxchg(&cl->remaining, &old, new));
+
+ ret = -ETIME;
+ break;
+ }
+
+ timeout = schedule_timeout(timeout);
+ }
+success:
+ __set_current_state(TASK_RUNNING);
+ return ret;
+}
diff --git a/fs/bcachefs/vendor/closure.h b/fs/bcachefs/vendor/closure.h
new file mode 100644
index 000000000000..79112efe30a7
--- /dev/null
+++ b/fs/bcachefs/vendor/closure.h
@@ -0,0 +1,490 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ * continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, requires a 'return' immediately following the
+ * location where this macro is referenced, to return to the calling function.
+ * There's good reason for this.
+ *
+ * To use closures safely asynchronously, they must always have a refcount while
+ * they are running, owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio)
+ * {
+ * closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ */
+
+struct closure;
+struct closure_syncer;
+typedef void (closure_fn) (struct work_struct *);
+extern struct dentry *bcache_debug;
+
+struct closure_waitlist {
+ struct llist_head list;
+};
+
+enum closure_state {
+ /*
+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+ * the thread that owns the closure, and cleared by the thread that's
+ * waking up the closure.
+ *
+ * The rest are for debugging and don't affect behaviour:
+ *
+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+ * closure_init() and when closure_put() runs the next function), and
+ * must be cleared before remaining hits 0. Primarily to help guard
+ * against incorrect usage and accidentally transferring references.
+ * continue_at() and closure_return() clear it for you, if you're doing
+ * something unusual you can use closure_set_dead() which also helps
+ * annotate where references are being transferred.
+ */
+
+ CLOSURE_BITS_START = (1U << 24),
+ CLOSURE_DESTRUCTOR = (1U << 24),
+ CLOSURE_SLEEPING = (1U << 26),
+ CLOSURE_WAITING = (1U << 28),
+ CLOSURE_RUNNING = (1U << 30),
+};
+
+#define CLOSURE_GUARD_MASK \
+ (((CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)|(CLOSURE_BITS_START >> 1))
+
+#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
+
+struct closure {
+ union {
+ struct {
+ struct workqueue_struct *wq;
+ struct task_struct *sleeper;
+ struct llist_node list;
+ closure_fn *fn;
+ };
+ struct work_struct work;
+ };
+
+ struct closure *parent;
+
+ atomic_t remaining;
+
+#ifdef CONFIG_DEBUG_CLOSURES
+#define CLOSURE_MAGIC_DEAD 0xc054dead
+#define CLOSURE_MAGIC_ALIVE 0xc054a11e
+#define CLOSURE_MAGIC_STACK 0xc05451cc
+
+ unsigned int magic;
+ struct list_head all;
+ unsigned long ip;
+ unsigned long waiting_on;
+#endif
+};
+
+void bch2_closure_sub(struct closure *cl, int v);
+void __bch2_closure_wake_up(struct closure_waitlist *list);
+bool bch2_closure_wait(struct closure_waitlist *list, struct closure *cl);
+void __bch2_closure_sync(struct closure *cl);
+
+/*
+ * closure_put - decrement a closure's refcount
+ */
+static inline void closure_put(struct closure *cl)
+{
+ bch2_closure_sub(cl, 1);
+}
+
+static inline unsigned closure_nr_remaining(struct closure *cl)
+{
+ return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
+}
+
+/**
+ * closure_sync - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+ if (closure_nr_remaining(cl) > 1)
+ __bch2_closure_sync(cl);
+}
+
+int __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout);
+
+static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+ return closure_nr_remaining(cl) > 1
+ ? __bch2_closure_sync_timeout(cl, timeout)
+ : 0;
+}
+
+//#ifdef CONFIG_DEBUG_CLOSURES
+#if 0
+
+void bch2_closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void bch2_closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->waiting_on = f;
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+ atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+ struct workqueue_struct *wq)
+{
+ closure_set_ip(cl);
+ cl->fn = fn;
+ cl->wq = wq;
+}
+
+static inline void closure_queue(struct closure *cl)
+{
+ struct workqueue_struct *wq = cl->wq;
+ /*
+ * Changes made to closure, work_struct, or a couple of other structs
+ * may cause work.func to point to the wrong location.
+ */
+ BUILD_BUG_ON(offsetof(struct closure, fn)
+ != offsetof(struct work_struct, func));
+
+ if (wq) {
+ INIT_WORK(&cl->work, cl->work.func);
+ BUG_ON(!queue_work(wq, &cl->work));
+ } else
+ cl->fn(&cl->work);
+}
+
+/**
+ * closure_get - increment a closure's refcount
+ */
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ BUG_ON((atomic_inc_return(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) <= 1);
+#else
+ atomic_inc(&cl->remaining);
+#endif
+}
+
+/**
+ * closure_get_not_zero - take a ref on @cl unless its refcount has already hit 0
+ */
+static inline bool closure_get_not_zero(struct closure *cl)
+{
+ unsigned old = atomic_read(&cl->remaining);
+ do {
+ if (!(old & CLOSURE_REMAINING_MASK))
+ return false;
+
+ } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1));
+
+ return true;
+}
+
+/**
+ * closure_init - Initialize a closure, setting the refcount to 1
+ * @cl: closure to initialize
+ * @parent: parent of the new closure. cl will take a refcount on it for its
+ * lifetime; may be NULL.
+ */
+static inline void closure_init(struct closure *cl, struct closure *parent)
+{
+ cl->fn = NULL;
+ cl->parent = parent;
+ if (parent)
+ closure_get(parent);
+
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+
+ bch2_closure_debug_create(cl);
+ closure_set_ip(cl);
+}
+
+static inline void closure_init_stack(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->magic = CLOSURE_MAGIC_STACK;
+#endif
+}
+
+static inline void closure_init_stack_release(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->magic = CLOSURE_MAGIC_STACK;
+#endif
+}
+
+/**
+ * closure_wake_up - wake up all closures on a wait list,
+ * with memory barrier
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+ /* Memory barrier for the wait list */
+ smp_mb();
+ __bch2_closure_wake_up(list);
+}
+
+#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws)
+#define closure_type(name, type, member) \
+ struct closure *cl = container_of(ws, struct closure, work); \
+ type *name = container_of(cl, type, member)
+
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * Note you are expected to immediately return after using this macro: after
+ * calling continue_at() you no longer have a ref on @cl, and whatever @cl owns
+ * may be freed out from under you - a running closure fn has a ref on its own
+ * closure, which continue_at() drops.
+ */
+#define continue_at(_cl, _fn, _wq) \
+do { \
+ set_closure_fn(_cl, _fn, _wq); \
+ bch2_closure_sub(_cl, CLOSURE_RUNNING + 1); \
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
+#define closure_return(_cl) continue_at((_cl), NULL, NULL)
+
+void bch2_closure_return_sync(struct closure *cl);
+
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
+#define continue_at_nobarrier(_cl, _fn, _wq) \
+do { \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_queue(_cl); \
+} while (0)
+
+/**
+ * closure_return_with_destructor - finish execution of a closure,
+ * with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
+#define closure_return_with_destructor(_cl, _destructor) \
+do { \
+ set_closure_fn(_cl, _destructor, NULL); \
+ bch2_closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
+} while (0)
+
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
+static inline void closure_call(struct closure *cl, closure_fn fn,
+ struct workqueue_struct *wq,
+ struct closure *parent)
+{
+ closure_init(cl, parent);
+ continue_at_nobarrier(cl, fn, wq);
+}
+
+#define __closure_wait_event(waitlist, _cond) \
+do { \
+ struct closure cl; \
+ \
+ closure_init_stack(&cl); \
+ \
+ while (1) { \
+ bch2_closure_wait(waitlist, &cl); \
+ if (_cond) \
+ break; \
+ closure_sync(&cl); \
+ } \
+ closure_wake_up(waitlist); \
+ closure_sync(&cl); \
+} while (0)
+
+#define closure_wait_event(waitlist, _cond) \
+do { \
+ if (!(_cond)) \
+ __closure_wait_event(waitlist, _cond); \
+} while (0)
+
+#define __closure_wait_event_timeout(waitlist, _cond, _until) \
+({ \
+ struct closure cl; \
+ long _t; \
+ \
+ closure_init_stack(&cl); \
+ \
+ while (1) { \
+ bch2_closure_wait(waitlist, &cl); \
+ if (_cond) { \
+ _t = max_t(long, 1L, _until - jiffies); \
+ break; \
+ } \
+ _t = max_t(long, 0L, _until - jiffies); \
+ if (!_t) \
+ break; \
+ closure_sync_timeout(&cl, _t); \
+ } \
+ closure_wake_up(waitlist); \
+ closure_sync(&cl); \
+ _t; \
+})
+
+/*
+ * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if
+ * condition became true
+ */
+#define closure_wait_event_timeout(waitlist, _cond, _timeout) \
+({ \
+ unsigned long _until = jiffies + _timeout; \
+ (_cond) \
+ ? max_t(long, 1L, _until - jiffies) \
+ : __closure_wait_event_timeout(waitlist, _cond, _until);\
+})
+
+#endif /* _LINUX_CLOSURE_H */
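
For reference, a minimal usage sketch of the vendored API, following the pattern the header comment documents; it assumes "closure.h" is included, and struct foo, foo_start() and foo_write_done() are hypothetical names, not part of the patch:

/* Hypothetical example mirroring the header's own foo_endio() pattern. */
struct foo {
	struct closure	cl;
};

static CLOSURE_CALLBACK(foo_write_done)
{
	closure_type(f, struct foo, cl);

	pr_debug("%p: all outstanding I/O completed\n", f);
	closure_return(cl);	/* drops the ref on the parent, if any */
}

static void foo_start(struct foo *f, struct closure *parent,
		      struct workqueue_struct *wq)
{
	closure_init(&f->cl, parent);	/* refcount starts at 1 */

	closure_get(&f->cl);		/* one ref per async operation... */
	/* ...submit work whose completion calls closure_put(&f->cl) */

	/* drop our ref; foo_write_done() runs out of @wq once the count hits 0 */
	continue_at(&f->cl, foo_write_done, wq);
}
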