16 files changed, 922 insertions, 29 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 8b846c09350b..5455412b2b75 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -3,7 +3,6 @@ config BCACHEFS_FS
 	tristate "bcachefs filesystem support (EXPERIMENTAL)"
 	depends on BLOCK
 	select EXPORTFS
-	select CLOSURES
 	select CRC32
 	select CRC64
 	select FS_POSIX_ACL
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index a4258615dffa..1e87eee962ec 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -98,7 +98,8 @@ bcachefs-y		:=	\
 	two_state_shared_lock.o	\
 	util.o			\
 	varint.o		\
-	xattr.o
+	xattr.o			\
+	vendor/closure.o
 
 obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST)   += mean_and_variance_test.o
 
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 83d6ab9c1a91..3ccca855f05e 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -196,7 +196,6 @@
 #include <linux/backing-dev-defs.h>
 #include <linux/bug.h>
 #include <linux/bio.h>
-#include <linux/closure.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/math64.h>
@@ -217,6 +216,7 @@
 
 #include "bcachefs_format.h"
 #include "btree_journal_iter_types.h"
+#include "closure.h"
 #include "disk_accounting_types.h"
 #include "errcode.h"
 #include "fast_list.h"
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 59638d09e1fd..3b1d694dcb3a 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -15,6 +15,7 @@
 
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
+#include <linux/seq_buf.h>
 #include <linux/swap.h>
 
 const char * const bch2_btree_node_flags[] = {
@@ -565,6 +566,19 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 	return btree_cache_can_free(list);
 }
 
+static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
+{
+	struct btree_cache_list *list = shrink->private_data;
+	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+	char *cbuf;
+	size_t buflen = seq_buf_get_buf(s, &cbuf);
+	struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
+
+	bch2_btree_cache_to_text(&out, bc);
+	seq_buf_commit(s, out.pos);
+}
+
 void bch2_fs_btree_cache_exit(struct bch_fs *c)
 {
 	struct btree_cache *bc = &c->btree_cache;
@@ -659,6 +673,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 	bc->live[0].shrink	= shrink;
 	shrink->count_objects	= bch2_btree_cache_count;
 	shrink->scan_objects	= bch2_btree_cache_scan;
+	shrink->to_text		= bch2_btree_cache_shrinker_to_text;
 	shrink->seeks		= 2;
 	shrink->private_data	= &bc->live[0];
 	shrinker_register(shrink);
@@ -669,6 +684,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 	bc->live[1].shrink	= shrink;
 	shrink->count_objects	= bch2_btree_cache_count;
 	shrink->scan_objects	= bch2_btree_cache_scan;
+	shrink->to_text		= bch2_btree_cache_shrinker_to_text;
 	shrink->seeks		= 8;
 	shrink->private_data	= &bc->live[1];
 	shrinker_register(shrink);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 4890cbc88e7c..e3336ab27ccc 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -13,6 +13,7 @@
 #include "trace.h"
 
 #include <linux/sched/mm.h>
+#include <linux/seq_buf.h>
 
 static inline bool btree_uses_pcpu_readers(enum btree_id id)
 {
@@ -808,6 +809,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 {
 }
 
+static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
+{
+	struct bch_fs *c = shrink->private_data;
+	struct btree_key_cache *bc = &c->btree_key_cache;
+	char *cbuf;
+	size_t buflen = seq_buf_get_buf(s, &cbuf);
+	struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
+
+	bch2_btree_key_cache_to_text(&out, bc);
+	seq_buf_commit(s, out.pos);
+}
+
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 {
 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
@@ -832,6 +845,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 	bc->shrink = shrink;
 	shrink->count_objects	= bch2_btree_key_cache_count;
 	shrink->scan_objects	= bch2_btree_key_cache_scan;
+	shrink->to_text		= bch2_btree_key_cache_shrinker_to_text;
 	shrink->batch		= 1 << 14;
 	shrink->seeks		= 0;
 	shrink->private_data	= c;
diff --git a/fs/bcachefs/closure.h b/fs/bcachefs/closure.h
new file mode 100644
index 000000000000..d8d4c7093ce0
--- /dev/null
+++ b/fs/bcachefs/closure.h
@@ -0,0 +1,5 @@
+#include "vendor/closure.h"
+
+#define closure_wait		bch2_closure_wait
+#define closure_return_sync	bch2_closure_return_sync
+#define __closure_wake_up	__bch2_closure_wake_up
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 155c1ad42fc1..62d5d17d681e 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -91,8 +91,10 @@ bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bke
 		move_ctxt_wait_event(ctxt,
 				     (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
 				     list_empty(&ctxt->ios));
-		if (!locked)
+		if (!locked) {
+			bch2_trans_unlock(ctxt->trans);
 			bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+		}
 	}
 	return true;
 }
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index c4344a1d6976..cbf1eedddad7 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -188,6 +188,11 @@
 	x(BCH_ERR_recovery_will_run,	recovery_pass_will_run)			\
 	x(0,				data_update_done)			\
 	x(0,				bkey_was_deleted)			\
+	x(0,				bucket_not_moveable)			\
+	x(BCH_ERR_bucket_not_moveable,	bucket_not_moveable_dev_not_rw)		\
+	x(BCH_ERR_bucket_not_moveable,	bucket_not_moveable_bucket_open)	\
+	x(BCH_ERR_bucket_not_moveable,	bucket_not_moveable_bp_mismatch)	\
+	x(BCH_ERR_bucket_not_moveable,	bucket_not_moveable_lru_race)		\
 	x(BCH_ERR_data_update_done,	data_update_done_would_block)		\
 	x(BCH_ERR_data_update_done,	data_update_done_unwritten)		\
 	x(BCH_ERR_data_update_done,	data_update_done_no_writes_needed)	\
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 958849c30071..f1849eb8327d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1521,6 +1521,7 @@ static const struct vm_operations_struct bch_vm_ops = {
 	.page_mkwrite   = bch2_page_mkwrite,
 };
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,17,0)
 static int bch2_mmap_prepare(struct vm_area_desc *desc)
 {
 	file_accessed(desc->file);
@@ -1528,6 +1529,15 @@ static int bch2_mmap_prepare(struct vm_area_desc *desc)
 	desc->vm_ops = &bch_vm_ops;
 	return 0;
 }
+#else
+static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+
+	vma->vm_ops = &bch_vm_ops;
+	return 0;
+}
+#endif
 
 /* Directories: */
 
@@ -1719,7 +1729,11 @@ static const struct file_operations bch_file_operations = {
 	.llseek		= bch2_llseek,
 	.read_iter	= bch2_read_iter,
 	.write_iter	= bch2_write_iter,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,17,0)
 	.mmap_prepare	= bch2_mmap_prepare,
+#else
+	.mmap		= bch2_mmap,
+#endif
 	.get_unmapped_area = thp_get_unmapped_area,
 	.fsync		= bch2_fsync,
 	.splice_read	= filemap_splice_read,
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index ca480b8f8dae..ac545f962ce9 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -42,12 +42,6 @@ module_param_named(read_corrupt_device, bch2_read_corrupt_device, int, 0644);
 MODULE_PARM_DESC(read_corrupt_device, "");
 #endif
 
-static bool bch2_poison_extents_on_checksum_error;
-module_param_named(poison_extents_on_checksum_error,
-		   bch2_poison_extents_on_checksum_error, bool, 0644);
-MODULE_PARM_DESC(poison_extents_on_checksum_error,
-		 "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
-
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 
 static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now)
@@ -551,9 +545,6 @@ static void get_rbio_extent(struct btree_trans *trans,
 static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
 					enum btree_id btree, struct bkey_s_c read_k)
 {
-	if (!bch2_poison_extents_on_checksum_error)
-		return 0;
-
 	struct bch_fs *c = trans->c;
 
 	struct data_update *u = rbio_data_update(rbio);
@@ -1291,6 +1282,10 @@ retry_pick:
 
 	async_object_list_add(c, rbio, rbio, &rbio->list_idx);
 
+	/* XXX: also nvme read recovery level */
+	if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
+		rbio->bio.bi_opf |= REQ_FUA;
+
 	if (rbio->bounce)
 		trace_and_count(c, io_read_bounce, &rbio->bio);
 
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index f36d60b8fb07..0f7e35684bc8 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -62,25 +62,38 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 
-	if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
+	/*
+	 * Valid bucket?
+	 *
+	 * XXX: we should kill the LRU entry here if it's not
+	 */
+	CLASS(bch2_dev_bucket_tryget, ca)(c, b->k.bucket);
+	if (!ca)
 		return 0;
 
-	CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, b->k.bucket, BTREE_ITER_cached);
-	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
-	int ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	CLASS(bch2_dev_bucket_tryget, ca)(c, k.k->p);
-	if (!ca)
+	if (ca->mi.state != BCH_MEMBER_STATE_rw ||
+	    !bch2_dev_is_online(ca)) {
+		bch_err_throw(c, bucket_not_moveable_dev_not_rw);
 		return 0;
+	}
 
-	if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
+	/* Bucket still being written? */
+	if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) {
+		bch_err_throw(c, bucket_not_moveable_bucket_open);
 		return 0;
+	}
 
-	if (ca->mi.state != BCH_MEMBER_STATE_rw ||
-	    !bch2_dev_is_online(ca))
+	/* We won't be able to evacuate it if there's missing backpointers */
+	if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) {
+		bch_err_throw(c, bucket_not_moveable_bp_mismatch);
 		return 0;
+	}
+
+	CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, b->k.bucket, BTREE_ITER_cached);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
 
 	struct bch_alloc_v4 _a;
 	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
@@ -88,7 +101,12 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
 	b->sectors	= bch2_bucket_sectors_dirty(*a);
 	u64 lru_idx	= alloc_lru_idx_fragmentation(*a, ca);
 
-	return lru_idx && lru_idx <= time;
+	if (!lru_idx || lru_idx > time) {
+		bch_err_throw(c, bucket_not_moveable_lru_race);
+		return 0;
+	}
+
+	return true;
 }
 
 static void move_bucket_free(struct buckets_in_flight *list,
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
index 58cfd540c6d6..71b17f18e90c 100644
--- a/fs/bcachefs/nocow_locking.c
+++ b/fs/bcachefs/nocow_locking.c
@@ -2,11 +2,10 @@
 
 #include "bcachefs.h"
 #include "bkey_methods.h"
+#include "closure.h"
 #include "nocow_locking.h"
 #include "util.h"
 
-#include <linux/closure.h>
-
 bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
 {
 	u64 dev_bucket = bucket_to_u64(bucket);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 40adefe7170f..ef6312c50f88 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -45,6 +45,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/sort.h>
+#include <linux/string_choices.h>
 #include <linux/sched/clock.h>
 
 #include "util.h"
@@ -157,6 +158,7 @@ write_attribute(trigger_recalc_capacity);
 write_attribute(trigger_delete_dead_snapshots);
 write_attribute(trigger_emergency_read_only);
 read_attribute(gc_gens_pos);
+__sysfs_attribute(read_fua_test, 0400);
 
 read_attribute(uuid);
 read_attribute(minor);
@@ -304,6 +306,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
 	prt_printf(out, "reserved:\t\t%llu\n",	b.reserved);
 }
 
+static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca)
+{
+	struct bch_fs *c = ca->fs;
+	struct bio *bio = NULL;
+	void *buf = NULL;
+	unsigned bs = c->opts.block_size, iters;
+	u64 end, test_duration = NSEC_PER_SEC * 2;
+	struct bch2_time_stats stats_nofua, stats_fua, stats_random;
+	int ret = 0;
+
+	bch2_time_stats_init_no_pcpu(&stats_nofua);
+	bch2_time_stats_init_no_pcpu(&stats_fua);
+	bch2_time_stats_init_no_pcpu(&stats_random);
+
+	if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) {
+		prt_str(out, "offline\n");
+		return 0;
+	}
+
+	struct block_device *bdev = ca->disk_sb.bdev;
+
+	bio = bio_kmalloc(1, GFP_KERNEL);
+	if (!bio) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	buf = kmalloc(bs, GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	end = ktime_get_ns() + test_duration;
+	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+		bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
+		bch2_bio_map(bio, buf, bs);
+
+		u64 submit_time = ktime_get_ns();
+		ret = submit_bio_wait(bio);
+		bch2_time_stats_update(&stats_nofua, submit_time);
+
+		if (ret)
+			goto err;
+	}
+
+	end = ktime_get_ns() + test_duration;
+	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+		bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
+		bch2_bio_map(bio, buf, bs);
+
+		u64 submit_time = ktime_get_ns();
+		ret = submit_bio_wait(bio);
+		bch2_time_stats_update(&stats_fua, submit_time);
+
+		if (ret)
+			goto err;
+	}
+
+	u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
+
+	end = ktime_get_ns() + test_duration;
+	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
+		bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
+		bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
+		bch2_bio_map(bio, buf, bs);
+
+		u64 submit_time = ktime_get_ns();
+		ret = submit_bio_wait(bio);
+		bch2_time_stats_update(&stats_random, submit_time);
+
+		if (ret)
+			goto err;
+	}
+
+	u64 ns_nofua		= mean_and_variance_get_mean(stats_nofua.duration_stats);
+	u64 ns_fua		= mean_and_variance_get_mean(stats_fua.duration_stats);
+	u64 ns_rand		= mean_and_variance_get_mean(stats_random.duration_stats);
+
+	u64 stddev_nofua	= mean_and_variance_get_stddev(stats_nofua.duration_stats);
+	u64 stddev_fua		= mean_and_variance_get_stddev(stats_fua.duration_stats);
+	u64 stddev_rand		= mean_and_variance_get_stddev(stats_random.duration_stats);
+
+	printbuf_tabstop_push(out, 8);
+	printbuf_tabstop_push(out, 12);
+	printbuf_tabstop_push(out, 12);
+	prt_printf(out, "This test must be run on an idle drive for accurate results\n");
+	prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device));
+	prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev)));
+	prt_newline(out);
+	prt_printf(out, "ns:\tlatency\rstddev\r\n");
+	prt_printf(out, "nofua\t%llu\r%llu\r\n",	ns_nofua,	stddev_nofua);
+	prt_printf(out, "fua\t%llu\r%llu\r\n",		ns_fua,		stddev_fua);
+	prt_printf(out, "random\t%llu\r%llu\r\n",	ns_rand,	stddev_rand);
+
+	bool read_cache = ns_nofua * 2 < ns_rand;
+	bool fua_cached	= read_cache && ns_fua < (ns_nofua + ns_rand) / 2;
+
+	if (!read_cache)
+		prt_str(out, "reads don't appear to be cached - safe\n");
+	else if (!fua_cached)
+		prt_str(out, "fua reads don't appear to be cached - safe\n");
+	else
+		prt_str(out, "fua reads appear to be cached - unsafe\n");
+err:
+	kfree(buf);
+	kfree(bio);
+	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
 SHOW(bch2_fs)
 {
 	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -847,6 +959,9 @@ SHOW(bch2_dev)
 	if (attr == &sysfs_open_buckets)
 		bch2_open_buckets_to_text(out, c, ca);
 
+	if (attr == &sysfs_read_fua_test)
+		return bch2_read_fua_test(out, ca);
+
 	int opt_id = bch2_opt_lookup(attr->name);
 	if (opt_id >= 0)
 		return sysfs_opt_show(c, ca, opt_id, out);
@@ -911,6 +1026,8 @@ struct attribute *bch2_dev_files[] = {
 	&sysfs_congested,
 #endif
 
+	&sysfs_read_fua_test,
+
 	/* debug: */
 	&sysfs_alloc_debug,
 	&sysfs_open_buckets,
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 52ac8230be9f..555e0d8f3cf0 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -4,7 +4,6 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/closure.h>
 #include <linux/errno.h>
 #include <linux/freezer.h>
 #include <linux/kernel.h>
@@ -21,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
+#include "closure.h"
 #include "mean_and_variance.h"
 
 #include "darray.h"
diff --git a/fs/bcachefs/vendor/closure.c b/fs/bcachefs/vendor/closure.c
new file mode 100644
index 000000000000..bdafd3a57386
--- /dev/null
+++ b/fs/bcachefs/vendor/closure.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "closure.h"
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/sched/debug.h>
+
+static void closure_val_checks(struct closure *cl, unsigned new, int d)
+{
+	unsigned count = new & CLOSURE_REMAINING_MASK;
+
+	if (WARN(new & CLOSURE_GUARD_MASK,
+		 "closure %ps has guard bits set: %x (%u), delta %i",
+		 cl->fn,
+		 new, (unsigned) __fls(new & CLOSURE_GUARD_MASK), d))
+		new &= ~CLOSURE_GUARD_MASK;
+
+	WARN(!count && (new & ~(CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING)),
+	     "closure %ps ref hit 0 with incorrect flags set: %x (%u)",
+	     cl->fn,
+	     new, (unsigned) __fls(new));
+}
+
+enum new_closure_state {
+	CLOSURE_normal_put,
+	CLOSURE_requeue,
+	CLOSURE_done,
+};
+
+/* For clearing flags with the same atomic op as a put */
+void bch2_closure_sub(struct closure *cl, int v)
+{
+	enum new_closure_state s;
+	struct task_struct *sleeper;
+
+	/* rcu_read_lock, atomic_read_acquire() are both for cl->sleeper: */
+	guard(rcu)();
+
+	int old = atomic_read_acquire(&cl->remaining), new;
+	do {
+		new = old - v;
+
+		if (new & CLOSURE_REMAINING_MASK) {
+			s = CLOSURE_normal_put;
+		} else {
+			if ((cl->fn || (new & CLOSURE_SLEEPING)) &&
+			    !(new & CLOSURE_DESTRUCTOR)) {
+				s = CLOSURE_requeue;
+				new += CLOSURE_REMAINING_INITIALIZER;
+			} else
+				s = CLOSURE_done;
+
+			sleeper = new & CLOSURE_SLEEPING ? cl->sleeper : NULL;
+			new &= ~CLOSURE_SLEEPING;
+		}
+
+		closure_val_checks(cl, new, -v);
+	} while (!atomic_try_cmpxchg_release(&cl->remaining, &old, new));
+
+	if (s == CLOSURE_normal_put)
+		return;
+
+	if (sleeper) {
+		smp_mb();
+		wake_up_process(sleeper);
+		return;
+	}
+
+	if (s == CLOSURE_requeue) {
+		closure_queue(cl);
+	} else {
+		struct closure *parent = cl->parent;
+		closure_fn *destructor = cl->fn;
+
+		closure_debug_destroy(cl);
+
+		if (destructor)
+			destructor(&cl->work);
+
+		if (parent)
+			closure_put(parent);
+	}
+}
+
+/*
+ * closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
+void __bch2_closure_wake_up(struct closure_waitlist *wait_list)
+{
+	struct llist_node *list;
+	struct closure *cl, *t;
+	struct llist_node *reverse = NULL;
+
+	list = llist_del_all(&wait_list->list);
+
+	/* We first reverse the list to preserve FIFO ordering and fairness */
+	reverse = llist_reverse_order(list);
+
+	/* Then do the wakeups */
+	llist_for_each_entry_safe(cl, t, reverse, list) {
+		closure_set_waiting(cl, 0);
+		bch2_closure_sub(cl, CLOSURE_WAITING + 1);
+	}
+}
+
+/**
+ * closure_wait - add a closure to a waitlist
+ * @waitlist: will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ * @cl: closure pointer.
+ *
+ */
+bool bch2_closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
+{
+	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+		return false;
+
+	closure_set_waiting(cl, _RET_IP_);
+	unsigned r = atomic_add_return(CLOSURE_WAITING + 1, &cl->remaining);
+	closure_val_checks(cl, r, CLOSURE_WAITING + 1);
+
+	llist_add(&cl->list, &waitlist->list);
+
+	return true;
+}
+
+void __sched __bch2_closure_sync(struct closure *cl)
+{
+	cl->sleeper = current;
+	bch2_closure_sub(cl,
+		    CLOSURE_REMAINING_INITIALIZER -
+		    CLOSURE_SLEEPING);
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+			break;
+		schedule();
+	}
+
+	__set_current_state(TASK_RUNNING);
+}
+
+/*
+ * closure_return_sync - finish running a closure, synchronously (i.e. waiting
+ * for outstanding get()s to finish) and returning once closure refcount is 0.
+ *
+ * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent
+ * closure_get_not_zero() calls will fail.
+ */
+void __sched bch2_closure_return_sync(struct closure *cl)
+{
+	cl->sleeper = current;
+	bch2_closure_sub(cl,
+		    CLOSURE_REMAINING_INITIALIZER -
+		    CLOSURE_DESTRUCTOR -
+		    CLOSURE_SLEEPING);
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+			break;
+		schedule();
+	}
+
+	__set_current_state(TASK_RUNNING);
+
+	if (cl->parent)
+		closure_put(cl->parent);
+}
+
+int __sched __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+	int ret = 0;
+
+	cl->sleeper = current;
+	bch2_closure_sub(cl,
+		    CLOSURE_REMAINING_INITIALIZER -
+		    CLOSURE_SLEEPING);
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		/*
+		 * Carefully undo the continue_at() - but only if it
+		 * hasn't completed, i.e. the final closure_put() hasn't
+		 * happened yet:
+		 */
+		unsigned old = atomic_read(&cl->remaining), new;
+		if (!(old & CLOSURE_SLEEPING))
+			goto success;
+
+		if (!timeout) {
+			do {
+				if (!(old & CLOSURE_SLEEPING))
+					goto success;
+
+				new = old + CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING;
+				closure_val_checks(cl, new, CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING);
+			} while (!atomic_try_cmpxchg(&cl->remaining, &old, new));
+
+			ret = -ETIME;
+			break;
+		}
+
+		timeout = schedule_timeout(timeout);
+	}
+success:
+	__set_current_state(TASK_RUNNING);
+	return ret;
+}
diff --git a/fs/bcachefs/vendor/closure.h b/fs/bcachefs/vendor/closure.h
new file mode 100644
index 000000000000..79112efe30a7
--- /dev/null
+++ b/fs/bcachefs/vendor/closure.h
@@ -0,0 +1,490 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ *   continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, requires a 'return' immediately following the
+ * location where this macro is referenced, to return to the calling function.
+ * There's good reason for this.
+ *
+ * To use safely closures asynchronously, they must always have a refcount while
+ * they are running owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio)
+ * {
+ *	closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ */
+
+struct closure;
+struct closure_syncer;
+typedef void (closure_fn) (struct work_struct *);
+extern struct dentry *bcache_debug;
+
+struct closure_waitlist {
+	struct llist_head	list;
+};
+
+enum closure_state {
+	/*
+	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+	 * the thread that owns the closure, and cleared by the thread that's
+	 * waking up the closure.
+	 *
+	 * The rest are for debugging and don't affect behaviour:
+	 *
+	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+	 * closure_init() and when closure_put() runs then next function), and
+	 * must be cleared before remaining hits 0. Primarily to help guard
+	 * against incorrect usage and accidentally transferring references.
+	 * continue_at() and closure_return() clear it for you, if you're doing
+	 * something unusual you can use closure_set_dead() which also helps
+	 * annotate where references are being transferred.
+	 */
+
+	CLOSURE_BITS_START	= (1U << 24),
+	CLOSURE_DESTRUCTOR	= (1U << 24),
+	CLOSURE_SLEEPING	= (1U << 26),
+	CLOSURE_WAITING		= (1U << 28),
+	CLOSURE_RUNNING		= (1U << 30),
+};
+
+#define CLOSURE_GUARD_MASK					\
+	(((CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)|(CLOSURE_BITS_START >> 1))
+
+#define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
+
+struct closure {
+	union {
+		struct {
+			struct workqueue_struct *wq;
+			struct task_struct	*sleeper;
+			struct llist_node	list;
+			closure_fn		*fn;
+		};
+		struct work_struct	work;
+	};
+
+	struct closure		*parent;
+
+	atomic_t		remaining;
+
+#ifdef CONFIG_DEBUG_CLOSURES
+#define CLOSURE_MAGIC_DEAD	0xc054dead
+#define CLOSURE_MAGIC_ALIVE	0xc054a11e
+#define CLOSURE_MAGIC_STACK	0xc05451cc
+
+	unsigned int		magic;
+	struct list_head	all;
+	unsigned long		ip;
+	unsigned long		waiting_on;
+#endif
+};
+
+void bch2_closure_sub(struct closure *cl, int v);
+void __bch2_closure_wake_up(struct closure_waitlist *list);
+bool bch2_closure_wait(struct closure_waitlist *list, struct closure *cl);
+void __bch2_closure_sync(struct closure *cl);
+
+/*
+ * closure_put - decrement a closure's refcount
+ */
+static inline void closure_put(struct closure *cl)
+{
+	bch2_closure_sub(cl, 1);
+}
+
+static inline unsigned closure_nr_remaining(struct closure *cl)
+{
+	return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
+}
+
+/**
+ * closure_sync - sleep until a closure a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+	if (closure_nr_remaining(cl) > 1)
+		__bch2_closure_sync(cl);
+}
+
+int __bch2_closure_sync_timeout(struct closure *cl, unsigned long timeout);
+
+static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+	return closure_nr_remaining(cl) > 1
+		? __bch2_closure_sync_timeout(cl, timeout)
+		: 0;
+}
+
+//#ifdef CONFIG_DEBUG_CLOSURES
+#if 0
+
+void bch2_closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void bch2_closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->waiting_on = f;
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+				  struct workqueue_struct *wq)
+{
+	closure_set_ip(cl);
+	cl->fn = fn;
+	cl->wq = wq;
+}
+
+static inline void closure_queue(struct closure *cl)
+{
+	struct workqueue_struct *wq = cl->wq;
+	/**
+	 * Changes made to closure, work_struct, or a couple of other structs
+	 * may cause work.func not pointing to the right location.
+	 */
+	BUILD_BUG_ON(offsetof(struct closure, fn)
+		     != offsetof(struct work_struct, func));
+
+	if (wq) {
+		INIT_WORK(&cl->work, cl->work.func);
+		BUG_ON(!queue_work(wq, &cl->work));
+	} else
+		cl->fn(&cl->work);
+}
+
+/**
+ * closure_get - increment a closure's refcount
+ */
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	BUG_ON((atomic_inc_return(&cl->remaining) &
+		CLOSURE_REMAINING_MASK) <= 1);
+#else
+	atomic_inc(&cl->remaining);
+#endif
+}
+
+/**
+ * closure_get_not_zero
+ */
+static inline bool closure_get_not_zero(struct closure *cl)
+{
+	unsigned old = atomic_read(&cl->remaining);
+	do {
+		if (!(old & CLOSURE_REMAINING_MASK))
+			return false;
+
+	} while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1));
+
+	return true;
+}
+
+/**
+ * closure_init - Initialize a closure, setting the refcount to 1
+ * @cl:		closure to initialize
+ * @parent:	parent of the new closure. cl will take a refcount on it for its
+ *		lifetime; may be NULL.
+ */
+static inline void closure_init(struct closure *cl, struct closure *parent)
+{
+	cl->fn = NULL;
+	cl->parent = parent;
+	if (parent)
+		closure_get(parent);
+
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+
+	bch2_closure_debug_create(cl);
+	closure_set_ip(cl);
+}
+
+static inline void closure_init_stack(struct closure *cl)
+{
+	memset(cl, 0, sizeof(struct closure));
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->magic = CLOSURE_MAGIC_STACK;
+#endif
+}
+
+static inline void closure_init_stack_release(struct closure *cl)
+{
+	memset(cl, 0, sizeof(struct closure));
+	atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->magic = CLOSURE_MAGIC_STACK;
+#endif
+}
+
+/**
+ * closure_wake_up - wake up all closures on a wait list,
+ *		     with memory barrier
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+	/* Memory barrier for the wait list */
+	smp_mb();
+	__bch2_closure_wake_up(list);
+}
+
+#define CLOSURE_CALLBACK(name)	void name(struct work_struct *ws)
+#define closure_type(name, type, member)				\
+	struct closure *cl = container_of(ws, struct closure, work);	\
+	type *name = container_of(cl, type, member)
+
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * This is because after calling continue_at() you no longer have a ref on @cl,
+ * and whatever @cl owns may be freed out from under you - a running closure fn
+ * has a ref on its own closure which continue_at() drops.
+ *
+ * Note you are expected to immediately return after using this macro.
+ */
+#define continue_at(_cl, _fn, _wq)					\
+do {									\
+	set_closure_fn(_cl, _fn, _wq);					\
+	bch2_closure_sub(_cl, CLOSURE_RUNNING + 1);				\
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
+#define closure_return(_cl)	continue_at((_cl), NULL, NULL)
+
+void bch2_closure_return_sync(struct closure *cl);
+
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
+#define continue_at_nobarrier(_cl, _fn, _wq)				\
+do {									\
+	set_closure_fn(_cl, _fn, _wq);					\
+	closure_queue(_cl);						\
+} while (0)
+
+/**
+ * closure_return_with_destructor - finish execution of a closure,
+ *				    with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
+#define closure_return_with_destructor(_cl, _destructor)		\
+do {									\
+	set_closure_fn(_cl, _destructor, NULL);				\
+	bch2_closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
+} while (0)
+
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
+static inline void closure_call(struct closure *cl, closure_fn fn,
+				struct workqueue_struct *wq,
+				struct closure *parent)
+{
+	closure_init(cl, parent);
+	continue_at_nobarrier(cl, fn, wq);
+}
+
+#define __closure_wait_event(waitlist, _cond)				\
+do {									\
+	struct closure cl;						\
+									\
+	closure_init_stack(&cl);					\
+									\
+	while (1) {							\
+		bch2_closure_wait(waitlist, &cl);			\
+		if (_cond)						\
+			break;						\
+		closure_sync(&cl);					\
+	}								\
+	closure_wake_up(waitlist);					\
+	closure_sync(&cl);						\
+} while (0)
+
+#define closure_wait_event(waitlist, _cond)				\
+do {									\
+	if (!(_cond))							\
+		__closure_wait_event(waitlist, _cond);			\
+} while (0)
+
+#define __closure_wait_event_timeout(waitlist, _cond, _until)		\
+({									\
+	struct closure cl;						\
+	long _t;							\
+									\
+	closure_init_stack(&cl);					\
+									\
+	while (1) {							\
+		bch2_closure_wait(waitlist, &cl);			\
+		if (_cond) {						\
+			_t = max_t(long, 1L, _until - jiffies);		\
+			break;						\
+		}							\
+		_t = max_t(long, 0L, _until - jiffies);			\
+		if (!_t)						\
+			break;						\
+		closure_sync_timeout(&cl, _t);				\
+	}								\
+	closure_wake_up(waitlist);					\
+	closure_sync(&cl);						\
+	_t;								\
+})
+
+/*
+ * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if
+ * condition became true
+ */
+#define closure_wait_event_timeout(waitlist, _cond, _timeout)		\
+({									\
+	unsigned long _until = jiffies + _timeout;			\
+	(_cond)								\
+		? max_t(long, 1L, _until - jiffies)			\
+		: __closure_wait_event_timeout(waitlist, _cond, _until);\
+})
+
+#endif /* _LINUX_CLOSURE_H */