20 files changed, 771 insertions, 202 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5455412b2b75..df383b97aacc 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -21,7 +21,6 @@ config BCACHEFS_FS
 	select XOR_BLOCKS
 	select XXHASH
 	select SYMBOLIC_ERRNAME
-	select MIN_HEAP
 	help
 	The bcachefs filesystem - a modern, copy on write filesystem, with
 	support for multiple devices, compression, checksumming, etc.
@@ -34,7 +33,6 @@ config BCACHEFS_QUOTA
 config BCACHEFS_ERASURE_CODING
 	bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
 	depends on BCACHEFS_FS
-	select QUOTACTL
 	help
 	This enables the "erasure_code" filesysystem and inode option, which
 	organizes data into reed-solomon stripes instead of ordinary
@@ -43,11 +41,6 @@ config BCACHEFS_ERASURE_CODING
 	WARNING: this feature is still undergoing on disk format changes, and
 	should only be enabled for testing purposes.
 
-config BCACHEFS_POSIX_ACL
-	bool "bcachefs POSIX ACL support"
-	depends on BCACHEFS_FS
-	select FS_POSIX_ACL
-
 config BCACHEFS_DEBUG
 	bool "bcachefs debugging"
 	depends on BCACHEFS_FS
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1e87eee962ec..98acb3dc66a3 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -1,4 +1,9 @@
 
+ifdef BCACHEFS_DKMS
+	CONFIG_BCACHEFS_FS := m
+	# Enable other features here?
+endif
+
 obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs.o
 
 bcachefs-y		:=	\
@@ -99,9 +104,16 @@ bcachefs-y		:=	\
 	util.o			\
 	varint.o		\
 	xattr.o			\
-	vendor/closure.o
+	vendor/closure.o	\
+	vendor/min_heap.o
 
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST)   += mean_and_variance_test.o
+ifndef BCACHEFS_DKMS
+	obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST)   += mean_and_variance_test.o
+endif
 
 # Silence "note: xyz changed in GCC X.X" messages
 subdir-ccflags-y += $(call cc-disable-warning, psabi)
+
+ifdef BCACHEFS_DKMS
+	subdir-ccflags-y += -I$(src)
+endif
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3befa1f36e72..a1df0ec2d511 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -57,7 +57,7 @@ void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
 	}
 }
 
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
+#ifndef NO_BCACHEFS_FS
 
 #include "fs.h"
 
@@ -437,4 +437,4 @@ err:
 	return ret;
 }
 
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
index fe730a6bf0c1..01cf21c5a0b6 100644
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
@@ -26,7 +26,7 @@ typedef struct {
 
 void bch2_acl_to_text(struct printbuf *, const void *, size_t);
 
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
+#ifndef NO_BCACHEFS_FS
 
 struct posix_acl *bch2_get_acl(struct inode *, int, bool);
 
@@ -55,6 +55,6 @@ static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
 	return 0;
 }
 
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+#endif /* NO_BCACHEFS_FS */
 
 #endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index a6f344faf751..73fa8513ee61 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -838,8 +838,10 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
 		      bpos_ge(k->k.p, start) &&
 		      bpos_le(k->k.p, end)))
 			keys->data[dst++] = *i;
-		else if (i->allocated)
+		else if (i->allocated) {
 			kfree(i->allocated_k);
+			i->allocated_k = NULL;
+		}
 	}
 	keys->nr = keys->gap = dst;
 }
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index c0dff992ad60..433d498540fd 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -12,7 +12,6 @@
 #include "recovery_passes.h"
 
 #include <linux/kthread.h>
-#include <linux/min_heap.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sort.h>
 
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index 37554e4514fe..d12c11dc6601 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -2,7 +2,7 @@
 #ifndef _BCACHEFS_CLOCK_TYPES_H
 #define _BCACHEFS_CLOCK_TYPES_H
 
-#include "util.h"
+#include "vendor/min_heap.h"
 
 #define NR_IO_TIMERS		(BCH_SB_MEMBERS_MAX * 3)
 
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 62d5d17d681e..6615c868d066 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -55,63 +55,6 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 	return true;
 }
 
-static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
-{
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
-		bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
-	}
-}
-
-static noinline_for_stack
-bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
-		       const struct bch_extent_ptr *start)
-{
-	if (!ctxt) {
-		bkey_for_each_ptr(ptrs, ptr) {
-			if (ptr == start)
-				break;
-
-			struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-			struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-			bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
-		}
-		return false;
-	}
-
-	__bkey_for_each_ptr(start, ptrs.end, ptr) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
-		bool locked;
-		move_ctxt_wait_event(ctxt,
-				     (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
-				     list_empty(&ctxt->ios));
-		if (!locked) {
-			bch2_trans_unlock(ctxt->trans);
-			bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
-		}
-	}
-	return true;
-}
-
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
-{
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
-		if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
-			return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
-	}
-
-	return true;
-}
-
 noinline_for_stack
 static void trace_io_move_finish2(struct data_update *u,
 				  struct bkey_i *new,
@@ -538,7 +481,7 @@ void bch2_data_update_exit(struct data_update *update)
 	update->bvecs = NULL;
 
 	if (c->opts.nocow_enabled)
-		bkey_nocow_unlock(c, k);
+		bch2_bkey_nocow_unlock(c, k, 0);
 	bkey_put_dev_refs(c, k);
 	bch2_disk_reservation_put(c, &update->op.res);
 	bch2_bkey_buf_exit(&update->k, c);
@@ -1018,10 +961,19 @@ int bch2_data_update_init(struct btree_trans *trans,
 		goto out;
 	}
 
-	if (c->opts.nocow_enabled &&
-	    !bkey_nocow_lock(c, ctxt, ptrs)) {
-		ret = bch_err_throw(c, nocow_lock_blocked);
-		goto out_put_dev_refs;
+	if (c->opts.nocow_enabled) {
+		if (!bch2_bkey_nocow_trylock(c, ptrs, 0)) {
+			bool locked = false;
+			if (ctxt)
+				move_ctxt_wait_event(ctxt,
+					(locked = bch2_bkey_nocow_trylock(c, ptrs, 0)) ||
+					list_empty(&ctxt->ios));
+			if (!locked) {
+				if (ctxt)
+					bch2_trans_unlock(ctxt->trans);
+				bch2_bkey_nocow_lock(c, ptrs, 0);
+			}
+		}
 	}
 
 	if (unwritten) {
@@ -1039,8 +991,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 	return 0;
 out_nocow_unlock:
 	if (c->opts.nocow_enabled)
-		bkey_nocow_unlock(c, k);
-out_put_dev_refs:
+		bch2_bkey_nocow_unlock(c, k, 0);
 	bkey_put_dev_refs(c, k);
 out:
 	bch2_disk_reservation_put(c, &m->op.res);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index cbf1eedddad7..db2dc5b0e0f2 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -367,7 +367,10 @@
 	x(BCH_ERR_nopromote,		nopromote_enomem)			\
 	x(0,				invalid_snapshot_node)			\
 	x(0,				option_needs_open_fs)			\
-	x(0,				remove_disk_accounting_entry)
+	x(0,				remove_disk_accounting_entry)		\
+	x(0,				nocow_trylock_fail)			\
+	x(BCH_ERR_nocow_trylock_fail,	nocow_trylock_contended)		\
+	x(BCH_ERR_nocow_trylock_fail,	nocow_trylock_bucket_full)		\
 
 enum bch_errcode {
 	BCH_ERR_START		= 2048,
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f1849eb8327d..414adf86efdc 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -541,11 +541,9 @@ __bch2_create(struct mnt_idmap *idmap,
 	 * preallocate acls + vfs inode before btree transaction, so that
 	 * nothing can fail after the transaction succeeds:
 	 */
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
 	if (ret)
 		return ERR_PTR(ret);
-#endif
 
 	inode = __bch2_new_inode(c, GFP_NOFS);
 	if (unlikely(!inode)) {
@@ -1751,10 +1749,8 @@ static const struct inode_operations bch_file_inode_operations = {
 	.setattr	= bch2_setattr,
 	.fiemap		= bch2_fiemap,
 	.listxattr	= bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 	.get_inode_acl	= bch2_get_acl,
 	.set_acl	= bch2_set_acl,
-#endif
 	.fileattr_get	= bch2_fileattr_get,
 	.fileattr_set	= bch2_fileattr_set,
 };
@@ -1773,10 +1769,8 @@ static const struct inode_operations bch_dir_inode_operations = {
 	.setattr	= bch2_setattr,
 	.tmpfile	= bch2_tmpfile,
 	.listxattr	= bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 	.get_inode_acl	= bch2_get_acl,
 	.set_acl	= bch2_set_acl,
-#endif
 	.fileattr_get	= bch2_fileattr_get,
 	.fileattr_set	= bch2_fileattr_set,
 };
@@ -1797,10 +1791,8 @@ static const struct inode_operations bch_symlink_inode_operations = {
 	.getattr	= bch2_getattr,
 	.setattr	= bch2_setattr,
 	.listxattr	= bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 	.get_inode_acl	= bch2_get_acl,
 	.set_acl	= bch2_set_acl,
-#endif
 	.fileattr_get	= bch2_fileattr_get,
 	.fileattr_set	= bch2_fileattr_set,
 };
@@ -1809,10 +1801,8 @@ static const struct inode_operations bch_special_inode_operations = {
 	.getattr	= bch2_getattr,
 	.setattr	= bch2_setattr,
 	.listxattr	= bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 	.get_inode_acl	= bch2_get_acl,
 	.set_acl	= bch2_set_acl,
-#endif
 	.fileattr_get	= bch2_fileattr_get,
 	.fileattr_set	= bch2_fileattr_set,
 };
@@ -2546,10 +2536,8 @@ got_sb:
 
 	c->dev = sb->s_dev;
 
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 	if (c->opts.acl)
 		sb->s_flags	|= SB_POSIXACL;
-#endif
 
 	sb->s_shrink->seeks = 0;
 
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 6a5da02ce266..ed4a9440143a 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -33,7 +33,6 @@
 
 #include <linux/blkdev.h>
 #include <linux/moduleparam.h>
-#include <linux/prefetch.h>
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 
@@ -1323,11 +1322,25 @@ static CLOSURE_CALLBACK(bch2_nocow_write_done)
 	bch2_write_done(cl);
 }
 
-struct bucket_to_lock {
-	struct bpos		b;
-	unsigned		gen;
-	struct nocow_lock_bucket *l;
-};
+static bool bkey_get_dev_iorefs(struct bch_fs *c, struct bkey_ptrs_c ptrs)
+{
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
+							BCH_DEV_WRITE_REF_io_write);
+		if (unlikely(!ca)) {
+			bkey_for_each_ptr(ptrs, ptr2) {
+				if (ptr2 == ptr)
+					break;
+				enumerated_ref_put(&bch2_dev_have_ref(c, ptr2->dev)->io_ref[WRITE],
+						   BCH_DEV_WRITE_REF_io_write);
+			}
+
+			return false;
+		}
+	}
+
+	return true;
+}
 
 static void bch2_nocow_write(struct bch_write_op *op)
 {
@@ -1335,15 +1348,14 @@ static void bch2_nocow_write(struct bch_write_op *op)
 	struct btree_trans *trans;
 	struct btree_iter iter;
 	struct bkey_s_c k;
-	DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+	struct bkey_ptrs_c ptrs;
 	u32 snapshot;
-	struct bucket_to_lock *stale_at;
+	const struct bch_extent_ptr *stale_at;
 	int stale, ret;
 
 	if (op->flags & BCH_WRITE_move)
 		return;
 
-	darray_init(&buckets);
 	trans = bch2_trans_get(c);
 retry:
 	bch2_trans_begin(trans);
@@ -1358,8 +1370,6 @@ retry:
 	while (1) {
 		struct bio *bio = &op->wbio.bio;
 
-		buckets.nr = 0;
-
 		ret = bch2_trans_relock(trans);
 		if (ret)
 			break;
@@ -1381,50 +1391,42 @@ retry:
 			break;
 
 		/* Get iorefs before dropping btree locks: */
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-		bkey_for_each_ptr(ptrs, ptr) {
-			struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
-							BCH_DEV_WRITE_REF_io_write);
-			if (unlikely(!ca))
-				goto err_get_ioref;
-
-			struct bpos b = PTR_BUCKET_POS(ca, ptr);
-			struct nocow_lock_bucket *l =
-				bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
-			prefetch(l);
-
-			/* XXX allocating memory with btree locks held - rare */
-			darray_push_gfp(&buckets, ((struct bucket_to_lock) {
-						   .b = b, .gen = ptr->gen, .l = l,
-						   }), GFP_KERNEL|__GFP_NOFAIL);
-
-			if (ptr->unwritten)
-				op->flags |= BCH_WRITE_convert_unwritten;
-		}
+		ptrs = bch2_bkey_ptrs_c(k);
+		if (!bkey_get_dev_iorefs(c, ptrs))
+			goto out;
 
 		/* Unlock before taking nocow locks, doing IO: */
 		bkey_reassemble(op->insert_keys.top, k);
-		bch2_trans_unlock(trans);
+		k = bkey_i_to_s_c(op->insert_keys.top);
+		ptrs = bch2_bkey_ptrs_c(k);
 
-		bch2_cut_front(op->pos, op->insert_keys.top);
-		if (op->flags & BCH_WRITE_convert_unwritten)
-			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+		bch2_trans_unlock(trans);
 
-		darray_for_each(buckets, i) {
-			struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
+		bch2_bkey_nocow_lock(c, ptrs, BUCKET_NOCOW_LOCK_UPDATE);
 
-			__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
-						 bucket_to_u64(i->b),
-						 BUCKET_NOCOW_LOCK_UPDATE);
+		/*
+		 * This could be handled better: If we're able to trylock the
+		 * nocow locks with btree locks held we know dirty pointers
+		 * can't be stale
+		 */
+		bkey_for_each_ptr(ptrs, ptr) {
+			struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
 
-			int gen = bucket_gen_get(ca, i->b.offset);
-			stale = gen < 0 ? gen : gen_after(gen, i->gen);
+			int gen = bucket_gen_get(ca, PTR_BUCKET_NR(ca, ptr));
+			stale = gen < 0 ? gen : gen_after(gen, ptr->gen);
 			if (unlikely(stale)) {
-				stale_at = i;
+				stale_at = ptr;
 				goto err_bucket_stale;
 			}
+
+			if (ptr->unwritten)
+				op->flags |= BCH_WRITE_convert_unwritten;
 		}
 
+		bch2_cut_front(op->pos, op->insert_keys.top);
+		if (op->flags & BCH_WRITE_convert_unwritten)
+			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
 		bio = &op->wbio.bio;
 		if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
 			bio = bio_split(bio, k.k->p.offset - op->pos.offset,
@@ -1458,8 +1460,6 @@ err:
 		goto retry;
 
 	bch2_trans_put(trans);
-	darray_exit(&buckets);
-
 	if (ret) {
 		bch2_write_op_error(op, op->pos.offset,
 				    "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
@@ -1484,24 +1484,11 @@ err:
 		continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
 	}
 	return;
-err_get_ioref:
-	darray_for_each(buckets, i)
-		enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
-				   BCH_DEV_WRITE_REF_io_write);
-
-	/* Fall back to COW path: */
-	goto out;
 err_bucket_stale:
-	darray_for_each(buckets, i) {
-		bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
-		if (i == stale_at)
-			break;
-	}
-
 	CLASS(printbuf, buf)();
 	if (bch2_fs_inconsistent_on(stale < 0, c,
-				    "pointer to invalid bucket in nocow path on device %llu\n  %s",
-				    stale_at->b.inode,
+				    "pointer to invalid bucket in nocow path on device %u\n  %s",
+				    stale_at->dev,
 				    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
 		ret = bch_err_throw(c, data_write_invalid_ptr);
 	} else {
@@ -1509,7 +1496,13 @@ err_bucket_stale:
 		ret = bch_err_throw(c, transaction_restart);
 	}
 
-	goto err_get_ioref;
+	bch2_bkey_nocow_unlock(c, k, BUCKET_NOCOW_LOCK_UPDATE);
+	bkey_for_each_ptr(ptrs, ptr)
+		enumerated_ref_put(&bch2_dev_have_ref(c, ptr->dev)->io_ref[WRITE],
+				   BCH_DEV_WRITE_REF_io_write);
+
+	/* Fall back to COW path: */
+	goto out;
 }
 
 static void __bch2_write(struct bch_write_op *op)
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index 5c321a0d1f89..6d31508cabb2 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -980,7 +980,7 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
 		ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
 						 snapshot_overwrites, &buf);
 		if (ret)
-			return ret;
+			goto out;
 
 		if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) {
 			inode->bi_flags |= BCH_INODE_has_case_insensitive;
@@ -999,14 +999,14 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
 		if (dir.bi_parent_subvol) {
 			ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot);
 			if (ret)
-				return ret;
+				goto out;
 
 			snapshot_overwrites = NULL;
 		}
 
 		ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0);
 		if (ret)
-			return ret;
+			goto out;
 
 		if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) {
 			prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n");
@@ -1014,13 +1014,13 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
 			ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
 							 snapshot_overwrites, &buf);
 			if (ret)
-				return ret;
+				goto out;
 
 			if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) {
 				dir.bi_flags |= BCH_INODE_has_case_insensitive;
 				ret = __bch2_fsck_write_inode(trans, &dir);
 				if (ret)
-					return ret;
+					goto out;
 			}
 		}
 
@@ -1033,13 +1033,13 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
 	}
 out:
 fsck_err:
+	bch_err_fn(trans->c, ret);
 	if (ret)
 		return ret;
 
-	if (repairing_parents) {
+	if (repairing_parents)
 		return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
 			bch_err_throw(trans->c, transaction_restart_nested);
-	}
 
 	return 0;
 }
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
index 71b17f18e90c..c8907070796f 100644
--- a/fs/bcachefs/nocow_locking.c
+++ b/fs/bcachefs/nocow_locking.c
@@ -6,6 +6,16 @@
 #include "nocow_locking.h"
 #include "util.h"
 
+#include <linux/prefetch.h>
+
+static bool nocow_bucket_empty(struct nocow_lock_bucket *l)
+{
+	for (unsigned i = 0; i < ARRAY_SIZE(l->b); i++)
+		if (atomic_read(&l->l[i]))
+			return false;
+	return true;
+}
+
 bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
 {
 	u64 dev_bucket = bucket_to_u64(bucket);
@@ -20,14 +30,12 @@ bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos
 
 #define sign(v)		(v < 0 ? -1 : v > 0 ? 1 : 0)
 
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
+void __bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, u64 dev_bucket, int flags)
 {
-	u64 dev_bucket = bucket_to_u64(bucket);
 	struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
 	int lock_val = flags ? 1 : -1;
-	unsigned i;
 
-	for (i = 0; i < ARRAY_SIZE(l->b); i++)
+	for (unsigned i = 0; i < ARRAY_SIZE(l->b); i++)
 		if (l->b[i] == dev_bucket) {
 			int v = atomic_sub_return(lock_val, &l->l[i]);
 
@@ -40,8 +48,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc
 	BUG();
 }
 
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
-				 u64 dev_bucket, int flags)
+static int __bch2_bucket_nocow_trylock(struct bch_fs *c, struct nocow_lock_bucket *l,
+				u64 dev_bucket, int flags)
 {
 	int v, lock_val = flags ? 1 : -1;
 	unsigned i;
@@ -58,32 +66,128 @@ bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
 			goto take_lock;
 		}
 
-	return false;
+	return bch_err_throw(c, nocow_trylock_bucket_full);
 got_entry:
 	v = atomic_read(&l->l[i]);
 	if (lock_val > 0 ? v < 0 : v > 0)
-		return false;
+		return bch_err_throw(c, nocow_trylock_contended);
 take_lock:
 	v = atomic_read(&l->l[i]);
 	/* Overflow? */
 	if (v && sign(v + lock_val) != sign(v))
-		return false;
+		return bch_err_throw(c, nocow_trylock_contended);
 
 	atomic_add(lock_val, &l->l[i]);
+	return 0;
+}
+
+static inline bool bch2_bucket_nocow_trylock(struct bch_fs *c, struct bpos bucket, int flags)
+{
+	struct bucket_nocow_lock_table *t = &c->nocow_locks;
+	u64 dev_bucket = bucket_to_u64(bucket);
+	struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+	return !__bch2_bucket_nocow_trylock(c, l, dev_bucket, flags);
+}
+
+void bch2_bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k, int flags)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+		bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags);
+	}
+}
+
+bool bch2_bkey_nocow_trylock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
+{
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+		if (unlikely(!bch2_bucket_nocow_trylock(c, bucket, flags))) {
+			bkey_for_each_ptr(ptrs, ptr2) {
+				if (ptr2 == ptr)
+					break;
+
+				struct bch_dev *ca = bch2_dev_have_ref(c, ptr2->dev);
+				struct bpos bucket = PTR_BUCKET_POS(ca, ptr2);
+				bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags);
+			}
+			return false;
+		}
+	}
+
 	return true;
 }
 
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
-			      struct nocow_lock_bucket *l,
-			      u64 dev_bucket, int flags)
+struct bucket_to_lock {
+	u64			b;
+	struct nocow_lock_bucket *l;
+};
+
+static inline int bucket_to_lock_cmp(struct bucket_to_lock l,
+				     struct bucket_to_lock r)
 {
-	if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
-		struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
+	return cmp_int(l.l, r.l);
+}
+
+void bch2_bkey_nocow_lock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
+{
+	if (bch2_bkey_nocow_trylock(c, ptrs, flags))
+		return;
+
+	DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+	darray_init(&buckets);
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+		u64 b = bucket_to_u64(PTR_BUCKET_POS(ca, ptr));
+		struct nocow_lock_bucket *l =
+			bucket_nocow_lock(&c->nocow_locks, b);
+		prefetch(l);
+
+		/* XXX allocating memory with btree locks held - rare */
+		darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .l = l, }),
+				GFP_KERNEL|__GFP_NOFAIL);
+	}
+
+	WARN_ON_ONCE(buckets.nr > NOCOW_LOCK_BUCKET_SIZE);
+
+	bubble_sort(buckets.data, buckets.nr, bucket_to_lock_cmp);
+retake_all:
+	darray_for_each(buckets, i) {
+		int ret = __bch2_bucket_nocow_trylock(c, i->l, i->b, flags);
+		if (!ret)
+			continue;
+
 		u64 start_time = local_clock();
 
-		__closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+		if (ret == -BCH_ERR_nocow_trylock_contended)
+			__closure_wait_event(&i->l->wait,
+					(ret = __bch2_bucket_nocow_trylock(c, i->l, i->b, flags)) != -BCH_ERR_nocow_trylock_contended);
+		if (!ret) {
+			bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+			continue;
+		}
+
+		BUG_ON(ret != -BCH_ERR_nocow_trylock_bucket_full);
+
+		darray_for_each(buckets, i2) {
+			if (i2 == i)
+				break;
+			__bch2_bucket_nocow_unlock(&c->nocow_locks, i2->b, flags);
+		}
+
+		__closure_wait_event(&i->l->wait, nocow_bucket_empty(i->l));
 		bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+		goto retake_all;
 	}
+
+	darray_exit(&buckets);
 }
 
 void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
index 48b8a003c0d2..972c914780f9 100644
--- a/fs/bcachefs/nocow_locking.h
+++ b/fs/bcachefs/nocow_locking.h
@@ -19,29 +19,19 @@ static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lo
 #define BUCKET_NOCOW_LOCK_UPDATE	(1 << 0)
 
 bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
-			      struct nocow_lock_bucket *, u64, int);
 
-static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
-					  struct bpos bucket, int flags)
-{
-	u64 dev_bucket = bucket_to_u64(bucket);
-	struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
-	__bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
-}
+void __bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, u64, int);
 
-static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
-					  struct bpos bucket, int flags)
+static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket,
+					    int flags)
 {
-	u64 dev_bucket = bucket_to_u64(bucket);
-	struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
-	return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+	__bch2_bucket_nocow_unlock(t, bucket_to_u64(bucket), flags);
 }
 
+void bch2_bkey_nocow_unlock(struct bch_fs *, struct bkey_s_c, int);
+bool bch2_bkey_nocow_trylock(struct bch_fs *, struct bkey_ptrs_c, int);
+void bch2_bkey_nocow_lock(struct bch_fs *, struct bkey_ptrs_c, int);
+
 void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
 
 void bch2_fs_nocow_locking_exit(struct bch_fs *);
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
index bd12bf677924..3fed8e957e20 100644
--- a/fs/bcachefs/nocow_locking_types.h
+++ b/fs/bcachefs/nocow_locking_types.h
@@ -5,11 +5,13 @@
 #define BUCKET_NOCOW_LOCKS_BITS		10
 #define BUCKET_NOCOW_LOCKS		(1U << BUCKET_NOCOW_LOCKS_BITS)
 
+#define NOCOW_LOCK_BUCKET_SIZE	6
+
 struct nocow_lock_bucket {
 	struct closure_waitlist		wait;
 	spinlock_t			lock;
-	u64				b[4];
-	atomic_t			l[4];
+	u64				b[NOCOW_LOCK_BUCKET_SIZE];
+	atomic_t			l[NOCOW_LOCK_BUCKET_SIZE];
 } __aligned(SMP_CACHE_BYTES);
 
 struct bucket_nocow_lock_table {
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index bd5faafc9aa7..b9bac9fbbafb 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -649,10 +649,6 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
 	if (!(bch2_opt_table[id].flags & OPT_MOUNT))
 		return -BCH_ERR_option_name;
 
-	if (id == Opt_acl &&
-	    !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
-		return -BCH_ERR_option_name;
-
 	if ((id == Opt_usrquota ||
 	     id == Opt_grpquota) &&
 	    !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
index b7eff904acdb..898caf943b34 100644
--- a/fs/bcachefs/replicas_format.h
+++ b/fs/bcachefs/replicas_format.h
@@ -17,7 +17,8 @@ struct bch_replicas_entry_v1 {
 	__u8			data_type;
 	__u8			nr_devs;
 	__u8			nr_required;
-	__u8			devs[] __counted_by(nr_devs);
+	/* No counted_by: bch_replicas_cpu entries are all the size of the biggest entry */
+	__u8			devs[];
 } __packed;
 
 struct bch_sb_field_replicas {
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 555e0d8f3cf0..eb4222f0f34b 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -7,7 +7,6 @@
 #include <linux/errno.h>
 #include <linux/freezer.h>
 #include <linux/kernel.h>
-#include <linux/min_heap.h>
 #include <linux/sched/clock.h>
 #include <linux/llist.h>
 #include <linux/log2.h>
diff --git a/fs/bcachefs/vendor/min_heap.c b/fs/bcachefs/vendor/min_heap.c
new file mode 100644
index 000000000000..21f744549d66
--- /dev/null
+++ b/fs/bcachefs/vendor/min_heap.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "min_heap.h"
+
+void __bch2_min_heap_init(min_heap_char *heap, void *data, size_t size)
+{
+	__min_heap_init_inline(heap, data, size);
+}
+
+void *__bch2_min_heap_peek(struct min_heap_char *heap)
+{
+	return __min_heap_peek_inline(heap);
+}
+
+bool __bch2_min_heap_full(min_heap_char *heap)
+{
+	return __min_heap_full_inline(heap);
+}
+
+void __bch2_min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size,
+			  const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_sift_down_inline(heap, pos, elem_size, func, args);
+}
+
+void __bch2_min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+			const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+}
+
+void __bch2_min_heapify_all(min_heap_char *heap, size_t elem_size,
+		       const struct min_heap_callbacks *func, void *args)
+{
+	__min_heapify_all_inline(heap, elem_size, func, args);
+}
+
+bool __bch2_min_heap_pop(min_heap_char *heap, size_t elem_size,
+		    const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_pop_inline(heap, elem_size, func, args);
+}
+
+void __bch2_min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+			 const struct min_heap_callbacks *func, void *args)
+{
+	__min_heap_pop_push_inline(heap, element, elem_size, func, args);
+}
+
+bool __bch2_min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+		     const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_push_inline(heap, element, elem_size, func, args);
+}
+
+bool __bch2_min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+		    const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_del_inline(heap, elem_size, idx, func, args);
+}
diff --git a/fs/bcachefs/vendor/min_heap.h b/fs/bcachefs/vendor/min_heap.h
new file mode 100644
index 000000000000..865c8b33b11f
--- /dev/null
+++ b/fs/bcachefs/vendor/min_heap.h
@@ -0,0 +1,477 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MIN_HEAP_H
+#define _LINUX_MIN_HEAP_H
+
+#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+/*
+ * The Min Heap API provides utilities for managing min-heaps, a binary tree
+ * structure where each node's value is less than or equal to its children's
+ * values, ensuring the smallest element is at the root.
+ *
+ * Users should avoid directly calling functions prefixed with __min_heap_*().
+ * Instead, use the provided macro wrappers.
+ *
+ * For further details and examples, refer to Documentation/core-api/min_heap.rst.
+ */
+
+/**
+ * Data structure to hold a min-heap.
+ * @nr: Number of elements currently in the heap.
+ * @size: Maximum number of elements that can be held in current storage.
+ * @data: Pointer to the start of array holding the heap elements.
+ * @preallocated: Start of the static preallocated array holding the heap elements.
+ */
+#define MIN_HEAP_PREALLOCATED(_type, _name, _nr)	\
+struct _name {	\
+	size_t nr;	\
+	size_t size;	\
+	_type *data;	\
+	_type preallocated[_nr];	\
+}
+
+#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0)
+
+typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char;
+
+#define __minheap_cast(_heap)		(typeof((_heap)->data[0]) *)
+#define __minheap_obj_size(_heap)	sizeof((_heap)->data[0])
+
+/**
+ * struct min_heap_callbacks - Data/functions to customise the min_heap.
+ * @less: Partial order function for this heap.
+ * @swp: Swap elements function.
+ */
+struct min_heap_callbacks {
+	bool (*less)(const void *lhs, const void *rhs, void *args);
+	void (*swp)(void *lhs, void *rhs, void *args);
+};
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static __always_inline
+void swap_words_32(void *a, void *b, size_t n)
+{
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static __always_inline
+void swap_words_64(void *a, void *b, size_t n)
+{
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static __always_inline
+void swap_bytes(void *a, void *b, size_t n)
+{
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
+#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
+#define SWAP_BYTES    ((void (*)(void *, void *, void *))2)
+
+/*
+ * Selects the appropriate swap function based on the element size.
+ */
+static __always_inline
+void *select_swap_func(const void *base, size_t size)
+{
+	if (is_aligned(base, size, 8))
+		return SWAP_WORDS_64;
+	else if (is_aligned(base, size, 4))
+		return SWAP_WORDS_32;
+	else
+		return SWAP_BYTES;
+}
+
+static __always_inline
+void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args),
+	     void *priv)
+{
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, priv);
+}
+
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit.  That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+	i -= size;
+	i -= size & -(i & lsbit);
+	return i / 2;
+}
+
+/* Initialize a min-heap. */
+static __always_inline
+void __min_heap_init_inline(min_heap_char *heap, void *data, size_t size)
+{
+	heap->nr = 0;
+	heap->size = size;
+	if (data)
+		heap->data = data;
+	else
+		heap->data = heap->preallocated;
+}
+
+#define min_heap_init_inline(_heap, _data, _size)	\
+	__min_heap_init_inline(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size)
+
+/* Get the minimum element from the heap. */
+static __always_inline
+void *__min_heap_peek_inline(struct min_heap_char *heap)
+{
+	return heap->nr ? heap->data : NULL;
+}
+
+#define min_heap_peek_inline(_heap)	\
+	(__minheap_cast(_heap)	\
+	 __min_heap_peek_inline(container_of(&(_heap)->nr, min_heap_char, nr)))
+
+/* Check if the heap is full. */
+static __always_inline
+bool __min_heap_full_inline(min_heap_char *heap)
+{
+	return heap->nr == heap->size;
+}
+
+#define min_heap_full_inline(_heap)	\
+	__min_heap_full_inline(container_of(&(_heap)->nr, min_heap_char, nr))
+
+/* Sift the element at pos down the heap. */
+static __always_inline
+void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size,
+				 const struct min_heap_callbacks *func, void *args)
+{
+	const unsigned long lsbit = elem_size & -elem_size;
+	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+	/* pre-scale counters for performance */
+	size_t a = pos * elem_size;
+	size_t b, c, d;
+	size_t n = heap->nr * elem_size;
+
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
+	/* Find the sift-down path all the way to the leaves. */
+	for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
+		b = func->less(data + c, data + d, args) ? c : d;
+
+	/* Special case for the last leaf with no sibling. */
+	if (d == n)
+		b = c;
+
+	/* Backtrack to the correct location. */
+	while (b != a && func->less(data + a, data + b, args))
+		b = parent(b, lsbit, elem_size);
+
+	/* Shift the element into its correct place. */
+	c = b;
+	while (b != a) {
+		b = parent(b, lsbit, elem_size);
+		do_swap(data + b, data + c, elem_size, swp, args);
+	}
+}
+
+#define min_heap_sift_down_inline(_heap, _pos, _func, _args)	\
+	__min_heap_sift_down_inline(container_of(&(_heap)->nr, min_heap_char, nr), _pos,	\
+				    __minheap_obj_size(_heap), _func, _args)
+
+/* Sift up ith element from the heap, O(log2(nr)). */
+static __always_inline
+void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx,
+			       const struct min_heap_callbacks *func, void *args)
+{
+	const unsigned long lsbit = elem_size & -elem_size;
+	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+	/* pre-scale counters for performance */
+	size_t a = idx * elem_size, b;
+
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
+	while (a) {
+		b = parent(a, lsbit, elem_size);
+		if (func->less(data + b, data + a, args))
+			break;
+		do_swap(data + a, data + b, elem_size, swp, args);
+		a = b;
+	}
+}
+
+#define min_heap_sift_up_inline(_heap, _idx, _func, _args)	\
+	__min_heap_sift_up_inline(container_of(&(_heap)->nr, min_heap_char, nr),	\
+				  __minheap_obj_size(_heap), _idx, _func, _args)
+
+/* Floyd's approach to heapification that is O(nr). */
+static __always_inline
+void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size,
+			      const struct min_heap_callbacks *func, void *args)
+{
+	ssize_t i;
+
+	for (i = heap->nr / 2 - 1; i >= 0; i--)
+		__min_heap_sift_down_inline(heap, i, elem_size, func, args);
+}
+
+#define min_heapify_all_inline(_heap, _func, _args)	\
+	__min_heapify_all_inline(container_of(&(_heap)->nr, min_heap_char, nr),	\
+				 __minheap_obj_size(_heap), _func, _args)
+
+/* Remove minimum element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size,
+			   const struct min_heap_callbacks *func, void *args)
+{
+	void *data = heap->data;
+
+	if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+		return false;
+
+	/* Place last element at the root (position 0) and then sift down. */
+	heap->nr--;
+	memcpy(data, data + (heap->nr * elem_size), elem_size);
+	__min_heap_sift_down_inline(heap, 0, elem_size, func, args);
+
+	return true;
+}
+
+#define min_heap_pop_inline(_heap, _func, _args)	\
+	__min_heap_pop_inline(container_of(&(_heap)->nr, min_heap_char, nr),	\
+			      __minheap_obj_size(_heap), _func, _args)
+
+/*
+ * Remove the minimum element and then push the given element. The
+ * implementation performs 1 sift (O(log2(nr))) and is therefore more
+ * efficient than a pop followed by a push that does 2.
+ */
+static __always_inline
+void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
+				const struct min_heap_callbacks *func, void *args)
+{
+	memcpy(heap->data, element, elem_size);
+	__min_heap_sift_down_inline(heap, 0, elem_size, func, args);
+}
+
+#define min_heap_pop_push_inline(_heap, _element, _func, _args)	\
+	__min_heap_pop_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element,	\
+				   __minheap_obj_size(_heap), _func, _args)
+
+/* Push an element on to the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
+			    const struct min_heap_callbacks *func, void *args)
+{
+	void *data = heap->data;
+	size_t pos;
+
+	if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap"))
+		return false;
+
+	/* Place at the end of data. */
+	pos = heap->nr;
+	memcpy(data + (pos * elem_size), element, elem_size);
+	heap->nr++;
+
+	/* Sift child at pos up. */
+	__min_heap_sift_up_inline(heap, elem_size, pos, func, args);
+
+	return true;
+}
+
+#define min_heap_push_inline(_heap, _element, _func, _args)	\
+	__min_heap_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element,	\
+					    __minheap_obj_size(_heap), _func, _args)
+
+/* Remove ith element from the heap, O(log2(nr)). */
+static __always_inline
+bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx,
+			   const struct min_heap_callbacks *func, void *args)
+{
+	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
+
+	if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
+		return false;
+
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
+	/* Place last element at the root (position 0) and then sift down. */
+	heap->nr--;
+	if (idx == heap->nr)
+		return true;
+	do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args);
+	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
+	__min_heap_sift_down_inline(heap, idx, elem_size, func, args);
+
+	return true;
+}
+
+#define min_heap_del_inline(_heap, _idx, _func, _args)	\
+	__min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr),	\
+			      __minheap_obj_size(_heap), _idx, _func, _args)
+
+void __bch2_min_heap_init(min_heap_char *heap, void *data, size_t size);
+void *__bch2_min_heap_peek(struct min_heap_char *heap);
+bool __bch2_min_heap_full(min_heap_char *heap);
+void __bch2_min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size,
+			  const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
+			const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heapify_all(min_heap_char *heap, size_t elem_size,
+		       const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_pop(min_heap_char *heap, size_t elem_size,
+		    const struct min_heap_callbacks *func, void *args);
+void __bch2_min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
+			 const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
+		     const struct min_heap_callbacks *func, void *args);
+bool __bch2_min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+		    const struct min_heap_callbacks *func, void *args);
+
+#define min_heap_init(_heap, _data, _size)	\
+	__bch2_min_heap_init(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size)
+#define min_heap_peek(_heap)	\
+	(__minheap_cast(_heap) __bch2_min_heap_peek(container_of(&(_heap)->nr, min_heap_char, nr)))
+#define min_heap_full(_heap)	\
+	__bch2_min_heap_full(container_of(&(_heap)->nr, min_heap_char, nr))
+#define min_heap_sift_down(_heap, _pos, _func, _args)	\
+	__bch2_min_heap_sift_down(container_of(&(_heap)->nr, min_heap_char, nr), _pos,	\
+			     __minheap_obj_size(_heap), _func, _args)
+#define min_heap_sift_up(_heap, _idx, _func, _args)	\
+	__bch2_min_heap_sift_up(container_of(&(_heap)->nr, min_heap_char, nr),	\
+			   __minheap_obj_size(_heap), _idx, _func, _args)
+#define min_heapify_all(_heap, _func, _args)	\
+	__bch2_min_heapify_all(container_of(&(_heap)->nr, min_heap_char, nr),	\
+			  __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop(_heap, _func, _args)	\
+	__bch2_min_heap_pop(container_of(&(_heap)->nr, min_heap_char, nr),	\
+		       __minheap_obj_size(_heap), _func, _args)
+#define min_heap_pop_push(_heap, _element, _func, _args)	\
+	__bch2_min_heap_pop_push(container_of(&(_heap)->nr, min_heap_char, nr), _element,	\
+			    __minheap_obj_size(_heap), _func, _args)
+#define min_heap_push(_heap, _element, _func, _args)	\
+	__bch2_min_heap_push(container_of(&(_heap)->nr, min_heap_char, nr), _element,	\
+			__minheap_obj_size(_heap), _func, _args)
+#define min_heap_del(_heap, _idx, _func, _args)	\
+	__bch2_min_heap_del(container_of(&(_heap)->nr, min_heap_char, nr),	\
+		       __minheap_obj_size(_heap), _idx, _func, _args)
+
+#endif /* _LINUX_MIN_HEAP_H */