author    Kent Overstreet <kent.overstreet@gmail.com>  2021-04-15 13:05:38 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>  2021-04-19 21:26:48 -0400
commit    ceac31bcb6992cb8b7770d2a0e91b055e5020431 (patch)
tree      77a66249069a2de7bafdcec5afbce387059d0bf9
parent    8ba5e814fd3d0e9559adca72f73202a7dc304acc (diff)
Update bcachefs sources to fe72e70682 bcachefs: Fix for btree_gc repairing interior btree ptrs
-rw-r--r--  .bcachefs_revision              |   2
-rw-r--r--  include/linux/wait.h            |   1
-rw-r--r--  include/trace/events/bcachefs.h | 161
-rw-r--r--  libbcachefs/alloc_background.c  | 462
-rw-r--r--  libbcachefs/alloc_background.h  |   6
-rw-r--r--  libbcachefs/alloc_foreground.c  |  62
-rw-r--r--  libbcachefs/alloc_types.h       |  12
-rw-r--r--  libbcachefs/bcachefs.h          |  22
-rw-r--r--  libbcachefs/bkey_methods.c      |  38
-rw-r--r--  libbcachefs/btree_gc.c          | 597
-rw-r--r--  libbcachefs/btree_gc.h          |  12
-rw-r--r--  libbcachefs/btree_io.c          |  25
-rw-r--r--  libbcachefs/btree_io.h          |   1
-rw-r--r--  libbcachefs/btree_iter.c        | 139
-rw-r--r--  libbcachefs/btree_iter.h        |   2
-rw-r--r--  libbcachefs/btree_update_leaf.c |  77
-rw-r--r--  libbcachefs/buckets.c           | 152
-rw-r--r--  libbcachefs/buckets.h           |  12
-rw-r--r--  libbcachefs/buckets_types.h     |   5
-rw-r--r--  libbcachefs/fs-io.c             | 103
-rw-r--r--  libbcachefs/fsck.c              |  59
-rw-r--r--  libbcachefs/journal.c           |  37
-rw-r--r--  libbcachefs/journal.h           |   5
-rw-r--r--  libbcachefs/journal_reclaim.c   |   9
-rw-r--r--  libbcachefs/move.c              |   2
-rw-r--r--  libbcachefs/movinggc.c          |  15
-rw-r--r--  libbcachefs/recovery.c          |  25
-rw-r--r--  libbcachefs/replicas.c          |   4
-rw-r--r--  libbcachefs/super.c             |  17
-rw-r--r--  libbcachefs/sysfs.c             |  50
-rw-r--r--  libbcachefs/tests.c             |  37
31 files changed, 776 insertions, 1375 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 4891beca..82c9b19f 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-8eca47e4d5c4e6817ad4c020be4280bd82104efd
+fe72e70682cd2430a099c08c3135253675030d28
diff --git a/include/linux/wait.h b/include/linux/wait.h
index c3d98242..d1d33e67 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -90,6 +90,7 @@ do { \
__wait_event(wq, condition); \
} while (0)
+#define wait_event_freezable(wq, condition) ({wait_event(wq, condition); 0; })
#define wait_event_killable(wq, condition) ({wait_event(wq, condition); 0; })
#define wait_event_interruptible(wq, condition) ({wait_event(wq, condition); 0; })
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 5ab05693..e6c3e17a 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -353,28 +353,6 @@ DEFINE_EVENT(btree_node, btree_set_root,
/* Garbage collection */
-DEFINE_EVENT(btree_node, btree_gc_coalesce,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-TRACE_EVENT(btree_gc_coalesce_fail,
- TP_PROTO(struct bch_fs *c, int reason),
- TP_ARGS(c, reason),
-
- TP_STRUCT__entry(
- __field(u8, reason )
- __array(char, uuid, 16 )
- ),
-
- TP_fast_assign(
- __entry->reason = reason;
- memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16);
- ),
-
- TP_printk("%pU: %u", __entry->uuid, __entry->reason)
-);
-
DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
@@ -395,16 +373,6 @@ DEFINE_EVENT(bch_fs, gc_end,
TP_ARGS(c)
);
-DEFINE_EVENT(bch_fs, gc_coalesce_start,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_coalesce_end,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
@@ -412,24 +380,27 @@ DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
/* Allocator */
-TRACE_EVENT(alloc_batch,
- TP_PROTO(struct bch_dev *ca, size_t free, size_t total),
- TP_ARGS(ca, free, total),
+TRACE_EVENT(alloc_scan,
+ TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
+ TP_ARGS(ca, found, inc_gen, inc_gen_skipped),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(size_t, free )
- __field(size_t, total )
+ __field(dev_t, dev )
+ __field(u64, found )
+ __field(u64, inc_gen )
+ __field(u64, inc_gen_skipped )
),
TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
- __entry->free = free;
- __entry->total = total;
+ __entry->dev = ca->disk_sb.bdev->bd_dev;
+ __entry->found = found;
+ __entry->inc_gen = inc_gen;
+ __entry->inc_gen_skipped = inc_gen_skipped;
),
- TP_printk("%pU free %zu total %zu",
- __entry->uuid, __entry->free, __entry->total)
+ TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
);
TRACE_EVENT(invalidate,
@@ -449,13 +420,10 @@ TRACE_EVENT(invalidate,
),
TP_printk("invalidated %u sectors at %d,%d sector=%llu",
- __entry->sectors, MAJOR(__entry->dev),
- MINOR(__entry->dev), __entry->offset)
-);
-
-DEFINE_EVENT(bch_fs, rescale_prios,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+ __entry->sectors,
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->offset)
);
DECLARE_EVENT_CLASS(bucket_alloc,
@@ -463,16 +431,18 @@ DECLARE_EVENT_CLASS(bucket_alloc,
TP_ARGS(ca, reserve),
TP_STRUCT__entry(
- __array(char, uuid, 16)
- __field(enum alloc_reserve, reserve )
+ __field(dev_t, dev )
+ __field(enum alloc_reserve, reserve )
),
TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
- __entry->reserve = reserve;
+ __entry->dev = ca->disk_sb.bdev->bd_dev;
+ __entry->reserve = reserve;
),
- TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve)
+ TP_printk("%d,%d reserve %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->reserve)
);
DEFINE_EVENT(bucket_alloc, bucket_alloc,
@@ -598,77 +568,93 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
TRACE_EVENT(trans_restart_would_deadlock,
TP_PROTO(unsigned long trans_ip,
unsigned long caller_ip,
+ bool in_traverse_all,
unsigned reason,
enum btree_id have_btree_id,
unsigned have_iter_type,
+ struct bpos *have_pos,
enum btree_id want_btree_id,
- unsigned want_iter_type),
- TP_ARGS(trans_ip, caller_ip, reason,
- have_btree_id, have_iter_type,
- want_btree_id, want_iter_type),
+ unsigned want_iter_type,
+ struct bpos *want_pos),
+ TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason,
+ have_btree_id, have_iter_type, have_pos,
+ want_btree_id, want_iter_type, want_pos),
TP_STRUCT__entry(
__field(unsigned long, trans_ip )
__field(unsigned long, caller_ip )
+ __field(u8, in_traverse_all )
__field(u8, reason )
__field(u8, have_btree_id )
__field(u8, have_iter_type )
__field(u8, want_btree_id )
__field(u8, want_iter_type )
+
+ __field(u64, have_pos_inode )
+ __field(u64, have_pos_offset )
+ __field(u32, have_pos_snapshot)
+ __field(u32, want_pos_snapshot)
+ __field(u64, want_pos_inode )
+ __field(u64, want_pos_offset )
),
TP_fast_assign(
__entry->trans_ip = trans_ip;
__entry->caller_ip = caller_ip;
+ __entry->in_traverse_all = in_traverse_all;
__entry->reason = reason;
__entry->have_btree_id = have_btree_id;
__entry->have_iter_type = have_iter_type;
__entry->want_btree_id = want_btree_id;
__entry->want_iter_type = want_iter_type;
+
+ __entry->have_pos_inode = have_pos->inode;
+ __entry->have_pos_offset = have_pos->offset;
+ __entry->have_pos_snapshot = have_pos->snapshot;
+
+ __entry->want_pos_inode = want_pos->inode;
+ __entry->want_pos_offset = want_pos->offset;
+ __entry->want_pos_snapshot = want_pos->snapshot;
),
- TP_printk("%ps %pS because %u have %u:%u want %u:%u",
+ TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
(void *) __entry->trans_ip,
(void *) __entry->caller_ip,
+ __entry->in_traverse_all,
__entry->reason,
__entry->have_btree_id,
__entry->have_iter_type,
+ __entry->have_pos_inode,
+ __entry->have_pos_offset,
+ __entry->have_pos_snapshot,
__entry->want_btree_id,
- __entry->want_iter_type)
-);
-
-TRACE_EVENT(trans_restart_iters_realloced,
- TP_PROTO(unsigned long ip, unsigned nr),
- TP_ARGS(ip, nr),
-
- TP_STRUCT__entry(
- __field(unsigned long, ip )
- __field(unsigned, nr )
- ),
-
- TP_fast_assign(
- __entry->ip = ip;
- __entry->nr = nr;
- ),
-
- TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr)
+ __entry->want_iter_type,
+ __entry->want_pos_inode,
+ __entry->want_pos_offset,
+ __entry->want_pos_snapshot)
);
TRACE_EVENT(trans_restart_mem_realloced,
- TP_PROTO(unsigned long ip, unsigned long bytes),
- TP_ARGS(ip, bytes),
+ TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
+ unsigned long bytes),
+ TP_ARGS(trans_ip, caller_ip, bytes),
TP_STRUCT__entry(
- __field(unsigned long, ip )
- __field(unsigned long, bytes )
+ __field(unsigned long, trans_ip )
+ __field(unsigned long, caller_ip )
+ __field(unsigned long, bytes )
),
TP_fast_assign(
- __entry->ip = ip;
- __entry->bytes = bytes;
+ __entry->trans_ip = trans_ip;
+ __entry->caller_ip = caller_ip;
+ __entry->bytes = bytes;
),
- TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes)
+ TP_printk("%ps %pS bytes %lu",
+ (void *) __entry->trans_ip,
+ (void *) __entry->caller_ip,
+ __entry->bytes)
);
DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
@@ -726,6 +712,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse,
TP_ARGS(ip)
);
+DEFINE_EVENT(transaction_restart, trans_traverse_all,
+ TP_PROTO(unsigned long ip),
+ TP_ARGS(ip)
+);
+
DECLARE_EVENT_CLASS(node_lock_fail,
TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
TP_ARGS(level, iter_seq, node, node_seq),
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 55562763..912020e6 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -25,44 +25,19 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
+const char * const bch2_allocator_states[] = {
+#define x(n) #n,
+ ALLOC_THREAD_STATES()
+#undef x
+ NULL
+};
+
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
BCH_ALLOC_FIELDS_V1()
#undef x
};
-/* Ratelimiting/PD controllers */
-
-static void pd_controllers_update(struct work_struct *work)
-{
- struct bch_fs *c = container_of(to_delayed_work(work),
- struct bch_fs,
- pd_controllers_update);
- struct bch_dev *ca;
- s64 free = 0, fragmented = 0;
- unsigned i;
-
- for_each_member_device(ca, c, i) {
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
-
- free += bucket_to_sector(ca,
- __dev_buckets_available(ca, stats)) << 9;
- /*
- * Bytes of internal fragmentation, which can be
- * reclaimed by copy GC
- */
- fragmented += max_t(s64, 0, (bucket_to_sector(ca,
- stats.d[BCH_DATA_user].buckets +
- stats.d[BCH_DATA_cached].buckets) -
- (stats.d[BCH_DATA_user].sectors +
- stats.d[BCH_DATA_cached].sectors)) << 9);
- }
-
- bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
- schedule_delayed_work(&c->pd_controllers_update,
- c->pd_controllers_update_seconds * HZ);
-}
-
/* Persistent alloc info: */
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
@@ -234,7 +209,7 @@ void bch2_alloc_pack(struct bch_fs *c,
bch2_alloc_pack_v2(dst, src);
}
-static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
@@ -254,7 +229,7 @@ const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
return "invalid device";
/* allow for unknown fields */
- if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
+ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v))
return "incorrect value size";
return NULL;
@@ -279,9 +254,9 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
- pr_buf(out, "gen %u oldest_gen %u data_type %u",
- u.gen, u.oldest_gen, u.data_type);
-#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name);
+ pr_buf(out, "gen %u oldest_gen %u data_type %s",
+ u.gen, u.oldest_gen, bch2_data_types[u.data_type]);
+#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name);
BCH_ALLOC_FIELDS_V2()
#undef x
}
@@ -322,7 +297,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc,
NULL, bch2_alloc_read_fn);
up_read(&c->gc_lock);
-
if (ret) {
bch_err(c, "error reading alloc info: %i", ret);
return ret;
@@ -467,52 +441,6 @@ out:
* commands to the newly free buckets, then puts them on the various freelists.
*/
-/**
- * wait_buckets_available - wait on reclaimable buckets
- *
- * If there aren't enough available buckets to fill up free_inc, wait until
- * there are.
- */
-static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
-{
- unsigned long gc_count = c->gc_count;
- s64 available;
- int ret = 0;
-
- ca->allocator_state = ALLOCATOR_BLOCKED;
- closure_wake_up(&c->freelist_wait);
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- ret = 1;
- break;
- }
-
- if (gc_count != c->gc_count)
- ca->inc_gen_really_needs_gc = 0;
-
- available = dev_buckets_reclaimable(ca);
- available -= ca->inc_gen_really_needs_gc;
-
- available = max(available, 0LL);
-
- if (available)
- break;
-
- up_read(&c->gc_lock);
- schedule();
- try_to_freeze();
- down_read(&c->gc_lock);
- }
-
- __set_current_state(TASK_RUNNING);
- ca->allocator_state = ALLOCATOR_RUNNING;
- closure_wake_up(&c->freelist_wait);
-
- return ret;
-}
-
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
struct bucket_mark m)
{
@@ -530,11 +458,8 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
gc_gen = bucket_gc_gen(bucket(ca, b));
- if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
- ca->inc_gen_needs_gc++;
-
- if (gc_gen >= BUCKET_GC_GEN_MAX)
- ca->inc_gen_really_needs_gc++;
+ ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
+ ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
return gc_gen < BUCKET_GC_GEN_MAX;
}
@@ -611,6 +536,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
struct bucket_mark m = READ_ONCE(g->mark);
unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
+ cond_resched();
+
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
@@ -627,8 +554,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
.key = key,
};
}
-
- cond_resched();
}
if (e.nr)
@@ -721,6 +646,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
size_t i, nr = 0;
ca->inc_gen_needs_gc = 0;
+ ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
case BCH_CACHE_REPLACEMENT_lru:
@@ -742,25 +668,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
return nr;
}
-static inline long next_alloc_bucket(struct bch_dev *ca)
-{
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
- while (ca->alloc_heap.used) {
- if (top->nr) {
- size_t b = top->bucket;
-
- top->bucket++;
- top->nr--;
- return b;
- }
-
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
- }
-
- return -1;
-}
-
/*
* returns sequence number of most recent journal entry that updated this
* bucket:
@@ -783,17 +690,56 @@ static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
}
}
-static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
- struct bch_dev *ca,
- struct btree_iter *iter,
- u64 *journal_seq, unsigned flags)
+static int bucket_invalidate_btree(struct btree_trans *trans,
+ struct bch_dev *ca, u64 b)
{
struct bch_fs *c = trans->c;
- struct bkey_alloc_buf a;
+ struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
- bool invalidating_cached_data;
+ struct btree_iter *iter =
+ bch2_trans_get_iter(trans, BTREE_ID_alloc,
+ POS(ca->dev_idx, b),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL|
+ BTREE_ITER_INTENT);
+ int ret;
+
+ a = bch2_trans_kmalloc(trans, sizeof(*a));
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto err;
+
+ percpu_down_read(&c->mark_lock);
+ g = bucket(ca, b);
+ m = READ_ONCE(g->mark);
+ u = alloc_mem_to_key(iter, g, m);
+ percpu_up_read(&c->mark_lock);
+
+ u.gen++;
+ u.data_type = 0;
+ u.dirty_sectors = 0;
+ u.cached_sectors = 0;
+ u.read_time = atomic64_read(&c->io_clock[READ].now);
+ u.write_time = atomic64_read(&c->io_clock[WRITE].now);
+
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 *journal_seq, unsigned flags)
+{
+ struct bucket *g;
+ struct bucket_mark m;
size_t b;
int ret = 0;
@@ -808,7 +754,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
BUG_ON(m.dirty_sectors);
- bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+ bch2_mark_alloc_bucket(c, ca, b, true);
spin_lock(&c->freelist_lock);
verify_not_on_freelist(c, ca, b);
@@ -839,48 +785,12 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
goto out;
}
- bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
-retry:
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- percpu_down_read(&c->mark_lock);
- g = bucket(ca, iter->pos.offset);
- m = READ_ONCE(g->mark);
- u = alloc_mem_to_key(iter, g, m);
-
- percpu_up_read(&c->mark_lock);
-
- invalidating_cached_data = u.cached_sectors != 0;
-
- u.gen++;
- u.data_type = 0;
- u.dirty_sectors = 0;
- u.cached_sectors = 0;
- u.read_time = atomic64_read(&c->io_clock[READ].now);
- u.write_time = atomic64_read(&c->io_clock[WRITE].now);
-
- bch2_alloc_pack(c, &a, u);
- bch2_trans_update(trans, iter, &a.k,
- BTREE_TRIGGER_BUCKET_INVALIDATE);
-
- /*
- * XXX:
- * when using deferred btree updates, we have journal reclaim doing
- * btree updates and thus requiring the allocator to make forward
- * progress, and here the allocator is requiring space in the journal -
- * so we need a journal pre-reservation:
- */
- ret = bch2_trans_commit(trans, NULL,
- invalidating_cached_data ? journal_seq : NULL,
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- flags);
- if (ret == -EINTR)
- goto retry;
+ ret = bch2_trans_do(c, NULL, journal_seq,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RESERVED|
+ flags,
+ bucket_invalidate_btree(&trans, ca, b));
out:
if (!ret) {
/* remove from alloc_heap: */
@@ -905,8 +815,7 @@ out:
percpu_down_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
- bch2_mark_alloc_bucket(c, ca, b, false,
- gc_pos_alloc(c, NULL), 0);
+ bch2_mark_alloc_bucket(c, ca, b, false);
BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
BUG_ON(b != b2);
@@ -923,29 +832,23 @@ out:
*/
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
- struct btree_trans trans;
- struct btree_iter *iter;
u64 journal_seq = 0;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc,
- POS(ca->dev_idx, 0),
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
-
/* Only use nowait if we've already invalidated at least one bucket: */
while (!ret &&
!fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used)
- ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq,
- BTREE_INSERT_GC_LOCK_HELD|
+ ca->alloc_heap.used) {
+ ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
(!fifo_empty(&ca->free_inc)
? BTREE_INSERT_NOWAIT : 0));
-
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
+ /*
+ * We only want to batch up invalidates when they're going to
+ * require flushing the journal:
+ */
+ if (!journal_seq)
+ break;
+ }
/* If we used NOWAIT, don't return the error: */
if (!fifo_empty(&ca->free_inc))
@@ -965,83 +868,72 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
return 0;
}
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
+static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
+{
+ if (ca->allocator_state != new_state) {
+ ca->allocator_state = new_state;
+ closure_wake_up(&ca->fs->freelist_wait);
+ }
+}
+
+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
unsigned i;
int ret = 0;
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
-
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (!test_bit(BCH_FS_STARTED, &c->flags) &&
- i == RESERVE_MOVINGGC)
- continue;
-
- if (fifo_push(&ca->free[i], bucket)) {
- fifo_pop(&ca->free_inc, bucket);
-
- closure_wake_up(&c->freelist_wait);
- ca->allocator_state = ALLOCATOR_RUNNING;
-
- spin_unlock(&c->freelist_lock);
- goto out;
- }
- }
-
- if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) {
- ca->allocator_state = ALLOCATOR_BLOCKED_FULL;
- closure_wake_up(&c->freelist_wait);
- }
-
- spin_unlock(&c->freelist_lock);
+ spin_lock(&c->freelist_lock);
+ for (i = 0; i < RESERVE_NR; i++) {
+ /*
+ * Don't strand buckets on the copygc freelist until
+ * after recovery is finished:
+ */
+ if (i == RESERVE_MOVINGGC &&
+ !test_bit(BCH_FS_STARTED, &c->flags))
+ continue;
- if ((current->flags & PF_KTHREAD) &&
- kthread_should_stop()) {
+ if (fifo_push(&ca->free[i], b)) {
+ fifo_pop(&ca->free_inc, b);
ret = 1;
break;
}
-
- schedule();
- try_to_freeze();
}
-out:
- __set_current_state(TASK_RUNNING);
+ spin_unlock(&c->freelist_lock);
+
+ ca->allocator_state = ret
+ ? ALLOCATOR_running
+ : ALLOCATOR_blocked_full;
+ closure_wake_up(&c->freelist_wait);
return ret;
}
-/*
- * Pulls buckets off free_inc, discards them (if enabled), then adds them to
- * freelists, waiting until there's room if necessary:
- */
-static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
+static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
- while (!fifo_empty(&ca->free_inc)) {
- size_t bucket = fifo_peek(&ca->free_inc);
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bucket),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- if (push_invalidated_bucket(c, ca, bucket))
- return 1;
- }
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
+ ca->mi.bucket_size, GFP_NOFS, 0);
+}
- return 0;
+static bool allocator_thread_running(struct bch_dev *ca)
+{
+ unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
+ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
+ ? ALLOCATOR_running
+ : ALLOCATOR_stopped;
+ alloc_thread_set_state(ca, state);
+ return state == ALLOCATOR_running;
}
-static inline bool allocator_thread_running(struct bch_dev *ca)
+static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
{
- return ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags);
+ s64 available = dev_buckets_reclaimable(ca) -
+ (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
+ bool ret = available > 0;
+
+ alloc_thread_set_state(ca, ret
+ ? ALLOCATOR_running
+ : ALLOCATOR_blocked);
+ return ret;
}
/**
@@ -1056,62 +948,29 @@ static int bch2_allocator_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
+ unsigned long gc_count = c->gc_count;
size_t nr;
int ret;
set_freezable();
while (1) {
- if (!allocator_thread_running(ca)) {
- ca->allocator_state = ALLOCATOR_STOPPED;
- if (kthread_wait_freezable(allocator_thread_running(ca)))
- break;
- }
-
- ca->allocator_state = ALLOCATOR_RUNNING;
-
- cond_resched();
- if (kthread_should_stop())
- break;
-
- pr_debug("discarding %zu invalidated buckets",
- fifo_used(&ca->free_inc));
-
- ret = discard_invalidated_buckets(c, ca);
+ ret = kthread_wait_freezable(allocator_thread_running(ca));
if (ret)
goto stop;
- down_read(&c->gc_lock);
-
- ret = bch2_invalidate_buckets(c, ca);
- if (ret) {
- up_read(&c->gc_lock);
- goto stop;
- }
-
- if (!fifo_empty(&ca->free_inc)) {
- up_read(&c->gc_lock);
- continue;
- }
-
- pr_debug("free_inc now empty");
-
- while (1) {
+ while (!ca->alloc_heap.used) {
cond_resched();
- /*
- * Find some buckets that we can invalidate, either
- * they're completely unused, or only contain clean data
- * that's been written back to the backing device or
- * another cache tier
- */
- pr_debug("scanning for reclaimable buckets");
+ ret = kthread_wait_freezable(buckets_available(ca, gc_count));
+ if (ret)
+ goto stop;
+ gc_count = c->gc_count;
nr = find_reclaimable_buckets(c, ca);
- pr_debug("found %zu buckets", nr);
-
- trace_alloc_batch(ca, nr, ca->alloc_heap.size);
+ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+ ca->inc_gen_really_needs_gc);
if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
ca->inc_gen_really_needs_gc) &&
@@ -1119,37 +978,24 @@ static int bch2_allocator_thread(void *arg)
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
}
-
- if (nr)
- break;
-
- /*
- * If we found any buckets, we have to invalidate them
- * before we scan for more - but if we didn't find very
- * many we may want to wait on more buckets being
- * available so we don't spin:
- */
- ret = wait_buckets_available(c, ca);
- if (ret) {
- up_read(&c->gc_lock);
- goto stop;
- }
}
- up_read(&c->gc_lock);
+ ret = bch2_invalidate_buckets(c, ca);
+ if (ret)
+ goto stop;
- pr_debug("%zu buckets to invalidate", nr);
+ while (!fifo_empty(&ca->free_inc)) {
+ u64 b = fifo_peek(&ca->free_inc);
- /*
- * alloc_heap is now full of newly-invalidated buckets: next,
- * write out the new bucket gens:
- */
- }
+ discard_one_bucket(c, ca, b);
+ ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
+ if (ret)
+ goto stop;
+ }
+ }
stop:
- pr_debug("alloc thread stopping (ret %i)", ret);
- ca->allocator_state = ALLOCATOR_STOPPED;
- closure_wake_up(&c->freelist_wait);
+ alloc_thread_set_state(ca, ALLOCATOR_stopped);
return 0;
}
@@ -1158,7 +1004,7 @@ stop:
void bch2_recalc_capacity(struct bch_fs *c)
{
struct bch_dev *ca;
- u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0;
+ u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
unsigned i, j;
@@ -1201,8 +1047,6 @@ void bch2_recalc_capacity(struct bch_fs *c)
dev_reserve *= ca->mi.bucket_size;
- copygc_threshold += dev_reserve;
-
capacity += bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket);
@@ -1220,7 +1064,6 @@ void bch2_recalc_capacity(struct bch_fs *c)
reserved_sectors = min(reserved_sectors, capacity);
- c->copygc_threshold = copygc_threshold;
c->capacity = capacity - reserved_sectors;
c->bucket_size_max = bucket_size_max;
@@ -1331,7 +1174,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
{
if (ca->alloc_thread)
closure_wait_event(&c->freelist_wait,
- ca->allocator_state != ALLOCATOR_RUNNING);
+ ca->allocator_state != ALLOCATOR_running);
}
/* stop allocator thread: */
@@ -1385,7 +1228,4 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
-
- c->pd_controllers_update_seconds = 5;
- INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
}
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 6fededcd..ad15a806 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -6,6 +6,8 @@
#include "alloc_types.h"
#include "debug.h"
+extern const char * const bch2_allocator_states[];
+
struct bkey_alloc_unpacked {
u64 bucket;
u8 dev;
@@ -98,10 +100,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
rcu_read_lock();
p = rcu_dereference(ca->alloc_thread);
- if (p) {
+ if (p)
wake_up_process(p);
- ca->allocator_state = ALLOCATOR_RUNNING;
- }
rcu_read_unlock();
}
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index a29d313d..412fed47 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -1,57 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Primary bucket allocation code
- *
* Copyright 2012 Google, Inc.
*
- * Allocation in bcache is done in terms of buckets:
- *
- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
- * btree pointers - they must match for the pointer to be considered valid.
- *
- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
- * bucket simply by incrementing its gen.
- *
- * The gens (along with the priorities; it's really the gens are important but
- * the code is named as if it's the priorities) are written in an arbitrary list
- * of buckets on disk, with a pointer to them in the journal header.
- *
- * When we invalidate a bucket, we have to write its new gen to disk and wait
- * for that write to complete before we use it - otherwise after a crash we
- * could have pointers that appeared to be good but pointed to data that had
- * been overwritten.
- *
- * Since the gens and priorities are all stored contiguously on disk, we can
- * batch this up: We fill up the free_inc list with freshly invalidated buckets,
- * call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
- *
- * free_inc isn't the only freelist - if it was, we'd often have to sleep while
- * priorities and gens were being written before we could allocate. c->free is a
- * smaller freelist, and buckets on that list are always ready to be used.
- *
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
- * It's important to ensure that gens don't wrap around - with respect to
- * either the oldest gen in the btree or the gen on disk. This is quite
- * difficult to do in practice, but we explicitly guard against it anyways - if
- * a bucket is in danger of wrapping around we simply skip invalidating it that
- * time around, and we garbage collect or rewrite the priorities sooner than we
- * would have otherwise.
+ * Foreground allocator code: allocate buckets from freelist, and allocate in
+ * sector granularity from writepoints.
*
* bch2_bucket_alloc() allocates a single bucket from a specific device.
*
* bch2_bucket_alloc_set() allocates one or more buckets from different devices
* in a given filesystem.
- *
- * invalidate_buckets() drives all the processes described above. It's called
- * from bch2_bucket_alloc() and a few other places that need to make sure free
- * buckets are ready.
- *
- * invalidate_buckets_(lru|fifo)() find buckets that are available to be
- * invalidated, and then invalidate them and stick them on the free_inc list -
- * in either lru or fifo order.
*/
#include "bcachefs.h"
@@ -98,8 +55,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
- false, gc_pos_alloc(c, ob), 0);
+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false);
ob->valid = false;
ob->type = 0;
@@ -109,7 +65,9 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
spin_lock(&c->freelist_lock);
ob->freelist = c->open_buckets_freelist;
c->open_buckets_freelist = ob - c->open_buckets;
+
c->open_buckets_nr_free++;
+ ca->nr_open_buckets--;
spin_unlock(&c->freelist_lock);
closure_wake_up(&c->open_buckets_wait);
@@ -316,6 +274,7 @@ out:
c->blocked_allocate = 0;
}
+ ca->nr_open_buckets++;
spin_unlock(&c->freelist_lock);
bch2_wake_allocator(ca);
@@ -680,11 +639,14 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
{
struct write_point *wp;
+ rcu_read_lock();
hlist_for_each_entry_rcu(wp, head, node)
if (wp->write_point == write_point)
- return wp;
-
- return NULL;
+ goto out;
+ wp = NULL;
+out:
+ rcu_read_unlock();
+ return wp;
}
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index be164d61..4a1cd8b7 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -10,6 +10,18 @@
struct ec_bucket_buf;
+#define ALLOC_THREAD_STATES() \
+ x(stopped) \
+ x(running) \
+ x(blocked) \
+ x(blocked_full)
+
+enum allocator_states {
+#define x(n) ALLOCATOR_##n,
+ ALLOC_THREAD_STATES()
+#undef x
+};
+
enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1,
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 549cded6..aade5624 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -379,7 +379,6 @@ enum gc_phase {
GC_PHASE_BTREE_reflink,
GC_PHASE_PENDING_DELETE,
- GC_PHASE_ALLOC,
};
struct gc_pos {
@@ -447,6 +446,7 @@ struct bch_dev {
*/
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
+ unsigned nr_open_buckets;
open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_partial_nr;
@@ -456,16 +456,7 @@ struct bch_dev {
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
- /*
- * XXX: this should be an enum for allocator state, so as to include
- * error state
- */
- enum {
- ALLOCATOR_STOPPED,
- ALLOCATOR_RUNNING,
- ALLOCATOR_BLOCKED,
- ALLOCATOR_BLOCKED_FULL,
- } allocator_state;
+ enum allocator_states allocator_state;
alloc_heap alloc_heap;
@@ -664,9 +655,6 @@ struct bch_fs {
struct workqueue_struct *copygc_wq;
/* ALLOCATION */
- struct delayed_work pd_controllers_update;
- unsigned pd_controllers_update_seconds;
-
struct bch_devs_mask rw_devs[BCH_DATA_NR];
u64 capacity; /* sectors */
@@ -726,6 +714,9 @@ struct bch_fs {
atomic_t kick_gc;
unsigned long gc_count;
+ enum btree_id gc_gens_btree;
+ struct bpos gc_gens_pos;
+
/*
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
* has been marked by GC.
@@ -772,9 +763,8 @@ struct bch_fs {
/* COPYGC */
struct task_struct *copygc_thread;
copygc_heap copygc_heap;
- struct bch_pd_controller copygc_pd;
struct write_point copygc_write_point;
- u64 copygc_threshold;
+ s64 copygc_wait;
/* STRIPES: */
GENRADIX(struct stripe) stripes[2];
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 6fe95b80..450b613d 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -98,12 +98,50 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
return bch2_bkey_ops[k.k->type].key_invalid(c, k);
}
+static unsigned bch2_key_types_allowed[] = {
+ [BKEY_TYPE_extents] =
+ (1U << KEY_TYPE_discard)|
+ (1U << KEY_TYPE_error)|
+ (1U << KEY_TYPE_extent)|
+ (1U << KEY_TYPE_reservation)|
+ (1U << KEY_TYPE_reflink_p)|
+ (1U << KEY_TYPE_inline_data),
+ [BKEY_TYPE_inodes] =
+ (1U << KEY_TYPE_inode)|
+ (1U << KEY_TYPE_inode_generation),
+ [BKEY_TYPE_dirents] =
+ (1U << KEY_TYPE_hash_whiteout)|
+ (1U << KEY_TYPE_dirent),
+ [BKEY_TYPE_xattrs] =
+ (1U << KEY_TYPE_hash_whiteout)|
+ (1U << KEY_TYPE_xattr),
+ [BKEY_TYPE_alloc] =
+ (1U << KEY_TYPE_alloc)|
+ (1U << KEY_TYPE_alloc_v2),
+ [BKEY_TYPE_quotas] =
+ (1U << KEY_TYPE_quota),
+ [BKEY_TYPE_stripes] =
+ (1U << KEY_TYPE_stripe),
+ [BKEY_TYPE_reflink] =
+ (1U << KEY_TYPE_reflink_v)|
+ (1U << KEY_TYPE_indirect_inline_data),
+ [BKEY_TYPE_btree] =
+ (1U << KEY_TYPE_btree_ptr)|
+ (1U << KEY_TYPE_btree_ptr_v2),
+};
+
const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type)
{
+ unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
+ bch2_key_types_allowed[type] ;
+
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
+ if (!(key_types_allowed & (1U << k.k->type)))
+ return "invalid key type for this btree";
+
if (type == BKEY_TYPE_btree &&
bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 751920a5..536947cc 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -250,39 +250,54 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
bkey_reassemble(new, *k);
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
-
- (ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
- (!ptr->cached &&
- gen_cmp(ptr->gen, g->mark.gen) < 0);
- }));
+ if (level) {
+ /*
+ * We don't want to drop btree node pointers - if the
+ * btree node isn't there anymore, the read path will
+ * sort it out:
+ */
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+ ptr->gen = g->mark.gen;
+ }
+ } else {
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+ (ptr->cached &&
+ (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+ (!ptr->cached &&
+ gen_cmp(ptr->gen, g->mark.gen) < 0);
+ }));
again:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_extent_entry_for_each(ptrs, entry) {
- if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
- struct stripe *m = genradix_ptr(&c->stripes[true],
- entry->stripe_ptr.idx);
- union bch_extent_entry *next_ptr;
-
- bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
- if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
- goto found;
- next_ptr = NULL;
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct stripe *m = genradix_ptr(&c->stripes[true],
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
found:
- if (!next_ptr) {
- bch_err(c, "aieee, found stripe ptr with no data ptr");
- continue;
- }
-
- if (!m || !m->alive ||
- !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
- &next_ptr->ptr,
- m->sectors)) {
- bch2_bkey_extent_entry_drop(new, entry);
- goto again;
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
}
}
}
@@ -301,10 +316,10 @@ fsck_err:
static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
unsigned level, bool is_root,
- struct bkey_s_c k,
+ struct bkey_s_c *k,
u8 *max_stale, bool initial)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bkey_ptrs_c ptrs;
const struct bch_extent_ptr *ptr;
unsigned flags =
BTREE_TRIGGER_GC|
@@ -313,28 +328,29 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
- k.k->version.lo > journal_cur_seq(&c->journal));
+ k->k->version.lo > journal_cur_seq(&c->journal));
- if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
+ if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
"key version number higher than recorded: %llu > %llu",
- k.k->version.lo,
+ k->k->version.lo,
atomic64_read(&c->key_version)))
- atomic64_set(&c->key_version, k.k->version.lo);
+ atomic64_set(&c->key_version, k->k->version.lo);
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_bkey_replicas_marked(c, k), c,
+ fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c,
"superblock not marked as containing replicas (type %u)",
- k.k->type)) {
- ret = bch2_mark_bkey_replicas(c, k);
+ k->k->type)) {
+ ret = bch2_mark_bkey_replicas(c, *k);
if (ret) {
bch_err(c, "error marking bkey replicas: %i", ret);
goto err;
}
}
- ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k);
+ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
}
+ ptrs = bch2_bkey_ptrs_c(*k);
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr, true);
@@ -345,7 +361,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
- bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
+ bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags);
fsck_err:
err:
if (ret)
@@ -374,7 +390,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
- k, max_stale, initial);
+ &k, max_stale, initial);
if (ret)
break;
@@ -396,12 +412,13 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
- bool initial)
+ bool initial, bool metadata_only)
{
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
- unsigned depth = bch2_expensive_debug_checks ? 0
+ unsigned depth = metadata_only ? 1
+ : bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
@@ -445,10 +462,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
- if (!btree_node_fake(b))
+ if (!btree_node_fake(b)) {
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
- bkey_i_to_s_c(&b->key),
- &max_stale, initial);
+ &k, &max_stale, initial);
+ }
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock);
@@ -474,7 +493,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
- k, &max_stale, true);
+ &k, &max_stale, true);
if (ret) {
bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
break;
@@ -544,11 +563,13 @@ fsck_err:
}
static int bch2_gc_btree_init(struct bch_fs *c,
- enum btree_id btree_id)
+ enum btree_id btree_id,
+ bool metadata_only)
{
struct btree *b;
- unsigned target_depth = bch2_expensive_debug_checks ? 0
- : !btree_node_type_needs_gc(btree_id) ? 1
+ unsigned target_depth = metadata_only ? 1
+ : bch2_expensive_debug_checks ? 0
+ : !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
char buf[100];
@@ -575,10 +596,12 @@ static int bch2_gc_btree_init(struct bch_fs *c,
if (b->c.level >= target_depth)
ret = bch2_gc_btree_init_recurse(c, b, target_depth);
- if (!ret)
+ if (!ret) {
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
- bkey_i_to_s_c(&b->key),
- &max_stale, true);
+ &k, &max_stale, true);
+ }
fsck_err:
six_unlock_read(&b->c.lock);
@@ -593,7 +616,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r);
}
-static int bch2_gc_btrees(struct bch_fs *c, bool initial)
+static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
{
enum btree_id ids[BTREE_ID_NR];
unsigned i;
@@ -605,8 +628,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial)
for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i];
int ret = initial
- ? bch2_gc_btree_init(c, id)
- : bch2_gc_btree(c, id, initial);
+ ? bch2_gc_btree_init(c, id, metadata_only)
+ : bch2_gc_btree(c, id, initial, metadata_only);
if (ret) {
bch_err(c, "%s: ret %i", __func__, ret);
return ret;
@@ -707,52 +730,6 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
}
#endif
-static void bch2_mark_allocator_buckets(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct open_bucket *ob;
- size_t i, j, iter;
- unsigned ci;
-
- percpu_down_read(&c->mark_lock);
-
- spin_lock(&c->freelist_lock);
- gc_pos_set(c, gc_pos_alloc(c, NULL));
-
- for_each_member_device(ca, c, ci) {
- fifo_for_each_entry(i, &ca->free_inc, iter)
- bch2_mark_alloc_bucket(c, ca, i, true,
- gc_pos_alloc(c, NULL),
- BTREE_TRIGGER_GC);
-
-
-
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- bch2_mark_alloc_bucket(c, ca, i, true,
- gc_pos_alloc(c, NULL),
- BTREE_TRIGGER_GC);
- }
-
- spin_unlock(&c->freelist_lock);
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid) {
- gc_pos_set(c, gc_pos_alloc(c, ob));
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
- gc_pos_alloc(c, ob),
- BTREE_TRIGGER_GC);
- }
- spin_unlock(&ob->lock);
- }
-
- percpu_up_read(&c->mark_lock);
-}
-
static void bch2_gc_free(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -775,10 +752,10 @@ static void bch2_gc_free(struct bch_fs *c)
}
static int bch2_gc_done(struct bch_fs *c,
- bool initial)
+ bool initial, bool metadata_only)
{
struct bch_dev *ca;
- bool verify = (!initial ||
+ bool verify = !metadata_only && (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
unsigned i, dev;
int ret = 0;
@@ -805,7 +782,7 @@ static int bch2_gc_done(struct bch_fs *c,
if (dst->b[b].mark._f != src->b[b].mark._f) { \
if (verify) \
fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", i, b, \
+ ": got %u, should be %u", dev, b, \
dst->b[b].mark.gen, \
bch2_data_types[dst->b[b].mark.data_type],\
dst->b[b].mark._f, src->b[b].mark._f); \
@@ -813,11 +790,11 @@ static int bch2_gc_done(struct bch_fs *c,
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_dev_field(_f, _msg, ...) \
- copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
+ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
- {
+ if (!metadata_only) {
struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
@@ -857,7 +834,6 @@ static int bch2_gc_done(struct bch_fs *c,
for (b = 0; b < src->nbuckets; b++) {
copy_bucket_field(gen);
copy_bucket_field(data_type);
- copy_bucket_field(owned_by_allocator);
copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
@@ -890,20 +866,28 @@ static int bch2_gc_done(struct bch_fs *c,
copy_fs_field(hidden, "hidden");
copy_fs_field(btree, "btree");
- copy_fs_field(data, "data");
- copy_fs_field(cached, "cached");
- copy_fs_field(reserved, "reserved");
- copy_fs_field(nr_inodes,"nr_inodes");
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(persistent_reserved[i],
- "persistent_reserved[%i]", i);
+ if (!metadata_only) {
+ copy_fs_field(data, "data");
+ copy_fs_field(cached, "cached");
+ copy_fs_field(reserved, "reserved");
+ copy_fs_field(nr_inodes,"nr_inodes");
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ copy_fs_field(persistent_reserved[i],
+ "persistent_reserved[%i]", i);
+ }
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
char buf[80];
+ if (metadata_only &&
+ (e->data_type == BCH_DATA_user ||
+ e->data_type == BCH_DATA_cached))
+ continue;
+
bch2_replicas_entry_to_text(&PBUF(buf), e);
copy_fs_field(replicas[i], "%s", buf);
@@ -921,7 +905,8 @@ fsck_err:
return ret;
}
-static int bch2_gc_start(struct bch_fs *c)
+static int bch2_gc_start(struct bch_fs *c,
+ bool metadata_only)
{
struct bch_dev *ca;
unsigned i;
@@ -985,6 +970,11 @@ static int bch2_gc_start(struct bch_fs *c)
d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
d->gen_valid = s->gen_valid;
+
+ if (metadata_only &&
+ (s->mark.data_type == BCH_DATA_user ||
+ s->mark.data_type == BCH_DATA_cached))
+ d->_mark = s->mark;
}
};
@@ -1011,7 +1001,7 @@ static int bch2_gc_start(struct bch_fs *c)
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
-int bch2_gc(struct bch_fs *c, bool initial)
+int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
struct bch_dev *ca;
u64 start_time = local_clock();
@@ -1027,21 +1017,19 @@ int bch2_gc(struct bch_fs *c, bool initial)
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
again:
- ret = bch2_gc_start(c);
+ ret = bch2_gc_start(c, metadata_only);
if (ret)
goto out;
bch2_mark_superblocks(c);
- ret = bch2_gc_btrees(c, initial);
+ ret = bch2_gc_btrees(c, initial, metadata_only);
if (ret)
goto out;
#if 0
bch2_mark_pending_btree_node_frees(c);
#endif
- bch2_mark_allocator_buckets(c);
-
c->gc_count++;
if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
@@ -1071,7 +1059,7 @@ out:
bch2_journal_block(&c->journal);
percpu_down_write(&c->mark_lock);
- ret = bch2_gc_done(c, initial);
+ ret = bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal);
} else {
@@ -1142,7 +1130,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_buf sk;
- int ret = 0;
+ int ret = 0, commit_err = 0;
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
@@ -1154,18 +1142,20 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
- if (gc_btree_gens_key(c, k)) {
+ c->gc_gens_pos = iter->pos;
+
+ if (gc_btree_gens_key(c, k) && !commit_err) {
bch2_bkey_buf_reassemble(&sk, c, k);
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
bch2_trans_update(&trans, iter, sk.k, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
- if (ret == -EINTR)
+ commit_err = bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOWAIT|
+ BTREE_INSERT_NOFAIL);
+ if (commit_err == -EINTR) {
+ commit_err = 0;
continue;
- if (ret) {
- break;
}
}
@@ -1205,6 +1195,8 @@ int bch2_gc_gens(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++)
if ((1 << i) & BTREE_ID_HAS_PTRS) {
+ c->gc_gens_btree = i;
+ c->gc_gens_pos = POS_MIN;
ret = bch2_gc_btree_gens(c, i);
if (ret) {
bch_err(c, "error recalculating oldest_gen: %i", ret);
@@ -1221,352 +1213,15 @@ int bch2_gc_gens(struct bch_fs *c)
up_read(&ca->bucket_lock);
}
+ c->gc_gens_btree = 0;
+ c->gc_gens_pos = POS_MIN;
+
c->gc_count++;
err:
up_read(&c->gc_lock);
return ret;
}
-/* Btree coalescing */
-
-static void recalc_packed_keys(struct btree *b)
-{
- struct bset *i = btree_bset_first(b);
- struct bkey_packed *k;
-
- memset(&b->nr, 0, sizeof(b->nr));
-
- BUG_ON(b->nsets != 1);
-
- vstruct_for_each(i, k)
- btree_keys_account_key_add(&b->nr, 0, k);
-}
-
-static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
- struct btree *old_nodes[GC_MERGE_NODES])
-{
- struct btree *parent = btree_node_parent(iter, old_nodes[0]);
- unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
- unsigned blocks = btree_blocks(c) * 2 / 3;
- struct btree *new_nodes[GC_MERGE_NODES];
- struct btree_update *as;
- struct keylist keylist;
- struct bkey_format_state format_state;
- struct bkey_format new_format;
-
- memset(new_nodes, 0, sizeof(new_nodes));
- bch2_keylist_init(&keylist, NULL);
-
- /* Count keys that are not deleted */
- for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
- u64s += old_nodes[i]->nr.live_u64s;
-
- nr_old_nodes = nr_new_nodes = i;
-
- /* Check if all keys in @old_nodes could fit in one fewer node */
- if (nr_old_nodes <= 1 ||
- __vstruct_blocks(struct btree_node, c->block_bits,
- DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
- return;
-
- /* Find a format that all keys in @old_nodes can pack into */
- bch2_bkey_format_init(&format_state);
-
- /*
- * XXX: this won't correctly take it account the new min/max keys:
- */
- for (i = 0; i < nr_old_nodes; i++)
- __bch2_btree_calc_format(&format_state, old_nodes[i]);
-
- new_format = bch2_bkey_format_done(&format_state);
-
- /* Check if repacking would make any nodes too big to fit */
- for (i = 0; i < nr_old_nodes; i++)
- if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
- trace_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
- return;
- }
-
- if (bch2_keylist_realloc(&keylist, NULL, 0,
- BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) {
- trace_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
- return;
- }
-
- as = bch2_btree_update_start(iter, old_nodes[0]->c.level,
- btree_update_reserve_required(c, parent) + nr_old_nodes,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
- if (IS_ERR(as)) {
- trace_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_RESERVE_GET);
- bch2_keylist_free(&keylist, NULL);
- return;
- }
-
- trace_btree_gc_coalesce(c, old_nodes[0]);
-
- for (i = 0; i < nr_old_nodes; i++)
- bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
-
- /* Repack everything with @new_format and sort down to one bset */
- for (i = 0; i < nr_old_nodes; i++)
- new_nodes[i] =
- __bch2_btree_node_alloc_replacement(as, old_nodes[i],
- new_format);
-
- /*
- * Conceptually we concatenate the nodes together and slice them
- * up at different boundaries.
- */
- for (i = nr_new_nodes - 1; i > 0; --i) {
- struct btree *n1 = new_nodes[i];
- struct btree *n2 = new_nodes[i - 1];
-
- struct bset *s1 = btree_bset_first(n1);
- struct bset *s2 = btree_bset_first(n2);
- struct bkey_packed *k, *last = NULL;
-
- /* Calculate how many keys from @n2 we could fit inside @n1 */
- u64s = 0;
-
- for (k = s2->start;
- k < vstruct_last(s2) &&
- vstruct_blocks_plus(n1->data, c->block_bits,
- u64s + k->u64s) <= blocks;
- k = bkey_next(k)) {
- last = k;
- u64s += k->u64s;
- }
-
- if (u64s == le16_to_cpu(s2->u64s)) {
- /* n2 fits entirely in n1 */
- n1->key.k.p = n1->data->max_key = n2->data->max_key;
-
- memcpy_u64s(vstruct_last(s1),
- s2->start,
- le16_to_cpu(s2->u64s));
- le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
-
- set_btree_bset_end(n1, n1->set);
-
- six_unlock_write(&n2->c.lock);
- bch2_btree_node_free_never_inserted(c, n2);
- six_unlock_intent(&n2->c.lock);
-
- memmove(new_nodes + i - 1,
- new_nodes + i,
- sizeof(new_nodes[0]) * (nr_new_nodes - i));
- new_nodes[--nr_new_nodes] = NULL;
- } else if (u64s) {
- /* move part of n2 into n1 */
- n1->key.k.p = n1->data->max_key =
- bkey_unpack_pos(n1, last);
-
- n2->data->min_key = bpos_successor(n1->data->max_key);
-
- memcpy_u64s(vstruct_last(s1),
- s2->start, u64s);
- le16_add_cpu(&s1->u64s, u64s);
-
- memmove(s2->start,
- vstruct_idx(s2, u64s),
- (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
- s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
-
- set_btree_bset_end(n1, n1->set);
- set_btree_bset_end(n2, n2->set);
- }
- }
-
- for (i = 0; i < nr_new_nodes; i++) {
- struct btree *n = new_nodes[i];
-
- recalc_packed_keys(n);
- btree_node_reset_sib_u64s(n);
-
- bch2_btree_build_aux_trees(n);
-
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
- }
-
- /*
- * The keys for the old nodes get deleted. We don't want to insert keys
- * that compare equal to the keys for the new nodes we'll also be
- * inserting - we can't because keys on a keylist must be strictly
- * greater than the previous keys, and we also don't need to since the
- * key for the new node will serve the same purpose (overwriting the key
- * for the old node).
- */
- for (i = 0; i < nr_old_nodes; i++) {
- struct bkey_i delete;
- unsigned j;
-
- for (j = 0; j < nr_new_nodes; j++)
- if (!bpos_cmp(old_nodes[i]->key.k.p,
- new_nodes[j]->key.k.p))
- goto next;
-
- bkey_init(&delete.k);
- delete.k.p = old_nodes[i]->key.k.p;
- bch2_keylist_add_in_order(&keylist, &delete);
-next:
- i = i;
- }
-
- /*
- * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
- * does the lookup once and thus expects the keys to be in sorted order
- * so we have to make sure the new keys are correctly ordered with
- * respect to the deleted keys added in the previous loop
- */
- for (i = 0; i < nr_new_nodes; i++)
- bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
-
- /* Insert the newly coalesced nodes */
- bch2_btree_insert_node(as, parent, iter, &keylist, 0);
-
- BUG_ON(!bch2_keylist_empty(&keylist));
-
- BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]);
-
- bch2_btree_iter_node_replace(iter, new_nodes[0]);
-
- for (i = 0; i < nr_new_nodes; i++)
- bch2_btree_update_get_open_buckets(as, new_nodes[i]);
-
- /* Free the old nodes and update our sliding window */
- for (i = 0; i < nr_old_nodes; i++) {
- bch2_btree_node_free_inmem(c, old_nodes[i], iter);
-
- /*
- * the index update might have triggered a split, in which case
- * the nodes we coalesced - the new nodes we just created -
- * might not be sibling nodes anymore - don't add them to the
- * sliding window (except the first):
- */
- if (!i) {
- old_nodes[i] = new_nodes[i];
- } else {
- old_nodes[i] = NULL;
- }
- }
-
- for (i = 0; i < nr_new_nodes; i++)
- six_unlock_intent(&new_nodes[i]->c.lock);
-
- bch2_btree_update_done(as);
- bch2_keylist_free(&keylist, NULL);
-}
-
-static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct btree *b;
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- unsigned i;
- int ret = 0;
-
- /* Sliding window of adjacent btree nodes */
- struct btree *merge[GC_MERGE_NODES];
- u32 lock_seq[GC_MERGE_NODES];
-
- bch2_trans_init(&trans, c, 0, 0);
-
- /*
- * XXX: We don't have a good way of positively matching on sibling nodes
- * that have the same parent - this code works by handling the cases
- * where they might not have the same parent, and is thus fragile. Ugh.
- *
- * Perhaps redo this to use multiple linked iterators?
- */
- memset(merge, 0, sizeof(merge));
-
- __for_each_btree_node(&trans, iter, btree_id, POS_MIN,
- BTREE_MAX_DEPTH, 0,
- BTREE_ITER_PREFETCH, b) {
- memmove(merge + 1, merge,
- sizeof(merge) - sizeof(merge[0]));
- memmove(lock_seq + 1, lock_seq,
- sizeof(lock_seq) - sizeof(lock_seq[0]));
-
- merge[0] = b;
-
- for (i = 1; i < GC_MERGE_NODES; i++) {
- if (!merge[i] ||
- !six_relock_intent(&merge[i]->c.lock, lock_seq[i]))
- break;
-
- if (merge[i]->c.level != merge[0]->c.level) {
- six_unlock_intent(&merge[i]->c.lock);
- break;
- }
- }
- memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
-
- bch2_coalesce_nodes(c, iter, merge);
-
- for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
- lock_seq[i] = merge[i]->c.lock.state.seq;
- six_unlock_intent(&merge[i]->c.lock);
- }
-
- lock_seq[0] = merge[0]->c.lock.state.seq;
-
- if (kthread && kthread_should_stop()) {
- ret = -ESHUTDOWN;
- break;
- }
-
- bch2_trans_cond_resched(&trans);
-
- /*
- * If the parent node wasn't relocked, it might have been split
- * and the nodes in our sliding window might not have the same
- * parent anymore - blow away the sliding window:
- */
- if (btree_iter_node(iter, iter->level + 1) &&
- !btree_node_intent_locked(iter, iter->level + 1))
- memset(merge + 1, 0,
- (GC_MERGE_NODES - 1) * sizeof(merge[0]));
- }
- bch2_trans_iter_put(&trans, iter);
-
- return bch2_trans_exit(&trans) ?: ret;
-}
-
-/**
- * bch_coalesce - coalesce adjacent nodes with low occupancy
- */
-void bch2_coalesce(struct bch_fs *c)
-{
- enum btree_id id;
-
- down_read(&c->gc_lock);
- trace_gc_coalesce_start(c);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- int ret = c->btree_roots[id].b
- ? bch2_coalesce_btree(c, id)
- : 0;
-
- if (ret) {
- if (ret != -ESHUTDOWN)
- bch_err(c, "btree coalescing failed: %d", ret);
- return;
- }
- }
-
- trace_gc_coalesce_end(c);
- up_read(&c->gc_lock);
-}
-
static int bch2_gc_thread(void *arg)
{
struct bch_fs *c = arg;
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index b1362a9f..e9a87394 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -4,9 +4,7 @@
#include "btree_types.h"
-void bch2_coalesce(struct bch_fs *);
-
-int bch2_gc(struct bch_fs *, bool);
+int bch2_gc(struct bch_fs *, bool, bool);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
@@ -92,14 +90,6 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
}
-static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
-{
- return (struct gc_pos) {
- .phase = GC_PHASE_ALLOC,
- .pos = POS(ob ? ob - c->open_buckets : 0, 0),
- };
-}
-
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 4a2f5726..c8d8df96 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1057,14 +1057,17 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
+ char buf[200];
int ret;
+ btree_pos_to_text(&PBUF(buf), c, b);
trace_btree_read(c, b);
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick);
if (bch2_fs_fatal_err_on(ret <= 0, c,
- "btree node read error: no device to read from")) {
+ "btree node read error: no device to read from\n"
+ " at %s", buf)) {
set_btree_node_read_error(b);
return;
}
@@ -1337,13 +1340,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
return ret;
}
-static void btree_write_submit(struct work_struct *work)
-{
- struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
-
- bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
-}
-
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
{
struct btree_write_bio *wbio;
@@ -1351,6 +1347,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
+ struct bkey_buf k;
struct bch_extent_ptr *ptr;
struct sort_iter sort_iter;
struct nonce nonce;
@@ -1361,6 +1358,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
bool validate_before_checksum = false;
void *data;
+ bch2_bkey_buf_init(&k);
+
if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
return;
@@ -1537,7 +1536,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
wbio_init(&wbio->wbio.bio);
wbio->data = data;
wbio->bytes = bytes;
- wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
@@ -1560,9 +1558,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
* just make all btree node writes FUA to keep things sane.
*/
- bkey_copy(&wbio->key, &b->key);
+ bch2_bkey_buf_copy(&k, c, &b->key);
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr)
ptr->offset += b->written;
b->written += sectors_to_write;
@@ -1570,8 +1568,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
atomic64_inc(&c->btree_writes_nr);
atomic64_add(sectors_to_write, &c->btree_writes_sectors);
- INIT_WORK(&wbio->work, btree_write_submit);
- schedule_work(&wbio->work);
+ /* XXX: submitting IO with btree locks held: */
+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k);
+ bch2_bkey_buf_exit(&k, c);
return;
err:
set_btree_node_noevict(b);
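The write path above drops the key that used to be padded into struct btree_write_bio and instead copies the node key into a bch2_bkey_buf that only lives for the duration of the submit. A minimal sketch of that buffer lifecycle, using only the helpers visible in this hunk (locking, wbio setup and error handling omitted; the function name is illustrative):

/* Sketch: mirrors the bkey_buf usage in __bch2_btree_node_write() above. */
static void example_copy_key_and_bump_ptrs(struct bch_fs *c, struct btree *b,
					   unsigned sectors_written)
{
	struct bkey_buf k;
	struct bch_extent_ptr *ptr;

	bch2_bkey_buf_init(&k);			/* small buffer, grown on demand	*/
	bch2_bkey_buf_copy(&k, c, &b->key);	/* private copy of the node key		*/

	/* point each replica at the next unwritten sector of the node: */
	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr)
		ptr->offset += sectors_written;

	/* ... submit the write against k.k, e.g. bch2_submit_wbio_replicas() ... */

	bch2_bkey_buf_exit(&k, c);		/* releases any heap allocation		*/
}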
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index c8a8b05a..95c35161 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -42,7 +42,6 @@ struct btree_read_bio {
struct btree_write_bio {
struct work_struct work;
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
void *data;
unsigned bytes;
struct bch_write_bio wbio;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index adcb0ee4..c8f527bc 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -260,13 +260,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
- linked->locks_want = max_t(unsigned,
- linked->locks_want,
- __fls(linked->nodes_locked) + 1);
- if (!btree_iter_get_locks(linked, true, false)) {
- deadlock_iter = linked;
- reason = 1;
- }
+ deadlock_iter = linked;
+ reason = 1;
}
if (linked->btree_id != iter->btree_id) {
@@ -295,14 +290,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
* we're about to lock, it must have the ancestors locked too:
*/
if (level > __fls(linked->nodes_locked)) {
- linked->locks_want =
- max(level + 1, max_t(unsigned,
- linked->locks_want,
- iter->locks_want));
- if (!btree_iter_get_locks(linked, true, false)) {
- deadlock_iter = linked;
- reason = 5;
- }
+ deadlock_iter = linked;
+ reason = 5;
}
/* Must lock btree nodes in key order: */
@@ -311,27 +300,19 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
btree_iter_type(linked))) <= 0) {
deadlock_iter = linked;
reason = 7;
- }
-
- /*
- * Recheck if this is a node we already have locked - since one
- * of the get_locks() calls might've successfully
- * upgraded/relocked it:
- */
- if (linked->l[level].b == b &&
- btree_node_locked_type(linked, level) >= type) {
- six_lock_increment(&b->c.lock, type);
- return true;
+ BUG_ON(trans->in_traverse_all);
}
}
if (unlikely(deadlock_iter)) {
trace_trans_restart_would_deadlock(iter->trans->ip, ip,
- reason,
+ trans->in_traverse_all, reason,
deadlock_iter->btree_id,
btree_iter_type(deadlock_iter),
+ &deadlock_iter->real_pos,
iter->btree_id,
- btree_iter_type(iter));
+ btree_iter_type(iter),
+ &pos);
return false;
}
@@ -409,12 +390,27 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
return true;
/*
- * Ancestor nodes must be locked before child nodes, so set locks_want
- * on iterators that might lock ancestors before us to avoid getting
- * -EINTR later:
+ * XXX: this is ugly - we'd prefer to not be mucking with other
+ * iterators in the btree_trans here.
+ *
+ * On failure to upgrade the iterator, setting iter->locks_want and
+ * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
+ * get the locks we want on transaction restart.
+ *
+ * But if this iterator was a clone, on transaction restart what we did
+ * to this iterator isn't going to be preserved.
+ *
+ * Possibly we could add an iterator field for the parent iterator when
+ * an iterator is a copy - for now, we'll just upgrade any other
+ * iterators with the same btree id.
+ *
+ * The code below used to be needed to ensure ancestor nodes get locked
+ * before interior nodes - now that's handled by
+ * bch2_btree_iter_traverse_all().
*/
trans_for_each_iter(iter->trans, linked)
if (linked != iter &&
+ btree_iter_type(linked) == btree_iter_type(iter) &&
linked->btree_id == iter->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
@@ -1184,7 +1180,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
struct bch_fs *c = trans->c;
struct btree_iter *iter;
u8 sorted[BTREE_ITER_MAX];
- unsigned i, nr_sorted = 0;
+ int i, nr_sorted = 0;
+ bool relock_fail;
if (trans->in_traverse_all)
return -EINTR;
@@ -1192,15 +1189,36 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
trans->in_traverse_all = true;
retry_all:
nr_sorted = 0;
+ relock_fail = false;
- trans_for_each_iter(trans, iter)
+ trans_for_each_iter(trans, iter) {
+ if (!bch2_btree_iter_relock(iter, true))
+ relock_fail = true;
sorted[nr_sorted++] = iter->idx;
+ }
+
+ if (!relock_fail) {
+ trans->in_traverse_all = false;
+ return 0;
+ }
#define btree_iter_cmp_by_idx(_l, _r) \
btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx
+
+ for (i = nr_sorted - 2; i >= 0; --i) {
+ struct btree_iter *iter1 = trans->iters + sorted[i];
+ struct btree_iter *iter2 = trans->iters + sorted[i + 1];
+
+ if (iter1->btree_id == iter2->btree_id &&
+ iter1->locks_want < iter2->locks_want)
+ __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
+ else if (!iter1->locks_want && iter2->locks_want)
+ __bch2_btree_iter_upgrade(iter1, 1);
+ }
+
bch2_trans_unlock(trans);
cond_resched();
@@ -1250,6 +1268,8 @@ out:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
+
+ trace_trans_traverse_all(trans->ip);
return ret;
}
@@ -2009,10 +2029,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
if (iter->btree_id != btree_id)
continue;
- if (best &&
- bkey_cmp(bpos_diff(best->real_pos, pos),
- bpos_diff(iter->real_pos, pos)) > 0)
- continue;
+ if (best) {
+ int cmp = bkey_cmp(bpos_diff(best->real_pos, pos),
+ bpos_diff(iter->real_pos, pos));
+
+ if (cmp < 0 ||
+ ((cmp == 0 && btree_iter_keep(trans, iter))))
+ continue;
+ }
best = iter;
}
@@ -2040,13 +2064,18 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
iter->snapshot = pos.snapshot;
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ /*
+ * If the iterator has locks_want greater than requested, we explicitly
+ * do not downgrade it here - on transaction restart because btree node
+ * split needs to upgrade locks, we might be putting/getting the
+ * iterator again. Downgrading iterators only happens via an explicit
+ * bch2_trans_downgrade().
+ */
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > iter->locks_want) {
iter->locks_want = locks_want;
btree_iter_get_locks(iter, true, false);
- } else if (locks_want < iter->locks_want) {
- __bch2_btree_iter_downgrade(iter, locks_want);
}
while (iter->level < depth) {
@@ -2108,37 +2137,28 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
return iter;
}
-static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size)
+void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
- if (size > trans->mem_bytes) {
+ size_t new_top = trans->mem_top + size;
+ void *p;
+
+ if (new_top > trans->mem_bytes) {
size_t old_bytes = trans->mem_bytes;
- size_t new_bytes = roundup_pow_of_two(size);
+ size_t new_bytes = roundup_pow_of_two(new_top);
void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
if (!new_mem)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
if (old_bytes) {
- trace_trans_restart_mem_realloced(trans->ip, new_bytes);
- return -EINTR;
+ trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
+ return ERR_PTR(-EINTR);
}
}
- return 0;
-}
-
-void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
- void *p;
- int ret;
-
- ret = bch2_trans_preload_mem(trans, trans->mem_top + size);
- if (ret)
- return ERR_PTR(ret);
-
p = trans->mem + trans->mem_top;
trans->mem_top += size;
return p;
@@ -2188,7 +2208,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
if (!(flags & TRANS_RESET_NOUNLOCK))
bch2_trans_cond_resched(trans);
- if (!(flags & TRANS_RESET_NOTRAVERSE))
+ if (!(flags & TRANS_RESET_NOTRAVERSE) &&
+ trans->iters_linked)
bch2_btree_iter_traverse_all(trans);
}
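With bch2_trans_preload_mem() folded in, bch2_trans_kmalloc() now grows the transaction's memory arena itself and reports failure through ERR_PTR(): -ENOMEM if the arena can't grow, -EINTR if growing it mid-transaction forces a restart. A sketch of the calling convention, modelled on the IS_ERR() check the old fsck code in this patch used (the function name is illustrative):

/* Sketch: scratch allocation tied to a btree_trans. */
static int example_copy_name(struct btree_trans *trans,
			     const char *name, unsigned len, char **out)
{
	char *buf = bch2_trans_kmalloc(trans, len + 1);

	if (IS_ERR(buf))
		return PTR_ERR(buf);	/* -EINTR means: restart the transaction */

	memcpy(buf, name, len);
	buf[len] = '\0';
	*out = buf;
	return 0;	/* no explicit free: the arena is reset with the transaction */
}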
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 07d9b6d3..2f63adb9 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -187,7 +187,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l,
{
return cmp_int(l->btree_id, r->btree_id) ?:
-cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
- bkey_cmp(l->pos, r->pos);
+ bkey_cmp(l->real_pos, r->real_pos);
}
/*
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index f5160d4f..afdcc98d 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -222,18 +222,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- struct bch_fs *c = trans->c;
-
- if (bch2_debug_check_bkeys) {
- const char *invalid = bch2_bkey_invalid(c,
- bkey_i_to_s_c(i->k), i->bkey_type);
- if (invalid) {
- char buf[200];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- panic("invalid bkey %s on insert: %s\n", buf, invalid);
- }
- }
BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
BUG_ON(i->level != i->iter->level);
BUG_ON(i->btree_id != i->iter->btree_id);
@@ -319,8 +307,7 @@ btree_key_can_insert_cached(struct btree_trans *trans,
}
static inline void do_btree_insert_one(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
+ struct btree_insert_entry *i)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
@@ -329,20 +316,22 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
- insert->k.needs_whiteout = false;
+ i->k->k.needs_whiteout = false;
- did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED)
- ? btree_insert_key_leaf(trans, iter, insert)
- : bch2_btree_insert_key_cached(trans, iter, insert);
+ did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED)
+ ? btree_insert_key_leaf(trans, i->iter, i->k)
+ : bch2_btree_insert_key_cached(trans, i->iter, i->k);
if (!did_work)
return;
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
bch2_journal_add_keys(j, &trans->journal_res,
- iter->btree_id, insert);
+ i->btree_id,
+ i->level,
+ i->k);
bch2_journal_set_has_inode(j, &trans->journal_res,
- insert->k.p.inode);
+ i->k->k.p.inode);
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
@@ -480,7 +469,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
bch2_trans_mark_gc(trans);
trans_for_each_update2(trans, i)
- do_btree_insert_one(trans, i->iter, i->k);
+ do_btree_insert_one(trans, i);
err:
if (marking) {
percpu_up_read(&c->mark_lock);
@@ -592,9 +581,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
}
}
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- trans_for_each_update2(trans, i)
- btree_insert_entry_checks(trans, i);
+ trans_for_each_update2(trans, i) {
+ const char *invalid = bch2_bkey_invalid(c,
+ bkey_i_to_s_c(i->k), i->bkey_type);
+ if (invalid) {
+ char buf[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+ bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid);
+ bch2_fatal_error(c);
+ }
+ btree_insert_entry_checks(trans, i);
+ }
bch2_btree_trans_verify_locks(trans);
trans_for_each_update2(trans, i)
@@ -629,25 +627,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
static int journal_reclaim_wait_done(struct bch_fs *c)
{
- int ret;
-
- ret = bch2_journal_error(&c->journal);
- if (ret)
- return ret;
-
- ret = !bch2_btree_key_cache_must_wait(c);
- if (ret)
- return ret;
-
- journal_reclaim_kick(&c->journal);
-
- if (mutex_trylock(&c->journal.reclaim_lock)) {
- ret = bch2_journal_reclaim(&c->journal);
- mutex_unlock(&c->journal.reclaim_lock);
- }
+ int ret = bch2_journal_error(&c->journal) ?:
+ !bch2_btree_key_cache_must_wait(c);
if (!ret)
- ret = !bch2_btree_key_cache_must_wait(c);
+ journal_reclaim_kick(&c->journal);
return ret;
}
@@ -735,10 +719,12 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans);
- wait_event(c->journal.reclaim_wait,
- (ret = journal_reclaim_wait_done(c)));
+ wait_event_freezable(c->journal.reclaim_wait,
+ (ret = journal_reclaim_wait_done(c)));
+ if (ret < 0)
+ return ret;
- if (!ret && bch2_trans_relock(trans))
+ if (bch2_trans_relock(trans))
return 0;
trace_trans_restart_journal_reclaim(trans->ip);
@@ -1151,8 +1137,7 @@ int __bch2_btree_insert(struct btree_trans *trans,
iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(trans, iter, k, 0);
+ ret = bch2_trans_update(trans, iter, k, 0);
bch2_trans_iter_put(trans, iter);
return ret;
}
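The simplified journal_reclaim_wait_done() above leans on the GNU `a ?: b` extension, which evaluates `a` once and falls back to `b` only when `a` is zero. Spelled out, the return contract that the wait_event_freezable() loop depends on looks roughly like this (a sketch, not the in-tree function):

/* Sketch: expanded form of the ?: condition in journal_reclaim_wait_done(). */
static int example_reclaim_wait_done(struct bch_fs *c)
{
	int err = bch2_journal_error(&c->journal);

	if (err)
		return err;			/* negative: journal error, caller bails out   */

	if (!bch2_btree_key_cache_must_wait(c))
		return 1;			/* positive: key cache has room, stop waiting  */

	journal_reclaim_kick(&c->journal);	/* zero: poke reclaim and keep waiting         */
	return 0;
}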
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index f0c3bbc7..6b99f127 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -3,64 +3,6 @@
* Code for manipulating bucket marks for garbage collection.
*
* Copyright 2014 Datera, Inc.
- *
- * Bucket states:
- * - free bucket: mark == 0
- * The bucket contains no data and will not be read
- *
- * - allocator bucket: owned_by_allocator == 1
- * The bucket is on a free list, or it is an open bucket
- *
- * - cached bucket: owned_by_allocator == 0 &&
- * dirty_sectors == 0 &&
- * cached_sectors > 0
- * The bucket contains data but may be safely discarded as there are
- * enough replicas of the data on other cache devices, or it has been
- * written back to the backing device
- *
- * - dirty bucket: owned_by_allocator == 0 &&
- * dirty_sectors > 0
- * The bucket contains data that we must not discard (either only copy,
- * or one of the 'main copies' for data requiring multiple replicas)
- *
- * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
- * This is a btree node, journal or gen/prio bucket
- *
- * Lifecycle:
- *
- * bucket invalidated => bucket on freelist => open bucket =>
- * [dirty bucket =>] cached bucket => bucket invalidated => ...
- *
- * Note that cache promotion can skip the dirty bucket step, as data
- * is copied from a deeper tier to a shallower tier, onto a cached
- * bucket.
- * Note also that a cached bucket can spontaneously become dirty --
- * see below.
- *
- * Only a traversal of the key space can determine whether a bucket is
- * truly dirty or cached.
- *
- * Transitions:
- *
- * - free => allocator: bucket was invalidated
- * - cached => allocator: bucket was invalidated
- *
- * - allocator => dirty: open bucket was filled up
- * - allocator => cached: open bucket was filled up
- * - allocator => metadata: metadata was allocated
- *
- * - dirty => cached: dirty sectors were copied to a deeper tier
- * - dirty => free: dirty sectors were overwritten or moved (copy gc)
- * - cached => free: cached sectors were overwritten
- *
- * - metadata => free: metadata was freed
- *
- * Oddities:
- * - cached => dirty: a device was removed so formerly replicated data
- * is no longer sufficiently replicated
- * - free => cached: cannot happen
- * - free => dirty: cannot happen
- * - free => metadata: cannot happen
*/
#include "bcachefs.h"
@@ -229,7 +171,7 @@ struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
percpu_down_read(&c->mark_lock);
ret = kmalloc(sizeof(struct bch_fs_usage_online) +
- sizeof(u64) + c->replicas.nr, GFP_NOFS);
+ sizeof(u64) * c->replicas.nr, GFP_NOFS);
if (unlikely(!ret)) {
percpu_up_read(&c->mark_lock);
return NULL;
@@ -538,33 +480,17 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
ret; \
})
-static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator,
- bool gc)
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, bool owned_by_allocator)
{
- struct bucket *g = __bucket(ca, b, gc);
+ struct bucket *g = bucket(ca, b);
struct bucket_mark old, new;
old = bucket_cmpxchg(g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
- BUG_ON(!gc &&
- !owned_by_allocator && !old.owned_by_allocator);
-
- return 0;
-}
-
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator,
- struct gc_pos pos, unsigned flags)
-{
- preempt_disable();
-
- do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
- ca, b, owned_by_allocator);
-
- preempt_enable();
+ BUG_ON(owned_by_allocator == old.owned_by_allocator);
}
static int bch2_mark_alloc(struct bch_fs *c,
@@ -1890,10 +1816,11 @@ int bch2_trans_mark_update(struct btree_trans *trans,
return 0;
if (!btree_node_type_is_extents(iter->btree_id)) {
- /* iterators should be uptodate, shouldn't get errors here: */
if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
old = bch2_btree_iter_peek_slot(iter);
- BUG_ON(bkey_err(old));
+ ret = bkey_err(old);
+ if (ret)
+ return ret;
} else {
struct bkey_cached *ck = (void *) iter->l[0].b;
@@ -2004,22 +1931,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
goto out;
}
- if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
- "while marking %s",
- iter->pos.inode, iter->pos.offset, u.gen,
- bch2_data_types[u.data_type ?: type],
- u.dirty_sectors, sectors, ca->mi.bucket_size,
- bch2_data_types[type]);
- ret = -EIO;
- goto out;
- }
-
- if (u.data_type == type &&
- u.dirty_sectors == sectors)
- goto out;
-
u.data_type = type;
u.dirty_sectors = sectors;
@@ -2031,53 +1942,44 @@ out:
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct disk_reservation *res,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
unsigned sectors)
{
- return __bch2_trans_do(trans, res, NULL, 0,
- __bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal,
- ca->mi.bucket_size));
-
+ return __bch2_trans_do(trans, NULL, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
}
static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
- struct disk_reservation *res,
struct bch_dev *ca,
u64 start, u64 end,
enum bch_data_type type,
u64 *bucket, unsigned *bucket_sectors)
{
- int ret;
-
do {
u64 b = sector_to_bucket(ca, start);
unsigned sectors =
min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
- if (b != *bucket) {
- if (*bucket_sectors) {
- ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
- *bucket, type, *bucket_sectors);
- if (ret)
- return ret;
- }
+ if (b != *bucket && *bucket_sectors) {
+ int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
+ type, *bucket_sectors);
+ if (ret)
+ return ret;
- *bucket = b;
- *bucket_sectors = 0;
+ *bucket_sectors = 0;
}
+ *bucket = b;
*bucket_sectors += sectors;
start += sectors;
- } while (!ret && start < end);
+ } while (start < end);
return 0;
}
static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
- struct disk_reservation *res,
- struct bch_dev *ca)
+ struct bch_dev *ca)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
u64 bucket = 0;
@@ -2088,14 +1990,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
u64 offset = le64_to_cpu(layout->sb_offset[i]);
if (offset == BCH_SB_SECTOR) {
- ret = bch2_trans_mark_metadata_sectors(trans, res, ca,
+ ret = bch2_trans_mark_metadata_sectors(trans, ca,
0, BCH_SB_SECTOR,
BCH_DATA_sb, &bucket, &bucket_sectors);
if (ret)
return ret;
}
- ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset,
+ ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
offset + (1 << layout->sb_max_size_bits),
BCH_DATA_sb, &bucket, &bucket_sectors);
if (ret)
@@ -2103,14 +2005,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
}
if (bucket_sectors) {
- ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ ret = bch2_trans_mark_metadata_bucket(trans, ca,
bucket, BCH_DATA_sb, bucket_sectors);
if (ret)
return ret;
}
for (i = 0; i < ca->journal.nr; i++) {
- ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ ret = bch2_trans_mark_metadata_bucket(trans, ca,
ca->journal.buckets[i],
BCH_DATA_journal, ca->mi.bucket_size);
if (ret)
@@ -2120,12 +2022,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
return 0;
}
-int bch2_trans_mark_dev_sb(struct bch_fs *c,
- struct disk_reservation *res,
- struct bch_dev *ca)
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
- return bch2_trans_do(c, res, NULL, 0,
- __bch2_trans_mark_dev_sb(&trans, res, ca));
+ return bch2_trans_do(c, NULL, NULL, 0,
+ __bch2_trans_mark_dev_sb(&trans, ca));
}
/* Disk reservations: */
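With the struct disk_reservation argument gone, the metadata-marking helpers take just the device, bucket, data type and sector count, and the journal code later in this patch wraps the call in bch2_trans_do() (which provides the `trans` variable). A sketch of the new calling convention, with an illustrative function name:

/* Sketch: marking one bucket of journal metadata under a transaction. */
static int example_mark_journal_bucket(struct bch_fs *c, struct bch_dev *ca,
				       size_t bucket)
{
	return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
		bch2_trans_mark_metadata_bucket(&trans, ca, bucket,
						BCH_DATA_journal,
						ca->mi.bucket_size));
}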
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 297b04b2..7463e642 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -191,6 +191,7 @@ static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
for (i = 0; i < RESERVE_NR; i++)
available -= fifo_used(&ca->free[i]);
available -= fifo_used(&ca->free_inc);
+ available -= ca->nr_open_buckets;
spin_unlock(&c->freelist_lock);
return max(available, 0LL);
@@ -234,8 +235,7 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
- size_t, bool, struct gc_pos, unsigned);
+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
@@ -252,11 +252,9 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
struct bkey_i *insert, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
-int bch2_trans_mark_metadata_bucket(struct btree_trans *,
- struct disk_reservation *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *,
- struct bch_dev *);
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
/* disk reservations: */
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 588b1a72..b2de2995 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -59,6 +59,11 @@ struct bch_dev_usage {
struct {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
+ /*
+ * XXX
+ * Why do we have this? Isn't it just buckets * bucket_size -
+ * sectors?
+ */
u64 fragmented;
} d[BCH_DATA_NR];
};
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1a94e7f7..dda3608d 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -2619,54 +2619,21 @@ err:
return ret;
}
-static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
- loff_t offset, loff_t len)
+static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ u64 start_sector, u64 end_sector)
{
- struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
struct btree_iter *iter;
- struct bpos end_pos;
- loff_t end = offset + len;
- loff_t block_start = round_down(offset, block_bytes(c));
- loff_t block_end = round_up(end, block_bytes(c));
- unsigned sectors;
+ struct bpos end_pos = POS(inode->v.i_ino, end_sector);
unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
- ret = inode_newsize_ok(&inode->v, end);
- if (ret)
- goto err;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = __bch2_truncate_page(inode,
- offset >> PAGE_SHIFT,
- offset, end);
-
- if (!ret &&
- offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
- ret = __bch2_truncate_page(inode,
- end >> PAGE_SHIFT,
- offset, end);
-
- if (unlikely(ret))
- goto err;
-
- truncate_pagecache_range(&inode->v, offset, end - 1);
- }
-
iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(inode->v.i_ino, block_start >> 9),
+ POS(inode->v.i_ino, start_sector),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- end_pos = POS(inode->v.i_ino, block_end >> 9);
while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
s64 i_sectors_delta = 0;
@@ -2674,6 +2641,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
struct quota_res quota_res = { 0 };
struct bkey_i_reservation reservation;
struct bkey_s_c k;
+ unsigned sectors;
bch2_trans_begin(&trans);
@@ -2734,7 +2702,48 @@ bkey_err:
ret = 0;
}
bch2_trans_iter_put(&trans, iter);
+ bch2_trans_exit(&trans);
+ return ret;
+}
+static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->v.i_mapping;
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ loff_t end = offset + len;
+ loff_t block_start = round_down(offset, block_bytes(c));
+ loff_t block_end = round_up(end, block_bytes(c));
+ int ret;
+
+ inode_lock(&inode->v);
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+ ret = inode_newsize_ok(&inode->v, end);
+ if (ret)
+ goto err;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = __bch2_truncate_page(inode,
+ offset >> PAGE_SHIFT,
+ offset, end);
+
+ if (!ret &&
+ offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_page(inode,
+ end >> PAGE_SHIFT,
+ offset, end);
+
+ if (unlikely(ret))
+ goto err;
+
+ truncate_pagecache_range(&inode->v, offset, end - 1);
+ }
+
+ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
if (ret)
goto err;
@@ -2748,28 +2757,13 @@ bkey_err:
if (end >= inode->v.i_size &&
(!(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & FALLOC_FL_ZERO_RANGE))) {
- struct btree_iter *inode_iter;
- struct bch_inode_unpacked inode_u;
-
- do {
- bch2_trans_begin(&trans);
- inode_iter = bch2_inode_peek(&trans, &inode_u,
- inode->v.i_ino, 0);
- ret = PTR_ERR_OR_ZERO(inode_iter);
- } while (ret == -EINTR);
-
- bch2_trans_iter_put(&trans, inode_iter);
- bch2_trans_unlock(&trans);
-
- if (ret)
- goto err;
/*
* Sync existing appends before extending i_size,
* as in bch2_extend():
*/
ret = filemap_write_and_wait_range(mapping,
- inode_u.bi_size, S64_MAX);
+ inode->ei_inode.bi_size, S64_MAX);
if (ret)
goto err;
@@ -2783,7 +2777,6 @@ bkey_err:
mutex_unlock(&inode->ei_update_lock);
}
err:
- bch2_trans_exit(&trans);
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
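The split-out __bchfs_fallocate() works in 512-byte sectors, so the byte-based wrapper rounds the range to the filesystem block size and shifts by 9. A worked example, assuming a 4096-byte block size (the real code uses block_bytes(c)):

/* Sketch: byte range -> sector range, as done by bchfs_fallocate(). */
static u64 example_byte_range_to_sectors(void)
{
	loff_t offset = 5000, len = 3000, end = offset + len;	/* bytes [5000, 8000)	*/
	loff_t block_start = round_down(offset, 4096);		/* = 4096		*/
	loff_t block_end   = round_up(end, 4096);		/* = 8192		*/
	u64 start_sector   = block_start >> 9;			/* = 8 (512b sectors)	*/
	u64 end_sector     = block_end >> 9;			/* = 16			*/

	/* __bchfs_fallocate(inode, mode, start_sector, end_sector) then covers
	 * the whole block-aligned range. */
	return end_sector - start_sector;	/* = 8 sectors, i.e. one 4096-byte block */
}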
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 6280b357..eb8ac164 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -81,51 +81,37 @@ static int write_inode(struct btree_trans *trans,
return ret;
}
-static int __remove_dirent(struct btree_trans *trans,
- struct bkey_s_c_dirent dirent)
+static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
- struct qstr name;
+ struct btree_iter *iter;
struct bch_inode_unpacked dir_inode;
struct bch_hash_info dir_hash_info;
- u64 dir_inum = dirent.k->p.inode;
int ret;
- char *buf;
-
- name.len = bch2_dirent_name_bytes(dirent);
- buf = bch2_trans_kmalloc(trans, name.len + 1);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
- memcpy(buf, dirent.v->d_name, name.len);
- buf[name.len] = '\0';
- name.name = buf;
-
- ret = lookup_inode(trans, dir_inum, &dir_inode, NULL);
- if (ret && ret != -EINTR)
- bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
+ ret = lookup_inode(trans, pos.inode, &dir_inode, NULL);
if (ret)
return ret;
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
- ret = bch2_hash_delete(trans, bch2_dirent_hash_desc,
- &dir_hash_info, dir_inum, &name);
- if (ret && ret != -EINTR)
- bch_err(c, "remove_dirent: err %i deleting dirent", ret);
- if (ret)
- return ret;
+ iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
- return 0;
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, iter);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
-static int remove_dirent(struct btree_trans *trans,
- struct bkey_s_c_dirent dirent)
+static int remove_dirent(struct btree_trans *trans, struct bpos pos)
{
- return __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __remove_dirent(trans, dirent));
+ int ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ __remove_dirent(trans, pos));
+ if (ret)
+ bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
+ return ret;
}
static int __reattach_inode(struct btree_trans *trans,
@@ -173,13 +159,10 @@ static int reattach_inode(struct btree_trans *trans,
struct bch_inode_unpacked *lostfound,
u64 inum)
{
- struct bch_fs *c = trans->c;
- int ret;
-
- ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
__reattach_inode(trans, lostfound, inum));
if (ret)
- bch_err(c, "error %i reattaching inode %llu", ret, inum);
+ bch_err(trans->c, "error %i reattaching inode %llu", ret, inum);
return ret;
}
@@ -202,7 +185,7 @@ static int remove_backpointer(struct btree_trans *trans,
goto out;
}
- ret = remove_dirent(trans, bkey_s_c_to_dirent(k));
+ ret = remove_dirent(trans, k.k->p);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -752,7 +735,7 @@ retry:
"dirent points to missing inode:\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c,
k), buf))) {
- ret = remove_dirent(&trans, d);
+ ret = remove_dirent(&trans, d.k->p);
if (ret)
goto err;
goto next;
@@ -783,7 +766,7 @@ retry:
backpointer_exists, c,
"directory %llu with multiple links",
target.bi_inum)) {
- ret = remove_dirent(&trans, d);
+ ret = remove_dirent(&trans, d.k->p);
if (ret)
goto err;
continue;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index b901be5b..1b49a1c3 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -787,7 +787,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
* We may be called from the device add path, before the new device has
* actually been added to the running filesystem:
*/
- if (c)
+ if (!new_fs)
spin_lock(&c->journal.lock);
memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
@@ -795,17 +795,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
- if (c)
+ if (!new_fs)
spin_unlock(&c->journal.lock);
while (ja->nr < nr) {
struct open_bucket *ob = NULL;
unsigned pos;
- long bucket;
+ long b;
if (new_fs) {
- bucket = bch2_bucket_alloc_new_fs(ca);
- if (bucket < 0) {
+ b = bch2_bucket_alloc_new_fs(ca);
+ if (b < 0) {
ret = -ENOSPC;
goto err;
}
@@ -819,10 +819,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
goto err;
}
- bucket = sector_to_bucket(ca, ob->ptr.offset);
- }
+ b = sector_to_bucket(ca, ob->ptr.offset);
- if (c) {
percpu_down_read(&c->mark_lock);
spin_lock(&c->journal.lock);
}
@@ -839,9 +837,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
__array_insert_item(journal_buckets->buckets, ja->nr, pos);
ja->nr++;
- ja->buckets[pos] = bucket;
+ ja->buckets[pos] = b;
ja->bucket_seq[pos] = 0;
- journal_buckets->buckets[pos] = cpu_to_le64(bucket);
+ journal_buckets->buckets[pos] = cpu_to_le64(b);
if (pos <= ja->discard_idx)
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
@@ -852,28 +850,25 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- if (!c || new_fs)
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
+ if (new_fs) {
+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
0);
-
- if (c) {
+ } else {
spin_unlock(&c->journal.lock);
percpu_up_read(&c->mark_lock);
- }
- if (c && !new_fs)
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_trans_mark_metadata_bucket(&trans, NULL, ca,
- bucket, BCH_DATA_journal,
+ bch2_trans_mark_metadata_bucket(&trans, ca,
+ b, BCH_DATA_journal,
ca->mi.bucket_size));
- if (!new_fs)
bch2_open_bucket_put(c, ob);
- if (ret)
- goto err;
+ if (ret)
+ goto err;
+ }
}
err:
bch2_sb_resize_journal(&ca->disk_sb,
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index cc497125..1d556790 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -241,10 +241,11 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res
}
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
- enum btree_id id, const struct bkey_i *k)
+ enum btree_id id, unsigned level,
+ const struct bkey_i *k)
{
bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
- id, 0, k, k->k.u64s);
+ id, level, k, k->k.u64s);
}
static inline bool journal_entry_empty(struct jset *j)
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 7be6c65c..f117d361 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -599,7 +599,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
- size_t min_nr, nr_flushed;
+ size_t min_nr, min_key_cache, nr_flushed;
unsigned flags;
int ret = 0;
@@ -649,9 +649,10 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL);
+
nr_flushed = journal_flush_pins(j, seq_to_flush,
- min_nr,
- min(bch2_nr_btree_keys_need_flush(c), 128UL));
+ min_nr, min_key_cache);
if (direct)
j->nr_direct_reclaim += nr_flushed;
@@ -661,7 +662,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
if (nr_flushed)
wake_up(&j->reclaim_wait);
- } while (min_nr && nr_flushed && !direct);
+ } while ((min_nr || min_key_cache) && !direct);
memalloc_noreclaim_restore(flags);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 5b108490..aa8e8c25 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -68,7 +68,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_bkey_buf_init(&_insert);
bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
iter = bch2_trans_get_iter(&trans, m->btree_id,
bkey_start_pos(&bch2_keylist_front(keys)->k),
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 71db7ae6..80772cff 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -108,7 +108,7 @@ static bool have_copygc_reserve(struct bch_dev *ca)
spin_lock(&ca->fs->freelist_lock);
ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
- ca->allocator_state != ALLOCATOR_RUNNING;
+ ca->allocator_state != ALLOCATOR_running;
spin_unlock(&ca->fs->freelist_lock);
return ret;
@@ -222,7 +222,7 @@ static int bch2_copygc(struct bch_fs *c)
ret = bch2_move_data(c,
0, POS_MIN,
BTREE_ID_NR, POS_MAX,
- &c->copygc_pd.rate,
+ NULL,
writepoint_ptr(&c->copygc_write_point),
copygc_pred, NULL,
&move_stats);
@@ -282,8 +282,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned dev_idx;
- u64 fragmented_allowed = c->copygc_threshold;
- u64 fragmented = 0;
+ u64 fragmented_allowed = 0, fragmented = 0;
for_each_rw_member(ca, c, dev_idx) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
@@ -312,11 +311,14 @@ static int bch2_copygc_thread(void *arg)
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
+ c->copygc_wait = last + wait;
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
continue;
}
+ c->copygc_wait = 0;
+
if (bch2_copygc(c))
break;
}
@@ -326,9 +328,6 @@ static int bch2_copygc_thread(void *arg)
void bch2_copygc_stop(struct bch_fs *c)
{
- c->copygc_pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&c->copygc_pd.rate);
-
if (c->copygc_thread) {
kthread_stop(c->copygc_thread);
put_task_struct(c->copygc_thread);
@@ -365,6 +364,4 @@ int bch2_copygc_start(struct bch_fs *c)
void bch2_fs_copygc_init(struct bch_fs *c)
{
- bch2_pd_controller_init(&c->copygc_pd);
- c->copygc_pd.d_term = 0;
}
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index b5cc0e64..2dc3dee4 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -1005,13 +1005,6 @@ int bch2_fs_recovery(struct bch_fs *c)
}
- if (!c->sb.clean &&
- !(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) {
- bch_info(c, "BCH_FEATURE_atomic_nlink not set and filesystem dirty, fsck required");
- c->opts.fsck = true;
- c->opts.fix_errors = FSCK_OPT_YES;
- }
-
if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
bch_info(c, "alloc_v2 feature bit not set, fsck required");
c->opts.fsck = true;
@@ -1145,9 +1138,11 @@ use_clean:
!(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
!(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
+ bool metadata_only = c->opts.norecovery;
+
bch_info(c, "starting mark and sweep");
err = "error in mark and sweep";
- ret = bch2_gc(c, true);
+ ret = bch2_gc(c, true, metadata_only);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
@@ -1245,8 +1240,8 @@ use_clean:
}
if (c->opts.fsck &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
+ BCH_SB_HAS_ERRORS(c->disk_sb.sb)) {
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
write_sb = true;
}
@@ -1338,10 +1333,12 @@ int bch2_fs_initialize(struct bch_fs *c)
* Write out the superblock and journal buckets, now that we can do
* btree updates
*/
- err = "error writing alloc info";
- ret = bch2_alloc_write(c, 0);
- if (ret)
- goto err;
+ err = "error marking superblock and journal";
+ for_each_member_device(ca, c, i) {
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret)
+ goto err;
+ }
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 1e297171..4128a1b3 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -313,8 +313,8 @@ static int replicas_table_update(struct bch_fs *c,
out:
free_percpu(new_gc);
kfree(new_scratch);
- free_percpu(new_usage[1]);
- free_percpu(new_usage[0]);
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ free_percpu(new_usage[i]);
kfree(new_base);
return ret;
err:
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index b2a2614b..61fd1144 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -286,7 +286,6 @@ void bch2_fs_read_only(struct bch_fs *c)
percpu_ref_kill(&c->writes);
cancel_work_sync(&c->ec_stripe_delete_work);
- cancel_delayed_work(&c->pd_controllers_update);
/*
* If we're not doing an emergency shutdown, we want to wait on
@@ -371,8 +370,6 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
return ret;
}
- schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
-
schedule_work(&c->ec_stripe_delete_work);
return 0;
@@ -566,7 +563,6 @@ void __bch2_fs_stop(struct bch_fs *c)
cancel_work_sync(&ca->io_error_work);
cancel_work_sync(&c->btree_write_error_work);
- cancel_delayed_work_sync(&c->pd_controllers_update);
cancel_work_sync(&c->read_only_work);
for (i = 0; i < c->sb.nr_devices; i++)
@@ -908,9 +904,16 @@ int bch2_fs_start(struct bch_fs *c)
/*
* Allocator threads don't start filling copygc reserve until after we
* set BCH_FS_STARTED - wake them now:
+ *
+ * XXX ugly hack:
+ * Need to set ca->allocator_state here instead of relying on the
+ * allocator threads to do it to avoid racing with the copygc threads
+ * checking it and thinking they have no alloc reserve:
*/
- for_each_online_member(ca, c, i)
+ for_each_online_member(ca, c, i) {
+ ca->allocator_state = ALLOCATOR_running;
bch2_wake_allocator(ca);
+ }
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
@@ -1679,7 +1682,7 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
err = "error marking superblock";
- ret = bch2_trans_mark_dev_sb(c, NULL, ca);
+ ret = bch2_trans_mark_dev_sb(c, ca);
if (ret)
goto err_late;
@@ -1739,7 +1742,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
ca = bch_dev_locked(c, dev_idx);
- if (bch2_trans_mark_dev_sb(c, NULL, ca)) {
+ if (bch2_trans_mark_dev_sb(c, ca)) {
err = "bch2_trans_mark_dev_sb() error";
goto err;
}
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 2d008979..21ef7719 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -132,10 +132,10 @@ do { \
} while (0)
write_attribute(trigger_journal_flush);
-write_attribute(trigger_btree_coalesce);
write_attribute(trigger_gc);
write_attribute(prune_cache);
rw_attribute(btree_gc_periodic);
+rw_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
@@ -190,7 +190,7 @@ rw_attribute(cache_replacement_policy);
rw_attribute(label);
rw_attribute(copy_gc_enabled);
-sysfs_pd_controller_attribute(copy_gc);
+read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
@@ -199,8 +199,6 @@ rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
-rw_attribute(pd_controllers_update_seconds);
-
read_attribute(io_timers_read);
read_attribute(io_timers_write);
@@ -314,6 +312,13 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
return 0;
}
+void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+ bch2_bpos_to_text(out, c->gc_gens_pos);
+ pr_buf(out, "\n");
+}
+
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -339,14 +344,18 @@ SHOW(bch2_fs)
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
- sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ if (attr == &sysfs_gc_gens_pos) {
+ bch2_gc_gens_pos_to_text(&out, c);
+ return out.pos - buf;
+ }
- sysfs_print(pd_controllers_update_seconds,
- c->pd_controllers_update_seconds);
+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
- sysfs_pd_controller_show(copy_gc, &c->copygc_pd);
+ sysfs_hprint(copy_gc_wait,
+ max(0LL, c->copygc_wait -
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
if (attr == &sysfs_rebalance_work) {
bch2_rebalance_work_to_text(&out, c);
@@ -454,10 +463,7 @@ STORE(bch2_fs)
return ret;
}
- sysfs_strtoul(pd_controllers_update_seconds,
- c->pd_controllers_update_seconds);
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
- sysfs_pd_controller_store(copy_gc, &c->copygc_pd);
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
@@ -471,9 +477,6 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_journal_flush)
bch2_journal_meta(&c->journal);
- if (attr == &sysfs_trigger_btree_coalesce)
- bch2_coalesce(c);
-
if (attr == &sysfs_trigger_gc) {
/*
* Full gc is currently incompatible with btree key cache:
@@ -570,16 +573,16 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_extent_migrate_raced,
&sysfs_trigger_journal_flush,
- &sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
+ &sysfs_gc_gens_pos,
&sysfs_prune_cache,
&sysfs_copy_gc_enabled,
+ &sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
- sysfs_pd_controller_files(copy_gc),
&sysfs_new_stripes,
@@ -817,23 +820,28 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"free[RESERVE_MOVINGGC]\t%zu/%zu\n"
"free[RESERVE_NONE]\t%zu/%zu\n"
"freelist_wait\t\t%s\n"
- "open buckets\t\t%u/%u (reserved %u)\n"
+ "open buckets allocated\t%u\n"
+ "open buckets this dev\t%u\n"
+ "open buckets total\t%u\n"
"open_buckets_wait\t%s\n"
"open_buckets_btree\t%u\n"
"open_buckets_user\t%u\n"
- "btree reserve cache\t%u\n",
+ "btree reserve cache\t%u\n"
+ "thread state:\t\t%s\n",
stats.buckets_ec,
__dev_buckets_available(ca, stats),
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
c->freelist_wait.list.first ? "waiting" : "empty",
- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
- BTREE_NODE_OPEN_BUCKET_RESERVE,
+ OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
+ ca->nr_open_buckets,
+ OPEN_BUCKETS_COUNT,
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_btree],
nr[BCH_DATA_user],
- c->btree_reserve_cache_nr);
+ c->btree_reserve_cache_nr,
+ bch2_allocator_states[ca->allocator_state]);
}
static const char * const bch2_rw[] = {
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 7507b6bc..254e3b31 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -497,6 +497,42 @@ static int rand_insert(struct bch_fs *c, u64 nr)
return ret;
}
+static int rand_insert_multi(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans trans;
+ struct bkey_i_cookie k[8];
+ int ret = 0;
+ unsigned j;
+ u64 i;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
+ for (j = 0; j < ARRAY_SIZE(k); j++) {
+ bkey_cookie_init(&k[j].k_i);
+ k[j].k.p.offset = test_rand();
+ k[j].k.p.snapshot = U32_MAX;
+ }
+
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i));
+ if (ret) {
+ bch_err(c, "error in rand_insert_multi: %i", ret);
+ break;
+ }
+ }
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
static int rand_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
@@ -765,6 +801,7 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
if (!strcmp(testname, #_test)) j.fn = _test
perf_test(rand_insert);
+ perf_test(rand_insert_multi);
perf_test(rand_lookup);
perf_test(rand_mixed);
perf_test(rand_delete);