Diffstat (limited to 'fs'): 28 files changed, 443 insertions, 375 deletions
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 23a9fbb36f49..1a8fa5482653 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -206,8 +206,7 @@ static inline bool may_alloc_bucket(struct bch_fs *c,
 
 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
 					      struct alloc_request *req,
-					      u64 bucket, u8 gen,
-					      struct closure *cl)
+					      u64 bucket, u8 gen)
 {
 	struct bch_dev *ca = req->ca;
 
@@ -222,12 +221,18 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
 	spin_lock(&c->freelist_lock);
 
 	if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
-		if (cl)
-			closure_wait(&c->open_buckets_wait, cl);
-
 		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
+
+		int ret;
+		if (req->cl && !(req->flags & BCH_WRITE_alloc_nowait)) {
+			closure_wait(&c->open_buckets_wait, req->cl);
+			ret = bch_err_throw(c, open_bucket_alloc_blocked);
+		} else {
+			ret = bch_err_throw(c, open_buckets_empty);
+		}
+
 		spin_unlock(&c->freelist_lock);
-		return ERR_PTR(bch_err_throw(c, open_buckets_empty));
+		return ERR_PTR(ret);
 	}
 
 	/* Recheck under lock: */
@@ -259,8 +264,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
 
 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
 					    struct alloc_request *req,
-					    struct btree_iter *freespace_iter,
-					    struct closure *cl)
+					    struct btree_iter *freespace_iter)
 {
 	struct bch_fs *c = trans->c;
 	u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
@@ -275,7 +279,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
 	if (ret)
 		return NULL;
 
-	return __try_alloc_bucket(c, req, b, gen, cl);
+	return __try_alloc_bucket(c, req, b, gen);
 }
 
 /*
@@ -283,8 +287,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
  */
 static noinline struct open_bucket *
 bch2_bucket_alloc_early(struct btree_trans *trans,
-			struct alloc_request *req,
-			struct closure *cl)
+			struct alloc_request *req)
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = req->ca;
@@ -348,7 +351,7 @@ again:
 		req->counters.buckets_seen++;
 
 		ob = may_alloc_bucket(c, req, k.k->p)
-			? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
+			? __try_alloc_bucket(c, req, k.k->p.offset, a->gen)
 			: NULL;
 next:
 		bch2_set_btree_iter_dontneed(trans, &citer);
@@ -374,8 +377,7 @@ next:
 }
 
 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
-						      struct alloc_request *req,
-						      struct closure *cl)
+						      struct alloc_request *req)
 {
 	struct bch_dev *ca = req->ca;
 	struct btree_iter iter;
@@ -417,7 +419,7 @@ again:
 			goto next;
 		}
 
-		ob = try_alloc_bucket(trans, req, &iter, cl);
+		ob = try_alloc_bucket(trans, req, &iter);
 		if (ob) {
 			if (!IS_ERR(ob))
 				*dev_alloc_cursor = iter.pos.offset;
@@ -450,7 +452,6 @@ fail:
 
 static noinline void trace_bucket_alloc2(struct bch_fs *c,
 					 struct alloc_request *req,
-					 struct closure *cl,
 					 struct open_bucket *ob)
 {
 	struct printbuf buf = PRINTBUF;
@@ -460,7 +461,8 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c,
 	prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
 	prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
 	prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
-	prt_printf(&buf, "blocking\t%u\n", cl != NULL);
+	prt_printf(&buf, "blocking\t%u\n", !req->will_retry_target_devices &&
+					   !req->will_retry_all_devices);
 	prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
 	prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
 	prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
@@ -488,28 +490,23 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c,
  * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
  * @trans:	transaction object
  * @req:	state for the entire allocation
- * @cl:		if not NULL, closure to be used to wait if buckets not available
- * @nowait:	if true, do not wait for buckets to become available
 *
 * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
 */
 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
-						   struct alloc_request *req,
-						   struct closure *cl,
-						   bool nowait)
+						   struct alloc_request *req)
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = req->ca;
 	struct open_bucket *ob = NULL;
 	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
-	u64 avail;
-	bool waiting = nowait;
+	bool waiting = false;
 
 	req->btree_bitmap = req->data_type == BCH_DATA_btree;
 	memset(&req->counters, 0, sizeof(req->counters));
 again:
 	bch2_dev_usage_read_fast(ca, &req->usage);
-	avail = dev_buckets_free(ca, req->usage, req->watermark);
+	u64 avail = dev_buckets_free(ca, req->usage, req->watermark);
 
 	if (req->usage.buckets[BCH_DATA_need_discard] > avail)
 		bch2_dev_do_discards(ca);
@@ -525,8 +522,11 @@ again:
 	    c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
 		goto alloc;
 
-	if (cl && !waiting) {
-		closure_wait(&c->freelist_wait, cl);
+	if (req->cl &&
+	    !req->will_retry_target_devices &&
+	    !req->will_retry_all_devices &&
+	    !(req->flags & BCH_WRITE_alloc_nowait)) {
+		closure_wait(&c->freelist_wait, req->cl);
 		waiting = true;
 		goto again;
 	}
@@ -541,8 +541,8 @@ again:
 		closure_wake_up(&c->freelist_wait);
 alloc:
 	ob = likely(freespace)
-		? bch2_bucket_alloc_freelist(trans, req, cl)
-		: bch2_bucket_alloc_early(trans, req, cl);
+		? bch2_bucket_alloc_freelist(trans, req)
+		: bch2_bucket_alloc_early(trans, req);
 
 	if (req->counters.need_journal_commit * 2 > avail)
 		bch2_journal_flush_async(&c->journal, NULL);
@@ -571,7 +571,7 @@ err:
 	if (!IS_ERR(ob)
 	    ? trace_bucket_alloc_enabled()
 	    : trace_bucket_alloc_fail_enabled())
-		trace_bucket_alloc2(c, req, cl, ob);
+		trace_bucket_alloc2(c, req, ob);
 
 	return ob;
 }
@@ -583,13 +583,14 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 {
 	struct open_bucket *ob;
 	struct alloc_request req = {
+		.cl		= cl,
 		.watermark	= watermark,
 		.data_type	= data_type,
 		.ca		= ca,
 	};
 
 	bch2_trans_do(c,
-		PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
+		PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req)));
 	return ob;
 }
@@ -703,18 +704,26 @@ static int add_new_bucket(struct bch_fs *c,
 	return 0;
 }
 
-inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
-				       struct alloc_request *req,
-				       struct dev_stripe_state *stripe,
-				       struct closure *cl)
+int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+				struct alloc_request *req,
+				struct dev_stripe_state *stripe)
 {
 	struct bch_fs *c = trans->c;
+	struct closure *cl = NULL;
 	int ret = 0;
 
 	BUG_ON(req->nr_effective >= req->nr_replicas);
 
+	/*
+	 * Try nonblocking first, so that if one device is full we'll try from
+	 * other devices:
+	 */
+retry_blocking:
 	bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
 
+	if (req->devs_sorted.nr == 1)
+		req->will_retry_target_devices = false;
+
 	darray_for_each(req->devs_sorted, i) {
 		req->ca = bch2_dev_tryget_noerror(c, *i);
 		if (!req->ca)
@@ -725,8 +734,7 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 			continue;
 		}
 
-		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
-						req->flags & BCH_WRITE_alloc_nowait);
+		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req);
 		if (!IS_ERR(ob))
 			bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
 		bch2_dev_put(req->ca);
@@ -745,6 +753,14 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 
 	if (ret == 1)
 		return 0;
+
+	if (ret &&
+	    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+	    req->will_retry_target_devices) {
+		req->will_retry_target_devices = false;
+		goto retry_blocking;
+	}
+
 	if (ret)
 		return ret;
 	return bch_err_throw(c, insufficient_devices);
@@ -759,20 +775,13 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
  */
 static int bucket_alloc_from_stripe(struct btree_trans *trans,
-				    struct alloc_request *req,
-				    struct closure *cl)
+				    struct alloc_request *req)
 {
 	struct bch_fs *c = trans->c;
 	int ret = 0;
 
-	if (req->nr_replicas < 2)
-		return 0;
-
-	if (ec_open_bucket(c, &req->ptrs))
-		return 0;
-
 	struct ec_stripe_head *h =
-		bch2_ec_stripe_head_get(trans, req, 0, cl);
+		bch2_ec_stripe_head_get(trans, req, 0);
 	if (IS_ERR(h))
 		return PTR_ERR(h);
 	if (!h)
@@ -887,79 +896,6 @@ unlock:
 	return ret;
 }
 
-static int __open_bucket_add_buckets(struct btree_trans *trans,
-				     struct alloc_request *req,
-				     struct closure *_cl)
-{
-	struct bch_fs *c = trans->c;
-	struct open_bucket *ob;
-	struct closure *cl = NULL;
-	unsigned i;
-	int ret;
-
-	req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
-
-	/* Don't allocate from devices we already have pointers to: */
-	darray_for_each(*req->devs_have, i)
-		__clear_bit(*i, req->devs_may_alloc.d);
-
-	open_bucket_for_each(c, &req->ptrs, ob, i)
-		__clear_bit(ob->dev, req->devs_may_alloc.d);
-
-	ret = bucket_alloc_set_writepoint(c, req);
-	if (ret)
-		return ret;
-
-	ret = bucket_alloc_set_partial(c, req);
-	if (ret)
-		return ret;
-
-	if (req->ec) {
-		ret = bucket_alloc_from_stripe(trans, req, _cl);
-	} else {
-retry_blocking:
-		/*
-		 * Try nonblocking first, so that if one device is full we'll try from
-		 * other devices:
-		 */
-		ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
-		if (ret &&
-		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
-		    !cl && _cl) {
-			cl = _cl;
-			goto retry_blocking;
-		}
-	}
-
-	return ret;
-}
-
-static int open_bucket_add_buckets(struct btree_trans *trans,
-				   struct alloc_request *req,
-				   struct closure *cl)
-{
-	int ret;
-
-	if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
-		ret = __open_bucket_add_buckets(trans, req, cl);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
-		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
-		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
-			return ret;
-		if (req->nr_effective >= req->nr_replicas)
-			return 0;
-	}
-
-	bool ec = false;
-	swap(ec, req->ec);
-	ret = __open_bucket_add_buckets(trans, req, cl);
-	swap(ec, req->ec);
-
-	return ret < 0 ? ret : 0;
-}
-
 /**
  * should_drop_bucket - check if this is open_bucket should go away
  * @ob: open_bucket to predicate on
@@ -1255,72 +1191,94 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
 		erasure_code = false;
 
+	if (nr_replicas < 2)
+		erasure_code = false;
+
+	req->cl			= cl;
 	req->nr_replicas	= nr_replicas;
 	req->target		= target;
-	req->ec			= erasure_code;
 	req->watermark		= watermark;
 	req->flags		= flags;
 	req->devs_have		= devs_have;
 
 	BUG_ON(!nr_replicas || !nr_replicas_required);
 retry:
-	req->ptrs.nr		= 0;
-	req->nr_effective	= 0;
-	req->have_cache		= false;
-	write_points_nr		= c->write_points_nr;
+	req->ec				= erasure_code;
+	req->will_retry_target_devices	= true;
+	req->will_retry_all_devices	= true;
+	req->ptrs.nr			= 0;
+	req->nr_effective		= 0;
+	req->have_cache			= false;
+	write_points_nr			= c->write_points_nr;
 
 	*wp_ret = req->wp = writepoint_find(trans, write_point.v);
 
 	req->data_type = req->wp->data_type;
 
+	/* metadata may not allocate on cache devices: */
+	if (req->data_type != BCH_DATA_user)
+		req->have_cache = true;
+
 	ret = bch2_trans_relock(trans);
 	if (ret)
 		goto err;
 
-	/* metadata may not allocate on cache devices: */
-	if (req->data_type != BCH_DATA_user)
-		req->have_cache = true;
+	while (1) {
+		req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
 
-	if (target && !(flags & BCH_WRITE_only_specified_devs)) {
-		ret = open_bucket_add_buckets(trans, req, NULL);
-		if (!ret ||
-		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			goto alloc_done;
-
-		/* Don't retry from all devices if we're out of open buckets: */
-		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
-			int ret2 = open_bucket_add_buckets(trans, req, cl);
-			if (!ret2 ||
-			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
-			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
-				ret = ret2;
-				goto alloc_done;
-			}
-		}
+		/* Don't allocate from devices we already have pointers to: */
+		darray_for_each(*req->devs_have, i)
+			__clear_bit(*i, req->devs_may_alloc.d);
 
-		/*
-		 * Only try to allocate cache (durability = 0 devices) from the
-		 * specified target:
-		 */
-		req->have_cache	= true;
-		req->target	= 0;
+		open_bucket_for_each(c, &req->ptrs, ob, i)
+			__clear_bit(ob->dev, req->devs_may_alloc.d);
 
-		ret = open_bucket_add_buckets(trans, req, cl);
-	} else {
-		ret = open_bucket_add_buckets(trans, req, cl);
-	}
-alloc_done:
-	BUG_ON(!ret && req->nr_effective < req->nr_replicas);
+		ret = bucket_alloc_set_writepoint(c, req) ?:
+			bucket_alloc_set_partial(c, req) ?:
+			(req->ec
+			 ? bucket_alloc_from_stripe(trans, req)
+			 : bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe));
 
-	if (erasure_code && !ec_open_bucket(c, &req->ptrs))
-		pr_debug("failed to get ec bucket: ret %u", ret);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+		    bch2_err_matches(ret, BCH_ERR_operation_blocked))
+			goto err;
+
+		if (ret == -BCH_ERR_freelist_empty ||
+		    ret == -BCH_ERR_insufficient_devices) {
+			if (req->will_retry_all_devices) {
+				BUG_ON(!req->will_retry_all_devices);
+				req->will_retry_all_devices = false;
+				/*
+				 * Only try to allocate cache (durability = 0 devices) from the
+				 * specified target:
+				 */
+				if (req->target &&
+				    (!(flags & BCH_WRITE_only_specified_devs) ||
+				     (ret == -BCH_ERR_insufficient_devices))) {
+					req->have_cache	= true;
+					req->target	= 0;
+				}
+				continue;
+			}
 
-	if (ret == -BCH_ERR_insufficient_devices &&
-	    req->nr_effective >= nr_replicas_required)
-		ret = 0;
+			if (ret == -BCH_ERR_insufficient_devices &&
+			    req->nr_effective >= nr_replicas_required)
+				ret = 0;
+			else
+				goto err;
+		}
 
-	if (ret)
-		goto err;
+		if (req->nr_effective < req->nr_replicas && req->ec) {
+			req->ec = false;
+			req->will_retry_target_devices = true;
+			req->will_retry_all_devices = true;
+			continue;
+		}
+
+		BUG_ON(req->nr_effective < nr_replicas_required);
+		BUG_ON(ret < 0);
+		break;
+	}
 
 	if (req->nr_effective > req->nr_replicas)
 		deallocate_extra_replicas(c, req);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 1b3fc8460096..90eb8604a0a2 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -26,9 +26,12 @@ struct dev_alloc_list {
 };
 
 struct alloc_request {
+	struct closure		*cl;
 	unsigned		nr_replicas;
 	unsigned		target;
-	bool			ec;
+	bool			ec:1;
+	bool			will_retry_target_devices:1;
+	bool			will_retry_all_devices:1;
 	enum bch_watermark	watermark;
 	enum bch_write_flags	flags;
 	enum bch_data_type	data_type;
@@ -224,7 +227,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
 enum bch_write_flags;
 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
-				struct dev_stripe_state *, struct closure *);
+				struct dev_stripe_state *);
 
 int bch2_alloc_sectors_start_trans(struct btree_trans *,
 				   unsigned, unsigned,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index a3631a903ecf..49505653fe12 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -86,7 +86,7 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
 	six_unlock_intent(&b->c.lock);
 }
 
-static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+void __btree_node_data_free(struct btree *b)
 {
 	BUG_ON(!list_empty(&b->list));
 	BUG_ON(btree_node_hashed(b));
@@ -113,16 +113,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
 	munmap(b->aux_data, btree_aux_data_bytes(b));
 #endif
 	b->aux_data = NULL;
-
-	btree_node_to_freedlist(bc, b);
 }
 
 static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
 {
 	BUG_ON(list_empty(&b->list));
 	list_del_init(&b->list);
+
+	__btree_node_data_free(b);
+
 	--bc->nr_freeable;
-	__btree_node_data_free(bc, b);
+	btree_node_to_freedlist(bc, b);
 }
 
 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -186,10 +187,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
 
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 {
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-
-	b = __btree_node_mem_alloc(c, GFP_KERNEL);
+	struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
 	if (!b)
 		return NULL;
 
@@ -199,8 +197,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 	}
 
 	bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
-
-	__bch2_btree_node_to_freelist(bc, b);
 	return b;
 }
@@ -526,7 +522,8 @@ restart:
 			--touched;;
 		} else if (!btree_node_reclaim(c, b)) {
 			__bch2_btree_node_hash_remove(bc, b);
-			__btree_node_data_free(bc, b);
+			__btree_node_data_free(b);
+			btree_node_to_freedlist(bc, b);
 
 			freed++;
 			bc->nr_freed++;
@@ -667,9 +664,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 
 	bch2_recalc_btree_reserve(c);
 
-	for (i = 0; i < bc->nr_reserve; i++)
-		if (!__bch2_btree_node_mem_alloc(c))
+	for (i = 0; i < bc->nr_reserve; i++) {
+		struct btree *b = __bch2_btree_node_mem_alloc(c);
+		if (!b)
 			goto err;
+		__bch2_btree_node_to_freelist(bc, b);
+	}
 
 	list_splice_init(&bc->live[0].list, &bc->freeable);
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 3264801cbcbe..649e9dfd178a 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig
 void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
 int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
 
+void __btree_node_data_free(struct btree *);
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 19fd951495ac..b30799e494eb 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -26,6 +26,12 @@
 
 #include <linux/sched/mm.h>
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_btree_read_corrupt_ratio;
+module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(btree_read_corrupt_ratio, "");
+#endif
+
 static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
 {
 	bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
@@ -568,9 +574,9 @@ static int __btree_err(int ret,
 		bch2_mark_btree_validate_failure(failed, ca->dev_idx);
 
 		struct extent_ptr_decoded pick;
-		have_retry = !bch2_bkey_pick_read_device(c,
+		have_retry = bch2_bkey_pick_read_device(c,
 					bkey_i_to_s_c(&b->key),
-					failed, &pick, -1);
+					failed, &pick, -1) == 1;
 	}
 
 	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
@@ -615,7 +621,6 @@ static int __btree_err(int ret,
 			goto out;
 		case -BCH_ERR_btree_node_read_err_bad_node:
 			prt_str(&out, ", ");
-			ret = __bch2_topology_error(c, &out);
 			break;
 		}
@@ -644,7 +649,6 @@ static int __btree_err(int ret,
 		goto out;
 	case -BCH_ERR_btree_node_read_err_bad_node:
 		prt_str(&out, ", ");
-		ret = __bch2_topology_error(c, &out);
 		break;
 	}
 print:
@@ -1337,15 +1341,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
 	btree_node_reset_sib_u64s(b);
 
-	scoped_guard(rcu)
-		bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-			struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-			if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
-				set_btree_node_need_rewrite(b);
-				set_btree_node_need_rewrite_degraded(b);
+	/*
+	 * XXX:
+	 *
+	 * We deadlock if too many btree updates require node rewrites while
+	 * we're still in journal replay.
+	 *
+	 * This is because btree node rewrites generate more updates for the
+	 * interior updates (alloc, backpointers), and if those updates touch
+	 * new nodes and generate more rewrites - well, you see the problem.
+	 *
+	 * The biggest cause is that we don't use the btree write buffer (for
+	 * the backpointer updates) - this needs some real thought on locking
+	 * in order to fix.
+	 *
+	 * The problem with this workaround (not doing the rewrite for degraded
+	 * nodes in journal replay) is that those degraded nodes persist, and we
+	 * don't want that (this is a real bug when a btree node write completes
+	 * with fewer replicas than we wanted and leaves a degraded node due to
+	 * device _removal_, i.e. the device went away mid write).
+	 *
+	 * It's less of a bug here, but still a problem because we don't yet
+	 * have a way of tracking degraded data - we need another index (all
+	 * extents/btree nodes, by replicas entry) in order to fix properly
+	 * (re-replicate degraded data at the earliest possible time).
+	 */
+	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+		scoped_guard(rcu)
+			bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+				struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+				if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+					set_btree_node_need_rewrite(b);
+					set_btree_node_need_rewrite_degraded(b);
+				}
 			}
-		}
+	}
 
 	if (!ptr_written) {
 		set_btree_node_need_rewrite(b);
@@ -1381,7 +1412,7 @@ static void btree_node_read_work(struct work_struct *work)
 		ret = bch2_bkey_pick_read_device(c,
 					bkey_i_to_s_c(&b->key),
 					&failed, &rb->pick, -1);
-		if (ret) {
+		if (ret <= 0) {
 			set_btree_node_read_error(b);
 			break;
 		}
@@ -1412,6 +1443,11 @@ start:
 			continue;
 		}
 
+		memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
+		bio->bi_iter.bi_size = btree_buf_bytes(b);
+
+		bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
+
 		ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
 		if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
 		    ret == -BCH_ERR_btree_node_read_err_must_retry)
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 341d31b3a1f1..ea839560a136 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -717,18 +717,6 @@ static void __journal_keys_sort(struct journal_keys *keys)
 	keys->nr = dst - keys->data;
 }
 
-static bool should_rewind_entry(struct bch_fs *c, struct jset_entry *entry)
-{
-	if (entry->level)
-		return false;
-	if (btree_id_is_alloc(entry->btree_id))
-		return false;
-	if (c->opts.journal_rewind_no_extents &&
-	    entry->btree_id == BTREE_ID_extents)
-		return false;
-	return true;
-}
-
 int bch2_journal_keys_sort(struct bch_fs *c)
 {
 	struct genradix_iter iter;
@@ -747,8 +735,9 @@ int bch2_journal_keys_sort(struct bch_fs *c)
 		cond_resched();
 
 		vstruct_for_each(&i->j, entry) {
-			bool rewind = le64_to_cpu(i->j.seq) >= rewind_seq &&
-				should_rewind_entry(c, entry);
+			bool rewind = !entry->level &&
+				!btree_id_is_alloc(entry->btree_id) &&
+				le64_to_cpu(i->j.seq) >= rewind_seq;
 
 			if (entry->type != (rewind
 					    ? BCH_JSET_ENTRY_overwrite
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 23d8c62ea4b6..365808b4b7c0 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -75,39 +75,6 @@ static inline u64 bkey_journal_seq(struct bkey_s_c k)
 	}
 }
 
-static bool found_btree_node_is_readable(struct btree_trans *trans,
-					 struct found_btree_node *f)
-{
-	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
-	found_btree_node_to_key(&tmp.k, f);
-
-	struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
-	bool ret = !IS_ERR_OR_NULL(b);
-	if (!ret)
-		return ret;
-
-	f->sectors_written = b->written;
-	f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
-
-	struct bkey_s_c k;
-	struct bkey unpacked;
-	struct btree_node_iter iter;
-	for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
-		f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
-
-	six_unlock_read(&b->c.lock);
-
-	/*
-	 * We might update this node's range; if that happens, we need the node
-	 * to be re-read so the read path can trim keys that are no longer in
-	 * this node
-	 */
-	if (b != btree_node_root(trans->c, b))
-		bch2_btree_node_evict(trans, &tmp.k);
-	return ret;
-}
-
 static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
 {
 	const struct found_btree_node *l = _l;
@@ -159,17 +126,17 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = {
 };
 
 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
-				struct bio *bio, struct btree_node *bn, u64 offset)
+				struct btree *b, struct bio *bio, u64 offset)
 {
 	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+	struct btree_node *bn = b->data;
 
 	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
 	bio->bi_iter.bi_sector	= offset;
-	bch2_bio_map(bio, bn, PAGE_SIZE);
+	bch2_bio_map(bio, b->data, c->opts.block_size);
 
 	u64 submit_time = local_clock();
 	submit_bio_wait(bio);
-
 	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
 
 	if (bio->bi_status) {
@@ -201,6 +168,14 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
 		return;
 
+	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+	bio->bi_iter.bi_sector	= offset;
+	bch2_bio_map(bio, b->data, c->opts.btree_node_size);
+
+	submit_time = local_clock();
+	submit_bio_wait(bio);
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
 	rcu_read_lock();
 	struct found_btree_node n = {
 		.btree_id	= BTREE_NODE_ID(bn),
@@ -217,7 +192,20 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	};
 	rcu_read_unlock();
 
-	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+	found_btree_node_to_key(&b->key, &n);
+
+	CLASS(printbuf, buf)();
+	if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
+		/* read_done will swap out b->data for another buffer */
+		bn = b->data;
+		/*
+		 * Grab journal_seq here because we want the max journal_seq of
+		 * any bset; read_done sorts down to a single set and picks the
+		 * max journal_seq
+		 */
+		n.journal_seq = le64_to_cpu(bn->keys.journal_seq),
+		n.sectors_written = b->written;
+
 		mutex_lock(&f->lock);
 		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
 			bch_err(c, "try_read_btree_node() can't handle endian conversion");
@@ -237,12 +225,18 @@ static int read_btree_nodes_worker(void *p)
 	struct find_btree_nodes_worker *w = p;
 	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
 	struct bch_dev *ca = w->ca;
-	void *buf = (void *) __get_free_page(GFP_KERNEL);
-	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
 	unsigned long last_print = jiffies;
 
-	if (!buf || !bio) {
-		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+	struct btree *b = __bch2_btree_node_mem_alloc(c);
+	if (!b) {
+		bch_err(c, "read_btree_nodes_worker: error allocating buf");
+		w->f->ret = -ENOMEM;
+		goto err;
+	}
+
+	struct bio *bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
+	if (!bio) {
+		bch_err(c, "read_btree_nodes_worker: error allocating bio");
 		w->f->ret = -ENOMEM;
 		goto err;
 	}
@@ -266,11 +260,13 @@ static int read_btree_nodes_worker(void *p)
 			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
 				continue;
 
-			try_read_btree_node(w->f, ca, bio, buf, sector);
+			try_read_btree_node(w->f, ca, b, bio, sector);
 		}
err:
+	if (b)
+		__btree_node_data_free(b);
+	kfree(b);
 	bio_put(bio);
-	free_page((unsigned long) buf);
 	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
 	closure_put(w->cl);
 	kfree(w);
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 3968f3be7f3b..e848e210a9bf 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -783,9 +783,6 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 	darray_for_each(m->op.devs_have, i)
 		__clear_bit(*i, devs.d);
 
-	CLASS(printbuf, buf)();
-	buf.atomic++;
-
 	guard(rcu)();
 
 	unsigned nr_replicas = 0, i;
@@ -797,11 +794,7 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 		struct bch_dev_usage usage;
 		bch2_dev_usage_read_fast(ca, &usage);
 
-		u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark);
-
-		prt_printf(&buf, "%s=%llu ", ca->name, nr_free);
-
-		if (!nr_free)
+		if (!dev_buckets_free(ca, usage, m->op.watermark))
 			continue;
 
 		nr_replicas += ca->mi.durability;
@@ -809,10 +802,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 			break;
 	}
 
-	if (!nr_replicas) {
-		trace_data_update_done_no_rw_devs(c, buf.buf);
+	if (!nr_replicas)
 		return bch_err_throw(c, data_update_done_no_rw_devs);
-	}
 	if (nr_replicas < m->op.nr_replicas)
 		return bch_err_throw(c, insufficient_devices);
 	return 0;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 901f643ead83..07c2a0f73cc2 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -153,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 		c->verify_data = __bch2_btree_node_mem_alloc(c);
 		if (!c->verify_data)
 			goto out;
-
-		list_del_init(&c->verify_data->list);
 	}
 
 	BUG_ON(b->nsets != 1);
@@ -586,6 +584,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 	i->ubuf = buf;
 	i->size	= size;
 	i->ret	= 0;
+
+	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
restart:
 	seqmutex_lock(&c->btree_trans_lock);
 	list_sort(&c->btree_trans_list, list_ptr_order_cmp);
@@ -599,6 +599,11 @@ restart:
 		if (!closure_get_not_zero(&trans->ref))
 			continue;
 
+		if (!trans->srcu_held) {
+			closure_put(&trans->ref);
+			continue;
+		}
+
 		u32 seq = seqmutex_unlock(&c->btree_trans_lock);
 
 		bch2_btree_trans_to_text(&i->buf, trans);
@@ -620,6 +625,8 @@ restart:
 	}
 	seqmutex_unlock(&c->btree_trans_lock);
unlocked:
+	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
 	if (i->buf.allocation_failure)
 		ret = -ENOMEM;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 687c3ba98095..71956ee86a9c 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1720,8 +1720,7 @@ err:
 
 static int new_stripe_alloc_buckets(struct btree_trans *trans,
				    struct alloc_request *req,
-				    struct ec_stripe_head *h, struct ec_stripe_new *s,
-				    struct closure *cl)
+				    struct ec_stripe_head *h, struct ec_stripe_new *s)
 {
 	struct bch_fs *c = trans->c;
 	struct open_bucket *ob;
@@ -1771,7 +1770,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
 	req->nr_effective	= nr_have_parity;
 	req->data_type		= BCH_DATA_parity;
 
-	ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
+	ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe);
 
 	open_bucket_for_each(c, &req->ptrs, ob, i) {
 		j = find_next_zero_bit(s->blocks_gotten,
@@ -1794,7 +1793,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
 	req->nr_effective	= nr_have_data;
 	req->data_type		= BCH_DATA_user;
 
-	ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
+	ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe);
 
 	open_bucket_for_each(c, &req->ptrs, ob, i) {
 		j = find_next_zero_bit(s->blocks_gotten,
@@ -1926,7 +1925,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
 	}
 	bch2_trans_iter_exit(trans, &lru_iter);
 	if (!ret)
-		ret = bch_err_throw(c, stripe_alloc_blocked);
+		return bch_err_throw(c, stripe_alloc_blocked);
 	if (ret == 1)
 		ret = 0;
 	if (ret)
@@ -1998,8 +1997,7 @@ err:
 
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
 					       struct alloc_request *req,
-					       unsigned algo,
-					       struct closure *cl)
+					       unsigned algo)
 {
 	struct bch_fs *c = trans->c;
 	unsigned redundancy = req->nr_replicas - 1;
@@ -2041,12 +2039,18 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
 	if (s->have_existing_stripe)
 		goto alloc_existing;
 
+	/* First, try to allocate a full stripe: */
 	enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
-	swap(req->watermark, saved_watermark);
-	ret =   new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
+	unsigned saved_flags = req->flags | BCH_WRITE_alloc_nowait;
+	swap(req->watermark, saved_watermark);
+	swap(req->flags, saved_flags);
+
+	ret =   new_stripe_alloc_buckets(trans, req, h, s) ?:
 		__bch2_ec_stripe_head_reserve(trans, h, s);
-	swap(req->watermark, saved_watermark);
+
+	swap(req->watermark, saved_watermark);
+	swap(req->flags, saved_flags);
 
 	if (!ret)
 		goto allocate_buf;
@@ -2062,19 +2066,25 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
 		ret = __bch2_ec_stripe_head_reuse(trans, h, s);
 		if (!ret)
 			break;
-		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
+		if (waiting ||
+		    (req->flags & BCH_WRITE_alloc_nowait) ||
+		    ret != -BCH_ERR_stripe_alloc_blocked)
 			goto err;
 
 		if (req->watermark == BCH_WATERMARK_copygc) {
-			ret =   new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
+			/* Don't self-deadlock copygc */
+			swap(req->flags, saved_flags);
+			ret =   new_stripe_alloc_buckets(trans, req, h, s) ?:
 				__bch2_ec_stripe_head_reserve(trans, h, s);
+			swap(req->flags, saved_flags);
+
 			if (ret)
 				goto err;
 			goto allocate_buf;
 		}
 
 		/* XXX freelist_wait? */
-		closure_wait(&c->freelist_wait, cl);
+		closure_wait(&c->freelist_wait, req->cl);
 		waiting = true;
 	}
@@ -2085,7 +2095,7 @@ alloc_existing:
 	 * Retry allocating buckets, with the watermark for this
 	 * particular write:
 	 */
-	ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
+	ret = new_stripe_alloc_buckets(trans, req, h, s);
 	if (ret)
 		goto err;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 548048adf0d5..756f14bd7bb7 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -258,7 +258,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
 
 struct alloc_request;
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
-			struct alloc_request *, unsigned, struct closure *);
+			struct alloc_request *, unsigned);
 
 void bch2_do_stripe_deletes(struct bch_fs *);
 void bch2_ec_do_stripe_creates(struct bch_fs *);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index d27b94a6610a..a66c96bd9556 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -236,6 +236,9 @@
 	x(0,				operation_blocked)			\
 	x(BCH_ERR_operation_blocked,	btree_cache_cannibalize_lock_blocked)	\
 	x(BCH_ERR_operation_blocked,	journal_res_blocked)			\
+	x(BCH_ERR_operation_blocked,	bucket_alloc_blocked)			\
+	x(BCH_ERR_operation_blocked,	open_bucket_alloc_blocked)		\
+	x(BCH_ERR_operation_blocked,	stripe_alloc_blocked)			\
 	x(BCH_ERR_journal_res_blocked,	journal_blocked)			\
 	x(BCH_ERR_journal_res_blocked,	journal_max_in_flight)			\
 	x(BCH_ERR_journal_res_blocked,	journal_max_open)			\
@@ -244,8 +247,6 @@
 	x(BCH_ERR_journal_res_blocked,	journal_buf_enomem)			\
 	x(BCH_ERR_journal_res_blocked,	journal_stuck)				\
 	x(BCH_ERR_journal_res_blocked,	journal_retry_open)			\
-	x(BCH_ERR_journal_res_blocked,	bucket_alloc_blocked)			\
-	x(BCH_ERR_journal_res_blocked,	stripe_alloc_blocked)			\
 	x(BCH_ERR_invalid,		invalid_sb)				\
 	x(BCH_ERR_invalid_sb,		invalid_sb_magic)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_version)			\
@@ -289,7 +290,6 @@
 	x(EIO,				sb_not_downgraded)			\
 	x(EIO,				btree_node_write_all_failed)		\
 	x(EIO,				btree_node_read_error)			\
-	x(EIO,				btree_node_read_validate_error)		\
 	x(EIO,				btree_need_topology_repair)		\
 	x(EIO,				bucket_ref_update)			\
 	x(EIO,				trigger_alloc)				\
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index a9a9fe193966..71649b4164b8 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -103,7 +103,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
 		return bch_err_throw(c, btree_need_topology_repair);
 	} else {
 		return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
-			bch_err_throw(c, btree_node_read_validate_error);
+			bch_err_throw(c, btree_need_topology_repair);
 	}
 }
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 036e4ad95987..83cbd77dcb9c 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -50,19 +50,17 @@ void bch2_io_failures_to_text(struct printbuf *out,
 			      struct bch_io_failures *failed)
 {
 	static const char * const error_types[] = {
-		"io", "checksum", "ec reconstruct", NULL
+		"btree validate", "io", "checksum", "ec reconstruct", NULL
 	};
 
 	for (struct bch_dev_io_failures *f = failed->devs;
 	     f < failed->devs + failed->nr;
 	     f++) {
 		unsigned errflags =
-			((!!f->failed_io)	<< 0) |
-			((!!f->failed_csum_nr)	<< 1) |
-			((!!f->failed_ec)	<< 2);
-
-		if (!errflags)
-			continue;
+			((!!f->failed_btree_validate)	<< 0) |
+			((!!f->failed_io)		<< 1) |
+			((!!f->failed_csum_nr)		<< 2) |
+			((!!f->failed_ec)		<< 3);
 
 		bch2_printbuf_make_room(out, 1024);
 		out->atomic++;
@@ -77,7 +75,9 @@ void bch2_io_failures_to_text(struct printbuf *out,
 		prt_char(out, ' ');
 
-		if (is_power_of_2(errflags)) {
+		if (!errflags) {
+			prt_str(out, "no error - confused");
+		} else if (is_power_of_2(errflags)) {
 			prt_bitflags(out, error_types, errflags);
 			prt_str(out, " error");
 		} else {
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index dad48d44f47b..4e82dfa6c03f 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -257,7 +257,7 @@ err:
 		struct printbuf buf = PRINTBUF;
 		lockrestart_do(trans,
 			bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
-		prt_printf(&buf, "read error %i from btree lookup", ret);
+		prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret));
 		bch_err_ratelimited(c, "%s", buf.buf);
 		printbuf_exit(&buf);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f9bc99eb2d02..3b0783f117ae 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1692,11 +1692,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 		s.mask = map_defined(bch_flags_to_xflags);
 		s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
-		if (fa->fsx_xflags)
-			return bch_err_throw(c, unsupported_fsx_flag);
+		if (fa->fsx_xflags) {
+			ret = bch_err_throw(c, unsupported_fsx_flag);
+			goto err;
+		}
 
-		if (fa->fsx_projid >= U32_MAX)
-			return bch_err_throw(c, projid_too_big);
+		if (fa->fsx_projid >= U32_MAX) {
+			ret = bch_err_throw(c, projid_too_big);
+			goto err;
+		}
 
 		/*
 		 * inode fields accessible via the xattr interface are stored with a +1
@@ -1718,8 +1722,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 			fa->flags &= ~FS_CASEFOLD_FL;
 
 		s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
-		if (fa->flags)
-			return bch_err_throw(c, unsupported_fa_flag);
+		if (fa->flags) {
+			ret = bch_err_throw(c, unsupported_fa_flag);
+			goto err;
+		}
 	}
 
 	mutex_lock(&inode->ei_update_lock);
@@ -1730,7 +1736,7 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 		bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
 				 ATTR_CTIME);
 	mutex_unlock(&inode->ei_update_lock);
-
+err:
 	return bch2_err_class(ret);
 }
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 00afe0a3593f..471e93a3f00c 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,6 +12,7 @@
 #include "fs.h"
 #include "fsck.h"
 #include "inode.h"
+#include "io_misc.h"
 #include "keylist.h"
 #include "namei.h"
 #include "recovery_passes.h"
@@ -1637,7 +1638,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
 			i->count = count2;
 		}
 
-		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty) &&
+				i->inode.bi_sectors != i->count,
 				trans, inode_i_sectors_wrong,
 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
 				w->last_pos.inode, i->inode.bi_snapshot,
@@ -1963,33 +1965,11 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 				"extent type past end of inode %llu:%u, i_size %llu\n%s",
 				i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-			struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout));
-			ret = PTR_ERR_OR_ZERO(whiteout);
-			if (ret)
-				goto err;
-
-			bkey_init(&whiteout->k);
-			whiteout->k.p = SPOS(k.k->p.inode,
-					     last_block,
-					     i->inode.bi_snapshot);
-			bch2_key_resize(&whiteout->k,
-					min(KEY_SIZE_MAX & (~0 << c->block_bits),
-					    U64_MAX - whiteout->k.p.offset));
-
-			/*
-			 * Need a normal (not BTREE_ITER_all_snapshots)
-			 * iterator, if we're deleting in a different
-			 * snapshot and need to emit a whiteout
-			 */
-			struct btree_iter iter2;
-			bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents,
-					     bkey_start_pos(&whiteout->k),
-					     BTREE_ITER_intent);
-			ret =   bch2_btree_iter_traverse(trans, &iter2) ?:
-				bch2_trans_update(trans, &iter2, whiteout,
-						  BTREE_UPDATE_internal_snapshot_node);
-			bch2_trans_iter_exit(trans, &iter2);
+			ret = bch2_fpunch_snapshot(trans,
+						   SPOS(i->inode.bi_inum,
+							last_block,
+							i->inode.bi_snapshot),
+						   POS(i->inode.bi_inum, U64_MAX));
 			if (ret)
 				goto err;
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index bf72b1d2e2cb..07023667a475 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -135,6 +135,33 @@ err_noprint:
 	return ret;
 }
 
+/* For fsck */
+int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
+{
+	u32 restart_count = trans->restart_count;
+	struct bch_fs *c = trans->c;
+	struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
+	unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+	struct bkey_i delete;
+
+	int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+			start, end, 0, k,
+			&disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+		bkey_init(&delete.k);
+		delete.k.p = iter.pos;
+
+		/* create the biggest key we can */
+		bch2_key_resize(&delete.k, max_sectors);
+		bch2_cut_back(end, &delete);
+
+		bch2_extent_trim_atomic(trans, &iter, &delete) ?:
+		bch2_trans_update(trans, &iter, &delete, 0);
+	}));
+
+	bch2_disk_reservation_put(c, &disk_res);
+	return ret ?: trans_was_restarted(trans, restart_count);
+}
+
 /*
  * Returns -BCH_ERR_transacton_restart if we had to drop locks:
  */
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index 9cb44a7c43c1..b93e4d4b3c0c 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -5,6 +5,8 @@
 int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
 			  u64, struct bch_io_opts, s64 *,
 			  struct write_point_specifier);
+
+int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
 int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
 		   subvol_inum, u64, s64 *);
 int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index 9c5ddbf861b3..cfc8ef35b14d 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -147,7 +147,7 @@ static inline void bch2_read_extent(struct btree_trans *trans,
 	int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
 				     data_btree, k, offset_into_extent, NULL, flags, -1);
 	/* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
-	WARN(ret, "unhandled error from __bch2_read_extent()");
+	WARN(ret, "unhandled error from __bch2_read_extent(): %s", bch2_err_str(ret));
 }
 
 int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index ce5340611de6..f22b05e02c1e 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1376,6 +1376,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
 		return bch_err_throw(c, erofs_filesystem_full);
 	}
 
+	unsigned nr;
 	int ret;
 
 	if (dynamic_fault("bcachefs:add:journal_alloc")) {
@@ -1384,19 +1385,16 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
 	}
 
 	/* 1/128th of the device by default: */
-	unsigned nr = ca->mi.nbuckets >> 7;
+	nr = ca->mi.nbuckets >> 7;
 
 	/*
-	 * clamp journal size to 8GB, or 32GB with large_journal option:
+	 * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
+	 * is smaller:
 	 */
-	unsigned max_sectors = 1 << 24;
-
-	if (c->opts.large_journal)
-		max_sectors *= 4;
-
 	nr = clamp_t(unsigned, nr,
 		     BCH_JOURNAL_BUCKETS_MIN,
-		     max_sectors / ca->mi.bucket_size);
+		     min(1 << 13,
+			 (1 << 24) / ca->mi.bucket_size));
 
 	ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
err:
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index dd3f3434c1b0..2d6ce4348a22 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1245,6 +1245,8 @@ noinline_for_stack
 static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
 {
 	struct printbuf buf = PRINTBUF;
+	bch2_log_msg_start(c, &buf);
+
 	enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
 	bool have_good = false;
@@ -1272,6 +1274,28 @@ static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_r
 	printbuf_exit(&buf);
 }
 
+struct u64_range bch2_journal_entry_missing_range(struct bch_fs *c, u64 start, u64 end)
+{
+	BUG_ON(start > end);
+
+	if (start == end)
+		return (struct u64_range) {};
+
+	start = bch2_journal_seq_next_nonblacklisted(c, start);
+	if (start >= end)
+		return (struct u64_range) {};
+
+	struct u64_range missing = {
+		.start	= start,
+		.end	= min(end, bch2_journal_seq_next_blacklisted(c, start)),
+	};
+
+	if (missing.start == missing.end)
+		return (struct u64_range) {};
+
+	return missing;
+}
+
 noinline_for_stack
 static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
 {
@@ -1280,6 +1304,7 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e
 	struct genradix_iter radix_iter;
 	struct journal_replay *i, **_i, *prev = NULL;
+	/* Sequence number we expect to find next, to check for missing entries */
 	u64 seq = start_seq;
 
 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
@@ -1290,43 +1315,31 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e
 
 		BUG_ON(seq > le64_to_cpu(i->j.seq));
 
-		while (seq < le64_to_cpu(i->j.seq)) {
-			while (seq < le64_to_cpu(i->j.seq) &&
-			       bch2_journal_seq_is_blacklisted(c, seq, false))
-				seq++;
-
-			if (seq == le64_to_cpu(i->j.seq))
-				break;
-
-			u64 missing_start = seq;
-
-			while (seq < le64_to_cpu(i->j.seq) &&
-			       !bch2_journal_seq_is_blacklisted(c, seq, false))
-				seq++;
-
-			u64 missing_end = seq - 1;
+		struct u64_range missing;
 
+		while ((missing = bch2_journal_entry_missing_range(c, seq, le64_to_cpu(i->j.seq))).start) {
 			printbuf_reset(&buf);
-
 			prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-				   missing_start, missing_end,
+				   missing.start, missing.end - 1,
 				   start_seq, end_seq);
 
-			prt_printf(&buf, "\nprev at ");
 			if (prev) {
+				prt_printf(&buf, "\n%llu at ", le64_to_cpu(prev->j.seq));
 				bch2_journal_ptrs_to_text(&buf, c, prev);
 				prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
-			} else
-				prt_printf(&buf, "(none)");
+			}
 
-			prt_printf(&buf, "\nnext at ");
+			prt_printf(&buf, "\n%llu at ", le64_to_cpu(i->j.seq));
 			bch2_journal_ptrs_to_text(&buf, c, i);
 			prt_printf(&buf, ", continue?");
 
 			fsck_err(c, journal_entries_missing, "%s", buf.buf);
+
+			seq = missing.end;
 		}
 
 		prev = i;
-		seq++;
+		seq = le64_to_cpu(i->j.seq) + 1;
 	}
fsck_err:
 	printbuf_exit(&buf);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6fa82c4050fe..f53c5c81d137 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -71,6 +71,13 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
 void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
 			       struct journal_replay *);
 
+struct u64_range {
+	u64	start;
+	u64	end;
+};
+
+struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64);
+
 int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
 
 CLOSURE_CALLBACK(bch2_journal_write);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index af4fe416d9ec..6361809b5e2e 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -103,6 +103,52 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
 	return cmp_int(l->start, r->start);
 }
 
+static int journal_seq_blacklist_table_end_cmp(const void *_l, const void *_r)
+{
+	const struct journal_seq_blacklist_table_entry *l = _l;
+	const struct journal_seq_blacklist_table_entry *r = _r;
+
+	return cmp_int(l->end, r->end);
+}
+
+u64 bch2_journal_seq_next_blacklisted(struct bch_fs *c, u64 seq)
+{
+	struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+
+	if (!t)
+		return U64_MAX;
+
+	struct journal_seq_blacklist_table_entry search = { .end = seq };
+	int idx = eytzinger0_find_gt(t->entries, t->nr,
+				     sizeof(t->entries[0]),
+				     journal_seq_blacklist_table_end_cmp,
+				     &search);
+	if (idx < 0)
+		return U64_MAX;
+
+	return max(seq, t->entries[idx].start);
+}
+
+u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *c, u64 seq)
+{
+	struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+
+	if (!t)
+		return seq;
+
+	while (true) {
+		struct journal_seq_blacklist_table_entry search = { .start = seq };
+		int idx = eytzinger0_find_le(t->entries, t->nr,
+					     sizeof(t->entries[0]),
+					     journal_seq_blacklist_table_cmp,
+					     &search);
+		if (idx < 0 || t->entries[idx].end <= seq)
+			return seq;
+
+		seq = t->entries[idx].end;
+	}
+}
+
 bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
 				     bool dirty)
 {
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index f06942ccfcdd..389b789b26f4 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -11,6 +11,9 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
 		: 0;
 }
 
+u64 bch2_journal_seq_next_blacklisted(struct bch_fs *, u64);
+u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *, u64);
+
 bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
 u64 bch2_journal_last_blacklisted_seq(struct bch_fs *);
 int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 4a7a60588c10..63f8e254495c 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -343,12 +343,6 @@ enum fsck_err_opts {
 	  OPT_UINT(0, U32_MAX),						\
 	  BCH_SB_JOURNAL_RECLAIM_DELAY,	100,				\
 	  NULL,		"Delay in milliseconds before automatic journal reclaim")\
-	x(large_journal,		bool,				\
-	  OPT_FS|OPT_MOUNT|OPT_FORMAT,					\
-	  OPT_BOOL(),							\
-	  BCH2_NO_SB_OPT,		false,				\
-	  NULL,		"Allocate a bigger than normal journal: recovery from unclean "\
-			"shutdown will be slower, but more info will be available for debugging")\
 	x(move_bytes_in_flight,		u32,				\
 	  OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME,		\
 	  OPT_UINT(1024, U32_MAX),					\
@@ -395,11 +389,6 @@ enum fsck_err_opts {
 	  OPT_UINT(0, U64_MAX),						\
 	  BCH2_NO_SB_OPT,		0,				\
 	  NULL,		"Rewind journal")				\
-	x(journal_rewind_no_extents,	bool,				\
-	  OPT_FS|OPT_MOUNT,						\
-	  OPT_BOOL(),							\
-	  BCH2_NO_SB_OPT,		0,				\
-	  NULL,		"Don't rewind extents when rewinding journal")	\
 	x(recovery_passes,		u64,				\
 	  OPT_FS|OPT_MOUNT,						\
 	  OPT_BITFIELD(bch2_recovery_passes),				\
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 974f8bf9a574..0def4ecb7f88 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -273,24 +273,35 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
 		goto out;
 
 	struct btree_path *path = btree_iter_path(trans, &iter);
-	if (unlikely(!btree_path_node(path, k->level) &&
-		     !k->allocated)) {
+	if (unlikely(!btree_path_node(path, k->level))) {
 		struct bch_fs *c = trans->c;
 
+		CLASS(printbuf, buf)();
+		prt_str(&buf, "btree=");
+		bch2_btree_id_to_text(&buf, k->btree_id);
+		prt_printf(&buf, " level=%u ", k->level);
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+
 		if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
 						     BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
-			bch_err(c, "have key in journal replay for btree depth that does not exist, confused");
+			bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
+				buf.buf);
 			ret = -EINVAL;
 		}
-#if 0
+
+		if (!k->allocated) {
+			bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
+				   buf.buf);
+			k->overwritten = true;
+			goto out;
+		}
+
 		bch2_trans_iter_exit(trans, &iter);
 		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
 					  BTREE_MAX_DEPTH, 0, iter_flags);
 		ret =   bch2_btree_iter_traverse(trans, &iter) ?:
 			bch2_btree_increase_depth(trans, iter.path, 0) ?:
 			-BCH_ERR_transaction_restart_nested;
-#endif
-		k->overwritten = true;
 		goto out;
 	}
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 9324ef32903d..b5dae1145afa 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1330,11 +1330,6 @@ DEFINE_EVENT(fs_str, data_update,
 	TP_ARGS(c, str)
 );
 
-DEFINE_EVENT(fs_str, data_update_done_no_rw_devs,
-	TP_PROTO(struct bch_fs *c, const char *str),
-	TP_ARGS(c, str)
-);
-
 DEFINE_EVENT(fs_str, io_move_pred,
 	TP_PROTO(struct bch_fs *c, const char *str),
 	TP_ARGS(c, str)
 );