path: root/libbcachefs/alloc.c
Diffstat (limited to 'libbcachefs/alloc.c')
-rw-r--r--  libbcachefs/alloc.c  397
1 file changed, 232 insertions, 165 deletions
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index ede44f73..16bdc48c 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -58,11 +58,13 @@
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "debug.h"
+#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "io.h"
@@ -79,7 +81,7 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int);
+static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
/* Ratelimiting/PD controllers */
@@ -130,8 +132,7 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-static const char *bch2_alloc_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
@@ -152,8 +153,8 @@ static const char *bch2_alloc_invalid(const struct bch_fs *c,
return NULL;
}
-static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_alloc_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
buf[0] = '\0';
@@ -163,11 +164,6 @@ static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
}
}
-const struct bkey_ops bch2_bkey_alloc_ops = {
- .key_invalid = bch2_alloc_invalid,
- .val_to_text = bch2_alloc_to_text,
-};
-
static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
{
unsigned v;
@@ -236,9 +232,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
d = a.v->data;
if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
- g->prio[READ] = get_alloc_field(&d, 2);
+ g->io_time[READ] = get_alloc_field(&d, 2);
if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
- g->prio[WRITE] = get_alloc_field(&d, 2);
+ g->io_time[WRITE] = get_alloc_field(&d, 2);
lg_local_unlock(&c->usage_lock);
}
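The read and write times travel in struct bch_alloc as optional variable-width fields, present only when the matching bit in a->fields is set. A minimal userspace sketch of that encode/decode scheme; get_field()/put_field() and the field IDs here are simplified stand-ins, not the on-disk format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { FIELD_READ_TIME, FIELD_WRITE_TIME };     /* illustrative only */

static unsigned get_field(const uint8_t **p, unsigned bytes)
{
        unsigned v = 0;

        memcpy(&v, *p, bytes);          /* little-endian host assumed */
        *p += bytes;
        return v;
}

static void put_field(uint8_t **p, unsigned bytes, unsigned v)
{
        memcpy(*p, &v, bytes);
        *p += bytes;
}

int main(void)
{
        uint8_t buf[8], *out = buf;
        const uint8_t *in = buf;
        uint8_t fields = 1 << FIELD_READ_TIME;  /* only the read time is present */

        put_field(&out, 2, 12345);

        if (fields & (1 << FIELD_READ_TIME))
                printf("read time: %u\n", get_field(&in, 2));
        if (fields & (1 << FIELD_WRITE_TIME))
                printf("write time: %u\n", get_field(&in, 2));
        return 0;
}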
@@ -270,21 +266,21 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
- mutex_lock(&c->prio_clock[READ].lock);
+ mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
- bch2_recalc_min_prio(c, ca, READ);
+ bch2_recalc_oldest_io(c, ca, READ);
up_read(&ca->bucket_lock);
}
- mutex_unlock(&c->prio_clock[READ].lock);
+ mutex_unlock(&c->bucket_clock[READ].lock);
- mutex_lock(&c->prio_clock[WRITE].lock);
+ mutex_lock(&c->bucket_clock[WRITE].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
- bch2_recalc_min_prio(c, ca, WRITE);
+ bch2_recalc_oldest_io(c, ca, WRITE);
up_read(&ca->bucket_lock);
}
- mutex_unlock(&c->prio_clock[WRITE].lock);
+ mutex_unlock(&c->bucket_clock[WRITE].lock);
return 0;
}
@@ -320,9 +316,9 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
d = a->v.data;
if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
- put_alloc_field(&d, 2, g->prio[READ]);
+ put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
- put_alloc_field(&d, 2, g->prio[WRITE]);
+ put_alloc_field(&d, 2, g->io_time[WRITE]);
lg_local_unlock(&c->usage_lock);
ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
@@ -395,38 +391,34 @@ int bch2_alloc_write(struct bch_fs *c)
/* Bucket IO clocks: */
-static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
+static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
- struct prio_clock *clock = &c->prio_clock[rw];
+ struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
- u16 max_delta = 1;
+ u16 max_last_io = 0;
unsigned i;
- lockdep_assert_held(&c->prio_clock[rw].lock);
+ lockdep_assert_held(&c->bucket_clock[rw].lock);
- /* Determine min prio for this particular device */
+ /* Recalculate max_last_io for this device: */
for_each_bucket(g, buckets)
- max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
+ max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
- ca->min_prio[rw] = clock->hand - max_delta;
+ ca->max_last_bucket_io[rw] = max_last_io;
- /*
- * This may possibly increase the min prio for the whole device, check
- * that as well.
- */
- max_delta = 1;
+ /* Recalculate global max_last_io: */
+ max_last_io = 0;
for_each_member_device(ca, c, i)
- max_delta = max(max_delta,
- (u16) (clock->hand - ca->min_prio[rw]));
+ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
- clock->min_prio = clock->hand - max_delta;
+ clock->max_last_io = max_last_io;
}
-static void bch2_rescale_prios(struct bch_fs *c, int rw)
+static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
- struct prio_clock *clock = &c->prio_clock[rw];
+ struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets;
struct bch_dev *ca;
struct bucket *g;
@@ -439,10 +431,10 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
- g->prio[rw] = clock->hand -
- (clock->hand - g->prio[rw]) / 2;
+ g->io_time[rw] = clock->hand -
+ bucket_last_io(c, g, rw) / 2;
- bch2_recalc_min_prio(c, ca, rw);
+ bch2_recalc_oldest_io(c, ca, rw);
up_read(&ca->bucket_lock);
}
@@ -450,19 +442,26 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
static void bch2_inc_clock_hand(struct io_timer *timer)
{
- struct prio_clock *clock = container_of(timer,
- struct prio_clock, rescale);
+ struct bucket_clock *clock = container_of(timer,
+ struct bucket_clock, rescale);
struct bch_fs *c = container_of(clock,
- struct bch_fs, prio_clock[clock->rw]);
+ struct bch_fs, bucket_clock[clock->rw]);
+ struct bch_dev *ca;
u64 capacity;
+ unsigned i;
mutex_lock(&clock->lock);
- clock->hand++;
-
/* if clock cannot be advanced more, rescale prio */
- if (clock->hand == (u16) (clock->min_prio - 1))
- bch2_rescale_prios(c, clock->rw);
+ if (clock->max_last_io >= U16_MAX - 2)
+ bch2_rescale_bucket_io_times(c, clock->rw);
+
+ BUG_ON(clock->max_last_io >= U16_MAX - 2);
+
+ for_each_member_device(ca, c, i)
+ ca->max_last_bucket_io[clock->rw]++;
+ clock->max_last_io++;
+ clock->hand++;
mutex_unlock(&clock->lock);
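The scheme introduced here: each bucket's io_time[rw] is a u16 snapshot of the clock hand, "last io" is the wrapping distance hand - io_time, and once the largest such distance on any device approaches U16_MAX the distances are halved so the hand can keep advancing. A self-contained simulation of that saturate-and-rescale behaviour (bucket and tick counts are arbitrary):

#include <stdint.h>
#include <stdio.h>

#define NBUCKETS 4

int main(void)
{
        uint16_t hand = 1, io_time[NBUCKETS] = { 1, 1, 1, 1 };
        unsigned tick, i;

        for (tick = 0; tick < 200000; tick++) {
                uint16_t max_last_io = 0;

                for (i = 0; i < NBUCKETS; i++) {
                        uint16_t last_io = (uint16_t) (hand - io_time[i]);

                        if (last_io > max_last_io)
                                max_last_io = last_io;
                }

                /* as in bch2_inc_clock_hand(): rescale before last_io can wrap */
                if (max_last_io >= UINT16_MAX - 2) {
                        for (i = 0; i < NBUCKETS; i++)
                                io_time[i] = (uint16_t) (hand -
                                        (uint16_t) (hand - io_time[i]) / 2);
                        printf("rescaled io times at tick %u\n", tick);
                }
                hand++;
        }
        return 0;
}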
@@ -484,9 +483,9 @@ static void bch2_inc_clock_hand(struct io_timer *timer)
bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}
-static void bch2_prio_timer_init(struct bch_fs *c, int rw)
+static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
- struct prio_clock *clock = &c->prio_clock[rw];
+ struct bucket_clock *clock = &c->bucket_clock[rw];
clock->hand = 1;
clock->rw = rw;
@@ -536,7 +535,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
- ret = -1;
+ ret = 1;
break;
}
@@ -635,13 +634,14 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark m)
{
+ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
+ unsigned max_last_io = ca->max_last_bucket_io[READ];
+
/*
* Time since last read, scaled to [0, 8) where larger value indicates
* more recently read data:
*/
- unsigned long hotness =
- (bucket(ca, b)->prio[READ] - ca->min_prio[READ]) * 7 /
- (c->prio_clock[READ].hand - ca->min_prio[READ]);
+ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
/* How much we want to keep the data in this bucket: */
unsigned long data_wantness =
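For a feel for the scaling: with max_last_io of 1000, a bucket read this instant (last_io 0) gets hotness 7, one read halfway back gets 3, and the coldest bucket gets 0. A tiny sketch, with invented sample values and the max_last_io == 0 guard omitted:

#include <stdio.h>

/* hotness in [0, 7]: 7 = read just now, 0 = oldest bucket on the device */
static unsigned long hotness(unsigned last_io, unsigned max_last_io)
{
        return (unsigned long) (max_last_io - last_io) * 7 / max_last_io;
}

int main(void)
{
        unsigned samples[] = { 0, 250, 500, 1000 };
        unsigned i;

        for (i = 0; i < 4; i++)
                printf("last_io %4u/1000 -> hotness %lu\n",
                       samples[i], hotness(samples[i], 1000));
        return 0;
}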
@@ -659,23 +659,25 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
struct alloc_heap_entry l,
struct alloc_heap_entry r)
{
- return (l.key > r.key) - (l.key < r.key);
+ return (l.key > r.key) - (l.key < r.key) ?:
+ (l.nr < r.nr) - (l.nr > r.nr) ?:
+ (l.bucket > r.bucket) - (l.bucket < r.bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
- struct alloc_heap_entry e;
+ struct alloc_heap_entry e = { 0 };
size_t b;
ca->alloc_heap.used = 0;
- mutex_lock(&c->prio_clock[READ].lock);
+ mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
- bch2_recalc_min_prio(c, ca, READ);
+ bch2_recalc_oldest_io(c, ca, READ);
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@@ -684,30 +686,45 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+ unsigned long key = bucket_sort_key(c, ca, b, m);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .key = bucket_sort_key(c, ca, b, m)
- };
+ if (e.nr && e.bucket + e.nr == b && e.key == key) {
+ e.nr++;
+ } else {
+ if (e.nr)
+ heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+
+ e = (struct alloc_heap_entry) {
+ .bucket = b,
+ .nr = 1,
+ .key = key,
+ };
+ }
- heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+ cond_resched();
}
+ if (e.nr)
+ heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+
up_read(&ca->bucket_lock);
- mutex_unlock(&c->prio_clock[READ].lock);
+ mutex_unlock(&c->bucket_clock[READ].lock);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
- /*
- * If we run out of buckets to invalidate, bch2_allocator_thread() will
- * kick stuff and retry us
- */
- while (!fifo_full(&ca->free_inc) &&
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
- bch2_invalidate_one_bucket(c, ca, e.bucket);
+ while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
+ for (b = e.bucket;
+ b < e.bucket + e.nr;
+ b++) {
+ if (fifo_full(&ca->free_inc))
+ return;
+
+ bch2_invalidate_one_bucket(c, ca, b);
+ }
+ }
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
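The heap rework above batches consecutive buckets whose sort keys match into one {bucket, nr, key} entry, with bucket_alloc_cmp now breaking key ties in favour of longer runs and then lower bucket numbers. The coalescing pass in isolation, with the heap replaced by a print so the sketch stays standalone:

#include <stddef.h>
#include <stdio.h>

struct entry { size_t bucket; size_t nr; unsigned long key; };

static void emit(struct entry e)
{
        /* heap_add_or_replace() in the real code */
        printf("run: bucket %zu, nr %zu, key %lu\n", e.bucket, e.nr, e.key);
}

int main(void)
{
        /* per-bucket sort keys; equal neighbours coalesce into one entry */
        unsigned long keys[] = { 3, 3, 3, 1, 1, 7 };
        struct entry e = { 0 };
        size_t b;

        for (b = 0; b < sizeof(keys) / sizeof(keys[0]); b++) {
                if (e.nr && e.bucket + e.nr == b && e.key == keys[b]) {
                        e.nr++;
                } else {
                        if (e.nr)
                                emit(e);
                        e = (struct entry) { .bucket = b, .nr = 1, .key = keys[b] };
                }
        }
        if (e.nr)
                emit(e);        /* don't drop the trailing run */
        return 0;
}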
@@ -729,6 +746,8 @@ static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
+
+ cond_resched();
}
}
@@ -749,6 +768,8 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
+
+ cond_resched();
}
}
@@ -850,7 +871,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
if ((current->flags & PF_KTHREAD) &&
kthread_should_stop()) {
- ret = -1;
+ ret = 1;
break;
}
@@ -880,7 +901,7 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
ca->mi.bucket_size, GFP_NOIO, 0);
if (push_invalidated_bucket(c, ca, bucket))
- return -1;
+ return 1;
}
return 0;
@@ -905,17 +926,32 @@ static int bch2_allocator_thread(void *arg)
while (1) {
while (1) {
+ cond_resched();
+
+ pr_debug("discarding %zu invalidated buckets",
+ ca->nr_invalidated);
+
ret = discard_invalidated_buckets(c, ca);
if (ret)
- return 0;
+ goto stop;
if (fifo_empty(&ca->free_inc))
break;
+ pr_debug("invalidating %zu buckets",
+ fifo_used(&ca->free_inc));
+
journal_seq = 0;
ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX);
- if (ret)
- return 0;
+ if (ret) {
+ bch_err(ca, "error invalidating buckets: %i", ret);
+ goto stop;
+ }
+
+ if (!ca->nr_invalidated) {
+ bch_err(ca, "allocator thread unable to make forward progress!");
+ goto stop;
+ }
if (ca->allocator_invalidating_data)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
@@ -927,22 +963,29 @@ static int bch2_allocator_thread(void *arg)
* journal error - buckets haven't actually been
* invalidated, can't discard them:
*/
- if (ret)
- return 0;
+ if (ret) {
+ bch_err(ca, "journal error: %i", ret);
+ goto stop;
+ }
}
+ pr_debug("free_inc now empty");
+
/* Reset front/back so we can easily sort fifo entries later: */
ca->free_inc.front = ca->free_inc.back = 0;
ca->allocator_journal_seq_flush = 0;
ca->allocator_invalidating_data = false;
down_read(&c->gc_lock);
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
- up_read(&c->gc_lock);
- return 0;
- }
-
while (1) {
+ size_t prev = fifo_used(&ca->free_inc);
+
+ if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
+ up_read(&c->gc_lock);
+ bch_err(ca, "gc failure");
+ goto stop;
+ }
+
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@@ -950,7 +993,14 @@ static int bch2_allocator_thread(void *arg)
* another cache tier
*/
+ pr_debug("scanning for reclaimable buckets");
+
find_reclaimable_buckets(c, ca);
+
+ pr_debug("found %zu buckets (free_inc %zu/%zu)",
+ fifo_used(&ca->free_inc) - prev,
+ fifo_used(&ca->free_inc), ca->free_inc.size);
+
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
@@ -977,15 +1027,20 @@ static int bch2_allocator_thread(void *arg)
ca->allocator_blocked = true;
closure_wake_up(&c->freelist_wait);
- if (wait_buckets_available(c, ca)) {
+ ret = wait_buckets_available(c, ca);
+ if (ret) {
up_read(&c->gc_lock);
- return 0;
+ goto stop;
}
}
ca->allocator_blocked = false;
up_read(&c->gc_lock);
+ pr_debug("free_inc now %zu/%zu",
+ fifo_used(&ca->free_inc),
+ ca->free_inc.size);
+
sort_free_inc(c, ca);
/*
@@ -993,6 +1048,10 @@ static int bch2_allocator_thread(void *arg)
* write out the new bucket gens:
*/
}
+
+stop:
+ pr_debug("alloc thread stopping (ret %i)", ret);
+ return 0;
}
/* Allocation */
@@ -1046,8 +1105,8 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
return ob;
}
-/* _only_ for allocating the journal and btree roots on a brand new fs: */
-int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
+/* _only_ for allocating the journal on a new device: */
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
struct bucket_array *buckets;
ssize_t b;
@@ -1056,14 +1115,8 @@ int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
buckets = bucket_array(ca);
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
- if (is_available_bucket(buckets->b[b].mark)) {
- bch2_mark_alloc_bucket(c, ca, b, true,
- gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- set_bit(b, ca->buckets_dirty);
+ if (is_available_bucket(buckets->b[b].mark))
goto success;
- }
b = -1;
success:
rcu_read_unlock();
@@ -1135,9 +1188,8 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
break;
}
- if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) &&
- (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0)
- goto out;
+ if (cl)
+ closure_wait(&c->freelist_wait, cl);
spin_unlock(&c->freelist_lock);
@@ -1218,7 +1270,7 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
*v = *v < scale ? 0 : *v - scale;
}
-static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
+static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
struct write_point *wp,
unsigned nr_replicas,
enum alloc_reserve reserve,
@@ -1284,52 +1336,22 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
break;
}
}
+ rcu_read_unlock();
EBUG_ON(reserve == RESERVE_MOVINGGC &&
ret != ALLOC_SUCCESS &&
ret != OPEN_BUCKETS_EMPTY);
- rcu_read_unlock();
- return ret;
-}
-
-static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
- unsigned nr_replicas,
- enum alloc_reserve reserve,
- struct bch_devs_mask *devs,
- struct closure *cl)
-{
- bool waiting = false;
-
- while (1) {
- switch (__bch2_bucket_alloc_set(c, wp, nr_replicas,
- reserve, devs, cl)) {
- case ALLOC_SUCCESS:
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-
- return 0;
-
- case NO_DEVICES:
- if (waiting)
- closure_wake_up(&c->freelist_wait);
- return -EROFS;
-
- case FREELIST_EMPTY:
- if (!cl)
- return -ENOSPC;
- if (waiting)
- return -EAGAIN;
-
- /* Retry allocation after adding ourself to waitlist: */
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- break;
- case OPEN_BUCKETS_EMPTY:
- return cl ? -EAGAIN : -ENOSPC;
- default:
- BUG();
- }
+ switch (ret) {
+ case ALLOC_SUCCESS:
+ return 0;
+ case NO_DEVICES:
+ return -EROFS;
+ case FREELIST_EMPTY:
+ case OPEN_BUCKETS_EMPTY:
+ return cl ? -EAGAIN : -ENOSPC;
+ default:
+ BUG();
}
}
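With the retry loop gone, waiting moved into bch2_bucket_alloc() itself (the closure_wait() hunk above): the caller is put on c->freelist_wait while the freelist lock is still held, then handed -EAGAIN to sleep and retry. A pthread sketch of why registering before unlocking matters; the names and the -EAGAIN constant are stand-ins:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t freelist_lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_free;

/*
 * On an empty freelist, get onto the waitlist while still holding
 * freelist_lock, then fail with -EAGAIN: registering before unlocking is
 * what closure_wait(&c->freelist_wait, cl) buys - a bucket freed after the
 * check can't produce a lost wakeup.
 */
static long bucket_alloc(bool *on_waitlist)
{
        long ret;

        pthread_mutex_lock(&freelist_lock);
        if (nr_free > 0) {
                ret = --nr_free;
        } else {
                *on_waitlist = true;    /* stands in for closure_wait() */
                ret = -11;              /* -EAGAIN */
        }
        pthread_mutex_unlock(&freelist_lock);
        return ret;
}

int main(void)
{
        bool on_waitlist = false;

        nr_free = 1;
        printf("first alloc: %ld\n", bucket_alloc(&on_waitlist));
        printf("second alloc: %ld, on waitlist: %d\n",
               bucket_alloc(&on_waitlist), on_waitlist);
        return 0;
}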
@@ -1530,11 +1552,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
nr_ptrs_have = wp->first_ptr;
/* does writepoint have ptrs we don't want to use? */
- writepoint_for_each_ptr(wp, ob, i)
- if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
- swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
- wp->first_ptr++;
- }
+ if (target)
+ writepoint_for_each_ptr(wp, ob, i)
+ if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
+ swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
+ wp->first_ptr++;
+ }
if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) {
ret = open_bucket_add_buckets(c, target, wp, devs_have,
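Pointers a writepoint would rather not use are parked in front of wp->first_ptr by swapping them down and bumping the index, now skipped entirely when no target is set. The partition idiom on its own, with invented device numbers and target set:

#include <stdio.h>

static void swap_int(int *a, int *b) { int t = *a; *a = *b; *b = t; }

int main(void)
{
        /* devices backing a writepoint; target is "devs 2 and 3 only" */
        int ptrs[] = { 2, 5, 3, 7 };
        unsigned first_ptr = 0, i;

        for (i = 0; i < 4; i++)
                if (ptrs[i] != 2 && ptrs[i] != 3) {     /* !dev_idx_in_target() */
                        swap_int(&ptrs[i], &ptrs[first_ptr]);
                        first_ptr++;
                }

        /* ptrs[first_ptr..] are now the pointers we're willing to use */
        for (i = first_ptr; i < 4; i++)
                printf("usable dev %d\n", ptrs[i]);
        return 0;
}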
@@ -1551,7 +1574,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
nr_replicas, reserve, cl);
}
- if (ret)
+ if (ret && ret != -EROFS)
goto err;
alloc_done:
/* check for more than one cache: */
@@ -1584,6 +1607,13 @@ alloc_done:
nr_ptrs_effective += ca->mi.durability;
}
+ if (ret == -EROFS &&
+ nr_ptrs_effective >= nr_replicas_required)
+ ret = 0;
+
+ if (ret)
+ goto err;
+
if (nr_ptrs_effective > nr_replicas) {
writepoint_for_each_ptr(wp, ob, i) {
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
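The -EROFS tolerance added in this hunk works off durability: if the pointers already obtained sum to at least nr_replicas_required worth of durability, failing to allocate on further devices is forgiven. A worked sketch with made-up durability values:

#include <stdio.h>

int main(void)
{
        /* durability of each device we did allocate on; a cache device
         * might report 0, a multi-disk group more than 1 */
        unsigned durability[] = { 1, 2 };
        unsigned nr_replicas_required = 2, nr_ptrs_effective = 0, i;
        int ret = -30;          /* -EROFS: no more devices available */

        for (i = 0; i < 2; i++)
                nr_ptrs_effective += durability[i];

        if (ret == -30 && nr_ptrs_effective >= nr_replicas_required)
                ret = 0;        /* what we have is durable enough */

        printf("effective ptrs %u, ret %d\n", nr_ptrs_effective, ret);
        return 0;
}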
@@ -1749,14 +1779,14 @@ void bch2_recalc_capacity(struct bch_fs *c)
if (c->capacity) {
bch2_io_timer_add(&c->io_clock[READ],
- &c->prio_clock[READ].rescale);
+ &c->bucket_clock[READ].rescale);
bch2_io_timer_add(&c->io_clock[WRITE],
- &c->prio_clock[WRITE].rescale);
+ &c->bucket_clock[WRITE].rescale);
} else {
bch2_io_timer_del(&c->io_clock[READ],
- &c->prio_clock[READ].rescale);
+ &c->bucket_clock[READ].rescale);
bch2_io_timer_del(&c->io_clock[WRITE],
- &c->prio_clock[WRITE].rescale);
+ &c->bucket_clock[WRITE].rescale);
}
/* Wake up in case someone was waiting for buckets */
@@ -1889,7 +1919,8 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
if (ca->alloc_thread)
return 0;
- p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
+ p = kthread_create(bch2_allocator_thread, ca,
+ "bch_alloc[%s]", ca->name);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -1923,7 +1954,7 @@ static void allocator_start_issue_discards(struct bch_fs *c)
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
- size_t bu, i, devs_have_enough = 0;
+ size_t bu, i;
unsigned dev_iter;
u64 journal_seq = 0;
bool invalidating_data = false;
@@ -1964,16 +1995,21 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
/* did we find enough buckets? */
for_each_rw_member(ca, c, dev_iter)
- devs_have_enough += (fifo_used(&ca->free_inc) >=
- ca->free[RESERVE_BTREE].size);
+ if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
+ percpu_ref_put(&ca->io_ref);
+ goto not_enough;
+ }
- if (devs_have_enough >= c->opts.metadata_replicas)
- return 0;
+ return 0;
+not_enough:
+ pr_debug("did not find enough empty buckets; issuing discards");
/* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
for_each_rw_member(ca, c, dev_iter)
discard_invalidated_buckets(c, ca);
+ pr_debug("scanning for reclaimable buckets");
+
for_each_rw_member(ca, c, dev_iter) {
BUG_ON(!fifo_empty(&ca->free_inc));
ca->free_inc.front = ca->free_inc.back = 0;
@@ -1988,6 +2024,8 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
break;
}
+ pr_debug("done scanning for reclaimable buckets");
+
/*
* We're moving buckets to freelists _before_ they've been marked as
* invalidated on disk - we have to so that we can allocate new btree
@@ -1997,10 +2035,13 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
* have cached data in them, which is live until they're marked as
* invalidated on disk:
*/
- if (invalidating_data)
+ if (invalidating_data) {
+ pr_debug("invalidating existing data");
set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
- else
+ } else {
+ pr_debug("issuing discards");
allocator_start_issue_discards(c);
+ }
/*
* XXX: it's possible for this to deadlock waiting on journal reclaim,
@@ -2017,13 +2058,15 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
}
if (invalidating_data) {
+ pr_debug("flushing journal");
+
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
if (ret)
return ret;
- }
- if (invalidating_data)
+ pr_debug("issuing discards");
allocator_start_issue_discards(c);
+ }
for_each_rw_member(ca, c, dev_iter)
while (ca->nr_invalidated) {
@@ -2038,19 +2081,43 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
+ bool flush_updates;
+ size_t nr_pending_updates;
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
+ pr_debug("flushing dirty btree nodes");
+ cond_resched();
+
+ flush_updates = false;
+ nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
+
+
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_dirty(b) && (!b->written || b->level)) {
- rcu_read_unlock();
- six_lock_read(&b->lock);
- bch2_btree_node_write(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- goto again;
+ if (btree_node_may_write(b)) {
+ rcu_read_unlock();
+ six_lock_read(&b->lock);
+ bch2_btree_node_write(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->lock);
+ goto again;
+ } else {
+ flush_updates = true;
+ }
}
rcu_read_unlock();
+
+ /*
+ * This is ugly, but it's needed to flush btree node writes
+ * without spinning...
+ */
+ if (flush_updates) {
+ closure_wait_event(&c->btree_interior_update_wait,
+ bch2_btree_interior_updates_nr_pending(c) <
+ nr_pending_updates);
+ goto again;
+ }
}
return 0;
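Dirty nodes that btree_node_may_write() rejects are pinned by in-flight interior updates, so rather than spin, the loop above snapshots the pending-update count and sleeps until it drops before rescanning. A minimal analogue with a condition variable standing in for closure_wait_event():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t updates_wait = PTHREAD_COND_INITIALIZER;
static unsigned nr_pending;     /* bch2_btree_interior_updates_nr_pending() */

/*
 * Sleep until fewer updates are pending than when we started scanning,
 * i.e. until at least one has completed, then let the caller rescan for
 * newly writable nodes.
 */
static void wait_for_progress(unsigned snapshot)
{
        pthread_mutex_lock(&lock);
        while (nr_pending >= snapshot)
                pthread_cond_wait(&updates_wait, &lock);
        pthread_mutex_unlock(&lock);
}

static void *complete_one_update(void *arg)
{
        (void) arg;
        pthread_mutex_lock(&lock);
        nr_pending--;
        pthread_cond_broadcast(&updates_wait);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;
        unsigned snapshot = nr_pending = 1;

        pthread_create(&t, NULL, complete_one_update, NULL);
        wait_for_progress(snapshot);
        pthread_join(&t, NULL);
        printf("an interior update completed; rescanning dirty nodes\n");
        return 0;
}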
@@ -2087,8 +2154,8 @@ void bch2_fs_allocator_init(struct bch_fs *c)
mutex_init(&c->write_points_hash_lock);
spin_lock_init(&c->freelist_lock);
- bch2_prio_timer_init(c, READ);
- bch2_prio_timer_init(c, WRITE);
+ bch2_bucket_clock_init(c, READ);
+ bch2_bucket_clock_init(c, WRITE);
/* open bucket 0 is a sentinel NULL: */
spin_lock_init(&c->open_buckets[0].lock);