author    | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-01 01:45:15 -0900
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-09 09:14:11 -0900
commit    | a17f7bcec7ed810a247c24e56229af8f43a9a6ae (patch)
tree      | 1b2d60b21661bd2991324e3efaa83b3cdd87a783 /libbcache
parent    | 171ee48e57be78f4e95954c99851553fa523bf91 (diff)
cmd_migrate
Diffstat (limited to 'libbcache')
37 files changed, 1264 insertions(+), 920 deletions(-)
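One recurring change in the hunks below is that `struct bucket_mark` stops carrying a single `is_metadata` bit and instead records a small `data_type` (data, btree, prios, journal, superblock), and the open-coded journal-sequence test in `bch_find_empty_buckets()` is factored into `bucket_needs_journal_commit()`. The following is a minimal, self-contained sketch of that logic in plain C; the struct layout, field widths, and the `main()` driver are simplified stand-ins for illustration, not the real packed `bucket_mark`.

```c
/*
 * Sketch of the bucket-metadata typing introduced below: a data_type enum
 * replaces the old is_metadata bit, and the "is this bucket still waiting on
 * a journal write?" test becomes bucket_needs_journal_commit(), which compares
 * truncated 16-bit sequence numbers with signed arithmetic so the comparison
 * survives wraparound. Simplified stand-in types, not the real bcache structs.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum bucket_data_type {
	BUCKET_DATA = 0,
	BUCKET_BTREE,
	BUCKET_PRIOS,
	BUCKET_JOURNAL,
	BUCKET_SB,
};

struct bucket_mark {
	unsigned	journal_seq_valid:1;
	unsigned	owned_by_allocator:1;
	unsigned	data_type:3;
	uint16_t	dirty_sectors;
	uint16_t	cached_sectors;
	uint16_t	journal_seq;	/* low bits of journal seq at last modification */
};

/* Bucket can't be reused until the journal entry referencing it is on disk: */
static bool bucket_needs_journal_commit(struct bucket_mark m, uint16_t last_seq_ondisk)
{
	return m.journal_seq_valid &&
		((int16_t) m.journal_seq - (int16_t) last_seq_ondisk > 0);
}

/* Simplified availability check: only untyped, clean, unclaimed buckets qualify. */
static bool is_available_bucket(struct bucket_mark m)
{
	return !m.owned_by_allocator &&
		m.data_type == BUCKET_DATA &&
		!m.dirty_sectors;
}

int main(void)
{
	struct bucket_mark m = {
		.journal_seq_valid = 1,
		.data_type	   = BUCKET_DATA,
		.journal_seq	   = 10,
	};

	/* Journal only flushed through seq 5: still needs a commit. */
	printf("needs commit: %d\n", bucket_needs_journal_commit(m, 5));
	/* Once seq 10 (or later) is on disk, it no longer does. */
	printf("needs commit: %d\n", bucket_needs_journal_commit(m, 12));
	printf("available:    %d\n", is_available_bucket(m));
	return 0;
}
```

With the type recorded per bucket, `bucket_stats_update()` in buckets.c (further down in the diff) can flag two different kinds of metadata landing in the same bucket as a filesystem inconsistency instead of leaving it undetected.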
diff --git a/libbcache/alloc.c b/libbcache/alloc.c index 8cb31944..93f0c2f1 100644 --- a/libbcache/alloc.c +++ b/libbcache/alloc.c @@ -73,7 +73,6 @@ #include <linux/rcupdate.h> #include <trace/events/bcache.h> -static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve); static void __bch_bucket_free(struct cache *, struct bucket *); /* Allocation groups: */ @@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca) spin_lock(&grp->lock); - for (i = 0; i < grp->nr_devices; i++) + for (i = 0; i < grp->nr; i++) if (rcu_access_pointer(grp->d[i].dev) == ca) { - grp->nr_devices--; + grp->nr--; memmove(&grp->d[i], &grp->d[i + 1], - (grp->nr_devices - i) * sizeof(grp->d[0])); + (grp->nr- i) * sizeof(grp->d[0])); break; } @@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca) unsigned i; spin_lock(&grp->lock); - for (i = 0; i < grp->nr_devices; i++) + for (i = 0; i < grp->nr; i++) if (rcu_access_pointer(grp->d[i].dev) == ca) goto out; - BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX); + BUG_ON(grp->nr>= BCH_SB_MEMBERS_MAX); - rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca); + rcu_assign_pointer(grp->d[grp->nr++].dev, ca); out: spin_unlock(&grp->lock); } @@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work) struct cache_set, pd_controllers_update); struct cache *ca; - unsigned iter; - int i; + unsigned i, iter; /* All units are in bytes */ - u64 tier_size[BCH_TIER_MAX]; - u64 tier_free[BCH_TIER_MAX]; - u64 tier_dirty[BCH_TIER_MAX]; - u64 tier0_can_free = 0; + u64 faster_tiers_size = 0; + u64 faster_tiers_dirty = 0; - memset(tier_size, 0, sizeof(tier_size)); - memset(tier_free, 0, sizeof(tier_free)); - memset(tier_dirty, 0, sizeof(tier_dirty)); + u64 fastest_tier_size = 0; + u64 fastest_tier_free = 0; + u64 copygc_can_free = 0; rcu_read_lock(); - for (i = BCH_TIER_MAX - 1; i >= 0; --i) - group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) { + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + bch_pd_controller_update(&c->tiers[i].pd, + div_u64(faster_tiers_size * + c->tiering_percent, 100), + faster_tiers_dirty, + -1); + + group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) { struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca); unsigned bucket_bits = ca->bucket_bits + 9; + u64 size = (ca->mi.nbuckets - + ca->mi.first_bucket) << bucket_bits; + u64 dirty = stats.buckets_dirty << bucket_bits; + u64 free = __buckets_free_cache(ca, stats) << bucket_bits; /* * Bytes of internal fragmentation, which can be * reclaimed by copy GC @@ -149,41 +155,30 @@ static void pd_controllers_update(struct work_struct *work) ((stats.sectors_dirty + stats.sectors_cached) << 9); - u64 dev_size = (ca->mi.nbuckets - - ca->mi.first_bucket) << bucket_bits; - - u64 free = __buckets_free_cache(ca, stats) << bucket_bits; - if (fragmented < 0) fragmented = 0; bch_pd_controller_update(&ca->moving_gc_pd, free, fragmented, -1); - if (i == 0) - tier0_can_free += fragmented; - - tier_size[i] += dev_size; - tier_free[i] += free; - tier_dirty[i] += stats.buckets_dirty << bucket_bits; - } - rcu_read_unlock(); - - if (tier_size[1]) { - u64 target = div_u64(tier_size[0] * c->tiering_percent, 100); + faster_tiers_size += size; + faster_tiers_dirty += dirty; - tier0_can_free = max_t(s64, 0, tier_dirty[0] - target); + if (!c->fastest_tier || + c->fastest_tier == &c->tiers[i]) { + fastest_tier_size += size; + fastest_tier_free += free; + } - bch_pd_controller_update(&c->tiering_pd, - target, - tier_dirty[0], - -1); + 
copygc_can_free += fragmented; + } } + rcu_read_unlock(); + /* * Throttle foreground writes if tier 0 is running out of free buckets, - * and either tiering or copygc can free up space (but don't take both - * into account). + * and either tiering or copygc can free up space. * * Target will be small if there isn't any work to do - we don't want to * throttle foreground writes if we currently have all the free space @@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work) * Otherwise, if there's work to do, try to keep 20% of tier0 available * for foreground writes. */ + if (c->fastest_tier) + copygc_can_free = U64_MAX; + bch_pd_controller_update(&c->foreground_write_pd, - min(tier0_can_free, - div_u64(tier_size[0] * + min(copygc_can_free, + div_u64(fastest_tier_size * c->foreground_target_percent, 100)), - tier_free[0], + fastest_tier_free, -1); schedule_delayed_work(&c->pd_controllers_update, @@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca) * it getting gc'd from under us */ ca->prio_buckets[i] = r; - bch_mark_metadata_bucket(ca, ca->buckets + r, false); + bch_mark_metadata_bucket(ca, ca->buckets + r, + BUCKET_PRIOS, false); spin_unlock(&ca->prio_buckets_lock); SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c)); @@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca) do { unsigned u64s = jset_u64s(0); + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) + break; + ret = bch_journal_res_get(j, &res, u64s, u64s); if (ret) return ret; @@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca) if (is_available_bucket(m) && !m.cached_sectors && !m.had_metadata && - (!m.wait_on_journal || - ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) { + !bucket_needs_journal_commit(m, last_seq_ondisk)) { spin_lock(&ca->freelist_lock); bch_mark_alloc_bucket(ca, g, true); @@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg) set_freezable(); + bch_find_empty_buckets(c, ca); + while (1) { /* * First, we pull buckets off of the free_inc list, possibly @@ -894,7 +897,7 @@ static int bch_allocator_thread(void *arg) * See if we have buckets we can reuse without invalidating them * or forcing a journal commit: */ - bch_find_empty_buckets(c, ca); + //bch_find_empty_buckets(c, ca); if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { up_read(&c->gc_lock); @@ -967,7 +970,7 @@ out: * * Returns index of bucket on success, 0 on failure * */ -static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve) +size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve) { struct bucket *g; long r; @@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c, u64 available_buckets = 1; /* avoid a divide by zero... 
*/ unsigned i; - for (i = 0; i < devs->nr_devices; i++) { + for (i = 0; i < devs->nr; i++) { ca = devs->d[i].dev; devs->d[i].weight = buckets_free_cache(ca); available_buckets += devs->d[i].weight; } - for (i = 0; i < devs->nr_devices; i++) { + for (i = 0; i < devs->nr; i++) { const unsigned min_weight = U32_MAX >> 4; const unsigned max_weight = U32_MAX; devs->d[i].weight = min_weight + div64_u64(devs->d[i].weight * - devs->nr_devices * + devs->nr * (max_weight - min_weight), available_buckets); devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight); @@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, rcu_read_lock(); spin_lock(&devs->lock); - for (i = 0; i < devs->nr_devices; i++) + for (i = 0; i < devs->nr; i++) available += !test_bit(devs->d[i].dev->dev_idx, caches_used); @@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, } i++; - i %= devs->nr_devices; + i %= devs->nr; ret = FREELIST_EMPTY; if (i == fail_idx) @@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, enum alloc_reserve reserve, long *caches_used) { + struct bch_tier *tier; /* * this should implement policy - for a given type of allocation, decide * which devices to allocate from: * * XXX: switch off wp->type and do something more intelligent here */ + if (wp->group) + return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, + wp->group, caches_used); - /* foreground writes: prefer tier 0: */ - if (wp->group == &c->cache_all) + /* foreground writes: prefer fastest tier: */ + tier = READ_ONCE(c->fastest_tier); + if (tier) bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &c->cache_tiers[0], caches_used); + &tier->devs, caches_used); return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - wp->group, caches_used); + &c->cache_all, caches_used); } static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp, @@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c, ? 
0 : BTREE_NODE_RESERVE; int ret; - BUG_ON(!wp->group); BUG_ON(!reserve); BUG_ON(!nr_replicas); retry: @@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, unsigned nr_replicas, struct open_bucket *ob, unsigned sectors) { - struct bch_extent_ptr tmp, *ptr; + struct bch_extent_ptr tmp; struct cache *ca; bool has_data = false; unsigned i; @@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, if (nr_replicas < ob->nr_ptrs) has_data = true; + rcu_read_lock(); + for (i = 0; i < nr_replicas; i++) { EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); @@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, extent_ptr_append(e, tmp); ob->ptr_offset[i] += sectors; + + if ((ca = PTR_CACHE(c, &ob->ptrs[i]))) + this_cpu_add(*ca->sectors_written, sectors); } - open_bucket_for_each_online_device(c, ob, ptr, ca) - this_cpu_add(*ca->sectors_written, sectors); + rcu_read_unlock(); } /* @@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, /* Startup/shutdown (ro/rw): */ -static void bch_recalc_capacity(struct cache_set *c) +void bch_recalc_capacity(struct cache_set *c) { - struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers); + struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; struct cache *ca; u64 total_capacity, capacity = 0, reserved_sectors = 0; unsigned long ra_pages = 0; @@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c) c->bdi.ra_pages = ra_pages; + /* Find fastest, slowest tiers with devices: */ + + for (tier = c->tiers; + tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; + if (!fastest_tier) + fastest_tier = tier; + slowest_tier = tier; + } + + c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; + + c->promote_write_point.group = &fastest_tier->devs; + + if (!fastest_tier) + goto set_capacity; + /* * Capacity of the cache set is the capacity of all the devices in the * slowest (highest) tier - we don't include lower tier devices. 
*/ - for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1; - tier > c->cache_tiers && !tier->nr_devices; - --tier) - ; - - group_for_each_cache_rcu(ca, tier, i) { + group_for_each_cache_rcu(ca, &slowest_tier->devs, i) { size_t reserve = 0; /* @@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c) ca->mi.first_bucket) << ca->bucket_bits; } +set_capacity: rcu_read_unlock(); - total_capacity = capacity; capacity *= (100 - c->opts.gc_reserve_percent); @@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca) void bch_dev_allocator_stop(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *tier = &c->tiers[ca->mi.tier].devs; struct task_struct *p; struct closure cl; unsigned i; @@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca) int bch_dev_allocator_start(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *tier = &c->tiers[ca->mi.tier].devs; struct task_struct *k; /* @@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca) bch_dev_group_add(tier, ca); bch_dev_group_add(&c->cache_all, ca); + bch_dev_group_add(&c->journal.devs, ca); bch_recalc_capacity(c); @@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca) return 0; } -void bch_open_buckets_init(struct cache_set *c) +void bch_fs_allocator_init(struct cache_set *c) { unsigned i; @@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c) spin_lock_init(&c->cache_all.lock); - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) { - c->write_points[i].throttle = true; - c->write_points[i].group = &c->cache_tiers[0]; - } - - for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++) - spin_lock_init(&c->cache_tiers[i].lock); + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) + spin_lock_init(&c->tiers[i].devs.lock); - c->promote_write_point.group = &c->cache_tiers[0]; - - c->migration_write_point.group = &c->cache_all; - - c->btree_write_point.group = &c->cache_all; + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + c->write_points[i].throttle = true; c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/libbcache/alloc.h b/libbcache/alloc.h index 09139a59..9573dd2c 100644 --- a/libbcache/alloc.h +++ b/libbcache/alloc.h @@ -27,6 +27,8 @@ int bch_prio_read(struct cache *); void bch_recalc_min_prio(struct cache *, int); +size_t bch_bucket_alloc(struct cache *, enum alloc_reserve); + void bch_open_bucket_put(struct cache_set *, struct open_bucket *); struct open_bucket *bch_alloc_sectors_start(struct cache_set *, @@ -58,7 +60,7 @@ static inline struct cache *cache_group_next_rcu(struct cache_group *devs, { struct cache *ret = NULL; - while (*iter < devs->nr_devices && + while (*iter < devs->nr && !(ret = rcu_dereference(devs->d[*iter].dev))) (*iter)++; @@ -103,8 +105,9 @@ static inline struct cache *cache_group_next(struct cache_group *devs, ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\ (_ptr)++) +void bch_recalc_capacity(struct cache_set *); void bch_dev_allocator_stop(struct cache *); int bch_dev_allocator_start(struct cache *); -void bch_open_buckets_init(struct cache_set *); +void bch_fs_allocator_init(struct cache_set *); #endif /* _BCACHE_ALLOC_H */ diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h index fbe8b75c..f408bd97 100644 --- a/libbcache/alloc_types.h +++ b/libbcache/alloc_types.h @@ -51,7 +51,7 @@ 
static inline bool allocation_is_metadata(enum alloc_reserve id) struct cache_group { spinlock_t lock; - unsigned nr_devices; + unsigned nr; unsigned cur_device; struct { u64 weight; diff --git a/libbcache/bcache.h b/libbcache/bcache.h index babc08db..5b668c71 100644 --- a/libbcache/bcache.h +++ b/libbcache/bcache.h @@ -464,24 +464,10 @@ struct cache { * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching * all the backing devices first (their cached data gets invalidated, and they * won't automatically reattach). - * - * BCH_FS_STOPPING always gets set first when we're closing down a cache set; - * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e. - * flushing dirty data). - * - * BCH_FS_RUNNING means all cache devices have been registered and journal - * replay is complete. */ enum { - /* Startup: */ BCH_FS_INITIAL_GC_DONE, - BCH_FS_RUNNING, - - /* Shutdown: */ BCH_FS_DETACHING, - BCH_FS_STOPPING, - BCH_FS_RO, - BCH_FS_RO_COMPLETE, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_GC_STOPPING, @@ -498,6 +484,21 @@ struct btree_debug { struct dentry *failed; }; +struct bch_tier { + unsigned idx; + struct task_struct *migrate; + struct bch_pd_controller pd; + + struct cache_group devs; +}; + +enum bch_fs_state { + BCH_FS_STARTING = 0, + BCH_FS_STOPPING, + BCH_FS_RO, + BCH_FS_RW, +}; + struct cache_set { struct closure cl; @@ -506,7 +507,6 @@ struct cache_set { struct kobject internal; struct kobject opts_dir; struct kobject time_stats; - struct completion *stop_completion; unsigned long flags; int minor; @@ -514,6 +514,10 @@ struct cache_set { struct super_block *vfs_sb; char name[40]; + /* ro/rw, add/remove devices: */ + struct mutex state_lock; + enum bch_fs_state state; + /* Counts outstanding writes, for clean transition to read-only */ struct percpu_ref writes; struct work_struct read_only_work; @@ -640,7 +644,9 @@ struct cache_set { * allocate from: */ struct cache_group cache_all; - struct cache_group cache_tiers[BCH_TIER_MAX]; + struct bch_tier tiers[BCH_TIER_MAX]; + /* NULL if we only have devices in one tier: */ + struct bch_tier *fastest_tier; u64 capacity; /* sectors */ @@ -753,10 +759,6 @@ struct cache_set { unsigned writeback_pages_max; atomic_long_t nr_inodes; - /* TIERING */ - struct task_struct *tiering_read; - struct bch_pd_controller tiering_pd; - /* NOTIFICATIONS */ struct mutex uevent_lock; struct kobj_uevent_env uevent_env; @@ -828,6 +830,11 @@ struct cache_set { #undef BCH_TIME_STAT }; +static inline bool bch_fs_running(struct cache_set *c) +{ + return c->state == BCH_FS_RO || c->state == BCH_FS_RW; +} + static inline unsigned bucket_pages(const struct cache *ca) { return ca->mi.bucket_size / PAGE_SECTORS; diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c index 82b07f59..ba2e9a8c 100644 --- a/libbcache/blockdev.c +++ b/libbcache/blockdev.c @@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bool found; int ret; + lockdep_assert_held(&c->state_lock); + bdevname(dc->disk_sb.bdev, buf); if (memcmp(&dc->disk_sb.sb->set_uuid, @@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) return -EINVAL; } - if (!test_bit(BCH_FS_RUNNING, &c->flags)) - return 0; - - if (test_bit(BCH_FS_STOPPING, &c->flags)) { - pr_err("Can't attach %s: shutting down", buf); + if (!bch_fs_running(c)) { + pr_err("Can't attach %s: not running", buf); return -EINVAL; } @@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c) struct cached_dev *dc, *t; 
lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); list_for_each_entry_safe(dc, t, &uncached_devices, list) bch_cached_dev_attach(dc, c); @@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c) struct bkey_s_c_inode_blockdev inode; int ret = 0; - if (test_bit(BCH_FS_STOPPING, &c->flags)) + if (!bch_fs_running(c)) return -EINVAL; for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) { diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c index 4d5efdbd..4d0c6d4d 100644 --- a/libbcache/btree_cache.c +++ b/libbcache/btree_cache.c @@ -11,8 +11,9 @@ #define DEF_BTREE_ID(kwd, val, name) name, -const char *bch_btree_id_names[BTREE_ID_NR] = { +const char * const bch_btree_ids[] = { DEFINE_BCH_BTREE_IDS() + NULL }; #undef DEF_BTREE_ID @@ -311,7 +312,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, return mca_can_free(c) * btree_pages(c); } -void bch_btree_cache_free(struct cache_set *c) +void bch_fs_btree_exit(struct cache_set *c) { struct btree *b; unsigned i; @@ -358,7 +359,7 @@ void bch_btree_cache_free(struct cache_set *c) rhashtable_destroy(&c->btree_cache_table); } -int bch_btree_cache_alloc(struct cache_set *c) +int bch_fs_btree_init(struct cache_set *c) { unsigned i; int ret; diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h index c26489d1..4d67704b 100644 --- a/libbcache/btree_cache.h +++ b/libbcache/btree_cache.h @@ -6,7 +6,7 @@ struct btree_iter; -extern const char *bch_btree_id_names[BTREE_ID_NR]; +extern const char * const bch_btree_ids[]; void bch_recalc_btree_reserve(struct cache_set *); @@ -22,8 +22,8 @@ struct btree *mca_alloc(struct cache_set *); struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *, unsigned, enum six_lock_type); -void bch_btree_cache_free(struct cache_set *); -int bch_btree_cache_alloc(struct cache_set *); +void bch_fs_btree_exit(struct cache_set *); +int bch_fs_btree_init(struct cache_set *); #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c index 0eb7290c..b90807f7 100644 --- a/libbcache/btree_gc.c +++ b/libbcache/btree_gc.c @@ -262,30 +262,72 @@ static void bch_mark_allocator_buckets(struct cache_set *c) } } +static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end, + enum bucket_data_type type) +{ + u64 b = start >> ca->bucket_bits; + + do { + bch_mark_metadata_bucket(ca, ca->buckets + b, type, true); + b++; + } while (b < end >> ca->bucket_bits); +} + /* * Mark non btree metadata - prios, journal */ -static void bch_mark_metadata(struct cache_set *c) +static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca) { - struct cache *ca; - unsigned i, j; + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; u64 b; - for_each_cache(ca, c, i) { - for (j = 0; j < ca->journal.nr; j++) { - b = ca->journal.buckets[j]; - bch_mark_metadata_bucket(ca, ca->buckets + b, true); - } + /* Mark superblocks: */ + for (i = 0; i < layout->nr_superblocks; i++) { + if (layout->sb_offset[i] == BCH_SB_SECTOR) + mark_metadata_sectors(ca, 0, BCH_SB_SECTOR, + BUCKET_SB); + + mark_metadata_sectors(ca, + layout->sb_offset[i], + layout->sb_offset[i] + + (1 << layout->sb_max_size_bits), + BUCKET_SB); + } - spin_lock(&ca->prio_buckets_lock); + spin_lock(&c->journal.lock); - for (j = 0; j < prio_buckets(ca) * 2; j++) { - b = ca->prio_buckets[j]; - bch_mark_metadata_bucket(ca, ca->buckets + b, true); - } + for (i = 
0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; + bch_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_JOURNAL, true); + } + + spin_unlock(&c->journal.lock); + + spin_lock(&ca->prio_buckets_lock); - spin_unlock(&ca->prio_buckets_lock); + for (i = 0; i < prio_buckets(ca) * 2; i++) { + b = ca->prio_buckets[i]; + if (b) + bch_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_PRIOS, true); } + + spin_unlock(&ca->prio_buckets_lock); +} + +static void bch_mark_metadata(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + mutex_lock(&c->sb_lock); + + for_each_cache(ca, c, i) + bch_mark_dev_metadata(c, ca); + + mutex_unlock(&c->sb_lock); } /* Also see bch_pending_btree_node_free_insert_done() */ @@ -389,7 +431,7 @@ void bch_gc(struct cache_set *c) for_each_bucket(g, ca) { bucket_cmpxchg(g, new, ({ new.owned_by_allocator = 0; - new.is_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); @@ -750,9 +792,6 @@ void bch_coalesce(struct cache_set *c) u64 start_time; enum btree_id id; - if (btree_gc_coalesce_disabled(c)) - return; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) return; @@ -811,7 +850,8 @@ static int bch_gc_thread(void *arg) last_kick = atomic_read(&c->kick_gc); bch_gc(c); - bch_coalesce(c); + if (!btree_gc_coalesce_disabled(c)) + bch_coalesce(c); debug_check_no_locks_held(); } @@ -823,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c) { set_bit(BCH_FS_GC_STOPPING, &c->flags); - if (!IS_ERR_OR_NULL(c->gc_thread)) + if (c->gc_thread) kthread_stop(c->gc_thread); + + c->gc_thread = NULL; + clear_bit(BCH_FS_GC_STOPPING, &c->flags); } int bch_gc_thread_start(struct cache_set *c) { - clear_bit(BCH_FS_GC_STOPPING, &c->flags); + struct task_struct *p; - c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); - if (IS_ERR(c->gc_thread)) - return PTR_ERR(c->gc_thread); + BUG_ON(c->gc_thread); + p = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(p)) + return PTR_ERR(p); + + c->gc_thread = p; wake_up_process(c->gc_thread); return 0; } @@ -883,12 +929,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) { enum btree_id id; - if (journal) { - for (id = 0; id < BTREE_ID_NR; id++) - bch_initial_gc_btree(c, id); + bch_mark_metadata(c); + for (id = 0; id < BTREE_ID_NR; id++) + bch_initial_gc_btree(c, id); + + if (journal) bch_journal_mark(c, journal); - } /* * Skip past versions that might have possibly been used (as nonces), @@ -897,8 +944,6 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); - bch_mark_metadata(c); - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); diff --git a/libbcache/buckets.c b/libbcache/buckets.c index 315cfbec..ec4ee54a 100644 --- a/libbcache/buckets.c +++ b/libbcache/buckets.c @@ -66,6 +66,7 @@ #include "alloc.h" #include "btree_gc.h" #include "buckets.h" +#include "error.h" #include <linux/preempt.h> #include <trace/events/bcache.h> @@ -102,6 +103,10 @@ static void bch_fs_stats_verify(struct cache_set *c) {} #endif +/* + * Clear journal_seq_valid for buckets for which it's not needed, to prevent + * wraparound: + */ void bch_bucket_seq_cleanup(struct cache_set *c) { u16 last_seq_ondisk = c->journal.last_seq_ondisk; @@ -113,12 +118,11 @@ void bch_bucket_seq_cleanup(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(g, ca) { bucket_cmpxchg(g, m, ({ - if (!m.wait_on_journal || - ((s16) last_seq_ondisk - - (s16) m.journal_seq < 0)) + if 
(!m.journal_seq_valid || + bucket_needs_journal_commit(m, last_seq_ondisk)) break; - m.wait_on_journal = 0; + m.journal_seq_valid = 0; })); } } @@ -186,17 +190,18 @@ bch_bucket_stats_read_cache_set(struct cache_set *c) static inline int is_meta_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && m.is_metadata; + return m.data_type != BUCKET_DATA; } static inline int is_dirty_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors; + return m.data_type == BUCKET_DATA && !!m.dirty_sectors; } static inline int is_cached_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors; + return m.data_type == BUCKET_DATA && + !m.dirty_sectors && !!m.cached_sectors; } void bch_fs_stats_apply(struct cache_set *c, @@ -236,29 +241,37 @@ void bch_fs_stats_apply(struct cache_set *c, memset(stats, 0, sizeof(*stats)); } +static bool bucket_became_unavailable(struct cache_set *c, + struct bucket_mark old, + struct bucket_mark new) +{ + return is_available_bucket(old) && + !is_available_bucket(new) && + c->gc_pos.phase == GC_PHASE_DONE; +} + static void bucket_stats_update(struct cache *ca, struct bucket_mark old, struct bucket_mark new, - bool may_make_unavailable, struct bucket_stats_cache_set *bch_alloc_stats) { struct cache_set *c = ca->set; struct bucket_stats_cache *cache_stats; - BUG_ON(!may_make_unavailable && - is_available_bucket(old) && - !is_available_bucket(new) && - c->gc_pos.phase == GC_PHASE_DONE); + bch_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of metadata in same bucket: %u, %u", + old.data_type, new.data_type); if (bch_alloc_stats) { bch_alloc_stats->s[S_COMPRESSED][S_CACHED] += (int) new.cached_sectors - (int) old.cached_sectors; bch_alloc_stats->s[S_COMPRESSED] - [old.is_metadata ? S_META : S_DIRTY] -= + [is_meta_bucket(old) ? S_META : S_DIRTY] -= old.dirty_sectors; bch_alloc_stats->s[S_COMPRESSED] - [new.is_metadata ? S_META : S_DIRTY] += + [is_meta_bucket(new) ? 
S_META : S_DIRTY] += new.dirty_sectors; } @@ -268,12 +281,12 @@ static void bucket_stats_update(struct cache *ca, cache_stats->sectors_cached += (int) new.cached_sectors - (int) old.cached_sectors; - if (old.is_metadata) + if (is_meta_bucket(old)) cache_stats->sectors_meta -= old.dirty_sectors; else cache_stats->sectors_dirty -= old.dirty_sectors; - if (new.is_metadata) + if (is_meta_bucket(new)) cache_stats->sectors_meta += new.dirty_sectors; else cache_stats->sectors_dirty += new.dirty_sectors; @@ -290,6 +303,15 @@ static void bucket_stats_update(struct cache *ca, bch_wake_allocator(ca); } +#define bucket_data_cmpxchg(ca, g, new, expr) \ +({ \ + struct bucket_stats_cache_set _stats = { 0 }; \ + struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ + \ + bucket_stats_update(ca, _old, new, &_stats); \ + _old; \ +}) + void bch_invalidate_bucket(struct cache *ca, struct bucket *g) { struct bucket_stats_cache_set stats = { 0 }; @@ -297,16 +319,17 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g) old = bucket_cmpxchg(g, new, ({ new.owned_by_allocator = 1; - new.is_metadata = 0; + new.had_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; new.copygc = 0; new.gen++; })); - BUG_ON(old.dirty_sectors); + bucket_stats_update(ca, old, new, &stats); - bucket_stats_update(ca, old, new, true, &stats); + BUG_ON(old.dirty_sectors); /* * Ick: @@ -329,45 +352,45 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g) void bch_mark_free_bucket(struct cache *ca, struct bucket *g) { - struct bucket_stats_cache_set stats = { 0 }; struct bucket_mark old, new; - old = bucket_cmpxchg(g, new, ({ + old = bucket_data_cmpxchg(ca, g, new, ({ new.owned_by_allocator = 0; - new.is_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); - bucket_stats_update(ca, old, new, false, &stats); + BUG_ON(bucket_became_unavailable(ca->set, old, new)); } void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g, bool owned_by_allocator) { - struct bucket_stats_cache_set stats = { 0 }; - struct bucket_mark old, new; + struct bucket_mark new; - old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator); - - bucket_stats_update(ca, old, new, true, &stats); + bucket_data_cmpxchg(ca, g, new, ({ + new.owned_by_allocator = owned_by_allocator; + })); } void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g, + enum bucket_data_type type, bool may_make_unavailable) { - struct bucket_stats_cache_set stats = { 0 }; struct bucket_mark old, new; - old = bucket_cmpxchg(g, new, ({ - new.is_metadata = 1; + BUG_ON(!type); + + old = bucket_data_cmpxchg(ca, g, new, ({ + new.data_type = type; new.had_metadata = 1; })); BUG_ON(old.cached_sectors); BUG_ON(old.dirty_sectors); - - bucket_stats_update(ca, old, new, may_make_unavailable, &stats); + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(ca->set, old, new)); } #define saturated_add(ca, dst, src, max) \ @@ -487,22 +510,26 @@ static void bch_mark_pointer(struct cache_set *c, if (!new.dirty_sectors && !new.cached_sectors) { - new.is_metadata = false; + new.data_type = 0; if (journal_seq) { - new.wait_on_journal = true; + new.journal_seq_valid = 1; new.journal_seq = journal_seq; } } else { - new.is_metadata = (type == S_META); + new.data_type = type == S_META + ? 
BUCKET_BTREE : BUCKET_DATA; } - new.had_metadata |= new.is_metadata; + new.had_metadata |= is_meta_bucket(new); } while ((v = cmpxchg(&g->_mark.counter, old.counter, new.counter)) != old.counter); - bucket_stats_update(ca, old, new, may_make_unavailable, NULL); + bucket_stats_update(ca, old, new, NULL); + + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(c, old, new)); if (saturated && atomic_long_add_return(saturated, diff --git a/libbcache/buckets.h b/libbcache/buckets.h index 9c6e4385..6d70103e 100644 --- a/libbcache/buckets.h +++ b/libbcache/buckets.h @@ -235,8 +235,16 @@ static inline u64 sectors_available(struct cache_set *c) static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && - !mark.is_metadata && - !mark.dirty_sectors); + mark.data_type == BUCKET_DATA && + !mark.dirty_sectors && + !mark.nouse); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); } void bch_bucket_seq_cleanup(struct cache_set *); @@ -244,7 +252,8 @@ void bch_bucket_seq_cleanup(struct cache_set *); void bch_invalidate_bucket(struct cache *, struct bucket *); void bch_mark_free_bucket(struct cache *, struct bucket *); void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool); -void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool); +void bch_mark_metadata_bucket(struct cache *, struct bucket *, + enum bucket_data_type, bool); void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool, struct bucket_stats_cache_set *); diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h index 6bbdcd26..f42e09d8 100644 --- a/libbcache/buckets_types.h +++ b/libbcache/buckets_types.h @@ -1,6 +1,14 @@ #ifndef _BUCKETS_TYPES_H #define _BUCKETS_TYPES_H +enum bucket_data_type { + BUCKET_DATA = 0, + BUCKET_BTREE, + BUCKET_PRIOS, + BUCKET_JOURNAL, + BUCKET_SB, +}; + struct bucket_mark { union { struct { @@ -12,23 +20,30 @@ struct bucket_mark { /* generation copygc is going to move this bucket into */ unsigned copygc:1; - unsigned wait_on_journal:1; + + unsigned journal_seq_valid:1; /* - * If this bucket ever had metadata in it, the allocator must - * increment its gen before we reuse it: + * If this bucket had metadata while at the current generation + * number, the allocator must increment its gen before we reuse + * it: */ unsigned had_metadata:1; unsigned owned_by_allocator:1; - unsigned is_metadata:1; - u16 cached_sectors; + unsigned data_type:3; + + unsigned nouse:1; + u16 dirty_sectors; + u16 cached_sectors; /* * low bits of journal sequence number when this bucket was most - * recently modified: + * recently modified: if journal_seq_valid is set, this bucket + * can't be reused until the journal sequence number written to + * disk is >= the bucket's journal sequence number: */ u16 journal_seq; }; diff --git a/libbcache/chardev.c b/libbcache/chardev.c index b142d7b2..049aa910 100644 --- a/libbcache/chardev.c +++ b/libbcache/chardev.c @@ -107,7 +107,7 @@ static long bch_global_ioctl(unsigned cmd, void __user *arg) static long bch_ioctl_stop(struct cache_set *c) { - bch_fs_stop(c); + bch_fs_stop_async(c); return 0; } diff --git a/libbcache/checksum.c b/libbcache/checksum.c index dae52d49..92036db4 100644 --- a/libbcache/checksum.c +++ b/libbcache/checksum.c @@ -539,15 +539,12 @@ int bch_enable_encryption(struct cache_set *c, bool keyed) if (ret) goto err; - crypt = 
container_of_or_null(bch_fs_sb_field_resize(c, NULL, - sizeof(*crypt) / sizeof(u64)), - struct bch_sb_field_crypt, field); + crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64)); if (!crypt) { ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ goto err; } - crypt->field.type = BCH_SB_FIELD_crypt; crypt->key = key; /* write superblock */ @@ -560,7 +557,7 @@ err: return ret; } -void bch_fs_encryption_free(struct cache_set *c) +void bch_fs_encryption_exit(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); diff --git a/libbcache/checksum.h b/libbcache/checksum.h index 137c9155..9d4da08d 100644 --- a/libbcache/checksum.h +++ b/libbcache/checksum.h @@ -43,7 +43,7 @@ void bch_encrypt_bio(struct cache_set *, unsigned, int bch_disable_encryption(struct cache_set *); int bch_enable_encryption(struct cache_set *, bool); -void bch_fs_encryption_free(struct cache_set *); +void bch_fs_encryption_exit(struct cache_set *); int bch_fs_encryption_init(struct cache_set *); static inline unsigned bch_data_checksum_type(struct cache_set *c) diff --git a/libbcache/compress.c b/libbcache/compress.c index f81a8143..89da31e5 100644 --- a/libbcache/compress.c +++ b/libbcache/compress.c @@ -434,10 +434,10 @@ int bch_check_set_has_compressed_data(struct cache_set *c, break; } - return bch_compress_init(c); + return bch_fs_compress_init(c); } -void bch_compress_free(struct cache_set *c) +void bch_fs_compress_exit(struct cache_set *c) { vfree(c->zlib_workspace); mempool_exit(&c->lz4_workspace_pool); @@ -450,15 +450,11 @@ void bch_compress_free(struct cache_set *c) max_t(size_t, zlib_inflate_workspacesize(), \ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)) -int bch_compress_init(struct cache_set *c) +int bch_fs_compress_init(struct cache_set *c) { unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9); int ret, cpu; - if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && - !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) - return 0; - if (!c->bio_decompress_worker) { c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker); if (!c->bio_decompress_worker) @@ -474,6 +470,10 @@ int bch_compress_init(struct cache_set *c) } } + if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && + !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) + return 0; + if (!mempool_initialized(&c->compression_bounce[READ])) { ret = mempool_init_page_pool(&c->compression_bounce[READ], 1, order); diff --git a/libbcache/compress.h b/libbcache/compress.h index 485acd95..4604b065 100644 --- a/libbcache/compress.h +++ b/libbcache/compress.h @@ -9,7 +9,7 @@ void bch_bio_compress(struct cache_set *, struct bio *, size_t *, struct bio *, size_t *, unsigned *); int bch_check_set_has_compressed_data(struct cache_set *, unsigned); -void bch_compress_free(struct cache_set *); -int bch_compress_init(struct cache_set *); +void bch_fs_compress_exit(struct cache_set *); +int bch_fs_compress_init(struct cache_set *); #endif /* _BCACHE_COMPRESS_H */ diff --git a/libbcache/debug.c b/libbcache/debug.c index d25c32ae..16cc72b9 100644 --- a/libbcache/debug.c +++ b/libbcache/debug.c @@ -409,13 +409,13 @@ static const struct file_operations bfloat_failed_debug_ops = { .read = bch_read_bfloat_failed, }; -void bch_debug_exit_cache_set(struct cache_set *c) +void bch_fs_debug_exit(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->debug)) debugfs_remove_recursive(c->debug); } -void bch_debug_init_cache_set(struct cache_set *c) +void bch_fs_debug_init(struct cache_set *c) { struct btree_debug *bd; 
char name[100]; @@ -432,18 +432,18 @@ void bch_debug_init_cache_set(struct cache_set *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - bd->btree = debugfs_create_file(bch_btree_id_names[bd->id], + bd->btree = debugfs_create_file(bch_btree_ids[bd->id], 0400, c->debug, bd, &btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", - bch_btree_id_names[bd->id]); + bch_btree_ids[bd->id]); bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", - bch_btree_id_names[bd->id]); + bch_btree_ids[bd->id]); bd->failed = debugfs_create_file(name, 0400, c->debug, bd, &bfloat_failed_debug_ops); diff --git a/libbcache/debug.h b/libbcache/debug.h index a3635e60..d34a95a0 100644 --- a/libbcache/debug.h +++ b/libbcache/debug.h @@ -52,11 +52,11 @@ static inline void bch_btree_verify(struct cache_set *c, struct btree *b) } #ifdef CONFIG_DEBUG_FS -void bch_debug_exit_cache_set(struct cache_set *); -void bch_debug_init_cache_set(struct cache_set *); +void bch_fs_debug_exit(struct cache_set *); +void bch_fs_debug_init(struct cache_set *); #else -static inline void bch_debug_exit_cache_set(struct cache_set *c) {} -static inline void bch_debug_init_cache_set(struct cache_set *c) {} +static inline void bch_fs_debug_exit(struct cache_set *c) {} +static inline void bch_fs_debug_init(struct cache_set *c) {} #endif void bch_debug_exit(void); diff --git a/libbcache/error.c b/libbcache/error.c index 9f39be1b..f4109da6 100644 --- a/libbcache/error.c +++ b/libbcache/error.c @@ -14,7 +14,7 @@ void bch_inconsistent_error(struct cache_set *c) case BCH_ON_ERROR_RO: if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { /* XXX do something better here? */ - bch_fs_stop(c); + bch_fs_stop_async(c); return; } @@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work) } else { bch_notify_dev_error(ca, true); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); dev = bch_dev_may_remove(ca); if (dev ? bch_dev_read_only(ca) @@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work) "too many IO errors on %s, setting %s RO", bdevname(ca->disk_sb.bdev, buf), dev ? 
"device" : "filesystem"); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); } } diff --git a/libbcache/extents.c b/libbcache/extents.c index 523f3f48..c5e0e375 100644 --- a/libbcache/extents.c +++ b/libbcache/extents.c @@ -547,7 +547,7 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, do { seq = read_seqcount_begin(&c->gc_pos_lock); bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - !g->mark.is_metadata; + g->mark.data_type != BUCKET_BTREE; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); err = "inconsistent"; @@ -602,6 +602,7 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; + struct extent_pick_ptr pick = { .ca = NULL }; struct cache *ca; rcu_read_lock(); @@ -621,15 +622,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) PTR_BUCKET_NR(ca, ptr))) continue; - percpu_ref_get(&ca->ref); - rcu_read_unlock(); + if (pick.ca && pick.ca->mi.tier < ca->mi.tier) + continue; - return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca }; + pick.ca = ca; + pick.ptr = *ptr; } + if (pick.ca) + percpu_ref_get(&pick.ca->ref); + rcu_read_unlock(); - return (struct extent_pick_ptr) { .ca = NULL, }; + return pick; } const struct bkey_ops bch_bkey_btree_ops = { @@ -1880,7 +1885,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b, if (stale) break; - bad = (mark.is_metadata || + bad = (mark.data_type != BUCKET_DATA || (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && !mark.owned_by_allocator && !(ptr->cached @@ -2193,17 +2198,21 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k, rcu_read_lock(); ret->ca = NULL; - extent_for_each_online_device_crc(c, e, crc, ptr, ca) - if (!ptr_stale(ca, ptr)) { - *ret = (struct extent_pick_ptr) { - .crc = crc_to_128(e.k, crc), - .ptr = *ptr, - .ca = ca, - }; - - if (ca != avoid) - break; - } + extent_for_each_online_device_crc(c, e, crc, ptr, ca) { + if (ptr_stale(ca, ptr)) + continue; + + if (ret->ca && + (ca == avoid || + ret->ca->mi.tier < ca->mi.tier)) + continue; + + *ret = (struct extent_pick_ptr) { + .crc = crc_to_128(e.k, crc), + .ptr = *ptr, + .ca = ca, + }; + } if (ret->ca) percpu_ref_get(&ret->ca->ref); diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c index e9585fd5..e2f1427f 100644 --- a/libbcache/fs-gc.c +++ b/libbcache/fs-gc.c @@ -545,9 +545,9 @@ struct nlink { u32 dir_count; }; -DECLARE_GENRADIX_TYPE(nlinks, struct nlink); +typedef GENRADIX(struct nlink) nlink_table; -static void inc_link(struct cache_set *c, struct nlinks *links, +static void inc_link(struct cache_set *c, nlink_table *links, u64 range_start, u64 *range_end, u64 inum, bool dir) { @@ -570,7 +570,7 @@ static void inc_link(struct cache_set *c, struct nlinks *links, } noinline_for_stack -static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links, +static int bch_gc_walk_dirents(struct cache_set *c, nlink_table *links, u64 range_start, u64 *range_end) { struct btree_iter iter; @@ -776,7 +776,7 @@ fsck_err: noinline_for_stack static int bch_gc_walk_inodes(struct cache_set *c, struct bch_inode_unpacked *lostfound_inode, - struct nlinks *links, + nlink_table *links, u64 range_start, u64 range_end) { struct btree_iter iter; @@ -850,7 +850,7 @@ noinline_for_stack static int check_inode_nlinks(struct cache_set *c, struct bch_inode_unpacked *lostfound_inode) { - struct nlinks links; + nlink_table links; u64 
this_iter_range_start, next_iter_range_start = 0; int ret = 0; diff --git a/libbcache/fs.c b/libbcache/fs.c index ab0d9728..ec70a3e3 100644 --- a/libbcache/fs.c +++ b/libbcache/fs.c @@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name, if (!c) goto err_unlock; - if (!test_bit(BCH_FS_RUNNING, &c->flags)) { + mutex_lock(&c->state_lock); + + if (!bch_fs_running(c)) { + mutex_unlock(&c->state_lock); err = "incomplete cache set"; c = NULL; goto err_unlock; } closure_get(&c->cl); + mutex_unlock(&c->state_lock); mutex_unlock(&bch_register_lock); } @@ -1291,22 +1295,19 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) if (ret) return ret; - mutex_lock(&bch_register_lock); - if (opts.read_only >= 0 && opts.read_only != c->opts.read_only) { const char *err = NULL; if (opts.read_only) { - bch_fs_read_only_sync(c); + bch_fs_read_only(c); sb->s_flags |= MS_RDONLY; } else { err = bch_fs_read_write(c); if (err) { bch_err(c, "error going rw: %s", err); - ret = -EINVAL; - goto unlock; + return -EINVAL; } sb->s_flags &= ~MS_RDONLY; @@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) if (opts.errors >= 0) c->opts.errors = opts.errors; -unlock: - mutex_unlock(&bch_register_lock); - return ret; } @@ -1449,7 +1447,7 @@ static void bch_kill_sb(struct super_block *sb) generic_shutdown_super(sb); if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) - bch_fs_stop_sync(c); + bch_fs_stop(c); else closure_put(&c->cl); } @@ -1464,7 +1462,7 @@ static struct file_system_type bcache_fs_type = { MODULE_ALIAS_FS("bcache"); -void bch_fs_exit(void) +void bch_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); if (bch_dio_write_bioset) @@ -1477,7 +1475,7 @@ void bch_fs_exit(void) kmem_cache_destroy(bch_inode_cache); } -int __init bch_fs_init(void) +int __init bch_vfs_init(void) { int ret = -ENOMEM; @@ -1504,6 +1502,6 @@ int __init bch_fs_init(void) return 0; err: - bch_fs_exit(); + bch_vfs_exit(); return ret; } diff --git a/libbcache/fs.h b/libbcache/fs.h index 933fb6de..2a29b132 100644 --- a/libbcache/fs.h +++ b/libbcache/fs.h @@ -52,13 +52,13 @@ int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *, int __must_check bch_write_inode(struct cache_set *, struct bch_inode_info *); -void bch_fs_exit(void); -int bch_fs_init(void); +void bch_vfs_exit(void); +int bch_vfs_init(void); #else -static inline void bch_fs_exit(void) {} -static inline int bch_fs_init(void) { return 0; } +static inline void bch_vfs_exit(void) {} +static inline int bch_vfs_init(void) { return 0; } #endif diff --git a/libbcache/io.c b/libbcache/io.c index be99a973..a3df3794 100644 --- a/libbcache/io.c +++ b/libbcache/io.c @@ -722,9 +722,7 @@ void bch_wake_delayed_writes(unsigned long data) spin_lock_irqsave(&c->foreground_write_pd_lock, flags); while ((op = c->write_wait_head)) { - if (!test_bit(BCH_FS_RO, &c->flags) && - !test_bit(BCH_FS_STOPPING, &c->flags) && - time_after(op->expires, jiffies)) { + if (time_after(op->expires, jiffies)) { mod_timer(&c->foreground_write_wakeup, op->expires); break; } @@ -1068,9 +1066,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio) return; } - if (rbio->promote && - !test_bit(BCH_FS_RO, &c->flags) && - !test_bit(BCH_FS_STOPPING, &c->flags)) { + if (rbio->promote) { struct cache_promote_op *promote = rbio->promote; struct closure *cl = &promote->cl; @@ -1133,13 +1129,26 @@ static void bch_read_endio(struct bio *bio) preempt_disable(); d = 
this_cpu_ptr(c->bio_decompress_worker); llist_add(&rbio->list, &d->bio_list); - queue_work(system_unbound_wq, &d->work); + queue_work(system_highpri_wq, &d->work); preempt_enable(); } else { __bch_read_endio(c, rbio); } } +static bool should_promote(struct cache_set *c, + struct extent_pick_ptr *pick, unsigned flags) +{ + if (!(flags & BCH_READ_PROMOTE)) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + return c->fastest_tier && + c->fastest_tier < c->tiers + pick->ca->mi.tier; +} + void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) @@ -1158,7 +1167,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, * XXX: multiple promotes can race with each other, wastefully. Keep a * list of outstanding promotes? */ - if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) { + if (should_promote(c, pick, flags)) { /* * biovec needs to be big enough to hold decompressed data, if * the bch_write_extent() has to decompress/recompress it: diff --git a/libbcache/journal.c b/libbcache/journal.c index 99dd9f26..b2838376 100644 --- a/libbcache/journal.c +++ b/libbcache/journal.c @@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c, return BCH_FSCK_UNKNOWN_VERSION; } - if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 || - bytes > c->journal.entry_size_max, c, + if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c, "journal entry too big (%zu bytes), sector %lluu", bytes, sector)) { /* XXX: note we might have missing journal entries */ @@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c) { struct journal *j = &c->journal; struct journal_seq_blacklist *bl; - struct cache *ca; u64 new_seq = 0; - unsigned i; - - for_each_cache(ca, c, i) - if (is_journal_device(ca)) - bch_dev_group_add(&c->journal.devs, ca); list_for_each_entry(bl, &j->seq_blacklist, list) new_seq = max(new_seq, bl->seq); @@ -1534,48 +1527,111 @@ err: return ret; } -static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr) +static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca, + unsigned nr, bool write_super) { + struct journal *j = &c->journal; struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets = - bch_sb_get_journal(ca->disk_sb.sb); - struct bch_sb_field *f; - u64 *p; + struct bch_sb_field_journal *journal_buckets; + struct disk_reservation disk_res = { 0, 0 }; + struct closure cl; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; + int ret = 0; - p = krealloc(ja->bucket_seq, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; + closure_init_stack(&cl); - ja->bucket_seq = p; + mutex_lock(&c->sb_lock); - p = krealloc(ja->buckets, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; + /* don't handle reducing nr of buckets yet: */ + if (nr <= ja->nr) + goto err; - ja->buckets = p; + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + */ - f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr + - sizeof(*journal_buckets) / sizeof(u64)); - if (!f) - return -ENOMEM; - f->type = BCH_SB_FIELD_journal; + ret = ENOSPC; + if (bch_disk_reservation_get(c, &disk_res, + (nr - ja->nr) << 
ca->bucket_bits, 0)) + goto err; - ja->nr = nr; - return 0; + ret = -ENOMEM; + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); + if (!new_buckets || !new_bucket_seq) + goto err; + + journal_buckets = bch_sb_resize_journal(&ca->disk_sb, + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) + goto err; + + spin_lock(&j->lock); + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + + while (ja->nr < nr) { + /* must happen under journal lock, to avoid racing with gc: */ + u64 b = bch_bucket_alloc(ca, RESERVE_NONE); + if (!b) { + if (!closure_wait(&c->freelist_wait, &cl)) { + spin_unlock(&j->lock); + closure_sync(&cl); + spin_lock(&j->lock); + } + continue; + } + + bch_mark_metadata_bucket(ca, &ca->buckets[b], + BUCKET_JOURNAL, false); + bch_mark_alloc_bucket(ca, &ca->buckets[b], false); + + memmove(ja->buckets + ja->last_idx + 1, + ja->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(ja->bucket_seq + ja->last_idx + 1, + ja->bucket_seq + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(journal_buckets->buckets + ja->last_idx + 1, + journal_buckets->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + + ja->buckets[ja->last_idx] = b; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); + + if (ja->last_idx < ja->nr) { + if (ja->cur_idx >= ja->last_idx) + ja->cur_idx++; + ja->last_idx++; + } + ja->nr++; + + } + spin_unlock(&j->lock); + + BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi)); + + if (write_super) + bch_write_super(c); + + ret = 0; +err: + mutex_unlock(&c->sb_lock); + + kfree(new_bucket_seq); + kfree(new_buckets); + bch_disk_reservation_put(c, &disk_res); + + return ret; } int bch_dev_journal_alloc(struct cache *ca) { - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; - int ret; - unsigned i; - - if (ca->mi.tier != 0) - return 0; - if (dynamic_fault("bcache:add:journal_alloc")) return -ENOMEM; @@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca) * clamp journal size to 1024 buckets or 512MB (in sectors), whichever * is smaller: */ - ret = bch_set_nr_journal_buckets(ca, + return bch_set_nr_journal_buckets(ca->set, ca, clamp_t(unsigned, ca->mi.nbuckets >> 8, BCH_JOURNAL_BUCKETS_MIN, min(1 << 10, - (1 << 20) / ca->mi.bucket_size))); - if (ret) - return ret; - - journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); - - for (i = 0; i < ja->nr; i++) { - u64 bucket = ca->mi.first_bucket + i; - - ja->buckets[i] = bucket; - journal_buckets->buckets[i] = cpu_to_le64(bucket); - - bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true); - } - - return 0; + (1 << 20) / ca->mi.bucket_size)), + false); } /* Journalling */ @@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j, fifo_entry_idx(&j->pin, pin->pin_list))) { if (journal_pin_active(pin)) __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, - pin, NULL); + __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); } spin_unlock_irq(&j->pin_lock); } - static struct journal_entry_pin * journal_get_next_pin(struct journal *j, u64 seq_to_flush) { @@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush) return ret; } +static bool journal_has_pins(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + journal_reclaim_fast(j); + ret = 
fifo_used(&j->pin) > 1 || + atomic_read(&fifo_peek_front(&j->pin).count) > 1; + spin_unlock(&j->lock); + + return ret; +} + +void bch_journal_flush_pins(struct journal *j) +{ + struct journal_entry_pin *pin; + + while ((pin = journal_get_next_pin(j, U64_MAX))) + pin->flush(j, pin); + + wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j)); +} + static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { bool ret; @@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) struct cache_set *c = container_of(j, struct cache_set, journal); struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); struct bch_extent_ptr *ptr; + struct journal_device *ja; struct cache *ca; - unsigned iter, replicas, replicas_want = + bool swapped; + unsigned i, replicas, replicas_want = READ_ONCE(c->opts.metadata_replicas); spin_lock(&j->lock); @@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) replicas = bch_extent_nr_ptrs(e.c); + spin_lock(&j->devs.lock); + + /* Sort by tier: */ + do { + swapped = false; + + for (i = 0; i + 1 < j->devs.nr; i++) + if (j->devs.d[i + 0].dev->mi.tier > + j->devs.d[i + 1].dev->mi.tier) { + swap(j->devs.d[i], j->devs.d[i + 1]); + swapped = true; + } + } while (swapped); + /* - * Determine location of the next journal write: - * XXX: sort caches by free journal space + * Pick devices for next journal write: + * XXX: sort devices by free journal space? */ - group_for_each_cache_rcu(ca, &j->devs, iter) { - struct journal_device *ja = &ca->journal; + for (i = 0; i < j->devs.nr; i++) { + ca = j->devs.d[i].dev; + ja = &ca->journal; if (replicas >= replicas_want) break; @@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); } - + spin_unlock(&j->devs.lock); rcu_read_unlock(); j->prev_buf_sectors = 0; @@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j) return bch_journal_flush_seq(j, seq); } -void bch_journal_free(struct journal *j) -{ - unsigned order = get_order(j->entry_size_max); - - free_pages((unsigned long) j->buf[1].data, order); - free_pages((unsigned long) j->buf[0].data, order); - free_fifo(&j->pin); -} - -int bch_journal_alloc(struct journal *j, unsigned entry_size_max) -{ - static struct lock_class_key res_key; - unsigned order = get_order(entry_size_max); - - spin_lock_init(&j->lock); - spin_lock_init(&j->pin_lock); - init_waitqueue_head(&j->wait); - INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); - spin_lock_init(&j->devs.lock); - mutex_init(&j->reclaim_lock); - - lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - - j->entry_size_max = entry_size_max; - j->write_delay_ms = 100; - j->reclaim_delay_ms = 100; - - bkey_extent_init(&j->key); - - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || - !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) - return -ENOMEM; - - return 0; -} - ssize_t bch_journal_print_debug(struct journal *j, char *buf) { union journal_res_state *s = &j->reservations; @@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca) return ret; } -void bch_journal_free_cache(struct cache 
*ca) +void bch_fs_journal_stop(struct journal *j) +{ + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return; + + /* + * Empty out the journal by first flushing everything pinning existing + * journal entries, then force a brand new empty journal entry to be + * written: + */ + bch_journal_flush_pins(j); + bch_journal_flush_async(j, NULL); + bch_journal_meta(j); + + cancel_delayed_work_sync(&j->write_work); + cancel_delayed_work_sync(&j->reclaim_work); +} + +void bch_dev_journal_exit(struct cache *ca) { kfree(ca->journal.buckets); kfree(ca->journal.bucket_seq); } -int bch_journal_init_cache(struct cache *ca) +int bch_dev_journal_init(struct cache *ca) { struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = @@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca) return 0; } + +void bch_fs_journal_exit(struct journal *j) +{ + unsigned order = get_order(j->entry_size_max); + + free_pages((unsigned long) j->buf[1].data, order); + free_pages((unsigned long) j->buf[0].data, order); + free_fifo(&j->pin); +} + +int bch_fs_journal_init(struct journal *j, unsigned entry_size_max) +{ + static struct lock_class_key res_key; + unsigned order = get_order(entry_size_max); + + spin_lock_init(&j->lock); + spin_lock_init(&j->pin_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); + INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); + mutex_init(&j->blacklist_lock); + INIT_LIST_HEAD(&j->seq_blacklist); + spin_lock_init(&j->devs.lock); + mutex_init(&j->reclaim_lock); + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + + j->entry_size_max = entry_size_max; + j->write_delay_ms = 100; + j->reclaim_delay_ms = 100; + + bkey_extent_init(&j->key); + + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || + !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) + return -ENOMEM; + + return 0; +} diff --git a/libbcache/journal.h b/libbcache/journal.h index 02a6e676..d3a1db0c 100644 --- a/libbcache/journal.h +++ b/libbcache/journal.h @@ -111,7 +111,6 @@ #include <linux/hash.h> #include "journal_types.h" -//#include "super-io.h" /* * Only used for holding the journal entries we read in btree_journal_read() @@ -136,6 +135,7 @@ void bch_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); +void bch_journal_flush_pins(struct journal *); struct closure; struct cache_set; @@ -330,11 +330,6 @@ static inline int bch_journal_error(struct journal *j) ? 
-EIO : 0; } -static inline bool is_journal_device(struct cache *ca) -{ - return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0; -} - static inline bool journal_flushes_device(struct cache *ca) { return true; @@ -356,9 +351,6 @@ static inline void bch_journal_set_replay_done(struct journal *j) spin_unlock(&j->lock); } -void bch_journal_free(struct journal *); -int bch_journal_alloc(struct journal *, unsigned); - ssize_t bch_journal_print_debug(struct journal *, char *); int bch_dev_journal_alloc(struct cache *); @@ -372,7 +364,10 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j) int bch_journal_move(struct cache *); -void bch_journal_free_cache(struct cache *); -int bch_journal_init_cache(struct cache *); +void bch_fs_journal_stop(struct journal *); +void bch_dev_journal_exit(struct cache *); +int bch_dev_journal_init(struct cache *); +void bch_fs_journal_exit(struct journal *); +int bch_fs_journal_init(struct journal *, unsigned); #endif /* _BCACHE_JOURNAL_H */ diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c index e40dfbca..27f5c63c 100644 --- a/libbcache/movinggc.c +++ b/libbcache/movinggc.c @@ -191,7 +191,7 @@ static void bch_moving_gc(struct cache *ca) } if (g->mark.owned_by_allocator || - g->mark.is_metadata) + g->mark.data_type != BUCKET_DATA) continue; sectors_used = bucket_sectors_used(g); @@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg) return 0; } -void bch_moving_init_cache(struct cache *ca) +void bch_moving_gc_stop(struct cache *ca) { - bch_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; + ca->moving_gc_pd.rate.rate = UINT_MAX; + bch_ratelimit_reset(&ca->moving_gc_pd.rate); + + if (ca->moving_gc_read) + kthread_stop(ca->moving_gc_read); + ca->moving_gc_read = NULL; } -int bch_moving_gc_thread_start(struct cache *ca) +int bch_moving_gc_start(struct cache *ca) { struct task_struct *t; - /* The moving gc read thread must be stopped */ - BUG_ON(ca->moving_gc_read != NULL); + BUG_ON(ca->moving_gc_read); if (ca->set->opts.nochanges) return 0; @@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca) return 0; } -void bch_moving_gc_stop(struct cache *ca) +void bch_dev_moving_gc_init(struct cache *ca) { - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; + bch_pd_controller_init(&ca->moving_gc_pd); + ca->moving_gc_pd.d_term = 0; } diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h index 5f153085..e8ae95e5 100644 --- a/libbcache/movinggc.h +++ b/libbcache/movinggc.h @@ -23,8 +23,8 @@ #define COPYGC_SECTORS_PER_ITER(ca) \ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) -void bch_moving_init_cache(struct cache *); void bch_moving_gc_stop(struct cache *); -int bch_moving_gc_thread_start(struct cache *); +int bch_moving_gc_start(struct cache *); +void bch_dev_moving_gc_init(struct cache *); #endif diff --git a/libbcache/opts.h b/libbcache/opts.h index 95184db1..9b10310d 100644 --- a/libbcache/opts.h +++ b/libbcache/opts.h @@ -86,11 +86,17 @@ enum opt_type { BCH_OPT(noreplay, 0444, NO_SB_OPT, \ s8, OPT_BOOL()) \ BCH_OPT(norecovery, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) + s8, OPT_BOOL()) \ + BCH_OPT(noexcl, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(sb, 0444, NO_SB_OPT, \ + s64, OPT_UINT(0, S64_MAX)) \ #define BCH_OPTS() \ BCH_OPT(read_only, 0444, NO_SB_OPT, \ s8, OPT_BOOL()) \ + BCH_OPT(nostart, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ 
BCH_VISIBLE_OPTS() struct bch_opts { @@ -145,6 +151,8 @@ static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src) #undef BCH_OPT } +#define opt_defined(_opt) ((_opt) >= 0) + void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64); struct bch_opts bch_sb_opts(struct bch_sb *); diff --git a/libbcache/super-io.c b/libbcache/super-io.c index be27d3ee..f50a5ee8 100644 --- a/libbcache/super-io.c +++ b/libbcache/super-io.c @@ -10,6 +10,7 @@ #include "vstructs.h" #include <linux/backing-dev.h> +#include <linux/sort.h> static inline void __bch_sb_layout_size_assert(void) { @@ -17,7 +18,7 @@ static inline void __bch_sb_layout_size_assert(void) } struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb, - enum bch_sb_field_types type) + enum bch_sb_field_type type) { struct bch_sb_field *f; @@ -34,7 +35,7 @@ void bch_free_super(struct bcache_superblock *sb) if (sb->bio) bio_put(sb->bio); if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(sb->bdev, sb->mode); free_pages((unsigned long) sb->sb, sb->page_order); memset(sb, 0, sizeof(*sb)); @@ -74,7 +75,7 @@ static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order) return 0; } -int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s) +static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s) { u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s); u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; @@ -140,13 +141,29 @@ static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb, le32_add_cpu(&sb->u64s, u64s - old_u64s); return f; +} + +struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb, + enum bch_sb_field_type type, + unsigned u64s) +{ + struct bch_sb_field *f = bch_sb_field_get(sb->sb, type); + ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) + return NULL; + + f = __bch_sb_field_resize(sb->sb, f, u64s); + f->type = type; + return f; } struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, - struct bch_sb_field *f, + enum bch_sb_field_type type, unsigned u64s) { + struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type); ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; ssize_t d = -old_u64s + u64s; struct cache *ca; @@ -160,26 +177,15 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, for_each_cache(ca, c, i) { struct bcache_superblock *sb = &ca->disk_sb; - if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { percpu_ref_put(&ca->ref); return NULL; } } - return __bch_sb_field_resize(c->disk_sb, f, u64s); -} - -struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb, - struct bch_sb_field *f, - unsigned u64s) -{ - ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - - if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) - return NULL; - - return __bch_sb_field_resize(sb->sb, f, u64s); + f = __bch_sb_field_resize(c->disk_sb, f, u64s); + f->type = type; + return f; } static const char *validate_sb_layout(struct bch_sb_layout *layout) @@ -203,9 +209,6 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) prev_offset = le64_to_cpu(layout->sb_offset[0]); - if (prev_offset != BCH_SB_SECTOR) - return "Invalid superblock layout: doesn't have default superblock location"; - for (i = 1; i < layout->nr_superblocks; i++) { offset = le64_to_cpu(layout->sb_offset[i]); @@ -217,16 +220,70 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) return NULL; } +static int u64_cmp(const void *_l, const void *_r) +{ + u64 l = *((const u64 *) _l), r = *((const u64 *) _r); + + return l < r ? -1 : l > r ? 1 : 0; +} + +const char *bch_validate_journal_layout(struct bch_sb *sb, + struct cache_member_cpu mi) +{ + struct bch_sb_field_journal *journal; + const char *err; + unsigned nr; + unsigned i; + u64 *b; + + journal = bch_sb_get_journal(sb); + if (!journal) + return NULL; + + nr = bch_nr_journal_buckets(journal); + if (!nr) + return NULL; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) + return "cannot allocate memory"; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + err = "journal bucket at sector 0"; + if (!b[0]) + goto err; + + err = "journal bucket before first bucket"; + if (b[0] < mi.first_bucket) + goto err; + + err = "journal bucket past end of device"; + if (b[nr - 1] >= mi.nbuckets) + goto err; + + err = "duplicate journal buckets"; + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) + goto err; + + err = NULL; +err: + kfree(b); + return err; +} + const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *sb_mi; - struct bch_sb_field_journal *journal; struct cache_member_cpu mi; const char *err; u16 block_size; - unsigned i; switch (le64_to_cpu(sb->version)) { case BCACHE_SB_VERSION_CDEV_V4: @@ -324,14 +381,6 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx); - for (i = 0; i < sb->layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(sb->layout.sb_offset[i]); - u64 max_size = 1 << sb->layout.sb_max_size_bits; - - if (offset + max_size > mi.first_bucket * mi.bucket_size) - return "Invalid superblock: first bucket comes before end of super"; - } - if (mi.nbuckets > LONG_MAX) return "Too many buckets"; @@ -347,16 +396,9 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) mi.bucket_size * mi.nbuckets) return "Invalid superblock: device too small"; - /* Validate journal buckets: */ - journal = bch_sb_get_journal(sb); - if (journal) { - for (i = 0; i < bch_nr_journal_buckets(journal); i++) { - u64 b = le64_to_cpu(journal->buckets[i]); - - if (b < mi.first_bucket || b >= mi.nbuckets) - return "bad journal bucket"; - } - } + err = bch_validate_journal_layout(sb, mi); + if (err) + return err; return NULL; } @@ -382,19 +424,19 @@ static bool bch_is_open_cache(struct block_device *bdev) static bool bch_is_open(struct block_device *bdev) { - lockdep_assert_held(&bch_register_lock); + bool ret; + + mutex_lock(&bch_register_lock); + ret = bch_is_open_cache(bdev) || 
bch_is_open_backing_dev(bdev); + mutex_unlock(&bch_register_lock); - return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); + return ret; } -static const char *bch_blkdev_open(const char *path, void *holder, - struct bch_opts opts, - struct block_device **ret) +static const char *bch_blkdev_open(const char *path, fmode_t mode, + void *holder, struct block_device **ret) { struct block_device *bdev; - fmode_t mode = opts.nochanges > 0 - ? FMODE_READ - : FMODE_READ|FMODE_WRITE|FMODE_EXCL; const char *err; *ret = NULL; @@ -548,7 +590,7 @@ int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca) unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; int ret; - ret = bch_dev_sb_realloc(&ca->disk_sb, u64s); + ret = bch_sb_realloc(&ca->disk_sb, u64s); if (ret) return ret; @@ -567,7 +609,7 @@ static const char *read_one_super(struct bcache_superblock *sb, u64 offset) reread: bio_reset(sb->bio); sb->bio->bi_bdev = sb->bdev; - sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR; + sb->bio->bi_iter.bi_sector = offset; sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch_bio_map(sb->bio, sb->sb); @@ -610,15 +652,21 @@ const char *bch_read_super(struct bcache_superblock *sb, struct bch_opts opts, const char *path) { + u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR; struct bch_sb_layout layout; const char *err; unsigned i; - lockdep_assert_held(&bch_register_lock); - memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; + + if (!(opt_defined(opts.noexcl) && opts.noexcl)) + sb->mode |= FMODE_EXCL; - err = bch_blkdev_open(path, &sb, opts, &sb->bdev); + if (!(opt_defined(opts.nochanges) && opts.nochanges)) + sb->mode |= FMODE_WRITE; + + err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev); if (err) return err; @@ -630,11 +678,16 @@ const char *bch_read_super(struct bcache_superblock *sb, if (bch_fs_init_fault("read_super")) goto err; - err = read_one_super(sb, BCH_SB_SECTOR); + err = read_one_super(sb, offset); if (!err) goto got_super; - pr_err("error reading default super: %s", err); + if (offset != BCH_SB_SECTOR) { + pr_err("error reading superblock: %s", err); + goto err; + } + + pr_err("error reading default superblock: %s", err); /* * Error reading primary superblock - read location of backup @@ -747,6 +800,9 @@ void bch_write_super(struct cache_set *c) lockdep_assert_held(&c->sb_lock); + if (c->opts.nochanges) + return; + closure_init_stack(cl); le64_add_cpu(&c->disk_sb->seq, 1); diff --git a/libbcache/super-io.h b/libbcache/super-io.h index 665de811..ae1e8b9d 100644 --- a/libbcache/super-io.h +++ b/libbcache/super-io.h @@ -6,16 +6,35 @@ #include <asm/byteorder.h> -struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types); - -#define BCH_SB_FIELD_TYPE(_name) \ -static inline struct bch_sb_field_##_name * \ -bch_sb_get_##_name(struct bch_sb *sb) \ -{ \ - struct bch_sb_field *f = \ - bch_sb_field_get(sb, BCH_SB_FIELD_##_name); \ - \ - return container_of_or_null(f, struct bch_sb_field_##_name, field);\ +struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type); +struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *, + enum bch_sb_field_type, unsigned); +struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, + enum bch_sb_field_type, unsigned); + +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + +#define BCH_SB_FIELD_TYPE(_name) \ +static inline struct bch_sb_field_##_name * \ +bch_sb_get_##_name(struct 
bch_sb *sb) \ +{ \ + return field_to_type(bch_sb_field_get(sb, \ + BCH_SB_FIELD_##_name), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \ +{ \ + return field_to_type(bch_sb_field_resize(sb, \ + BCH_SB_FIELD_##_name, u64s), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch_fs_sb_resize_##_name(struct cache_set *c, unsigned u64s) \ +{ \ + return field_to_type(bch_fs_sb_field_resize(c, \ + BCH_SB_FIELD_##_name, u64s), _name); \ } BCH_SB_FIELD_TYPE(journal); @@ -85,14 +104,11 @@ int bch_fs_mi_update(struct cache_set *, struct bch_member *, unsigned); int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *); int bch_sb_from_cache_set(struct cache_set *, struct cache *); -struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, - struct bch_sb_field *, unsigned); -struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *, - struct bch_sb_field *, unsigned); - void bch_free_super(struct bcache_superblock *); int bch_super_realloc(struct bcache_superblock *, unsigned); +const char *bch_validate_journal_layout(struct bch_sb *, + struct cache_member_cpu); const char *bch_validate_cache_super(struct bcache_superblock *); const char *bch_read_super(struct bcache_superblock *, diff --git a/libbcache/super.c b/libbcache/super.c index fab34805..5535639c 100644 --- a/libbcache/super.c +++ b/libbcache/super.c @@ -69,7 +69,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); struct workqueue_struct *bcache_io_wq; struct crypto_shash *bch_sha256; -static void bch_dev_stop(struct cache *); +static void bch_dev_free(struct cache *); static int bch_dev_online(struct cache *); static int bch_congested_fn(void *data, int bdi_bits) @@ -92,8 +92,11 @@ static int bch_congested_fn(void *data, int bdi_bits) } } } else { - /* Writes only go to tier 0: */ - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { + /* Writes prefer fastest tier: */ + struct bch_tier *tier = READ_ONCE(c->fastest_tier); + struct cache_group *grp = tier ? 
&tier->devs : &c->cache_all; + + group_for_each_cache_rcu(ca, grp, i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -107,7 +110,7 @@ static int bch_congested_fn(void *data, int bdi_bits) return ret; } -/* Cache set RO/RW: */ +/* Filesystem RO/RW: */ /* * For startup/shutdown of RW stuff, the dependencies are: @@ -129,9 +132,7 @@ static void __bch_fs_read_only(struct cache_set *c) struct cache *ca; unsigned i; - c->tiering_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&c->tiering_pd.rate); - bch_tiering_read_stop(c); + bch_tiering_stop(c); for_each_cache(ca, c, i) bch_moving_gc_stop(ca); @@ -143,20 +144,7 @@ static void __bch_fs_read_only(struct cache_set *c) for_each_cache(ca, c, i) bch_dev_allocator_stop(ca); - /* - * Write a journal entry after flushing the btree, so we don't end up - * replaying everything we just flushed: - */ - if (test_bit(JOURNAL_STARTED, &c->journal.flags)) { - int ret; - - bch_journal_flush_async(&c->journal, NULL); - ret = bch_journal_meta(&c->journal); - BUG_ON(ret && !bch_journal_error(&c->journal)); - } - - cancel_delayed_work_sync(&c->journal.write_work); - cancel_delayed_work_sync(&c->journal.reclaim_work); + bch_fs_journal_stop(&c->journal); } static void bch_writes_disabled(struct percpu_ref *writes) @@ -167,12 +155,27 @@ static void bch_writes_disabled(struct percpu_ref *writes) wake_up(&bch_read_only_wait); } -static void bch_fs_read_only_work(struct work_struct *work) +void bch_fs_read_only(struct cache_set *c) { - struct cache_set *c = - container_of(work, struct cache_set, read_only_work); + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RW) + goto out; + + if (test_bit(BCH_FS_ERROR, &c->flags)) + goto out; - percpu_ref_put(&c->writes); + trace_fs_read_only(c); + + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: + * + * (This is really blocking new _allocations_, writes to previously + * allocated space can still happen until stopping the allocator in + * bch_dev_allocator_stop()). 
+ */ + percpu_ref_kill(&c->writes); del_timer(&c->foreground_write_wakeup); cancel_delayed_work(&c->pd_controllers_update); @@ -180,98 +183,77 @@ static void bch_fs_read_only_work(struct work_struct *work) c->foreground_write_pd.rate.rate = UINT_MAX; bch_wake_delayed_writes((unsigned long) c); - if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - /* - * If we're not doing an emergency shutdown, we want to wait on - * outstanding writes to complete so they don't see spurious - * errors due to shutting down the allocator: - */ - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + /* + * If we're not doing an emergency shutdown, we want to wait on + * outstanding writes to complete so they don't see spurious errors due + * to shutting down the allocator: + * + * If we are doing an emergency shutdown outstanding writes may + * hang until we shutdown the allocator so we don't want to wait + * on outstanding writes before shutting everything down - but + * we do need to wait on them before returning and signalling + * that going RO is complete: + */ + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || + test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); - __bch_fs_read_only(c); + __bch_fs_read_only(c); - if (!bch_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags)) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb, true); - bch_write_super(c); - mutex_unlock(&c->sb_lock); - } - } else { - /* - * If we are doing an emergency shutdown outstanding writes may - * hang until we shutdown the allocator so we don't want to wait - * on outstanding writes before shutting everything down - but - * we do need to wait on them before returning and signalling - * that going RO is complete: - */ - __bch_fs_read_only(c); + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + if (!bch_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags)) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb, true); + bch_write_super(c); + mutex_unlock(&c->sb_lock); } + c->state = BCH_FS_RO; bch_notify_fs_read_only(c); trace_fs_read_only_done(c); - - set_bit(BCH_FS_RO_COMPLETE, &c->flags); - wake_up(&bch_read_only_wait); +out: + mutex_unlock(&c->state_lock); } -bool bch_fs_read_only(struct cache_set *c) +static void bch_fs_read_only_work(struct work_struct *work) { - if (test_and_set_bit(BCH_FS_RO, &c->flags)) - return false; - - trace_fs_read_only(c); - - percpu_ref_get(&c->writes); + struct cache_set *c = + container_of(work, struct cache_set, read_only_work); - /* - * Block new foreground-end write operations from starting - any new - * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch_dev_allocator_stop()). 
- */ - percpu_ref_kill(&c->writes); + bch_fs_read_only(c); +} - queue_work(system_freezable_wq, &c->read_only_work); - return true; +static void bch_fs_read_only_async(struct cache_set *c) +{ + queue_work(system_long_wq, &c->read_only_work); } bool bch_fs_emergency_read_only(struct cache_set *c) { bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); - bch_fs_read_only(c); + bch_fs_read_only_async(c); bch_journal_halt(&c->journal); wake_up(&bch_read_only_wait); return ret; } -void bch_fs_read_only_sync(struct cache_set *c) -{ - /* so we don't race with bch_fs_read_write() */ - lockdep_assert_held(&bch_register_lock); - - bch_fs_read_only(c); - - wait_event(bch_read_only_wait, - test_bit(BCH_FS_RO_COMPLETE, &c->flags) && - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -} - -static const char *__bch_fs_read_write(struct cache_set *c) +const char *bch_fs_read_write(struct cache_set *c) { struct cache *ca; - const char *err; + const char *err = NULL; unsigned i; - lockdep_assert_held(&bch_register_lock); + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RO) + goto out; err = "error starting allocator thread"; for_each_cache(ca, c, i) @@ -285,67 +267,43 @@ static const char *__bch_fs_read_write(struct cache_set *c) if (bch_gc_thread_start(c)) goto err; - for_each_cache(ca, c, i) { - if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) - continue; - - err = "error starting moving GC thread"; - if (bch_moving_gc_thread_start(ca)) { + err = "error starting moving GC thread"; + for_each_cache(ca, c, i) + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && + bch_moving_gc_start(ca)) { percpu_ref_put(&ca->ref); goto err; } - } err = "error starting tiering thread"; - if (bch_tiering_read_start(c)) + if (bch_tiering_start(c)) goto err; schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); - return NULL; + if (c->state != BCH_FS_STARTING) + percpu_ref_reinit(&c->writes); + + c->state = BCH_FS_RW; + err = NULL; +out: + mutex_unlock(&c->state_lock); + return err; err: __bch_fs_read_only(c); - return err; -} - -const char *bch_fs_read_write(struct cache_set *c) -{ - const char *err; - - lockdep_assert_held(&bch_register_lock); - - if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags)) - return NULL; - - err = __bch_fs_read_write(c); - if (err) - return err; - - percpu_ref_reinit(&c->writes); - - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - clear_bit(BCH_FS_EMERGENCY_RO, &c->flags); - clear_bit(BCH_FS_RO_COMPLETE, &c->flags); - clear_bit(BCH_FS_RO, &c->flags); - return NULL; + goto out; } -/* Cache set startup/shutdown: */ +/* Filesystem startup/shutdown: */ static void bch_fs_free(struct cache_set *c) { - del_timer_sync(&c->foreground_write_wakeup); - cancel_delayed_work_sync(&c->pd_controllers_update); - cancel_work_sync(&c->read_only_work); - cancel_work_sync(&c->bio_submit_work); - cancel_work_sync(&c->read_retry_work); - - bch_fs_encryption_free(c); - bch_btree_cache_free(c); - bch_journal_free(&c->journal); + bch_fs_encryption_exit(c); + bch_fs_btree_exit(c); + bch_fs_journal_exit(&c->journal); bch_io_clock_exit(&c->io_clock[WRITE]); bch_io_clock_exit(&c->io_clock[READ]); - bch_compress_free(c); + bch_fs_compress_exit(c); bch_fs_blockdev_exit(c); bdi_destroy(&c->bdi); lg_lock_free(&c->bucket_stats_lock); @@ -372,6 +330,52 @@ static void bch_fs_free(struct cache_set *c) module_put(THIS_MODULE); } +static void bch_fs_exit(struct cache_set *c) +{ + unsigned i; + + del_timer_sync(&c->foreground_write_wakeup); + 
cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); + cancel_work_sync(&c->bio_submit_work); + cancel_work_sync(&c->read_retry_work); + + for (i = 0; i < c->sb.nr_devices; i++) + if (c->cache[i]) + bch_dev_free(c->cache[i]); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void bch_fs_offline(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + mutex_lock(&bch_register_lock); + list_del(&c->list); + mutex_unlock(&bch_register_lock); + + if (c->kobj.state_in_sysfs) + kobject_del(&c->kobj); + + for_each_cache(ca, c, i) + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + + bch_fs_debug_exit(c); + bch_fs_chardev_exit(c); + + bch_cache_accounting_destroy(&c->accounting); + + kobject_put(&c->time_stats); + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + + __bch_fs_read_only(c); +} + /* * should be __bch_fs_stop4 - block devices are closed, now we can finally * free it @@ -379,15 +383,9 @@ static void bch_fs_free(struct cache_set *c) void bch_fs_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - struct completion *stop_completion = c->stop_completion; bch_notify_fs_stopped(c); - bch_info(c, "stopped"); - bch_fs_free(c); - - if (stop_completion) - complete(stop_completion); } /* @@ -396,18 +394,8 @@ void bch_fs_release(struct kobject *kobj) static void __bch_fs_stop3(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); - struct cache *ca; - unsigned i; - mutex_lock(&bch_register_lock); - for_each_cache(ca, c, i) - bch_dev_stop(ca); - - list_del(&c->list); - mutex_unlock(&bch_register_lock); - - closure_debug_destroy(&c->cl); - kobject_put(&c->kobj); + bch_fs_exit(c); } /* @@ -418,28 +406,14 @@ static void __bch_fs_stop2(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - bch_debug_exit_cache_set(c); - bch_fs_chardev_exit(c); - - if (c->kobj.state_in_sysfs) - kobject_del(&c->kobj); - - bch_cache_accounting_destroy(&c->accounting); - - kobject_put(&c->time_stats); - kobject_put(&c->opts_dir); - kobject_put(&c->internal); - - mutex_lock(&bch_register_lock); - bch_fs_read_only_sync(c); - mutex_unlock(&bch_register_lock); + bch_fs_offline(c); closure_return(cl); } /* - * First phase of the shutdown process that's kicked off by bch_fs_stop(); we - * haven't waited for anything to stop yet, we're just punting to process + * First phase of the shutdown process that's kicked off by bch_fs_stop_async(); + * we haven't waited for anything to stop yet, we're just punting to process * context to shut down block devices: */ static void __bch_fs_stop1(struct closure *cl) @@ -451,29 +425,42 @@ static void __bch_fs_stop1(struct closure *cl) continue_at(cl, __bch_fs_stop2, system_wq); } -void bch_fs_stop(struct cache_set *c) +void bch_fs_stop_async(struct cache_set *c) { - if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags)) + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STOPPING) { + c->state = BCH_FS_STOPPING; closure_queue(&c->caching); + } + mutex_unlock(&c->state_lock); } -void bch_fs_stop_sync(struct cache_set *c) +void bch_fs_stop(struct cache_set *c) { - DECLARE_COMPLETION_ONSTACK(complete); + mutex_lock(&c->state_lock); + BUG_ON(c->state == BCH_FS_STOPPING); + c->state = BCH_FS_STOPPING; + mutex_unlock(&c->state_lock); + + bch_blockdevs_stop(c); + + closure_sync(&c->caching); + closure_debug_destroy(&c->caching); + + bch_fs_offline(c); - c->stop_completion = &complete; - bch_fs_stop(c); 
closure_put(&c->cl); + closure_sync(&c->cl); - /* Killable? */ - wait_for_completion(&complete); + bch_fs_exit(c); + kobject_put(&c->kobj); } /* Stop, detaching from backing devices: */ void bch_fs_detach(struct cache_set *c) { if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags)) - bch_fs_stop(c); + bch_fs_stop_async(c); } static unsigned bch_fs_nr_devices(struct cache_set *c) @@ -520,6 +507,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->minor = -1; + mutex_init(&c->state_lock); mutex_init(&c->sb_lock); INIT_RADIX_TREE(&c->devices, GFP_KERNEL); mutex_init(&c->btree_cache_lock); @@ -534,8 +522,8 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BCH_TIME_STATS() #undef BCH_TIME_STAT - bch_open_buckets_init(c); - bch_tiering_init_cache_set(c); + bch_fs_allocator_init(c); + bch_fs_tiering_init(c); INIT_LIST_HEAD(&c->list); INIT_LIST_HEAD(&c->cached_devs); @@ -636,10 +624,10 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch_fs_blockdev_init(c) || bch_io_clock_init(&c->io_clock[READ]) || bch_io_clock_init(&c->io_clock[WRITE]) || - bch_journal_alloc(&c->journal, journal_entry_bytes) || - bch_btree_cache_alloc(c) || + bch_fs_journal_init(&c->journal, journal_entry_bytes) || + bch_fs_btree_init(c) || bch_fs_encryption_init(c) || - bch_compress_init(c) || + bch_fs_compress_init(c) || bch_check_set_has_compressed_data(c, c->opts.compression)) goto err; @@ -664,6 +652,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) closure_init(&c->caching, &c->cl); set_closure_fn(&c->caching, __bch_fs_stop1, system_wq); + closure_get(&c->cl); continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq); return c; err: @@ -671,7 +660,20 @@ err: return NULL; } -static int bch_fs_online(struct cache_set *c) +static struct cache_set *bch_fs_lookup(uuid_le uuid) +{ + struct cache_set *c; + + lockdep_assert_held(&bch_register_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) + return c; + + return NULL; +} + +static const char *__bch_fs_online(struct cache_set *c) { struct cache *ca; unsigned i; @@ -680,31 +682,58 @@ static int bch_fs_online(struct cache_set *c) lockdep_assert_held(&bch_register_lock); if (!list_empty(&c->list)) - return 0; + return NULL; - list_add(&c->list, &bch_fs_list); + if (bch_fs_lookup(c->sb.uuid)) + return "filesystem UUID already open"; ret = bch_fs_chardev_init(c); if (ret) - return ret; + return "error creating character device"; + + bch_fs_debug_init(c); if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || kobject_add(&c->internal, &c->kobj, "internal") || kobject_add(&c->opts_dir, &c->kobj, "options") || kobject_add(&c->time_stats, &c->kobj, "time_stats") || bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) - return -1; + return "error creating sysfs objects"; for_each_cache(ca, c, i) if (bch_dev_online(ca)) { percpu_ref_put(&ca->ref); - return -1; + return "error creating sysfs objects"; } + mutex_lock(&c->state_lock); + + if (bch_blockdev_volumes_start(c)) { + mutex_unlock(&c->state_lock); + return "can't bring up blockdev volumes"; + } + + bch_attach_backing_devs(c); + + mutex_unlock(&c->state_lock); + + list_add(&c->list, &bch_fs_list); + return 0; } -static const char *bch_fs_start(struct cache_set *c) +static const char *bch_fs_online(struct cache_set *c) +{ + const char *err; + + mutex_lock(&bch_register_lock); + err = __bch_fs_online(c); + mutex_unlock(&bch_register_lock); 
+ + return err; +} + +static const char *__bch_fs_start(struct cache_set *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; @@ -715,11 +744,7 @@ static const char *bch_fs_start(struct cache_set *c) struct jset *j; int ret = -EINVAL; - lockdep_assert_held(&bch_register_lock); - BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags)); - - /* We don't want bch_fatal_error() to free underneath us */ - closure_get(&c->caching); + BUG_ON(c->state != BCH_FS_STARTING); /* * Make sure that each cache object's mi is up to date before @@ -826,6 +851,16 @@ static const char *bch_fs_start(struct cache_set *c) bch_notice(c, "initializing new filesystem"); + bch_initial_gc(c, NULL); + + err = "error starting allocator thread"; + for_each_cache(ca, c, i) + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && + bch_dev_allocator_start(ca)) { + percpu_ref_put(&ca->ref); + goto err; + } + err = "unable to allocate journal buckets"; for_each_cache(ca, c, i) if (bch_dev_journal_alloc(ca)) { @@ -833,8 +868,6 @@ static const char *bch_fs_start(struct cache_set *c) goto err; } - bch_initial_gc(c, NULL); - /* * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: @@ -842,14 +875,6 @@ static const char *bch_fs_start(struct cache_set *c) bch_journal_start(c); bch_journal_set_replay_done(&c->journal); - err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->ref); - goto err; - } - err = "cannot allocate new btree root"; for (id = 0; id < BTREE_ID_NR; id++) if (bch_btree_root_alloc(c, id, &cl)) { @@ -877,10 +902,14 @@ static const char *bch_fs_start(struct cache_set *c) goto err; } recovery_done: + err = "dynamic fault"; + if (bch_fs_init_fault("fs_start")) + goto err; + if (c->opts.read_only) { - bch_fs_read_only_sync(c); + bch_fs_read_only(c); } else { - err = __bch_fs_read_write(c); + err = bch_fs_read_write(c); if (err) goto err; } @@ -901,27 +930,9 @@ recovery_done: bch_write_super(c); mutex_unlock(&c->sb_lock); - err = "dynamic fault"; - if (bch_fs_init_fault("fs_start")) - goto err; - - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err; - - err = "can't bring up blockdev volumes"; - if (bch_blockdev_volumes_start(c)) - goto err; - - bch_debug_init_cache_set(c); - set_bit(BCH_FS_RUNNING, &c->flags); - bch_attach_backing_devs(c); - - bch_notify_fs_read_write(c); err = NULL; out: bch_journal_entries_free(&journal); - closure_put(&c->caching); return err; err: switch (ret) { @@ -955,6 +966,11 @@ err: goto out; } +const char *bch_fs_start(struct cache_set *c) +{ + return __bch_fs_start(c) ?: bch_fs_online(c); +} + static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c) { struct bch_sb_field_members *sb_mi; @@ -999,7 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c) return NULL; } -/* Cache device */ +/* Device startup/shutdown, ro/rw: */ bool bch_dev_read_only(struct cache *ca) { @@ -1009,14 +1025,14 @@ bool bch_dev_read_only(struct cache *ca) bdevname(ca->disk_sb.bdev, buf); - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) return false; if (!bch_dev_may_remove(ca)) { bch_err(c, "required member %s going RO, forcing fs RO", buf); - bch_fs_read_only_sync(c); + bch_fs_read_only(c); } trace_bcache_cache_read_only(ca); @@ -1053,7 +1069,7 @@ bool bch_dev_read_only(struct cache *ca) static 
const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) { - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) return NULL; @@ -1066,12 +1082,11 @@ static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) if (bch_dev_allocator_start(ca)) return "error starting allocator thread"; - if (bch_moving_gc_thread_start(ca)) + if (bch_moving_gc_start(ca)) return "error starting moving GC thread"; - bch_dev_group_add(&c->journal.devs, ca); - - wake_up_process(c->tiering_read); + if (bch_tiering_start(c)) + return "error starting tiering thread"; bch_notify_dev_read_write(ca); trace_bcache_cache_read_write_done(ca); @@ -1099,22 +1114,15 @@ const char *bch_dev_read_write(struct cache *ca) return NULL; } -/* - * bch_dev_stop has already returned, so we no longer hold the register - * lock at the point this is called. - */ - void bch_dev_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); - percpu_ref_exit(&ca->ref); kfree(ca); } -static void bch_dev_free_work(struct work_struct *work) +static void bch_dev_free(struct cache *ca) { - struct cache *ca = container_of(work, struct cache, free_work); struct cache_set *c = ca->set; unsigned i; @@ -1131,15 +1139,7 @@ static void bch_dev_free_work(struct work_struct *work) kobject_del(&ca->kobj); bch_free_super(&ca->disk_sb); - - /* - * bch_dev_stop can be called in the middle of initialization - * of the struct cache object. - * As such, not all the sub-structures may be initialized. - * However, they were zeroed when the object was allocated. - */ - - bch_journal_free_cache(ca); + bch_dev_journal_exit(ca); free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); free_percpu(ca->bucket_stats_percpu); @@ -1155,12 +1155,20 @@ static void bch_dev_free_work(struct work_struct *work) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); + percpu_ref_exit(&ca->ref); kobject_put(&ca->kobj); if (c) kobject_put(&c->kobj); } +static void bch_dev_free_work(struct work_struct *work) +{ + struct cache *ca = container_of(work, struct cache, free_work); + + bch_dev_free(ca); +} + static void bch_dev_percpu_ref_release(struct percpu_ref *ref) { struct cache *ca = container_of(ref, struct cache, ref); @@ -1193,12 +1201,10 @@ static void bch_dev_stop(struct cache *ca) { struct cache_set *c = ca->set; - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); - if (c) { - BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); - rcu_assign_pointer(c->cache[ca->dev_idx], NULL); - } + BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); + rcu_assign_pointer(c->cache[ca->dev_idx], NULL); call_rcu(&ca->free_rcu, bch_dev_free_rcu); } @@ -1281,7 +1287,8 @@ static void bch_dev_remove_work(struct work_struct *work) */ closure_get(&c->cl); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); + bch_dev_stop(ca); /* @@ -1290,8 +1297,6 @@ static void bch_dev_remove_work(struct work_struct *work) */ synchronize_rcu(); - lockdep_assert_held(&bch_register_lock); - /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: @@ -1301,23 +1306,20 @@ static void bch_dev_remove_work(struct work_struct *work) memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); bch_write_super(c); - mutex_unlock(&c->sb_lock); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); closure_put(&c->cl); } -bool 
bch_dev_remove(struct cache *ca, bool force) +static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force) { - mutex_lock(&bch_register_lock); - if (test_bit(BCH_DEV_REMOVING, &ca->flags)) return false; if (!bch_dev_may_remove(ca)) { - bch_err(ca->set, "Can't remove last device in tier %u", - ca->mi.tier); + bch_err(ca->set, "Can't remove last RW device"); bch_notify_dev_remove_failed(ca); return false; } @@ -1327,23 +1329,32 @@ bool bch_dev_remove(struct cache *ca, bool force) if (force) set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags); + set_bit(BCH_DEV_REMOVING, &ca->flags); bch_notify_dev_removing(ca); - mutex_unlock(&bch_register_lock); - /* Migrate the data and finish removal asynchronously: */ queue_work(system_long_wq, &ca->remove_work); return true; } +bool bch_dev_remove(struct cache *ca, bool force) +{ + struct cache_set *c = ca->set; + bool ret; + + mutex_lock(&c->state_lock); + ret = __bch_dev_remove(c, ca, force); + mutex_unlock(&c->state_lock); + + return ret; +} + static int bch_dev_online(struct cache *ca) { char buf[12]; - lockdep_assert_held(&bch_register_lock); - sprintf(buf, "cache%u", ca->dev_idx); if (kobject_add(&ca->kobj, @@ -1386,7 +1397,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, kobject_init(&ca->kobj, &bch_dev_ktype); spin_lock_init(&ca->self.lock); - ca->self.nr_devices = 1; + ca->self.nr = 1; rcu_assign_pointer(ca->self.d[0].dev, ca); ca->dev_idx = sb->sb->dev_idx; @@ -1395,10 +1406,11 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->prio_buckets_lock); mutex_init(&ca->heap_lock); - bch_moving_init_cache(ca); + bch_dev_moving_gc_init(ca); ca->disk_sb = *sb; - ca->disk_sb.bdev->bd_holder = ca; + if (sb->mode & FMODE_EXCL) + ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work); @@ -1444,7 +1456,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio)) || !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_journal_init_cache(ca)) + bch_dev_journal_init(ca)) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -1482,7 +1494,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, err = "error creating kobject"; if (c->kobj.state_in_sysfs && bch_dev_online(ca)) - goto err; + pr_warn("error creating sysfs objects"); if (ret) *ret = ca; @@ -1490,49 +1502,34 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, kobject_put(&ca->kobj); return NULL; err: - bch_dev_stop(ca); + bch_dev_free(ca); return err; } -static struct cache_set *bch_fs_lookup(uuid_le uuid) -{ - struct cache_set *c; - - lockdep_assert_held(&bch_register_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) - return c; - - return NULL; -} - int bch_dev_add(struct cache_set *c, const char *path) { struct bcache_superblock sb; const char *err; struct cache *ca; - struct bch_sb_field *f; struct bch_sb_field_members *mi, *dev_mi; struct bch_member saved_mi; unsigned dev_idx, nr_devices, u64s; int ret = -EINVAL; - mutex_lock(&bch_register_lock); - err = bch_read_super(&sb, c->opts, path); if (err) - goto err_unlock_register; + return -EINVAL; err = bch_validate_cache_super(&sb); if (err) - goto err_unlock_register; - - mutex_lock(&c->sb_lock); + return -EINVAL; err = bch_dev_may_add(sb.sb, c); if (err) - goto err_unlock; + return 
-EINVAL; + + mutex_lock(&c->state_lock); + mutex_lock(&c->sb_lock); /* * Preserve the old cache member information (esp. tier) @@ -1571,17 +1568,14 @@ have_slot: sizeof(struct bch_member) * nr_devices) / sizeof(u64); err = "no space in superblock for member info"; - f = bch_fs_sb_field_resize(c, &mi->field, u64s); - if (!f) + mi = bch_fs_sb_resize_members(c, u64s); + if (!mi) goto err_unlock; - mi = container_of(f, struct bch_sb_field_members, field); - - f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s); - if (!f) + dev_mi = bch_sb_resize_members(&sb, u64s); + if (!dev_mi) goto err_unlock; - dev_mi = container_of(f, struct bch_sb_field_members, field); memcpy(dev_mi, mi, u64s * sizeof(u64)); dev_mi->members[dev_idx] = saved_mi; @@ -1619,14 +1613,13 @@ have_slot: kobject_put(&ca->kobj); mutex_unlock(&c->sb_lock); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); return 0; err_put: bch_dev_stop(ca); err_unlock: mutex_unlock(&c->sb_lock); -err_unlock_register: - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); bch_free_super(&sb); bch_err(c, "Unable to add device: %s", err); @@ -1639,11 +1632,8 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, const char *err; struct cache_set *c = NULL; struct bcache_superblock *sb; - uuid_le uuid; unsigned i; - memset(&uuid, 0, sizeof(uuid_le)); - if (!nr_devices) return "need at least one device"; @@ -1655,60 +1645,49 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, if (!sb) goto err; - /* - * bch_read_super() needs to happen under register_lock, so that the - * exclusive open is atomic with adding the new cache set to the list of - * cache sets: - */ - mutex_lock(&bch_register_lock); - for (i = 0; i < nr_devices; i++) { err = bch_read_super(&sb[i], opts, devices[i]); if (err) - goto err_unlock; + goto err; err = "attempting to register backing device"; if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version))) - goto err_unlock; + goto err; err = bch_validate_cache_super(&sb[i]); if (err) - goto err_unlock; + goto err; } - err = "cache set already registered"; - if (bch_fs_lookup(sb->sb->uuid)) - goto err_unlock; - err = "cannot allocate memory"; c = bch_fs_alloc(sb[0].sb, opts); if (!c) - goto err_unlock; + goto err; for (i = 0; i < nr_devices; i++) { err = bch_dev_alloc(&sb[i], c, NULL); if (err) - goto err_unlock; + goto err; } err = "insufficient devices"; if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c)) - goto err_unlock; + goto err; - err = bch_fs_start(c); - if (err) - goto err_unlock; + if (!c->opts.nostart) { + err = __bch_fs_start(c); + if (err) + goto err; + } - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err_unlock; + err = bch_fs_online(c); + if (err) + goto err; - if (ret) { - closure_get(&c->cl); + if (ret) *ret = c; - } - - mutex_unlock(&bch_register_lock); + else + closure_put(&c->cl); err = NULL; out: @@ -1717,20 +1696,18 @@ out: if (err) c = NULL; return err; -err_unlock: +err: if (c) bch_fs_stop(c); - mutex_unlock(&bch_register_lock); -err: + for (i = 0; i < nr_devices; i++) bch_free_super(&sb[i]); goto out; } static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, - struct bch_opts opts) + struct bch_opts opts) { - char name[BDEVNAME_SIZE]; const char *err; struct cache_set *c; bool allocated_cache_set = false; @@ -1739,17 +1716,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) return err; - bdevname(sb->bdev, name); - + mutex_lock(&bch_register_lock); c = 
bch_fs_lookup(sb->sb->uuid); if (c) { + closure_get(&c->cl); + err = bch_dev_in_fs(sb->sb, c); if (err) - return err; + goto err; } else { c = bch_fs_alloc(sb->sb, opts); + err = "cannot allocate memory"; if (!c) - return "cannot allocate memory"; + goto err; allocated_cache_set = true; } @@ -1758,21 +1737,29 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) goto err; - if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) { - err = bch_fs_start(c); + if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) && + !c->opts.nostart) { + err = __bch_fs_start(c); if (err) goto err; - } else { - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err; } - bch_info(c, "started"); + err = __bch_fs_online(c); + if (err) + goto err; + + closure_put(&c->cl); + mutex_unlock(&bch_register_lock); + return NULL; err: + mutex_unlock(&bch_register_lock); + if (allocated_cache_set) bch_fs_stop(c); + else if (c) + closure_put(&c->cl); + return err; } @@ -1782,20 +1769,20 @@ const char *bch_fs_open_incremental(const char *path) struct bch_opts opts = bch_opts_empty(); const char *err; - mutex_lock(&bch_register_lock); - err = bch_read_super(&sb, opts, path); if (err) - goto err; + return err; - if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) + if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) { + mutex_lock(&bch_register_lock); err = bch_backing_dev_register(&sb); - else + mutex_unlock(&bch_register_lock); + } else { err = __bch_fs_open_incremental(&sb, opts); + } bch_free_super(&sb); -err: - mutex_unlock(&bch_register_lock); + return err; } @@ -1854,10 +1841,10 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) pr_info("Setting all devices read only:"); list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only(c); + bch_fs_read_only_async(c); list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only_sync(c); + bch_fs_read_only(c); mutex_unlock(&bch_register_lock); } @@ -1882,7 +1869,7 @@ kobj_attribute_write(reboot, reboot_test); static void bcache_exit(void) { bch_debug_exit(); - bch_fs_exit(); + bch_vfs_exit(); bch_blockdev_exit(); bch_chardev_exit(); if (bcache_kset) @@ -1917,7 +1904,7 @@ static int __init bcache_init(void) sysfs_create_files(&bcache_kset->kobj, files) || bch_chardev_init() || bch_blockdev_init() || - bch_fs_init() || + bch_vfs_init() || bch_debug_init()) goto err; diff --git a/libbcache/super.h b/libbcache/super.h index bcf7d983..bafd88e0 100644 --- a/libbcache/super.h +++ b/libbcache/super.h @@ -57,27 +57,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c, static inline bool bch_dev_may_remove(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; - - /* - * Right now, we can't remove the last device from a tier, - * - For tier 0, because all metadata lives in tier 0 and because - * there is no way to have foreground writes go directly to tier 1. - * - For tier 1, because the code doesn't completely support an - * empty tier 1. - */ - - /* - * Turning a device read-only removes it from the cache group, - * so there may only be one read-write device in a tier, and yet - * the device we are removing is in the same tier, so we have - * to check for identity. - * Removing the last RW device from a tier requires turning the - * whole cache set RO. 
- */ - - return tier->nr_devices != 1 || - rcu_access_pointer(tier->d[0].dev) != ca; + struct cache_group *grp = &c->cache_all; + + /* Can't remove the last RW device: */ + return grp->nr != 1 || + rcu_access_pointer(grp->d[0].dev) != ca; } void bch_dev_release(struct kobject *); @@ -89,15 +73,15 @@ int bch_dev_add(struct cache_set *, const char *); void bch_fs_detach(struct cache_set *); -bool bch_fs_read_only(struct cache_set *); bool bch_fs_emergency_read_only(struct cache_set *); -void bch_fs_read_only_sync(struct cache_set *); +void bch_fs_read_only(struct cache_set *); const char *bch_fs_read_write(struct cache_set *); void bch_fs_release(struct kobject *); +void bch_fs_stop_async(struct cache_set *); void bch_fs_stop(struct cache_set *); -void bch_fs_stop_sync(struct cache_set *); +const char *bch_fs_start(struct cache_set *); const char *bch_fs_open(char * const *, unsigned, struct bch_opts, struct cache_set **); const char *bch_fs_open_incremental(const char *path); diff --git a/libbcache/super_types.h b/libbcache/super_types.h index 41eaf0dd..69c747de 100644 --- a/libbcache/super_types.h +++ b/libbcache/super_types.h @@ -6,6 +6,7 @@ struct bcache_superblock { struct block_device *bdev; struct bio *bio; unsigned page_order; + fmode_t mode; }; #endif /* _BCACHE_SUPER_TYPES_H */ diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c index 9f45a6b0..48f9f1f6 100644 --- a/libbcache/sysfs.c +++ b/libbcache/sysfs.c @@ -22,6 +22,7 @@ #include "opts.h" #include "request.h" #include "super-io.h" +#include "tier.h" #include "writeback.h" #include <linux/blkdev.h> @@ -121,6 +122,8 @@ rw_attribute(cache_replacement_policy); rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); + +rw_attribute(tier); rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); @@ -134,7 +137,6 @@ rw_attribute(foreground_target_percent); rw_attribute(size); read_attribute(meta_replicas_have); read_attribute(data_replicas_have); -read_attribute(tier); #define BCH_DEBUG_PARAM(name, description) \ rw_attribute(name); @@ -680,7 +682,8 @@ SHOW(bch_fs) sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); sysfs_print(tiering_percent, c->tiering_percent); - sysfs_pd_controller_show(tiering, &c->tiering_pd); + + sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */ sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have); sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have); @@ -694,7 +697,7 @@ SHOW(bch_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_RUNNING, &c->flags)) + if (!bch_fs_running(c)) return -EPERM; if (attr == &sysfs_bset_tree_stats) @@ -723,7 +726,7 @@ STORE(__bch_fs) } if (attr == &sysfs_stop) { - bch_fs_stop(c); + bch_fs_stop_async(c); return size; } @@ -773,25 +776,18 @@ STORE(__bch_fs) ssize_t ret = strtoul_safe(buf, c->tiering_enabled) ?: (ssize_t) size; - if (c->tiering_read) - wake_up_process(c->tiering_read); + bch_tiering_start(c); /* issue wakeups */ return ret; } sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd); - if (attr == &sysfs_journal_flush) { - bch_journal_meta_async(&c->journal, NULL); - - return size; - } - sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); sysfs_strtoul(tiering_percent, c->tiering_percent); - sysfs_pd_controller_store(tiering, &c->tiering_pd); + sysfs_pd_controller_store(tiering, 
&c->tiers[1].pd); /* XXX */ /* Debugging: */ @@ -799,11 +795,14 @@ STORE(__bch_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_RUNNING, &c->flags)) + if (!bch_fs_running(c)) return -EPERM; - if (test_bit(BCH_FS_STOPPING, &c->flags)) - return -EINTR; + if (attr == &sysfs_journal_flush) { + bch_journal_meta_async(&c->journal, NULL); + + return size; + } if (attr == &sysfs_blockdev_volume_create) { u64 v = strtoi_h_or_return(buf); @@ -836,9 +835,9 @@ STORE(bch_fs) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); size = __bch_fs_store(kobj, attr, buf, size); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); if (attr == &sysfs_add_device) { char *path = kstrdup(buf, GFP_KERNEL); @@ -1273,6 +1272,31 @@ STORE(__bch_dev) mutex_unlock(&c->sb_lock); } + if (attr == &sysfs_tier) { + unsigned prev_tier; + unsigned v = strtoul_restrict_or_return(buf, + 0, BCH_TIER_MAX - 1); + + mutex_lock(&c->sb_lock); + prev_tier = ca->mi.tier; + + if (v == ca->mi.tier) { + mutex_unlock(&c->sb_lock); + return size; + } + + mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_TIER(mi, v); + bch_write_super(c); + + bch_dev_group_remove(&c->tiers[prev_tier].devs, ca); + bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca); + mutex_unlock(&c->sb_lock); + + bch_recalc_capacity(c); + bch_tiering_start(c); + } + if (attr == &sysfs_state_rw) { char name[BDEVNAME_SIZE]; const char *err = NULL; diff --git a/libbcache/tier.c b/libbcache/tier.c index 46864594..0ab17708 100644 --- a/libbcache/tier.c +++ b/libbcache/tier.c @@ -16,8 +16,7 @@ #include <trace/events/bcache.h> struct tiering_state { - struct cache_group *tier; - unsigned tier_idx; + struct bch_tier *tier; unsigned sectors; unsigned stripe_size; unsigned dev_idx; @@ -42,7 +41,7 @@ static bool tiering_pred(struct cache_set *c, mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) if (ptr->dev < mi->nr_devices && - mi->m[ptr->dev].tier >= s->tier_idx) + mi->m[ptr->dev].tier >= s->tier->idx) replicas++; cache_member_info_put(); @@ -69,15 +68,15 @@ static void tier_next_device(struct cache_set *c, struct tiering_state *s) s->sectors = 0; s->dev_idx++; - spin_lock(&s->tier->lock); - if (s->dev_idx >= s->tier->nr_devices) + spin_lock(&s->tier->devs.lock); + if (s->dev_idx >= s->tier->devs.nr) s->dev_idx = 0; - if (s->tier->nr_devices) { - s->ca = s->tier->d[s->dev_idx].dev; + if (s->tier->devs.nr) { + s->ca = s->tier->devs.d[s->dev_idx].dev; percpu_ref_get(&s->ca->ref); } - spin_unlock(&s->tier->lock); + spin_unlock(&s->tier->devs.lock); } } @@ -103,13 +102,13 @@ static int issue_tiering_move(struct cache_set *c, * tiering_next_cache - issue a move to write an extent to the next cache * device in round robin order */ -static s64 read_tiering(struct cache_set *c, struct cache_group *tier) +static s64 read_tiering(struct cache_set *c, struct bch_tier *tier) { struct moving_context ctxt; struct tiering_state s; struct btree_iter iter; struct bkey_s_c k; - unsigned nr_devices = READ_ONCE(tier->nr_devices); + unsigned nr_devices = READ_ONCE(tier->devs.nr); int ret; if (!nr_devices) @@ -119,10 +118,9 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier) memset(&s, 0, sizeof(s)); s.tier = tier; - s.tier_idx = tier - c->cache_tiers; s.stripe_size = 2048; /* 1 mb for now */ - bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate, + bch_move_ctxt_init(&ctxt, &tier->pd.rate, nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); 
bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); @@ -164,8 +162,8 @@ next: static int bch_tiering_thread(void *arg) { - struct cache_set *c = arg; - struct cache_group *tier = &c->cache_tiers[1]; + struct bch_tier *tier = arg; + struct cache_set *c = container_of(tier, struct cache_set, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct cache *ca; u64 tier_capacity, available_sectors; @@ -176,20 +174,20 @@ static int bch_tiering_thread(void *arg) while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - tier->nr_devices)) + tier->devs.nr)) break; while (1) { - struct cache_group *faster_tier; + struct bch_tier *faster_tier; last = atomic_long_read(&clock->now); tier_capacity = available_sectors = 0; rcu_read_lock(); - for (faster_tier = c->cache_tiers; + for (faster_tier = c->tiers; faster_tier != tier; faster_tier++) { - group_for_each_cache_rcu(ca, faster_tier, i) { + group_for_each_cache_rcu(ca, &faster_tier->devs, i) { tier_capacity += (ca->mi.nbuckets - ca->mi.first_bucket) << ca->bucket_bits; @@ -216,32 +214,73 @@ static int bch_tiering_thread(void *arg) return 0; } -void bch_tiering_init_cache_set(struct cache_set *c) +static void __bch_tiering_stop(struct bch_tier *tier) { - bch_pd_controller_init(&c->tiering_pd); + tier->pd.rate.rate = UINT_MAX; + bch_ratelimit_reset(&tier->pd.rate); + + if (tier->migrate) + kthread_stop(tier->migrate); + + tier->migrate = NULL; +} + +void bch_tiering_stop(struct cache_set *c) +{ + struct bch_tier *tier; + + for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) + __bch_tiering_stop(tier); +} + +static int __bch_tiering_start(struct bch_tier *tier) +{ + if (!tier->migrate) { + struct task_struct *p = + kthread_create(bch_tiering_thread, tier, + "bch_tier[%u]", tier->idx); + if (IS_ERR(p)) + return PTR_ERR(p); + + tier->migrate = p; + } + + wake_up_process(tier->migrate); + return 0; } -int bch_tiering_read_start(struct cache_set *c) +int bch_tiering_start(struct cache_set *c) { - struct task_struct *t; + struct bch_tier *tier; + bool have_faster_tier = false; if (c->opts.nochanges) return 0; - t = kthread_create(bch_tiering_thread, c, "bch_tier_read"); - if (IS_ERR(t)) - return PTR_ERR(t); + for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; - c->tiering_read = t; - wake_up_process(c->tiering_read); + if (have_faster_tier) { + int ret = __bch_tiering_start(tier); + if (ret) + return ret; + } else { + __bch_tiering_stop(tier); + } + + have_faster_tier = true; + } return 0; } -void bch_tiering_read_stop(struct cache_set *c) +void bch_fs_tiering_init(struct cache_set *c) { - if (!IS_ERR_OR_NULL(c->tiering_read)) { - kthread_stop(c->tiering_read); - c->tiering_read = NULL; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + c->tiers[i].idx = i; + bch_pd_controller_init(&c->tiers[i].pd); } } diff --git a/libbcache/tier.h b/libbcache/tier.h index 89c2bffd..b53e83d9 100644 --- a/libbcache/tier.h +++ b/libbcache/tier.h @@ -1,8 +1,8 @@ #ifndef _BCACHE_TIER_H #define _BCACHE_TIER_H -void bch_tiering_init_cache_set(struct cache_set *); -int bch_tiering_read_start(struct cache_set *); -void bch_tiering_read_stop(struct cache_set *); +void bch_tiering_stop(struct cache_set *); +int bch_tiering_start(struct cache_set *); +void bch_fs_tiering_init(struct cache_set *); #endif |
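
For reference, the bch_validate_journal_layout() introduced in super-io.c above sorts the superblock's journal bucket list and then validates it in one linear pass: reject a bucket at sector 0, any bucket outside [first_bucket, nbuckets), and duplicates among adjacent sorted entries. Below is a minimal standalone sketch of that same check, assuming plain libc types and names in place of the kernel helpers (qsort/u64_cmp stand in for the sort.h call); it is illustrative only, not the kernel code from the commit:

    /*
     * Simplified userspace restatement of the journal-layout validation:
     * copy and sort the bucket numbers, then bounds- and duplicate-check them.
     */
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static int u64_cmp(const void *l, const void *r)
    {
            uint64_t a = *(const uint64_t *) l, b = *(const uint64_t *) r;

            return a < b ? -1 : a > b ? 1 : 0;
    }

    const char *validate_journal_buckets(const uint64_t *buckets, unsigned nr,
                                         uint64_t first_bucket, uint64_t nbuckets)
    {
            const char *err = NULL;
            uint64_t *b;
            unsigned i;

            if (!nr)
                    return NULL;            /* no journal buckets is legal */

            b = malloc(nr * sizeof(*b));
            if (!b)
                    return "cannot allocate memory";

            memcpy(b, buckets, nr * sizeof(*b));
            qsort(b, nr, sizeof(*b), u64_cmp);

            if (!b[0])
                    err = "journal bucket at sector 0";
            else if (b[0] < first_bucket)
                    err = "journal bucket before first bucket";
            else if (b[nr - 1] >= nbuckets)
                    err = "journal bucket past end of device";
            else
                    for (i = 0; i + 1 < nr; i++)
                            if (b[i] == b[i + 1]) {
                                    err = "duplicate journal buckets";
                                    break;
                            }

            free(b);
            return err;
    }

Sorting first is what lets the range and duplicate checks run as a single scan over adjacent elements, which is also why the commit adds the <linux/sort.h> include to super-io.c.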