Diffstat (limited to 'libbcache/journal.c')
-rw-r--r--	libbcache/journal.c	583
1 file changed, 335 insertions, 248 deletions
diff --git a/libbcache/journal.c b/libbcache/journal.c
index 9e09b86..3bb9e3c 100644
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -18,7 +18,8 @@
 #include "io.h"
 #include "keylist.h"
 #include "journal.h"
-#include "super.h"
+#include "super-io.h"
+#include "vstructs.h"
 
 #include <trace/events/bcache.h>
 
@@ -52,19 +53,14 @@ static inline u64 journal_pin_seq(struct journal *j,
 	return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
 }
 
-#define for_each_jset_entry(entry, jset)				\
-	for (entry = (jset)->start;					\
-	     entry < bkey_idx(jset, le32_to_cpu((jset)->u64s));		\
-	     entry = jset_keys_next(entry))
-
 static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 					struct jset_entry *entry, unsigned type)
 {
-	while (entry < bkey_idx(jset, le32_to_cpu(jset->u64s))) {
+	while (entry < vstruct_last(jset)) {
 		if (JOURNAL_ENTRY_TYPE(entry) == type)
 			return entry;
 
-		entry = jset_keys_next(entry);
+		entry = vstruct_next(entry);
 	}
 
 	return NULL;
@@ -73,14 +69,11 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 #define for_each_jset_entry_type(entry, jset, type)			\
 	for (entry = (jset)->start;					\
 	     (entry = __jset_entry_type_next(jset, entry, type));	\
-	     entry = jset_keys_next(entry))
+	     entry = vstruct_next(entry))
 
 #define for_each_jset_key(k, _n, entry, jset)				\
 	for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS)	\
-		for (k = (entry)->start;				\
-		     (k < bkey_idx(entry, le16_to_cpu((entry)->u64s)) &&\
-		      (_n = bkey_next(k), 1));				\
-		     k = _n)
+		vstruct_for_each_safe(entry, k, _n)
 
 static inline void bch_journal_add_entry(struct journal_buf *buf,
 					 const void *data, size_t u64s,
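
Note: a minimal sketch of the stepping rule behind the vstruct helpers this diff switches to (assumed semantics, not the verbatim vstructs.h): a jset_entry is a fixed header followed by entry->u64s 64-bit words, so vstruct_next() reduces to pointer arithmetic on that count.

	static inline struct jset_entry *jset_entry_next_sketch(struct jset_entry *entry)
	{
		/* header plus le16 u64s payload words, in bytes */
		return (void *) entry + sizeof(*entry) +
			le16_to_cpu(entry->u64s) * sizeof(u64);
	}
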
@@ -199,8 +192,6 @@ redo_peek:
 
 		closure_sync(&cl);
 
-		mutex_lock(&c->btree_interior_update_lock);
-
 		for (i = 0;; i++) {
 			struct btree_interior_update *as;
 			struct pending_btree_node_free *d;
@@ -212,6 +203,8 @@ redo_peek:
 		}
 		n = bl->entries[i];
 		mutex_unlock(&j->blacklist_lock);
+redo_wait:
+		mutex_lock(&c->btree_interior_update_lock);
 
 		/*
 		 * Is the node on the list of pending interior node updates -
@@ -225,11 +218,11 @@ redo_peek:
 				closure_wait(&as->wait, &cl);
 				mutex_unlock(&c->btree_interior_update_lock);
 				closure_sync(&cl);
-				break;
+				goto redo_wait;
 			}
-		}
 
-		mutex_unlock(&c->btree_interior_update_lock);
+		mutex_unlock(&c->btree_interior_update_lock);
+	}
 
 	mutex_lock(&j->blacklist_lock);
 
@@ -377,7 +370,6 @@ out:
 struct journal_list {
 	struct closure		cl;
 	struct mutex		lock;
-	struct mutex		cache_set_buffer_lock;
 	struct list_head	*head;
 	int			ret;
 };
@@ -394,7 +386,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
 {
 	struct journal_replay *i, *pos;
 	struct list_head *where;
-	size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+	size_t bytes = vstruct_bytes(j);
 	__le64 last_seq;
 	int ret;
 
@@ -422,8 +414,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
 	list_for_each_entry_reverse(i, jlist->head, list) {
 		/* Duplicate? */
 		if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
-			fsck_err_on(bytes != __set_bytes(&i->j,
-						le32_to_cpu(i->j.u64s)) ||
+			fsck_err_on(bytes != vstruct_bytes(&i->j) ||
 				    memcmp(j, &i->j, bytes), c,
 				    "found duplicate but non identical journal entries (seq %llu)",
 				    le64_to_cpu(j->seq));
@@ -455,11 +446,21 @@ fsck_err:
 	return ret;
 }
 
+static struct nonce journal_nonce(const struct jset *jset)
+{
+	return (struct nonce) {{
+		[0] = 0,
+		[1] = ((__le32 *) &jset->seq)[0],
+		[2] = ((__le32 *) &jset->seq)[1],
+		[3] = BCH_NONCE_JOURNAL,
+	}};
+}
+
 static void journal_entry_null_range(void *start, void *end)
 {
 	struct jset_entry *entry;
 
-	for (entry = start; entry != end; entry = jset_keys_next(entry)) {
+	for (entry = start; entry != end; entry = vstruct_next(entry)) {
 		entry->u64s = 0;
 		entry->btree_id = 0;
 		entry->level = 0;
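
Note: journal_nonce() packs the 64-bit journal sequence number into words 1-2 of the nonce and tags word 3 with BCH_NONCE_JOURNAL, so every journal entry is encrypted under a distinct nonce (seq never repeats). A worked example with a made-up sequence number:

	/* hypothetical: jset->seq == cpu_to_le64(0x0123456789abcdef)
	 *
	 *   d[0] = 0x00000000
	 *   d[1] = 0x89abcdef	(low 32 bits of seq, as stored on disk)
	 *   d[2] = 0x01234567	(high 32 bits of seq)
	 *   d[3] = BCH_NONCE_JOURNAL
	 */
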
@@ -473,7 +474,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 				struct bkey_i *k, enum bkey_type key_type,
 				const char *type)
 {
-	void *next = jset_keys_next(entry);
+	void *next = vstruct_next(entry);
 	const char *invalid;
 	char buf[160];
 	int ret = 0;
@@ -481,16 +482,16 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 	if (mustfix_fsck_err_on(!k->k.u64s, c,
 			"invalid %s in journal: k->u64s 0", type)) {
 		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
-		journal_entry_null_range(jset_keys_next(entry), next);
+		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
 
 	if (mustfix_fsck_err_on((void *) bkey_next(k) >
-				(void *) jset_keys_next(entry), c,
+				(void *) vstruct_next(entry), c,
 			"invalid %s in journal: extends past end of journal entry",
 			type)) {
 		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
-		journal_entry_null_range(jset_keys_next(entry), next);
+		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
 
@@ -499,7 +500,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 			type, k->k.format)) {
 		le16_add_cpu(&entry->u64s, -k->k.u64s);
 		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
-		journal_entry_null_range(jset_keys_next(entry), next);
+		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
 
@@ -514,7 +515,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 
 		le16_add_cpu(&entry->u64s, -k->k.u64s);
 		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
-		journal_entry_null_range(jset_keys_next(entry), next);
+		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
 fsck_err:
@@ -525,16 +526,17 @@ fsck_err:
 #define JOURNAL_ENTRY_NONE	6
 #define JOURNAL_ENTRY_BAD	7
 
-static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 sector,
+static int journal_entry_validate(struct cache_set *c,
+				  struct jset *j, u64 sector,
 				  unsigned bucket_sectors_left,
 				  unsigned sectors_read)
 {
 	struct jset_entry *entry;
-	size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
-	u64 got, expect;
+	size_t bytes = vstruct_bytes(j);
+	struct bch_csum csum;
 	int ret = 0;
 
-	if (le64_to_cpu(j->magic) != jset_magic(&c->disk_sb))
+	if (le64_to_cpu(j->magic) != jset_magic(c))
 		return JOURNAL_ENTRY_NONE;
 
 	if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
@@ -554,25 +556,32 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 	if (bytes > sectors_read << 9)
 		return JOURNAL_ENTRY_REREAD;
 
-	got = le64_to_cpu(j->csum);
-	expect = __csum_set(j, le32_to_cpu(j->u64s), JSET_CSUM_TYPE(j));
-	if (mustfix_fsck_err_on(got != expect, c,
-			"journal checksum bad (got %llu expect %llu), sector %lluu",
-			got, expect, sector)) {
+	if (fsck_err_on(!bch_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+			"journal entry with unknown csum type %llu sector %lluu",
+			JSET_CSUM_TYPE(j), sector))
+		return JOURNAL_ENTRY_BAD;
+
+	csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+	if (mustfix_fsck_err_on(bch_crc_cmp(csum, j->csum), c,
+			"journal checksum bad, sector %llu", sector)) {
 		/* XXX: retry IO, when we start retrying checksum errors */
 		/* XXX: note we might have missing journal entries */
 		return JOURNAL_ENTRY_BAD;
 	}
 
-	if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq),
-				c, "invalid journal entry: last_seq > seq"))
+	bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+		    j->encrypted_start,
+		    vstruct_end(j) - (void *) j->encrypted_start);
+
+	if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+			"invalid journal entry: last_seq > seq"))
 		j->last_seq = j->seq;
 
-	for_each_jset_entry(entry, j) {
+	vstruct_for_each(j, entry) {
 		struct bkey_i *k;
 
-		if (mustfix_fsck_err_on(jset_keys_next(entry) >
-					bkey_idx(j, le32_to_cpu(j->u64s)), c,
+		if (mustfix_fsck_err_on(vstruct_next(entry) >
+					vstruct_last(j), c,
 				"journal entry extents past end of jset")) {
 			j->u64s = cpu_to_le64((u64 *) entry - j->_data);
 			break;
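
Note on the ordering just above: the checksum is verified over the still-encrypted payload, and only then does bch_encrypt() run in place to decrypt it. One call can serve both directions if the cipher is a stream cipher (ChaCha20 elsewhere in this codebase, assumed here): applying the keystream twice with the same key and nonce is a no-op. A toy stand-in:

	/* illustrative only - an XOR stream applied twice restores the buffer */
	static void xor_stream_sketch(u8 *buf, const u8 *keystream, size_t len)
	{
		size_t i;

		for (i = 0; i < len; i++)
			buf[i] ^= keystream[i];
	}
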
@@ -580,9 +589,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 
 		switch (JOURNAL_ENTRY_TYPE(entry)) {
 		case JOURNAL_ENTRY_BTREE_KEYS:
-			for (k = entry->start;
-			     k < bkey_idx(entry, le16_to_cpu(entry->u64s));
-			     k = bkey_next(k)) {
+			vstruct_for_each(entry, k) {
 				ret = journal_validate_key(c, j, entry, k,
 						bkey_type(entry->level,
 							  entry->btree_id),
@@ -599,7 +606,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 					le16_to_cpu(entry->u64s) != k->k.u64s, c,
 					"invalid btree root journal entry: wrong number of keys")) {
 				journal_entry_null_range(entry,
-						jset_keys_next(entry));
+						vstruct_next(entry));
 				continue;
 			}
 
@@ -616,14 +623,14 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 			if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
 					"invalid journal seq blacklist entry: bad size")) {
 				journal_entry_null_range(entry,
-						jset_keys_next(entry));
+						vstruct_next(entry));
 			}
 			break;
 		default:
 			mustfix_fsck_err(c, "invalid journal entry type %llu",
 					 JOURNAL_ENTRY_TYPE(entry));
-			journal_entry_null_range(entry, jset_keys_next(entry));
+			journal_entry_null_range(entry, vstruct_next(entry));
 			break;
 		}
 	}
@@ -632,126 +639,127 @@ fsck_err:
 	return ret;
 }
 
-static int journal_read_bucket(struct cache *ca, struct journal_list *jlist,
+struct journal_read_buf {
+	void		*data;
+	size_t		size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+				    size_t new_size)
+{
+	void *n;
+
+	new_size = roundup_pow_of_two(new_size);
+	n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
+	if (!n)
+		return -ENOMEM;
+
+	free_pages((unsigned long) b->data, get_order(b->size));
+	b->data = n;
+	b->size = new_size;
+	return 0;
+}
+
+static int journal_read_bucket(struct cache *ca,
+			       struct journal_read_buf *buf,
+			       struct journal_list *jlist,
 			       unsigned bucket, u64 *seq, bool *entries_found)
 {
 	struct cache_set *c = ca->set;
 	struct journal_device *ja = &ca->journal;
 	struct bio *bio = ja->bio;
-	struct jset *j, *data;
-	unsigned blocks, sectors_read, bucket_offset = 0;
-	unsigned max_entry_sectors = c->journal.entry_size_max >> 9;
-	u64 sector = bucket_to_sector(ca,
-			journal_bucket(ca->disk_sb.sb, bucket));
+	struct jset *j = NULL;
+	unsigned sectors, sectors_read = 0;
+	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+	    end = offset + ca->mi.bucket_size;
 	bool saw_bad = false;
 	int ret = 0;
 
-	data = (void *) __get_free_pages(GFP_KERNEL,
-				get_order(c->journal.entry_size_max));
-	if (!data) {
-		mutex_lock(&jlist->cache_set_buffer_lock);
-		data = c->journal.buf[0].data;
-	}
-
 	pr_debug("reading %u", bucket);
 
-	while (bucket_offset < ca->mi.bucket_size) {
-reread:
-		sectors_read = min_t(unsigned,
-			ca->mi.bucket_size - bucket_offset,
-			max_entry_sectors);
+	while (offset < end) {
+		if (!sectors_read) {
+reread:			sectors_read = min_t(unsigned,
+				end - offset, buf->size >> 9);
 
-		bio_reset(bio);
-		bio->bi_bdev = ca->disk_sb.bdev;
-		bio->bi_iter.bi_sector = sector + bucket_offset;
-		bio->bi_iter.bi_size = sectors_read << 9;
-		bio_set_op_attrs(bio, REQ_OP_READ, 0);
-		bch_bio_map(bio, data);
-
-		ret = submit_bio_wait(bio);
-
-		if (cache_fatal_io_err_on(ret, ca,
-					"journal read from sector %llu",
-					sector + bucket_offset) ||
-		    bch_meta_read_fault("journal")) {
-			ret = -EIO;
-			goto err;
-		}
+			bio_reset(bio);
+			bio->bi_bdev = ca->disk_sb.bdev;
+			bio->bi_iter.bi_sector = offset;
+			bio->bi_iter.bi_size = sectors_read << 9;
+			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			bch_bio_map(bio, buf->data);
 
-		/* This function could be simpler now since we no longer write
-		 * journal entries that overlap bucket boundaries; this means
-		 * the start of a bucket will always have a valid journal entry
-		 * if it has any journal entries at all.
-		 */
+			ret = submit_bio_wait(bio);
 
-		j = data;
-		while (sectors_read) {
-			ret = journal_entry_validate(c, j,
-					sector + bucket_offset,
-					ca->mi.bucket_size - bucket_offset,
-					sectors_read);
-			switch (ret) {
-			case BCH_FSCK_OK:
-				break;
-			case JOURNAL_ENTRY_REREAD:
-				goto reread;
-			case JOURNAL_ENTRY_NONE:
-				if (!saw_bad)
-					goto out;
-				blocks = 1;
-				goto next_block;
-			case JOURNAL_ENTRY_BAD:
-				saw_bad = true;
-				blocks = 1;
-				goto next_block;
-			default:
-				goto err;
-			}
+			if (cache_fatal_io_err_on(ret, ca,
+						  "journal read from sector %llu",
+						  offset) ||
+			    bch_meta_read_fault("journal"))
+				return -EIO;
 
-			/*
-			 * This happens sometimes if we don't have discards on -
-			 * when we've partially overwritten a bucket with new
-			 * journal entries. We don't need the rest of the
-			 * bucket:
-			 */
-			if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
-				goto out;
-
-			ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
-			ret = journal_entry_add(c, jlist, j);
-			switch (ret) {
-			case JOURNAL_ENTRY_ADD_OK:
-				*entries_found = true;
-				break;
-			case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
-				break;
-			default:
-				goto err;
+			j = buf->data;
+		}
+
+		ret = journal_entry_validate(c, j, offset,
+					end - offset, sectors_read);
+		switch (ret) {
+		case BCH_FSCK_OK:
+			break;
+		case JOURNAL_ENTRY_REREAD:
+			if (vstruct_bytes(j) > buf->size) {
+				ret = journal_read_buf_realloc(buf,
+							vstruct_bytes(j));
+				if (ret)
+					return ret;
 			}
+			goto reread;
+		case JOURNAL_ENTRY_NONE:
+			if (!saw_bad)
+				return 0;
+			sectors = c->sb.block_size;
+			goto next_block;
+		case JOURNAL_ENTRY_BAD:
+			saw_bad = true;
+			sectors = c->sb.block_size;
+			goto next_block;
+		default:
+			return ret;
+		}
 
-			if (le64_to_cpu(j->seq) > *seq)
-				*seq = le64_to_cpu(j->seq);
-next_block:
-			blocks = __set_blocks(j, le32_to_cpu(j->u64s),
-					      block_bytes(c));
+		/*
+		 * This happens sometimes if we don't have discards on -
+		 * when we've partially overwritten a bucket with new
+		 * journal entries. We don't need the rest of the
+		 * bucket:
+		 */
+		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+			return 0;
+
+		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
-			pr_debug("next");
-			bucket_offset += blocks * c->sb.block_size;
-			sectors_read -= blocks * c->sb.block_size;
-			j = ((void *) j) + blocks * block_bytes(c);
+		ret = journal_entry_add(c, jlist, j);
+		switch (ret) {
+		case JOURNAL_ENTRY_ADD_OK:
+			*entries_found = true;
+			break;
+		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+			break;
+		default:
+			return ret;
 		}
+
+		if (le64_to_cpu(j->seq) > *seq)
+			*seq = le64_to_cpu(j->seq);
+
+		sectors = vstruct_sectors(j, c->block_bits);
+next_block:
+		pr_debug("next");
+		offset += sectors;
+		sectors_read -= sectors;
+		j = ((void *) j) + (sectors << 9);
 	}
-out:
-	ret = 0;
-err:
-	if (data == c->journal.buf[0].data)
-		mutex_unlock(&jlist->cache_set_buffer_lock);
-	else
-		free_pages((unsigned long) data,
-			   get_order(c->journal.entry_size_max));
 
-	return ret;
+	return 0;
 }
 
 static void bch_journal_read_device(struct closure *cl)
@@ -759,15 +767,11 @@ static void bch_journal_read_device(struct closure *cl)
 #define read_bucket(b)							\
 	({								\
 		bool entries_found = false;				\
-		int ret = journal_read_bucket(ca, jlist, b,		\
-					      &seq, &entries_found);	\
+		ret = journal_read_bucket(ca, &buf, jlist, b, &seq,	\
+					  &entries_found);		\
+		if (ret)						\
+			goto err;					\
 		__set_bit(b, bitmap);					\
-		if (ret) {						\
-			mutex_lock(&jlist->lock);			\
-			jlist->ret = ret;				\
-			mutex_unlock(&jlist->lock);			\
-			closure_return(cl);				\
-		}							\
 		entries_found;						\
 	})
 
@@ -777,24 +781,29 @@ static void bch_journal_read_device(struct closure *cl)
 	struct journal_list *jlist =
 		container_of(cl->parent, struct journal_list, cl);
 	struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+	struct journal_read_buf buf = { NULL, 0 };
 
-	unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
-	DECLARE_BITMAP(bitmap, nr_buckets);
+	DECLARE_BITMAP(bitmap, ja->nr);
 	unsigned i, l, r;
 	u64 seq = 0;
+	int ret;
 
-	if (!nr_buckets)
-		closure_return(cl);
+	if (!ja->nr)
+		goto out;
+
+	bitmap_zero(bitmap, ja->nr);
+	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+	if (ret)
+		goto err;
 
-	bitmap_zero(bitmap, nr_buckets);
-	pr_debug("%u journal buckets", nr_buckets);
+	pr_debug("%u journal buckets", ja->nr);
 
 	/*
 	 * If the device supports discard but not secure discard, we can't do
 	 * the fancy fibonacci hash/binary search because the live journal
 	 * entries might not form a contiguous range:
 	 */
-	for (i = 0; i < nr_buckets; i++)
+	for (i = 0; i < ja->nr; i++)
 		read_bucket(i);
 	goto search_done;
 
@@ -805,8 +814,8 @@ static void bch_journal_read_device(struct closure *cl)
 	 * Read journal buckets ordered by golden ratio hash to quickly
 	 * find a sequence of buckets with valid journal entries
 	 */
-	for (i = 0; i < nr_buckets; i++) {
-		l = (i * 2654435769U) % nr_buckets;
+	for (i = 0; i < ja->nr; i++) {
+		l = (i * 2654435769U) % ja->nr;
 
 		if (test_bit(l, bitmap))
 			break;
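
Note: 2654435769U is 2^32 divided by the golden ratio, making the loop above a Fibonacci (multiplicative) hash: successive values of i scatter quasi-uniformly over the ja->nr buckets, so a region holding live journal entries is found in few probes. The probe order as a standalone sketch (helper name is ours, not the patch's):

	static unsigned journal_probe_bucket_sketch(unsigned i, unsigned nr)
	{
		/* 2654435769 == 2^32 / 1.6180339... */
		return (i * 2654435769U) % nr;
	}
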
@@ -821,18 +830,18 @@ static void bch_journal_read_device(struct closure *cl)
 	 */
 	pr_debug("falling back to linear search");
 linear_scan:
-	for (l = find_first_zero_bit(bitmap, nr_buckets);
-	     l < nr_buckets;
-	     l = find_next_zero_bit(bitmap, nr_buckets, l + 1))
+	for (l = find_first_zero_bit(bitmap, ja->nr);
+	     l < ja->nr;
+	     l = find_next_zero_bit(bitmap, ja->nr, l + 1))
 		if (read_bucket(l))
 			goto bsearch;
 
 	/* no journal entries on this device? */
-	if (l == nr_buckets)
-		closure_return(cl);
+	if (l == ja->nr)
+		goto out;
 bsearch:
 	/* Binary search */
-	r = find_next_bit(bitmap, nr_buckets, l + 1);
+	r = find_next_bit(bitmap, ja->nr, l + 1);
 	pr_debug("starting binary search, l %u r %u", l, r);
 
 	while (l + 1 < r) {
@@ -858,9 +867,9 @@ search_done:
 	 */
 	seq = 0;
 
-	for (i = 0; i < nr_buckets; i++)
+	for (i = 0; i < ja->nr; i++)
 		if (ja->bucket_seq[i] >= seq &&
-		    ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % nr_buckets]) {
+		    ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
 			/*
 			 * When journal_next_bucket() goes to allocate for
 			 * the first time, it'll use the bucket after
@@ -875,20 +884,26 @@ search_done:
 	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
 	 * pinned when it first runs:
 	 */
-	ja->last_idx = (ja->cur_idx + 1) % nr_buckets;
+	ja->last_idx = (ja->cur_idx + 1) % ja->nr;
 
 	/*
 	 * Read buckets in reverse order until we stop finding more journal
 	 * entries:
 	 */
-	for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets;
+	for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
 	     i != ja->cur_idx;
-	     i = (i + nr_buckets - 1) % nr_buckets)
+	     i = (i + ja->nr - 1) % ja->nr)
 		if (!test_bit(i, bitmap) &&
 		    !read_bucket(i))
 			break;
-
+out:
+	free_pages((unsigned long) buf.data, get_order(buf.size));
 	closure_return(cl);
+err:
+	mutex_lock(&jlist->lock);
+	jlist->ret = ret;
+	mutex_unlock(&jlist->lock);
+	goto out;
 #undef read_bucket
 }
 
@@ -930,6 +945,19 @@ static int journal_seq_blacklist_read(struct journal *j,
 	return 0;
 }
 
+static inline bool journal_has_keys(struct list_head *list)
+{
+	struct journal_replay *i;
+	struct jset_entry *entry;
+	struct bkey_i *k, *_n;
+
+	list_for_each_entry(i, list, list)
+		for_each_jset_key(k, _n, entry, &i->j)
+			return true;
+
+	return false;
+}
+
 int bch_journal_read(struct cache_set *c, struct list_head *list)
 {
 	struct jset_entry *prio_ptrs;
@@ -944,7 +972,6 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 
 	closure_init_stack(&jlist.cl);
 	mutex_init(&jlist.lock);
-	mutex_init(&jlist.cache_set_buffer_lock);
 	jlist.head = list;
 	jlist.ret = 0;
 
@@ -964,6 +991,9 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 		return BCH_FSCK_REPAIR_IMPOSSIBLE;
 	}
 
+	fsck_err_on(c->sb.clean && journal_has_keys(list), c,
+		    "filesystem marked clean but journal has keys to replay");
+
 	j = &list_entry(list->prev, struct journal_replay, list)->j;
 
 	unfixable_fsck_err_on(le64_to_cpu(j->seq) -
@@ -1057,7 +1087,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 			struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
 
 			if (btree_type_has_ptrs(type))
-				__bch_btree_mark_key(c, type, k_s_c);
+				bch_btree_mark_key_initial(c, type, k_s_c);
 		}
 }
 
@@ -1171,10 +1201,9 @@ static enum {
 	buf->data->last_seq	= cpu_to_le64(last_seq(j));
 
 	j->prev_buf_sectors =
-		__set_blocks(buf->data,
-			     le32_to_cpu(buf->data->u64s) +
-			     journal_entry_u64s_reserve(buf),
-			     block_bytes(c)) * c->sb.block_size;
+		vstruct_blocks_plus(buf->data, c->block_bits,
+				    journal_entry_u64s_reserve(buf)) *
+		c->sb.block_size;
 
 	BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
 
@@ -1219,9 +1248,8 @@ static unsigned journal_dev_buckets_available(struct journal *j,
 					      struct cache *ca)
 {
 	struct journal_device *ja = &ca->journal;
-	unsigned nr = bch_nr_journal_buckets(ca->disk_sb.sb);
-	unsigned next = (ja->cur_idx + 1) % nr;
-	unsigned available = (ja->last_idx + nr - next) % nr;
+	unsigned next = (ja->cur_idx + 1) % ja->nr;
+	unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
 
 	/*
 	 * Hack to avoid a deadlock during journal replay:
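
Note: a worked example of the ring arithmetic in journal_dev_buckets_available(), with made-up values: if ja->nr == 8, ja->cur_idx == 6 and ja->last_idx == 2, then next == (6 + 1) % 8 == 7 and available == (2 + 8 - 7) % 8 == 3, i.e. three buckets can still be written before the writer catches up with the reclaim tail.
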
@@ -1271,7 +1299,7 @@ static int journal_entry_sectors(struct journal *j)
 		 * for the previous entry we have to make sure we have space for
 		 * it too:
 		 */
-		if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) {
+		if (bch_extent_has_device(e.c, ca->dev_idx)) {
 			if (j->prev_buf_sectors > ca->journal.sectors_free)
 				buckets_required++;
 
@@ -1479,17 +1507,28 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list)
 		entries++;
 	}
 
+	if (keys) {
+		bch_btree_flush(c);
+
+		/*
+		 * Write a new journal entry _before_ we start journalling new data -
+		 * otherwise, we could end up with btree node bsets with journal seqs
+		 * arbitrarily far in the future vs. the most recently written journal
+		 * entry on disk, if we crash before writing the next journal entry:
+		 */
+		ret = bch_journal_meta(&c->journal);
+		if (ret)
+			goto err;
+	}
+
 	bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
 		 keys, entries, (u64) atomic64_read(&j->seq));
 
-	fsck_err_on(c->sb.clean && keys, c,
-		    "filesystem marked clean, but journal had keys to replay");
-
 	bch_journal_set_replay_done(&c->journal);
 err:
 	if (ret)
 		bch_err(c, "journal replay error: %d", ret);
-fsck_err:
+
 	bch_journal_entries_free(list);
 
 	return ret;
@@ -1497,28 +1536,40 @@ fsck_err:
 
 static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
 {
-	unsigned u64s = bch_journal_buckets_offset(ca->disk_sb.sb) + nr;
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal *journal_buckets =
+		bch_sb_get_journal(ca->disk_sb.sb);
+	struct bch_sb_field *f;
 	u64 *p;
-	int ret;
 
-	ret = bch_super_realloc(&ca->disk_sb, u64s);
-	if (ret)
-		return ret;
+	p = krealloc(ja->bucket_seq, nr * sizeof(u64),
+		     GFP_KERNEL|__GFP_ZERO);
+	if (!p)
+		return -ENOMEM;
+
+	ja->bucket_seq = p;
 
-	p = krealloc(ca->journal.bucket_seq,
-		     nr * sizeof(u64),
+	p = krealloc(ja->buckets, nr * sizeof(u64),
 		     GFP_KERNEL|__GFP_ZERO);
 	if (!p)
 		return -ENOMEM;
 
-	ca->journal.bucket_seq = p;
-	ca->disk_sb.sb->u64s = cpu_to_le16(u64s);
+	ja->buckets = p;
+
+	f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
+				    sizeof(*journal_buckets) / sizeof(u64));
+	if (!f)
+		return -ENOMEM;
+
+	f->type = BCH_SB_FIELD_journal;
+	ja->nr = nr;
 
 	return 0;
 }
 
 int bch_cache_journal_alloc(struct cache *ca)
 {
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal *journal_buckets;
 	int ret;
 	unsigned i;
 
@@ -1540,11 +1591,15 @@ int bch_cache_journal_alloc(struct cache *ca)
 	if (ret)
 		return ret;
 
-	for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) {
-		unsigned long r = ca->mi.first_bucket + i;
+	journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
+
+	for (i = 0; i < ja->nr; i++) {
+		u64 bucket = ca->mi.first_bucket + i;
 
-		bch_mark_metadata_bucket(ca, &ca->buckets[r], true);
-		set_journal_bucket(ca->disk_sb.sb, i, r);
+		ja->buckets[i] = bucket;
+		journal_buckets->buckets[i] = cpu_to_le64(bucket);
+
+		bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
 	}
 
 	return 0;
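
Note: bch_cache_journal_alloc() now keeps two copies of the bucket list in step: ja->buckets holds native-endian values for runtime use, while journal_buckets->buckets is the little-endian superblock copy that goes to disk; bch_journal_init_cache(), added at the bottom of this diff, rebuilds the former from the latter when a device is brought up.
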
@@ -1749,7 +1804,7 @@ static void journal_reclaim_work(struct work_struct *work)
 	struct cache *ca;
 	struct journal_entry_pin *pin;
 	u64 seq_to_flush = 0;
-	unsigned iter, nr, bucket_to_flush;
+	unsigned iter, bucket_to_flush;
 	unsigned long next_flush;
 	bool reclaim_lock_held = false, need_flush;
 
@@ -1781,13 +1836,11 @@ static void journal_reclaim_work(struct work_struct *work)
 		    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
 			blkdev_issue_discard(ca->disk_sb.bdev,
 					     bucket_to_sector(ca,
-						journal_bucket(ca->disk_sb.sb,
-							       ja->last_idx)),
+						ja->buckets[ja->last_idx]),
 					     ca->mi.bucket_size, GFP_NOIO, 0);
 
 		spin_lock(&j->lock);
-		ja->last_idx = (ja->last_idx + 1) %
-			bch_nr_journal_buckets(ca->disk_sb.sb);
+		ja->last_idx = (ja->last_idx + 1) % ja->nr;
 		spin_unlock(&j->lock);
 
 		wake_up(&j->wait);
@@ -1798,8 +1851,7 @@ static void journal_reclaim_work(struct work_struct *work)
 		 * buckets
 		 */
 		spin_lock(&j->lock);
-		nr = bch_nr_journal_buckets(ca->disk_sb.sb),
-		bucket_to_flush = (ja->cur_idx + (nr >> 1)) % nr;
+		bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
 		seq_to_flush = max_t(u64, seq_to_flush,
 				     ja->bucket_seq[bucket_to_flush]);
 		spin_unlock(&j->lock);
@@ -1861,7 +1913,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 	 */
 	extent_for_each_ptr_backwards(e, ptr)
 		if (!(ca = PTR_CACHE(c, ptr)) ||
-		    ca->mi.state != CACHE_ACTIVE ||
+		    ca->mi.state != BCH_MEMBER_STATE_ACTIVE ||
 		    ca->journal.sectors_free <= sectors)
 			__bch_extent_drop_ptr(e, ptr);
 		else
@@ -1875,7 +1927,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 	 */
 	group_for_each_cache_rcu(ca, &j->devs, iter) {
 		struct journal_device *ja = &ca->journal;
-		unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
 
 		if (replicas >= replicas_want)
 			break;
@@ -1884,21 +1935,20 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 		 * Check that we can use this device, and aren't already using
 		 * it:
 		 */
-		if (bch_extent_has_device(e.c, ca->sb.nr_this_dev) ||
+		if (bch_extent_has_device(e.c, ca->dev_idx) ||
 		    !journal_dev_buckets_available(j, ca) ||
 		    sectors > ca->mi.bucket_size)
 			continue;
 
 		ja->sectors_free = ca->mi.bucket_size - sectors;
-		ja->cur_idx = (ja->cur_idx + 1) % nr_buckets;
+		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
 		ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
 
 		extent_ptr_append(bkey_i_to_extent(&j->key),
 			(struct bch_extent_ptr) {
 				  .offset = bucket_to_sector(ca,
-					journal_bucket(ca->disk_sb.sb,
-						       ja->cur_idx)),
-				  .dev = ca->sb.nr_this_dev,
+					ja->buckets[ja->cur_idx]),
+				  .dev = ca->dev_idx,
 			});
 
 		replicas++;
@@ -1928,10 +1978,7 @@ static void journal_write_compact(struct jset *jset)
 	 * If we wanted to be really fancy here, we could sort all the keys in
 	 * the jset and drop keys that were overwritten - probably not worth it:
 	 */
-	for (i = jset->start;
-	     i < (struct jset_entry *) bkey_idx(jset, le32_to_cpu(jset->u64s)) &&
-	     (next = jset_keys_next(i), true);
-	     i = next) {
+	vstruct_for_each_safe(jset, i, next) {
 		unsigned u64s = le16_to_cpu(i->u64s);
 
 		/* Empty entry: */
@@ -1945,7 +1992,7 @@ static void journal_write_compact(struct jset *jset)
 		    JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
 		    JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
 		    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
-			memmove_u64s_down(jset_keys_next(prev),
+			memmove_u64s_down(vstruct_next(prev),
 					  i->_data,
 					  u64s);
 			le16_add_cpu(&prev->u64s, u64s);
@@ -1953,12 +2000,12 @@ static void journal_write_compact(struct jset *jset)
 		}
 
 		/* Couldn't merge, move i into new position (after prev): */
-		prev = prev ? jset_keys_next(prev) : jset->start;
+		prev = prev ? vstruct_next(prev) : jset->start;
 		if (i != prev)
 			memmove_u64s_down(prev, i, jset_u64s(u64s));
 	}
 
-	prev = prev ? jset_keys_next(prev) : jset->start;
+	prev = prev ? vstruct_next(prev) : jset->start;
 	jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
 }
 
@@ -2019,6 +2066,7 @@ static void journal_write(struct closure *cl)
 	struct cache_set *c = container_of(j, struct cache_set, journal);
 	struct cache *ca;
 	struct journal_buf *w = journal_prev_buf(j);
+	struct jset *jset = w->data;
 	struct bio *bio;
 	struct bch_extent_ptr *ptr;
 	unsigned i, sectors, bytes;
@@ -2036,24 +2084,27 @@ static void journal_write(struct closure *cl)
 	}
 	mutex_unlock(&c->btree_root_lock);
 
-	journal_write_compact(w->data);
+	journal_write_compact(jset);
+
+	jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
+	jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
+	jset->magic = cpu_to_le64(jset_magic(c));
+	jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
 
-	w->data->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
-	w->data->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
-	w->data->magic = cpu_to_le64(jset_magic(&c->disk_sb));
-	w->data->version = cpu_to_le32(BCACHE_JSET_VERSION);
+	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+	SET_JSET_CSUM_TYPE(jset, bch_meta_checksum_type(c));
 
-	SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN);
-	SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum);
-	w->data->csum = cpu_to_le64(__csum_set(w->data,
-					       le32_to_cpu(w->data->u64s),
-					       JSET_CSUM_TYPE(w->data)));
+	bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+		    jset->encrypted_start,
+		    vstruct_end(jset) - (void *) jset->encrypted_start);
 
-	sectors = __set_blocks(w->data, le32_to_cpu(w->data->u64s),
-			       block_bytes(c)) * c->sb.block_size;
+	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+				  journal_nonce(jset), jset);
+
+	sectors = vstruct_sectors(jset, c->block_bits);
 	BUG_ON(sectors > j->prev_buf_sectors);
 
-	bytes = __set_bytes(w->data, le32_to_cpu(w->data->u64s));
+	bytes = vstruct_bytes(w->data);
 	memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
 
 	if (journal_write_alloc(j, sectors)) {
@@ -2096,7 +2147,7 @@ static void journal_write(struct closure *cl)
 		bio->bi_private	= ca;
 		bio_set_op_attrs(bio, REQ_OP_WRITE,
 				 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
-		bch_bio_map(bio, w->data);
+		bch_bio_map(bio, jset);
 
 		trace_bcache_journal_write(bio);
 		closure_bio_submit_punt(bio, cl, c);
@@ -2105,7 +2156,7 @@ static void journal_write(struct closure *cl)
 	}
 
 	for_each_cache(ca, c, i)
-		if (ca->mi.state == CACHE_ACTIVE &&
+		if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
 		    journal_flushes_device(ca) &&
 		    !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
 			percpu_ref_get(&ca->ref);
@@ -2503,7 +2554,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
 			 "\tnr\t\t%u\n"
 			 "\tcur_idx\t\t%u (seq %llu)\n"
 			 "\tlast_idx\t%u (seq %llu)\n",
-			 iter, bch_nr_journal_buckets(ca->disk_sb.sb),
+			 iter, ja->nr,
 			 ja->cur_idx,	ja->bucket_seq[ja->cur_idx],
 			 ja->last_idx,	ja->bucket_seq[ja->last_idx]);
 	}
@@ -2521,7 +2572,7 @@ static bool bch_journal_writing_to_device(struct cache *ca)
 
 	spin_lock(&j->lock);
 	ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key),
-				    ca->sb.nr_this_dev);
+				    ca->dev_idx);
 	spin_unlock(&j->lock);
 
 	return ret;
@@ -2541,10 +2592,11 @@ static bool bch_journal_writing_to_device(struct cache *ca)
 
 int bch_journal_move(struct cache *ca)
 {
-	unsigned i, nr_buckets;
 	u64 last_flushed_seq;
+	struct journal_device *ja = &ca->journal;
 	struct cache_set *c = ca->set;
 	struct journal *j = &c->journal;
+	unsigned i;
 	int ret = 0;		/* Success */
 
 	if (bch_journal_writing_to_device(ca)) {
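
Note on the write path in journal_write() above: the payload past encrypted_start is encrypted first and the checksum then taken over the whole jset, mirroring the read side, which verifies before decrypting; the entry is also zero-padded out to a whole number of blocks before issue. A sketch of the size arithmetic, assuming vstruct_sectors() rounds the byte size up to the block size and that c->block_bits is log2 of the block size in 512-byte sectors:

	/* illustrative only */
	static unsigned vstruct_sectors_sketch(size_t bytes, unsigned block_bits)
	{
		/* blocks, rounded up ... */
		size_t blocks = DIV_ROUND_UP(bytes, (size_t) 512 << block_bits);

		/* ... expressed in 512-byte sectors */
		return blocks << block_bits;
	}
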
@@ -2585,10 +2637,45 @@ int bch_journal_move(struct cache *ca)
 	last_flushed_seq = last_seq(j);
 	spin_unlock(&j->lock);
 
-	nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
-
-	for (i = 0; i < nr_buckets; i += 1)
-		BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq);
+	for (i = 0; i < ja->nr; i += 1)
+		BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
 
 	return ret;
 }
+
+void bch_journal_free_cache(struct cache *ca)
+{
+	kfree(ca->journal.buckets);
+	kfree(ca->journal.bucket_seq);
+}
+
+int bch_journal_init_cache(struct cache *ca)
+{
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal *journal_buckets =
+		bch_sb_get_journal(ca->disk_sb.sb);
+	unsigned i, journal_entry_pages;
+
+	journal_entry_pages =
+		DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
+			     PAGE_SECTORS);
+
+	ja->nr = bch_nr_journal_buckets(journal_buckets);
+
+	ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+	if (!ja->bucket_seq)
+		return -ENOMEM;
+
+	ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
+	if (!ca->journal.bio)
+		return -ENOMEM;
+
+	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+	if (!ja->buckets)
+		return -ENOMEM;
+
+	for (i = 0; i < ja->nr; i++)
+		ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+
+	return 0;
+}
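
Note: bch_journal_init_cache() sizes ja->nr via bch_nr_journal_buckets(). A hedged sketch of what that helper is assumed to compute - the superblock journal field is a small header followed by one __le64 per bucket, so the count falls out of the field's u64s:

	static unsigned nr_journal_buckets_sketch(struct bch_sb_field_journal *j)
	{
		/* field.u64s counts the header too; subtract it off */
		return j
			? (le32_to_cpu(j->field.u64s) -
			   sizeof(*j) / sizeof(u64))
			: 0;
	}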