Diffstat (limited to 'libbcache/journal.c')
-rw-r--r--  libbcache/journal.c | 583
1 file changed, 335 insertions(+), 248 deletions(-)
diff --git a/libbcache/journal.c b/libbcache/journal.c
index 9e09b86..3bb9e3c 100644
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -18,7 +18,8 @@
#include "io.h"
#include "keylist.h"
#include "journal.h"
-#include "super.h"
+#include "super-io.h"
+#include "vstructs.h"
#include <trace/events/bcache.h>
@@ -52,19 +53,14 @@ static inline u64 journal_pin_seq(struct journal *j,
return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
}
-#define for_each_jset_entry(entry, jset) \
- for (entry = (jset)->start; \
- entry < bkey_idx(jset, le32_to_cpu((jset)->u64s)); \
- entry = jset_keys_next(entry))
-
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
- while (entry < bkey_idx(jset, le32_to_cpu(jset->u64s))) {
+ while (entry < vstruct_last(jset)) {
if (JOURNAL_ENTRY_TYPE(entry) == type)
return entry;
- entry = jset_keys_next(entry);
+ entry = vstruct_next(entry);
}
return NULL;
@@ -73,14 +69,11 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
- entry = jset_keys_next(entry))
+ entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
- for (k = (entry)->start; \
- (k < bkey_idx(entry, le16_to_cpu((entry)->u64s)) &&\
- (_n = bkey_next(k), 1)); \
- k = _n)
+ vstruct_for_each_safe(entry, k, _n)
static inline void bch_journal_add_entry(struct journal_buf *buf,
const void *data, size_t u64s,
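The hunks above replace the journal's open-coded jset iteration with generic helpers from the new vstructs.h header. That header is not part of this diff; as a rough sketch, assuming the usual bcache layout where a variable-length structure's u64s field counts the 64-bit words in its trailing _data[] array, the helpers behave roughly like:

/* Illustrative only - the real macros in vstructs.h dispatch on the
 * width of the u64s field (__le16 for jset_entry, __le32 for jset): */
#define vstruct_last(_s)	((void *) ((_s)->_data + le32_to_cpu((_s)->u64s)))
#define vstruct_next(_e)	((typeof(_e)) ((_e)->_data + le16_to_cpu((_e)->u64s)))

#define vstruct_for_each(_s, _i)					\
	for ((_i) = (_s)->start;					\
	     (void *) (_i) < vstruct_last(_s);				\
	     (_i) = vstruct_next(_i))

#define vstruct_for_each_safe(_s, _i, _t)				\
	for ((_i) = (_s)->start;					\
	     (void *) (_i) < vstruct_last(_s) &&			\
	     ((_t) = vstruct_next(_i), true);				\
	     (_i) = (_t))

The _safe variant caches the next entry before the loop body runs, so the body may shrink or null out the current entry without breaking iteration.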
@@ -199,8 +192,6 @@ redo_peek:
closure_sync(&cl);
- mutex_lock(&c->btree_interior_update_lock);
-
for (i = 0;; i++) {
struct btree_interior_update *as;
struct pending_btree_node_free *d;
@@ -212,6 +203,8 @@ redo_peek:
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
+redo_wait:
+ mutex_lock(&c->btree_interior_update_lock);
/*
* Is the node on the list of pending interior node updates -
@@ -225,11 +218,11 @@ redo_peek:
closure_wait(&as->wait, &cl);
mutex_unlock(&c->btree_interior_update_lock);
closure_sync(&cl);
- break;
+ goto redo_wait;
}
- }
- mutex_unlock(&c->btree_interior_update_lock);
+ mutex_unlock(&c->btree_interior_update_lock);
+ }
mutex_lock(&j->blacklist_lock);
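The change above turns a single wait into a retry loop: the interior-update lock is now taken fresh on each pass at the redo_wait label, and after sleeping on the update's closure the code re-takes the lock and re-scans the pending-update list for the same node, since another interior update may have picked the node up while we slept. A sketch of the shape this now follows (not the full function):

redo_wait:
	mutex_lock(&c->btree_interior_update_lock);
	/* scan pending interior-node updates for this node... */
	if (found) {
		closure_wait(&as->wait, &cl);
		mutex_unlock(&c->btree_interior_update_lock);
		closure_sync(&cl);		/* sleep with the lock dropped */
		goto redo_wait;			/* re-check from scratch */
	}
	mutex_unlock(&c->btree_interior_update_lock);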
@@ -377,7 +370,6 @@ out:
struct journal_list {
struct closure cl;
struct mutex lock;
- struct mutex cache_set_buffer_lock;
struct list_head *head;
int ret;
};
@@ -394,7 +386,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
{
struct journal_replay *i, *pos;
struct list_head *where;
- size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+ size_t bytes = vstruct_bytes(j);
__le64 last_seq;
int ret;
@@ -422,8 +414,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
list_for_each_entry_reverse(i, jlist->head, list) {
/* Duplicate? */
if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
- fsck_err_on(bytes != __set_bytes(&i->j,
- le32_to_cpu(i->j.u64s)) ||
+ fsck_err_on(bytes != vstruct_bytes(&i->j) ||
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
@@ -455,11 +446,21 @@ fsck_err:
return ret;
}
+static struct nonce journal_nonce(const struct jset *jset)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = ((__le32 *) &jset->seq)[0],
+ [2] = ((__le32 *) &jset->seq)[1],
+ [3] = BCH_NONCE_JOURNAL,
+ }};
+}
+
static void journal_entry_null_range(void *start, void *end)
{
struct jset_entry *entry;
- for (entry = start; entry != end; entry = jset_keys_next(entry)) {
+ for (entry = start; entry != end; entry = vstruct_next(entry)) {
entry->u64s = 0;
entry->btree_id = 0;
entry->level = 0;
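journal_nonce(), added above, derives the encryption/MAC nonce from the jset's 64-bit sequence number, with BCH_NONCE_JOURNAL in the last word as a domain separator so journal nonces can never collide with nonces used for other metadata under the same key. The initializer implies struct nonce is four little-endian 32-bit words; its definition is outside this diff, but presumably:

/* assumed definition (not in this diff): a 128-bit nonce, as four
 * __le32 words matching the four initializers in journal_nonce() */
struct nonce {
	__le32 d[4];
};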
@@ -473,7 +474,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
struct bkey_i *k, enum bkey_type key_type,
const char *type)
{
- void *next = jset_keys_next(entry);
+ void *next = vstruct_next(entry);
const char *invalid;
char buf[160];
int ret = 0;
@@ -481,16 +482,16 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
if (mustfix_fsck_err_on(!k->k.u64s, c,
"invalid %s in journal: k->u64s 0", type)) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(jset_keys_next(entry), next);
+ journal_entry_null_range(vstruct_next(entry), next);
return 0;
}
if (mustfix_fsck_err_on((void *) bkey_next(k) >
- (void *) jset_keys_next(entry), c,
+ (void *) vstruct_next(entry), c,
"invalid %s in journal: extends past end of journal entry",
type)) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(jset_keys_next(entry), next);
+ journal_entry_null_range(vstruct_next(entry), next);
return 0;
}
@@ -499,7 +500,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
type, k->k.format)) {
le16_add_cpu(&entry->u64s, -k->k.u64s);
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(jset_keys_next(entry), next);
+ journal_entry_null_range(vstruct_next(entry), next);
return 0;
}
@@ -514,7 +515,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
le16_add_cpu(&entry->u64s, -k->k.u64s);
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(jset_keys_next(entry), next);
+ journal_entry_null_range(vstruct_next(entry), next);
return 0;
}
fsck_err:
@@ -525,16 +526,17 @@ fsck_err:
#define JOURNAL_ENTRY_NONE 6
#define JOURNAL_ENTRY_BAD 7
-static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 sector,
+static int journal_entry_validate(struct cache_set *c,
+ struct jset *j, u64 sector,
unsigned bucket_sectors_left,
unsigned sectors_read)
{
struct jset_entry *entry;
- size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
- u64 got, expect;
+ size_t bytes = vstruct_bytes(j);
+ struct bch_csum csum;
int ret = 0;
- if (le64_to_cpu(j->magic) != jset_magic(&c->disk_sb))
+ if (le64_to_cpu(j->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
@@ -554,25 +556,32 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
if (bytes > sectors_read << 9)
return JOURNAL_ENTRY_REREAD;
- got = le64_to_cpu(j->csum);
- expect = __csum_set(j, le32_to_cpu(j->u64s), JSET_CSUM_TYPE(j));
- if (mustfix_fsck_err_on(got != expect, c,
- "journal checksum bad (got %llu expect %llu), sector %lluu",
- got, expect, sector)) {
+ if (fsck_err_on(!bch_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+ "journal entry with unknown csum type %llu sector %lluu",
+ JSET_CSUM_TYPE(j), sector))
+ return JOURNAL_ENTRY_BAD;
+
+ csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+ if (mustfix_fsck_err_on(bch_crc_cmp(csum, j->csum), c,
+ "journal checksum bad, sector %llu", sector)) {
/* XXX: retry IO, when we start retrying checksum errors */
/* XXX: note we might have missing journal entries */
return JOURNAL_ENTRY_BAD;
}
- if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq),
- c, "invalid journal entry: last_seq > seq"))
+ bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+ j->encrypted_start,
+ vstruct_end(j) - (void *) j->encrypted_start);
+
+ if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+ "invalid journal entry: last_seq > seq"))
j->last_seq = j->seq;
- for_each_jset_entry(entry, j) {
+ vstruct_for_each(j, entry) {
struct bkey_i *k;
- if (mustfix_fsck_err_on(jset_keys_next(entry) >
- bkey_idx(j, le32_to_cpu(j->u64s)), c,
+ if (mustfix_fsck_err_on(vstruct_next(entry) >
+ vstruct_last(j), c,
"journal entry extents past end of jset")) {
j->u64s = cpu_to_le32((u64 *) entry - j->_data);
break;
@@ -580,9 +589,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
switch (JOURNAL_ENTRY_TYPE(entry)) {
case JOURNAL_ENTRY_BTREE_KEYS:
- for (k = entry->start;
- k < bkey_idx(entry, le16_to_cpu(entry->u64s));
- k = bkey_next(k)) {
+ vstruct_for_each(entry, k) {
ret = journal_validate_key(c, j, entry, k,
bkey_type(entry->level,
entry->btree_id),
@@ -599,7 +606,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
le16_to_cpu(entry->u64s) != k->k.u64s, c,
"invalid btree root journal entry: wrong number of keys")) {
journal_entry_null_range(entry,
- jset_keys_next(entry));
+ vstruct_next(entry));
continue;
}
@@ -616,14 +623,14 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry,
- jset_keys_next(entry));
+ vstruct_next(entry));
}
break;
default:
mustfix_fsck_err(c, "invalid journal entry type %llu",
JOURNAL_ENTRY_TYPE(entry));
- journal_entry_null_range(entry, jset_keys_next(entry));
+ journal_entry_null_range(entry, vstruct_next(entry));
break;
}
}
@@ -632,126 +639,127 @@ fsck_err:
return ret;
}
-static int journal_read_bucket(struct cache *ca, struct journal_list *jlist,
+struct journal_read_buf {
+ void *data;
+ size_t size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+ size_t new_size)
+{
+ void *n;
+
+ new_size = roundup_pow_of_two(new_size);
+ n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
+ if (!n)
+ return -ENOMEM;
+
+ free_pages((unsigned long) b->data, get_order(b->size));
+ b->data = n;
+ b->size = new_size;
+ return 0;
+}
+
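journal_read_buf_realloc() rounds the request up to a power of two and replaces the old pages outright rather than copying; that is fine here because the only caller grows the buffer in response to JOURNAL_ENTRY_REREAD and then re-reads the entry from disk into it. A minimal usage sketch:

	struct journal_read_buf buf = { NULL, 0 };
	int ret;

	/* initial allocation - "freeing" the NULL buffer inside the
	 * helper is safe, free_pages() ignores address 0 */
	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);

	/* on JOURNAL_ENTRY_REREAD: grow to fit the entry, then reissue
	 * the read - nothing is copied, the entry is reread from disk */
	if (!ret && vstruct_bytes(j) > buf.size)
		ret = journal_read_buf_realloc(&buf, vstruct_bytes(j));

	free_pages((unsigned long) buf.data, get_order(buf.size));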
+static int journal_read_bucket(struct cache *ca,
+ struct journal_read_buf *buf,
+ struct journal_list *jlist,
unsigned bucket, u64 *seq, bool *entries_found)
{
struct cache_set *c = ca->set;
struct journal_device *ja = &ca->journal;
struct bio *bio = ja->bio;
- struct jset *j, *data;
- unsigned blocks, sectors_read, bucket_offset = 0;
- unsigned max_entry_sectors = c->journal.entry_size_max >> 9;
- u64 sector = bucket_to_sector(ca,
- journal_bucket(ca->disk_sb.sb, bucket));
+ struct jset *j = NULL;
+ unsigned sectors, sectors_read = 0;
+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+ end = offset + ca->mi.bucket_size;
bool saw_bad = false;
int ret = 0;
- data = (void *) __get_free_pages(GFP_KERNEL,
- get_order(c->journal.entry_size_max));
- if (!data) {
- mutex_lock(&jlist->cache_set_buffer_lock);
- data = c->journal.buf[0].data;
- }
-
pr_debug("reading %u", bucket);
- while (bucket_offset < ca->mi.bucket_size) {
-reread:
- sectors_read = min_t(unsigned,
- ca->mi.bucket_size - bucket_offset,
- max_entry_sectors);
+ while (offset < end) {
+ if (!sectors_read) {
+reread: sectors_read = min_t(unsigned,
+ end - offset, buf->size >> 9);
- bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
- bio->bi_iter.bi_sector = sector + bucket_offset;
- bio->bi_iter.bi_size = sectors_read << 9;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bch_bio_map(bio, data);
-
- ret = submit_bio_wait(bio);
-
- if (cache_fatal_io_err_on(ret, ca,
- "journal read from sector %llu",
- sector + bucket_offset) ||
- bch_meta_read_fault("journal")) {
- ret = -EIO;
- goto err;
- }
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = offset;
+ bio->bi_iter.bi_size = sectors_read << 9;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bch_bio_map(bio, buf->data);
- /* This function could be simpler now since we no longer write
- * journal entries that overlap bucket boundaries; this means
- * the start of a bucket will always have a valid journal entry
- * if it has any journal entries at all.
- */
+ ret = submit_bio_wait(bio);
- j = data;
- while (sectors_read) {
- ret = journal_entry_validate(c, j,
- sector + bucket_offset,
- ca->mi.bucket_size - bucket_offset,
- sectors_read);
- switch (ret) {
- case BCH_FSCK_OK:
- break;
- case JOURNAL_ENTRY_REREAD:
- goto reread;
- case JOURNAL_ENTRY_NONE:
- if (!saw_bad)
- goto out;
- blocks = 1;
- goto next_block;
- case JOURNAL_ENTRY_BAD:
- saw_bad = true;
- blocks = 1;
- goto next_block;
- default:
- goto err;
- }
+ if (cache_fatal_io_err_on(ret, ca,
+ "journal read from sector %llu",
+ offset) ||
+ bch_meta_read_fault("journal"))
+ return -EIO;
- /*
- * This happens sometimes if we don't have discards on -
- * when we've partially overwritten a bucket with new
- * journal entries. We don't need the rest of the
- * bucket:
- */
- if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- goto out;
-
- ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
- ret = journal_entry_add(c, jlist, j);
- switch (ret) {
- case JOURNAL_ENTRY_ADD_OK:
- *entries_found = true;
- break;
- case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
- break;
- default:
- goto err;
+ j = buf->data;
+ }
+
+ ret = journal_entry_validate(c, j, offset,
+ end - offset, sectors_read);
+ switch (ret) {
+ case BCH_FSCK_OK:
+ break;
+ case JOURNAL_ENTRY_REREAD:
+ if (vstruct_bytes(j) > buf->size) {
+ ret = journal_read_buf_realloc(buf,
+ vstruct_bytes(j));
+ if (ret)
+ return ret;
}
+ goto reread;
+ case JOURNAL_ENTRY_NONE:
+ if (!saw_bad)
+ return 0;
+ sectors = c->sb.block_size;
+ goto next_block;
+ case JOURNAL_ENTRY_BAD:
+ saw_bad = true;
+ sectors = c->sb.block_size;
+ goto next_block;
+ default:
+ return ret;
+ }
- if (le64_to_cpu(j->seq) > *seq)
- *seq = le64_to_cpu(j->seq);
-next_block:
- blocks = __set_blocks(j, le32_to_cpu(j->u64s),
- block_bytes(c));
+ /*
+ * This happens sometimes if we don't have discards on -
+ * when we've partially overwritten a bucket with new
+ * journal entries. We don't need the rest of the
+ * bucket:
+ */
+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+ return 0;
+
+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- pr_debug("next");
- bucket_offset += blocks * c->sb.block_size;
- sectors_read -= blocks * c->sb.block_size;
- j = ((void *) j) + blocks * block_bytes(c);
+ ret = journal_entry_add(c, jlist, j);
+ switch (ret) {
+ case JOURNAL_ENTRY_ADD_OK:
+ *entries_found = true;
+ break;
+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+ break;
+ default:
+ return ret;
}
+
+ if (le64_to_cpu(j->seq) > *seq)
+ *seq = le64_to_cpu(j->seq);
+
+ sectors = vstruct_sectors(j, c->block_bits);
+next_block:
+ pr_debug("next");
+ offset += sectors;
+ sectors_read -= sectors;
+ j = ((void *) j) + (sectors << 9);
}
-out:
- ret = 0;
-err:
- if (data == c->journal.buf[0].data)
- mutex_unlock(&jlist->cache_set_buffer_lock);
- else
- free_pages((unsigned long) data,
- get_order(c->journal.entry_size_max));
- return ret;
+ return 0;
}
static void bch_journal_read_device(struct closure *cl)
@@ -759,15 +767,11 @@ static void bch_journal_read_device(struct closure *cl)
#define read_bucket(b) \
({ \
bool entries_found = false; \
- int ret = journal_read_bucket(ca, jlist, b, \
- &seq, &entries_found); \
+ ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
+ &entries_found); \
+ if (ret) \
+ goto err; \
__set_bit(b, bitmap); \
- if (ret) { \
- mutex_lock(&jlist->lock); \
- jlist->ret = ret; \
- mutex_unlock(&jlist->lock); \
- closure_return(cl); \
- } \
entries_found; \
})
@@ -777,24 +781,29 @@ static void bch_journal_read_device(struct closure *cl)
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+ struct journal_read_buf buf = { NULL, 0 };
- unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
- DECLARE_BITMAP(bitmap, nr_buckets);
+ DECLARE_BITMAP(bitmap, ja->nr);
unsigned i, l, r;
u64 seq = 0;
+ int ret;
- if (!nr_buckets)
- closure_return(cl);
+ if (!ja->nr)
+ goto out;
+
+ bitmap_zero(bitmap, ja->nr);
+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+ if (ret)
+ goto err;
- bitmap_zero(bitmap, nr_buckets);
- pr_debug("%u journal buckets", nr_buckets);
+ pr_debug("%u journal buckets", ja->nr);
/*
* If the device supports discard but not secure discard, we can't do
* the fancy fibonacci hash/binary search because the live journal
* entries might not form a contiguous range:
*/
- for (i = 0; i < nr_buckets; i++)
+ for (i = 0; i < ja->nr; i++)
read_bucket(i);
goto search_done;
@@ -805,8 +814,8 @@ static void bch_journal_read_device(struct closure *cl)
* Read journal buckets ordered by golden ratio hash to quickly
* find a sequence of buckets with valid journal entries
*/
- for (i = 0; i < nr_buckets; i++) {
- l = (i * 2654435769U) % nr_buckets;
+ for (i = 0; i < ja->nr; i++) {
+ l = (i * 2654435769U) % ja->nr;
if (test_bit(l, bitmap))
break;
@@ -821,18 +830,18 @@ static void bch_journal_read_device(struct closure *cl)
*/
pr_debug("falling back to linear search");
linear_scan:
- for (l = find_first_zero_bit(bitmap, nr_buckets);
- l < nr_buckets;
- l = find_next_zero_bit(bitmap, nr_buckets, l + 1))
+ for (l = find_first_zero_bit(bitmap, ja->nr);
+ l < ja->nr;
+ l = find_next_zero_bit(bitmap, ja->nr, l + 1))
if (read_bucket(l))
goto bsearch;
/* no journal entries on this device? */
- if (l == nr_buckets)
- closure_return(cl);
+ if (l == ja->nr)
+ goto out;
bsearch:
/* Binary search */
- r = find_next_bit(bitmap, nr_buckets, l + 1);
+ r = find_next_bit(bitmap, ja->nr, l + 1);
pr_debug("starting binary search, l %u r %u", l, r);
while (l + 1 < r) {
@@ -858,9 +867,9 @@ search_done:
*/
seq = 0;
- for (i = 0; i < nr_buckets; i++)
+ for (i = 0; i < ja->nr; i++)
if (ja->bucket_seq[i] >= seq &&
- ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % nr_buckets]) {
+ ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
/*
* When journal_next_bucket() goes to allocate for
* the first time, it'll use the bucket after
@@ -875,20 +884,26 @@ search_done:
* reclaimed - journal reclaim will immediately reclaim whatever isn't
* pinned when it first runs:
*/
- ja->last_idx = (ja->cur_idx + 1) % nr_buckets;
+ ja->last_idx = (ja->cur_idx + 1) % ja->nr;
/*
* Read buckets in reverse order until we stop finding more journal
* entries:
*/
- for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets;
+ for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
i != ja->cur_idx;
- i = (i + nr_buckets - 1) % nr_buckets)
+ i = (i + ja->nr - 1) % ja->nr)
if (!test_bit(i, bitmap) &&
!read_bucket(i))
break;
-
+out:
+ free_pages((unsigned long) buf.data, get_order(buf.size));
closure_return(cl);
+err:
+ mutex_lock(&jlist->lock);
+ jlist->ret = ret;
+ mutex_unlock(&jlist->lock);
+ goto out;
#undef read_bucket
}
@@ -930,6 +945,19 @@ static int journal_seq_blacklist_read(struct journal *j,
return 0;
}
+static inline bool journal_has_keys(struct list_head *list)
+{
+ struct journal_replay *i;
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
+
+ list_for_each_entry(i, list, list)
+ for_each_jset_key(k, _n, entry, &i->j)
+ return true;
+
+ return false;
+}
+
int bch_journal_read(struct cache_set *c, struct list_head *list)
{
struct jset_entry *prio_ptrs;
@@ -944,7 +972,6 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
- mutex_init(&jlist.cache_set_buffer_lock);
jlist.head = list;
jlist.ret = 0;
@@ -964,6 +991,9 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
return BCH_FSCK_REPAIR_IMPOSSIBLE;
}
+ fsck_err_on(c->sb.clean && journal_has_keys(list), c,
+ "filesystem marked clean but journal has keys to replay");
+
j = &list_entry(list->prev, struct journal_replay, list)->j;
unfixable_fsck_err_on(le64_to_cpu(j->seq) -
@@ -1057,7 +1087,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
if (btree_type_has_ptrs(type))
- __bch_btree_mark_key(c, type, k_s_c);
+ bch_btree_mark_key_initial(c, type, k_s_c);
}
}
@@ -1171,10 +1201,9 @@ static enum {
buf->data->last_seq = cpu_to_le64(last_seq(j));
j->prev_buf_sectors =
- __set_blocks(buf->data,
- le32_to_cpu(buf->data->u64s) +
- journal_entry_u64s_reserve(buf),
- block_bytes(c)) * c->sb.block_size;
+ vstruct_blocks_plus(buf->data, c->block_bits,
+ journal_entry_u64s_reserve(buf)) *
+ c->sb.block_size;
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
@@ -1219,9 +1248,8 @@ static unsigned journal_dev_buckets_available(struct journal *j,
struct cache *ca)
{
struct journal_device *ja = &ca->journal;
- unsigned nr = bch_nr_journal_buckets(ca->disk_sb.sb);
- unsigned next = (ja->cur_idx + 1) % nr;
- unsigned available = (ja->last_idx + nr - next) % nr;
+ unsigned next = (ja->cur_idx + 1) % ja->nr;
+ unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
/*
* Hack to avoid a deadlock during journal replay:
@@ -1271,7 +1299,7 @@ static int journal_entry_sectors(struct journal *j)
* for the previous entry we have to make sure we have space for
* it too:
*/
- if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) {
+ if (bch_extent_has_device(e.c, ca->dev_idx)) {
if (j->prev_buf_sectors > ca->journal.sectors_free)
buckets_required++;
@@ -1479,17 +1507,28 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list)
entries++;
}
+ if (keys) {
+ bch_btree_flush(c);
+
+ /*
+ * Write a new journal entry _before_ we start journalling new data -
+ * otherwise, we could end up with btree node bsets with journal seqs
+ * arbitrarily far in the future vs. the most recently written journal
+ * entry on disk, if we crash before writing the next journal entry:
+ */
+ ret = bch_journal_meta(&c->journal);
+ if (ret)
+ goto err;
+ }
+
bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
keys, entries, (u64) atomic64_read(&j->seq));
- fsck_err_on(c->sb.clean && keys, c,
- "filesystem marked clean, but journal had keys to replay");
-
bch_journal_set_replay_done(&c->journal);
err:
if (ret)
bch_err(c, "journal replay error: %d", ret);
-fsck_err:
+
bch_journal_entries_free(list);
return ret;
@@ -1497,28 +1536,40 @@ fsck_err:
static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
{
- unsigned u64s = bch_journal_buckets_offset(ca->disk_sb.sb) + nr;
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets =
+ bch_sb_get_journal(ca->disk_sb.sb);
+ struct bch_sb_field *f;
u64 *p;
- int ret;
- ret = bch_super_realloc(&ca->disk_sb, u64s);
- if (ret)
- return ret;
+ p = krealloc(ja->bucket_seq, nr * sizeof(u64),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!p)
+ return -ENOMEM;
+
+ ja->bucket_seq = p;
- p = krealloc(ca->journal.bucket_seq,
- nr * sizeof(u64),
+ p = krealloc(ja->buckets, nr * sizeof(u64),
GFP_KERNEL|__GFP_ZERO);
if (!p)
return -ENOMEM;
- ca->journal.bucket_seq = p;
- ca->disk_sb.sb->u64s = cpu_to_le16(u64s);
+ ja->buckets = p;
+
+ f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
+ sizeof(*journal_buckets) / sizeof(u64));
+ if (!f)
+ return -ENOMEM;
+ f->type = BCH_SB_FIELD_journal;
+ ja->nr = nr;
return 0;
}
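This rewrite moves the journal bucket list out of fixed superblock members into a variable-length superblock field, resized via bch_dev_sb_field_resize(). The size passed above, nr + sizeof(*journal_buckets) / sizeof(u64), implies a layout along these lines (the actual definition lives in the superblock headers, not this diff):

struct bch_sb_field_journal {
	struct bch_sb_field	field;		/* generic header: u64s, type */
	__le64			buckets[];	/* journal bucket numbers, ja->nr of them */
};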
int bch_cache_journal_alloc(struct cache *ca)
{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets;
int ret;
unsigned i;
@@ -1540,11 +1591,15 @@ int bch_cache_journal_alloc(struct cache *ca)
if (ret)
return ret;
- for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) {
- unsigned long r = ca->mi.first_bucket + i;
+ journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
+
+ for (i = 0; i < ja->nr; i++) {
+ u64 bucket = ca->mi.first_bucket + i;
- bch_mark_metadata_bucket(ca, &ca->buckets[r], true);
- set_journal_bucket(ca->disk_sb.sb, i, r);
+ ja->buckets[i] = bucket;
+ journal_buckets->buckets[i] = cpu_to_le64(bucket);
+
+ bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
}
return 0;
@@ -1749,7 +1804,7 @@ static void journal_reclaim_work(struct work_struct *work)
struct cache *ca;
struct journal_entry_pin *pin;
u64 seq_to_flush = 0;
- unsigned iter, nr, bucket_to_flush;
+ unsigned iter, bucket_to_flush;
unsigned long next_flush;
bool reclaim_lock_held = false, need_flush;
@@ -1781,13 +1836,11 @@ static void journal_reclaim_work(struct work_struct *work)
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
- journal_bucket(ca->disk_sb.sb,
- ja->last_idx)),
+ ja->buckets[ja->last_idx]),
ca->mi.bucket_size, GFP_NOIO, 0);
spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) %
- bch_nr_journal_buckets(ca->disk_sb.sb);
+ ja->last_idx = (ja->last_idx + 1) % ja->nr;
spin_unlock(&j->lock);
wake_up(&j->wait);
@@ -1798,8 +1851,7 @@ static void journal_reclaim_work(struct work_struct *work)
* buckets
*/
spin_lock(&j->lock);
- nr = bch_nr_journal_buckets(ca->disk_sb.sb),
- bucket_to_flush = (ja->cur_idx + (nr >> 1)) % nr;
+ bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
spin_unlock(&j->lock);
@@ -1861,7 +1913,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
*/
extent_for_each_ptr_backwards(e, ptr)
if (!(ca = PTR_CACHE(c, ptr)) ||
- ca->mi.state != CACHE_ACTIVE ||
+ ca->mi.state != BCH_MEMBER_STATE_ACTIVE ||
ca->journal.sectors_free <= sectors)
__bch_extent_drop_ptr(e, ptr);
else
@@ -1875,7 +1927,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
*/
group_for_each_cache_rcu(ca, &j->devs, iter) {
struct journal_device *ja = &ca->journal;
- unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
if (replicas >= replicas_want)
break;
@@ -1884,21 +1935,20 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* Check that we can use this device, and aren't already using
* it:
*/
- if (bch_extent_has_device(e.c, ca->sb.nr_this_dev) ||
+ if (bch_extent_has_device(e.c, ca->dev_idx) ||
!journal_dev_buckets_available(j, ca) ||
sectors > ca->mi.bucket_size)
continue;
ja->sectors_free = ca->mi.bucket_size - sectors;
- ja->cur_idx = (ja->cur_idx + 1) % nr_buckets;
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
extent_ptr_append(bkey_i_to_extent(&j->key),
(struct bch_extent_ptr) {
.offset = bucket_to_sector(ca,
- journal_bucket(ca->disk_sb.sb,
- ja->cur_idx)),
- .dev = ca->sb.nr_this_dev,
+ ja->buckets[ja->cur_idx]),
+ .dev = ca->dev_idx,
});
replicas++;
@@ -1928,10 +1978,7 @@ static void journal_write_compact(struct jset *jset)
* If we wanted to be really fancy here, we could sort all the keys in
* the jset and drop keys that were overwritten - probably not worth it:
*/
- for (i = jset->start;
- i < (struct jset_entry *) bkey_idx(jset, le32_to_cpu(jset->u64s)) &&
- (next = jset_keys_next(i), true);
- i = next) {
+ vstruct_for_each_safe(jset, i, next) {
unsigned u64s = le16_to_cpu(i->u64s);
/* Empty entry: */
@@ -1945,7 +1992,7 @@ static void journal_write_compact(struct jset *jset)
JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(jset_keys_next(prev),
+ memmove_u64s_down(vstruct_next(prev),
i->_data,
u64s);
le16_add_cpu(&prev->u64s, u64s);
@@ -1953,12 +2000,12 @@ static void journal_write_compact(struct jset *jset)
}
/* Couldn't merge, move i into new position (after prev): */
- prev = prev ? jset_keys_next(prev) : jset->start;
+ prev = prev ? vstruct_next(prev) : jset->start;
if (i != prev)
memmove_u64s_down(prev, i, jset_u64s(u64s));
}
- prev = prev ? jset_keys_next(prev) : jset->start;
+ prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}
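journal_write_compact() now walks the jset with vstruct_for_each_safe(), dropping empty entries and merging adjacent JOURNAL_ENTRY_BTREE_KEYS entries that share a btree id and level. Schematically:

/*
 * before: [hdr btree=1 level=0 u64s=3][3 u64s][hdr btree=1 level=0 u64s=2][2 u64s]
 * after:  [hdr btree=1 level=0 u64s=5][5 u64s..............................]
 *
 * The le16_to_cpu(prev->u64s) + u64s <= U16_MAX guard is needed
 * because entry->u64s is a __le16 and must not wrap when merged.
 */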
@@ -2019,6 +2066,7 @@ static void journal_write(struct closure *cl)
struct cache_set *c = container_of(j, struct cache_set, journal);
struct cache *ca;
struct journal_buf *w = journal_prev_buf(j);
+ struct jset *jset = w->data;
struct bio *bio;
struct bch_extent_ptr *ptr;
unsigned i, sectors, bytes;
@@ -2036,24 +2084,27 @@ static void journal_write(struct closure *cl)
}
mutex_unlock(&c->btree_root_lock);
- journal_write_compact(w->data);
+ journal_write_compact(jset);
+
+ jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
+ jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
+ jset->magic = cpu_to_le64(jset_magic(c));
+ jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
- w->data->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
- w->data->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
- w->data->magic = cpu_to_le64(jset_magic(&c->disk_sb));
- w->data->version = cpu_to_le32(BCACHE_JSET_VERSION);
+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+ SET_JSET_CSUM_TYPE(jset, bch_meta_checksum_type(c));
- SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN);
- SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum);
- w->data->csum = cpu_to_le64(__csum_set(w->data,
- le32_to_cpu(w->data->u64s),
- JSET_CSUM_TYPE(w->data)));
+ bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ jset->encrypted_start,
+ vstruct_end(jset) - (void *) jset->encrypted_start);
- sectors = __set_blocks(w->data, le32_to_cpu(w->data->u64s),
- block_bytes(c)) * c->sb.block_size;
+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+ journal_nonce(jset), jset);
+
+ sectors = vstruct_sectors(jset, c->block_bits);
BUG_ON(sectors > j->prev_buf_sectors);
- bytes = __set_bytes(w->data, le32_to_cpu(w->data->u64s));
+ bytes = vstruct_bytes(w->data);
memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
if (journal_write_alloc(j, sectors)) {
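Note the write-side ordering here: everything past jset->encrypted_start is encrypted first, then the checksum is computed over the resulting ciphertext (encrypt-then-MAC). journal_entry_validate() above mirrors this, verifying the checksum before calling bch_encrypt() on the same range, which decrypts assuming the cipher is a stream cipher (its own inverse under the same nonce). Condensed from the read side:

	/* read side (journal_entry_validate), mirror of the write path: */
	csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
	if (bch_crc_cmp(csum, j->csum))
		return JOURNAL_ENTRY_BAD;	/* reject before touching the payload */

	bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
		    j->encrypted_start,
		    vstruct_end(j) - (void *) j->encrypted_start);	/* decrypt in place */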
@@ -2096,7 +2147,7 @@ static void journal_write(struct closure *cl)
bio->bi_private = ca;
bio_set_op_attrs(bio, REQ_OP_WRITE,
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch_bio_map(bio, w->data);
+ bch_bio_map(bio, jset);
trace_bcache_journal_write(bio);
closure_bio_submit_punt(bio, cl, c);
@@ -2105,7 +2156,7 @@ static void journal_write(struct closure *cl)
}
for_each_cache(ca, c, i)
- if (ca->mi.state == CACHE_ACTIVE &&
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
journal_flushes_device(ca) &&
!bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
percpu_ref_get(&ca->ref);
@@ -2503,7 +2554,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
"\tnr\t\t%u\n"
"\tcur_idx\t\t%u (seq %llu)\n"
"\tlast_idx\t%u (seq %llu)\n",
- iter, bch_nr_journal_buckets(ca->disk_sb.sb),
+ iter, ja->nr,
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
}
@@ -2521,7 +2572,7 @@ static bool bch_journal_writing_to_device(struct cache *ca)
spin_lock(&j->lock);
ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key),
- ca->sb.nr_this_dev);
+ ca->dev_idx);
spin_unlock(&j->lock);
return ret;
@@ -2541,10 +2592,11 @@ static bool bch_journal_writing_to_device(struct cache *ca)
int bch_journal_move(struct cache *ca)
{
- unsigned i, nr_buckets;
u64 last_flushed_seq;
+ struct journal_device *ja = &ca->journal;
struct cache_set *c = ca->set;
struct journal *j = &c->journal;
+ unsigned i;
int ret = 0; /* Success */
if (bch_journal_writing_to_device(ca)) {
@@ -2585,10 +2637,45 @@ int bch_journal_move(struct cache *ca)
last_flushed_seq = last_seq(j);
spin_unlock(&j->lock);
- nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
-
- for (i = 0; i < nr_buckets; i += 1)
- BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq);
+ for (i = 0; i < ja->nr; i += 1)
+ BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
return ret;
}
+
+void bch_journal_free_cache(struct cache *ca)
+{
+ kfree(ca->journal.buckets);
+ kfree(ca->journal.bucket_seq);
+}
+
+int bch_journal_init_cache(struct cache *ca)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets =
+ bch_sb_get_journal(ca->disk_sb.sb);
+ unsigned i, journal_entry_pages;
+
+ journal_entry_pages =
+ DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
+ PAGE_SECTORS);
+
+ ja->nr = bch_nr_journal_buckets(journal_buckets);
+
+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!ja->bucket_seq)
+ return -ENOMEM;
+
+ ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
+ if (!ca->journal.bio)
+ return -ENOMEM;
+
+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!ja->buckets)
+ return -ENOMEM;
+
+ for (i = 0; i < ja->nr; i++)
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+
+ return 0;
+}
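The new init/free pair at the end leaves cleanup to the caller: bch_journal_init_cache() can fail with earlier members already allocated, and bch_journal_free_cache() copes because kfree(NULL) is a no-op (the bio is presumably released by the wider device teardown, since it is not freed here). A sketch of the assumed caller pattern:

	/* hypothetical caller - device bringup is outside this diff */
	if (bch_journal_init_cache(ca)) {
		bch_journal_free_cache(ca);	/* safe on partial allocation */
		return -ENOMEM;
	}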