Diffstat (limited to 'libbcache/journal.c')
-rw-r--r--	libbcache/journal.c	583
1 file changed, 335 insertions, 248 deletions
diff --git a/libbcache/journal.c b/libbcache/journal.c
index 9e09b86..3bb9e3c 100644
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -18,7 +18,8 @@
 #include "io.h"
 #include "keylist.h"
 #include "journal.h"
-#include "super.h"
+#include "super-io.h"
+#include "vstructs.h"
 
 #include <trace/events/bcache.h>
 
@@ -52,19 +53,14 @@ static inline u64 journal_pin_seq(struct journal *j,
 	return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
 }
 
-#define for_each_jset_entry(entry, jset)				\
-	for (entry = (jset)->start;					\
-	     entry < bkey_idx(jset, le32_to_cpu((jset)->u64s));		\
-	     entry = jset_keys_next(entry))
-
 static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 					struct jset_entry *entry, unsigned type)
 {
-	while (entry < bkey_idx(jset, le32_to_cpu(jset->u64s))) {
+	while (entry < vstruct_last(jset)) {
 		if (JOURNAL_ENTRY_TYPE(entry) == type)
 			return entry;
 
-		entry = jset_keys_next(entry);
+		entry = vstruct_next(entry);
 	}
 
 	return NULL;
@@ -73,14 +69,11 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 #define for_each_jset_entry_type(entry, jset, type)			\
 	for (entry = (jset)->start;					\
 	     (entry = __jset_entry_type_next(jset, entry, type));	\
-	     entry = jset_keys_next(entry))
+	     entry = vstruct_next(entry))
 
 #define for_each_jset_key(k, _n, entry, jset)				\
 	for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS)	\
-		for (k = (entry)->start;				\
-		     (k < bkey_idx(entry, le16_to_cpu((entry)->u64s)) &&\
-		      (_n = bkey_next(k), 1));				\
-		     k = _n)
+		vstruct_for_each_safe(entry, k, _n)
 
 static inline void bch_journal_add_entry(struct journal_buf *buf,
 					 const void *data, size_t u64s,
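
Note: a minimal sketch of the stepping rule behind the vstruct helpers this diff switches to (assumed semantics, not the verbatim vstructs.h): a jset_entry is a fixed header followed by entry->u64s 64-bit words, so vstruct_next() reduces to pointer arithmetic on that count.

	static inline struct jset_entry *jset_entry_next_sketch(struct jset_entry *entry)
	{
		/* header plus le16 u64s payload words, in bytes */
		return (void *) entry + sizeof(*entry) +
			le16_to_cpu(entry->u64s) * sizeof(u64);
	}
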
@@ -199,8 +192,6 @@ redo_peek:
 
 		closure_sync(&cl);
 
-		mutex_lock(&c->btree_interior_update_lock);
-
 		for (i = 0;; i++) {
 			struct btree_interior_update *as;
 			struct pending_btree_node_free *d;
@@ -212,6 +203,8 @@ redo_peek:
 		}
 		n = bl->entries[i];
 		mutex_unlock(&j->blacklist_lock);
+redo_wait:
+		mutex_lock(&c->btree_interior_update_lock);
 
 		/*
 		 * Is the node on the list of pending interior node updates -
@@ -225,11 +218,11 @@ redo_peek:
 				closure_wait(&as->wait, &cl);
 				mutex_unlock(&c->btree_interior_update_lock);
 				closure_sync(&cl);
-				break;
+				goto redo_wait;
 			}
-		}
 
-		mutex_unlock(&c->btree_interior_update_lock);
+		mutex_unlock(&c->btree_interior_update_lock);
+	}
 
 	mutex_lock(&j->blacklist_lock);
 
@@ -377,7 +370,6 @@ out:
 struct journal_list {
 	struct closure		cl;
 	struct mutex		lock;
-	struct mutex		cache_set_buffer_lock;
 	struct list_head	*head;
 	int			ret;
 };
@@ -394,7 +386,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
 {
 	struct journal_replay *i, *pos;
 	struct list_head *where;
-	size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+	size_t bytes = vstruct_bytes(j);
 	__le64 last_seq;
 	int ret;
 
@@ -422,8 +414,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
 	list_for_each_entry_reverse(i, jlist->head, list) {
 		/* Duplicate? */
 		if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
-			fsck_err_on(bytes != __set_bytes(&i->j,
-						le32_to_cpu(i->j.u64s)) ||
+			fsck_err_on(bytes != vstruct_bytes(&i->j) ||
 				    memcmp(j, &i->j, bytes), c,
 				    "found duplicate but non identical journal entries (seq %llu)",
 				    le64_to_cpu(j->seq));
@@ -455,11 +446,21 @@ fsck_err:
 	return ret;
 }
 
+static struct nonce journal_nonce(const struct jset *jset)
+{
+	return (struct nonce) {{
+		[0] = 0,
+		[1] = ((__le32 *) &jset->seq)[0],
+		[2] = ((__le32 *) &jset->seq)[1],
+		[3] = BCH_NONCE_JOURNAL,
+	}};
+}
+
 static void journal_entry_null_range(void *start, void *end)
 {
 	struct jset_entry *entry;
 
-	for (entry = start; entry != end; entry = jset_keys_next(entry)) {
+	for (entry = start; entry != end; entry = vstruct_next(entry)) {
 		entry->u64s = 0;
 		entry->btree_id = 0;
 		entry->level = 0;
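
Note: journal_nonce() packs the 64-bit journal sequence number into words 1-2 of the nonce and tags word 3 with BCH_NONCE_JOURNAL, so every journal entry is encrypted under a distinct nonce (seq never repeats). A worked example with a made-up sequence number:

	/* hypothetical: jset->seq == cpu_to_le64(0x0123456789abcdef)
	 *
	 *   d[0] = 0x00000000
	 *   d[1] = 0x89abcdef	(low 32 bits of seq, as stored on disk)
	 *   d[2] = 0x01234567	(high 32 bits of seq)
	 *   d[3] = BCH_NONCE_JOURNAL
	 */
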
@@ -473,7 +474,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 				struct bkey_i *k, enum bkey_type key_type,
 				const char *type)
 {
-	void *next = jset_keys_next(entry);
+	void *next = vstruct_next(entry);
 	const char *invalid;
 	char buf[160];
 	int ret = 0;
@@ -481,16 +482,16 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 	if (mustfix_fsck_err_on(!k->k.u64s, c,
 			"invalid %s in journal: k->u64s 0", type)) {
 		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
-		journal_entry_null_range(jset_keys_next(entry), next);
+		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
 
 	if (mustfix_fsck_err_on((void *) bkey_next(k) >
-				(void *) jset_keys_next(entry), c,
+				(void *) vstruct_next(entry), c,
 			"invalid %s in journal: extends past end of journal entry",
 			type)) {
 		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
-		journal_entry_null_range(jset_keys_next(entry), next);
+		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
 
@@ -499,7 +500,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 			type, k->k.format)) {
 		le16_add_cpu(&entry->u64s, -k->k.u64s);
 		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
-		journal_entry_null_range(jset_keys_next(entry), next);
+		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
 
@@ -514,7 +515,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 
 		le16_add_cpu(&entry->u64s, -k->k.u64s);
 		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
-		journal_entry_null_range(jset_keys_next(entry), next);
+		journal_entry_null_range(vstruct_next(entry), next);
 		return 0;
 	}
 fsck_err:
@@ -525,16 +526,17 @@ fsck_err:
 #define JOURNAL_ENTRY_NONE	6
 #define JOURNAL_ENTRY_BAD	7
 
-static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 sector,
+static int journal_entry_validate(struct cache_set *c,
+				  struct jset *j, u64 sector,
 				  unsigned bucket_sectors_left,
 				  unsigned sectors_read)
 {
 	struct jset_entry *entry;
-	size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
-	u64 got, expect;
+	size_t bytes = vstruct_bytes(j);
+	struct bch_csum csum;
 	int ret = 0;
 
-	if (le64_to_cpu(j->magic) != jset_magic(&c->disk_sb))
+	if (le64_to_cpu(j->magic) != jset_magic(c))
 		return JOURNAL_ENTRY_NONE;
 
 	if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
@@ -554,25 +556,32 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 	if (bytes > sectors_read << 9)
 		return JOURNAL_ENTRY_REREAD;
 
-	got = le64_to_cpu(j->csum);
-	expect = __csum_set(j, le32_to_cpu(j->u64s), JSET_CSUM_TYPE(j));
-	if (mustfix_fsck_err_on(got != expect, c,
-			"journal checksum bad (got %llu expect %llu), sector %lluu",
-			got, expect, sector)) {
+	if (fsck_err_on(!bch_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+			"journal entry with unknown csum type %llu sector %lluu",
+			JSET_CSUM_TYPE(j), sector))
+		return JOURNAL_ENTRY_BAD;
+
+	csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+	if (mustfix_fsck_err_on(bch_crc_cmp(csum, j->csum), c,
+			"journal checksum bad, sector %llu", sector)) {
 		/* XXX: retry IO, when we start retrying checksum errors */
 		/* XXX: note we might have missing journal entries */
 		return JOURNAL_ENTRY_BAD;
 	}
 
-	if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq),
-				c, "invalid journal entry: last_seq > seq"))
+	bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+		    j->encrypted_start,
+		    vstruct_end(j) - (void *) j->encrypted_start);
+
+	if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+			"invalid journal entry: last_seq > seq"))
 		j->last_seq = j->seq;
 
-	for_each_jset_entry(entry, j) {
+	vstruct_for_each(j, entry) {
 		struct bkey_i *k;
 
-		if (mustfix_fsck_err_on(jset_keys_next(entry) >
-					bkey_idx(j, le32_to_cpu(j->u64s)), c,
+		if (mustfix_fsck_err_on(vstruct_next(entry) >
+					vstruct_last(j), c,
 				"journal entry extents past end of jset")) {
 			j->u64s = cpu_to_le64((u64 *) entry - j->_data);
 			break;
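
Note on the ordering just above: the checksum is verified over the still-encrypted payload, and only then does bch_encrypt() run in place to decrypt it. One call can serve both directions if the cipher is a stream cipher (ChaCha20 elsewhere in this codebase, assumed here): applying the keystream twice with the same key and nonce is a no-op. A toy stand-in:

	/* illustrative only - an XOR stream applied twice restores the buffer */
	static void xor_stream_sketch(u8 *buf, const u8 *keystream, size_t len)
	{
		size_t i;

		for (i = 0; i < len; i++)
			buf[i] ^= keystream[i];
	}
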
@@ -580,9 +589,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 
 		switch (JOURNAL_ENTRY_TYPE(entry)) {
 		case JOURNAL_ENTRY_BTREE_KEYS:
-			for (k = entry->start;
-			     k < bkey_idx(entry, le16_to_cpu(entry->u64s));
-			     k = bkey_next(k)) {
+			vstruct_for_each(entry, k) {
 				ret = journal_validate_key(c, j, entry, k,
 						bkey_type(entry->level,
 							  entry->btree_id),
@@ -599,7 +606,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 					le16_to_cpu(entry->u64s) != k->k.u64s, c,
 					"invalid btree root journal entry: wrong number of keys")) {
 				journal_entry_null_range(entry,
-						jset_keys_next(entry));
+						vstruct_next(entry));
 				continue;
 			}
 
@@ -616,14 +623,14 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 			if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
 					"invalid journal seq blacklist entry: bad size")) {
 				journal_entry_null_range(entry,
-						jset_keys_next(entry));
+						vstruct_next(entry));
 			}
 			break;
 		default:
 			mustfix_fsck_err(c, "invalid journal entry type %llu",
 					 JOURNAL_ENTRY_TYPE(entry));
-			journal_entry_null_range(entry, jset_keys_next(entry));
+			journal_entry_null_range(entry, vstruct_next(entry));
 			break;
 		}
 	}
@@ -632,126 +639,127 @@ fsck_err:
 	return ret;
 }
 
-static int journal_read_bucket(struct cache *ca, struct journal_list *jlist,
+struct journal_read_buf {
+	void		*data;
+	size_t		size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+				    size_t new_size)
+{
+	void *n;
+
+	new_size = roundup_pow_of_two(new_size);
+	n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
+	if (!n)
+		return -ENOMEM;
+
+	free_pages((unsigned long) b->data, get_order(b->size));
+	b->data = n;
+	b->size = new_size;
+	return 0;
+}
+
+static int journal_read_bucket(struct cache *ca,
+			       struct journal_read_buf *buf,
+			       struct journal_list *jlist,
 			       unsigned bucket, u64 *seq, bool *entries_found)
 {
 	struct cache_set *c = ca->set;
 	struct journal_device *ja = &ca->journal;
 	struct bio *bio = ja->bio;
-	struct jset *j, *data;
-	unsigned blocks, sectors_read, bucket_offset = 0;
-	unsigned max_entry_sectors = c->journal.entry_size_max >> 9;
-	u64 sector = bucket_to_sector(ca,
-			journal_bucket(ca->disk_sb.sb, bucket));
+	struct jset *j = NULL;
+	unsigned sectors, sectors_read = 0;
+	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+	    end = offset + ca->mi.bucket_size;
 	bool saw_bad = false;
 	int ret = 0;
 
-	data = (void *) __get_free_pages(GFP_KERNEL,
-				get_order(c->journal.entry_size_max));
-	if (!data) {
-		mutex_lock(&jlist->cache_set_buffer_lock);
-		data = c->journal.buf[0].data;
-	}
-
 	pr_debug("reading %u", bucket);
 
-	while (bucket_offset < ca->mi.bucket_size) {
-reread:
-		sectors_read = min_t(unsigned,
-			ca->mi.bucket_size - bucket_offset,
-			max_entry_sectors);
+	while (offset < end) {
+		if (!sectors_read) {
+reread:			sectors_read = min_t(unsigned,
+				end - offset, buf->size >> 9);
 
-		bio_reset(bio);
-		bio->bi_bdev = ca->disk_sb.bdev;
-		bio->bi_iter.bi_sector = sector + bucket_offset;
-		bio->bi_iter.bi_size = sectors_read << 9;
-		bio_set_op_attrs(bio, REQ_OP_READ, 0);
-		bch_bio_map(bio, data);
-
-		ret = submit_bio_wait(bio);
-
-		if (cache_fatal_io_err_on(ret, ca,
-					"journal read from sector %llu",
-					sector + bucket_offset) ||
-		    bch_meta_read_fault("journal")) {
-			ret = -EIO;
-			goto err;
-		}
+			bio_reset(bio);
+			bio->bi_bdev = ca->disk_sb.bdev;
+			bio->bi_iter.bi_sector = offset;
+			bio->bi_iter.bi_size = sectors_read << 9;
+			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			bch_bio_map(bio, buf->data);
 
-		/* This function could be simpler now since we no longer write
-		 * journal entries that overlap bucket boundaries; this means
-		 * the start of a bucket will always have a valid journal entry
-		 * if it has any journal entries at all.
-		 */
+			ret = submit_bio_wait(bio);
 
-		j = data;
-		while (sectors_read) {
-			ret = journal_entry_validate(c, j,
-					sector + bucket_offset,
-					ca->mi.bucket_size - bucket_offset,
-					sectors_read);
-			switch (ret) {
-			case BCH_FSCK_OK:
-				break;
-			case JOURNAL_ENTRY_REREAD:
-				goto reread;
-			case JOURNAL_ENTRY_NONE:
-				if (!saw_bad)
-					goto out;
-				blocks = 1;
-				goto next_block;
-			case JOURNAL_ENTRY_BAD:
-				saw_bad = true;
-				blocks = 1;
-				goto next_block;
-			default:
-				goto err;
-			}
+			if (cache_fatal_io_err_on(ret, ca,
+						  "journal read from sector %llu",
+						  offset) ||
+			    bch_meta_read_fault("journal"))
+				return -EIO;
 
-			/*
-			 * This happens sometimes if we don't have discards on -
-			 * when we've partially overwritten a bucket with new
-			 * journal entries. We don't need the rest of the
-			 * bucket:
-			 */
-			if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
-				goto out;
-
-			ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
-			ret = journal_entry_add(c, jlist, j);
-			switch (ret) {
-			case JOURNAL_ENTRY_ADD_OK:
-				*entries_found = true;
-				break;
-			case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
-				break;
-			default:
-				goto err;
+			j = buf->data;
+		}
+
+		ret = journal_entry_validate(c, j, offset,
+					end - offset, sectors_read);
+		switch (ret) {
+		case BCH_FSCK_OK:
+			break;
+		case JOURNAL_ENTRY_REREAD:
+			if (vstruct_bytes(j) > buf->size) {
+				ret = journal_read_buf_realloc(buf,
+							vstruct_bytes(j));
+				if (ret)
+					return ret;
 			}
+			goto reread;
+		case JOURNAL_ENTRY_NONE:
+			if (!saw_bad)
+				return 0;
+			sectors = c->sb.block_size;
+			goto next_block;
+		case JOURNAL_ENTRY_BAD:
+			saw_bad = true;
+			sectors = c->sb.block_size;
+			goto next_block;
+		default:
+			return ret;
+		}
 
-			if (le64_to_cpu(j->seq) > *seq)
-				*seq = le64_to_cpu(j->seq);
-next_block:
-			blocks = __set_blocks(j, le32_to_cpu(j->u64s),
-					      block_bytes(c));
+		/*
+		 * This happens sometimes if we don't have discards on -
+		 * when we've partially overwritten a bucket with new
+		 * journal entries. We don't need the rest of the
+		 * bucket:
+		 */
+		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+			return 0;
+
+		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
-			pr_debug("next");
-			bucket_offset += blocks * c->sb.block_size;
-			sectors_read -= blocks * c->sb.block_size;
-			j = ((void *) j) + blocks * block_bytes(c);
+		ret = journal_entry_add(c, jlist, j);
+		switch (ret) {
+		case JOURNAL_ENTRY_ADD_OK:
+			*entries_found = true;
+			break;
+		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+			break;
+		default:
+			return ret;
 		}
+
+		if (le64_to_cpu(j->seq) > *seq)
+			*seq = le64_to_cpu(j->seq);
+
+		sectors = vstruct_sectors(j, c->block_bits);
+next_block:
+		pr_debug("next");
+		offset += sectors;
+		sectors_read -= sectors;
+		j = ((void *) j) + (sectors << 9);
 	}
-out:
-	ret = 0;
-err:
-	if (data == c->journal.buf[0].data)
-		mutex_unlock(&jlist->cache_set_buffer_lock);
-	else
-		free_pages((unsigned long) data,
-			   get_order(c->journal.entry_size_max));
 
-	return ret;
+	return 0;
 }
 
 static void bch_journal_read_device(struct closure *cl)
@@ -759,15 +767,11 @@ static void bch_journal_read_device(struct closure *cl)
 #define read_bucket(b)							\
 	({								\
 		bool entries_found = false;				\
-		int ret = journal_read_bucket(ca, jlist, b,		\
-					      &seq, &entries_found);	\
+		ret = journal_read_bucket(ca, &buf, jlist, b, &seq,	\
+					  &entries_found);		\
+		if (ret)						\
+			goto err;					\
 		__set_bit(b, bitmap);					\
-		if (ret) {						\
-			mutex_lock(&jlist->lock);			\
-			jlist->ret = ret;				\
-			mutex_unlock(&jlist->lock);			\
-			closure_return(cl);				\
-		}							\
 		entries_found;						\
 	})
 
@@ -777,24 +781,29 @@ static void bch_journal_read_device(struct closure *cl)
 	struct journal_list *jlist =
 		container_of(cl->parent, struct journal_list, cl);
 	struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+	struct journal_read_buf buf = { NULL, 0 };
 
-	unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
-	DECLARE_BITMAP(bitmap, nr_buckets);
+	DECLARE_BITMAP(bitmap, ja->nr);
 	unsigned i, l, r;
 	u64 seq = 0;
+	int ret;
 
-	if (!nr_buckets)
-		closure_return(cl);
+	if (!ja->nr)
+		goto out;
+
+	bitmap_zero(bitmap, ja->nr);
+	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+	if (ret)
+		goto err;
 
-	bitmap_zero(bitmap, nr_buckets);
-	pr_debug("%u journal buckets", nr_buckets);
+	pr_debug("%u journal buckets", ja->nr);
 
 	/*
 	 * If the device supports discard but not secure discard, we can't do
 	 * the fancy fibonacci hash/binary search because the live journal
 	 * entries might not form a contiguous range:
 	 */
-	for (i = 0; i < nr_buckets; i++)
+	for (i = 0; i < ja->nr; i++)
 		read_bucket(i);
 	goto search_done;
 
@@ -805,8 +814,8 @@ static void bch_journal_read_device(struct closure *cl)
 	 * Read journal buckets ordered by golden ratio hash to quickly
 	 * find a sequence of buckets with valid journal entries
 	 */
-	for (i = 0; i < nr_buckets; i++) {
-		l = (i * 2654435769U) % nr_buckets;
+	for (i = 0; i < ja->nr; i++) {
+		l = (i * 2654435769U) % ja->nr;
 
 		if (test_bit(l, bitmap))
 			break;
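
Note: 2654435769U is 2^32 divided by the golden ratio, making the loop above a Fibonacci (multiplicative) hash: successive values of i scatter quasi-uniformly over the ja->nr buckets, so a region holding live journal entries is found in few probes. The probe order as a standalone sketch (helper name is ours, not the patch's):

	static unsigned journal_probe_bucket_sketch(unsigned i, unsigned nr)
	{
		/* 2654435769 == 2^32 / 1.6180339... */
		return (i * 2654435769U) % nr;
	}
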
@@ -821,18 +830,18 @@ static void bch_journal_read_device(struct closure *cl)
 	 */
 	pr_debug("falling back to linear search");
 linear_scan:
-	for (l = find_first_zero_bit(bitmap, nr_buckets);
-	     l < nr_buckets;
-	     l = find_next_zero_bit(bitmap, nr_buckets, l + 1))
+	for (l = find_first_zero_bit(bitmap, ja->nr);
+	     l < ja->nr;
+	     l = find_next_zero_bit(bitmap, ja->nr, l + 1))
 		if (read_bucket(l))
 			goto bsearch;
 
 	/* no journal entries on this device? */
-	if (l == nr_buckets)
-		closure_return(cl);
+	if (l == ja->nr)
+		goto out;
 bsearch:
 	/* Binary search */
-	r = find_next_bit(bitmap, nr_buckets, l + 1);
+	r = find_next_bit(bitmap, ja->nr, l + 1);
 	pr_debug("starting binary search, l %u r %u", l, r);
 
 	while (l + 1 < r) {
@@ -858,9 +867,9 @@ search_done:
 	 */
 	seq = 0;
 
-	for (i = 0; i < nr_buckets; i++)
+	for (i = 0; i < ja->nr; i++)
 		if (ja->bucket_seq[i] >= seq &&
-		    ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % nr_buckets]) {
+		    ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
 			/*
 			 * When journal_next_bucket() goes to allocate for
 			 * the first time, it'll use the bucket after
@@ -875,20 +884,26 @@ search_done:
 	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
 	 * pinned when it first runs:
 	 */
-	ja->last_idx = (ja->cur_idx + 1) % nr_buckets;
+	ja->last_idx = (ja->cur_idx + 1) % ja->nr;
 
 	/*
 	 * Read buckets in reverse order until we stop finding more journal
 	 * entries:
 	 */
-	for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets;
+	for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
 	     i != ja->cur_idx;
-	     i = (i + nr_buckets - 1) % nr_buckets)
+	     i = (i + ja->nr - 1) % ja->nr)
 		if (!test_bit(i, bitmap) &&
 		    !read_bucket(i))
 			break;
-
+out:
+	free_pages((unsigned long) buf.data, get_order(buf.size));
 	closure_return(cl);
+err:
+	mutex_lock(&jlist->lock);
+	jlist->ret = ret;
+	mutex_unlock(&jlist->lock);
+	goto out;
 #undef read_bucket
 }
 
@@ -930,6 +945,19 @@ static int journal_seq_blacklist_read(struct journal *j,
 	return 0;
 }
 
+static inline bool journal_has_keys(struct list_head *list)
+{
+	struct journal_replay *i;
+	struct jset_entry *entry;
+	struct bkey_i *k, *_n;
+
+	list_for_each_entry(i, list, list)
+		for_each_jset_key(k, _n, entry, &i->j)
+			return true;
+
+	return false;
+}
+
 int bch_journal_read(struct cache_set *c, struct list_head *list)
 {
 	struct jset_entry *prio_ptrs;
@@ -944,7 +972,6 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 
 	closure_init_stack(&jlist.cl);
 	mutex_init(&jlist.lock);
-	mutex_init(&jlist.cache_set_buffer_lock);
 	jlist.head = list;
 	jlist.ret = 0;
 
@@ -964,6 +991,9 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 		return BCH_FSCK_REPAIR_IMPOSSIBLE;
 	}
 
+	fsck_err_on(c->sb.clean && journal_has_keys(list), c,
+		    "filesystem marked clean but journal has keys to replay");
+
 	j = &list_entry(list->prev, struct journal_replay, list)->j;
 
 	unfixable_fsck_err_on(le64_to_cpu(j->seq) -
@@ -1057,7 +1087,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 			struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
 
 			if (btree_type_has_ptrs(type))
-				__bch_btree_mark_key(c, type, k_s_c);
+				bch_btree_mark_key_initial(c, type, k_s_c);
 		}
 }
 
@@ -1171,10 +1201,9 @@ static enum {
 	buf->data->last_seq	= cpu_to_le64(last_seq(j));
 
 	j->prev_buf_sectors =
-		__set_blocks(buf->data,
-			     le32_to_cpu(buf->data->u64s) +
-			     journal_entry_u64s_reserve(buf),
-			     block_bytes(c)) * c->sb.block_size;
+		vstruct_blocks_plus(buf->data, c->block_bits,
+				    journal_entry_u64s_reserve(buf)) *
+		c->sb.block_size;
 
 	BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
 
@@ -1219,9 +1248,8 @@ static unsigned journal_dev_buckets_available(struct journal *j,
 					      struct cache *ca)
 {
 	struct journal_device *ja = &ca->journal;
-	unsigned nr = bch_nr_journal_buckets(ca->disk_sb.sb);
-	unsigned next = (ja->cur_idx + 1) % nr;
-	unsigned available = (ja->last_idx + nr - next) % nr;
+	unsigned next = (ja->cur_idx + 1) % ja->nr;
+	unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
 
 	/*
 	 * Hack to avoid a deadlock during journal replay:
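
Note: a worked example of the ring arithmetic in journal_dev_buckets_available(), with made-up values: if ja->nr == 8, ja->cur_idx == 6 and ja->last_idx == 2, then next == (6 + 1) % 8 == 7 and available == (2 + 8 - 7) % 8 == 3, i.e. three buckets can still be written before the writer catches up with the reclaim tail.
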
@@ -1271,7 +1299,7 @@ static int journal_entry_sectors(struct journal *j)
 		 * for the previous entry we have to make sure we have space for
 		 * it too:
 		 */
-		if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) {
+		if (bch_extent_has_device(e.c, ca->dev_idx)) {
 			if (j->prev_buf_sectors > ca->journal.sectors_free)
 				buckets_required++;
 
@@ -1479,17 +1507,28 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list)
 		entries++;
 	}
 
+	if (keys) {
+		bch_btree_flush(c);
+
+		/*
+		 * Write a new journal entry _before_ we start journalling new data -
+		 * otherwise, we could end up with btree node bsets with journal seqs
+		 * arbitrarily far in the future vs. the most recently written journal
+		 * entry on disk, if we crash before writing the next journal entry:
+		 */
+		ret = bch_journal_meta(&c->journal);
+		if (ret)
+			goto err;
+	}
+
 	bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
 		 keys, entries, (u64) atomic64_read(&j->seq));
 
-	fsck_err_on(c->sb.clean && keys, c,
-		    "filesystem marked clean, but journal had keys to replay");
-
 	bch_journal_set_replay_done(&c->journal);
 err:
 	if (ret)
 		bch_err(c, "journal replay error: %d", ret);
-fsck_err:
+
 	bch_journal_entries_free(list);
 
 	return ret;
@@ -1497,28 +1536,40 @@ fsck_err:
 
 static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
 {
-	unsigned u64s = bch_journal_buckets_offset(ca->disk_sb.sb) + nr;
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal *journal_buckets =
+		bch_sb_get_journal(ca->disk_sb.sb);
+	struct bch_sb_field *f;
 	u64 *p;
-	int ret;
 
-	ret = bch_super_realloc(&ca->disk_sb, u64s);
-	if (ret)
-		return ret;
+	p = krealloc(ja->bucket_seq, nr * sizeof(u64),
+		     GFP_KERNEL|__GFP_ZERO);
+	if (!p)
+		return -ENOMEM;
+
+	ja->bucket_seq = p;
 
-	p = krealloc(ca->journal.bucket_seq,
-		     nr * sizeof(u64),
+	p = krealloc(ja->buckets, nr * sizeof(u64),
 		     GFP_KERNEL|__GFP_ZERO);
 	if (!p)
 		return -ENOMEM;
 
-	ca->journal.bucket_seq = p;
-	ca->disk_sb.sb->u64s = cpu_to_le16(u64s);
+	ja->buckets = p;
+
+	f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
+				    sizeof(*journal_buckets) / sizeof(u64));
+	if (!f)
+		return -ENOMEM;
+
+	f->type = BCH_SB_FIELD_journal;
+	ja->nr = nr;
 
 	return 0;
 }
 
 int bch_cache_journal_alloc(struct cache *ca)
 {
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal *journal_buckets;
 	int ret;
 	unsigned i;
 
@@ -1540,11 +1591,15 @@ int bch_cache_journal_alloc(struct cache *ca)
 	if (ret)
 		return ret;
 
-	for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) {
-		unsigned long r = ca->mi.first_bucket + i;
+	journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
+
+	for (i = 0; i < ja->nr; i++) {
+		u64 bucket = ca->mi.first_bucket + i;
 
-		bch_mark_metadata_bucket(ca, &ca->buckets[r], true);
-		set_journal_bucket(ca->disk_sb.sb, i, r);
+		ja->buckets[i] = bucket;
+		journal_buckets->buckets[i] = cpu_to_le64(bucket);
+
+		bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
 	}
 
 	return 0;
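
Note: bch_cache_journal_alloc() now keeps two copies of the bucket list in step: ja->buckets holds native-endian values for runtime use, while journal_buckets->buckets is the little-endian superblock copy that goes to disk; bch_journal_init_cache(), added at the bottom of this diff, rebuilds the former from the latter when a device is brought up.
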
@@ -1749,7 +1804,7 @@ static void journal_reclaim_work(struct work_struct *work)
 	struct cache *ca;
 	struct journal_entry_pin *pin;
 	u64 seq_to_flush = 0;
-	unsigned iter, nr, bucket_to_flush;
+	unsigned iter, bucket_to_flush;
 	unsigned long next_flush;
 	bool reclaim_lock_held = false, need_flush;
 
@@ -1781,13 +1836,11 @@ static void journal_reclaim_work(struct work_struct *work)
 		    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
 			blkdev_issue_discard(ca->disk_sb.bdev,
 					     bucket_to_sector(ca,
-						journal_bucket(ca->disk_sb.sb,
-							       ja->last_idx)),
+						ja->buckets[ja->last_idx]),
 					     ca->mi.bucket_size, GFP_NOIO, 0);
 
 		spin_lock(&j->lock);
-		ja->last_idx = (ja->last_idx + 1) %
-			bch_nr_journal_buckets(ca->disk_sb.sb);
+		ja->last_idx = (ja->last_idx + 1) % ja->nr;
 		spin_unlock(&j->lock);
 
 		wake_up(&j->wait);
@@ -1798,8 +1851,7 @@ static void journal_reclaim_work(struct work_struct *work)
 		 * buckets
 		 */
 		spin_lock(&j->lock);
-		nr = bch_nr_journal_buckets(ca->disk_sb.sb),
-		bucket_to_flush = (ja->cur_idx + (nr >> 1)) % nr;
+		bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
 		seq_to_flush = max_t(u64, seq_to_flush,
 				     ja->bucket_seq[bucket_to_flush]);
 		spin_unlock(&j->lock);
@@ -1861,7 +1913,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 	 */
 	extent_for_each_ptr_backwards(e, ptr)
 		if (!(ca = PTR_CACHE(c, ptr)) ||
-		    ca->mi.state != CACHE_ACTIVE ||
+		    ca->mi.state != BCH_MEMBER_STATE_ACTIVE ||
 		    ca->journal.sectors_free <= sectors)
 			__bch_extent_drop_ptr(e, ptr);
 		else
@@ -1875,7 +1927,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 	 */
 	group_for_each_cache_rcu(ca, &j->devs, iter) {
 		struct journal_device *ja = &ca->journal;
-		unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
 
 		if (replicas >= replicas_want)
 			break;
@@ -1884,21 +1935,20 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 		 * Check that we can use this device, and aren't already using
 		 * it:
 		 */
-		if (bch_extent_has_device(e.c, ca->sb.nr_this_dev) ||
+		if (bch_extent_has_device(e.c, ca->dev_idx) ||
 		    !journal_dev_buckets_available(j, ca) ||
 		    sectors > ca->mi.bucket_size)
 			continue;
 
 		ja->sectors_free = ca->mi.bucket_size - sectors;
-		ja->cur_idx = (ja->cur_idx + 1) % nr_buckets;
+		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
 		ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
 
 		extent_ptr_append(bkey_i_to_extent(&j->key),
 			(struct bch_extent_ptr) {
 				  .offset = bucket_to_sector(ca,
-					journal_bucket(ca->disk_sb.sb,
-						       ja->cur_idx)),
-				  .dev = ca->sb.nr_this_dev,
+					ja->buckets[ja->cur_idx]),
+				  .dev = ca->dev_idx,
 			});
 
 		replicas++;
@@ -1928,10 +1978,7 @@ static void journal_write_compact(struct jset *jset)
 	 * If we wanted to be really fancy here, we could sort all the keys in
 	 * the jset and drop keys that were overwritten - probably not worth it:
 	 */
-	for (i = jset->start;
-	     i < (struct jset_entry *) bkey_idx(jset, le32_to_cpu(jset->u64s)) &&
-	     (next = jset_keys_next(i), true);
-	     i = next) {
+	vstruct_for_each_safe(jset, i, next) {
 		unsigned u64s = le16_to_cpu(i->u64s);
 
 		/* Empty entry: */
@@ -1945,7 +1992,7 @@ static void journal_write_compact(struct jset *jset)
 		    JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
 		    JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
 		    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
-			memmove_u64s_down(jset_keys_next(prev),
+			memmove_u64s_down(vstruct_next(prev),
 					  i->_data,
 					  u64s);
 			le16_add_cpu(&prev->u64s, u64s);
@@ -1953,12 +2000,12 @@ static void journal_write_compact(struct jset *jset)
 		}
 
 		/* Couldn't merge, move i into new position (after prev): */
-		prev = prev ? jset_keys_next(prev) : jset->start;
+		prev = prev ? vstruct_next(prev) : jset->start;
 		if (i != prev)
 			memmove_u64s_down(prev, i, jset_u64s(u64s));
 	}
 
-	prev = prev ? jset_keys_next(prev) : jset->start;
+	prev = prev ? vstruct_next(prev) : jset->start;
 	jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
 }
 
@@ -2019,6 +2066,7 @@ static void journal_write(struct closure *cl)
 	struct cache_set *c = container_of(j, struct cache_set, journal);
 	struct cache *ca;
 	struct journal_buf *w = journal_prev_buf(j);
+	struct jset *jset = w->data;
 	struct bio *bio;
 	struct bch_extent_ptr *ptr;
 	unsigned i, sectors, bytes;
@@ -2036,24 +2084,27 @@ static void journal_write(struct closure *cl)
 	}
 	mutex_unlock(&c->btree_root_lock);
 
-	journal_write_compact(w->data);
+	journal_write_compact(jset);
+
+	jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
+	jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
+	jset->magic = cpu_to_le64(jset_magic(c));
+	jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
 
-	w->data->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
-	w->data->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
-	w->data->magic = cpu_to_le64(jset_magic(&c->disk_sb));
-	w->data->version = cpu_to_le32(BCACHE_JSET_VERSION);
+	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+	SET_JSET_CSUM_TYPE(jset, bch_meta_checksum_type(c));
 
-	SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN);
-	SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum);
-	w->data->csum = cpu_to_le64(__csum_set(w->data,
-					       le32_to_cpu(w->data->u64s),
-					       JSET_CSUM_TYPE(w->data)));
+	bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+		    jset->encrypted_start,
+		    vstruct_end(jset) - (void *) jset->encrypted_start);
 
-	sectors = __set_blocks(w->data, le32_to_cpu(w->data->u64s),
-			       block_bytes(c)) * c->sb.block_size;
+	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+				  journal_nonce(jset), jset);
+
+	sectors = vstruct_sectors(jset, c->block_bits);
 	BUG_ON(sectors > j->prev_buf_sectors);
 
-	bytes = __set_bytes(w->data, le32_to_cpu(w->data->u64s));
+	bytes = vstruct_bytes(w->data);
 	memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
 
 	if (journal_write_alloc(j, sectors)) {
@@ -2096,7 +2147,7 @@ static void journal_write(struct closure *cl)
 		bio->bi_private	= ca;
 		bio_set_op_attrs(bio, REQ_OP_WRITE,
 				 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
-		bch_bio_map(bio, w->data);
+		bch_bio_map(bio, jset);
 
 		trace_bcache_journal_write(bio);
 		closure_bio_submit_punt(bio, cl, c);
@@ -2105,7 +2156,7 @@ static void journal_write(struct closure *cl)
 	}
 
 	for_each_cache(ca, c, i)
-		if (ca->mi.state == CACHE_ACTIVE &&
+		if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
 		    journal_flushes_device(ca) &&
 		    !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
 			percpu_ref_get(&ca->ref);
@@ -2503,7 +2554,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
 			 "\tnr\t\t%u\n"
 			 "\tcur_idx\t\t%u (seq %llu)\n"
 			 "\tlast_idx\t%u (seq %llu)\n",
-			 iter, bch_nr_journal_buckets(ca->disk_sb.sb),
+			 iter, ja->nr,
 			 ja->cur_idx,	ja->bucket_seq[ja->cur_idx],
 			 ja->last_idx,	ja->bucket_seq[ja->last_idx]);
 	}
@@ -2521,7 +2572,7 @@ static bool bch_journal_writing_to_device(struct cache *ca)
 
 	spin_lock(&j->lock);
 	ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key),
-				    ca->sb.nr_this_dev);
+				    ca->dev_idx);
 	spin_unlock(&j->lock);
 
 	return ret;
@@ -2541,10 +2592,11 @@ static bool bch_journal_writing_to_device(struct cache *ca)
 
 int bch_journal_move(struct cache *ca)
 {
-	unsigned i, nr_buckets;
 	u64 last_flushed_seq;
+	struct journal_device *ja = &ca->journal;
 	struct cache_set *c = ca->set;
 	struct journal *j = &c->journal;
+	unsigned i;
 	int ret = 0;		/* Success */
 
 	if (bch_journal_writing_to_device(ca)) {
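
Note on the write path in journal_write() above: the payload past encrypted_start is encrypted first and the checksum then taken over the whole jset, mirroring the read side, which verifies before decrypting; the entry is also zero-padded out to a whole number of blocks before issue. A sketch of the size arithmetic, assuming vstruct_sectors() rounds the byte size up to the block size and that c->block_bits is log2 of the block size in 512-byte sectors:

	/* illustrative only */
	static unsigned vstruct_sectors_sketch(size_t bytes, unsigned block_bits)
	{
		/* blocks, rounded up ... */
		size_t blocks = DIV_ROUND_UP(bytes, (size_t) 512 << block_bits);

		/* ... expressed in 512-byte sectors */
		return blocks << block_bits;
	}
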
@@ -2585,10 +2637,45 @@ int bch_journal_move(struct cache *ca)
 	last_flushed_seq = last_seq(j);
 	spin_unlock(&j->lock);
 
-	nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
-
-	for (i = 0; i < nr_buckets; i += 1)
-		BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq);
+	for (i = 0; i < ja->nr; i += 1)
+		BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
 
 	return ret;
 }
+
+void bch_journal_free_cache(struct cache *ca)
+{
+	kfree(ca->journal.buckets);
+	kfree(ca->journal.bucket_seq);
+}
+
+int bch_journal_init_cache(struct cache *ca)
+{
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal *journal_buckets =
+		bch_sb_get_journal(ca->disk_sb.sb);
+	unsigned i, journal_entry_pages;
+
+	journal_entry_pages =
+		DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
+			     PAGE_SECTORS);
+
+	ja->nr = bch_nr_journal_buckets(journal_buckets);
+
+	ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+	if (!ja->bucket_seq)
+		return -ENOMEM;
+
+	ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
+	if (!ca->journal.bio)
+		return -ENOMEM;
+
+	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+	if (!ja->buckets)
+		return -ENOMEM;
+
+	for (i = 0; i < ja->nr; i++)
+		ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+
+	return 0;
+}
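
Note: bch_journal_init_cache() sizes ja->nr via bch_nr_journal_buckets(). A hedged sketch of what that helper is assumed to compute - the superblock journal field is a small header followed by one __le64 per bucket, so the count falls out of the field's u64s:

	static unsigned nr_journal_buckets_sketch(struct bch_sb_field_journal *j)
	{
		/* field.u64s counts the header too; subtract it off */
		return j
			? (le32_to_cpu(j->field.u64s) -
			   sizeof(*j) / sizeof(u64))
			: 0;
	}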