author    Kent Overstreet <kmo@daterainc.com>  2015-02-11 17:55:21 -0800
committer Kent Overstreet <kmo@daterainc.com>  2015-02-12 23:44:10 -0800
commit    b9c284fce93e07f97d415d6c1aec8e4bf6b65f94
tree      5b2214ed9b4853fb29aa02d378718bc79890dd31
parent    dfb3374c0079851846acfff5384c455d9d4308e7
bcache: New btree node format
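
Move per-node metadata out of struct bset and into a new on-disk
header: the first block of a node now starts with struct btree_node
(csum, magic, min/max bounds and the node-wide bkey format), and every
subsequent block-aligned bset is preceded by a struct btree_node_entry
carrying its own csum. The bkey format thus becomes per-node state
(btree_keys.format) rather than being duplicated in each bset,
checksums are computed per entry, and the node buffer is now allocated
separately (btree->data) instead of inside bch_btree_keys_alloc().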
Change-Id: I48c8ee1ac9648c13411de92583fc18d43285c7d7
-rw-r--r--  drivers/md/bcache/bset.c     |  92
-rw-r--r--  drivers/md/bcache/bset.h     |  30
-rw-r--r--  drivers/md/bcache/btree.c    | 342
-rw-r--r--  drivers/md/bcache/btree.h    |  29
-rw-r--r--  drivers/md/bcache/debug.c    |   2
-rw-r--r--  drivers/md/bcache/extents.c  |  18
-rw-r--r--  drivers/md/bcache/gc.c       |   2
-rw-r--r--  include/uapi/linux/bcache.h  |  37
8 files changed, 314 insertions, 238 deletions
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 6f58fbf537d3..9be2f4c56d3b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -34,7 +34,7 @@ static bool keys_out_of_order(const struct bkey_format *f,
void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
{
- struct bkey_format *f = &b->set->data->format;
+ struct bkey_format *f = &b->format;
struct bkey_packed *_k, *_n;
struct bkey k, n;
char buf[80];
@@ -68,8 +68,7 @@ void bch_dump_bucket(struct btree_keys *b)
console_lock();
for (i = 0; i <= b->nsets; i++)
- bch_dump_bset(b, b->set[i].data,
- bset_sector_offset(b, b->set[i].data));
+ bch_dump_bset(b, b->set[i].data, i);
console_unlock();
}
@@ -186,11 +185,8 @@ void bch_btree_keys_free(struct btree_keys *b)
free_pages((unsigned long) t->tree,
get_order(bset_tree_bytes(b)));
- free_pages((unsigned long) t->data, b->page_order);
-
t->prev = NULL;
t->tree = NULL;
- t->data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_free);
@@ -198,14 +194,10 @@ int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
{
struct bset_tree *t = b->set;
- BUG_ON(t->data);
+ BUG_ON(t->tree || t->prev);
b->page_order = page_order;
- t->data = (void *) __get_free_pages(gfp, b->page_order);
- if (!t->data)
- goto err;
-
t->tree = bset_tree_bytes(b) < PAGE_SIZE
? kmalloc(bset_tree_bytes(b), gfp)
: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
@@ -228,8 +220,12 @@ EXPORT_SYMBOL(bch_btree_keys_alloc);
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
bool *expensive_debug_checks)
{
+ struct bkey_format_state s;
unsigned i;
+ bch_bkey_format_init(&s);
+ b->format = bch_bkey_format_done(&s);
+
b->ops = ops;
b->nsets = 0;
b->last_set_unwritten = 0;
@@ -239,16 +235,10 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
#ifdef CONFIG_BCACHE_DEBUG
b->expensive_debug_checks = expensive_debug_checks;
#endif
-
- /* XXX: shouldn't be needed */
- for (i = 0; i < MAX_BSETS; i++)
- b->set[i].size = 0;
- /*
- * Second loop starts at 1 because b->keys[0]->data is the memory we
- * allocated
- */
- for (i = 1; i < MAX_BSETS; i++)
+ for (i = 0; i < MAX_BSETS; i++) {
b->set[i].data = NULL;
+ b->set[i].size = 0;
+ }
}
EXPORT_SYMBOL(bch_btree_keys_init);
@@ -520,22 +510,21 @@ static void bch_bset_build_unwritten_tree(struct btree_keys *b)
}
}
-void bch_bset_init_next(struct btree_keys *b, struct bset *i)
+void bch_bset_init_first(struct btree_keys *b, struct bset *i)
{
+ b->set[0].data = i;
memset(i, 0, sizeof(*i));
+ get_random_bytes(&i->seq, sizeof(i->seq));
- if (i != b->set->data) {
- b->set[++b->nsets].data = i;
- i->seq = b->set->data->seq;
- i->format = b->set->data->format;
- } else {
- struct bkey_format_state s;
-
- bch_bkey_format_init(&s);
- b->set->data->format = bch_bkey_format_done(&s);
+ bch_bset_build_unwritten_tree(b);
+}
+EXPORT_SYMBOL(bch_bset_init_first);
- get_random_bytes(&i->seq, sizeof(uint64_t));
- }
+void bch_bset_init_next(struct btree_keys *b, struct bset *i)
+{
+ b->set[++b->nsets].data = i;
+ memset(i, 0, sizeof(*i));
+ i->seq = b->set->data->seq;
bch_bset_build_unwritten_tree(b);
}
@@ -590,7 +579,7 @@ retry:
for (j = inorder_next(0, t->size);
j;
j = inorder_next(j, t->size))
- make_bfloat(&b->set->data->format, t, j);
+ make_bfloat(&b->format, t, j);
}
EXPORT_SYMBOL(bch_bset_build_written_tree);
@@ -632,7 +621,7 @@ static void verify_insert_pos(struct btree_keys *b,
const struct bkey_i *insert)
{
#ifdef CONFIG_BCACHE_DEBUG
- const struct bkey_format *f = &b->set->data->format;
+ const struct bkey_format *f = &b->format;
struct bset_tree *t = bset_tree_last(b);
BUG_ON(prev &&
@@ -696,13 +685,13 @@ found_set:
if (k == t->data->start)
for (j = 1; j < t->size; j = j * 2)
- make_bfloat(&b->set->data->format, t, j);
+ make_bfloat(&b->format, t, j);
if (bkey_next(k) == bset_bkey_last(t->data)) {
t->end = *k;
for (j = 1; j < t->size; j = j * 2 + 1)
- make_bfloat(&b->set->data->format, t, j);
+ make_bfloat(&b->format, t, j);
}
j = inorder_to_tree(inorder, t);
@@ -711,11 +700,11 @@ found_set:
j < t->size &&
k == tree_to_bkey(t, j)) {
/* Fix the auxiliary search tree node this key corresponds to */
- make_bfloat(&b->set->data->format, t, j);
+ make_bfloat(&b->format, t, j);
/* Children for which this key is the right side boundary */
for (j = j * 2; j < t->size; j = j * 2 + 1)
- make_bfloat(&b->set->data->format, t, j);
+ make_bfloat(&b->format, t, j);
}
j = inorder_to_tree(inorder + 1, t);
@@ -723,11 +712,11 @@ found_set:
if (j &&
j < t->size &&
k == tree_to_prev_bkey(t, j)) {
- make_bfloat(&b->set->data->format, t, j);
+ make_bfloat(&b->format, t, j);
/* Children for which this key is the left side boundary */
for (j = j * 2 + 1; j < t->size; j = j * 2)
- make_bfloat(&b->set->data->format, t, j);
+ make_bfloat(&b->format, t, j);
}
}
EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
@@ -785,7 +774,7 @@ void bch_bset_insert(struct btree_keys *b,
struct btree_node_iter *iter,
struct bkey_i *insert)
{
- struct bkey_format *f = &b->set->data->format;
+ struct bkey_format *f = &b->format;
struct bset_tree *t = bset_tree_last(b);
struct bset *i = t->data;
struct bkey_packed *prev = NULL;
@@ -794,12 +783,12 @@ void bch_bset_insert(struct btree_keys *b,
struct bkey_packed packed, *src;
BKEY_PADDED(k) tmp;
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(b));
BUG_ON(b->ops->is_extents &&
(!insert->k.size || bkey_deleted(&insert->k)));
BUG_ON(!b->last_set_unwritten);
BUG_ON(where < i->start);
BUG_ON(where > bset_bkey_last(i));
+ bch_verify_btree_keys_accounting(b);
while (where != bset_bkey_last(i) &&
keys_out_of_order(f, bkey_to_packed(insert),
@@ -875,6 +864,7 @@ void bch_bset_insert(struct btree_keys *b,
bch_btree_node_iter_fix(iter, b, where);
bch_btree_node_iter_verify(iter, b);
+ bch_verify_btree_keys_accounting(b);
}
EXPORT_SYMBOL(bch_bset_insert);
@@ -986,7 +976,7 @@ static struct bkey_packed *bch_bset_search(struct btree_keys *b,
struct bpos search,
struct bkey_packed *packed_search)
{
- const struct bkey_format *f = &b->set->data->format;
+ const struct bkey_format *f = &b->format;
struct bkey_packed *m;
/*
@@ -1054,7 +1044,7 @@ static inline bool btree_node_iter_cmp(struct btree_node_iter *iter,
{
struct bkey_packed *l = __btree_node_offset_to_key(b, ls.k);
struct bkey_packed *r = __btree_node_offset_to_key(b, rs.k);
- s64 c = bkey_cmp_packed(&b->set->data->format, l, r);
+ s64 c = bkey_cmp_packed(&b->format, l, r);
/*
* For non extents, when keys compare equal the deleted keys have to
@@ -1110,7 +1100,7 @@ void bch_btree_node_iter_init(struct btree_node_iter *iter,
{
struct bset_tree *t;
struct bkey_packed p, *packed_search =
- bkey_pack_pos(&p, search, &b->set->data->format) ? &p : NULL;
+ bkey_pack_pos(&p, search, &b->format) ? &p : NULL;
__bch_btree_node_iter_init(iter, b, b->set);
@@ -1234,7 +1224,7 @@ static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
struct btree_keys *b,
struct bkey_packed *k)
{
- const struct bkey_format *f = &b->set->data->format;
+ const struct bkey_format *f = &b->format;
const struct bkey_packed *n = bch_btree_node_iter_peek_all(iter, b);
bkey_unpack_key(f, k);
@@ -1271,7 +1261,7 @@ bool bch_btree_node_iter_next_unpack(struct btree_node_iter *iter,
struct btree_keys *b,
struct bkey_tup *tup)
{
- struct bkey_format *f = &b->set->data->format;
+ struct bkey_format *f = &b->format;
struct bkey_packed *k = bch_btree_node_iter_next(iter, b);
if (!k)
@@ -1332,8 +1322,8 @@ static void btree_mergesort(struct btree_keys *dst,
struct btree_node_iter *iter,
ptr_filter_fn filter)
{
- struct bkey_format *in_f = &src->set->data->format;
- struct bkey_format *out_f = &dst->set->data->format;
+ struct bkey_format *in_f = &src->format;
+ struct bkey_format *out_f = &dst->format;
struct bkey_packed *k, *prev = NULL, *out = dst_set->start;
struct bkey_tup tup;
@@ -1422,8 +1412,6 @@ static void __btree_sort(struct btree_keys *b, struct btree_node_iter *iter,
start_time = local_clock();
- out->format = b->set->data->format;
-
/*
* If we're only doing a partial sort (start != 0), then we can't merge
* extents because that might produce extents that overlap with 0 size
@@ -1440,7 +1428,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_node_iter *iter,
b->nsets = start;
- if (!start && order == b->page_order) {
+ if (0 && !start && order == b->page_order) {
unsigned u64s = out->u64s;
/*
* Our temporary buffer is the same size as the btree node's
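
A minimal usage sketch of the split initializers above (caller shape
mirrors btree.c later in this patch; allocation, locking and error
handling elided):

	/*
	 * The first bset in a node gets a fresh random seq from
	 * bch_bset_init_first(); every later bset added with
	 * bch_bset_init_next() inherits set[0]'s seq, which is how the
	 * read path finds where a node's valid data ends.
	 */
	b->data->magic = bset_magic(&c->sb);
	bch_bset_init_first(&b->keys, &b->data->keys);	/* set 0 */

	/* ... after set 0 is written, start the next block-aligned set: */
	bch_bset_init_next(&b->keys, &write_block(b)->keys);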
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index f877f9ecb261..51a6bb6105a1 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -231,6 +231,8 @@ struct btree_keys {
u16 nr_packed_keys;
u16 nr_unpacked_keys;
+ struct bkey_format format;
+
/*
* Sets of sorted keys - the real btree node - plus a binary search tree
*
@@ -268,16 +270,6 @@ static inline bool bkey_written(struct btree_keys *b, struct bkey_packed *k)
return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
}
-static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
-{
- return ((size_t) i) - ((size_t) b->set->data);
-}
-
-static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
-{
- return bset_byte_offset(b, i) >> 9;
-}
-
#define __set_bytes(_i, _u64s) (sizeof(*(_i)) + (_u64s) * sizeof(u64))
#define set_bytes(_i) __set_bytes(_i, (_i)->u64s)
@@ -287,21 +279,6 @@ static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
#define set_blocks(_i, _block_bytes) \
__set_blocks((_i), (_i)->u64s, (_block_bytes))
-static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
-{
- struct bset_tree *t = bset_tree_last(b);
-
- BUG_ON((PAGE_SIZE << b->page_order) <
- (bset_byte_offset(b, t->data) + set_bytes(t->data)));
-
- if (!b->last_set_unwritten)
- return 0;
-
- return ((PAGE_SIZE << b->page_order) -
- (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
- sizeof(u64);
-}
-
static inline struct bset *bset_next_set(struct btree_keys *b,
unsigned block_bytes)
{
@@ -315,6 +292,7 @@ int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
bool *);
+void bch_bset_init_first(struct btree_keys *, struct bset *);
void bch_bset_init_next(struct btree_keys *, struct bset *);
void bch_bset_build_written_tree(struct btree_keys *);
void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey_packed *);
@@ -504,7 +482,7 @@ bch_btree_node_iter_peek_overlapping(struct btree_node_iter *iter,
struct btree_keys *b,
struct bkey *end)
{
- const struct bkey_format *f = &b->set->data->format;
+ const struct bkey_format *f = &b->format;
struct bkey_packed *ret;
struct bkey u;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 9cfb8d830df8..a79beb263ee0 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -190,9 +190,11 @@ bool bch_btree_iter_upgrade(struct btree_iter *iter)
return true;
}
-static inline struct bset *write_block(struct btree *b)
+static inline struct btree_node_entry *write_block(struct btree *b)
{
- return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+ BUG_ON(!b->written);
+
+ return (void *) b->data + (b->written << (b->c->block_bits + 9));
}
/* Returns true if we sorted (i.e. invalidated iterators) */
@@ -218,47 +220,109 @@ static void bch_btree_init_next(struct btree *b, struct btree_iter *iter)
if (nsets && !b->keys.nsets)
bch_btree_verify(b);
- if (b->written < btree_blocks(b->c)) {
- struct bset *i = write_block(b);
-
- bch_bset_init_next(&b->keys, i);
- i->magic = bset_magic(&b->c->sb);
- }
+ if (b->written < btree_blocks(b->c))
+ bch_bset_init_next(&b->keys, &write_block(b)->keys);
if (iter && sorted)
btree_iter_node_set(iter, b);
+
+ clear_btree_node_need_init_next(b);
}
/* Btree IO */
-static uint64_t btree_csum_set(struct btree *b, struct bset *i)
-{
- const struct bkey_i_extent *e = bkey_i_to_extent_c(&b->key);
- u64 crc = e->v.ptr[0]._val;
- void *data = (void *) i + 8, *end = bset_bkey_last(i);
-
- crc = bch_checksum_update(BSET_CSUM_TYPE(i), crc, data, end - data);
-
- return crc ^ 0xffffffffffffffffULL;
-}
+#define btree_csum_set(_b, _i) \
+({ \
+ void *_data = (void *) (_i) + 8; \
+ void *_end = bset_bkey_last(&(_i)->keys); \
+ \
+ bch_checksum_update(BSET_CSUM_TYPE(&(_i)->keys), \
+ bkey_i_to_extent_c(&(_b)->key)->v.ptr[0]._val,\
+ _data, \
+ _end - _data) ^ 0xffffffffffffffffULL; \
+})
#define btree_node_error(b, ca, ptr, fmt, ...) \
bch_cache_error(ca, \
"btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
(b)->btree_id, (b)->level, btree_node_root(b) \
? btree_node_root(b)->level : -1, \
- PTR_BUCKET_NR(ca, ptr), bset_block_offset(b, i), \
- i->u64s, ##__VA_ARGS__)
+ PTR_BUCKET_NR(ca, ptr), (b)->written, \
+ (i)->u64s, ##__VA_ARGS__)
+
+static const char *validate_bset(struct btree *b, struct cache *ca,
+ const struct bch_extent_ptr *ptr,
+ struct bset *i, unsigned blocks)
+{
+ struct bkey_format *f = &b->keys.format;
+ struct cache_set *c = b->c;
+ struct bkey_packed *k;
+
+ if (i->version != BCACHE_BSET_VERSION)
+ return "unsupported bset version";
+
+ if (b->written + blocks > btree_blocks(c))
+ return "bset past end of btree node";
+
+ if (i != &b->data->keys && !i->u64s)
+ btree_node_error(b, ca, ptr, "empty set");
+
+ for (k = i->start;
+ k != bset_bkey_last(i);) {
+ struct bkey_tup tup;
+
+ if (!k->u64s) {
+ btree_node_error(b, ca, ptr,
+ "KEY_U64s 0: %zu bytes of metadata lost",
+ (void *) bset_bkey_last(i) - (void *) k);
+
+ i->u64s = (u64 *) k - i->_data;
+ break;
+ }
+
+ if (bkey_next(k) > bset_bkey_last(i)) {
+ btree_node_error(b, ca, ptr,
+ "key extends past end of bset");
+
+ i->u64s = (u64 *) k - i->_data;
+ break;
+ }
+
+ bkey_disassemble(&tup, f, k);
+
+ if (bkey_invalid(c, b->level
+ ? BKEY_TYPE_BTREE
+ : b->btree_id,
+ bkey_tup_to_s_c(&tup))) {
+ char buf[160];
+
+ bkey_disassemble(&tup, f, k);
+ bch_bkey_val_to_text(b, buf, sizeof(buf),
+ bkey_tup_to_s_c(&tup));
+ btree_node_error(b, ca, ptr,
+ "invalid bkey %s", buf);
+
+ i->u64s -= k->u64s;
+ memmove(k, bkey_next(k),
+ (void *) bset_bkey_last(i) - (void *) k);
+ continue;
+ }
+
+ k = bkey_next(k);
+ }
+
+ b->written += blocks;
+ return NULL;
+}
void bch_btree_node_read_done(struct btree *b, struct cache *ca,
const struct bch_extent_ptr *ptr)
{
struct cache_set *c = b->c;
- const struct bkey_format *f = &b->keys.set->data->format;
- const char *err;
- struct bset *i = btree_bset_first(b);
+ struct btree_node_entry *bne;
+ struct bset *i = &b->data->keys;
struct btree_node_iter *iter;
- struct bkey_packed *k;
+ const char *err;
iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
iter->used = 0;
@@ -268,91 +332,67 @@ void bch_btree_node_read_done(struct btree *b, struct cache *ca,
if (bch_meta_read_fault("btree"))
goto err;
- err = "bad btree header";
- if (!i->seq)
+ err = "bad magic";
+ if (b->data->magic != bset_magic(&c->sb))
goto err;
- for (;
- b->written < btree_blocks(c) && i->seq == b->keys.set[0].data->seq;
- i = write_block(b)) {
- b->written += set_blocks(i, block_bytes(c));
-
- err = "unsupported bset version";
- if (i->version != BCACHE_BSET_VERSION)
- goto err;
-
- err = "bad magic";
- if (i->magic != bset_magic(&c->sb))
- goto err;
-
- err = "unknown checksum type";
- if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
- goto err;
-
- err = "bad btree header";
- if (b->written > btree_blocks(c))
- goto err;
+ err = "bad btree header";
+ if (!b->data->keys.seq)
+ goto err;
- err = "bad checksum";
- if (i->csum != btree_csum_set(b, i))
- goto err;
+ b->keys.format = b->data->format;
+ i = b->keys.set->data = &b->data->keys;
- if (i != b->keys.set[0].data && !i->u64s)
- btree_node_error(b, ca, ptr, "empty set");
+ while (b->written < btree_blocks(c)) {
+ unsigned blocks;
- for (k = i->start;
- k != bset_bkey_last(i);) {
- struct bkey_tup tup;
+ if (!b->written) {
+ i = &b->data->keys;
- if (!k->u64s) {
- btree_node_error(b, ca, ptr,
- "KEY_U64s 0: %zu bytes of metadata lost",
- (void *) bset_bkey_last(i) - (void *) k);
+ err = "unknown checksum type";
+ if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+ goto err;
- i->u64s = (u64 *) k - i->_data;
- break;
- }
+ err = "bad checksum";
+ if (b->data->csum != btree_csum_set(b, b->data))
+ goto err;
- if (bkey_next(k) > bset_bkey_last(i)) {
- btree_node_error(b, ca, ptr,
- "key extends past end of bset");
+ blocks = __set_blocks(b->data,
+ b->data->keys.u64s,
+ block_bytes(c));
+ } else {
+ bne = write_block(b);
+ i = &bne->keys;
- i->u64s = (u64 *) k - i->_data;
+ if (i->seq != b->data->keys.seq)
break;
- }
-
- bkey_disassemble(&tup, f, k);
-
- if (bkey_invalid(c, b->level
- ? BKEY_TYPE_BTREE
- : b->btree_id,
- bkey_tup_to_s_c(&tup))) {
- char buf[160];
- bkey_disassemble(&tup, f, k);
- bch_bkey_val_to_text(b, buf, sizeof(buf),
- bkey_tup_to_s_c(&tup));
- btree_node_error(b, ca, ptr,
- "invalid bkey %s", buf);
+ err = "unknown checksum type";
+ if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+ goto err;
- i->u64s -= k->u64s;
- memmove(k, bkey_next(k),
- (void *) bset_bkey_last(i) - (void *) k);
- continue;
- }
+ err = "bad checksum";
+ if (bne->csum != btree_csum_set(b, bne))
+ goto err;
- k = bkey_next(k);
+ blocks = __set_blocks(bne,
+ bne->keys.u64s,
+ block_bytes(c));
}
+ err = validate_bset(b, ca, ptr, i, blocks);
+ if (err)
+ goto err;
+
bch_btree_node_iter_push(iter, &b->keys,
i->start, bset_bkey_last(i));
}
err = "corrupted btree";
- for (i = write_block(b);
- bset_sector_offset(&b->keys, i) < btree_sectors(c);
- i = ((void *) i) + block_bytes(c))
- if (i->seq == b->keys.set[0].data->seq)
+ for (bne = write_block(b);
+ bset_byte_offset(b, bne) < btree_bytes(c);
+ bne = (void *) bne + block_bytes(c))
+ if (bne->keys.seq == b->data->keys.seq)
goto err;
bch_btree_sort_and_fix_extents(&b->keys, iter,
@@ -361,11 +401,10 @@ void bch_btree_node_read_done(struct btree *b, struct cache *ca,
: bch_key_sort_fix_overlapping,
&c->sort);
- i = b->keys.set[0].data;
err = "short btree key";
if (b->keys.set[0].size &&
- bkey_cmp_packed(&b->keys.set->data->format,
- &b->key.k, &b->keys.set[0].end) < 0)
+ bkey_cmp_packed(&b->keys.format, &b->key.k,
+ &b->keys.set[0].end) < 0)
goto err;
out:
@@ -406,7 +445,7 @@ static void bch_btree_node_read(struct btree *b)
bio->bio.bi_end_io = btree_node_read_endio;
bio->bio.bi_private = &cl;
- bch_bio_map(&bio->bio, b->keys.set[0].data);
+ bch_bio_map(&bio->bio, b->data);
bio_get(&bio->bio);
bch_submit_bbio(bio, ca, &b->key, ptr, true);
@@ -500,25 +539,45 @@ static void do_btree_node_write(struct closure *cl)
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct cache *ca;
- size_t blocks_to_write = set_blocks(i, block_bytes(b->c));
+ size_t blocks_to_write;
+ void *data;
trace_bcache_btree_write(b);
BUG_ON(b->written >= btree_blocks(b->c));
- BUG_ON(b->written + blocks_to_write > btree_blocks(b->c));
BUG_ON(b->written && !i->u64s);
BUG_ON(btree_bset_first(b)->seq != i->seq);
cancel_delayed_work(&b->work);
change_bit(BTREE_NODE_write_idx, &b->flags);
-
- b->written += blocks_to_write;
+ set_btree_node_need_init_next(b);
i->version = BCACHE_BSET_VERSION;
SET_BSET_CSUM_TYPE(i, CACHE_PREFERRED_CSUM_TYPE(&b->c->sb));
- i->csum = btree_csum_set(b, i);
+
+ if (!b->written) {
+ BUG_ON(b->data->magic != bset_magic(&b->c->sb));
+
+ b->data->format = b->keys.format;
+ data = b->data;
+ b->data->csum = btree_csum_set(b, b->data);
+ blocks_to_write = __set_blocks(b->data,
+ b->data->keys.u64s,
+ block_bytes(b->c));
+
+ } else {
+ struct btree_node_entry *bne = write_block(b);
+
+ data = bne;
+ bne->csum = btree_csum_set(b, bne);
+ blocks_to_write = __set_blocks(bne,
+ bne->keys.u64s,
+ block_bytes(b->c));
+ }
+
+ BUG_ON(b->written + blocks_to_write > btree_blocks(b->c));
BUG_ON(b->bio);
b->bio = bch_bbio_alloc(b->c);
@@ -530,8 +589,8 @@ static void do_btree_node_write(struct closure *cl)
b->bio->bi_end_io = btree_node_write_endio;
b->bio->bi_private = cl;
b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
- b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
- bch_bio_map(b->bio, i);
+ b->bio->bi_iter.bi_size = blocks_to_write << (b->c->block_bits + 9);
+ bch_bio_map(b->bio, data);
/*
* If we're appending to a leaf node, we don't technically need FUA -
@@ -553,22 +612,24 @@ static void do_btree_node_write(struct closure *cl)
extent_for_each_ptr(e, ptr)
SET_PTR_OFFSET(ptr, PTR_OFFSET(ptr) +
- bset_sector_offset(&b->keys, i));
+ (b->written << b->c->block_bits));
rcu_read_lock();
extent_for_each_online_device(b->c, e, ptr, ca)
- atomic_long_add(blocks_to_write * b->c->sb.block_size,
+ atomic_long_add(blocks_to_write << b->c->block_bits,
&ca->btree_sectors_written);
rcu_read_unlock();
+ b->written += blocks_to_write;
+
if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
int j;
struct bio_vec *bv;
- void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+ void *base = (void *) ((unsigned long) data & ~(PAGE_SIZE - 1));
bio_for_each_segment_all(bv, b->bio, j)
memcpy(page_address(bv->bv_page),
- base + j * PAGE_SIZE, PAGE_SIZE);
+ base + (j << PAGE_SHIFT), PAGE_SIZE);
bch_submit_bbio_replicas(b->bio, b->c, &k.key, 0, true);
continue_at(cl, btree_node_write_done, NULL);
@@ -576,7 +637,7 @@ static void do_btree_node_write(struct closure *cl)
trace_bcache_btree_bounce_write_fail(b);
b->bio->bi_vcnt = 0;
- bch_bio_map(b->bio, i);
+ bch_bio_map(b->bio, data);
bch_submit_bbio_replicas(b->bio, b->c, &k.key, 0, true);
@@ -720,6 +781,8 @@ static void mca_data_free(struct btree *b)
{
BUG_ON(b->io_mutex.count != 1);
+ free_pages((unsigned long) b->data, b->keys.page_order);
+ b->data = NULL;
bch_btree_keys_free(&b->keys);
b->c->btree_cache_used--;
@@ -737,12 +800,22 @@ static void mca_bucket_free(struct btree *b)
static void mca_data_alloc(struct btree *b, gfp_t gfp)
{
- if (!bch_btree_keys_alloc(&b->keys, ilog2(b->c->btree_pages), gfp)) {
- b->c->btree_cache_used++;
- list_move(&b->list, &b->c->btree_cache);
- } else {
- list_move(&b->list, &b->c->btree_cache_freed);
- }
+ unsigned order = ilog2(b->c->btree_pages);
+
+ b->data = (void *) __get_free_pages(gfp, order);
+ if (!b->data)
+ goto err;
+
+ if (bch_btree_keys_alloc(&b->keys, order, gfp))
+ goto err;
+
+ b->c->btree_cache_used++;
+ list_move(&b->list, &b->c->btree_cache_freeable);
+ return;
+err:
+ free_pages((unsigned long) b->data, order);
+ b->data = NULL;
+ list_move(&b->list, &b->c->btree_cache_freed);
}
static struct btree *mca_bucket_alloc(struct cache_set *c, gfp_t gfp)
@@ -758,7 +831,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, gfp_t gfp)
sema_init(&b->io_mutex, 1);
mca_data_alloc(b, gfp);
- return b;
+ return b->data ? b : NULL;
}
/*
@@ -948,7 +1021,7 @@ void bch_btree_cache_free(struct cache_set *c)
if (btree_node_dirty(b))
btree_complete_write(b, btree_current_write(b));
- clear_bit(BTREE_NODE_dirty, &b->flags);
+ clear_btree_node_dirty(b);
mca_data_free(b);
}
@@ -1128,7 +1201,7 @@ static struct btree *mca_alloc(struct cache_set *c, const struct bkey_i *k,
list_for_each_entry(b, &c->btree_cache_freed, list)
if (!mca_reap_notrace(b, false)) {
mca_data_alloc(b, __GFP_NOWARN|GFP_NOIO);
- if (!b->keys.set[0].data)
+ if (!b->data)
goto err;
else
goto out;
@@ -1140,8 +1213,6 @@ static struct btree *mca_alloc(struct cache_set *c, const struct bkey_i *k,
BUG_ON(!six_trylock_intent(&b->lock));
BUG_ON(!six_trylock_write(&b->lock));
- if (!b->keys.set->data)
- goto err;
out:
BUG_ON(b->io_mutex.count != 1);
@@ -1305,7 +1376,7 @@ void btree_node_free(struct btree *b)
if (btree_node_dirty(b))
btree_complete_write(b, btree_current_write(b));
- clear_bit(BTREE_NODE_dirty, &b->flags);
+ clear_btree_node_dirty(b);
cancel_delayed_work(&b->work);
@@ -1396,8 +1467,8 @@ static struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
bch_check_mark_super(c, &b->key, true);
b->accessed = 1;
- bch_bset_init_next(&b->keys, b->keys.set->data);
- b->keys.set->data->magic = bset_magic(&c->sb);
+ b->data->magic = bset_magic(&c->sb);
+ bch_bset_init_first(&b->keys, &b->data->keys);
set_btree_node_dirty(b);
trace_bcache_btree_node_alloc(b);
@@ -1410,8 +1481,10 @@ struct btree *__btree_node_alloc_replacement(struct btree *b,
{
struct btree *n;
+ bch_verify_btree_keys_accounting(&b->keys);
+
n = bch_btree_node_alloc(b->c, b->level, b->btree_id, reserve);
- n->keys.set->data->format = format;
+ n->keys.format = format;
bch_btree_sort_into(&n->keys, &b->keys,
b->keys.ops->key_normalize,
@@ -1470,7 +1543,7 @@ struct btree *btree_node_alloc_replacement(struct btree *b,
struct bkey_format new_f = bch_btree_calc_format(b);
if (!btree_node_format_fits(b, &new_f))
- new_f = b->keys.set->data->format;
+ new_f = b->keys.format;
return __btree_node_alloc_replacement(b, reserve, new_f);
}
@@ -1685,7 +1758,6 @@ static bool btree_insert_key(struct btree_iter *iter, struct btree *b,
bool do_insert;
BUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- BUG_ON(write_block(b) != btree_bset_last(b));
BUG_ON(!b->level &&
bkey_cmp(bkey_start_pos(&insert->k), iter->pos) < 0);
bch_btree_node_iter_verify(node_iter, &b->keys);
@@ -1742,7 +1814,7 @@ static bool have_enough_space(struct btree *b, struct keylist *insert_keys)
? BKEY_EXTENT_MAX_U64s * 3
: bch_keylist_front(insert_keys)->k.u64s;
- return u64s <= bch_btree_keys_u64s_remaining(&b->keys);
+ return u64s <= bch_btree_keys_u64s_remaining(b);
}
static void verify_keys_sorted(struct keylist *l)
@@ -1805,8 +1877,7 @@ bch_btree_insert_keys(struct btree *b,
six_lock_write(&b->lock);
/* just wrote a set? */
- if (write_block(b) != btree_bset_last(b) &&
- b->keys.last_set_unwritten)
+ if (btree_node_need_init_next(b))
bch_btree_init_next(b, iter);
while (!bch_keylist_empty(insert_keys)) {
@@ -1850,7 +1921,10 @@ bch_btree_insert_keys(struct btree *b,
if (b->level)
bch_btree_node_write_sync(b, iter);
else {
- unsigned long bytes = set_bytes(btree_bset_last(b));
+ struct btree_node_entry *bne =
+ container_of(btree_bset_last(b),
+ struct btree_node_entry, keys);
+ unsigned long bytes = __set_bytes(bne, bne->keys.u64s);
if (b->io_mutex.count > 0 &&
((max(roundup(bytes, block_bytes(iter->c)),
@@ -1902,6 +1976,7 @@ static int btree_split(struct btree *b,
}
n1 = btree_node_alloc_replacement(b, reserve);
+ bch_verify_btree_keys_accounting(&n1->keys);
set1 = btree_bset_first(n1);
/*
@@ -1953,6 +2028,7 @@ static int btree_split(struct btree *b,
} else
k = bkey_next(k);
}
+ bch_verify_btree_keys_accounting(&n1->keys);
/*
* Note that on recursive parent_keys == insert_keys, so we can't start
@@ -1960,15 +2036,17 @@ static int btree_split(struct btree *b,
* insert, which we just did above)
*/
- if (set_blocks(set1, block_bytes(n1->c)) > btree_blocks(iter->c) * 3 / 4) {
+ if (__set_blocks(n1->data,
+ n1->data->keys.u64s,
+ block_bytes(n1->c)) > btree_blocks(iter->c) * 3 / 4) {
size_t nr_packed = 0, nr_unpacked = 0;
trace_bcache_btree_node_split(b, set1->u64s);
n2 = bch_btree_node_alloc(iter->c, b->level,
iter->btree_id, reserve);
+ n2->keys.format = n1->keys.format;
set2 = btree_bset_first(n2);
- set2->format = set1->format;
if (!parent) {
n3 = bch_btree_node_alloc(iter->c, b->level + 1,
@@ -1993,7 +2071,7 @@ static int btree_split(struct btree *b,
k = bkey_next(k);
}
- n1->key.k.p = bkey_unpack_key(&set1->format, k).p;
+ n1->key.k.p = bkey_unpack_key(&n1->keys.format, k).p;
k = bkey_next(k);
set2->u64s = (u64 *) bset_bkey_last(set1) - (u64 *) k;
@@ -2326,8 +2404,7 @@ int bch_btree_iter_unlock(struct btree_iter *iter)
/* peek_all() doesn't skip deleted keys */
static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter)
{
- const struct bkey_format *f =
- &iter->nodes[iter->level]->keys.set->data->format;
+ const struct bkey_format *f = &iter->nodes[iter->level]->keys.format;
struct bkey_packed *k =
bch_btree_node_iter_peek_all(&iter->node_iters[iter->level],
&iter->nodes[iter->level]->keys);
@@ -2347,8 +2424,7 @@ static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter)
static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter)
{
- const struct bkey_format *f =
- &iter->nodes[iter->level]->keys.set->data->format;
+ const struct bkey_format *f = &iter->nodes[iter->level]->keys.format;
struct bkey_packed *k =
bch_btree_node_iter_peek(&iter->node_iters[iter->level],
&iter->nodes[iter->level]->keys);
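
In outline, the reworked read path above validates the node header
once, then checksums each set independently; a condensed sketch of the
new bch_btree_node_read_done() flow (not verbatim, error paths elided):

	if (b->data->magic != bset_magic(&c->sb) || !b->data->keys.seq)
		goto err;

	b->keys.format = b->data->format;	/* one format per node */

	while (b->written < btree_blocks(c)) {
		if (!b->written) {
			/* first set: csum lives in struct btree_node */
			i = &b->data->keys;
		} else {
			/* later sets: each entry carries its own csum */
			bne = write_block(b);
			i = &bne->keys;
			if (i->seq != b->data->keys.seq)
				break;	/* seq mismatch: end of node */
		}
		/* verify csum, then validate_bset() bumps b->written */
	}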
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index a11f9741cf3f..608d947de373 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -110,6 +110,7 @@ struct btree {
uint8_t btree_id;
struct btree_keys keys;
+ struct btree_node *data;
/* For outstanding btree writes, used as a lock - protects write_idx */
struct closure io;
@@ -128,16 +129,21 @@ static inline bool btree_node_ ## flag(struct btree *b) \
\
static inline void set_btree_node_ ## flag(struct btree *b) \
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void clear_btree_node_ ## flag(struct btree *b) \
+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
enum btree_flags {
BTREE_NODE_io_error,
BTREE_NODE_dirty,
BTREE_NODE_write_idx,
+ BTREE_NODE_need_init_next,
};
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
+BTREE_FLAG(need_init_next);
static inline struct btree_write *btree_current_write(struct btree *b)
{
@@ -159,9 +165,24 @@ static inline struct bset *btree_bset_last(struct btree *b)
return bset_tree_last(&b->keys)->data;
}
-static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
+{
+ return i - (void *) b->data;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct btree *b)
{
- return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
+ struct bset *i = btree_bset_last(b);
+
+ BUG_ON((PAGE_SIZE << b->keys.page_order) <
+ (bset_byte_offset(b, i) + set_bytes(i)));
+
+ if (!b->keys.last_set_unwritten)
+ return 0;
+
+ return ((PAGE_SIZE << b->keys.page_order) -
+ (bset_byte_offset(b, i) + set_bytes(i))) /
+ sizeof(u64);
}
static inline size_t btree_bytes(struct cache_set *c)
@@ -171,7 +192,7 @@ static inline size_t btree_bytes(struct cache_set *c)
static inline unsigned btree_sectors(struct cache_set *c)
{
- return c->btree_pages * PAGE_SECTORS;
+ return c->btree_pages << (PAGE_SHIFT - 9);
}
static inline unsigned btree_blocks(struct cache_set *c)
@@ -318,7 +339,7 @@ void bch_btree_write_oldest(struct cache_set *, u64);
static inline bool btree_node_format_fits(struct btree *b,
struct bkey_format *new_f)
{
- struct bkey_format *old_f = &b->keys.set->data->format;
+ struct bkey_format *old_f = &b->keys.format;
/* stupid integer promotion rules */
ssize_t new_u64s =
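
As a worked example of the relocated bch_btree_keys_u64s_remaining()
(values assumed for illustration): with an 8 KiB node, i.e.
PAGE_SIZE << page_order = 8192, and the last unwritten bset starting at
byte offset 4096 holding 100 u64s of keys, the new struct bset packs to
24 bytes, so set_bytes(i) = 24 + 100 * 8 = 824 and the helper returns
(8192 - (4096 + 824)) / 8 = 409 u64s of remaining space.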
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index ea95be322141..5b2919e08888 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -296,7 +296,7 @@ static ssize_t bch_read_btree_formats(struct file *file, char __user *buf,
return i->ret;
for_each_btree_node(&iter, i->c, i->id, b, i->from) {
- const struct bkey_format *f = &b->keys.set->data->format;
+ const struct bkey_format *f = &b->keys.format;
struct bset_stats stats = { 0 };
bch_btree_keys_stats(&b->keys, &stats);
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 598a24e4d343..d03d7159ff4a 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -46,7 +46,7 @@ static void sort_key_next(struct btree_node_iter *iter,
*/
#define key_sort_cmp(l, r) \
({ \
- int _c = bkey_cmp_packed(&b->set->data->format, \
+ int _c = bkey_cmp_packed(&b->format, \
__btree_node_offset_to_key(b, (l).k), \
__btree_node_offset_to_key(b, (r).k)); \
\
@@ -56,7 +56,7 @@ static void sort_key_next(struct btree_node_iter *iter,
static inline bool should_drop_next_key(struct btree_node_iter *iter,
struct btree_keys *b)
{
- const struct bkey_format *f = &b->set->data->format;
+ const struct bkey_format *f = &b->format;
struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
if (bkey_deleted(__btree_node_offset_to_key(b, l->k)))
@@ -124,7 +124,7 @@ bool bch_insert_fixup_key(struct btree *b, struct bkey_i *insert,
struct bpos *done,
struct journal_res *res)
{
- const struct bkey_format *f = &b->keys.set->data->format;
+ const struct bkey_format *f = &b->keys.format;
struct bkey_packed *k;
int c;
@@ -187,7 +187,7 @@ static bool should_drop_ptr(const struct cache_set *c,
unsigned bch_extent_nr_ptrs_after_normalize(const struct btree *b,
const struct bkey_packed *k)
{
- const struct bkey_format *f = &b->keys.set->data->format;
+ const struct bkey_format *f = &b->keys.format;
const struct bch_extent *e;
unsigned ret = 0, ptr;
@@ -622,7 +622,7 @@ static void extent_save(struct bkey_packed *dst, struct bkey *src,
*/
#define extent_sort_cmp(l, r) \
({ \
- const struct bkey_format *_f = &b->set->data->format; \
+ const struct bkey_format *_f = &b->format; \
struct bkey _ul = bkey_unpack_key(_f, \
__btree_node_offset_to_key(b, (l).k)); \
struct bkey _ur = bkey_unpack_key(_f, \
@@ -680,7 +680,7 @@ void bch_extent_sort_fix_overlapping(struct btree_keys *b,
struct bset *bset,
struct btree_node_iter *iter)
{
- struct bkey_format *f = &b->set->data->format;
+ struct bkey_format *f = &b->format;
struct btree_node_iter_set *_l = iter->data, *_r;
struct bkey_packed *prev = NULL, *out = bset->start, *lk, *rk;
struct bkey_tup l, r;
@@ -1169,7 +1169,7 @@ bool bch_insert_fixup_extent(struct btree *b, struct bkey_i *insert,
struct bpos *done,
struct journal_res *res)
{
- const struct bkey_format *f = &b->keys.set->data->format;
+ const struct bkey_format *f = &b->keys.format;
struct bpos orig_insert = insert->k.p;
struct bkey_packed *_k;
struct bkey_tup tup;
@@ -1236,7 +1236,7 @@ bool bch_insert_fixup_extent(struct btree *b, struct bkey_i *insert,
* iteration of this loop will insert one key, so we need
* room for three keys.
*/
- needs_split = (bch_btree_keys_u64s_remaining(&b->keys) <
+ needs_split = (bch_btree_keys_u64s_remaining(b) <
BKEY_EXTENT_MAX_U64s * 3);
res_full = journal_res_full(res, &insert->k);
@@ -1731,7 +1731,7 @@ static bool bch_extent_merge_inline(struct btree_keys *b,
struct bkey_packed *l,
struct bkey_packed *r)
{
- const struct bkey_format *f = &b->set->data->format;
+ const struct bkey_format *f = &b->format;
struct bset_tree *t;
struct bkey_packed *k, *m;
struct bkey uk;
diff --git a/drivers/md/bcache/gc.c b/drivers/md/bcache/gc.c
index b252b91eb2fd..2f52d0b5ef78 100644
--- a/drivers/md/bcache/gc.c
+++ b/drivers/md/bcache/gc.c
@@ -92,7 +92,7 @@ static inline bool btree_node_has_ptrs(struct btree *b)
bool btree_gc_mark_node(struct cache_set *c, struct btree *b,
struct gc_stat *stat)
{
- struct bkey_format *f = &b->keys.set->data->format;
+ struct bkey_format *f = &b->keys.format;
struct bset_tree *t;
for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index d66a5b91209e..951cf33b505e 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -617,7 +617,7 @@ static inline __u64 bset_magic(struct cache_sb *sb)
enum btree_id {
DEFINE_BCH_BTREE_IDS()
- BTREE_ID_NR = 5,
+ BTREE_ID_NR
};
#undef DEF_BTREE_ID
@@ -697,28 +697,41 @@ BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4);
* sorted
*/
struct bset {
- __u64 csum;
- __u64 magic;
- __u32 version;
- __u32 flags;
-
__u64 seq;
__u64 journal_seq;
- __u32 pad;
- __u32 u64s; /* count of d[] in u64s */
-
- /* NOTE: all bsets in the same btree node must have the same format */
- struct bkey_format format;
+ __u32 flags;
+ __u16 version;
+ __u16 u64s; /* count of d[] in u64s */
union {
struct bkey_packed start[0];
__u64 _data[0];
};
-};
+} __attribute((packed));
BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
+/* Only used in first bset */
+BITMASK(BSET_BTREE_LEVEL, struct bset, flags, 4, 8);
+
+struct btree_node {
+ __u64 csum;
+ __u64 magic;
+
+ /* Closed interval: */
+ struct bpos min_key;
+ struct bpos max_key;
+ struct bkey_format format;
+
+ struct bset keys;
+} __attribute((packed));
+
+struct btree_node_entry {
+ __u64 csum;
+ struct bset keys;
+} __attribute((packed));
+
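To make the layout concrete, a standalone sketch (not part of the
patch) of walking a node's sets using these structs; it assumes a
power-of-two block size and a buffer that has already been read and
checksummed:

	#include <stddef.h>
	#include <stdint.h>

	/* Round x up to a power-of-two block size. */
	static inline size_t block_round_up(size_t x, size_t block_size)
	{
		return (x + block_size - 1) & ~(block_size - 1);
	}

	/*
	 * On disk, a node is one struct btree_node at offset 0 followed
	 * by block-aligned struct btree_node_entry headers; entries
	 * belong to the node for as long as their bset's seq matches
	 * the first bset's seq.
	 */
	static void walk_node(struct btree_node *bn, size_t node_bytes,
			      size_t block_size)
	{
		size_t offset = block_round_up(sizeof(*bn) +
					       bn->keys.u64s * sizeof(uint64_t),
					       block_size);

		/* process bn->keys (set 0) here ... */

		while (offset + sizeof(struct btree_node_entry) <= node_bytes) {
			struct btree_node_entry *bne = (void *) bn + offset;

			if (bne->keys.seq != bn->keys.seq)
				break;	/* past the last valid set */

			/* process bne->keys here ... */
			offset += block_round_up(sizeof(*bne) +
						 bne->keys.u64s * sizeof(uint64_t),
						 block_size);
		}
	}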
/* OBSOLETE */
struct bkey_v0 {