summaryrefslogtreecommitdiff
path: root/drivers/md/bcache/btree.h
blob: 94808dc5d23f646c140219f2fde5506e723741a0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
#ifndef _BCACHE_BTREE_H
#define _BCACHE_BTREE_H

/*
 * THE BTREE:
 *
 * At a high level, bcache's btree is relatively standard b+ tree. All keys and
 * pointers are in the leaves; interior nodes only have pointers to the child
 * nodes.
 *
 * In the interior nodes, a struct bkey always points to a child btree node, and
 * the key is the highest key in the child node - except that the highest key in
 * an interior node is always MAX_KEY. The size field refers to the size on disk
 * of the child node - this would allow us to have variable sized btree nodes
 * (handy for keeping the depth of the btree 1 by expanding just the root).
 *
 * Btree nodes are themselves log structured, but this is hidden fairly
 * thoroughly. Btree nodes on disk will in practice have extents that overlap
 * (because they were written at different times), but in memory we never have
 * overlapping extents - when we read in a btree node from disk, the first thing
 * we do is resort all the sets of keys with a mergesort, and in the same pass
 * we check for overlapping extents and adjust them appropriately.
 *
 * BTREE CACHE:
 *
 * Btree nodes are cached in memory; traversing the btree might require reading
 * in btree nodes which is handled mostly transparently.
 *
 * bch_btree_node_get() looks up a btree node in the cache and reads it in from
 * disk if necessary. This function is almost never called directly though - the
 * btree() macro is used to get a btree node, call some function on it, and
 * unlock the node after the function returns.
 *
 * The root is special cased - it's taken out of the cache's lru (thus pinning
 * it in memory), so we can find the root of the btree by just dereferencing a
 * pointer instead of looking it up in the cache. This makes locking a bit
 * tricky, since the root pointer is protected by the lock in the btree node it
 * points to - the btree_root() macro handles this.
 *
 * In various places we must be able to allocate memory for multiple btree nodes
 * in order to make forward progress. To do this we use the btree cache itself
 * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
 * cache we can reuse. We can't allow more than one thread to be doing this at a
 * time, so there's a lock, implemented by a pointer to the btree_op closure -
 * this allows the btree_root() macro to implicitly release this lock.
 *
 * BTREE IO:
 *
 * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
 * this.
 *
 * For writing, we have two btree_write structs embeddded in struct btree - one
 * write in flight, and one being set up, and we toggle between them.
 *
 * LOCKING:
 *
 * When traversing the btree, we may need write locks starting at some level -
 * inserting a key into the btree will typically only require a write lock on
 * the leaf node.
 *
 * This is specified with the lock field in struct btree_op; lock = 0 means we
 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
 * checks this field and returns the node with the appropriate lock held.
 *
 * If, after traversing the btree, the insertion code discovers it has to split
 * then it must restart from the root and take new locks - to do this it changes
 * the lock field and returns -EINTR, which causes the btree_root() macro to
 * loop.
 *
 * Handling cache misses require a different mechanism for upgrading to a write
 * lock. We do cache lookups with only a read lock held, but if we get a cache
 * miss and we wish to insert this data into the cache, we have to insert a
 * placeholder key to detect races - otherwise, we could race with a write and
 * overwrite the data that was just written to the cache with stale data from
 * the backing device.
 *
 * For this we use a sequence number that write locks and unlocks increment - to
 * insert the check key it unlocks the btree node and then takes a write lock,
 * and fails if the sequence number doesn't match.
 */

#include "bset.h"
#include "debug.h"
#include "six.h"
#include "journal_types.h"

extern const char *bch_btree_id_names[BTREE_ID_NR];

struct btree_write {
	atomic_t		*journal;
};

struct btree {
	/* Hottest entries first */
	struct hlist_node	hash;

	/* Key/pointer for this btree node */
	BKEY_PADDED(key);

	/* Single bit - set when accessed, cleared by shrinker */
	unsigned long		accessed;

	struct six_lock		lock;

	struct cache_set	*c;

	unsigned long		flags;
	uint16_t		written;	/* would be nice to kill */
	uint8_t			level;
	uint8_t			btree_id;

	struct btree_keys	keys;

	/* For outstanding btree writes, used as a lock - protects write_idx */
	struct closure		io;
	struct semaphore	io_mutex;

	struct list_head	list;
	struct delayed_work	work;

	struct btree_write	writes[2];
	struct bio		*bio;
};

#define BTREE_FLAG(flag)						\
static inline bool btree_node_ ## flag(struct btree *b)			\
{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\

enum btree_flags {
	BTREE_NODE_io_error,
	BTREE_NODE_dirty,
	BTREE_NODE_write_idx,
};

BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);

static inline struct btree_write *btree_current_write(struct btree *b)
{
	return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
	return b->writes + (btree_node_write_idx(b) ^ 1);
}

static inline struct bset *btree_bset_first(struct btree *b)
{
	return b->keys.set->data;
}

static inline struct bset *btree_bset_last(struct btree *b)
{
	return bset_tree_last(&b->keys)->data;
}

static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
{
	return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
}

static inline size_t btree_bytes(struct cache_set *c)
{
	return c->btree_pages * PAGE_SIZE;
}

static inline unsigned btree_sectors(struct cache_set *c)
{
	return c->btree_pages * PAGE_SECTORS;
}

static inline unsigned btree_blocks(struct cache_set *c)
{
	return btree_sectors(c) >> c->block_bits;
}

#define for_each_cached_btree(b, c, iter)				\
	for (iter = 0;							\
	     iter < ARRAY_SIZE((c)->bucket_hash);			\
	     iter++)							\
		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)

#define BTREE_MAX_DEPTH		4

struct btree_iter {
	struct closure		cl;

	/* Current btree depth */
	u8			level;

	/*
	 * Used in bch_btree_iter_traverse(), to indicate whether we're
	 * searching for @pos or the first key strictly greater than @pos
	 */
	u8			is_extents;

	/* Bitmasks for read/intent locks held per level */
	u8			nodes_locked;
	u8			nodes_intent_locked;

	/* Btree level below which we start taking intent locks */
	s8			locks_want;

	enum btree_id		btree_id:8;

	s8			error;

	struct cache_set	*c;

	/* Current position of the iterator */
	struct bpos		pos;

	u32			lock_seq[BTREE_MAX_DEPTH];

	/*
	 * NOTE: Never set iter->nodes to NULL except in btree_iter_lock_root().
	 *
	 * This is because iter->nodes[iter->level] == NULL is how
	 * btree_iter_next_node() knows that it's finished with a depth first
	 * traversal. Just unlocking a node (with btree_node_unlock()) is fine,
	 * and if you really don't want that node used again (e.g. btree_split()
	 * freed it) decrementing lock_seq will cause btree_node_relock() to
	 * always fail (but since freeing a btree node takes a write lock on the
	 * node, which increments the node's lock seq, that's not actually
	 * necessary in that example).
	 *
	 * One extra slot for a sentinel NULL:
	 */
	struct btree		*nodes[BTREE_MAX_DEPTH + 1];
	struct btree_node_iter	node_iters[BTREE_MAX_DEPTH];

	/*
	 * Current unpacked key - so that bch_btree_iter_next()/
	 * bch_btree_iter_next_with_holes() can correctly advance pos.
	 */
	struct bkey_tup		tup;
};

int bch_btree_iter_unlock(struct btree_iter *);

static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
{
	return iter->nodes_locked & (1 << level);
}

static inline void mark_btree_node_unlocked(struct btree_iter *iter,
					    unsigned level)
{
	iter->nodes_locked &= ~(1 << level);
	iter->nodes_intent_locked &= ~(1 << level);
}

int bch_btree_iter_unlock(struct btree_iter *);
void bch_btree_iter_init(struct btree_iter *, struct cache_set *,
			 enum btree_id, struct bpos);

struct btree *bch_btree_iter_peek_node(struct btree_iter *);
struct btree *bch_btree_iter_next_node(struct btree_iter *);

struct bkey_s_c bch_btree_iter_peek(struct btree_iter *);
struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *);
void bch_btree_iter_set_pos(struct btree_iter *, struct bpos);
void bch_btree_iter_advance_pos(struct btree_iter *);
bool bch_btree_iter_upgrade(struct btree_iter *);

static inline void __btree_iter_node_set(struct btree_iter *iter,
					 struct btree *b,
					 struct bpos pos)
{
	struct bpos search = iter->is_extents && bkey_cmp(pos, POS_MAX)
		? bkey_successor(pos)
		: pos;

	iter->lock_seq[b->level] = b->lock.state.seq;
	iter->nodes[b->level] = b;
	bch_btree_node_iter_init(&iter->node_iters[b->level],
				 &b->keys, search);
}

static void inline btree_iter_node_set(struct btree_iter *iter, struct btree *b)
{
	__btree_iter_node_set(iter, b, iter->pos);
}

#define for_each_btree_node(iter, c, btree_id, b, start)		\
	for (bch_btree_iter_init((iter), (c), (btree_id), start),	\
	     (iter)->is_extents = false,				\
	     b = bch_btree_iter_peek_node(iter);			\
	     (b);							\
	     (b) = bch_btree_iter_next_node(iter))

#define for_each_btree_key(iter, c, btree_id, _k, start)		\
	for (bch_btree_iter_init((iter), (c), (btree_id), start);	\
	     ((_k) = bch_btree_iter_peek(iter)).k;			\
	     bch_btree_iter_advance_pos(iter))

#define for_each_btree_key_with_holes(iter, c, btree_id, _k, start)	\
	for (bch_btree_iter_init((iter), (c), (btree_id), start);	\
	     ((_k) = bch_btree_iter_peek_with_holes(iter)).k;		\
	     bch_btree_iter_advance_pos(iter))

#define btree_node_root(b)	((b)->c->btree_roots[(b)->btree_id])

void btree_node_free(struct btree *);

void bch_btree_node_write(struct btree *, struct closure *,
			  struct btree_iter *);
void bch_btree_node_read_done(struct btree *, struct cache *,
			      const struct bch_extent_ptr *);
void bch_btree_flush(struct cache_set *);
void bch_btree_write_oldest(struct cache_set *, u64);

struct btree *__btree_node_alloc_replacement(struct btree *,
					     enum alloc_reserve,
					     struct bkey_format);
struct btree *btree_node_alloc_replacement(struct btree *,
					   enum alloc_reserve);
int btree_check_reserve(struct btree *, struct btree_iter *,
			enum alloc_reserve, unsigned);

int bch_btree_root_alloc(struct cache_set *, enum btree_id, struct closure *);
int bch_btree_root_read(struct cache_set *, enum btree_id,
			const struct bkey_i *, unsigned);

void bch_btree_insert_and_journal(struct btree *,
				  struct btree_node_iter *,
				  struct bkey_i *,
				  struct journal_res *);

int bch_btree_insert_node(struct btree *, struct btree_iter *,
			  struct keylist *, struct bch_replace_info *,
			  struct closure *, enum alloc_reserve);

/*
 * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent
 * locks, -EAGAIN if need to wait on btree reserve
 */
#define BTREE_INSERT_ATOMIC		1

int bch_btree_insert_at(struct btree_iter *, struct keylist *,
			struct bch_replace_info *, struct closure *,
			enum alloc_reserve, unsigned);
int bch_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
int bch_btree_insert(struct cache_set *, enum btree_id, struct keylist *,
		     struct bch_replace_info *, struct closure *);

int bch_btree_node_rewrite(struct btree *, struct btree_iter *, bool);

void bch_btree_cache_free(struct cache_set *);
int bch_btree_cache_alloc(struct cache_set *);

#endif