libbcache/btree_types.h

#ifndef _BCACHE_BTREE_TYPES_H
#define _BCACHE_BTREE_TYPES_H

#include <linux/bcache.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rhashtable.h>
#include <linux/semaphore.h>
#include <linux/workqueue.h>

#include "bkey_methods.h"
#include "journal_types.h"
#include "six.h"

struct cache_set;
struct open_bucket;
struct btree_interior_update;

#define MAX_BSETS		3U

struct btree_nr_keys {
	/*
	 * Amount of live metadata (i.e. size of node after a compaction) in
	 * units of u64s
	 */
	u16			live_u64s;
	u16			bset_u64s[MAX_BSETS];

	/* live keys only: */
	u16			packed_keys;
	u16			unpacked_keys;
};
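
/*
 * A minimal sketch of how these counters appear to relate, assuming
 * live_u64s is the sum of the per-bset counts (btree_nr_keys_total() is
 * a hypothetical helper, shown for illustration only):
 */
static inline u16 btree_nr_keys_total(const struct btree_nr_keys *nr)
{
	u16 sum = 0;
	unsigned i;

	for (i = 0; i < MAX_BSETS; i++)
		sum += nr->bset_u64s[i];

	/* expected to equal nr->live_u64s (assumption, for debug checks) */
	return sum;
}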

struct bset_tree {
	/*
	 * We construct a binary tree in an array as if the array
	 * started at 1, so that things line up on the same cachelines
	 * better: see comments in bset.c at cacheline_to_bkey() for
	 * details
	 */

	/* size of the binary tree and prev array */
	u16			size;

	/* function of size - precalculated for to_inorder() */
	u16			extra;

	u16			data_offset;
	u16			aux_data_offset;
	u16			end_offset;

	struct bpos		max_key;
};

struct btree_write {
	struct journal_entry_pin	journal;
	struct closure_waitlist		wait;
};

struct btree {
	/* Hottest entries first */
	struct rhash_head	hash;

	/* Key/pointer for this btree node */
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);

	/* shared/intent/exclusive lock - see six.h: */
	struct six_lock		lock;

	unsigned long		flags;
	u16			written;
	u8			level;
	u8			btree_id;
	u8			nsets;
	u8			nr_key_bits;

	struct bkey_format	format;

	struct btree_node	*data;
	void			*aux_data;

	/*
	 * Sets of sorted keys - the real btree node - plus a binary search tree
	 *
	 * set[0] is special: its data and aux data offsets cover the memory we
	 * have allocated for this btree node, and its bset is the entire btree
	 * node as it exists on disk.
	 */
	struct bset_tree	set[MAX_BSETS];

	struct btree_nr_keys	nr;
	u16			sib_u64s[2];
	u16			whiteout_u64s;
	u16			uncompacted_whiteout_u64s;
	u8			page_order;
	u8			unpack_fn_len;

	/*
	 * XXX: add a delete sequence number, so when btree_node_relock() fails
	 * because the lock sequence number has changed - i.e. the contents were
	 * modified - we can still relock the node if it's still the one we
	 * want, without redoing the traversal
	 */

	/*
	 * For asynchronous splits/interior node updates:
	 * When we do a split, we allocate new child nodes and update the parent
	 * node to point to them: we update the parent in memory immediately,
	 * but then we must wait until the children have been written out before
	 * the update to the parent can be written - this is a list of the
	 * btree_interior_updates that are blocking this node from being
	 * written:
	 */
	struct list_head	write_blocked;

	struct open_bucket	*ob;

	/* lru list */
	struct list_head	list;

	struct btree_write	writes[2];

#ifdef CONFIG_BCACHE_DEBUG
	bool			*expensive_debug_checks;
#endif
};

#define BTREE_FLAG(flag)						\
static inline bool btree_node_ ## flag(struct btree *b)			\
{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\
									\
static inline void clear_btree_node_ ## flag(struct btree *b)		\
{	clear_bit(BTREE_NODE_ ## flag, &b->flags); }

enum btree_flags {
	BTREE_NODE_read_error,
	BTREE_NODE_write_error,
	BTREE_NODE_dirty,
	BTREE_NODE_noevict,
	BTREE_NODE_write_idx,
	BTREE_NODE_accessed,
	BTREE_NODE_write_in_flight,
	BTREE_NODE_just_written,
};

BTREE_FLAG(read_error);
BTREE_FLAG(write_error);
BTREE_FLAG(dirty);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
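
/*
 * Each BTREE_FLAG() invocation above generates three accessors; e.g.
 * BTREE_FLAG(dirty) expands to btree_node_dirty(),
 * set_btree_node_dirty() and clear_btree_node_dirty(), so callers can
 * write:
 *
 *	if (btree_node_dirty(b))
 *		...
 *
 * instead of open coding the bitops on b->flags.
 */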

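/*
 * b->writes[] appears to be double buffered: btree_node_write_idx()
 * selects the btree_write currently accumulating a journal pin and
 * waiters, while the other entry tracks the previous write.
 */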
static inline struct btree_write *btree_current_write(struct btree *b)
{
	return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
	return b->writes + (btree_node_write_idx(b) ^ 1);
}

static inline struct bset_tree *bset_tree_last(struct btree *b)
{
	EBUG_ON(!b->nsets);
	return b->set + b->nsets - 1;
}

static inline struct bset *bset(const struct btree *b,
				const struct bset_tree *t)
{
	return (void *) b->data + t->data_offset * sizeof(u64);
}
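
/*
 * Note: t->data_offset is in units of u64s from the start of the node's
 * buffer, matching how set_btree_bset() below computes it.
 */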

static inline struct bset *btree_bset_first(struct btree *b)
{
	return bset(b, b->set);
}

static inline struct bset *btree_bset_last(struct btree *b)
{
	return bset(b, bset_tree_last(b));
}

static inline u16
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
{
	size_t ret = (u64 *) k - (u64 *) b->data - 1;

	EBUG_ON(ret > U16_MAX);
	return ret;
}

static inline struct bkey_packed *
__btree_node_offset_to_key(const struct btree *b, u16 k)
{
	return (void *) ((u64 *) b->data + k + 1);
}
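
/*
 * The two helpers above are inverses of each other; an illustrative
 * debug check (btree_node_offset_check() is a hypothetical name):
 */
static inline void btree_node_offset_check(const struct btree *b,
					   const struct bkey_packed *k)
{
	EBUG_ON(__btree_node_offset_to_key(b,
			__btree_node_key_to_offset(b, k)) != k);
}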

#define __bkey_idx(_set, _offset)				\
	((_set)->_data + (_offset))

#define bkey_idx(_set, _offset)					\
	((typeof(&(_set)->start[0])) __bkey_idx((_set), (_offset)))

#define __bset_bkey_last(_set)					\
	 __bkey_idx((_set), (_set)->u64s)

#define bset_bkey_last(_set)					\
	 bkey_idx((_set), le16_to_cpu((_set)->u64s))

#define btree_bkey_first(_b, _t)	(bset(_b, _t)->start)

#define btree_bkey_last(_b, _t)						\
({									\
	EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=	\
		bset_bkey_last(bset(_b, _t)));				\
									\
	__btree_node_offset_to_key(_b, (_t)->end_offset);		\
})
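
/*
 * Typical iteration over the keys in one bset then looks like the
 * following (sketch: assumes bkey_next() from bkey.h, which advances a
 * packed key by its u64s field):
 *
 *	struct bkey_packed *k;
 *
 *	for (k = btree_bkey_first(b, t);
 *	     k != btree_bkey_last(b, t);
 *	     k = bkey_next(k))
 *		...
 */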

static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
{
	t->end_offset =
		__btree_node_key_to_offset(b, bset_bkey_last(bset(b, t)));
	btree_bkey_last(b, t);
}

static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
				  const struct bset *i)
{
	t->data_offset = (u64 *) i - (u64 *) b->data;

	EBUG_ON(bset(b, t) != i);

	set_btree_bset_end(b, t);
}

static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
	return i - (void *) b->data;
}

/* Type of keys @b contains (interior nodes always contain btree pointers): */
static inline enum bkey_type btree_node_type(struct btree *b)
{
	return b->level ? BKEY_TYPE_BTREE : b->btree_id;
}

static inline const struct bkey_ops *btree_node_ops(struct btree *b)
{
	return bch_bkey_ops[btree_node_type(b)];
}

static inline bool btree_node_has_ptrs(struct btree *b)
{
	return btree_type_has_ptrs(btree_node_type(b));
}

static inline bool btree_node_is_extents(struct btree *b)
{
	return btree_node_type(b) == BKEY_TYPE_EXTENTS;
}

struct btree_root {
	struct btree		*b;

	struct btree_interior_update *as;

	/* On disk root - see async splits: */
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
	u8			level;
	u8			alive;
};

/*
 * Optional hook that will be called just prior to a btree node update, when
 * we're holding the write lock and we know what key is about to be overwritten:
 */

struct btree_iter;
struct bucket_stats_cache_set;
struct btree_node_iter;

enum extent_insert_hook_ret {
	BTREE_HOOK_DO_INSERT,
	BTREE_HOOK_NO_INSERT,
	BTREE_HOOK_RESTART_TRANS,
};

struct extent_insert_hook {
	enum extent_insert_hook_ret
	(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
	      struct bkey_s_c, const struct bkey_i *);
};
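
/*
 * A minimal sketch of a hook implementation: the hook is embedded in a
 * caller's struct and recovered with container_of(). Both my_hook and
 * the parameter names below are hypothetical; only the fn signature
 * comes from the struct above.
 */
struct my_hook {
	struct extent_insert_hook	hook;
	u64				nr_calls;
};

static inline enum extent_insert_hook_ret
my_hook_fn(struct extent_insert_hook *hook,
	   struct bpos committed_pos, struct bpos next_pos,
	   struct bkey_s_c k, const struct bkey_i *insert)
{
	struct my_hook *h = container_of(hook, struct my_hook, hook);

	h->nr_calls++;
	return BTREE_HOOK_DO_INSERT;	/* let the insert proceed */
}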

enum btree_insert_ret {
	BTREE_INSERT_OK,
	/* extent spans multiple leaf nodes: have to traverse to the next node: */
	BTREE_INSERT_NEED_TRAVERSE,
	/* write lock held for too long */
	BTREE_INSERT_NEED_RESCHED,
	/* leaf node needs to be split */
	BTREE_INSERT_BTREE_NODE_FULL,
	/* out of journal reservation space */
	BTREE_INSERT_JOURNAL_RES_FULL,
	/* out of space */
	BTREE_INSERT_ENOSPC,
	/* must take the gc lock and retry */
	BTREE_INSERT_NEED_GC_LOCK,
};

enum btree_gc_coalesce_fail_reason {
	/* couldn't get a btree node reserve */
	BTREE_GC_COALESCE_FAIL_RESERVE_GET,
	/* growing the keylist failed */
	BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
	/* the keys wouldn't fit in the new node's format */
	BTREE_GC_COALESCE_FAIL_FORMAT_FITS,

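/*
 * Used (as the name suggests) when merge sorting bsets whose key ranges
 * may overlap, as with extents; returns the new key counts:
 */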
typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
							struct btree *,
							struct btree_node_iter *);

#endif /* _BCACHE_BTREE_TYPES_H */