author    Kent Overstreet <kent.overstreet@gmail.com>  2018-02-05 00:26:03 -0500
committer Kent Overstreet <kent.overstreet@gmail.com>  2018-02-05 00:31:32 -0500
commit    99adc23cf6260a8e5b237f498119ee163d8f71f6
tree      1b1488e62f2b94c0a1a1f8e2c1a5dceba0037612
parent    27c0b6fbc5ffe0979436a6436187d1bf952e9a24
Update bcachefs sources to 0e765bc37c bcachefs: foreground merging of interior btree nodes
-rw-r--r--  .bcachefs_revision                      2
-rw-r--r--  cmd_migrate.c                           3
-rw-r--r--  include/linux/bio.h                    42
-rw-r--r--  include/linux/random.h                 23
-rw-r--r--  include/trace/events/bcachefs.h         2
-rw-r--r--  libbcachefs/alloc.c                    12
-rw-r--r--  libbcachefs/alloc.h                     2
-rw-r--r--  libbcachefs/bcachefs.h                  3
-rw-r--r--  libbcachefs/bkey.c                      4
-rw-r--r--  libbcachefs/bkey_methods.c              4
-rw-r--r--  libbcachefs/bset.c                     64
-rw-r--r--  libbcachefs/bset.h                     51
-rw-r--r--  libbcachefs/btree_cache.c              27
-rw-r--r--  libbcachefs/btree_gc.c                  6
-rw-r--r--  libbcachefs/btree_io.c                121
-rw-r--r--  libbcachefs/btree_io.h                 23
-rw-r--r--  libbcachefs/btree_iter.c              594
-rw-r--r--  libbcachefs/btree_iter.h              112
-rw-r--r--  libbcachefs/btree_locking.h            16
-rw-r--r--  libbcachefs/btree_update.h              2
-rw-r--r--  libbcachefs/btree_update_interior.c   176
-rw-r--r--  libbcachefs/btree_update_interior.h    47
-rw-r--r--  libbcachefs/btree_update_leaf.c       161
-rw-r--r--  libbcachefs/debug.c                    30
-rw-r--r--  libbcachefs/dirent.c                    7
-rw-r--r--  libbcachefs/extents.c                  78
-rw-r--r--  libbcachefs/fifo.h                     11
-rw-r--r--  libbcachefs/fs-io.c                    40
-rw-r--r--  libbcachefs/fs.c                        4
-rw-r--r--  libbcachefs/fsck.c                      4
-rw-r--r--  libbcachefs/inode.c                    44
-rw-r--r--  libbcachefs/inode.h                     2
-rw-r--r--  libbcachefs/io.c                       10
-rw-r--r--  libbcachefs/journal.c                 209
-rw-r--r--  libbcachefs/journal.h                  16
-rw-r--r--  libbcachefs/journal_types.h             6
-rw-r--r--  libbcachefs/migrate.c                   6
-rw-r--r--  libbcachefs/move.c                      8
-rw-r--r--  libbcachefs/quota.c                     8
-rw-r--r--  libbcachefs/six.c                     216
-rw-r--r--  libbcachefs/six.h                      71
-rw-r--r--  libbcachefs/str_hash.h                 78
-rw-r--r--  libbcachefs/super-io.c                  6
-rw-r--r--  libbcachefs/super.c                   121
-rw-r--r--  libbcachefs/sysfs.c                    12
45 files changed, 1501 insertions, 983 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index be8f2eb4..4ffc0c53 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-2834633e12dd6e8087f5f52742a3465a228998e5
+0e765bc37c971c4b0c91ddd281e5ea82e2d682dc
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 34348370..b3fcf47a 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -119,7 +119,8 @@ static void update_inode(struct bch_fs *c,
int ret;
bch2_inode_pack(&packed, inode);
- ret = bch2_btree_update(c, BTREE_ID_INODES, &packed.inode.k_i, NULL);
+ ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+ NULL, NULL, NULL, 0);
if (ret)
die("error creating file: %s", strerror(-ret));
}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7293eef0..dcaffedb 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -186,38 +186,6 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
bio->bi_flags &= ~(1U << bit);
}
-static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
-{
- *bv = bio_iovec(bio);
-}
-
-static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
-{
- struct bvec_iter iter = bio->bi_iter;
- int idx;
-
- if (unlikely(!bio_multiple_segments(bio))) {
- *bv = bio_iovec(bio);
- return;
- }
-
- bio_advance_iter(bio, &iter, iter.bi_size);
-
- if (!iter.bi_bvec_done)
- idx = iter.bi_idx - 1;
- else /* in the middle of bvec */
- idx = iter.bi_idx;
-
- *bv = bio->bi_io_vec[idx];
-
- /*
- * iter.bi_bvec_done records actual length of the last bvec
- * if this bio ends in the middle of one io vector
- */
- if (iter.bi_bvec_done)
- bv->bv_len = iter.bi_bvec_done;
-}
-
extern struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs);
@@ -298,6 +266,16 @@ static inline void zero_fill_bio(struct bio *bio)
zero_fill_bio_iter(bio, bio->bi_iter);
}
+#define bio_set_dev(bio, bdev) \
+do { \
+ (bio)->bi_bdev = (bdev); \
+} while (0)
+
+#define bio_copy_dev(dst, src) \
+do { \
+ (dst)->bi_bdev = (src)->bi_bdev; \
+} while (0)
+
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
return page_address(bvec->bv_page) + bvec->bv_offset;
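For context, a minimal usage sketch of the two new helpers (illustrative only, not part of the patch; bio, dst_bio, src_bio and ca are assumed variables):

	/* old style, open-coded: */
	bio->bi_bdev = ca->disk_sb.bdev;

	/* new style, as used by the btree_io.c hunks later in this patch: */
	bio_set_dev(bio, ca->disk_sb.bdev);

	/* copying the target device from one bio to another: */
	bio_copy_dev(dst_bio, src_bio);		/* dst_bio->bi_bdev = src_bio->bi_bdev */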
diff --git a/include/linux/random.h b/include/linux/random.h
index 6d63e045..243c0602 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -29,20 +29,17 @@ static inline void get_random_bytes(void *buf, int nbytes)
BUG_ON(getrandom(buf, nbytes, 0) != nbytes);
}
-static inline int get_random_int(void)
-{
- int v;
-
- get_random_bytes(&v, sizeof(v));
- return v;
+#define get_random_type(type) \
+static inline type get_random_##type(void) \
+{ \
+ type v; \
+ \
+ get_random_bytes(&v, sizeof(v)); \
+ return v; \
}
-static inline long get_random_long(void)
-{
- long v;
-
- get_random_bytes(&v, sizeof(v));
- return v;
-}
+get_random_type(int);
+get_random_type(long);
+get_random_type(u64);
#endif /* _LINUX_RANDOM_H */
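For reference, a hand-written expansion of one instance of the new macro (illustrative, not part of the patch): get_random_type(u64) generates

	static inline u64 get_random_u64(void)
	{
		u64 v;

		get_random_bytes(&v, sizeof(v));
		return v;
	}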
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index bf187f5e..d132dd8a 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -87,7 +87,7 @@ DECLARE_EVENT_CLASS(bio,
),
TP_fast_assign(
- __entry->dev = bio->bi_bdev ? bio->bi_bdev->bd_dev : 0;
+ __entry->dev = bio->bi_disk ? bio_dev(bio) : 0;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index f3833846..f6592de9 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -343,7 +343,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
do {
- ret = bch2_btree_iter_traverse(iter);
+ ret = btree_iter_err(bch2_btree_iter_peek_slot(iter));
if (ret)
break;
@@ -393,7 +393,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
return 0;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL);
bch2_btree_iter_unlock(&iter);
@@ -407,7 +407,7 @@ static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca)
int ret = 0;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
down_read(&ca->bucket_lock);
for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
@@ -826,7 +826,7 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
int ret = 0;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
/*
* XXX: if ca->nr_invalidated != 0, just return if we'd block doing the
@@ -1862,8 +1862,8 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
static void allocator_start_issue_discards(struct bch_fs *c)
{
struct bch_dev *ca;
- unsigned i, dev_iter;
- size_t bu;
+ unsigned dev_iter;
+ size_t i, bu;
for_each_rw_member(ca, c, dev_iter) {
unsigned done = 0;
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index 1b9d960b..3bd765b6 100644
--- a/libbcachefs/alloc.h
+++ b/libbcachefs/alloc.h
@@ -83,7 +83,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
struct task_struct *p;
rcu_read_lock();
- if ((p = ACCESS_ONCE(ca->alloc_thread)))
+ if ((p = READ_ONCE(ca->alloc_thread)))
wake_up_process(p);
rcu_read_unlock();
}
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 78c427fa..298f26d4 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -520,6 +520,8 @@ struct bch_fs {
unsigned short block_bits; /* ilog2(block_size) */
+ u16 btree_foreground_merge_threshold;
+
struct closure sb_write;
struct mutex sb_lock;
@@ -548,6 +550,7 @@ struct bch_fs {
mempool_t btree_interior_update_pool;
struct list_head btree_interior_update_list;
struct mutex btree_interior_update_lock;
+ struct closure_waitlist btree_interior_update_wait;
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 97015084..850ba72c 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -1040,8 +1040,8 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
high_word(f, r),
b->nr_key_bits);
- EBUG_ON(ret != bkey_cmp(bkey_unpack_key_format_checked(b, l).p,
- bkey_unpack_key_format_checked(b, r).p));
+ EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l),
+ bkey_unpack_pos(b, r)));
return ret;
}
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 3b3a09eb..84cdf662 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -72,6 +72,10 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
if (k.k->p.snapshot)
return "nonzero snapshot";
+ if (type != BKEY_TYPE_BTREE &&
+ !bkey_cmp(k.k->p, POS_MAX))
+ return "POS_MAX key";
+
return NULL;
}
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 02be5bb4..a07d5540 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -264,9 +264,9 @@ void bch2_verify_key_order(struct btree *b,
#else
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey_packed *k) {}
+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey_packed *k) {}
#endif
@@ -569,7 +569,7 @@ static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
unsigned j, struct bkey_packed *k)
{
- BUG_ON(k >= btree_bkey_last(b, t));
+ EBUG_ON(k >= btree_bkey_last(b, t));
rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
.offset = __btree_node_key_to_offset(b, k),
@@ -619,7 +619,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
{
unsigned l = 0, r = t->size;
- BUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
while (l < r) {
unsigned m = (l + r) >> 1;
@@ -630,13 +630,13 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
r = m;
}
- BUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset < offset);
- BUG_ON(l &&
- rw_aux_tree(b, t)[l - 1].offset >= offset);
+ EBUG_ON(l < t->size &&
+ rw_aux_tree(b, t)[l].offset < offset);
+ EBUG_ON(l &&
+ rw_aux_tree(b, t)[l - 1].offset >= offset);
- BUG_ON(l > r);
- BUG_ON(l > t->size);
+ EBUG_ON(l > r);
+ EBUG_ON(l > t->size);
return l;
}
@@ -764,16 +764,16 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
#ifdef __LITTLE_ENDIAN
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
- BUG_ON(shift + bits > b->format.key_u64s * 64);
+ EBUG_ON(shift + bits > b->format.key_u64s * 64);
#else
shift = high_bit_offset +
b->nr_key_bits -
exponent -
bits;
- BUG_ON(shift < KEY_PACKED_BITS_START);
+ EBUG_ON(shift < KEY_PACKED_BITS_START);
#endif
- BUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
f->exponent = shift;
mantissa = bkey_mantissa(m, f, j);
@@ -890,6 +890,7 @@ retry:
prev = k, k = bkey_next(k);
if (k >= btree_bkey_last(b, t)) {
+ /* XXX: this path sucks */
t->size--;
goto retry;
}
@@ -898,8 +899,8 @@ retry:
bkey_float(b, t, j)->key_offset =
bkey_to_cacheline_offset(b, t, cacheline++, k);
- BUG_ON(tree_to_prev_bkey(b, t, j) != prev);
- BUG_ON(tree_to_bkey(b, t, j) != k);
+ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
+ EBUG_ON(tree_to_bkey(b, t, j) != k);
}
while (bkey_next(k) != btree_bkey_last(b, t))
@@ -1074,7 +1075,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
struct bkey_packed min_key, max_key;
unsigned inorder, j;
- BUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
/* signal to make_bfloat() that they're uninitialized: */
min_key.u64s = max_key.u64s = 0;
@@ -1148,7 +1149,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
int shift = new_u64s - clobber_u64s;
unsigned l, j, where = __btree_node_key_to_offset(b, _where);
- BUG_ON(bset_has_ro_aux_tree(t));
+ EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
@@ -1157,8 +1158,8 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
/* l is first >= than @where */
- BUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
- BUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
+ EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
+ EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
if (!l) /* never delete first entry */
l++;
@@ -1189,9 +1190,9 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
for (j = l; j < t->size; j++)
rw_aux_tree(b, t)[j].offset += shift;
- BUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset ==
- rw_aux_tree(b, t)[l - 1].offset);
+ EBUG_ON(l < t->size &&
+ rw_aux_tree(b, t)[l].offset ==
+ rw_aux_tree(b, t)[l - 1].offset);
if (t->size < bset_rw_tree_capacity(b, t) &&
(l < t->size
@@ -1248,8 +1249,8 @@ void bch2_bset_insert(struct btree *b,
u64 *src_p = where->_data + clobber_u64s;
u64 *dst_p = where->_data + src->u64s;
- BUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
- (int) clobber_u64s - src->u64s);
+ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
+ (int) clobber_u64s - src->u64s);
memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
@@ -1277,7 +1278,7 @@ void bch2_bset_delete(struct btree *b,
bch2_bset_verify_rw_aux_tree(b, t);
- BUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
@@ -1594,7 +1595,7 @@ struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
{
struct btree_node_iter_set *set;
- BUG_ON(iter->used > MAX_BSETS);
+ EBUG_ON(iter->used > MAX_BSETS);
btree_node_iter_for_each(iter, set)
if (set->end == t->end_offset)
@@ -1653,20 +1654,23 @@ void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
struct btree *b)
{
+#ifdef CONFIG_BCACHEFS_DEBUG
struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
-
+#endif
iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
- BUG_ON(iter->data->k > iter->data->end);
+ EBUG_ON(iter->data->k > iter->data->end);
if (iter->data->k == iter->data->end) {
- BUG_ON(iter->used == 0);
+ EBUG_ON(iter->used == 0);
iter->data[0] = iter->data[--iter->used];
}
btree_node_iter_sift(iter, b, 0);
+#ifdef CONFIG_BCACHEFS_DEBUG
bch2_btree_node_iter_next_check(iter, b, k);
+#endif
}
/*
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index c195cd91..f5a84481 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -188,16 +188,15 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree
typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+ struct bkey *dst,
const struct bkey_packed *src)
{
- struct bkey dst;
-
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
{
compiled_unpack_fn unpack_fn = b->aux_data;
- unpack_fn(&dst, src);
+ unpack_fn(dst, src);
if (btree_keys_expensive_checks(b)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
@@ -206,17 +205,36 @@ bkey_unpack_key_format_checked(const struct btree *b,
* hack around a harmless race when compacting whiteouts
* for a write:
*/
- dst2.needs_whiteout = dst.needs_whiteout;
+ dst2.needs_whiteout = dst->needs_whiteout;
- BUG_ON(memcmp(&dst, &dst2, sizeof(dst)));
+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
}
}
#else
- dst = __bch2_bkey_unpack_key(&b->format, src);
+ *dst = __bch2_bkey_unpack_key(&b->format, src);
#endif
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ struct bkey dst;
+
+ __bkey_unpack_key_format_checked(b, &dst, src);
return dst;
}
+static inline void __bkey_unpack_key(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (likely(bkey_packed(src)))
+ __bkey_unpack_key_format_checked(b, dst, src);
+ else
+ *dst = *packed_to_bkey_c(src);
+}
+
/**
* bkey_unpack_key -- unpack just the key, not the value
*/
@@ -253,7 +271,7 @@ static inline struct bkey_s_c bkey_disassemble(struct btree *b,
const struct bkey_packed *k,
struct bkey *u)
{
- *u = bkey_unpack_key(b, k);
+ __bkey_unpack_key(b, u, k);
return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
}
@@ -263,7 +281,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
struct bkey_packed *k,
struct bkey *u)
{
- *u = bkey_unpack_key(b, k);
+ __bkey_unpack_key(b, u, k);
return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
}
@@ -353,15 +371,6 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
}
/* Returns true if @k is after iterator position @pos */
-static inline bool btree_iter_pos_cmp(struct bpos pos, const struct bkey *k,
- bool strictly_greater)
-{
- int cmp = bkey_cmp(k->p, pos);
-
- return cmp > 0 ||
- (cmp == 0 && !strictly_greater && !bkey_deleted(k));
-}
-
static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
struct bpos *pos,
const struct bkey_packed *k,
@@ -493,14 +502,14 @@ static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
static inline struct bkey_packed *
__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
+ struct btree *b)
{
return __btree_node_offset_to_key(b, iter->data->k);
}
static inline struct bkey_packed *
bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
+ struct btree *b)
{
return bch2_btree_node_iter_end(iter)
? NULL
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 78e36299..0bde449e 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -57,10 +57,20 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
list_move(&b->list, &bc->freed);
}
+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct btree *b = obj;
+ const u64 *v = arg->key;
+
+ return PTR_HASH(&b->key) == *v ? 0 : 1;
+}
+
static const struct rhashtable_params bch_btree_cache_params = {
.head_offset = offsetof(struct btree, hash),
.key_offset = offsetof(struct btree, key.v),
.key_len = sizeof(struct bch_extent_ptr),
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
};
static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
@@ -337,6 +347,9 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
while (!list_empty(&bc->live)) {
b = list_first_entry(&bc->live, struct btree, list);
+ BUG_ON(btree_node_read_in_flight(b) ||
+ btree_node_write_in_flight(b));
+
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
clear_btree_node_dirty(b);
@@ -736,7 +749,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree *ret;
unsigned level = b->level;
- parent = iter->nodes[level + 1];
+ parent = btree_iter_node(iter, level + 1);
if (!parent)
return NULL;
@@ -745,7 +758,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
return ERR_PTR(-EINTR);
}
- node_iter = iter->node_iters[parent->level];
+ node_iter = iter->l[parent->level].iter;
k = bch2_btree_node_iter_peek_all(&node_iter, parent);
BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
@@ -774,9 +787,13 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
}
- if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) {
- six_unlock_intent(&ret->lock);
- ret = ERR_PTR(-EINTR);
+ if (!bch2_btree_node_relock(iter, level)) {
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+ if (!IS_ERR(ret)) {
+ six_unlock_intent(&ret->lock);
+ ret = ERR_PTR(-EINTR);
+ }
}
return ret;
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 43e64d27..63508663 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -589,7 +589,7 @@ static void recalc_packed_keys(struct btree *b)
static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
struct btree *old_nodes[GC_MERGE_NODES])
{
- struct btree *parent = iter->nodes[old_nodes[0]->level + 1];
+ struct btree *parent = btree_node_parent(iter, old_nodes[0]);
unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
unsigned blocks = btree_blocks(c) * 2 / 3;
struct btree *new_nodes[GC_MERGE_NODES];
@@ -775,7 +775,7 @@ next:
BUG_ON(!bch2_keylist_empty(&keylist));
- BUG_ON(iter->nodes[old_nodes[0]->level] != old_nodes[0]);
+ BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));
@@ -868,7 +868,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
* and the nodes in our sliding window might not have the same
* parent anymore - blow away the sliding window:
*/
- if (iter.nodes[iter.level + 1] &&
+ if (btree_iter_node(&iter, iter.level + 1) &&
!btree_node_intent_locked(&iter, iter.level + 1))
memset(merge + 1, 0,
(GC_MERGE_NODES - 1) * sizeof(merge[0]));
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index fb76b29e..9b4eff1c 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -266,19 +266,16 @@ static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
bool compacting,
enum compact_mode mode)
{
- unsigned live_u64s = b->nr.bset_u64s[t - b->set];
unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
-
- if (live_u64s == bset_u64s)
- return 0;
+ unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
if (mode == COMPACT_LAZY) {
- if (live_u64s * 4 < bset_u64s * 3 ||
+ if (should_compact_bset_lazy(b, t) ||
(compacting && bset_unwritten(b, bset(b, t))))
- return bset_u64s - live_u64s;
+ return dead_u64s;
} else {
if (bset_written(b, bset(b, t)))
- return bset_u64s - live_u64s;
+ return dead_u64s;
}
return 0;
@@ -426,7 +423,7 @@ static bool bch2_drop_whiteouts(struct btree *b)
struct bset *i = bset(b, t);
struct bkey_packed *k, *n, *out, *start, *end;
- if (!should_compact_bset(b, t, true, true))
+ if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
continue;
start = btree_bkey_first(b, t);
@@ -830,13 +827,13 @@ void bch2_btree_build_aux_trees(struct btree *b)
* Returns true if we sorted (i.e. invalidated iterators)
*/
void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
+ struct btree_iter *iter)
{
struct btree_node_entry *bne;
bool did_sort;
EBUG_ON(!(b->lock.state.seq & 1));
- EBUG_ON(iter && iter->nodes[b->level] != b);
+ EBUG_ON(iter && iter->l[b->level].b != b);
did_sort = btree_node_compact(c, b, iter);
@@ -1297,8 +1294,8 @@ static void btree_node_read_work(struct work_struct *work)
do {
bch_info(c, "retrying read");
bio_reset(bio);
+ bio_set_dev(bio, rb->pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
- bio->bi_bdev = rb->pick.ca->disk_sb.bdev;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
@@ -1333,7 +1330,7 @@ static void btree_node_read_endio(struct bio *bio)
bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
INIT_WORK(&rb->work, btree_node_read_work);
- schedule_work(&rb->work);
+ queue_work(system_unbound_wq, &rb->work);
}
void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
@@ -1357,8 +1354,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
rb->c = c;
rb->start_time = local_clock();
rb->pick = pick;
+ bio_set_dev(bio, pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
- bio->bi_bdev = pick.ca->disk_sb.bdev;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
bch2_bio_map(bio, b->data);
@@ -1470,15 +1467,13 @@ retry:
goto err;
/* has node been freed? */
- if (iter.nodes[b->level] != b) {
+ if (iter.l[b->level].b != b) {
/* node has been freed: */
- if (!btree_node_dying(b))
- panic("foo4\n");
+ BUG_ON(!btree_node_dying(b));
goto out;
}
- if (!btree_node_hashed(b))
- panic("foo5\n");
+ BUG_ON(!btree_node_hashed(b));
bkey_copy(&tmp.k, &b->key);
@@ -1583,7 +1578,7 @@ static void btree_node_write_endio(struct bio *bio)
container_of(orig, struct btree_write_bio, wbio);
INIT_WORK(&wb->work, btree_node_write_work);
- schedule_work(&wb->work);
+ queue_work(system_unbound_wq, &wb->work);
}
}
@@ -1664,8 +1659,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
- if (lock_type_held == SIX_LOCK_intent) {
- six_lock_write(&b->lock);
+ /*
+ * We can't block on six_lock_write() here; another thread might be
+ * trying to get a journal reservation with read locks held, and getting
+ * a journal reservation might be blocked on flushing the journal and
+ * doing btree writes:
+ */
+ if (lock_type_held == SIX_LOCK_intent &&
+ six_trylock_write(&b->lock)) {
__bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
six_unlock_write(&b->lock);
} else {
@@ -1900,13 +1901,12 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
BUG_ON(lock_type_held == SIX_LOCK_write);
if (lock_type_held == SIX_LOCK_intent ||
- six_trylock_convert(&b->lock, SIX_LOCK_read,
- SIX_LOCK_intent)) {
+ six_lock_tryupgrade(&b->lock)) {
__bch2_btree_node_write(c, b, SIX_LOCK_intent);
/* don't cycle lock unnecessarily: */
- if (btree_node_just_written(b)) {
- six_lock_write(&b->lock);
+ if (btree_node_just_written(b) &&
+ six_trylock_write(&b->lock)) {
bch2_btree_post_write_cleanup(c, b);
six_unlock_write(&b->lock);
}
@@ -1918,6 +1918,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
}
}
+static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+ unsigned i;
+restart:
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, i, pos)
+ if (test_bit(flag, &b->flags)) {
+ rcu_read_unlock();
+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+ goto restart;
+
+ }
+ rcu_read_unlock();
+}
+
+void bch2_btree_flush_all_reads(struct bch_fs *c)
+{
+ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+}
+
+void bch2_btree_flush_all_writes(struct bch_fs *c)
+{
+ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+}
+
void bch2_btree_verify_flushed(struct bch_fs *c)
{
struct bucket_table *tbl;
@@ -1926,7 +1954,46 @@ void bch2_btree_verify_flushed(struct bch_fs *c)
unsigned i;
rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos)
- BUG_ON(btree_node_dirty(b));
+ for_each_cached_btree(b, c, tbl, i, pos) {
+ unsigned long flags = READ_ONCE(b->flags);
+
+ BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
+ (flags & (1 << BTREE_NODE_write_in_flight)));
+ }
+ rcu_read_unlock();
+}
+
+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
+{
+ char *out = buf, *end = buf + PAGE_SIZE;
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+ unsigned i;
+
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, i, pos) {
+ unsigned long flags = READ_ONCE(b->flags);
+ unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
+
+ if (//!(flags & (1 << BTREE_NODE_dirty)) &&
+ !b->writes[0].wait.list.first &&
+ !b->writes[1].wait.list.first &&
+ !(b->will_make_reachable & 1))
+ continue;
+
+ out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
+ b,
+ (flags & (1 << BTREE_NODE_dirty)) != 0,
+ b->level,
+ b->written,
+ !list_empty_careful(&b->write_blocked),
+ b->will_make_reachable != 0,
+ b->will_make_reachable & 1,
+ b->writes[ idx].wait.list.first != NULL,
+ b->writes[!idx].wait.list.first != NULL);
+ }
rcu_read_unlock();
+
+ return out - buf;
}
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 66053d53..f2790716 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -57,21 +57,23 @@ enum compact_mode {
bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
+static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
+{
+ unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
+ unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
+
+ return dead_u64s > 128 && dead_u64s * 3 > bset_u64s;
+}
+
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
{
struct bset_tree *t;
- for_each_bset(b, t) {
- unsigned live_u64s = b->nr.bset_u64s[t - b->set];
- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
-
- if (live_u64s * 4 < bset_u64s * 3)
- goto compact;
- }
+ for_each_bset(b, t)
+ if (should_compact_bset_lazy(b, t))
+ return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
return false;
-compact:
- return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
}
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
@@ -134,6 +136,9 @@ do { \
} \
} while (0)
+void bch2_btree_flush_all_reads(struct bch_fs *);
+void bch2_btree_flush_all_writes(struct bch_fs *);
void bch2_btree_verify_flushed(struct bch_fs *);
+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
#endif /* _BCACHEFS_BTREE_IO_H */
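To illustrate the threshold in should_compact_bset_lazy() with made-up numbers: a bset with bset_u64s = 600 of which 220 are dead passes both tests (220 > 128, and 220 * 3 = 660 > 600), so bch2_maybe_compact_whiteouts() would kick off a lazy compaction; a bset with only 100 dead u64s never qualifies regardless of its size, and a bset with 200 dead u64s out of 4000 fails the more-than-a-third test.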
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index ee463f36..21a6cbc2 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -10,11 +10,15 @@
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
+ struct btree_iter_level *,
+ struct bkey *);
+
#define BTREE_ITER_NOT_END ((struct btree *) 1)
static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
- return iter->nodes[l] && iter->nodes[l] != BTREE_ITER_NOT_END;
+ return iter->l[l].b && iter->l[l].b != BTREE_ITER_NOT_END;
}
/* Btree node locking: */
@@ -27,7 +31,7 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
{
struct btree_iter *linked;
- EBUG_ON(iter->nodes[b->level] != b);
+ EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
for_each_linked_btree_node(iter, b, linked)
@@ -43,14 +47,14 @@ void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
struct btree_iter *linked;
unsigned readers = 0;
- EBUG_ON(iter->nodes[b->level] != b);
+ EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
if (six_trylock_write(&b->lock))
return;
for_each_linked_btree_iter(iter, linked)
- if (linked->nodes[b->level] == b &&
+ if (linked->l[b->level].b == b &&
btree_node_read_locked(linked, b->level))
readers++;
@@ -71,10 +75,10 @@ void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
}
}
-bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
struct btree_iter *linked;
- struct btree *b = iter->nodes[level];
+ struct btree *b = iter->l[level].b;
int want = btree_lock_want(iter, level);
int have = btree_node_locked_type(iter, level);
@@ -93,7 +97,7 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
goto success;
for_each_linked_btree_iter(iter, linked)
- if (linked->nodes[level] == b &&
+ if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) == want &&
iter->lock_seq[level] == b->lock.state.seq) {
btree_node_unlock(iter, level);
@@ -112,10 +116,16 @@ bool bch2_btree_iter_relock(struct btree_iter *iter)
{
unsigned l;
- for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
- if (!bch2_btree_node_relock(iter, l))
+ for (l = iter->level;
+ l < max_t(unsigned, iter->locks_want, 1) && iter->l[l].b;
+ l++)
+ if (!bch2_btree_node_relock(iter, l)) {
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
return false;
+ }
+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
+ iter->uptodate = BTREE_ITER_NEED_PEEK;
return true;
}
@@ -139,7 +149,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
iter->nodes_locked != iter->nodes_intent_locked);
for_each_linked_btree_iter(iter, linked)
- if (linked->nodes[level] == b &&
+ if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) == type) {
six_lock_increment(&b->lock, type);
return true;
@@ -212,7 +222,7 @@ static void btree_iter_drop_extra_locks(struct btree_iter *iter)
btree_node_unlock(iter, l);
} else {
if (btree_node_intent_locked(iter, l)) {
- six_lock_downgrade(&iter->nodes[l]->lock);
+ six_lock_downgrade(&iter->l[l].b->lock);
iter->nodes_intent_locked ^= 1 << l;
}
break;
@@ -255,7 +265,7 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
static void __bch2_btree_iter_unlock(struct btree_iter *iter)
{
- iter->flags &= ~BTREE_ITER_UPTODATE;
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
while (iter->nodes_locked)
btree_node_unlock(iter, __ffs(iter->nodes_locked));
@@ -277,13 +287,13 @@ int bch2_btree_iter_unlock(struct btree_iter *iter)
#ifdef CONFIG_BCACHEFS_DEBUG
static void __bch2_btree_iter_verify(struct btree_iter *iter,
- struct btree *b)
+ struct btree *b)
{
- struct btree_node_iter *node_iter = &iter->node_iters[b->level];
- struct btree_node_iter tmp = *node_iter;
+ struct btree_iter_level *l = &iter->l[b->level];
+ struct btree_node_iter tmp = l->iter;
struct bkey_packed *k;
- bch2_btree_node_iter_verify(node_iter, b);
+ bch2_btree_node_iter_verify(&l->iter, b);
/*
* For interior nodes, the iterator will have skipped past
@@ -302,7 +312,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
buf, iter->pos.inode, iter->pos.offset);
}
- k = bch2_btree_node_iter_peek_all(node_iter, b);
+ k = bch2_btree_node_iter_peek_all(&l->iter, b);
if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k,
iter->flags & BTREE_ITER_IS_EXTENTS)) {
char buf[100];
@@ -318,7 +328,7 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
- if (iter->nodes[b->level] == b)
+ if (iter->l[b->level].b == b)
__bch2_btree_iter_verify(iter, b);
for_each_linked_btree_node(iter, b, linked)
@@ -348,8 +358,15 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->flags & BTREE_ITER_IS_EXTENTS))
+ iter->flags & BTREE_ITER_IS_EXTENTS)) {
bch2_btree_node_iter_push(node_iter, b, where, end);
+
+ if (!b->level &&
+ node_iter == &iter->l[0].iter)
+ bkey_disassemble(b,
+ bch2_btree_node_iter_peek_all(node_iter, b),
+ &iter->k);
+ }
return;
found:
set->end = (int) set->end + shift;
@@ -362,16 +379,20 @@ found:
btree_iter_pos_cmp_packed(b, &iter->pos, where,
iter->flags & BTREE_ITER_IS_EXTENTS)) {
set->k = offset;
- bch2_btree_node_iter_sort(node_iter, b);
} else if (set->k < offset + clobber_u64s) {
set->k = offset + new_u64s;
if (set->k == set->end)
*set = node_iter->data[--node_iter->used];
- bch2_btree_node_iter_sort(node_iter, b);
} else {
set->k = (int) set->k + shift;
+ goto iter_current_key_not_modified;
}
+ bch2_btree_node_iter_sort(node_iter, b);
+ if (!b->level && node_iter == &iter->l[0].iter)
+ __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
+iter_current_key_not_modified:
+
/*
* Interior nodes are special because iterators for interior nodes don't
* obey the usual invariants regarding the iterator position:
@@ -439,18 +460,18 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
{
struct btree_iter *linked;
- if (node_iter != &iter->node_iters[b->level])
+ if (node_iter != &iter->l[b->level].iter)
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
where, clobber_u64s, new_u64s);
- if (iter->nodes[b->level] == b)
+ if (iter->l[b->level].b == b)
__bch2_btree_node_iter_fix(iter, b,
- &iter->node_iters[b->level], t,
+ &iter->l[b->level].iter, t,
where, clobber_u64s, new_u64s);
for_each_linked_btree_node(iter, b, linked)
__bch2_btree_node_iter_fix(linked, b,
- &linked->node_iters[b->level], t,
+ &linked->l[b->level].iter, t,
where, clobber_u64s, new_u64s);
/* interior node iterators are... special... */
@@ -458,51 +479,49 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
bch2_btree_iter_verify(iter, b);
}
-/* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter)
+static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
+ struct btree_iter_level *l,
+ struct bkey *u,
+ struct bkey_packed *k)
{
- struct btree *b = iter->nodes[iter->level];
- struct bkey_packed *k =
- bch2_btree_node_iter_peek_all(&iter->node_iters[iter->level], b);
struct bkey_s_c ret;
- EBUG_ON(!btree_node_locked(iter, iter->level));
-
- if (!k)
+ if (unlikely(!k)) {
+ /*
+ * signal to bch2_btree_iter_peek_slot() that we're currently at
+ * a hole
+ */
+ u->type = KEY_TYPE_DELETED;
return bkey_s_c_null;
+ }
- ret = bkey_disassemble(b, k, &iter->k);
+ ret = bkey_disassemble(l->b, k, u);
if (debug_check_bkeys(iter->c))
- bch2_bkey_debugcheck(iter->c, b, ret);
+ bch2_bkey_debugcheck(iter->c, l->b, ret);
return ret;
}
-static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter)
+/* peek_all() doesn't skip deleted keys */
+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
+ struct btree_iter_level *l,
+ struct bkey *u)
{
- struct btree *b = iter->nodes[iter->level];
- struct bkey_packed *k =
- bch2_btree_node_iter_peek(&iter->node_iters[iter->level], b);
- struct bkey_s_c ret;
-
- EBUG_ON(!btree_node_locked(iter, iter->level));
-
- if (!k)
- return bkey_s_c_null;
-
- ret = bkey_disassemble(b, k, &iter->k);
-
- if (debug_check_bkeys(iter->c))
- bch2_bkey_debugcheck(iter->c, b, ret);
+ return __btree_iter_unpack(iter, l, u,
+ bch2_btree_node_iter_peek_all(&l->iter, l->b));
+}
- return ret;
+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
+ struct btree_iter_level *l)
+{
+ return __btree_iter_unpack(iter, l, &iter->k,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
}
-static inline void __btree_iter_advance(struct btree_iter *iter)
+static inline void __btree_iter_advance(struct btree_iter_level *l)
{
- bch2_btree_node_iter_advance(&iter->node_iters[iter->level],
- iter->nodes[iter->level]);
+ bch2_btree_node_iter_advance(&l->iter, l->b);
}
/*
@@ -510,24 +529,28 @@ static inline void __btree_iter_advance(struct btree_iter *iter)
*/
static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
{
+ struct btree_iter_level *l;
+ unsigned plevel;
bool parent_locked;
struct bkey_packed *k;
- if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) ||
- !iter->nodes[b->level + 1])
+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
return;
- parent_locked = btree_node_locked(iter, b->level + 1);
+ plevel = b->level + 1;
+ if (!btree_iter_node(iter, plevel))
+ return;
- if (!bch2_btree_node_relock(iter, b->level + 1))
+ parent_locked = btree_node_locked(iter, plevel);
+
+ if (!bch2_btree_node_relock(iter, plevel))
return;
- k = bch2_btree_node_iter_peek_all(&iter->node_iters[b->level + 1],
- iter->nodes[b->level + 1]);
+ l = &iter->l[plevel];
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
if (!k ||
bkey_deleted(k) ||
- bkey_cmp_left_packed(iter->nodes[b->level + 1],
- k, &b->key.k.p)) {
+ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
char buf[100];
struct bkey uk = bkey_unpack_key(b, k);
@@ -540,16 +563,21 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
btree_node_unlock(iter, b->level + 1);
}
-static inline void __btree_iter_init(struct btree_iter *iter,
- struct btree *b)
+/* Returns true if @k is after iterator position @pos */
+static inline bool btree_iter_pos_cmp(struct btree_iter *iter,
+ const struct bkey *k)
{
- bch2_btree_node_iter_init(&iter->node_iters[b->level], b, iter->pos,
- iter->flags & BTREE_ITER_IS_EXTENTS,
- btree_node_is_extents(b));
+ int cmp = bkey_cmp(k->p, iter->pos);
- /* Skip to first non whiteout: */
- if (b->level)
- bch2_btree_node_iter_peek(&iter->node_iters[b->level], b);
+ return cmp > 0 ||
+ (cmp == 0 &&
+ !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k));
+}
+
+static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+ struct btree *b)
+{
+ return !btree_iter_pos_cmp(iter, &b->key.k);
}
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
@@ -557,8 +585,21 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
{
return iter->btree_id == b->btree_id &&
bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
- btree_iter_pos_cmp(iter->pos, &b->key.k,
- iter->flags & BTREE_ITER_IS_EXTENTS);
+ !btree_iter_pos_after_node(iter, b);
+}
+
+static inline void __btree_iter_init(struct btree_iter *iter,
+ struct btree *b)
+{
+ struct btree_iter_level *l = &iter->l[b->level];
+
+ bch2_btree_node_iter_init(&l->iter, b, iter->pos,
+ iter->flags & BTREE_ITER_IS_EXTENTS,
+ btree_node_is_extents(b));
+
+ /* Skip to first non whiteout: */
+ if (b->level)
+ bch2_btree_node_iter_peek(&l->iter, b);
}
static inline void btree_iter_node_set(struct btree_iter *iter,
@@ -570,7 +611,7 @@ static inline void btree_iter_node_set(struct btree_iter *iter,
EBUG_ON(b->lock.state.seq & 1);
iter->lock_seq[b->level] = b->lock.state.seq;
- iter->nodes[b->level] = b;
+ iter->l[b->level].b = b;
__btree_iter_init(iter, b);
}
@@ -632,10 +673,10 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
{
unsigned level = b->level;
- if (iter->nodes[level] == b) {
- iter->flags &= ~BTREE_ITER_UPTODATE;
+ if (iter->l[level].b == b) {
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
btree_node_unlock(iter, level);
- iter->nodes[level] = BTREE_ITER_NOT_END;
+ iter->l[level].b = BTREE_ITER_NOT_END;
}
}
@@ -674,7 +715,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
* that depth
*/
iter->level = depth_want;
- iter->nodes[iter->level] = NULL;
+ iter->l[iter->level].b = NULL;
return 0;
}
@@ -687,8 +728,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
b->level == iter->level &&
!race_fault())) {
for (i = 0; i < iter->level; i++)
- iter->nodes[i] = BTREE_ITER_NOT_END;
- iter->nodes[iter->level] = b;
+ iter->l[i].b = BTREE_ITER_NOT_END;
+ iter->l[iter->level].b = b;
mark_btree_node_locked(iter, iter->level, lock_type);
btree_iter_node_set(iter, b);
@@ -703,52 +744,57 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
noinline
static void btree_iter_prefetch(struct btree_iter *iter)
{
- struct btree *b = iter->nodes[iter->level + 1];
- struct btree_node_iter node_iter = iter->node_iters[iter->level + 1];
+ struct btree_iter_level *l = &iter->l[iter->level];
+ struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
- unsigned nr = iter->level ? 1 : 8;
- bool was_locked = btree_node_locked(iter, iter->level + 1);
+ unsigned nr = iter->level > 1 ? 1 : 8;
+ bool was_locked = btree_node_locked(iter, iter->level);
while (nr) {
- if (!bch2_btree_node_relock(iter, iter->level + 1))
+ if (!bch2_btree_node_relock(iter, iter->level))
return;
- bch2_btree_node_iter_advance(&node_iter, b);
- k = bch2_btree_node_iter_peek(&node_iter, b);
+ bch2_btree_node_iter_advance(&node_iter, l->b);
+ k = bch2_btree_node_iter_peek(&node_iter, l->b);
if (!k)
break;
- bch2_bkey_unpack(b, &tmp.k, k);
+ bch2_bkey_unpack(l->b, &tmp.k, k);
bch2_btree_node_prefetch(iter->c, &tmp.k,
- iter->level, iter->btree_id);
+ iter->level - 1,
+ iter->btree_id);
}
if (!was_locked)
- btree_node_unlock(iter, iter->level + 1);
+ btree_node_unlock(iter, iter->level);
}
static inline int btree_iter_down(struct btree_iter *iter)
{
+ struct btree_iter_level *l = &iter->l[iter->level];
struct btree *b;
- struct bkey_s_c k = __btree_iter_peek(iter);
unsigned level = iter->level - 1;
enum six_lock_type lock_type = btree_lock_want(iter, level);
BKEY_PADDED(k) tmp;
- bkey_reassemble(&tmp.k, k);
+ BUG_ON(!btree_node_locked(iter, iter->level));
+
+ bch2_bkey_unpack(l->b, &tmp.k,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type);
if (unlikely(IS_ERR(b)))
return PTR_ERR(b);
- iter->level = level;
mark_btree_node_locked(iter, level, lock_type);
btree_iter_node_set(iter, b);
if (iter->flags & BTREE_ITER_PREFETCH)
btree_iter_prefetch(iter);
+ iter->level = level;
+
return 0;
}
@@ -829,7 +875,7 @@ io_error:
BUG_ON(ret != -EIO);
iter->flags |= BTREE_ITER_ERROR;
- iter->nodes[iter->level] = NULL;
+ iter->l[iter->level].b = NULL;
goto out;
}
@@ -846,22 +892,22 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
unsigned depth_want = iter->level;
- if (unlikely(!iter->nodes[iter->level]))
+ if (unlikely(!iter->l[iter->level].b))
return 0;
- iter->flags &= ~(BTREE_ITER_UPTODATE|BTREE_ITER_AT_END_OF_LEAF);
+ iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
/* make sure we have all the intent locks we need - ugh */
- if (unlikely(iter->nodes[iter->level] &&
+ if (unlikely(iter->l[iter->level].b &&
iter->level + 1 < iter->locks_want)) {
unsigned i;
for (i = iter->level + 1;
- i < iter->locks_want && iter->nodes[i];
+ i < iter->locks_want && iter->l[i].b;
i++)
if (!bch2_btree_node_relock(iter, i)) {
while (iter->level < BTREE_MAX_DEPTH &&
- iter->nodes[iter->level] &&
+ iter->l[iter->level].b &&
iter->level + 1 < iter->locks_want)
btree_iter_up(iter);
break;
@@ -872,27 +918,31 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
* If the current node isn't locked, go up until we have a locked node
* or run out of nodes:
*/
- while (iter->level < BTREE_MAX_DEPTH &&
- iter->nodes[iter->level] &&
+ while (btree_iter_node(iter, iter->level) &&
!(is_btree_node(iter, iter->level) &&
bch2_btree_node_relock(iter, iter->level) &&
- btree_iter_pos_cmp(iter->pos,
- &iter->nodes[iter->level]->key.k,
- iter->flags & BTREE_ITER_IS_EXTENTS)))
+
+ /*
+ * XXX: correctly using BTREE_ITER_UPTODATE should make
+ * comparing iter->pos against node's key unnecessary
+ */
+ btree_iter_pos_in_node(iter, iter->l[iter->level].b)))
btree_iter_up(iter);
/*
* If we've got a btree node locked (i.e. we aren't about to relock the
* root) - advance its node iterator if necessary:
+ *
+ * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
*/
- if (iter->level < BTREE_MAX_DEPTH &&
- iter->nodes[iter->level]) {
+ if (btree_iter_node(iter, iter->level)) {
+ struct btree_iter_level *l = &iter->l[iter->level];
struct bkey_s_c k;
+ struct bkey u;
- while ((k = __btree_iter_peek_all(iter)).k &&
- !btree_iter_pos_cmp(iter->pos, k.k,
- iter->flags & BTREE_ITER_IS_EXTENTS))
- __btree_iter_advance(iter);
+ while ((k = __btree_iter_peek_all(iter, l, &u)).k &&
+ !btree_iter_pos_cmp(iter, k.k))
+ __btree_iter_advance(l);
}
/*
@@ -902,16 +952,17 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
* btree_iter_lock_root() comes next and that it can't fail
*/
while (iter->level > depth_want) {
- int ret = iter->nodes[iter->level]
+ int ret = btree_iter_node(iter, iter->level)
? btree_iter_down(iter)
: btree_iter_lock_root(iter, depth_want);
if (unlikely(ret)) {
iter->level = depth_want;
- iter->nodes[iter->level] = BTREE_ITER_NOT_END;
+ iter->l[iter->level].b = BTREE_ITER_NOT_END;
return ret;
}
}
+ iter->uptodate = BTREE_ITER_NEED_PEEK;
return 0;
}
@@ -919,6 +970,9 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
+ if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
+ return 0;
+
ret = __bch2_btree_iter_traverse(iter);
if (unlikely(ret))
ret = btree_iter_traverse_error(iter, ret);
@@ -939,7 +993,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
if (ret)
return ERR_PTR(ret);
- b = iter->nodes[iter->level];
+ b = iter->l[iter->level].b;
if (b) {
EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
@@ -958,16 +1012,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
btree_iter_up(iter);
- if (iter->level == BTREE_MAX_DEPTH ||
- !iter->nodes[iter->level])
+ if (!btree_iter_node(iter, iter->level))
return NULL;
/* parent node usually won't be locked: redo traversal if necessary */
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return NULL;
- b = iter->nodes[iter->level];
+ b = iter->l[iter->level].b;
if (!b)
return b;
@@ -980,11 +1034,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
: bkey_successor(iter->pos);
iter->level = depth;
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return NULL;
- b = iter->nodes[iter->level];
+ b = iter->l[iter->level].b;
}
iter->pos = b->key.k.p;
@@ -996,179 +1051,259 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
struct bkey_packed *k;
EBUG_ON(iter->level != 0);
EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
EBUG_ON(!btree_node_locked(iter, 0));
- EBUG_ON(bkey_cmp(new_pos, b->key.k.p) > 0);
+ EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
- while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
- !btree_iter_pos_cmp_packed(b, &new_pos, k,
+ iter->pos = new_pos;
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
+ !btree_iter_pos_cmp_packed(l->b, &iter->pos, k,
iter->flags & BTREE_ITER_IS_EXTENTS))
- bch2_btree_node_iter_advance(node_iter, b);
+ __btree_iter_advance(l);
- if (!k &&
- !btree_iter_pos_cmp(new_pos, &b->key.k,
- iter->flags & BTREE_ITER_IS_EXTENTS))
+ if (!k && btree_iter_pos_after_node(iter, l->b)) {
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
iter->flags |= BTREE_ITER_AT_END_OF_LEAF;
-
- iter->pos = new_pos;
- iter->flags &= ~BTREE_ITER_UPTODATE;
+ }
}
void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */
iter->pos = new_pos;
- iter->flags &= ~BTREE_ITER_UPTODATE;
-}
-
-void bch2_btree_iter_advance_pos(struct btree_iter *iter)
-{
- if (iter->flags & BTREE_ITER_UPTODATE &&
- !(iter->flags & BTREE_ITER_WITH_HOLES)) {
- struct bkey_s_c k;
-
- __btree_iter_advance(iter);
- k = __btree_iter_peek(iter);
- if (likely(k.k)) {
- iter->pos = bkey_start_pos(k.k);
- return;
- }
- }
-
- /*
- * We use iter->k instead of iter->pos for extents: iter->pos will be
- * equal to the start of the extent we returned, but we need to advance
- * to the end of the extent we returned.
- */
- bch2_btree_iter_set_pos(iter,
- btree_type_successor(iter->btree_id, iter->k.p));
-}
-
-/* XXX: expensive */
-void bch2_btree_iter_rewind(struct btree_iter *iter, struct bpos pos)
-{
- /* incapable of rewinding across nodes: */
- BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0);
- iter->pos = pos;
- iter->flags &= ~BTREE_ITER_UPTODATE;
- __btree_iter_init(iter, iter->nodes[iter->level]);
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
}
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
+ struct btree_iter_level *l = &iter->l[0];
struct bkey_s_c k;
int ret;
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
+ EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
- if (iter->flags & BTREE_ITER_UPTODATE) {
- struct btree *b = iter->nodes[0];
+ if (iter->uptodate == BTREE_ITER_UPTODATE) {
struct bkey_packed *k =
- __bch2_btree_node_iter_peek_all(&iter->node_iters[0], b);
+ __bch2_btree_node_iter_peek_all(&l->iter, l->b);
struct bkey_s_c ret = {
.k = &iter->k,
- .v = bkeyp_val(&b->format, k)
+ .v = bkeyp_val(&l->b->format, k)
};
EBUG_ON(!btree_node_locked(iter, 0));
if (debug_check_bkeys(iter->c))
- bch2_bkey_debugcheck(iter->c, b, ret);
+ bch2_bkey_debugcheck(iter->c, l->b, ret);
return ret;
}
+ if (iter->uptodate == BTREE_ITER_END)
+ return bkey_s_c_null;
+
while (1) {
ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret)) {
- iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ if (unlikely(ret))
return bkey_s_c_err(ret);
- }
- k = __btree_iter_peek(iter);
- if (likely(k.k)) {
- /*
- * iter->pos should always be equal to the key we just
- * returned - except extents can straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
- iter->pos = bkey_start_pos(k.k);
-
- iter->flags |= BTREE_ITER_UPTODATE;
- return k;
- }
-
- iter->pos = iter->nodes[0]->key.k.p;
+ k = __btree_iter_peek(iter, l);
+ if (likely(k.k))
+ break;
+ /* got to the end of the leaf, iterator needs to be traversed: */
+ iter->pos = l->b->key.k.p;
if (!bkey_cmp(iter->pos, POS_MAX)) {
- iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
- bch2_btree_iter_unlock(iter);
+ iter->uptodate = BTREE_ITER_END;
return bkey_s_c_null;
}
iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ }
+
+ /*
+ * iter->pos should always be equal to the key we just
+ * returned - except extents can straddle iter->pos:
+ */
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ iter->pos = bkey_start_pos(k.k);
+
+ iter->uptodate = BTREE_ITER_UPTODATE;
+ return k;
+}
+
+static noinline
+struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
+{
+ struct btree_iter_level *l = &iter->l[0];
+
+ iter->pos = l->b->key.k.p;
+ if (!bkey_cmp(iter->pos, POS_MAX)) {
+ iter->uptodate = BTREE_ITER_END;
+ return bkey_s_c_null;
}
+
+ iter->pos = btree_type_successor(iter->btree_id, iter->pos);
+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+
+ return bch2_btree_iter_peek(iter);
}
-struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
+ struct btree_iter_level *l = &iter->l[0];
+ struct bkey_packed *p;
struct bkey_s_c k;
- struct bkey n;
- int ret;
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
+ EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
- iter->flags &= ~BTREE_ITER_UPTODATE;
+ if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+ k = bch2_btree_iter_peek(iter);
+ if (IS_ERR_OR_NULL(k.k))
+ return k;
+ }
- while (1) {
+ do {
+ __btree_iter_advance(l);
+ p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ if (unlikely(!p))
+ return bch2_btree_iter_peek_next_leaf(iter);
+ } while (bkey_deleted(p));
+
+ k = __btree_iter_unpack(iter, l, &iter->k, p);
+
+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
+ iter->pos = bkey_start_pos(k.k);
+ return k;
+}
+
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+ struct btree_iter_level *l = &iter->l[0];
+ struct bkey_s_c k;
+ struct bkey n;
+ int ret;
+
+recheck:
+ while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
+ bkey_deleted(k.k) &&
+ bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
+ __btree_iter_advance(l);
+
+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
+ EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
+ EBUG_ON(bkey_deleted(k.k));
+ iter->uptodate = BTREE_ITER_UPTODATE;
+ return k;
+ }
+
+ /*
+ * If we got to the end of the node, check if we need to traverse to the
+ * next node:
+ */
+ if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret)) {
- iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
+ if (unlikely(ret))
return bkey_s_c_err(ret);
- }
- k = __btree_iter_peek_all(iter);
-recheck:
- if (!k.k || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) {
- /* hole */
- bkey_init(&n);
- n.p = iter->pos;
-
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
- if (n.p.offset == KEY_OFFSET_MAX) {
- iter->pos = bkey_successor(iter->pos);
- goto recheck;
- }
-
- if (!k.k)
- k.k = &iter->nodes[0]->key.k;
-
- bch2_key_resize(&n,
- min_t(u64, KEY_SIZE_MAX,
- (k.k->p.inode == n.p.inode
- ? bkey_start_offset(k.k)
- : KEY_OFFSET_MAX) -
- n.p.offset));
-
- EBUG_ON(!n.size);
+ goto recheck;
+ }
+
+ /* hole */
+ bkey_init(&n);
+ n.p = iter->pos;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ if (n.p.offset == KEY_OFFSET_MAX) {
+ if (n.p.inode == KEY_INODE_MAX) {
+ iter->uptodate = BTREE_ITER_END;
+ return bkey_s_c_null;
}
- iter->k = n;
- return (struct bkey_s_c) { &iter->k, NULL };
- } else if (!bkey_deleted(k.k)) {
- return k;
- } else {
- __btree_iter_advance(iter);
+ iter->pos = bkey_successor(iter->pos);
+ goto recheck;
}
+
+ if (!k.k)
+ k.k = &l->b->key.k;
+
+ bch2_key_resize(&n,
+ min_t(u64, KEY_SIZE_MAX,
+ (k.k->p.inode == n.p.inode
+ ? bkey_start_offset(k.k)
+ : KEY_OFFSET_MAX) -
+ n.p.offset));
+
+ EBUG_ON(!n.size);
}
+
+ iter->k = n;
+ iter->uptodate = BTREE_ITER_UPTODATE;
+ return (struct bkey_s_c) { &iter->k, NULL };
+}
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+ struct btree_iter_level *l = &iter->l[0];
+ int ret;
+
+ EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
+ (iter->btree_id == BTREE_ID_EXTENTS));
+ EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
+
+ if (iter->uptodate == BTREE_ITER_UPTODATE) {
+ struct bkey_s_c ret = { .k = &iter->k };
+
+ if (!bkey_deleted(&iter->k))
+ ret.v = bkeyp_val(&l->b->format,
+ __bch2_btree_node_iter_peek_all(&l->iter, l->b));
+
+ EBUG_ON(!btree_node_locked(iter, 0));
+
+ if (debug_check_bkeys(iter->c))
+ bch2_bkey_debugcheck(iter->c, l->b, ret);
+ return ret;
+ }
+
+ if (iter->uptodate == BTREE_ITER_END)
+ return bkey_s_c_null;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ return __bch2_btree_iter_peek_slot(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+{
+ if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
+ struct bkey_s_c k;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ if (btree_iter_err(k))
+ return k;
+ }
+
+ iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
+
+ if (!bkey_deleted(&iter->k))
+ __btree_iter_advance(&iter->l[0]);
+
+ return __bch2_btree_iter_peek_slot(iter);
}
void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
@@ -1176,19 +1311,23 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
unsigned locks_want, unsigned depth,
unsigned flags)
{
+ unsigned i;
+
EBUG_ON(depth >= BTREE_MAX_DEPTH);
EBUG_ON(locks_want > BTREE_MAX_DEPTH);
iter->c = c;
iter->pos = pos;
iter->flags = flags;
+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
iter->btree_id = btree_id;
iter->level = depth;
iter->locks_want = locks_want;
iter->nodes_locked = 0;
iter->nodes_intent_locked = 0;
- memset(iter->nodes, 0, sizeof(iter->nodes));
- iter->nodes[iter->level] = BTREE_ITER_NOT_END;
+ for (i = 0; i < ARRAY_SIZE(iter->l); i++)
+ iter->l[i].b = NULL;
+ iter->l[iter->level].b = BTREE_ITER_NOT_END;
iter->next = iter;
prefetch(c->btree_roots[btree_id].b);
@@ -1236,4 +1375,5 @@ void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
__bch2_btree_iter_unlock(dst);
memcpy(dst, src, offsetof(struct btree_iter, next));
dst->nodes_locked = dst->nodes_intent_locked = 0;
+ dst->uptodate = BTREE_ITER_NEED_RELOCK;
}
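When nothing overlaps the iterator's position, __bch2_btree_iter_peek_slot() above hands back a synthesized "hole" key; for extents it is resized to run from the current position up to the start of the next real key in the same inode, capped at KEY_SIZE_MAX. A minimal standalone sketch of that sizing arithmetic, with made-up numbers and an assumed stand-in value for KEY_SIZE_MAX:

/* Illustrative only: mirrors the hole sizing in __bch2_btree_iter_peek_slot().
 * KEY_SIZE_MAX below is an assumed stand-in, not the real bcachefs constant. */
#include <stdint.h>
#include <stdio.h>

#define KEY_SIZE_MAX	((uint64_t) 1 << 16)	/* assumption for illustration */

int main(void)
{
	uint64_t iter_pos	= 100;	/* hole starts at the iterator position */
	uint64_t next_start	= 164;	/* start of next real extent, same inode */

	/* min_t(u64, KEY_SIZE_MAX, bkey_start_offset(k.k) - n.p.offset) */
	uint64_t hole_size = next_start - iter_pos;

	if (hole_size > KEY_SIZE_MAX)
		hole_size = KEY_SIZE_MAX;

	printf("synthesized hole: offset %llu, %llu sectors\n",
	       (unsigned long long) iter_pos,
	       (unsigned long long) hole_size);
	return 0;
}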
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index eb196a3a..318b0424 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -1,23 +1,32 @@
#ifndef _BCACHEFS_BTREE_ITER_H
#define _BCACHEFS_BTREE_ITER_H
-#include "btree_types.h"
+#include <linux/dynamic_fault.h>
+#include "btree_types.h"
+#include "bset.h"
-#define BTREE_ITER_UPTODATE (1 << 0)
-#define BTREE_ITER_WITH_HOLES (1 << 1)
-#define BTREE_ITER_INTENT (1 << 2)
-#define BTREE_ITER_PREFETCH (1 << 3)
+#define BTREE_ITER_SLOTS (1 << 0)
+#define BTREE_ITER_INTENT (1 << 1)
+#define BTREE_ITER_PREFETCH (1 << 2)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-#define BTREE_ITER_IS_EXTENTS (1 << 4)
+#define BTREE_ITER_IS_EXTENTS (1 << 3)
/*
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
*/
-#define BTREE_ITER_AT_END_OF_LEAF (1 << 5)
-#define BTREE_ITER_ERROR (1 << 6)
+#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
+#define BTREE_ITER_ERROR (1 << 5)
+
+enum btree_iter_uptodate {
+ BTREE_ITER_UPTODATE = 0,
+ BTREE_ITER_NEED_PEEK = 1,
+ BTREE_ITER_NEED_RELOCK = 2,
+ BTREE_ITER_NEED_TRAVERSE = 3,
+ BTREE_ITER_END = 4,
+};
/*
* @pos - iterator's current position
@@ -31,32 +40,23 @@ struct btree_iter {
struct bpos pos;
u8 flags;
- enum btree_id btree_id:8;
+ unsigned uptodate:4;
+ enum btree_id btree_id:4;
unsigned level:4,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
- u32 lock_seq[BTREE_MAX_DEPTH];
+ struct btree_iter_level {
+ struct btree *b;
+ struct btree_node_iter iter;
+ } l[BTREE_MAX_DEPTH];
- /*
- * NOTE: Never set iter->nodes to NULL except in btree_iter_lock_root().
- *
- * This is because iter->nodes[iter->level] == NULL is how
- * btree_iter_next_node() knows that it's finished with a depth first
- * traversal. Just unlocking a node (with btree_node_unlock()) is fine,
- * and if you really don't want that node used again (e.g. btree_split()
- * freed it) decrementing lock_seq will cause bch2_btree_node_relock() to
- * always fail (but since freeing a btree node takes a write lock on the
- * node, which increments the node's lock seq, that's not actually
- * necessary in that example).
- */
- struct btree *nodes[BTREE_MAX_DEPTH];
- struct btree_node_iter node_iters[BTREE_MAX_DEPTH];
+ u32 lock_seq[BTREE_MAX_DEPTH];
/*
* Current unpacked key - so that bch2_btree_iter_next()/
- * bch2_btree_iter_next_with_holes() can correctly advance pos.
+ * bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
@@ -71,6 +71,24 @@ struct btree_iter {
struct btree_iter *next;
};
+static inline void btree_iter_set_dirty(struct btree_iter *iter,
+ enum btree_iter_uptodate u)
+{
+ iter->uptodate = max_t(unsigned, iter->uptodate, u);
+}
+
+static inline struct btree *btree_iter_node(struct btree_iter *iter,
+ unsigned level)
+{
+ return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
+}
+
+static inline struct btree *btree_node_parent(struct btree_iter *iter,
+ struct btree *b)
+{
+ return btree_iter_node(iter, b->level + 1);
+}
+
static inline bool btree_iter_linked(const struct btree_iter *iter)
{
return iter->next != iter;
@@ -102,7 +120,7 @@ __next_linked_btree_node(struct btree_iter *iter, struct btree *b,
* sequence number is incremented by taking and releasing write
* locks and is even when unlocked:
*/
- } while (linked->nodes[b->level] != b ||
+ } while (linked->l[b->level].b != b ||
linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1);
return linked;
@@ -159,11 +177,13 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
-void bch2_btree_iter_advance_pos(struct btree_iter *);
-void bch2_btree_iter_rewind(struct btree_iter *, struct bpos);
void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *,
enum btree_id, struct bpos,
@@ -175,8 +195,8 @@ static inline void bch2_btree_iter_init(struct btree_iter *iter,
{
__bch2_btree_iter_init(iter, c, btree_id, pos,
flags & BTREE_ITER_INTENT ? 1 : 0, 0,
- btree_id == BTREE_ID_EXTENTS
- ? BTREE_ITER_IS_EXTENTS : 0);
+ (btree_id == BTREE_ID_EXTENTS
+ ? BTREE_ITER_IS_EXTENTS : 0)|flags);
}
void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
@@ -225,16 +245,30 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_WITH_HOLES
- ? bch2_btree_iter_peek_with_holes(iter)
+ return flags & BTREE_ITER_SLOTS
+ ? bch2_btree_iter_peek_slot(iter)
: bch2_btree_iter_peek(iter);
}
+static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
+ unsigned flags)
+{
+ return flags & BTREE_ITER_SLOTS
+ ? bch2_btree_iter_next_slot(iter)
+ : bch2_btree_iter_next(iter);
+}
+
#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \
for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \
- (_start), (_flags)); \
- !IS_ERR_OR_NULL(((_k) = __bch2_btree_iter_peek(_iter, _flags)).k);\
- bch2_btree_iter_advance_pos(_iter))
+ (_start), (_flags)), \
+ (_k) = __bch2_btree_iter_peek(_iter, _flags); \
+ !IS_ERR_OR_NULL((_k).k); \
+ (_k) = __bch2_btree_iter_next(_iter, _flags))
+
+#define for_each_btree_key_continue(_iter, _flags, _k) \
+ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
+ !IS_ERR_OR_NULL((_k).k); \
+ (_k) = __bch2_btree_iter_next(_iter, _flags))
static inline int btree_iter_err(struct bkey_s_c k)
{
@@ -247,16 +281,10 @@ static inline int btree_iter_err(struct bkey_s_c k)
*/
static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
{
- struct btree_iter *linked;
-
if (need_resched()) {
- for_each_linked_btree_iter(iter, linked)
- bch2_btree_iter_unlock(linked);
bch2_btree_iter_unlock(iter);
schedule();
} else if (race_fault()) {
- for_each_linked_btree_iter(iter, linked)
- bch2_btree_iter_unlock(linked);
bch2_btree_iter_unlock(iter);
}
}
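The enum btree_iter_uptodate values above are ordered by how much work is needed before the iterator is usable again, and btree_iter_set_dirty() takes the max of the current and requested state, so an iterator can only be marked dirtier, never cleaner. A small standalone sketch of that one-way behaviour (max_t replaced by a plain comparison; enum values copied from the header):

#include <stdio.h>

enum btree_iter_uptodate {
	BTREE_ITER_UPTODATE		= 0,
	BTREE_ITER_NEED_PEEK		= 1,
	BTREE_ITER_NEED_RELOCK		= 2,
	BTREE_ITER_NEED_TRAVERSE	= 3,
	BTREE_ITER_END			= 4,
};

/* mirrors btree_iter_set_dirty(): the state only ever moves away from UPTODATE */
static void set_dirty(unsigned *uptodate, enum btree_iter_uptodate u)
{
	if ((unsigned) u > *uptodate)
		*uptodate = u;
}

int main(void)
{
	unsigned uptodate = BTREE_ITER_UPTODATE;

	set_dirty(&uptodate, BTREE_ITER_NEED_RELOCK);
	printf("%u\n", uptodate);	/* 2: dirtied */

	set_dirty(&uptodate, BTREE_ITER_NEED_PEEK);
	printf("%u\n", uptodate);	/* still 2: never moves back toward UPTODATE */
	return 0;
}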
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index ca2992ba..0581f44a 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -91,11 +91,10 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
int lock_type = btree_node_locked_type(iter, level);
- EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE);
EBUG_ON(level >= BTREE_MAX_DEPTH);
if (lock_type != BTREE_NODE_UNLOCKED)
- six_unlock_type(&iter->nodes[level]->lock, lock_type);
+ six_unlock_type(&iter->l[level].b->lock, lock_type);
mark_btree_node_unlocked(iter, level);
}
@@ -113,10 +112,21 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
__bch2_btree_node_lock(b, pos, level, iter, type);
}
-bool bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
+
+static inline bool bch2_btree_node_relock(struct btree_iter *iter,
+ unsigned level)
+{
+ return likely(btree_lock_want(iter, level) ==
+ btree_node_locked_type(iter, level)) ||
+ __bch2_btree_node_relock(iter, level);
+}
+
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
#endif /* _BCACHEFS_BTREE_LOCKING_H */
+
+
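The relock helper above is split into an inline fast path and an out-of-line slow path: if the lock type the iterator wants at this level is the type it already holds, __bch2_btree_node_relock() is never called. A standalone sketch of the same fast-path shape, with stand-in types that are illustrative only:

#include <stdbool.h>
#include <stdio.h>

enum lock_type { LOCK_NONE = -1, LOCK_READ = 0, LOCK_INTENT = 1 };

/* stand-in for __bch2_btree_node_relock(): the expensive out-of-line path */
static bool slow_relock(int level)
{
	printf("slow path taken for level %d\n", level);
	return false;
}

/* same shape as the new bch2_btree_node_relock() */
static bool node_relock(enum lock_type want, enum lock_type held, int level)
{
	return want == held || slow_relock(level);
}

int main(void)
{
	printf("%d\n", node_relock(LOCK_INTENT, LOCK_INTENT, 0));	/* 1: fast path */
	printf("%d\n", node_relock(LOCK_INTENT, LOCK_READ, 0));	/* slow path, then 0 */
	return 0;
}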
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index c7c29306..f357095d 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -120,8 +120,6 @@ int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *,
struct extent_insert_hook *, u64 *, int flags);
-int bch2_btree_update(struct bch_fs *, enum btree_id,
- struct bkey_i *, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, struct bversion,
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index e92a6a3a..c45527a2 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -393,6 +393,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
struct bch_fs *c = as->c;
struct btree *b;
+ BUG_ON(level >= BTREE_MAX_DEPTH);
BUG_ON(!as->reserve->nr);
b = as->reserve->b[--as->reserve->nr];
@@ -589,17 +590,18 @@ static void bch2_btree_update_free(struct btree_update *as)
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->list);
- mutex_unlock(&c->btree_interior_update_lock);
closure_debug_destroy(&as->cl);
mempool_free(as, &c->btree_interior_update_pool);
percpu_ref_put(&c->writes);
+
+ closure_wake_up(&c->btree_interior_update_wait);
+ mutex_unlock(&c->btree_interior_update_lock);
}
static void btree_update_nodes_reachable(struct closure *cl)
{
- struct btree_update *as =
- container_of(cl, struct btree_update, cl);
+ struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -609,8 +611,7 @@ static void btree_update_nodes_reachable(struct closure *cl)
while (as->nr_new_nodes) {
struct btree *b = as->new_nodes[--as->nr_new_nodes];
- BUG_ON(b->will_make_reachable &&
- (struct btree_update *) b->will_make_reachable != as);
+ BUG_ON(b->will_make_reachable != (unsigned long) as);
b->will_make_reachable = 0;
mutex_unlock(&c->btree_interior_update_lock);
@@ -634,10 +635,26 @@ static void btree_update_nodes_reachable(struct closure *cl)
bch2_btree_update_free(as);
}
+static void btree_update_wait_on_journal(struct closure *cl)
+{
+ struct btree_update *as = container_of(cl, struct btree_update, cl);
+ struct bch_fs *c = as->c;
+ int ret;
+
+ ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
+ if (ret < 0)
+ goto err;
+ if (!ret)
+ continue_at(cl, btree_update_wait_on_journal, system_wq);
+
+ bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
+err:
+ continue_at(cl, btree_update_nodes_reachable, system_wq);
+}
+
static void btree_update_nodes_written(struct closure *cl)
{
- struct btree_update *as =
- container_of(cl, struct btree_update, cl);
+ struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
struct btree *b;
@@ -648,6 +665,8 @@ static void btree_update_nodes_written(struct closure *cl)
*/
retry:
mutex_lock(&c->btree_interior_update_lock);
+ as->nodes_written = true;
+
switch (as->mode) {
case BTREE_INTERIOR_NO_UPDATE:
BUG();
@@ -672,7 +691,7 @@ retry:
* b->write_blocked prevented it from being written, so
* write it now if it needs to be written:
*/
- bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
+ bch2_btree_node_write_cond(c, b, true);
six_unlock_read(&b->lock);
break;
@@ -732,13 +751,10 @@ retry:
*/
bch2_journal_pin_drop(&c->journal, &as->journal);
- /*
- * And, do a journal write to write the pointer to the new root,
- * then wait for it to complete before freeing the nodes we
- * replaced:
- */
- bch2_journal_meta_async(&c->journal, cl);
- break;
+ as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
+
+ btree_update_wait_on_journal(cl);
+ return;
}
continue_at(cl, btree_update_nodes_reachable, system_wq);
@@ -883,13 +899,15 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
unsigned long v;
unsigned i;
- if (!b->will_make_reachable)
- return;
-
mutex_lock(&c->btree_interior_update_lock);
v = xchg(&b->will_make_reachable, 0);
-
as = (struct btree_update *) (v & ~1UL);
+
+ if (!as) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ return;
+ }
+
for (i = 0; i < as->nr_new_nodes; i++)
if (as->new_nodes[i] == b)
goto found;
@@ -1044,7 +1062,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
bch2_keylist_init(&as->parent_keys, as->inline_keys);
mutex_lock(&c->btree_interior_update_lock);
- list_add(&as->list, &c->btree_interior_update_list);
+ list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
return as;
@@ -1341,7 +1359,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
struct btree_iter *iter, struct keylist *keys)
{
struct bch_fs *c = as->c;
- struct btree *parent = iter->nodes[b->level + 1];
+ struct btree *parent = btree_node_parent(iter, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
u64 start_time = local_clock();
@@ -1437,30 +1455,17 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_time_stats_update(&c->btree_split_time, start_time);
}
-static int
+static void
bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
struct btree_iter *iter, struct keylist *keys)
{
- struct bch_fs *c = as->c;
struct btree_iter *linked;
struct btree_node_iter node_iter;
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
- BUG_ON(!b->level);
- BUG_ON(!as || as->b);
- bch2_verify_keylist_sorted(keys);
-
- bch2_btree_node_lock_for_insert(c, b, iter);
-
- if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
- bch2_btree_node_unlock_write(b, iter);
- return -1;
- }
-
/* Don't screw up @iter's position: */
- node_iter = iter->node_iters[b->level];
+ node_iter = iter->l[b->level].iter;
/*
* btree_split(), btree_gc_coalesce() will insert keys before
@@ -1481,19 +1486,10 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
btree_update_updated_node(as, b);
for_each_linked_btree_node(iter, b, linked)
- bch2_btree_node_iter_peek(&linked->node_iters[b->level],
- b);
- bch2_btree_node_iter_peek(&iter->node_iters[b->level], b);
+ bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
+ bch2_btree_node_iter_peek(&iter->l[b->level].iter, b);
bch2_btree_iter_verify(iter, b);
-
- if (bch2_maybe_compact_whiteouts(c, b))
- bch2_btree_iter_reinit_node(iter, b);
-
- bch2_btree_node_unlock_write(b, iter);
-
- btree_node_interior_verify(b);
- return 0;
}
/**
@@ -1511,17 +1507,54 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
struct btree_iter *iter, struct keylist *keys)
{
+ struct bch_fs *c = as->c;
+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+ int old_live_u64s = b->nr.live_u64s;
+ int live_u64s_added, u64s_added;
+
+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
BUG_ON(!b->level);
+ BUG_ON(!as || as->b);
+ bch2_verify_keylist_sorted(keys);
+
+ if (as->must_rewrite)
+ goto split;
+
+ bch2_btree_node_lock_for_insert(c, b, iter);
+
+ if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
+ bch2_btree_node_unlock_write(b, iter);
+ goto split;
+ }
+
+ bch2_btree_insert_keys_interior(as, b, iter, keys);
+
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+ if (u64s_added > live_u64s_added &&
+ bch2_maybe_compact_whiteouts(c, b))
+ bch2_btree_iter_reinit_node(iter, b);
+
+ bch2_btree_node_unlock_write(b, iter);
+
+ btree_node_interior_verify(b);
- if ((as->flags & BTREE_INTERIOR_UPDATE_MUST_REWRITE) ||
- bch2_btree_insert_keys_interior(as, b, iter, keys))
- btree_split(as, b, iter, keys);
+ bch2_foreground_maybe_merge(c, iter, b->level);
+ return;
+split:
+ btree_split(as, b, iter, keys);
}
int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
unsigned btree_reserve_flags)
{
- struct btree *b = iter->nodes[0];
+ struct btree *b = iter->l[0].b;
struct btree_update *as;
struct closure cl;
int ret = 0;
@@ -1577,9 +1610,10 @@ out:
return ret;
}
-int bch2_foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
- enum btree_node_sibling sib)
+int __bch2_foreground_maybe_merge(struct bch_fs *c,
+ struct btree_iter *iter,
+ unsigned level,
+ enum btree_node_sibling sib)
{
struct btree_update *as;
struct bkey_format_state new_s;
@@ -1592,12 +1626,12 @@ int bch2_foreground_maybe_merge(struct bch_fs *c,
closure_init_stack(&cl);
retry:
- if (!bch2_btree_node_relock(iter, iter->level))
+ if (!bch2_btree_node_relock(iter, level))
return 0;
- b = iter->nodes[iter->level];
+ b = iter->l[level].b;
- parent = iter->nodes[b->level + 1];
+ parent = btree_node_parent(iter, b);
if (!parent)
return 0;
@@ -1734,7 +1768,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
struct btree *b, unsigned flags,
struct closure *cl)
{
- struct btree *n, *parent = iter->nodes[b->level + 1];
+ struct btree *n, *parent = btree_node_parent(iter, b);
struct btree_update *as;
as = bch2_btree_update_start(c, iter->btree_id,
@@ -1834,7 +1868,6 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
struct bkey_i_extent *new_key)
{
struct btree *parent;
- bool must_rewrite_parent = false;
int ret;
/*
@@ -1859,14 +1892,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
*/
if (b->will_make_reachable)
- must_rewrite_parent = true;
-
- if (must_rewrite_parent)
- as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
+ as->must_rewrite = true;
btree_interior_update_add_node_reference(as, b);
- parent = iter->nodes[b->level + 1];
+ parent = btree_node_parent(iter, b);
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i);
@@ -2066,3 +2096,21 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
}
+
+ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
+{
+ char *out = buf, *end = buf + PAGE_SIZE;
+ struct btree_update *as;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_for_each_entry(as, &c->btree_interior_update_list, list)
+ out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n",
+ as,
+ as->mode,
+ as->nodes_written,
+ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
+ bch2_journal_pin_seq(&c->journal, &as->journal));
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ return out - buf;
+}
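Each in-flight interior update is printed as one line by bch2_btree_updates_print() above: the struct address, its mode, whether its nodes have been written, the closure's remaining reference count, and the journal sequence its pin holds. An illustrative line (all values made up) would read:

	ffff8880341d2000 m 2 w 1 r 3 j 8812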
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 23ee3980..0b58ccc9 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -2,6 +2,7 @@
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#include "btree_cache.h"
+#include "btree_locking.h"
#include "btree_update.h"
struct btree_reserve {
@@ -61,9 +62,12 @@ struct btree_update {
BTREE_INTERIOR_UPDATING_ROOT,
BTREE_INTERIOR_UPDATING_AS,
} mode;
+
+ unsigned must_rewrite:1;
+ unsigned nodes_written:1;
+
enum btree_id btree_id;
- unsigned flags;
struct btree_reserve *reserve;
/*
@@ -120,8 +124,6 @@ struct btree_update {
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
-#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0)
-
#define for_each_pending_btree_node_free(c, as, p) \
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
@@ -146,8 +148,34 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *,
void bch2_btree_insert_node(struct btree_update *, struct btree *,
struct btree_iter *, struct keylist *);
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
-int bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
- enum btree_node_sibling);
+
+int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+ unsigned, enum btree_node_sibling);
+
+static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
+ struct btree_iter *iter,
+ unsigned level,
+ enum btree_node_sibling sib)
+{
+ struct btree *b;
+
+ if (!bch2_btree_node_relock(iter, level))
+ return 0;
+
+ b = iter->l[level].b;
+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
+ return 0;
+
+ return __bch2_foreground_maybe_merge(c, iter, level, sib);
+}
+
+static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
+ struct btree_iter *iter,
+ unsigned level)
+{
+ bch2_foreground_maybe_merge_sibling(c, iter, level, btree_prev_sib);
+ bch2_foreground_maybe_merge_sibling(c, iter, level, btree_next_sib);
+}
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
@@ -220,16 +248,17 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct bset *i = btree_bset_last(b);
unsigned offset = max_t(unsigned, b->written << 9,
bset_byte_offset(b, vstruct_end(i)));
- ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
+ ssize_t remaining_space = (ssize_t) btree_bytes(c) - (ssize_t)
(offset + sizeof(struct btree_node_entry) +
b->whiteout_u64s * sizeof(u64) +
b->uncompacted_whiteout_u64s * sizeof(u64));
EBUG_ON(offset > btree_bytes(c));
- if ((unlikely(bset_written(b, i)) && n > 0) ||
+ if ((unlikely(bset_written(b, i)) &&
+ remaining_space > block_bytes(c)) ||
(unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
- n > btree_write_set_buffer(b)))
+ remaining_space > btree_write_set_buffer(b)))
return (void *) b->data + offset;
return NULL;
@@ -312,4 +341,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
return u64s <= trans->journal_res.u64s;
}
+ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
+
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index dbf70569..4b252b6d 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -39,9 +39,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
t = bch2_bkey_to_bset(b, k);
if (bset_unwritten(b, bset(b, t)) &&
- bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
- BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
-
+ bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
+ !bkey_whiteout(&insert->k)) {
k->type = insert->k.type;
memcpy_u64s(bkeyp_val(f, k), &insert->v,
bkey_val_u64s(&insert->k));
@@ -131,28 +130,21 @@ void bch2_btree_journal_key(struct btree_insert *trans,
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
- struct btree *b = iter->nodes[0];
+ struct btree *b = iter->l[0].b;
struct btree_write *w = btree_current_write(b);
EBUG_ON(iter->level || b->level);
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
- if (!journal_pin_active(&w->journal))
- bch2_journal_pin_add(j, &trans->journal_res,
- &w->journal,
- btree_node_write_idx(b) == 0
- ? btree_node_flush0
- : btree_node_flush1);
-
- if (trans->journal_res.ref) {
+ if (likely(trans->journal_res.ref)) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
/* ick */
insert->k.needs_whiteout = false;
bch2_journal_add_keys(j, &trans->journal_res,
- b->btree_id, insert);
+ iter->btree_id, insert);
insert->k.needs_whiteout = needs_whiteout;
bch2_journal_set_has_inode(j, &trans->journal_res,
@@ -163,7 +155,14 @@ void bch2_btree_journal_key(struct btree_insert *trans,
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}
- if (!btree_node_dirty(b))
+ if (unlikely(!journal_pin_active(&w->journal)))
+ bch2_journal_pin_add(j, &trans->journal_res,
+ &w->journal,
+ btree_node_write_idx(b) == 0
+ ? btree_node_flush0
+ : btree_node_flush1);
+
+ if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
}
@@ -172,13 +171,13 @@ bch2_insert_fixup_key(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
struct btree_iter *iter = insert->iter;
+ struct btree_iter_level *l = &iter->l[0];
- BUG_ON(iter->level);
- BUG_ON(insert->k->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, iter->nodes[0]));
+ EBUG_ON(iter->level);
+ EBUG_ON(insert->k->k.u64s >
+ bch_btree_keys_u64s_remaining(trans->c, l->b));
- if (bch2_btree_bset_insert_key(iter, iter->nodes[0],
- &iter->node_iters[0],
+ if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
insert->k))
bch2_btree_journal_key(trans, iter, insert->k);
@@ -186,38 +185,22 @@ bch2_insert_fixup_key(struct btree_insert *trans,
return BTREE_INSERT_OK;
}
-static int inline foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
- enum btree_node_sibling sib)
-{
- struct btree *b;
-
- if (!btree_node_locked(iter, iter->level))
- return 0;
-
- b = iter->nodes[iter->level];
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
- return 0;
-
- return bch2_foreground_maybe_merge(c, iter, sib);
-}
-
/**
* btree_insert_key - insert one key into a leaf node
*/
static enum btree_insert_ret
-btree_insert_key(struct btree_insert *trans,
- struct btree_insert_entry *insert)
+btree_insert_key_leaf(struct btree_insert *trans,
+ struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
- struct btree *b = iter->nodes[0];
+ struct btree *b = iter->l[0].b;
enum btree_insert_ret ret;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- iter->flags &= ~BTREE_ITER_UPTODATE;
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
ret = !btree_node_is_extents(b)
? bch2_insert_fixup_key(trans, insert)
@@ -247,7 +230,7 @@ static bool same_leaf_as_prev(struct btree_insert *trans,
* point to the same leaf node they'll always be adjacent now:
*/
return i != trans->entries &&
- i[0].iter->nodes[0] == i[-1].iter->nodes[0];
+ i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
#define trans_for_each_entry(trans, i) \
@@ -276,7 +259,8 @@ static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_lock_for_insert(c, i->iter->nodes[0], i->iter);
+ bch2_btree_node_lock_for_insert(c, i->iter->l[0].b,
+ i->iter);
}
static void multi_unlock_write(struct btree_insert *trans)
@@ -285,15 +269,18 @@ static void multi_unlock_write(struct btree_insert *trans)
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write(i->iter->nodes[0], i->iter);
+ bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
}
-static int btree_trans_entry_cmp(const void *_l, const void *_r)
+static inline void btree_trans_sort(struct btree_insert *trans)
{
- const struct btree_insert_entry *l = _l;
- const struct btree_insert_entry *r = _r;
+ int i, end = trans->nr;
- return btree_iter_cmp(l->iter, r->iter);
+ while (--end > 0)
+ for (i = 0; i < end; i++)
+ if (btree_iter_cmp(trans->entries[i].iter,
+ trans->entries[i + 1].iter) > 0)
+ swap(trans->entries[i], trans->entries[i + 1]);
}
/* Normal update interface: */
@@ -326,16 +313,22 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
bkey_i_to_s_c(i->k)));
}
- sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
- btree_trans_entry_cmp, NULL);
+ btree_trans_sort(trans);
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
retry_locks:
ret = -EINTR;
- trans_for_each_entry(trans, i)
+ trans_for_each_entry(trans, i) {
if (!bch2_btree_iter_set_locks_want(i->iter, 1))
goto err;
+
+ if (i->iter->uptodate == BTREE_ITER_NEED_TRAVERSE) {
+ ret = bch2_btree_iter_traverse(i->iter);
+ if (ret)
+ goto err;
+ }
+ }
retry:
trans->did_work = false;
u64s = 0;
@@ -375,7 +368,7 @@ retry:
if (!i->done) {
u64s += i->k->k.u64s + i->extra_res;
if (!bch2_btree_node_insert_fits(c,
- i->iter->nodes[0], u64s)) {
+ i->iter->l[0].b, u64s)) {
split = i->iter;
goto unlock;
}
@@ -390,7 +383,7 @@ retry:
if (i->done)
continue;
- switch (btree_insert_key(trans, i)) {
+ switch (btree_insert_key_leaf(trans, i)) {
case BTREE_INSERT_OK:
i->done = true;
break;
@@ -427,19 +420,19 @@ unlock:
if (ret)
goto err;
- /*
- * hack: iterators are inconsistent when they hit end of leaf, until
- * traversed again
- */
trans_for_each_entry(trans, i)
if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF)
goto out;
- trans_for_each_entry(trans, i)
- if (!same_leaf_as_prev(trans, i)) {
- foreground_maybe_merge(c, i->iter, btree_prev_sib);
- foreground_maybe_merge(c, i->iter, btree_next_sib);
- }
+ trans_for_each_entry(trans, i) {
+ /*
+ * iterators are inconsistent when they hit end of leaf, until
+ * traversed again
+ */
+ if (i->iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
+ !same_leaf_as_prev(trans, i))
+ bch2_foreground_maybe_merge(c, i->iter, 0);
+ }
out:
/* make sure we didn't lose an error: */
if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
@@ -513,12 +506,7 @@ int bch2_btree_insert_list_at(struct btree_iter *iter,
bch2_verify_keylist_sorted(keys);
while (!bch2_keylist_empty(keys)) {
- /* need to traverse between each insert */
- int ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- ret = bch2_btree_insert_at(iter->c, disk_res, hook,
+ int ret = bch2_btree_insert_at(iter->c, disk_res, hook,
journal_seq, flags,
BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
if (ret)
@@ -544,51 +532,14 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
u64 *journal_seq, int flags)
{
struct btree_iter iter;
- int ret, ret2;
+ int ret;
bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
-
- ret = bch2_btree_iter_traverse(&iter);
- if (unlikely(ret))
- goto out;
-
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
BTREE_INSERT_ENTRY(&iter, k));
-out: ret2 = bch2_btree_iter_unlock(&iter);
-
- return ret ?: ret2;
-}
-
-/**
- * bch_btree_update - like bch2_btree_insert(), but asserts that we're
- * overwriting an existing key
- */
-int bch2_btree_update(struct bch_fs *c, enum btree_id id,
- struct bkey_i *k, u64 *journal_seq)
-{
- struct btree_iter iter;
- struct bkey_s_c u;
- int ret;
-
- EBUG_ON(id == BTREE_ID_EXTENTS);
-
- bch2_btree_iter_init(&iter, c, id, k->k.p,
- BTREE_ITER_INTENT);
-
- u = bch2_btree_iter_peek_with_holes(&iter);
- ret = btree_iter_err(u);
- if (ret)
- return ret;
-
- if (bkey_deleted(u.k)) {
- bch2_btree_iter_unlock(&iter);
- return -ENOENT;
- }
-
- ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, 0,
- BTREE_INSERT_ENTRY(&iter, k));
bch2_btree_iter_unlock(&iter);
+
return ret;
}
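btree_trans_sort() above replaces the sort() call (and its comparison callback) with a simple adjacent-swap pass, which is cheap for the handful of entries a transaction carries. A standalone sketch of the same loop structure on plain integers:

#include <stdio.h>

#define swap(a, b) do { int __t = (a); (a) = (b); (b) = __t; } while (0)

/* same shape as btree_trans_sort(): bubble the largest remaining element to
 * the end of the unsorted region, shrinking that region by one each pass */
static void trans_sort(int *v, int nr)
{
	int i, end = nr;

	while (--end > 0)
		for (i = 0; i < end; i++)
			if (v[i] > v[i + 1])
				swap(v[i], v[i + 1]);
}

int main(void)
{
	int v[] = { 4, 1, 3, 2 };
	int i;

	trans_sort(v, 4);
	for (i = 0; i < 4; i++)
		printf("%d ", v[i]);	/* 1 2 3 4 */
	printf("\n");
	return 0;
}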
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 0f090ca5..00e0de16 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -59,7 +59,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
return;
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
- bio->bi_bdev = pick.ca->disk_sb.bdev;
+ bio_set_dev(bio, pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
@@ -212,10 +212,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
return i->ret;
- bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
+ for_each_btree_key(&iter, i->c, i->id, i->from,
+ BTREE_ITER_PREFETCH, k) {
+ i->from = iter.pos;
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(err = btree_iter_err(k))) {
bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
i->buf, sizeof(i->buf), k);
i->bytes = strlen(i->buf);
@@ -223,9 +223,6 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
i->buf[i->bytes] = '\n';
i->bytes++;
- bch2_btree_iter_advance_pos(&iter);
- i->from = iter.pos;
-
err = flush_buf(i);
if (err)
break;
@@ -233,7 +230,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
break;
}
- bch2_btree_iter_unlock(&iter);
+ err = bch2_btree_iter_unlock(&iter) ?: err;
return err < 0 ? err : i->ret;
}
@@ -318,26 +315,27 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(err = btree_iter_err(k))) {
- struct btree *b = iter.nodes[0];
- struct btree_node_iter *node_iter = &iter.node_iters[0];
- struct bkey_packed *_k = bch2_btree_node_iter_peek(node_iter, b);
+ struct btree_iter_level *l = &iter.l[0];
+ struct bkey_packed *_k =
+ bch2_btree_node_iter_peek(&l->iter, l->b);
- if (iter.nodes[0] != prev_node) {
- i->bytes = bch2_print_btree_node(i->c, b, i->buf,
+ if (l->b != prev_node) {
+ i->bytes = bch2_print_btree_node(i->c, l->b, i->buf,
sizeof(i->buf));
err = flush_buf(i);
if (err)
break;
}
- prev_node = iter.nodes[0];
+ prev_node = l->b;
- i->bytes = bch2_bkey_print_bfloat(b, _k, i->buf, sizeof(i->buf));
+ i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf,
+ sizeof(i->buf));
err = flush_buf(i);
if (err)
break;
- bch2_btree_iter_advance_pos(&iter);
+ bch2_btree_iter_next(&iter);
i->from = iter.pos;
err = flush_buf(i);
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index a900d397..6bdece3a 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -213,12 +213,13 @@ int bch2_dirent_rename(struct bch_fs *c,
int ret = -ENOMEM;
bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos,
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
bch2_btree_iter_init(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos,
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
bch2_btree_iter_link(&src_iter, &dst_iter);
- bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos, 0);
+ bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos,
+ BTREE_ITER_SLOTS);
bch2_btree_iter_link(&src_iter, &whiteout_iter);
if (mode == BCH_RENAME_EXCHANGE) {
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 7e7c38ef..c2469167 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -1022,10 +1022,10 @@ struct extent_insert_state {
};
static void bch2_add_sectors(struct extent_insert_state *s,
- struct bkey_s_c k, u64 offset, s64 sectors)
+ struct bkey_s_c k, u64 offset, s64 sectors)
{
struct bch_fs *c = s->trans->c;
- struct btree *b = s->insert->iter->nodes[0];
+ struct btree *b = s->insert->iter->l[0].b;
EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
@@ -1079,7 +1079,7 @@ static bool bch2_extent_merge_inline(struct bch_fs *,
static enum btree_insert_ret
extent_insert_should_stop(struct extent_insert_state *s)
{
- struct btree *b = s->insert->iter->nodes[0];
+ struct btree *b = s->insert->iter->l[0].b;
/*
* Check if we have sufficient space in both the btree node and the
@@ -1101,19 +1101,18 @@ extent_insert_should_stop(struct extent_insert_state *s)
static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
- struct bset_tree *t = bset_tree_last(b);
+ struct btree_iter_level *l = &iter->l[0];
+ struct bset_tree *t = bset_tree_last(l->b);
struct bkey_packed *where =
- bch2_btree_node_iter_bset_pos(node_iter, b, t);
- struct bkey_packed *prev = bch2_bkey_prev(b, t, where);
+ bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
+ struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where);
struct bkey_packed *next_live_key = where;
unsigned clobber_u64s;
if (prev)
where = bkey_next(prev);
- while (next_live_key != btree_bkey_last(b, t) &&
+ while (next_live_key != btree_bkey_last(l->b, t) &&
bkey_deleted(next_live_key))
next_live_key = bkey_next(next_live_key);
@@ -1127,18 +1126,19 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
goto drop_deleted_keys;
- if (next_live_key != btree_bkey_last(b, t) &&
+ if (next_live_key != btree_bkey_last(l->b, t) &&
bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
next_live_key, false))
goto drop_deleted_keys;
- bch2_bset_insert(b, node_iter, where, insert, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter, t, where,
+ bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
clobber_u64s, where->u64s);
return;
drop_deleted_keys:
- bch2_bset_delete(b, where, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0);
+ bch2_bset_delete(l->b, where, clobber_u64s);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
+ where, clobber_u64s, 0);
}
static void extent_insert_committed(struct extent_insert_state *s)
@@ -1181,8 +1181,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
}
if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, iter->nodes[iter->level],
- bkey_i_to_s_c(&split.k));
+ bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k));
bch2_btree_journal_key(s->trans, iter, &split.k);
@@ -1224,7 +1223,7 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
static enum btree_insert_ret
extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
{
- struct btree *b = s->insert->iter->nodes[0];
+ struct btree *b = s->insert->iter->l[0].b;
struct bpos next_pos = bpos_min(s->insert->k->k.p,
k.k ? k.k->p : b->key.k.p);
enum btree_insert_ret ret;
@@ -1287,8 +1286,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
{
struct bch_fs *c = s->trans->c;
struct btree_iter *iter = s->insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
enum btree_insert_ret ret;
switch (overlap) {
@@ -1397,8 +1397,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
{
struct bch_fs *c = s->trans->c;
struct btree_iter *iter = s->insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
struct bkey_packed *_k;
struct bkey unpacked;
struct bkey_i *insert = s->insert->k;
@@ -1544,8 +1545,9 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
struct bkey_packed *_k;
struct bkey unpacked;
enum btree_insert_ret ret = BTREE_INSERT_OK;
@@ -2176,8 +2178,7 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter,
struct bkey_packed *k, struct bkey uk,
bool check, bool could_pack)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
BUG_ON(!bkey_deleted(k));
@@ -2185,10 +2186,10 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter,
return !bkey_packed(k) || could_pack;
} else {
uk.p = new_pos;
- extent_save(b, node_iter, k, &uk);
- bch2_bset_fix_invalidated_key(b, t, k);
- bch2_btree_node_iter_fix(iter, b, node_iter, t,
- k, k->u64s, k->u64s);
+ extent_save(l->b, &l->iter, k, &uk);
+ bch2_bset_fix_invalidated_key(l->b, t, k);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
+ k, k->u64s, k->u64s);
return true;
}
}
@@ -2196,8 +2197,9 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter,
static bool extent_merge_do_overlapping(struct btree_iter *iter,
struct bkey *m, bool back_merge)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
struct bset_tree *t;
struct bkey_packed *k;
struct bkey uk;
@@ -2288,8 +2290,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
struct bkey_packed *r,
bool back_merge)
{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
+ struct btree *b = iter->l[0].b;
+ struct btree_node_iter *node_iter = &iter->l[0].iter;
const struct bkey_format *f = &b->format;
struct bset_tree *t = bset_tree_last(b);
struct bkey_packed *m;
@@ -2332,8 +2334,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
if (back_merge)
bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
- bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
- t, m, m->u64s, m->u64s);
+ bch2_btree_node_iter_fix(iter, b, node_iter,
+ t, m, m->u64s, m->u64s);
if (!back_merge)
bkey_copy(packed_to_bkey(l), &li.k);
@@ -2350,8 +2352,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
extent_i_save(b, m, &li.k);
bch2_bset_fix_invalidated_key(b, t, m);
- bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
- t, m, m->u64s, m->u64s);
+ bch2_btree_node_iter_fix(iter, b, node_iter,
+ t, m, m->u64s, m->u64s);
return true;
default:
BUG();
@@ -2368,7 +2370,7 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
end.offset += size;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
- BTREE_ITER_WITH_HOLES, k) {
+ BTREE_ITER_SLOTS, k) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h
index 08739d26..789ae663 100644
--- a/libbcachefs/fifo.h
+++ b/libbcachefs/fifo.h
@@ -56,6 +56,11 @@ do { \
#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+#define fifo_entry_idx_abs(fifo, p) \
+ ((((p) >= &fifo_peek_front(fifo) \
+ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \
+ (((p) - (fifo)->data)))
+
#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]
@@ -103,13 +108,15 @@ do { \
#define fifo_peek(fifo) fifo_peek_front(fifo)
#define fifo_for_each_entry(_entry, _fifo, _iter) \
- for (_iter = (_fifo)->front; \
+ for (((void) (&(_iter) == &(_fifo)->front)), \
+ _iter = (_fifo)->front; \
((_iter != (_fifo)->back) && \
(_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
_iter++)
#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
- for (_iter = (_fifo)->front; \
+ for (((void) (&(_iter) == &(_fifo)->front)), \
+ _iter = (_fifo)->front; \
((_iter != (_fifo)->back) && \
(_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
_iter++)
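fifo_entry_idx_abs() above recovers an entry's absolute, monotonically increasing index from a pointer into the fifo's backing array: the low bits come from the array slot, and the high "wrap" bits are borrowed from front when the slot sits at or after front's slot, otherwise from back. A standalone sketch of the same arithmetic using a stand-in fifo type (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* stand-in: front/back are monotonically increasing counters and the entry
 * with absolute index i lives in data[i & mask]; size is a power of two */
struct fifo {
	uint64_t	front, back;
	uint64_t	mask;
	int		data[8];
};

/* same arithmetic as fifo_entry_idx_abs() */
static uint64_t entry_idx_abs(struct fifo *f, int *p)
{
	uint64_t base = p >= &f->data[f->front & f->mask] ? f->front : f->back;

	return (base & ~f->mask) + (uint64_t) (p - f->data);
}

int main(void)
{
	/* entries 13..17 occupy slots 5, 6, 7, 0, 1 */
	struct fifo f = { .front = 13, .back = 18, .mask = 7 };

	printf("%llu\n", (unsigned long long) entry_idx_abs(&f, &f.data[6]));	/* 14 */
	printf("%llu\n", (unsigned long long) entry_idx_abs(&f, &f.data[0]));	/* 16 */
	return 0;
}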
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 7693520d..1bffddf6 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -400,17 +400,13 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BTREE_ITER_INTENT);
bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES,
POS(extent_iter.pos.inode, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
hook.op = op;
hook.hook.fn = bchfs_extent_update_hook;
hook.need_inode_update = false;
do {
- ret = bch2_btree_iter_traverse(&extent_iter);
- if (ret)
- goto err;
-
/* XXX: inode->i_size locking */
k = bch2_keylist_front(keys);
if (min(k->k.p.offset << 9, op->new_i_size) >
@@ -423,7 +419,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
if (!btree_iter_linked(&inode_iter))
bch2_btree_iter_link(&extent_iter, &inode_iter);
- inode = bch2_btree_iter_peek_with_holes(&inode_iter);
+ inode = bch2_btree_iter_peek_slot(&inode_iter);
if ((ret = btree_iter_err(inode)))
goto err;
@@ -994,7 +990,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
- k = bch2_btree_iter_peek_with_holes(iter);
+ k = bch2_btree_iter_peek_slot(iter);
BUG_ON(!k.k);
if (IS_ERR(k.k)) {
@@ -1067,7 +1063,8 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
.mapping = mapping, .nr_pages = nr_pages
};
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_SLOTS);
INIT_LIST_HEAD(&readpages_iter.pages);
list_add(&readpages_iter.pages, pages);
@@ -1107,7 +1104,8 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
bio_add_page_contig(&rbio->bio, page);
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_SLOTS);
bchfs_read(c, &iter, rbio, inum, NULL);
}
@@ -2236,9 +2234,10 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS,
POS(inode->v.i_ino, offset >> 9),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
/* position will be set from dst iter's position: */
- bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, 0);
+ bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_SLOTS);
bch2_btree_iter_link(&src, &dst);
/*
@@ -2276,11 +2275,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
bch2_btree_iter_set_pos(&src,
POS(dst.pos.inode, dst.pos.offset + (len >> 9)));
- ret = bch2_btree_iter_traverse(&dst);
- if (ret)
- goto btree_iter_err;
-
- k = bch2_btree_iter_peek_with_holes(&src);
+ k = bch2_btree_iter_peek_slot(&src);
if ((ret = btree_iter_err(k)))
goto btree_iter_err;
@@ -2356,7 +2351,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
int ret;
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
@@ -2403,20 +2398,20 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
struct bkey_i_reservation reservation;
struct bkey_s_c k;
- k = bch2_btree_iter_peek_with_holes(&iter);
+ k = bch2_btree_iter_peek_slot(&iter);
if ((ret = btree_iter_err(k)))
goto btree_iter_err;
/* already reserved */
if (k.k->type == BCH_RESERVATION &&
bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
- bch2_btree_iter_advance_pos(&iter);
+ bch2_btree_iter_next_slot(&iter);
continue;
}
if (bkey_extent_is_data(k.k)) {
if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_advance_pos(&iter);
+ bch2_btree_iter_next_slot(&iter);
continue;
}
}
@@ -2542,9 +2537,8 @@ static loff_t bch2_next_pagecache_data(struct inode *vinode,
for (index = start_offset >> PAGE_SHIFT;
index < end_offset >> PAGE_SHIFT;
index++) {
- if (find_get_pages(mapping, index, 1, &page)) {
+ if (find_get_pages(mapping, &index, 1, &page)) {
lock_page(page);
- index = page->index;
if (page_is_data(page))
end_offset =
@@ -2646,7 +2640,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
POS(inode->v.i_ino, offset >> 9),
- BTREE_ITER_WITH_HOLES, k) {
+ BTREE_ITER_SLOTS, k) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_next_pagecache_hole(&inode->v,
offset, MAX_LFS_FILESIZE);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index aba845b2..98f282b2 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -85,10 +85,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
lockdep_assert_held(&inode->ei_update_lock);
bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
do {
- struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
if ((ret = btree_iter_err(k)))
goto out;
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index ce14aa78..2991a0dd 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -231,7 +231,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
return ret;
return 1;
}
- bch2_btree_iter_advance_pos(&h->iter);
+ bch2_btree_iter_next(&h->iter);
}
fsck_err:
return ret;
@@ -1081,7 +1081,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
if (nlinks_pos == iter.pos.inode)
genradix_iter_advance(&nlinks_iter, links);
- bch2_btree_iter_advance_pos(&iter);
+ bch2_btree_iter_next(&iter);
bch2_btree_iter_cond_resched(&iter);
}
fsck_err:
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 71a24cc6..797aa2a9 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -303,10 +303,10 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
searched_from_start = true;
again:
bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(*hint, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
u32 bi_generation = 0;
ret = btree_iter_err(k);
@@ -322,7 +322,7 @@ again:
if (iter.pos.inode == max)
goto out;
- bch2_btree_iter_advance_pos(&iter);
+ bch2_btree_iter_next_slot(&iter);
break;
case BCH_INODE_GENERATION: {
@@ -415,9 +415,9 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
return ret;
bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
do {
- struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
u32 bi_generation = 0;
ret = btree_iter_err(k);
@@ -474,7 +474,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
for_each_btree_key(&iter, c, BTREE_ID_INODES,
POS(inode_nr, 0),
- BTREE_ITER_WITH_HOLES, k) {
+ BTREE_ITER_SLOTS, k) {
switch (k.k->type) {
case BCH_INODE_FS:
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
@@ -491,38 +491,6 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
return bch2_btree_iter_unlock(&iter) ?: ret;
}
-int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *c, uuid_le *uuid,
- struct bkey_i_inode_blockdev *ret)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
-
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), 0, k) {
- if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
- break;
-
- if (k.k->type == BCH_INODE_BLOCKDEV) {
- struct bkey_s_c_inode_blockdev inode =
- bkey_s_c_to_inode_blockdev(k);
-
- pr_debug("found inode %llu: %pU (u64s %u)",
- inode.k->p.inode, inode.v->i_uuid.b,
- inode.k->u64s);
-
- if (CACHED_DEV(inode.v) &&
- !memcmp(uuid, &inode.v->i_uuid, 16)) {
- bkey_reassemble(&ret->k_i, k);
- bch2_btree_iter_unlock(&iter);
- return 0;
- }
- }
-
- bch2_btree_iter_cond_resched(&iter);
- }
- bch2_btree_iter_unlock(&iter);
- return -ENOENT;
-}
-
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void)
{
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 8ebb6fb6..5c7aeadc 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -40,8 +40,6 @@ int bch2_inode_rm(struct bch_fs *, u64);
int bch2_inode_find_by_inum(struct bch_fs *, u64,
struct bch_inode_unpacked *);
-int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *, uuid_le *,
- struct bkey_i_inode_blockdev *);
static inline struct timespec bch2_time_to_timespec(struct bch_fs *c, u64 time)
{
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 6f6d42fc..7cddbccd 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -179,7 +179,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
bio_sectors(&n->bio));
n->have_io_ref = true;
- n->bio.bi_bdev = ca->disk_sb.bdev;
+ bio_set_dev(&n->bio, ca->disk_sb.bdev);
submit_bio(&n->bio);
} else {
n->have_io_ref = false;
@@ -1410,7 +1410,7 @@ noclone:
rbio->promote = promote ? promote_alloc(rbio) : NULL;
INIT_WORK(&rbio->work, NULL);
- rbio->bio.bi_bdev = pick->ca->disk_sb.bdev;
+ bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
@@ -1452,9 +1452,9 @@ static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_WITH_HOLES);
+ BTREE_ITER_SLOTS);
retry:
- k = bch2_btree_iter_peek_with_holes(&iter);
+ k = bch2_btree_iter_peek_slot(&iter);
if (btree_iter_err(k)) {
bch2_btree_iter_unlock(&iter);
goto err;
@@ -1525,7 +1525,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
retry:
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_WITH_HOLES, k) {
+ BTREE_ITER_SLOTS, k) {
BKEY_PADDED(k) tmp;
struct extent_pick_ptr pick;
struct bvec_iter fragment;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 1952643f..1870534d 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -31,6 +31,12 @@ static void journal_pin_add_entry(struct journal *,
struct journal_entry_pin *,
journal_pin_flush_fn);
+static inline void journal_wake(struct journal *j)
+{
+ wake_up(&j->wait);
+ closure_wake_up(&j->async_wait);
+}
+
static inline struct journal_buf *journal_cur_buf(struct journal *j)
{
return j->buf + j->reservations.idx;
@@ -43,15 +49,27 @@ static inline struct journal_buf *journal_prev_buf(struct journal *j)
/* Sequence number of oldest dirty journal entry */
-static inline u64 last_seq(struct journal *j)
+static inline u64 journal_last_seq(struct journal *j)
{
- return atomic64_read(&j->seq) - fifo_used(&j->pin) + 1;
+ return j->pin.front;
}
static inline u64 journal_pin_seq(struct journal *j,
struct journal_entry_pin_list *pin_list)
{
- return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
+ return fifo_entry_idx_abs(&j->pin, pin_list);
+}
+
+u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
+{
+ u64 ret = 0;
+
+ spin_lock(&j->lock);
+ if (journal_pin_active(pin))
+ ret = journal_pin_seq(j, pin->pin_list);
+ spin_unlock(&j->lock);
+
+ return ret;
}
static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf,
@@ -678,7 +696,7 @@ reread: sectors_read = min_t(unsigned,
end - offset, buf->size >> 9);
bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
+ bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = offset;
bio->bi_iter.bi_size = sectors_read << 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
@@ -968,7 +986,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
struct journal_replay *i;
struct journal_entry_pin_list *p;
struct bch_dev *ca;
- u64 cur_seq, end_seq;
+ u64 cur_seq, end_seq, seq;
unsigned iter, keys = 0, entries = 0;
size_t nr;
int ret = 0;
@@ -1035,11 +1053,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
j->pin.front = le64_to_cpu(i->j.last_seq);
j->pin.back = le64_to_cpu(i->j.seq) + 1;
- BUG_ON(last_seq(j) != le64_to_cpu(i->j.last_seq));
- BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
- &fifo_peek_back(&j->pin));
-
- fifo_for_each_entry_ptr(p, &j->pin, iter) {
+ fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
@@ -1062,7 +1076,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
mutex_unlock(&j->blacklist_lock);
- cur_seq = last_seq(j);
+ cur_seq = journal_last_seq(j);
end_seq = le64_to_cpu(list_last_entry(list,
struct journal_replay, list)->j.seq);
@@ -1087,7 +1101,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
cur_seq, le64_to_cpu(i->j.seq) - 1,
- last_seq(j), end_seq);
+ journal_last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
@@ -1155,10 +1169,10 @@ static void journal_pin_new_entry(struct journal *j, int count)
/*
* The fifo_push() needs to happen at the same time as j->seq is
- * incremented for last_seq() to be calculated correctly
+ * incremented for journal_last_seq() to be calculated correctly
*/
- p = fifo_push_ref(&j->pin);
atomic64_inc(&j->seq);
+ p = fifo_push_ref(&j->pin);
EBUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
&fifo_peek_back(&j->pin));
@@ -1237,7 +1251,7 @@ static enum {
journal_reclaim_fast(j);
/* XXX: why set this here, and not in journal_write()? */
- buf->data->last_seq = cpu_to_le64(last_seq(j));
+ buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
journal_pin_new_entry(j, 1);
@@ -1272,7 +1286,7 @@ void bch2_journal_halt(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- wake_up(&j->wait);
+ journal_wake(j);
closure_wake_up(&journal_cur_buf(j)->wait);
closure_wake_up(&journal_prev_buf(j)->wait);
}
@@ -1302,7 +1316,7 @@ static unsigned journal_dev_buckets_available(struct journal *j,
* Don't use the last bucket unless writing the new last_seq
* will make another bucket available:
*/
- if (ja->bucket_seq[ja->last_idx] >= last_seq(j))
+ if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
available = max((int) available - 1, 0);
return available;
@@ -1441,7 +1455,7 @@ static int journal_entry_open(struct journal *j)
mod_delayed_work(system_freezable_wq,
&j->write_work,
msecs_to_jiffies(j->write_delay_ms));
- wake_up(&j->wait);
+ journal_wake(j);
return 1;
}
@@ -1537,7 +1551,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
}
if (atomic_dec_and_test(&j->replay_pin_list->count))
- wake_up(&j->wait);
+ journal_wake(j);
}
j->replay_pin_list = NULL;
@@ -1704,7 +1718,7 @@ static void journal_reclaim_fast(struct journal *j)
}
if (popped)
- wake_up(&j->wait);
+ journal_wake(j);
}
/*
@@ -1718,6 +1732,7 @@ static inline void __journal_pin_add(struct journal *j,
journal_pin_flush_fn flush_fn)
{
BUG_ON(journal_pin_active(pin));
+ BUG_ON(!atomic_read(&pin_list->count));
atomic_inc(&pin_list->count);
pin->pin_list = pin_list;
@@ -1727,7 +1742,12 @@ static inline void __journal_pin_add(struct journal *j,
list_add(&pin->list, &pin_list->list);
else
INIT_LIST_HEAD(&pin->list);
- wake_up(&j->wait);
+
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
}
static void journal_pin_add_entry(struct journal *j,
@@ -1754,38 +1774,32 @@ void bch2_journal_pin_add(struct journal *j,
spin_unlock(&j->lock);
}
-static inline bool __journal_pin_drop(struct journal *j,
+static inline void __journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
struct journal_entry_pin_list *pin_list = pin->pin_list;
- pin->pin_list = NULL;
+ if (!journal_pin_active(pin))
+ return;
- /* journal_reclaim_work() might have already taken us off the list */
- if (!list_empty_careful(&pin->list))
- list_del_init(&pin->list);
+ pin->pin_list = NULL;
+ list_del_init(&pin->list);
- return atomic_dec_and_test(&pin_list->count);
+ /*
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
+ * writing a new last_seq will now make another bucket available:
+ */
+ if (atomic_dec_and_test(&pin_list->count) &&
+ pin_list == &fifo_peek_front(&j->pin))
+ journal_reclaim_fast(j);
}
void bch2_journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
- bool wakeup = false;
-
spin_lock(&j->lock);
- if (journal_pin_active(pin))
- wakeup = __journal_pin_drop(j, pin);
+ __journal_pin_drop(j, pin);
spin_unlock(&j->lock);
-
- /*
- * Unpinning a journal entry make make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- *
- * Nested irqsave is expensive, don't do the wakeup with lock held:
- */
- if (wakeup)
- wake_up(&j->wait);
}
void bch2_journal_pin_add_if_older(struct journal *j,
@@ -1797,10 +1811,9 @@ void bch2_journal_pin_add_if_older(struct journal *j,
if (journal_pin_active(src_pin) &&
(!journal_pin_active(pin) ||
- fifo_entry_idx(&j->pin, src_pin->pin_list) <
- fifo_entry_idx(&j->pin, pin->pin_list))) {
- if (journal_pin_active(pin))
- __journal_pin_drop(j, pin);
+ journal_pin_seq(j, src_pin->pin_list) <
+ journal_pin_seq(j, pin->pin_list))) {
+ __journal_pin_drop(j, pin);
__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
}
@@ -1812,13 +1825,13 @@ __journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret;
- unsigned iter;
+ u64 iter;
/* no need to iterate over empty fifo entries: */
journal_reclaim_fast(j);
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
- if (journal_pin_seq(j, pin_list) > seq_to_flush)
+ if (iter > seq_to_flush)
break;
ret = list_first_entry_or_null(&pin_list->list,
@@ -1826,7 +1839,7 @@ __journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
if (ret) {
/* must be list_del_init(), see bch2_journal_pin_drop() */
list_move(&ret->list, &pin_list->flushed);
- *seq = journal_pin_seq(j, pin_list);
+ *seq = iter;
return ret;
}
}
@@ -1863,9 +1876,9 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
*/
- ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) ||
+ ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
- last_seq(j) > seq_to_flush ||
+ journal_last_seq(j) > seq_to_flush ||
(fifo_used(&j->pin) == 1 &&
atomic_read(&fifo_peek_front(&j->pin).count) == 1);
spin_unlock(&j->lock);
@@ -1891,7 +1904,7 @@ again:
}
spin_lock(&j->lock);
- flush = last_seq(j) != j->last_seq_ondisk ||
+ flush = journal_last_seq(j) != j->last_seq_ondisk ||
(seq_to_flush == U64_MAX && c->btree_roots_dirty);
spin_unlock(&j->lock);
@@ -1984,7 +1997,7 @@ static void journal_reclaim_work(struct work_struct *work)
ja->last_idx = (ja->last_idx + 1) % ja->nr;
spin_unlock(&j->lock);
- wake_up(&j->wait);
+ journal_wake(j);
}
/*
@@ -2224,7 +2237,7 @@ out:
&j->reservations.counter);
closure_wake_up(&w->wait);
- wake_up(&j->wait);
+ journal_wake(j);
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
@@ -2338,8 +2351,8 @@ static void journal_write(struct closure *cl)
bio = ca->journal.bio;
bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_bdev = ca->disk_sb.bdev;
bio->bi_iter.bi_size = sectors << 9;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
@@ -2360,7 +2373,7 @@ static void journal_write(struct closure *cl)
bio = ca->journal.bio;
bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
+ bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_FLUSH;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
@@ -2377,19 +2390,34 @@ err:
continue_at(cl, journal_write_done, system_highpri_wq);
}
-static void journal_write_work(struct work_struct *work)
+/*
+ * returns true if there's nothing to flush and no journal write still in flight
+ */
+static bool journal_flush_write(struct journal *j)
{
- struct journal *j = container_of(to_delayed_work(work),
- struct journal, write_work);
+ bool ret;
+
spin_lock(&j->lock);
+ ret = !j->reservations.prev_buf_unwritten;
+
if (!journal_entry_is_open(j)) {
spin_unlock(&j->lock);
- return;
+ return ret;
}
set_bit(JOURNAL_NEED_WRITE, &j->flags);
- if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED)
+ if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED)
+ ret = false;
+ else
spin_unlock(&j->lock);
+ return ret;
+}
+
+static void journal_write_work(struct work_struct *work)
+{
+ struct journal *j = container_of(work, struct journal, write_work.work);
+
+ journal_flush_write(j);
}
/*
@@ -2514,6 +2542,43 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret < 0 ? ret : 0;
}
+u64 bch2_journal_last_unwritten_seq(struct journal *j)
+{
+ u64 seq;
+
+ spin_lock(&j->lock);
+ seq = atomic64_read(&j->seq);
+ if (j->reservations.prev_buf_unwritten)
+ seq--;
+ spin_unlock(&j->lock);
+
+ return seq;
+}
+
+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
+{
+ int ret;
+
+ spin_lock(&j->lock);
+ BUG_ON(seq > atomic64_read(&j->seq));
+
+ if (seq < atomic64_read(&j->seq) ||
+ journal_entry_is_open(j)) {
+ spin_unlock(&j->lock);
+ return 1;
+ }
+
+ ret = journal_entry_open(j);
+ if (!ret)
+ closure_wait(&j->async_wait, parent);
+ spin_unlock(&j->lock);
+
+ if (!ret)
+ journal_reclaim_work(&j->reclaim_work.work);
+
+ return ret;
+}
+
void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
{
spin_lock(&j->lock);
@@ -2737,14 +2802,13 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct bch_devs_list devs;
- u64 seq = 0;
- unsigned iter;
+ u64 iter, seq = 0;
int ret = 0;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(p, &j->pin, iter)
if (bch2_dev_list_has_dev(p->devs, dev_idx))
- seq = journal_pin_seq(j, p);
+ seq = iter;
spin_unlock(&j->lock);
ret = bch2_journal_flush_pins(j, seq);
@@ -2758,7 +2822,7 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
spin_lock(&j->lock);
while (!ret && seq < atomic64_read(&j->seq)) {
- seq = max(seq, last_seq(j));
+ seq = max(seq, journal_last_seq(j));
devs = journal_seq_pin(j, seq)->devs;
seq++;
@@ -2804,15 +2868,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
- if (!test_bit(JOURNAL_STARTED, &j->flags))
- return;
-
- /*
- * Empty out the journal by first flushing everything pinning existing
- * journal entries, then force a brand new empty journal entry to be
- * written:
- */
- bch2_journal_flush_all_pins(j);
+ wait_event(j->wait, journal_flush_write(j));
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);
@@ -2914,7 +2970,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
spin_lock(&j->lock);
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "active journal entries:\t%zu\n"
+ "active journal entries:\t%llu\n"
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
@@ -2927,7 +2983,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
"replay done:\t\t%i\n",
fifo_used(&j->pin),
(u64) atomic64_read(&j->seq),
- last_seq(j),
+ journal_last_seq(j),
j->last_seq_ondisk,
journal_state_count(*s, s->idx),
s->cur_entry_offset,
@@ -2965,14 +3021,13 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
ssize_t ret = 0;
- unsigned i;
+ u64 i;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
"%llu: count %u\n",
- journal_pin_seq(j, pin_list),
- atomic_read(&pin_list->count));
+ i, atomic_read(&pin_list->count));
list_for_each_entry(pin, &pin_list->list, list)
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
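
bch2_fs_journal_stop() above now waits with wait_event(j->wait, journal_flush_write(j)): the predicate itself kicks a final buffer switch while an entry is still open, and every journal_wake() makes wait_event() re-evaluate it until nothing is left in flight. The general shape of that idiom, shown with hypothetical names (struct foo, write_in_flight, work_pending are placeholders, not journal fields):

#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

struct foo {				/* hypothetical stand-in for struct journal */
	spinlock_t		lock;
	wait_queue_head_t	wait;
	struct work_struct	write_work;
	bool			write_in_flight;
	bool			work_pending;
};

/* flush helper doubling as a wait_event() predicate */
static bool foo_flush(struct foo *f)
{
	bool idle;

	spin_lock(&f->lock);
	idle = !f->write_in_flight && !f->work_pending;
	if (f->work_pending)
		queue_work(system_wq, &f->write_work);	/* kick the next write */
	spin_unlock(&f->lock);

	return idle;
}

static void foo_stop(struct foo *f)
{
	/* the write completion path calls wake_up(&f->wait), cf. journal_wake() */
	wait_event(f->wait, foo_flush(f));
}
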
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 5abf356e..52d74eec 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -155,9 +155,11 @@ static inline bool journal_pin_active(struct journal_entry_pin *pin)
static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
- return &j->pin.data[(size_t) seq & j->pin.mask];
+ return &j->pin.data[seq & j->pin.mask];
}
+u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
+
void bch2_journal_pin_add(struct journal *, struct journal_res *,
struct journal_entry_pin *, journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
@@ -195,8 +197,11 @@ static inline void bch2_journal_set_has_inode(struct journal *j,
u64 inum)
{
struct journal_buf *buf = &j->buf[res->idx];
+ unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
- set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode);
+ /* avoid atomic op if possible */
+ if (unlikely(!test_bit(bit, buf->has_inode)))
+ set_bit(bit, buf->has_inode);
}
/*
@@ -234,7 +239,7 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res
unsigned actual = jset_u64s(u64s);
EBUG_ON(!res->ref);
- BUG_ON(actual > res->u64s);
+ EBUG_ON(actual > res->u64s);
bch2_journal_add_entry_at(buf, res->offset, type,
id, level, data, u64s);
@@ -352,6 +357,9 @@ out:
return 0;
}
+u64 bch2_journal_last_unwritten_seq(struct journal *);
+int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
+
void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
@@ -370,6 +378,8 @@ static inline int bch2_journal_error(struct journal *j)
? -EIO : 0;
}
+struct bch_dev;
+
static inline bool journal_flushes_device(struct bch_dev *ca)
{
return true;
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 5eea6579..e39b18f2 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -140,6 +140,7 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
+ struct closure_waitlist async_wait;
struct closure io;
struct delayed_work write_work;
@@ -166,7 +167,10 @@ struct journal {
* needed. When all journal entries in the oldest journal bucket are no
* longer needed, the bucket can be discarded and reused.
*/
- DECLARE_FIFO(struct journal_entry_pin_list, pin);
+ struct {
+ u64 front, back, size, mask;
+ struct journal_entry_pin_list *data;
+ } pin;
struct journal_entry_pin_list *replay_pin_list;
struct mutex blacklist_lock;
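
journal_types.h replaces DECLARE_FIFO() with an open-coded fifo whose front/back are u64 sequence numbers. That is what lets journal_last_seq() return j->pin.front directly, journal_seq_pin() index j->pin.data[seq & j->pin.mask], and fifo_entry_idx_abs() hand back absolute sequence numbers. A minimal userspace sketch of that indexing scheme, assuming a power-of-two size (names below are illustrative, nothing here is bcachefs code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FIFO_SIZE 8			/* must be a power of two */

struct seq_fifo {
	uint64_t front, back;		/* absolute sequence numbers */
	uint64_t mask;
	unsigned data[FIFO_SIZE];	/* stand-in for journal_entry_pin_list */
};

/* cf. journal_seq_pin(): the entry for @seq lives at data[seq & mask] */
static unsigned *seq_fifo_entry(struct seq_fifo *f, uint64_t seq)
{
	assert(seq >= f->front && seq < f->back);
	return &f->data[seq & f->mask];
}

int main(void)
{
	struct seq_fifo f = {
		.front	= 100,		/* oldest dirty seq, cf. journal_last_seq() */
		.back	= 100,		/* one past the newest seq */
		.mask	= FIFO_SIZE - 1,
	};

	for (int i = 0; i < 4; i++) {		/* push entries for seqs 100..103 */
		uint64_t seq = f.back++;
		f.data[seq & f.mask] = 1;	/* initial "pin count" */
	}

	printf("oldest dirty seq %llu, entry for seq 102 has count %u\n",
	       (unsigned long long) f.front, *seq_fifo_entry(&f, 102));

	f.front++;				/* cf. journal_reclaim_fast() popping seq 100 */
	printf("oldest dirty seq is now %llu\n", (unsigned long long) f.front);
	return 0;
}
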
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 2033db81..01c88960 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -162,7 +162,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
bch2_bkey_devs(k));
if (ret)
break;
- bch2_btree_iter_advance_pos(&iter);
+ bch2_btree_iter_next(&iter);
continue;
}
@@ -265,10 +265,6 @@ retry:
}
}
bch2_btree_iter_unlock(&iter);
-
- /* btree root */
- mutex_lock(&c->btree_root_lock);
- mutex_unlock(&c->btree_root_lock);
}
ret = 0;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 7c7f436c..a67e7a45 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -49,10 +49,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
struct bkey_i_extent *insert, *new =
bkey_i_to_extent(bch2_keylist_front(keys));
BKEY_PADDED(k) _new, _insert;
@@ -143,7 +143,7 @@ nomatch:
&m->ctxt->stats->sectors_raced);
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
- bch2_btree_iter_advance_pos(&iter);
+ bch2_btree_iter_next_slot(&iter);
goto next;
}
out:
@@ -435,7 +435,7 @@ next:
atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
&stats->sectors_seen);
next_nondata:
- bch2_btree_iter_advance_pos(&stats->iter);
+ bch2_btree_iter_next(&stats->iter);
bch2_btree_iter_cond_resched(&stats->iter);
}
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index 854f7f55..6ab2c866 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -245,7 +245,7 @@ int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
memset(&msgs, 0, sizeof(msgs));
for_each_set_qtype(c, i, q, qtypes)
- mutex_lock(&q->lock);
+ mutex_lock_nested(&q->lock, i);
for_each_set_qtype(c, i, q, qtypes) {
mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
@@ -296,7 +296,7 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
memset(&msgs, 0, sizeof(msgs));
for_each_set_qtype(c, i, q, qtypes)
- mutex_lock(&q->lock);
+ mutex_lock_nested(&q->lock, i);
for_each_set_qtype(c, i, q, qtypes) {
src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
@@ -735,8 +735,8 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p,
- BTREE_ITER_WITH_HOLES|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_with_holes(&iter);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = btree_iter_err(k);
if (unlikely(ret))
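
quota.c now takes the per-type locks with mutex_lock_nested(&q->lock, i), the stock lockdep annotation for holding several locks of the same lock class at once: each index becomes a distinct lockdep subclass, so taking them in ascending type order no longer looks like a self-deadlock. Schematically, with hypothetical types (struct qtype is a placeholder, not from the patch):

#include <linux/mutex.h>

struct qtype {				/* hypothetical */
	struct mutex lock;
	/* ... */
};

static void lock_qtypes(struct qtype *q, unsigned nr)
{
	unsigned i;

	/* always in ascending order; subclass i tells lockdep these may nest */
	for (i = 0; i < nr; i++)
		mutex_lock_nested(&q[i].lock, i);
}

static void unlock_qtypes(struct qtype *q, unsigned nr)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		mutex_unlock(&q[i].lock);
}
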
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
index c60a6730..f0ff8d41 100644
--- a/libbcachefs/six.c
+++ b/libbcachefs/six.c
@@ -10,10 +10,6 @@
#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
#define six_release(l) lock_release(l, 0, _RET_IP_)
-#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
-#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
-#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
-
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */
u64 lock_val;
@@ -31,6 +27,10 @@ struct six_lock_vals {
enum six_lock_type unlock_wakeup;
};
+#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
+#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
+
#define LOCK_VALS { \
[SIX_LOCK_read] = { \
.lock_val = __SIX_VAL(read_lock, 1), \
@@ -55,25 +55,40 @@ struct six_lock_vals {
}, \
}
-static void six_set_owner(struct six_lock *lock, enum six_lock_type type)
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
+ union six_lock_state old)
{
- if (type == SIX_LOCK_intent)
+ if (type != SIX_LOCK_intent)
+ return;
+
+ if (!old.intent_lock) {
+ EBUG_ON(lock->owner);
lock->owner = current;
+ } else {
+ EBUG_ON(lock->owner != current);
+ }
}
-static void six_clear_owner(struct six_lock *lock, enum six_lock_type type)
+static inline void six_clear_owner(struct six_lock *lock, enum six_lock_type type)
{
- if (type == SIX_LOCK_intent)
+ if (type != SIX_LOCK_intent)
+ return;
+
+ EBUG_ON(lock->owner != current);
+
+ if (lock->state.intent_lock == 1)
lock->owner = NULL;
}
-static inline bool __six_trylock_type(struct six_lock *lock,
- enum six_lock_type type)
+static __always_inline bool do_six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old;
u64 v = READ_ONCE(lock->state.v);
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+
do {
old.v = v;
@@ -86,23 +101,24 @@ static inline bool __six_trylock_type(struct six_lock *lock,
} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
old.v,
old.v + l[type].lock_val)) != old.v);
+
+ six_set_owner(lock, type, old);
return true;
}
-bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+__always_inline __flatten
+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
{
- bool ret = __six_trylock_type(lock, type);
-
- if (ret) {
- six_acquire(&lock->dep_map, 1);
- six_set_owner(lock, type);
- }
+ if (!do_six_trylock_type(lock, type))
+ return false;
- return ret;
+ six_acquire(&lock->dep_map, 1);
+ return true;
}
-bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
+__always_inline __flatten
+static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old;
@@ -117,8 +133,8 @@ bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
old.v,
old.v + l[type].lock_val)) != old.v);
+ six_set_owner(lock, type, old);
six_acquire(&lock->dep_map, 1);
- six_set_owner(lock, type);
return true;
}
@@ -150,7 +166,8 @@ static inline int six_can_spin_on_owner(struct six_lock *lock)
return retval;
}
-static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner)
+static inline bool six_spin_on_owner(struct six_lock *lock,
+ struct task_struct *owner)
{
bool ret = true;
@@ -176,7 +193,7 @@ static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner)
return ret;
}
-static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
{
struct task_struct *task = current;
@@ -201,7 +218,7 @@ static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
if (owner && !six_spin_on_owner(lock, owner))
break;
- if (__six_trylock_type(lock, type)) {
+ if (do_six_trylock_type(lock, type)) {
osq_unlock(&lock->osq);
preempt_enable();
return true;
@@ -240,20 +257,16 @@ fail:
return false;
}
-void six_lock_type(struct six_lock *lock, enum six_lock_type type)
+noinline
+static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
struct six_lock_waiter wait;
u64 v;
- six_acquire(&lock->dep_map, 0);
-
- if (__six_trylock_type(lock, type))
- goto done;
-
if (six_optimistic_spin(lock, type))
- goto done;
+ return;
lock_contended(&lock->dep_map, _RET_IP_);
@@ -262,7 +275,9 @@ void six_lock_type(struct six_lock *lock, enum six_lock_type type)
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (list_empty_careful(&wait.list)) {
+ if (type == SIX_LOCK_write)
+ EBUG_ON(lock->owner != current);
+ else if (list_empty_careful(&wait.list)) {
raw_spin_lock(&lock->wait_lock);
list_add_tail(&wait.list, &lock->wait_list[type]);
raw_spin_unlock(&lock->wait_lock);
@@ -287,6 +302,8 @@ void six_lock_type(struct six_lock *lock, enum six_lock_type type)
schedule();
}
+ six_set_owner(lock, type, old);
+
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.list)) {
@@ -294,9 +311,17 @@ void six_lock_type(struct six_lock *lock, enum six_lock_type type)
list_del_init(&wait.list);
raw_spin_unlock(&lock->wait_lock);
}
-done:
+}
+
+__always_inline
+static void __six_lock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ six_acquire(&lock->dep_map, 0);
+
+ if (!do_six_trylock_type(lock, type))
+ __six_lock_type_slowpath(lock, type);
+
lock_acquired(&lock->dep_map, _RET_IP_);
- six_set_owner(lock, type);
}
static inline void six_lock_wakeup(struct six_lock *lock,
@@ -315,6 +340,14 @@ static inline void six_lock_wakeup(struct six_lock *lock,
clear_bit(waitlist_bitnr(waitlist_id),
(unsigned long *) &lock->state.v);
+ if (waitlist_id == SIX_LOCK_write) {
+ struct task_struct *p = READ_ONCE(lock->owner);
+
+ if (p)
+ wake_up_process(p);
+ return;
+ }
+
raw_spin_lock(&lock->wait_lock);
list_for_each_entry_safe(w, next, wait_list, list) {
@@ -332,26 +365,87 @@ static inline void six_lock_wakeup(struct six_lock *lock,
raw_spin_unlock(&lock->wait_lock);
}
-void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+__always_inline __flatten
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state state;
- six_clear_owner(lock, type);
-
EBUG_ON(!(lock->state.v & l[type].held_mask));
EBUG_ON(type == SIX_LOCK_write &&
!(lock->state.v & __SIX_LOCK_HELD_intent));
+ six_clear_owner(lock, type);
+
state.v = atomic64_add_return_release(l[type].unlock_val,
&lock->state.counter);
six_release(&lock->dep_map);
six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
-bool six_trylock_convert(struct six_lock *lock,
- enum six_lock_type from,
- enum six_lock_type to)
+#ifdef SIX_LOCK_SEPARATE_LOCKFNS
+
+#define __SIX_LOCK(type) \
+bool six_trylock_##type(struct six_lock *lock) \
+{ \
+ return __six_trylock_type(lock, SIX_LOCK_##type); \
+} \
+ \
+bool six_relock_##type(struct six_lock *lock, u32 seq) \
+{ \
+ return __six_relock_type(lock, SIX_LOCK_##type, seq); \
+} \
+ \
+void six_lock_##type(struct six_lock *lock) \
+{ \
+ __six_lock_type(lock, SIX_LOCK_##type); \
+} \
+ \
+void six_unlock_##type(struct six_lock *lock) \
+{ \
+ __six_unlock_type(lock, SIX_LOCK_##type); \
+}
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+
+#undef __SIX_LOCK
+
+#else
+
+bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ return __six_trylock_type(lock, type);
+}
+
+bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ return __six_relock_type(lock, type, seq);
+
+}
+
+void six_lock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ __six_lock_type(lock, type);
+}
+
+void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ __six_unlock_type(lock, type);
+}
+
+#endif
+
+/* Convert from intent to read: */
+void six_lock_downgrade(struct six_lock *lock)
+{
+ six_lock_increment(lock, SIX_LOCK_read);
+ six_unlock_intent(lock);
+}
+
+bool six_lock_tryupgrade(struct six_lock *lock)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
@@ -359,22 +453,41 @@ bool six_trylock_convert(struct six_lock *lock,
do {
new.v = old.v = v;
- new.v += l[from].unlock_val;
- if (new.v & l[to].lock_fail)
+ EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask));
+
+ new.v += l[SIX_LOCK_read].unlock_val;
+
+ if (new.v & l[SIX_LOCK_intent].lock_fail)
return false;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v,
- new.v + l[to].lock_val)) != old.v);
- six_clear_owner(lock, from);
- six_set_owner(lock, to);
+ new.v += l[SIX_LOCK_intent].lock_val;
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v, new.v)) != old.v);
- six_lock_wakeup(lock, new, l[from].unlock_wakeup);
+ six_set_owner(lock, SIX_LOCK_intent, old);
+ six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup);
return true;
}
+bool six_trylock_convert(struct six_lock *lock,
+ enum six_lock_type from,
+ enum six_lock_type to)
+{
+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
+
+ if (to == from)
+ return true;
+
+ if (to == SIX_LOCK_read) {
+ six_lock_downgrade(lock);
+ return true;
+ } else {
+ return six_lock_tryupgrade(lock);
+ }
+}
+
/*
* Increment read/intent lock count, assuming we already have it read or intent
* locked:
@@ -390,10 +503,3 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
atomic64_add(l[type].lock_val, &lock->state.counter);
}
-
-/* Convert from intent to read: */
-void six_lock_downgrade(struct six_lock *lock)
-{
- six_lock_increment(lock, SIX_LOCK_read);
- six_unlock_intent(lock);
-}
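
six.c now exposes the read/intent conversions as six_lock_tryupgrade() and six_lock_downgrade(), with six_trylock_convert() reimplemented on top of them. A sketch of the intended caller pattern, assuming the semantics visible above (tryupgrade leaves the read lock held on failure); update_if_needed is a hypothetical example, not code from this patch:

/* illustrative only, not a hunk from this patch */
static void update_if_needed(struct six_lock *lock, bool need_update)
{
	six_lock_read(lock);

	if (!need_update) {
		six_unlock_read(lock);
		return;
	}

	if (!six_lock_tryupgrade(lock)) {
		/* lost the race for intent: drop read, take intent outright,
		 * and re-validate whatever was checked under the read lock */
		six_unlock_read(lock);
		six_lock_intent(lock);
	}

	/* ... modify under the intent lock ... */

	six_lock_downgrade(lock);	/* back to a plain read lock */
	six_unlock_read(lock);
}
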
diff --git a/libbcachefs/six.h b/libbcachefs/six.h
index 0f319df6..f518c64c 100644
--- a/libbcachefs/six.h
+++ b/libbcachefs/six.h
@@ -8,6 +8,8 @@
#include "util.h"
+#define SIX_LOCK_SEPARATE_LOCKFNS
+
/*
* LOCK STATES:
*
@@ -68,7 +70,7 @@ struct six_lock {
struct optimistic_spin_queue osq;
raw_spinlock_t wait_lock;
- struct list_head wait_list[3];
+ struct list_head wait_list[2];
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -82,7 +84,6 @@ static __always_inline void __six_lock_init(struct six_lock *lock,
raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_write]);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *) lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
@@ -96,16 +97,60 @@ do { \
__six_lock_init((lock), #lock, &__key); \
} while (0)
+#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
+
+#ifdef SIX_LOCK_SEPARATE_LOCKFNS
+
+#define __SIX_LOCK(type) \
+bool six_trylock_##type(struct six_lock *); \
+bool six_relock_##type(struct six_lock *, u32); \
+void six_lock_##type(struct six_lock *); \
+void six_unlock_##type(struct six_lock *);
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+#undef __SIX_LOCK
+
+#define SIX_LOCK_DISPATCH(type, fn, ...) \
+ switch (type) { \
+ case SIX_LOCK_read: \
+ return fn##_read(__VA_ARGS__); \
+ case SIX_LOCK_intent: \
+ return fn##_intent(__VA_ARGS__); \
+ case SIX_LOCK_write: \
+ return fn##_write(__VA_ARGS__); \
+ default: \
+ BUG(); \
+ }
+
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ SIX_LOCK_DISPATCH(type, six_trylock, lock);
+}
+
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
+}
+
+static inline void six_lock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ SIX_LOCK_DISPATCH(type, six_lock, lock);
+}
+
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ SIX_LOCK_DISPATCH(type, six_unlock, lock);
+}
+
+#else
+
bool six_trylock_type(struct six_lock *, enum six_lock_type);
bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned);
void six_lock_type(struct six_lock *, enum six_lock_type);
void six_unlock_type(struct six_lock *, enum six_lock_type);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
- enum six_lock_type);
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-void six_lock_downgrade(struct six_lock *);
-
-#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
#define __SIX_LOCK(type) \
static __always_inline bool six_trylock_##type(struct six_lock *lock) \
@@ -131,5 +176,15 @@ static __always_inline void six_unlock_##type(struct six_lock *lock) \
__SIX_LOCK(read)
__SIX_LOCK(intent)
__SIX_LOCK(write)
+#undef __SIX_LOCK
+
+#endif
+
+void six_lock_downgrade(struct six_lock *);
+bool six_lock_tryupgrade(struct six_lock *);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+ enum six_lock_type);
+
+void six_lock_increment(struct six_lock *, enum six_lock_type);
#endif /* _BCACHEFS_SIX_H */
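
For reference, with SIX_LOCK_SEPARATE_LOCKFNS defined the __SIX_LOCK(read) line above expands to the following declarations, and SIX_LOCK_DISPATCH() turns the old type-parameterised entry points into direct calls to them (expansion shown for illustration only):

bool six_trylock_read(struct six_lock *);
bool six_relock_read(struct six_lock *, u32);
void six_lock_read(struct six_lock *);
void six_unlock_read(struct six_lock *);

/* so six_lock_type(lock, SIX_LOCK_read) compiles down to six_lock_read(lock) */
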
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 530cf0a4..0adb9a1c 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -131,12 +131,11 @@ bch2_hash_lookup_at(const struct bch_hash_desc desc,
struct btree_iter *iter, const void *search)
{
u64 inode = iter->pos.inode;
+ struct bkey_s_c k;
- do {
- struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter);
-
- if (btree_iter_err(k))
- return k;
+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ if (iter->pos.inode != inode)
+ break;
if (k.k->type == desc.key_type) {
if (!desc.cmp_key(k, search))
@@ -147,11 +146,8 @@ bch2_hash_lookup_at(const struct bch_hash_desc desc,
/* hole, not found */
break;
}
-
- bch2_btree_iter_advance_pos(iter);
- } while (iter->pos.inode == inode);
-
- return bkey_s_c_err(-ENOENT);
+ }
+ return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
}
static inline struct bkey_s_c
@@ -160,12 +156,11 @@ bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc,
struct btree_iter *iter, struct bkey_s_c search)
{
u64 inode = iter->pos.inode;
+ struct bkey_s_c k;
- do {
- struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter);
-
- if (btree_iter_err(k))
- return k;
+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ if (iter->pos.inode != inode)
+ break;
if (k.k->type == desc.key_type) {
if (!desc.cmp_bkey(k, search))
@@ -176,11 +171,8 @@ bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc,
/* hole, not found */
break;
}
-
- bch2_btree_iter_advance_pos(iter);
- } while (iter->pos.inode == inode);
-
- return bkey_s_c_err(-ENOENT);
+ }
+ return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
}
static inline struct bkey_s_c
@@ -190,7 +182,8 @@ bch2_hash_lookup(const struct bch_hash_desc desc,
struct btree_iter *iter, const void *key)
{
bch2_btree_iter_init(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)), 0);
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_SLOTS);
return bch2_hash_lookup_at(desc, info, iter, key);
}
@@ -203,7 +196,7 @@ bch2_hash_lookup_intent(const struct bch_hash_desc desc,
{
bch2_btree_iter_init(iter, c, desc.btree_id,
POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
return bch2_hash_lookup_at(desc, info, iter, key);
}
@@ -211,20 +204,17 @@ bch2_hash_lookup_intent(const struct bch_hash_desc desc,
static inline struct bkey_s_c
bch2_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter)
{
- while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter);
+ u64 inode = iter->pos.inode;
+ struct bkey_s_c k;
- if (btree_iter_err(k))
- return k;
+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ if (iter->pos.inode != inode)
+ break;
if (k.k->type != desc.key_type)
return k;
-
- /* hash collision, keep going */
- bch2_btree_iter_advance_pos(iter);
- if (iter->pos.inode != k.k->p.inode)
- return bkey_s_c_err(-ENOENT);
}
+ return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
}
static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc,
@@ -235,7 +225,7 @@ static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc,
{
bch2_btree_iter_init(iter, c, desc.btree_id,
POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
return bch2_hash_hole_at(desc, iter);
}
@@ -245,16 +235,12 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
struct btree_iter *iter,
struct btree_iter *start)
{
+ struct bkey_s_c k;
+
bch2_btree_iter_set_pos(iter,
btree_type_successor(start->btree_id, start->pos));
- while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter);
- int ret = btree_iter_err(k);
-
- if (ret)
- return ret;
-
+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
if (k.k->type != desc.key_type &&
k.k->type != desc.whiteout_type)
return false;
@@ -262,9 +248,8 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
if (k.k->type == desc.key_type &&
desc.hash_bkey(info, k) <= start->pos.offset)
return true;
-
- bch2_btree_iter_advance_pos(iter);
}
+ return btree_iter_err(k);
}
static inline int bch2_hash_set(const struct bch_hash_desc desc,
@@ -279,9 +264,9 @@ static inline int bch2_hash_set(const struct bch_hash_desc desc,
bch2_btree_iter_init(&hashed_slot, c, desc.btree_id,
POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
bch2_btree_iter_init(&iter, c, desc.btree_id, hashed_slot.pos,
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
bch2_btree_iter_link(&hashed_slot, &iter);
retry:
/*
@@ -354,7 +339,7 @@ static inline int bch2_hash_delete_at(const struct bch_hash_desc desc,
int ret = -ENOENT;
bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id,
- iter->pos, 0);
+ iter->pos, BTREE_ITER_SLOTS);
bch2_btree_iter_link(iter, &whiteout_iter);
ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter);
@@ -385,9 +370,10 @@ static inline int bch2_hash_delete(const struct bch_hash_desc desc,
bch2_btree_iter_init(&iter, c, desc.btree_id,
POS(inode, desc.hash_key(info, key)),
- BTREE_ITER_INTENT);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)), 0);
+ POS(inode, desc.hash_key(info, key)),
+ BTREE_ITER_SLOTS);
bch2_btree_iter_link(&iter, &whiteout_iter);
retry:
k = bch2_hash_lookup_at(desc, info, &iter, key);
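
The rewritten lookup helpers above are the btree form of linear probing: a key lives at or after the slot its hash maps to, so the search walks consecutive slots until it either finds a matching key or hits a hole, and a hole proves the key is absent (the "hole, not found" cases). A tiny self-contained userspace analogue of that rule (nothing bcachefs-specific, illustrative only):

#include <stdio.h>

#define TABLE_SIZE 8				/* power of two, never filled here */

struct entry { int used, key, val; };
static struct entry table[TABLE_SIZE];

static unsigned hash(int key)
{
	return ((unsigned) key * 2654435761u) & (TABLE_SIZE - 1);
}

static void insert(int key, int val)
{
	unsigned i = hash(key);

	while (table[i].used)			/* hash collision, keep probing */
		i = (i + 1) & (TABLE_SIZE - 1);
	table[i] = (struct entry) { 1, key, val };
}

static int lookup(int key, int *val)
{
	for (unsigned i = hash(key); ; i = (i + 1) & (TABLE_SIZE - 1)) {
		if (!table[i].used)
			return -1;		/* hole: key cannot exist */
		if (table[i].key == key) {
			*val = table[i].val;
			return 0;
		}
		/* occupied by a different key: keep probing */
	}
}

int main(void)
{
	int v;

	insert(1, 10);
	insert(9, 90);				/* collides with key 1 */

	printf("9 -> %d\n", lookup(9, &v) ? -1 : v);
	printf("2 found: %s\n", lookup(2, &v) ? "no" : "yes");
	return 0;
}
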
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 8dce7dc1..e3c319fc 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -490,7 +490,7 @@ static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
unsigned order;
reread:
bio_reset(sb->bio);
- sb->bio->bi_bdev = sb->bdev;
+ bio_set_dev(sb->bio, sb->bdev);
sb->bio->bi_iter.bi_sector = offset;
sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
@@ -588,7 +588,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
* superblocks:
*/
bio_reset(sb->bio);
- sb->bio->bi_bdev = sb->bdev;
+ bio_set_dev(sb->bio, sb->bdev);
sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
@@ -667,7 +667,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
null_nonce(), sb);
bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
+ bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
bio->bi_iter.bi_size =
roundup(vstruct_bytes(sb),
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 29ffba65..8c7a147a 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -197,6 +197,29 @@ int bch2_congested(void *data, int bdi_bits)
* - allocator depends on the journal (when it rewrites prios and gens)
*/
+static void bch_fs_mark_clean(struct bch_fs *c)
+{
+ if (!bch2_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_CLEAN(c->disk_sb, true);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+}
+
+static bool btree_interior_updates_done(struct bch_fs *c)
+{
+ bool ret;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ ret = list_empty(&c->btree_interior_update_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ return ret;
+}
+
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -213,17 +236,38 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
- bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_flush_pins(&c->journal, U64_MAX - 1);
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
- bch2_fs_journal_stop(&c->journal);
+ bch2_journal_flush_all_pins(&c->journal);
- if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags))
+ /*
+ * We need to explicitly wait on btree interior updates to complete
+ * before stopping the journal, flushing all journal pins isn't
+ * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
+ * interior updates have to drop their journal pin before they're
+ * fully complete:
+ */
+ closure_wait_event(&c->btree_interior_update_wait,
+ btree_interior_updates_done(c));
+
+ if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_btree_verify_flushed(c);
+ bch2_fs_journal_stop(&c->journal);
+
+ /*
+ * the journal kicks off btree writes via reclaim - wait for in flight
+ * writes after stopping journal:
+ */
+ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ bch2_btree_flush_all_writes(c);
+
+ /*
+ * After stopping journal:
+ */
for_each_member_device(ca, c, i)
bch2_dev_allocator_remove(c, ca);
}
@@ -274,19 +318,12 @@ void bch2_fs_read_only(struct bch_fs *c)
__bch2_fs_read_only(c);
+ bch_fs_mark_clean(c);
+
wait_event(bch_read_only_wait,
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
-
- if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb, true);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
c->state = BCH_FS_RO;
}
@@ -400,29 +437,22 @@ static void bch2_fs_free(struct bch_fs *c)
module_put(THIS_MODULE);
}
-static void bch2_fs_exit(struct bch_fs *c)
+static void bch2_fs_release(struct kobject *kobj)
{
- unsigned i;
-
- cancel_delayed_work_sync(&c->pd_controllers_update);
- cancel_work_sync(&c->read_only_work);
-
- for (i = 0; i < c->sb.nr_devices; i++)
- if (c->devs[i])
- bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- closure_debug_destroy(&c->cl);
- kobject_put(&c->kobj);
+ bch2_fs_free(c);
}
-static void bch2_fs_offline(struct bch_fs *c)
+void bch2_fs_stop(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
- mutex_lock(&bch_fs_list_lock);
- list_del(&c->list);
- mutex_unlock(&bch_fs_list_lock);
+ mutex_lock(&c->state_lock);
+ BUG_ON(c->state == BCH_FS_STOPPING);
+ c->state = BCH_FS_STOPPING;
+ mutex_unlock(&c->state_lock);
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
@@ -440,30 +470,34 @@ static void bch2_fs_offline(struct bch_fs *c)
kobject_put(&c->opts_dir);
kobject_put(&c->internal);
+ mutex_lock(&bch_fs_list_lock);
+ list_del(&c->list);
+ mutex_unlock(&bch_fs_list_lock);
+
+ closure_sync(&c->cl);
+ closure_debug_destroy(&c->cl);
+
mutex_lock(&c->state_lock);
__bch2_fs_read_only(c);
mutex_unlock(&c->state_lock);
-}
-static void bch2_fs_release(struct kobject *kobj)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+ bch_fs_mark_clean(c);
- bch2_fs_free(c);
-}
+ /* btree prefetch might have kicked off reads in the background: */
+ bch2_btree_flush_all_reads(c);
-void bch2_fs_stop(struct bch_fs *c)
-{
- mutex_lock(&c->state_lock);
- BUG_ON(c->state == BCH_FS_STOPPING);
- c->state = BCH_FS_STOPPING;
- mutex_unlock(&c->state_lock);
+ for_each_member_device(ca, c, i)
+ cancel_work_sync(&ca->io_error_work);
- bch2_fs_offline(c);
+ cancel_work_sync(&c->btree_write_error_work);
+ cancel_delayed_work_sync(&c->pd_controllers_update);
+ cancel_work_sync(&c->read_only_work);
- closure_sync(&c->cl);
+ for (i = 0; i < c->sb.nr_devices; i++)
+ if (c->devs[i])
+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
- bch2_fs_exit(c);
+ kobject_put(&c->kobj);
}
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
@@ -545,6 +579,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_opts_apply(&c->opts, opts);
c->block_bits = ilog2(c->opts.block_size);
+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
c->opts.nochanges |= c->opts.noreplay;
c->opts.read_only |= c->opts.nochanges;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 1e8f8735..597e1f02 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -12,8 +12,10 @@
#include "compress.h"
#include "sysfs.h"
#include "btree_cache.h"
+#include "btree_io.h"
#include "btree_iter.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "inode.h"
@@ -146,6 +148,8 @@ read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
read_attribute(journal_pins);
+read_attribute(btree_updates);
+read_attribute(dirty_btree_nodes);
read_attribute(internal_uuid);
@@ -343,6 +347,12 @@ SHOW(bch2_fs)
if (attr == &sysfs_journal_pins)
return bch2_journal_print_pins(&c->journal, buf);
+ if (attr == &sysfs_btree_updates)
+ return bch2_btree_updates_print(c, buf);
+
+ if (attr == &sysfs_dirty_btree_nodes)
+ return bch2_dirty_btree_nodes_print(c, buf);
+
if (attr == &sysfs_compression_stats)
return bch2_compression_stats(c, buf);
@@ -474,6 +484,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_alloc_debug,
&sysfs_journal_debug,
&sysfs_journal_pins,
+ &sysfs_btree_updates,
+ &sysfs_dirty_btree_nodes,
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,