From 99adc23cf6260a8e5b237f498119ee163d8f71f6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Feb 2018 00:26:03 -0500 Subject: Update bcachefs sources to 0e765bc37c bcachefs: foreground merging of interior btree nodes --- libbcachefs/alloc.c | 12 +- libbcachefs/alloc.h | 2 +- libbcachefs/bcachefs.h | 3 + libbcachefs/bkey.c | 4 +- libbcachefs/bkey_methods.c | 4 + libbcachefs/bset.c | 64 ++-- libbcachefs/bset.h | 51 ++-- libbcachefs/btree_cache.c | 27 +- libbcachefs/btree_gc.c | 6 +- libbcachefs/btree_io.c | 121 ++++++-- libbcachefs/btree_io.h | 23 +- libbcachefs/btree_iter.c | 594 ++++++++++++++++++++++-------------- libbcachefs/btree_iter.h | 112 ++++--- libbcachefs/btree_locking.h | 16 +- libbcachefs/btree_update.h | 2 - libbcachefs/btree_update_interior.c | 176 +++++++---- libbcachefs/btree_update_interior.h | 47 ++- libbcachefs/btree_update_leaf.c | 161 ++++------ libbcachefs/debug.c | 30 +- libbcachefs/dirent.c | 7 +- libbcachefs/extents.c | 78 ++--- libbcachefs/fifo.h | 11 +- libbcachefs/fs-io.c | 40 ++- libbcachefs/fs.c | 4 +- libbcachefs/fsck.c | 4 +- libbcachefs/inode.c | 44 +-- libbcachefs/inode.h | 2 - libbcachefs/io.c | 10 +- libbcachefs/journal.c | 209 ++++++++----- libbcachefs/journal.h | 16 +- libbcachefs/journal_types.h | 6 +- libbcachefs/migrate.c | 6 +- libbcachefs/move.c | 8 +- libbcachefs/quota.c | 8 +- libbcachefs/six.c | 216 +++++++++---- libbcachefs/six.h | 71 ++++- libbcachefs/str_hash.h | 78 ++--- libbcachefs/super-io.c | 6 +- libbcachefs/super.c | 121 +++++--- libbcachefs/sysfs.c | 12 + 40 files changed, 1477 insertions(+), 935 deletions(-) (limited to 'libbcachefs') diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index f3833846..f6592de9 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -343,7 +343,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); do { - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_err(bch2_btree_iter_peek_slot(iter)); if (ret) break; @@ -393,7 +393,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) return 0; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL); bch2_btree_iter_unlock(&iter); @@ -407,7 +407,7 @@ static int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca) int ret = 0; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); down_read(&ca->bucket_lock); for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { @@ -826,7 +826,7 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, int ret = 0; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); /* * XXX: if ca->nr_invalidated != 0, just return if we'd block doing the @@ -1862,8 +1862,8 @@ int bch2_dev_allocator_start(struct bch_dev *ca) static void allocator_start_issue_discards(struct bch_fs *c) { struct bch_dev *ca; - unsigned i, dev_iter; - size_t bu; + unsigned dev_iter; + size_t i, bu; for_each_rw_member(ca, c, dev_iter) { unsigned done = 0; diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index 1b9d960b..3bd765b6 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -83,7 +83,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) struct task_struct *p; rcu_read_lock(); - if ((p = ACCESS_ONCE(ca->alloc_thread))) + if ((p = 
READ_ONCE(ca->alloc_thread))) wake_up_process(p); rcu_read_unlock(); } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 78c427fa..298f26d4 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -520,6 +520,8 @@ struct bch_fs { unsigned short block_bits; /* ilog2(block_size) */ + u16 btree_foreground_merge_threshold; + struct closure sb_write; struct mutex sb_lock; @@ -548,6 +550,7 @@ struct bch_fs { mempool_t btree_interior_update_pool; struct list_head btree_interior_update_list; struct mutex btree_interior_update_lock; + struct closure_waitlist btree_interior_update_wait; struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. */ diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 97015084..850ba72c 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -1040,8 +1040,8 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, high_word(f, r), b->nr_key_bits); - EBUG_ON(ret != bkey_cmp(bkey_unpack_key_format_checked(b, l).p, - bkey_unpack_key_format_checked(b, r).p)); + EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), + bkey_unpack_pos(b, r))); return ret; } diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 3b3a09eb..84cdf662 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -72,6 +72,10 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, if (k.k->p.snapshot) return "nonzero snapshot"; + if (type != BKEY_TYPE_BTREE && + !bkey_cmp(k.k->p, POS_MAX)) + return "POS_MAX key"; + return NULL; } diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 02be5bb4..a07d5540 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -264,9 +264,9 @@ void bch2_verify_key_order(struct btree *b, #else -static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) {} +static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + struct btree *b, + struct bkey_packed *k) {} #endif @@ -569,7 +569,7 @@ static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, unsigned j, struct bkey_packed *k) { - BUG_ON(k >= btree_bkey_last(b, t)); + EBUG_ON(k >= btree_bkey_last(b, t)); rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { .offset = __btree_node_key_to_offset(b, k), @@ -619,7 +619,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, { unsigned l = 0, r = t->size; - BUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); while (l < r) { unsigned m = (l + r) >> 1; @@ -630,13 +630,13 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, r = m; } - BUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset < offset); - BUG_ON(l && - rw_aux_tree(b, t)[l - 1].offset >= offset); + EBUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset < offset); + EBUG_ON(l && + rw_aux_tree(b, t)[l - 1].offset >= offset); - BUG_ON(l > r); - BUG_ON(l > t->size); + EBUG_ON(l > r); + EBUG_ON(l > t->size); return l; } @@ -764,16 +764,16 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, #ifdef __LITTLE_ENDIAN shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; - BUG_ON(shift + bits > b->format.key_u64s * 64); + EBUG_ON(shift + bits > b->format.key_u64s * 64); #else shift = high_bit_offset + b->nr_key_bits - exponent - bits; - BUG_ON(shift < KEY_PACKED_BITS_START); + EBUG_ON(shift < KEY_PACKED_BITS_START); #endif - BUG_ON(shift < 0 || shift >= 
BFLOAT_FAILED); + EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); f->exponent = shift; mantissa = bkey_mantissa(m, f, j); @@ -890,6 +890,7 @@ retry: prev = k, k = bkey_next(k); if (k >= btree_bkey_last(b, t)) { + /* XXX: this path sucks */ t->size--; goto retry; } @@ -898,8 +899,8 @@ retry: bkey_float(b, t, j)->key_offset = bkey_to_cacheline_offset(b, t, cacheline++, k); - BUG_ON(tree_to_prev_bkey(b, t, j) != prev); - BUG_ON(tree_to_bkey(b, t, j) != k); + EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); + EBUG_ON(tree_to_bkey(b, t, j) != k); } while (bkey_next(k) != btree_bkey_last(b, t)) @@ -1074,7 +1075,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, struct bkey_packed min_key, max_key; unsigned inorder, j; - BUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); /* signal to make_bfloat() that they're uninitialized: */ min_key.u64s = max_key.u64s = 0; @@ -1148,7 +1149,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, int shift = new_u64s - clobber_u64s; unsigned l, j, where = __btree_node_key_to_offset(b, _where); - BUG_ON(bset_has_ro_aux_tree(t)); + EBUG_ON(bset_has_ro_aux_tree(t)); if (!bset_has_rw_aux_tree(t)) return; @@ -1157,8 +1158,8 @@ static void bch2_bset_fix_lookup_table(struct btree *b, /* l is first >= than @where */ - BUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); - BUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); + EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); + EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); if (!l) /* never delete first entry */ l++; @@ -1189,9 +1190,9 @@ static void bch2_bset_fix_lookup_table(struct btree *b, for (j = l; j < t->size; j++) rw_aux_tree(b, t)[j].offset += shift; - BUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset == - rw_aux_tree(b, t)[l - 1].offset); + EBUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset == + rw_aux_tree(b, t)[l - 1].offset); if (t->size < bset_rw_tree_capacity(b, t) && (l < t->size @@ -1248,8 +1249,8 @@ void bch2_bset_insert(struct btree *b, u64 *src_p = where->_data + clobber_u64s; u64 *dst_p = where->_data + src->u64s; - BUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < - (int) clobber_u64s - src->u64s); + EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < + (int) clobber_u64s - src->u64s); memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); @@ -1277,7 +1278,7 @@ void bch2_bset_delete(struct btree *b, bch2_bset_verify_rw_aux_tree(b, t); - BUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); + EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); @@ -1594,7 +1595,7 @@ struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, { struct btree_node_iter_set *set; - BUG_ON(iter->used > MAX_BSETS); + EBUG_ON(iter->used > MAX_BSETS); btree_node_iter_for_each(iter, set) if (set->end == t->end_offset) @@ -1653,20 +1654,23 @@ void bch2_btree_node_iter_sort(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { +#ifdef CONFIG_BCACHEFS_DEBUG struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b); - +#endif iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; - BUG_ON(iter->data->k > iter->data->end); + EBUG_ON(iter->data->k > iter->data->end); if (iter->data->k == iter->data->end) { - BUG_ON(iter->used == 0); + EBUG_ON(iter->used == 0); 
iter->data[0] = iter->data[--iter->used]; } btree_node_iter_sift(iter, b, 0); +#ifdef CONFIG_BCACHEFS_DEBUG bch2_btree_node_iter_next_check(iter, b, k); +#endif } /* diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index c195cd91..f5a84481 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -188,16 +188,15 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); -static inline struct bkey -bkey_unpack_key_format_checked(const struct btree *b, +static inline void +__bkey_unpack_key_format_checked(const struct btree *b, + struct bkey *dst, const struct bkey_packed *src) { - struct bkey dst; - #ifdef HAVE_BCACHEFS_COMPILED_UNPACK { compiled_unpack_fn unpack_fn = b->aux_data; - unpack_fn(&dst, src); + unpack_fn(dst, src); if (btree_keys_expensive_checks(b)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); @@ -206,17 +205,36 @@ bkey_unpack_key_format_checked(const struct btree *b, * hack around a harmless race when compacting whiteouts * for a write: */ - dst2.needs_whiteout = dst.needs_whiteout; + dst2.needs_whiteout = dst->needs_whiteout; - BUG_ON(memcmp(&dst, &dst2, sizeof(dst))); + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); } } #else - dst = __bch2_bkey_unpack_key(&b->format, src); + *dst = __bch2_bkey_unpack_key(&b->format, src); #endif +} + +static inline struct bkey +bkey_unpack_key_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ + struct bkey dst; + + __bkey_unpack_key_format_checked(b, &dst, src); return dst; } +static inline void __bkey_unpack_key(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (likely(bkey_packed(src))) + __bkey_unpack_key_format_checked(b, dst, src); + else + *dst = *packed_to_bkey_c(src); +} + /** * bkey_unpack_key -- unpack just the key, not the value */ @@ -253,7 +271,7 @@ static inline struct bkey_s_c bkey_disassemble(struct btree *b, const struct bkey_packed *k, struct bkey *u) { - *u = bkey_unpack_key(b, k); + __bkey_unpack_key(b, u, k); return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; } @@ -263,7 +281,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, struct bkey_packed *k, struct bkey *u) { - *u = bkey_unpack_key(b, k); + __bkey_unpack_key(b, u, k); return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; } @@ -353,15 +371,6 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, } /* Returns true if @k is after iterator position @pos */ -static inline bool btree_iter_pos_cmp(struct bpos pos, const struct bkey *k, - bool strictly_greater) -{ - int cmp = bkey_cmp(k->p, pos); - - return cmp > 0 || - (cmp == 0 && !strictly_greater && !bkey_deleted(k)); -} - static inline bool btree_iter_pos_cmp_packed(const struct btree *b, struct bpos *pos, const struct bkey_packed *k, @@ -493,14 +502,14 @@ static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, static inline struct bkey_packed * __bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) + struct btree *b) { return __btree_node_offset_to_key(b, iter->data->k); } static inline struct bkey_packed * bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) + struct btree *b) { return bch2_btree_node_iter_end(iter) ? 
NULL diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 78e36299..0bde449e 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -57,10 +57,20 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) list_move(&b->list, &bc->freed); } +static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct btree *b = obj; + const u64 *v = arg->key; + + return PTR_HASH(&b->key) == *v ? 0 : 1; +} + static const struct rhashtable_params bch_btree_cache_params = { .head_offset = offsetof(struct btree, hash), .key_offset = offsetof(struct btree, key.v), .key_len = sizeof(struct bch_extent_ptr), + .obj_cmpfn = bch2_btree_cache_cmp_fn, }; static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) @@ -337,6 +347,9 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) while (!list_empty(&bc->live)) { b = list_first_entry(&bc->live, struct btree, list); + BUG_ON(btree_node_read_in_flight(b) || + btree_node_write_in_flight(b)); + if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); clear_btree_node_dirty(b); @@ -736,7 +749,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct btree *ret; unsigned level = b->level; - parent = iter->nodes[level + 1]; + parent = btree_iter_node(iter, level + 1); if (!parent) return NULL; @@ -745,7 +758,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, return ERR_PTR(-EINTR); } - node_iter = iter->node_iters[parent->level]; + node_iter = iter->l[parent->level].iter; k = bch2_btree_node_iter_peek_all(&node_iter, parent); BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); @@ -774,9 +787,13 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent); } - if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) { - six_unlock_intent(&ret->lock); - ret = ERR_PTR(-EINTR); + if (!bch2_btree_node_relock(iter, level)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + if (!IS_ERR(ret)) { + six_unlock_intent(&ret->lock); + ret = ERR_PTR(-EINTR); + } } return ret; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 43e64d27..63508663 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -589,7 +589,7 @@ static void recalc_packed_keys(struct btree *b) static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, struct btree *old_nodes[GC_MERGE_NODES]) { - struct btree *parent = iter->nodes[old_nodes[0]->level + 1]; + struct btree *parent = btree_node_parent(iter, old_nodes[0]); unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; unsigned blocks = btree_blocks(c) * 2 / 3; struct btree *new_nodes[GC_MERGE_NODES]; @@ -775,7 +775,7 @@ next: BUG_ON(!bch2_keylist_empty(&keylist)); - BUG_ON(iter->nodes[old_nodes[0]->level] != old_nodes[0]); + BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]); BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0])); @@ -868,7 +868,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) * and the nodes in our sliding window might not have the same * parent anymore - blow away the sliding window: */ - if (iter.nodes[iter.level + 1] && + if (btree_iter_node(&iter, iter.level + 1) && !btree_node_intent_locked(&iter, iter.level + 1)) memset(merge + 1, 0, (GC_MERGE_NODES - 1) * sizeof(merge[0])); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index fb76b29e..9b4eff1c 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c 
@@ -266,19 +266,16 @@ static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, bool compacting, enum compact_mode mode) { - unsigned live_u64s = b->nr.bset_u64s[t - b->set]; unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - - if (live_u64s == bset_u64s) - return 0; + unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; if (mode == COMPACT_LAZY) { - if (live_u64s * 4 < bset_u64s * 3 || + if (should_compact_bset_lazy(b, t) || (compacting && bset_unwritten(b, bset(b, t)))) - return bset_u64s - live_u64s; + return dead_u64s; } else { if (bset_written(b, bset(b, t))) - return bset_u64s - live_u64s; + return dead_u64s; } return 0; @@ -426,7 +423,7 @@ static bool bch2_drop_whiteouts(struct btree *b) struct bset *i = bset(b, t); struct bkey_packed *k, *n, *out, *start, *end; - if (!should_compact_bset(b, t, true, true)) + if (!should_compact_bset(b, t, true, COMPACT_WRITTEN)) continue; start = btree_bkey_first(b, t); @@ -830,13 +827,13 @@ void bch2_btree_build_aux_trees(struct btree *b) * Returns true if we sorted (i.e. invalidated iterators */ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) + struct btree_iter *iter) { struct btree_node_entry *bne; bool did_sort; EBUG_ON(!(b->lock.state.seq & 1)); - EBUG_ON(iter && iter->nodes[b->level] != b); + EBUG_ON(iter && iter->l[b->level].b != b); did_sort = btree_node_compact(c, b, iter); @@ -1297,8 +1294,8 @@ static void btree_node_read_work(struct work_struct *work) do { bch_info(c, "retrying read"); bio_reset(bio); + bio_set_dev(bio, rb->pick.ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; - bio->bi_bdev = rb->pick.ca->disk_sb.bdev; bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); submit_bio_wait(bio); @@ -1333,7 +1330,7 @@ static void btree_node_read_endio(struct bio *bio) bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ); INIT_WORK(&rb->work, btree_node_read_work); - schedule_work(&rb->work); + queue_work(system_unbound_wq, &rb->work); } void bch2_btree_node_read(struct bch_fs *c, struct btree *b, @@ -1357,8 +1354,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, rb->c = c; rb->start_time = local_clock(); rb->pick = pick; + bio_set_dev(bio, pick.ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; - bio->bi_bdev = pick.ca->disk_sb.bdev; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); bch2_bio_map(bio, b->data); @@ -1470,15 +1467,13 @@ retry: goto err; /* has node been freed? 
*/ - if (iter.nodes[b->level] != b) { + if (iter.l[b->level].b != b) { /* node has been freed: */ - if (!btree_node_dying(b)) - panic("foo4\n"); + BUG_ON(!btree_node_dying(b)); goto out; } - if (!btree_node_hashed(b)) - panic("foo5\n"); + BUG_ON(!btree_node_hashed(b)); bkey_copy(&tmp.k, &b->key); @@ -1583,7 +1578,7 @@ static void btree_node_write_endio(struct bio *bio) container_of(orig, struct btree_write_bio, wbio); INIT_WORK(&wb->work, btree_node_write_work); - schedule_work(&wb->work); + queue_work(system_unbound_wq, &wb->work); } } @@ -1664,8 +1659,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); - if (lock_type_held == SIX_LOCK_intent) { - six_lock_write(&b->lock); + /* + * We can't block on six_lock_write() here; another thread might be + * trying to get a journal reservation with read locks held, and getting + * a journal reservation might be blocked on flushing the journal and + * doing btree writes: + */ + if (lock_type_held == SIX_LOCK_intent && + six_trylock_write(&b->lock)) { __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); six_unlock_write(&b->lock); } else { @@ -1900,13 +1901,12 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, BUG_ON(lock_type_held == SIX_LOCK_write); if (lock_type_held == SIX_LOCK_intent || - six_trylock_convert(&b->lock, SIX_LOCK_read, - SIX_LOCK_intent)) { + six_lock_tryupgrade(&b->lock)) { __bch2_btree_node_write(c, b, SIX_LOCK_intent); /* don't cycle lock unnecessarily: */ - if (btree_node_just_written(b)) { - six_lock_write(&b->lock); + if (btree_node_just_written(b) && + six_trylock_write(&b->lock)) { bch2_btree_post_write_cleanup(c, b); six_unlock_write(&b->lock); } @@ -1918,6 +1918,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, } } +static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + unsigned i; +restart: + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (test_bit(flag, &b->flags)) { + rcu_read_unlock(); + wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + goto restart; + + } + rcu_read_unlock(); +} + +void bch2_btree_flush_all_reads(struct bch_fs *c) +{ + __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); +} + +void bch2_btree_flush_all_writes(struct bch_fs *c) +{ + __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} + void bch2_btree_verify_flushed(struct bch_fs *c) { struct bucket_table *tbl; @@ -1926,7 +1954,46 @@ void bch2_btree_verify_flushed(struct bch_fs *c) unsigned i; rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - BUG_ON(btree_node_dirty(b)); + for_each_cached_btree(b, c, tbl, i, pos) { + unsigned long flags = READ_ONCE(b->flags); + + BUG_ON((flags & (1 << BTREE_NODE_dirty)) || + (flags & (1 << BTREE_NODE_write_in_flight))); + } + rcu_read_unlock(); +} + +ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) +{ + char *out = buf, *end = buf + PAGE_SIZE; + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + unsigned i; + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) { + unsigned long flags = READ_ONCE(b->flags); + unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0; + + if (//!(flags & (1 << BTREE_NODE_dirty)) && + !b->writes[0].wait.list.first && + !b->writes[1].wait.list.first && + !(b->will_make_reachable & 1)) + continue; + + out += scnprintf(out, end - out, "%p d %u l 
%u w %u b %u r %u:%lu c %u p %u\n", + b, + (flags & (1 << BTREE_NODE_dirty)) != 0, + b->level, + b->written, + !list_empty_careful(&b->write_blocked), + b->will_make_reachable != 0, + b->will_make_reachable & 1, + b->writes[ idx].wait.list.first != NULL, + b->writes[!idx].wait.list.first != NULL); + } rcu_read_unlock(); + + return out - buf; } diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 66053d53..f2790716 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -57,21 +57,23 @@ enum compact_mode { bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); +static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) +{ + unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); + unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; + + return dead_u64s > 128 && dead_u64s * 3 > bset_u64s; +} + static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) { struct bset_tree *t; - for_each_bset(b, t) { - unsigned live_u64s = b->nr.bset_u64s[t - b->set]; - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - - if (live_u64s * 4 < bset_u64s * 3) - goto compact; - } + for_each_bset(b, t) + if (should_compact_bset_lazy(b, t)) + return __bch2_compact_whiteouts(c, b, COMPACT_LAZY); return false; -compact: - return __bch2_compact_whiteouts(c, b, COMPACT_LAZY); } void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); @@ -134,6 +136,9 @@ do { \ } \ } while (0) +void bch2_btree_flush_all_reads(struct bch_fs *); +void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_btree_verify_flushed(struct bch_fs *); +ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index ee463f36..21a6cbc2 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -10,11 +10,15 @@ #include #include +static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, + struct btree_iter_level *, + struct bkey *); + #define BTREE_ITER_NOT_END ((struct btree *) 1) static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { - return iter->nodes[l] && iter->nodes[l] != BTREE_ITER_NOT_END; + return iter->l[l].b && iter->l[l].b != BTREE_ITER_NOT_END; } /* Btree node locking: */ @@ -27,7 +31,7 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) { struct btree_iter *linked; - EBUG_ON(iter->nodes[b->level] != b); + EBUG_ON(iter->l[b->level].b != b); EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq); for_each_linked_btree_node(iter, b, linked) @@ -43,14 +47,14 @@ void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; unsigned readers = 0; - EBUG_ON(iter->nodes[b->level] != b); + EBUG_ON(iter->l[b->level].b != b); EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); if (six_trylock_write(&b->lock)) return; for_each_linked_btree_iter(iter, linked) - if (linked->nodes[b->level] == b && + if (linked->l[b->level].b == b && btree_node_read_locked(linked, b->level)) readers++; @@ -71,10 +75,10 @@ void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) } } -bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) { struct btree_iter *linked; - struct btree *b = iter->nodes[level]; + struct btree *b = iter->l[level].b; int want = btree_lock_want(iter, level); int have = 
btree_node_locked_type(iter, level); @@ -93,7 +97,7 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level) goto success; for_each_linked_btree_iter(iter, linked) - if (linked->nodes[level] == b && + if (linked->l[level].b == b && btree_node_locked_type(linked, level) == want && iter->lock_seq[level] == b->lock.state.seq) { btree_node_unlock(iter, level); @@ -112,10 +116,16 @@ bool bch2_btree_iter_relock(struct btree_iter *iter) { unsigned l; - for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++) - if (!bch2_btree_node_relock(iter, l)) + for (l = iter->level; + l < max_t(unsigned, iter->locks_want, 1) && iter->l[l].b; + l++) + if (!bch2_btree_node_relock(iter, l)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); return false; + } + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + iter->uptodate = BTREE_ITER_NEED_PEEK; return true; } @@ -139,7 +149,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, iter->nodes_locked != iter->nodes_intent_locked); for_each_linked_btree_iter(iter, linked) - if (linked->nodes[level] == b && + if (linked->l[level].b == b && btree_node_locked_type(linked, level) == type) { six_lock_increment(&b->lock, type); return true; @@ -212,7 +222,7 @@ static void btree_iter_drop_extra_locks(struct btree_iter *iter) btree_node_unlock(iter, l); } else { if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->nodes[l]->lock); + six_lock_downgrade(&iter->l[l].b->lock); iter->nodes_intent_locked ^= 1 << l; } break; @@ -255,7 +265,7 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter, static void __bch2_btree_iter_unlock(struct btree_iter *iter) { - iter->flags &= ~BTREE_ITER_UPTODATE; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); while (iter->nodes_locked) btree_node_unlock(iter, __ffs(iter->nodes_locked)); @@ -277,13 +287,13 @@ int bch2_btree_iter_unlock(struct btree_iter *iter) #ifdef CONFIG_BCACHEFS_DEBUG static void __bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) + struct btree *b) { - struct btree_node_iter *node_iter = &iter->node_iters[b->level]; - struct btree_node_iter tmp = *node_iter; + struct btree_iter_level *l = &iter->l[b->level]; + struct btree_node_iter tmp = l->iter; struct bkey_packed *k; - bch2_btree_node_iter_verify(node_iter, b); + bch2_btree_node_iter_verify(&l->iter, b); /* * For interior nodes, the iterator will have skipped past @@ -302,7 +312,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, buf, iter->pos.inode, iter->pos.offset); } - k = bch2_btree_node_iter_peek_all(node_iter, b); + k = bch2_btree_node_iter_peek_all(&l->iter, b); if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k, iter->flags & BTREE_ITER_IS_EXTENTS)) { char buf[100]; @@ -318,7 +328,7 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - if (iter->nodes[b->level] == b) + if (iter->l[b->level].b == b) __bch2_btree_iter_verify(iter, b); for_each_linked_btree_node(iter, b, linked) @@ -348,8 +358,15 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->flags & BTREE_ITER_IS_EXTENTS)) + iter->flags & BTREE_ITER_IS_EXTENTS)) { bch2_btree_node_iter_push(node_iter, b, where, end); + + if (!b->level && + node_iter == &iter->l[0].iter) + bkey_disassemble(b, + bch2_btree_node_iter_peek_all(node_iter, b), + &iter->k); + } return; found: set->end = (int) set->end + 
shift; @@ -362,16 +379,20 @@ found: btree_iter_pos_cmp_packed(b, &iter->pos, where, iter->flags & BTREE_ITER_IS_EXTENTS)) { set->k = offset; - bch2_btree_node_iter_sort(node_iter, b); } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; if (set->k == set->end) *set = node_iter->data[--node_iter->used]; - bch2_btree_node_iter_sort(node_iter, b); } else { set->k = (int) set->k + shift; + goto iter_current_key_not_modified; } + bch2_btree_node_iter_sort(node_iter, b); + if (!b->level && node_iter == &iter->l[0].iter) + __btree_iter_peek_all(iter, &iter->l[0], &iter->k); +iter_current_key_not_modified: + /* * Interior nodes are special because iterators for interior nodes don't * obey the usual invariants regarding the iterator position: @@ -439,18 +460,18 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, { struct btree_iter *linked; - if (node_iter != &iter->node_iters[b->level]) + if (node_iter != &iter->l[b->level].iter) __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - if (iter->nodes[b->level] == b) + if (iter->l[b->level].b == b) __bch2_btree_node_iter_fix(iter, b, - &iter->node_iters[b->level], t, + &iter->l[b->level].iter, t, where, clobber_u64s, new_u64s); for_each_linked_btree_node(iter, b, linked) __bch2_btree_node_iter_fix(linked, b, - &linked->node_iters[b->level], t, + &linked->l[b->level].iter, t, where, clobber_u64s, new_u64s); /* interior node iterators are... special... */ @@ -458,51 +479,49 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, bch2_btree_iter_verify(iter, b); } -/* peek_all() doesn't skip deleted keys */ -static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter) +static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, + struct btree_iter_level *l, + struct bkey *u, + struct bkey_packed *k) { - struct btree *b = iter->nodes[iter->level]; - struct bkey_packed *k = - bch2_btree_node_iter_peek_all(&iter->node_iters[iter->level], b); struct bkey_s_c ret; - EBUG_ON(!btree_node_locked(iter, iter->level)); - - if (!k) + if (unlikely(!k)) { + /* + * signal to bch2_btree_iter_peek_slot() that we're currently at + * a hole + */ + u->type = KEY_TYPE_DELETED; return bkey_s_c_null; + } - ret = bkey_disassemble(b, k, &iter->k); + ret = bkey_disassemble(l->b, k, u); if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, b, ret); + bch2_bkey_debugcheck(iter->c, l->b, ret); return ret; } -static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter) +/* peek_all() doesn't skip deleted keys */ +static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, + struct btree_iter_level *l, + struct bkey *u) { - struct btree *b = iter->nodes[iter->level]; - struct bkey_packed *k = - bch2_btree_node_iter_peek(&iter->node_iters[iter->level], b); - struct bkey_s_c ret; - - EBUG_ON(!btree_node_locked(iter, iter->level)); - - if (!k) - return bkey_s_c_null; - - ret = bkey_disassemble(b, k, &iter->k); - - if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, b, ret); + return __btree_iter_unpack(iter, l, u, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); +} - return ret; +static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, + struct btree_iter_level *l) +{ + return __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&l->iter, l->b)); } -static inline void __btree_iter_advance(struct btree_iter *iter) +static inline void __btree_iter_advance(struct btree_iter_level *l) { - 
bch2_btree_node_iter_advance(&iter->node_iters[iter->level], - iter->nodes[iter->level]); + bch2_btree_node_iter_advance(&l->iter, l->b); } /* @@ -510,24 +529,28 @@ static inline void __btree_iter_advance(struct btree_iter *iter) */ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) { + struct btree_iter_level *l; + unsigned plevel; bool parent_locked; struct bkey_packed *k; - if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || - !iter->nodes[b->level + 1]) + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) return; - parent_locked = btree_node_locked(iter, b->level + 1); + plevel = b->level + 1; + if (!btree_iter_node(iter, plevel)) + return; - if (!bch2_btree_node_relock(iter, b->level + 1)) + parent_locked = btree_node_locked(iter, plevel); + + if (!bch2_btree_node_relock(iter, plevel)) return; - k = bch2_btree_node_iter_peek_all(&iter->node_iters[b->level + 1], - iter->nodes[b->level + 1]); + l = &iter->l[plevel]; + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); if (!k || bkey_deleted(k) || - bkey_cmp_left_packed(iter->nodes[b->level + 1], - k, &b->key.k.p)) { + bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); @@ -540,16 +563,21 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->level + 1); } -static inline void __btree_iter_init(struct btree_iter *iter, - struct btree *b) +/* Returns true if @k is after iterator position @pos */ +static inline bool btree_iter_pos_cmp(struct btree_iter *iter, + const struct bkey *k) { - bch2_btree_node_iter_init(&iter->node_iters[b->level], b, iter->pos, - iter->flags & BTREE_ITER_IS_EXTENTS, - btree_node_is_extents(b)); + int cmp = bkey_cmp(k->p, iter->pos); - /* Skip to first non whiteout: */ - if (b->level) - bch2_btree_node_iter_peek(&iter->node_iters[b->level], b); + return cmp > 0 || + (cmp == 0 && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k)); +} + +static inline bool btree_iter_pos_after_node(struct btree_iter *iter, + struct btree *b) +{ + return !btree_iter_pos_cmp(iter, &b->key.k); } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -557,8 +585,21 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter, { return iter->btree_id == b->btree_id && bkey_cmp(iter->pos, b->data->min_key) >= 0 && - btree_iter_pos_cmp(iter->pos, &b->key.k, - iter->flags & BTREE_ITER_IS_EXTENTS); + !btree_iter_pos_after_node(iter, b); +} + +static inline void __btree_iter_init(struct btree_iter *iter, + struct btree *b) +{ + struct btree_iter_level *l = &iter->l[b->level]; + + bch2_btree_node_iter_init(&l->iter, b, iter->pos, + iter->flags & BTREE_ITER_IS_EXTENTS, + btree_node_is_extents(b)); + + /* Skip to first non whiteout: */ + if (b->level) + bch2_btree_node_iter_peek(&l->iter, b); } static inline void btree_iter_node_set(struct btree_iter *iter, @@ -570,7 +611,7 @@ static inline void btree_iter_node_set(struct btree_iter *iter, EBUG_ON(b->lock.state.seq & 1); iter->lock_seq[b->level] = b->lock.state.seq; - iter->nodes[b->level] = b; + iter->l[b->level].b = b; __btree_iter_init(iter, b); } @@ -632,10 +673,10 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) { unsigned level = b->level; - if (iter->nodes[level] == b) { - iter->flags &= ~BTREE_ITER_UPTODATE; + if (iter->l[level].b == b) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); btree_node_unlock(iter, level); - iter->nodes[level] = BTREE_ITER_NOT_END; + iter->l[level].b = BTREE_ITER_NOT_END; } } @@ 
-674,7 +715,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, * that depth */ iter->level = depth_want; - iter->nodes[iter->level] = NULL; + iter->l[iter->level].b = NULL; return 0; } @@ -687,8 +728,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, b->level == iter->level && !race_fault())) { for (i = 0; i < iter->level; i++) - iter->nodes[i] = BTREE_ITER_NOT_END; - iter->nodes[iter->level] = b; + iter->l[i].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = b; mark_btree_node_locked(iter, iter->level, lock_type); btree_iter_node_set(iter, b); @@ -703,52 +744,57 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, noinline static void btree_iter_prefetch(struct btree_iter *iter) { - struct btree *b = iter->nodes[iter->level + 1]; - struct btree_node_iter node_iter = iter->node_iters[iter->level + 1]; + struct btree_iter_level *l = &iter->l[iter->level]; + struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; BKEY_PADDED(k) tmp; - unsigned nr = iter->level ? 1 : 8; - bool was_locked = btree_node_locked(iter, iter->level + 1); + unsigned nr = iter->level > 1 ? 1 : 8; + bool was_locked = btree_node_locked(iter, iter->level); while (nr) { - if (!bch2_btree_node_relock(iter, iter->level + 1)) + if (!bch2_btree_node_relock(iter, iter->level)) return; - bch2_btree_node_iter_advance(&node_iter, b); - k = bch2_btree_node_iter_peek(&node_iter, b); + bch2_btree_node_iter_advance(&node_iter, l->b); + k = bch2_btree_node_iter_peek(&node_iter, l->b); if (!k) break; - bch2_bkey_unpack(b, &tmp.k, k); + bch2_bkey_unpack(l->b, &tmp.k, k); bch2_btree_node_prefetch(iter->c, &tmp.k, - iter->level, iter->btree_id); + iter->level - 1, + iter->btree_id); } if (!was_locked) - btree_node_unlock(iter, iter->level + 1); + btree_node_unlock(iter, iter->level); } static inline int btree_iter_down(struct btree_iter *iter) { + struct btree_iter_level *l = &iter->l[iter->level]; struct btree *b; - struct bkey_s_c k = __btree_iter_peek(iter); unsigned level = iter->level - 1; enum six_lock_type lock_type = btree_lock_want(iter, level); BKEY_PADDED(k) tmp; - bkey_reassemble(&tmp.k, k); + BUG_ON(!btree_node_locked(iter, iter->level)); + + bch2_bkey_unpack(l->b, &tmp.k, + bch2_btree_node_iter_peek(&l->iter, l->b)); b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type); if (unlikely(IS_ERR(b))) return PTR_ERR(b); - iter->level = level; mark_btree_node_locked(iter, level, lock_type); btree_iter_node_set(iter, b); if (iter->flags & BTREE_ITER_PREFETCH) btree_iter_prefetch(iter); + iter->level = level; + return 0; } @@ -829,7 +875,7 @@ io_error: BUG_ON(ret != -EIO); iter->flags |= BTREE_ITER_ERROR; - iter->nodes[iter->level] = NULL; + iter->l[iter->level].b = NULL; goto out; } @@ -846,22 +892,22 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { unsigned depth_want = iter->level; - if (unlikely(!iter->nodes[iter->level])) + if (unlikely(!iter->l[iter->level].b)) return 0; - iter->flags &= ~(BTREE_ITER_UPTODATE|BTREE_ITER_AT_END_OF_LEAF); + iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF; /* make sure we have all the intent locks we need - ugh */ - if (unlikely(iter->nodes[iter->level] && + if (unlikely(iter->l[iter->level].b && iter->level + 1 < iter->locks_want)) { unsigned i; for (i = iter->level + 1; - i < iter->locks_want && iter->nodes[i]; + i < iter->locks_want && iter->l[i].b; i++) if (!bch2_btree_node_relock(iter, i)) { while (iter->level < BTREE_MAX_DEPTH && - iter->nodes[iter->level] && + iter->l[iter->level].b && iter->level + 
1 < iter->locks_want) btree_iter_up(iter); break; @@ -872,27 +918,31 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * If the current node isn't locked, go up until we have a locked node * or run out of nodes: */ - while (iter->level < BTREE_MAX_DEPTH && - iter->nodes[iter->level] && + while (btree_iter_node(iter, iter->level) && !(is_btree_node(iter, iter->level) && bch2_btree_node_relock(iter, iter->level) && - btree_iter_pos_cmp(iter->pos, - &iter->nodes[iter->level]->key.k, - iter->flags & BTREE_ITER_IS_EXTENTS))) + + /* + * XXX: correctly using BTREE_ITER_UPTODATE should make + * comparing iter->pos against node's key unnecessary + */ + btree_iter_pos_in_node(iter, iter->l[iter->level].b))) btree_iter_up(iter); /* * If we've got a btree node locked (i.e. we aren't about to relock the * root) - advance its node iterator if necessary: + * + * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary */ - if (iter->level < BTREE_MAX_DEPTH && - iter->nodes[iter->level]) { + if (btree_iter_node(iter, iter->level)) { + struct btree_iter_level *l = &iter->l[iter->level]; struct bkey_s_c k; + struct bkey u; - while ((k = __btree_iter_peek_all(iter)).k && - !btree_iter_pos_cmp(iter->pos, k.k, - iter->flags & BTREE_ITER_IS_EXTENTS)) - __btree_iter_advance(iter); + while ((k = __btree_iter_peek_all(iter, l, &u)).k && + !btree_iter_pos_cmp(iter, k.k)) + __btree_iter_advance(l); } /* @@ -902,16 +952,17 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * btree_iter_lock_root() comes next and that it can't fail */ while (iter->level > depth_want) { - int ret = iter->nodes[iter->level] + int ret = btree_iter_node(iter, iter->level) ? btree_iter_down(iter) : btree_iter_lock_root(iter, depth_want); if (unlikely(ret)) { iter->level = depth_want; - iter->nodes[iter->level] = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NOT_END; return ret; } } + iter->uptodate = BTREE_ITER_NEED_PEEK; return 0; } @@ -919,6 +970,9 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; + if (iter->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + ret = __bch2_btree_iter_traverse(iter); if (unlikely(ret)) ret = btree_iter_traverse_error(iter, ret); @@ -939,7 +993,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) if (ret) return ERR_PTR(ret); - b = iter->nodes[iter->level]; + b = iter->l[iter->level].b; if (b) { EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); @@ -958,16 +1012,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) btree_iter_up(iter); - if (iter->level == BTREE_MAX_DEPTH || - !iter->nodes[iter->level]) + if (!btree_iter_node(iter, iter->level)) return NULL; /* parent node usually won't be locked: redo traversal if necessary */ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ret = bch2_btree_iter_traverse(iter); if (ret) return NULL; - b = iter->nodes[iter->level]; + b = iter->l[iter->level].b; if (!b) return b; @@ -980,11 +1034,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) : bkey_successor(iter->pos); iter->level = depth; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ret = bch2_btree_iter_traverse(iter); if (ret) return NULL; - b = iter->nodes[iter->level]; + b = iter->l[iter->level].b; } iter->pos = b->key.k.p; @@ -996,179 +1051,259 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) { - struct 
btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct btree_iter_level *l = &iter->l[0]; struct bkey_packed *k; EBUG_ON(iter->level != 0); EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); EBUG_ON(!btree_node_locked(iter, 0)); - EBUG_ON(bkey_cmp(new_pos, b->key.k.p) > 0); + EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); - while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && - !btree_iter_pos_cmp_packed(b, &new_pos, k, + iter->pos = new_pos; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && + !btree_iter_pos_cmp_packed(l->b, &iter->pos, k, iter->flags & BTREE_ITER_IS_EXTENTS)) - bch2_btree_node_iter_advance(node_iter, b); + __btree_iter_advance(l); - if (!k && - !btree_iter_pos_cmp(new_pos, &b->key.k, - iter->flags & BTREE_ITER_IS_EXTENTS)) + if (!k && btree_iter_pos_after_node(iter, l->b)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); iter->flags |= BTREE_ITER_AT_END_OF_LEAF; - - iter->pos = new_pos; - iter->flags &= ~BTREE_ITER_UPTODATE; + } } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */ iter->pos = new_pos; - iter->flags &= ~BTREE_ITER_UPTODATE; -} - -void bch2_btree_iter_advance_pos(struct btree_iter *iter) -{ - if (iter->flags & BTREE_ITER_UPTODATE && - !(iter->flags & BTREE_ITER_WITH_HOLES)) { - struct bkey_s_c k; - - __btree_iter_advance(iter); - k = __btree_iter_peek(iter); - if (likely(k.k)) { - iter->pos = bkey_start_pos(k.k); - return; - } - } - - /* - * We use iter->k instead of iter->pos for extents: iter->pos will be - * equal to the start of the extent we returned, but we need to advance - * to the end of the extent we returned. 
- */ - bch2_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); -} - -/* XXX: expensive */ -void bch2_btree_iter_rewind(struct btree_iter *iter, struct bpos pos) -{ - /* incapable of rewinding across nodes: */ - BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0); - iter->pos = pos; - iter->flags &= ~BTREE_ITER_UPTODATE; - __btree_iter_init(iter, iter->nodes[iter->level]); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { + struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS)); + EBUG_ON(iter->flags & BTREE_ITER_SLOTS); - if (iter->flags & BTREE_ITER_UPTODATE) { - struct btree *b = iter->nodes[0]; + if (iter->uptodate == BTREE_ITER_UPTODATE) { struct bkey_packed *k = - __bch2_btree_node_iter_peek_all(&iter->node_iters[0], b); + __bch2_btree_node_iter_peek_all(&l->iter, l->b); struct bkey_s_c ret = { .k = &iter->k, - .v = bkeyp_val(&b->format, k) + .v = bkeyp_val(&l->b->format, k) }; EBUG_ON(!btree_node_locked(iter, 0)); if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, b, ret); + bch2_bkey_debugcheck(iter->c, l->b, ret); return ret; } + if (iter->uptodate == BTREE_ITER_END) + return bkey_s_c_null; + while (1) { ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) { - iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); + if (unlikely(ret)) return bkey_s_c_err(ret); - } - k = __btree_iter_peek(iter); - if (likely(k.k)) { - /* - * iter->pos should always be equal to the key we just - * returned - except extents can straddle iter->pos: - */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - iter->pos = bkey_start_pos(k.k); - - iter->flags |= BTREE_ITER_UPTODATE; - return k; - } - - iter->pos = iter->nodes[0]->key.k.p; + k = __btree_iter_peek(iter, l); + if (likely(k.k)) + break; + /* got to the end of the leaf, iterator needs to be traversed: */ + iter->pos = l->b->key.k.p; if (!bkey_cmp(iter->pos, POS_MAX)) { - iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); - bch2_btree_iter_unlock(iter); + iter->uptodate = BTREE_ITER_END; return bkey_s_c_null; } iter->pos = btree_type_successor(iter->btree_id, iter->pos); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + } + + /* + * iter->pos should always be equal to the key we just + * returned - except extents can straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); + + iter->uptodate = BTREE_ITER_UPTODATE; + return k; +} + +static noinline +struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + + iter->pos = l->b->key.k.p; + if (!bkey_cmp(iter->pos, POS_MAX)) { + iter->uptodate = BTREE_ITER_END; + return bkey_s_c_null; } + + iter->pos = btree_type_successor(iter->btree_id, iter->pos); + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + return bch2_btree_iter_peek(iter); } -struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *p; struct bkey_s_c k; - struct bkey n; - int ret; EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS)); + EBUG_ON(iter->flags & BTREE_ITER_SLOTS); - iter->flags &= ~BTREE_ITER_UPTODATE; + if 
(unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + k = bch2_btree_iter_peek(iter); + if (IS_ERR_OR_NULL(k.k)) + return k; + } - while (1) { + do { + __btree_iter_advance(l); + p = bch2_btree_node_iter_peek_all(&l->iter, l->b); + if (unlikely(!p)) + return bch2_btree_iter_peek_next_leaf(iter); + } while (bkey_deleted(p)); + + k = __btree_iter_unpack(iter, l, &iter->k, p); + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0); + iter->pos = bkey_start_pos(k.k); + return k; +} + +static inline struct bkey_s_c +__bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + struct bkey n; + int ret; + +recheck: + while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && + bkey_deleted(k.k) && + bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) + __btree_iter_advance(l); + + if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); + EBUG_ON(bkey_deleted(k.k)); + iter->uptodate = BTREE_ITER_UPTODATE; + return k; + } + + /* + * If we got to the end of the node, check if we need to traverse to the + * next node: + */ + if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) { - iter->k = KEY(iter->pos.inode, iter->pos.offset, 0); + if (unlikely(ret)) return bkey_s_c_err(ret); - } - k = __btree_iter_peek_all(iter); -recheck: - if (!k.k || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) { - /* hole */ - bkey_init(&n); - n.p = iter->pos; - - if (iter->flags & BTREE_ITER_IS_EXTENTS) { - if (n.p.offset == KEY_OFFSET_MAX) { - iter->pos = bkey_successor(iter->pos); - goto recheck; - } - - if (!k.k) - k.k = &iter->nodes[0]->key.k; - - bch2_key_resize(&n, - min_t(u64, KEY_SIZE_MAX, - (k.k->p.inode == n.p.inode - ? bkey_start_offset(k.k) - : KEY_OFFSET_MAX) - - n.p.offset)); - - EBUG_ON(!n.size); + goto recheck; + } + + /* hole */ + bkey_init(&n); + n.p = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (n.p.offset == KEY_OFFSET_MAX) { + if (n.p.inode == KEY_INODE_MAX) { + iter->uptodate = BTREE_ITER_END; + return bkey_s_c_null; } - iter->k = n; - return (struct bkey_s_c) { &iter->k, NULL }; - } else if (!bkey_deleted(k.k)) { - return k; - } else { - __btree_iter_advance(iter); + iter->pos = bkey_successor(iter->pos); + goto recheck; } + + if (!k.k) + k.k = &l->b->key.k; + + bch2_key_resize(&n, + min_t(u64, KEY_SIZE_MAX, + (k.k->p.inode == n.p.inode + ? 
bkey_start_offset(k.k) + : KEY_OFFSET_MAX) - + n.p.offset)); + + EBUG_ON(!n.size); } + + iter->k = n; + iter->uptodate = BTREE_ITER_UPTODATE; + return (struct bkey_s_c) { &iter->k, NULL }; +} + +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + int ret; + + EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != + (iter->btree_id == BTREE_ID_EXTENTS)); + EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS)); + + if (iter->uptodate == BTREE_ITER_UPTODATE) { + struct bkey_s_c ret = { .k = &iter->k };; + + if (!bkey_deleted(&iter->k)) + ret.v = bkeyp_val(&l->b->format, + __bch2_btree_node_iter_peek_all(&l->iter, l->b)); + + EBUG_ON(!btree_node_locked(iter, 0)); + + if (debug_check_bkeys(iter->c)) + bch2_bkey_debugcheck(iter->c, l->b, ret); + return ret; + } + + if (iter->uptodate == BTREE_ITER_END) + return bkey_s_c_null; + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + return __bch2_btree_iter_peek_slot(iter); +} + +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) +{ + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + struct bkey_s_c k; + + k = bch2_btree_iter_peek_slot(iter); + if (btree_iter_err(k)) + return k; + } + + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); + + if (!bkey_deleted(&iter->k)) + __btree_iter_advance(&iter->l[0]); + + return __bch2_btree_iter_peek_slot(iter); } void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, @@ -1176,19 +1311,23 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, unsigned locks_want, unsigned depth, unsigned flags) { + unsigned i; + EBUG_ON(depth >= BTREE_MAX_DEPTH); EBUG_ON(locks_want > BTREE_MAX_DEPTH); iter->c = c; iter->pos = pos; iter->flags = flags; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; iter->level = depth; iter->locks_want = locks_want; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; - memset(iter->nodes, 0, sizeof(iter->nodes)); - iter->nodes[iter->level] = BTREE_ITER_NOT_END; + for (i = 0; i < ARRAY_SIZE(iter->l); i++) + iter->l[i].b = NULL; + iter->l[iter->level].b = BTREE_ITER_NOT_END; iter->next = iter; prefetch(c->btree_roots[btree_id].b); @@ -1236,4 +1375,5 @@ void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) __bch2_btree_iter_unlock(dst); memcpy(dst, src, offsetof(struct btree_iter, next)); dst->nodes_locked = dst->nodes_intent_locked = 0; + dst->uptodate = BTREE_ITER_NEED_RELOCK; } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index eb196a3a..318b0424 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -1,23 +1,32 @@ #ifndef _BCACHEFS_BTREE_ITER_H #define _BCACHEFS_BTREE_ITER_H -#include "btree_types.h" +#include +#include "btree_types.h" +#include "bset.h" -#define BTREE_ITER_UPTODATE (1 << 0) -#define BTREE_ITER_WITH_HOLES (1 << 1) -#define BTREE_ITER_INTENT (1 << 2) -#define BTREE_ITER_PREFETCH (1 << 3) +#define BTREE_ITER_SLOTS (1 << 0) +#define BTREE_ITER_INTENT (1 << 1) +#define BTREE_ITER_PREFETCH (1 << 2) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 4) +#define BTREE_ITER_IS_EXTENTS (1 << 3) /* * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: */ -#define BTREE_ITER_AT_END_OF_LEAF (1 << 5) -#define BTREE_ITER_ERROR (1 << 6) +#define BTREE_ITER_AT_END_OF_LEAF (1 << 4) +#define BTREE_ITER_ERROR (1 << 
5) + +enum btree_iter_uptodate { + BTREE_ITER_UPTODATE = 0, + BTREE_ITER_NEED_PEEK = 1, + BTREE_ITER_NEED_RELOCK = 2, + BTREE_ITER_NEED_TRAVERSE = 3, + BTREE_ITER_END = 4, +}; /* * @pos - iterator's current position @@ -31,32 +40,23 @@ struct btree_iter { struct bpos pos; u8 flags; - enum btree_id btree_id:8; + unsigned uptodate:4; + enum btree_id btree_id:4; unsigned level:4, locks_want:4, nodes_locked:4, nodes_intent_locked:4; - u32 lock_seq[BTREE_MAX_DEPTH]; + struct btree_iter_level { + struct btree *b; + struct btree_node_iter iter; + } l[BTREE_MAX_DEPTH]; - /* - * NOTE: Never set iter->nodes to NULL except in btree_iter_lock_root(). - * - * This is because iter->nodes[iter->level] == NULL is how - * btree_iter_next_node() knows that it's finished with a depth first - * traversal. Just unlocking a node (with btree_node_unlock()) is fine, - * and if you really don't want that node used again (e.g. btree_split() - * freed it) decrementing lock_seq will cause bch2_btree_node_relock() to - * always fail (but since freeing a btree node takes a write lock on the - * node, which increments the node's lock seq, that's not actually - * necessary in that example). - */ - struct btree *nodes[BTREE_MAX_DEPTH]; - struct btree_node_iter node_iters[BTREE_MAX_DEPTH]; + u32 lock_seq[BTREE_MAX_DEPTH]; /* * Current unpacked key - so that bch2_btree_iter_next()/ - * bch2_btree_iter_next_with_holes() can correctly advance pos. + * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; @@ -71,6 +71,24 @@ struct btree_iter { struct btree_iter *next; }; +static inline void btree_iter_set_dirty(struct btree_iter *iter, + enum btree_iter_uptodate u) +{ + iter->uptodate = max_t(unsigned, iter->uptodate, u); +} + +static inline struct btree *btree_iter_node(struct btree_iter *iter, + unsigned level) +{ + return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; +} + +static inline struct btree *btree_node_parent(struct btree_iter *iter, + struct btree *b) +{ + return btree_iter_node(iter, b->level + 1); +} + static inline bool btree_iter_linked(const struct btree_iter *iter) { return iter->next != iter; @@ -102,7 +120,7 @@ __next_linked_btree_node(struct btree_iter *iter, struct btree *b, * sequence number is incremented by taking and releasing write * locks and is even when unlocked: */ - } while (linked->nodes[b->level] != b || + } while (linked->l[b->level].b != b || linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1); return linked; @@ -159,11 +177,13 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); + void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -void bch2_btree_iter_advance_pos(struct btree_iter *); -void bch2_btree_iter_rewind(struct btree_iter *, struct bpos); void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *, enum btree_id, struct bpos, @@ -175,8 +195,8 @@ static inline void bch2_btree_iter_init(struct btree_iter *iter, { __bch2_btree_iter_init(iter, c, btree_id, pos, flags & BTREE_ITER_INTENT ? 1 : 0, 0, - btree_id == BTREE_ID_EXTENTS - ? 
BTREE_ITER_IS_EXTENTS : 0); + (btree_id == BTREE_ID_EXTENTS + ? BTREE_ITER_IS_EXTENTS : 0)|flags); } void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *); @@ -225,16 +245,30 @@ static inline int btree_iter_cmp(const struct btree_iter *l, static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_WITH_HOLES - ? bch2_btree_iter_peek_with_holes(iter) + return flags & BTREE_ITER_SLOTS + ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } +static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, + unsigned flags) +{ + return flags & BTREE_ITER_SLOTS + ? bch2_btree_iter_next_slot(iter) + : bch2_btree_iter_next(iter); +} + #define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \ for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \ - (_start), (_flags)); \ - !IS_ERR_OR_NULL(((_k) = __bch2_btree_iter_peek(_iter, _flags)).k);\ - bch2_btree_iter_advance_pos(_iter)) + (_start), (_flags)), \ + (_k) = __bch2_btree_iter_peek(_iter, _flags); \ + !IS_ERR_OR_NULL((_k).k); \ + (_k) = __bch2_btree_iter_next(_iter, _flags)) + +#define for_each_btree_key_continue(_iter, _flags, _k) \ + for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ + !IS_ERR_OR_NULL((_k).k); \ + (_k) = __bch2_btree_iter_next(_iter, _flags)) static inline int btree_iter_err(struct bkey_s_c k) { @@ -247,16 +281,10 @@ static inline int btree_iter_err(struct bkey_s_c k) */ static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter) { - struct btree_iter *linked; - if (need_resched()) { - for_each_linked_btree_iter(iter, linked) - bch2_btree_iter_unlock(linked); bch2_btree_iter_unlock(iter); schedule(); } else if (race_fault()) { - for_each_linked_btree_iter(iter, linked) - bch2_btree_iter_unlock(linked); bch2_btree_iter_unlock(iter); } } diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index ca2992ba..0581f44a 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -91,11 +91,10 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { int lock_type = btree_node_locked_type(iter, level); - EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE); EBUG_ON(level >= BTREE_MAX_DEPTH); if (lock_type != BTREE_NODE_UNLOCKED) - six_unlock_type(&iter->nodes[level]->lock, lock_type); + six_unlock_type(&iter->l[level].b->lock, lock_type); mark_btree_node_unlocked(iter, level); } @@ -113,10 +112,21 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, __bch2_btree_node_lock(b, pos, level, iter, type); } -bool bch2_btree_node_relock(struct btree_iter *, unsigned); +bool __bch2_btree_node_relock(struct btree_iter *, unsigned); + +static inline bool bch2_btree_node_relock(struct btree_iter *iter, + unsigned level) +{ + return likely(btree_lock_want(iter, level) == + btree_node_locked_type(iter, level)) || + __bch2_btree_node_relock(iter, level); +} + bool bch2_btree_iter_relock(struct btree_iter *); void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void bch2_btree_node_lock_write(struct btree *, struct btree_iter *); #endif /* _BCACHEFS_BTREE_LOCKING_H */ + + diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index c7c29306..f357095d 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -120,8 +120,6 @@ int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *, int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, 
struct extent_insert_hook *, u64 *, int flags); -int bch2_btree_update(struct bch_fs *, enum btree_id, - struct bkey_i *, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, struct bversion, diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index e92a6a3a..c45527a2 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -393,6 +393,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev struct bch_fs *c = as->c; struct btree *b; + BUG_ON(level >= BTREE_MAX_DEPTH); BUG_ON(!as->reserve->nr); b = as->reserve->b[--as->reserve->nr]; @@ -589,17 +590,18 @@ static void bch2_btree_update_free(struct btree_update *as) mutex_lock(&c->btree_interior_update_lock); list_del(&as->list); - mutex_unlock(&c->btree_interior_update_lock); closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); percpu_ref_put(&c->writes); + + closure_wake_up(&c->btree_interior_update_wait); + mutex_unlock(&c->btree_interior_update_lock); } static void btree_update_nodes_reachable(struct closure *cl) { - struct btree_update *as = - container_of(cl, struct btree_update, cl); + struct btree_update *as = container_of(cl, struct btree_update, cl); struct bch_fs *c = as->c; bch2_journal_pin_drop(&c->journal, &as->journal); @@ -609,8 +611,7 @@ static void btree_update_nodes_reachable(struct closure *cl) while (as->nr_new_nodes) { struct btree *b = as->new_nodes[--as->nr_new_nodes]; - BUG_ON(b->will_make_reachable && - (struct btree_update *) b->will_make_reachable != as); + BUG_ON(b->will_make_reachable != (unsigned long) as); b->will_make_reachable = 0; mutex_unlock(&c->btree_interior_update_lock); @@ -634,10 +635,26 @@ static void btree_update_nodes_reachable(struct closure *cl) bch2_btree_update_free(as); } +static void btree_update_wait_on_journal(struct closure *cl) +{ + struct btree_update *as = container_of(cl, struct btree_update, cl); + struct bch_fs *c = as->c; + int ret; + + ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); + if (ret < 0) + goto err; + if (!ret) + continue_at(cl, btree_update_wait_on_journal, system_wq); + + bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); +err: + continue_at(cl, btree_update_nodes_reachable, system_wq); +} + static void btree_update_nodes_written(struct closure *cl) { - struct btree_update *as = - container_of(cl, struct btree_update, cl); + struct btree_update *as = container_of(cl, struct btree_update, cl); struct bch_fs *c = as->c; struct btree *b; @@ -648,6 +665,8 @@ static void btree_update_nodes_written(struct closure *cl) */ retry: mutex_lock(&c->btree_interior_update_lock); + as->nodes_written = true; + switch (as->mode) { case BTREE_INTERIOR_NO_UPDATE: BUG(); @@ -672,7 +691,7 @@ retry: * b->write_blocked prevented it from being written, so * write it now if it needs to be written: */ - bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); + bch2_btree_node_write_cond(c, b, true); six_unlock_read(&b->lock); break; @@ -732,13 +751,10 @@ retry: */ bch2_journal_pin_drop(&c->journal, &as->journal); - /* - * And, do a journal write to write the pointer to the new root, - * then wait for it to complete before freeing the nodes we - * replaced: - */ - bch2_journal_meta_async(&c->journal, cl); - break; + as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal); + + btree_update_wait_on_journal(cl); + return; } continue_at(cl, btree_update_nodes_reachable, system_wq); @@ 
-883,13 +899,15 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) unsigned long v; unsigned i; - if (!b->will_make_reachable) - return; - mutex_lock(&c->btree_interior_update_lock); v = xchg(&b->will_make_reachable, 0); - as = (struct btree_update *) (v & ~1UL); + + if (!as) { + mutex_unlock(&c->btree_interior_update_lock); + return; + } + for (i = 0; i < as->nr_new_nodes; i++) if (as->new_nodes[i] == b) goto found; @@ -1044,7 +1062,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, bch2_keylist_init(&as->parent_keys, as->inline_keys); mutex_lock(&c->btree_interior_update_lock); - list_add(&as->list, &c->btree_interior_update_list); + list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); return as; @@ -1341,7 +1359,7 @@ static void btree_split(struct btree_update *as, struct btree *b, struct btree_iter *iter, struct keylist *keys) { struct bch_fs *c = as->c; - struct btree *parent = iter->nodes[b->level + 1]; + struct btree *parent = btree_node_parent(iter, b); struct btree *n1, *n2 = NULL, *n3 = NULL; u64 start_time = local_clock(); @@ -1437,30 +1455,17 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_time_stats_update(&c->btree_split_time, start_time); } -static int +static void bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, struct btree_iter *iter, struct keylist *keys) { - struct bch_fs *c = as->c; struct btree_iter *linked; struct btree_node_iter node_iter; struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); - BUG_ON(!b->level); - BUG_ON(!as || as->b); - bch2_verify_keylist_sorted(keys); - - bch2_btree_node_lock_for_insert(c, b, iter); - - if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) { - bch2_btree_node_unlock_write(b, iter); - return -1; - } - /* Don't screw up @iter's position: */ - node_iter = iter->node_iters[b->level]; + node_iter = iter->l[b->level].iter; /* * btree_split(), btree_gc_coalesce() will insert keys before @@ -1481,19 +1486,10 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, btree_update_updated_node(as, b); for_each_linked_btree_node(iter, b, linked) - bch2_btree_node_iter_peek(&linked->node_iters[b->level], - b); - bch2_btree_node_iter_peek(&iter->node_iters[b->level], b); + bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); + bch2_btree_node_iter_peek(&iter->l[b->level].iter, b); bch2_btree_iter_verify(iter, b); - - if (bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(iter, b); - - bch2_btree_node_unlock_write(b, iter); - - btree_node_interior_verify(b); - return 0; } /** @@ -1511,17 +1507,54 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, void bch2_btree_insert_node(struct btree_update *as, struct btree *b, struct btree_iter *iter, struct keylist *keys) { + struct bch_fs *c = as->c; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); BUG_ON(!b->level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + + if (as->must_rewrite) + goto split; + + bch2_btree_node_lock_for_insert(c, b, iter); + + if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) { + bch2_btree_node_unlock_write(b, iter); + goto split; + } + + bch2_btree_insert_keys_interior(as, b, iter, keys); + + 
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + bch2_btree_node_unlock_write(b, iter); + + btree_node_interior_verify(b); - if ((as->flags & BTREE_INTERIOR_UPDATE_MUST_REWRITE) || - bch2_btree_insert_keys_interior(as, b, iter, keys)) - btree_split(as, b, iter, keys); + bch2_foreground_maybe_merge(c, iter, b->level); + return; +split: + btree_split(as, b, iter, keys); } int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, unsigned btree_reserve_flags) { - struct btree *b = iter->nodes[0]; + struct btree *b = iter->l[0].b; struct btree_update *as; struct closure cl; int ret = 0; @@ -1577,9 +1610,10 @@ out: return ret; } -int bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - enum btree_node_sibling sib) +int __bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + enum btree_node_sibling sib) { struct btree_update *as; struct bkey_format_state new_s; @@ -1592,12 +1626,12 @@ int bch2_foreground_maybe_merge(struct bch_fs *c, closure_init_stack(&cl); retry: - if (!bch2_btree_node_relock(iter, iter->level)) + if (!bch2_btree_node_relock(iter, level)) return 0; - b = iter->nodes[iter->level]; + b = iter->l[level].b; - parent = iter->nodes[b->level + 1]; + parent = btree_node_parent(iter, b); if (!parent) return 0; @@ -1734,7 +1768,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, struct btree *b, unsigned flags, struct closure *cl) { - struct btree *n, *parent = iter->nodes[b->level + 1]; + struct btree *n, *parent = btree_node_parent(iter, b); struct btree_update *as; as = bch2_btree_update_start(c, iter->btree_id, @@ -1834,7 +1868,6 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, struct bkey_i_extent *new_key) { struct btree *parent; - bool must_rewrite_parent = false; int ret; /* @@ -1859,14 +1892,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, */ if (b->will_make_reachable) - must_rewrite_parent = true; - - if (must_rewrite_parent) - as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE; + as->must_rewrite = true; btree_interior_update_add_node_reference(as, b); - parent = iter->nodes[b->level + 1]; + parent = btree_node_parent(iter, b); if (parent) { if (new_hash) { bkey_copy(&new_hash->key, &new_key->k_i); @@ -2066,3 +2096,21 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) six_unlock_write(&b->lock); six_unlock_intent(&b->lock); } + +ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) +{ + char *out = buf, *end = buf + PAGE_SIZE; + struct btree_update *as; + + mutex_lock(&c->btree_interior_update_lock); + list_for_each_entry(as, &c->btree_interior_update_list, list) + out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n", + as, + as->mode, + as->nodes_written, + atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, + bch2_journal_pin_seq(&c->journal, &as->journal)); + mutex_unlock(&c->btree_interior_update_lock); + + return out - buf; +} diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 23ee3980..0b58ccc9 100644 --- 
a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -2,6 +2,7 @@ #define _BCACHEFS_BTREE_UPDATE_INTERIOR_H #include "btree_cache.h" +#include "btree_locking.h" #include "btree_update.h" struct btree_reserve { @@ -61,9 +62,12 @@ struct btree_update { BTREE_INTERIOR_UPDATING_ROOT, BTREE_INTERIOR_UPDATING_AS, } mode; + + unsigned must_rewrite:1; + unsigned nodes_written:1; + enum btree_id btree_id; - unsigned flags; struct btree_reserve *reserve; /* @@ -120,8 +124,6 @@ struct btree_update { u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; }; -#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0) - #define for_each_pending_btree_node_free(c, as, p) \ list_for_each_entry(as, &c->btree_interior_update_list, list) \ for (p = as->pending; p < as->pending + as->nr_pending; p++) @@ -146,8 +148,34 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *, void bch2_btree_insert_node(struct btree_update *, struct btree *, struct btree_iter *, struct keylist *); int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -int bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, - enum btree_node_sibling); + +int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, + unsigned, enum btree_node_sibling); + +static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + enum btree_node_sibling sib) +{ + struct btree *b; + + if (!bch2_btree_node_relock(iter, level)) + return 0; + + b = iter->l[level].b; + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + return 0; + + return __bch2_foreground_maybe_merge(c, iter, level, sib); +} + +static inline void bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level) +{ + bch2_foreground_maybe_merge_sibling(c, iter, level, btree_prev_sib); + bch2_foreground_maybe_merge_sibling(c, iter, level, btree_next_sib); +} void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); @@ -220,16 +248,17 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bset *i = btree_bset_last(b); unsigned offset = max_t(unsigned, b->written << 9, bset_byte_offset(b, vstruct_end(i))); - ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t) + ssize_t remaining_space = (ssize_t) btree_bytes(c) - (ssize_t) (offset + sizeof(struct btree_node_entry) + b->whiteout_u64s * sizeof(u64) + b->uncompacted_whiteout_u64s * sizeof(u64)); EBUG_ON(offset > btree_bytes(c)); - if ((unlikely(bset_written(b, i)) && n > 0) || + if ((unlikely(bset_written(b, i)) && + remaining_space > block_bytes(c)) || (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && - n > btree_write_set_buffer(b))) + remaining_space > btree_write_set_buffer(b))) return (void *) b->data + offset; return NULL; @@ -312,4 +341,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans, return u64s <= trans->journal_res.u64s; } +ssize_t bch2_btree_updates_print(struct bch_fs *, char *); + #endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index dbf70569..4b252b6d 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -39,9 +39,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, t = bch2_bkey_to_bset(b, k); if (bset_unwritten(b, bset(b, t)) && - bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { - BUG_ON(bkey_whiteout(k) != 
bkey_whiteout(&insert->k)); - + bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && + !bkey_whiteout(&insert->k)) { k->type = insert->k.type; memcpy_u64s(bkeyp_val(f, k), &insert->v, bkey_val_u64s(&insert->k)); @@ -131,28 +130,21 @@ void bch2_btree_journal_key(struct btree_insert *trans, { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - struct btree *b = iter->nodes[0]; + struct btree *b = iter->l[0].b; struct btree_write *w = btree_current_write(b); EBUG_ON(iter->level || b->level); EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - if (!journal_pin_active(&w->journal)) - bch2_journal_pin_add(j, &trans->journal_res, - &w->journal, - btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); - - if (trans->journal_res.ref) { + if (likely(trans->journal_res.ref)) { u64 seq = trans->journal_res.seq; bool needs_whiteout = insert->k.needs_whiteout; /* ick */ insert->k.needs_whiteout = false; bch2_journal_add_keys(j, &trans->journal_res, - b->btree_id, insert); + iter->btree_id, insert); insert->k.needs_whiteout = needs_whiteout; bch2_journal_set_has_inode(j, &trans->journal_res, @@ -163,7 +155,14 @@ void bch2_btree_journal_key(struct btree_insert *trans, btree_bset_last(b)->journal_seq = cpu_to_le64(seq); } - if (!btree_node_dirty(b)) + if (unlikely(!journal_pin_active(&w->journal))) + bch2_journal_pin_add(j, &trans->journal_res, + &w->journal, + btree_node_write_idx(b) == 0 + ? btree_node_flush0 + : btree_node_flush1); + + if (unlikely(!btree_node_dirty(b))) set_btree_node_dirty(b); } @@ -172,13 +171,13 @@ bch2_insert_fixup_key(struct btree_insert *trans, struct btree_insert_entry *insert) { struct btree_iter *iter = insert->iter; + struct btree_iter_level *l = &iter->l[0]; - BUG_ON(iter->level); - BUG_ON(insert->k->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, iter->nodes[0])); + EBUG_ON(iter->level); + EBUG_ON(insert->k->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, l->b)); - if (bch2_btree_bset_insert_key(iter, iter->nodes[0], - &iter->node_iters[0], + if (bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert->k)) bch2_btree_journal_key(trans, iter, insert->k); @@ -186,38 +185,22 @@ bch2_insert_fixup_key(struct btree_insert *trans, return BTREE_INSERT_OK; } -static int inline foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - enum btree_node_sibling sib) -{ - struct btree *b; - - if (!btree_node_locked(iter, iter->level)) - return 0; - - b = iter->nodes[iter->level]; - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - return 0; - - return bch2_foreground_maybe_merge(c, iter, sib); -} - /** * btree_insert_key - insert a key one key into a leaf node */ static enum btree_insert_ret -btree_insert_key(struct btree_insert *trans, - struct btree_insert_entry *insert) +btree_insert_key_leaf(struct btree_insert *trans, + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; - struct btree *b = iter->nodes[0]; + struct btree *b = iter->l[0].b; enum btree_insert_ret ret; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - iter->flags &= ~BTREE_ITER_UPTODATE; + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ret = !btree_node_is_extents(b) ? 
bch2_insert_fixup_key(trans, insert) @@ -247,7 +230,7 @@ static bool same_leaf_as_prev(struct btree_insert *trans, * point to the same leaf node they'll always be adjacent now: */ return i != trans->entries && - i[0].iter->nodes[0] == i[-1].iter->nodes[0]; + i[0].iter->l[0].b == i[-1].iter->l[0].b; } #define trans_for_each_entry(trans, i) \ @@ -276,7 +259,8 @@ static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans) trans_for_each_entry(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(c, i->iter->nodes[0], i->iter); + bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, + i->iter); } static void multi_unlock_write(struct btree_insert *trans) @@ -285,15 +269,18 @@ static void multi_unlock_write(struct btree_insert *trans) trans_for_each_entry(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write(i->iter->nodes[0], i->iter); + bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); } -static int btree_trans_entry_cmp(const void *_l, const void *_r) +static inline void btree_trans_sort(struct btree_insert *trans) { - const struct btree_insert_entry *l = _l; - const struct btree_insert_entry *r = _r; + int i, end = trans->nr; - return btree_iter_cmp(l->iter, r->iter); + while (--end > 0) + for (i = 0; i < end; i++) + if (btree_iter_cmp(trans->entries[i].iter, + trans->entries[i + 1].iter) > 0) + swap(trans->entries[i], trans->entries[i + 1]); } /* Normal update interface: */ @@ -326,16 +313,22 @@ int __bch2_btree_insert_at(struct btree_insert *trans) bkey_i_to_s_c(i->k))); } - sort(trans->entries, trans->nr, sizeof(trans->entries[0]), - btree_trans_entry_cmp, NULL); + btree_trans_sort(trans); if (unlikely(!percpu_ref_tryget(&c->writes))) return -EROFS; retry_locks: ret = -EINTR; - trans_for_each_entry(trans, i) + trans_for_each_entry(trans, i) { if (!bch2_btree_iter_set_locks_want(i->iter, 1)) goto err; + + if (i->iter->uptodate == BTREE_ITER_NEED_TRAVERSE) { + ret = bch2_btree_iter_traverse(i->iter); + if (ret) + goto err; + } + } retry: trans->did_work = false; u64s = 0; @@ -375,7 +368,7 @@ retry: if (!i->done) { u64s += i->k->k.u64s + i->extra_res; if (!bch2_btree_node_insert_fits(c, - i->iter->nodes[0], u64s)) { + i->iter->l[0].b, u64s)) { split = i->iter; goto unlock; } @@ -390,7 +383,7 @@ retry: if (i->done) continue; - switch (btree_insert_key(trans, i)) { + switch (btree_insert_key_leaf(trans, i)) { case BTREE_INSERT_OK: i->done = true; break; @@ -427,19 +420,19 @@ unlock: if (ret) goto err; - /* - * hack: iterators are inconsistent when they hit end of leaf, until - * traversed again - */ trans_for_each_entry(trans, i) if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF) goto out; - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) { - foreground_maybe_merge(c, i->iter, btree_prev_sib); - foreground_maybe_merge(c, i->iter, btree_next_sib); - } + trans_for_each_entry(trans, i) { + /* + * iterators are inconsistent when they hit end of leaf, until + * traversed again + */ + if (i->iter->uptodate < BTREE_ITER_NEED_TRAVERSE && + !same_leaf_as_prev(trans, i)) + bch2_foreground_maybe_merge(c, i->iter, 0); + } out: /* make sure we didn't lose an error: */ if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) @@ -513,12 +506,7 @@ int bch2_btree_insert_list_at(struct btree_iter *iter, bch2_verify_keylist_sorted(keys); while (!bch2_keylist_empty(keys)) { - /* need to traverse between each insert */ - int ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - ret = bch2_btree_insert_at(iter->c, disk_res, hook, + 
int ret = bch2_btree_insert_at(iter->c, disk_res, hook, journal_seq, flags, BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys))); if (ret) @@ -544,51 +532,14 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, u64 *journal_seq, int flags) { struct btree_iter iter; - int ret, ret2; + int ret; bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - - ret = bch2_btree_iter_traverse(&iter); - if (unlikely(ret)) - goto out; - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags, BTREE_INSERT_ENTRY(&iter, k)); -out: ret2 = bch2_btree_iter_unlock(&iter); - - return ret ?: ret2; -} - -/** - * bch_btree_update - like bch2_btree_insert(), but asserts that we're - * overwriting an existing key - */ -int bch2_btree_update(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, u64 *journal_seq) -{ - struct btree_iter iter; - struct bkey_s_c u; - int ret; - - EBUG_ON(id == BTREE_ID_EXTENTS); - - bch2_btree_iter_init(&iter, c, id, k->k.p, - BTREE_ITER_INTENT); - - u = bch2_btree_iter_peek_with_holes(&iter); - ret = btree_iter_err(u); - if (ret) - return ret; - - if (bkey_deleted(u.k)) { - bch2_btree_iter_unlock(&iter); - return -ENOENT; - } - - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, 0, - BTREE_INSERT_ENTRY(&iter, k)); bch2_btree_iter_unlock(&iter); + return ret; } diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 0f090ca5..00e0de16 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -59,7 +59,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) return; bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); - bio->bi_bdev = pick.ca->disk_sb.bdev; + bio_set_dev(bio, pick.ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); @@ -212,10 +212,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); + for_each_btree_key(&iter, i->c, i->id, i->from, + BTREE_ITER_PREFETCH, k) { + i->from = iter.pos; - while ((k = bch2_btree_iter_peek(&iter)).k && - !(err = btree_iter_err(k))) { bch2_bkey_val_to_text(i->c, bkey_type(0, i->id), i->buf, sizeof(i->buf), k); i->bytes = strlen(i->buf); @@ -223,9 +223,6 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->buf[i->bytes] = '\n'; i->bytes++; - bch2_btree_iter_advance_pos(&iter); - i->from = iter.pos; - err = flush_buf(i); if (err) break; @@ -233,7 +230,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } - bch2_btree_iter_unlock(&iter); + err = bch2_btree_iter_unlock(&iter) ?: err; return err < 0 ? 
err : i->ret; } @@ -318,26 +315,27 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, while ((k = bch2_btree_iter_peek(&iter)).k && !(err = btree_iter_err(k))) { - struct btree *b = iter.nodes[0]; - struct btree_node_iter *node_iter = &iter.node_iters[0]; - struct bkey_packed *_k = bch2_btree_node_iter_peek(node_iter, b); + struct btree_iter_level *l = &iter.l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); - if (iter.nodes[0] != prev_node) { - i->bytes = bch2_print_btree_node(i->c, b, i->buf, + if (l->b != prev_node) { + i->bytes = bch2_print_btree_node(i->c, l->b, i->buf, sizeof(i->buf)); err = flush_buf(i); if (err) break; } - prev_node = iter.nodes[0]; + prev_node = l->b; - i->bytes = bch2_bkey_print_bfloat(b, _k, i->buf, sizeof(i->buf)); + i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf, + sizeof(i->buf)); err = flush_buf(i); if (err) break; - bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_next(&iter); i->from = iter.pos; err = flush_buf(i); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index a900d397..6bdece3a 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -213,12 +213,13 @@ int bch2_dirent_rename(struct bch_fs *c, int ret = -ENOMEM; bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos, - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); bch2_btree_iter_init(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos, - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); bch2_btree_iter_link(&src_iter, &dst_iter); - bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos, 0); + bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos, + BTREE_ITER_SLOTS); bch2_btree_iter_link(&src_iter, &whiteout_iter); if (mode == BCH_RENAME_EXCHANGE) { diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 7e7c38ef..c2469167 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1022,10 +1022,10 @@ struct extent_insert_state { }; static void bch2_add_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) + struct bkey_s_c k, u64 offset, s64 sectors) { struct bch_fs *c = s->trans->c; - struct btree *b = s->insert->iter->nodes[0]; + struct btree *b = s->insert->iter->l[0].b; EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); @@ -1079,7 +1079,7 @@ static bool bch2_extent_merge_inline(struct bch_fs *, static enum btree_insert_ret extent_insert_should_stop(struct extent_insert_state *s) { - struct btree *b = s->insert->iter->nodes[0]; + struct btree *b = s->insert->iter->l[0].b; /* * Check if we have sufficient space in both the btree node and the @@ -1101,19 +1101,18 @@ extent_insert_should_stop(struct extent_insert_state *s) static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; - struct bset_tree *t = bset_tree_last(b); + struct btree_iter_level *l = &iter->l[0]; + struct bset_tree *t = bset_tree_last(l->b); struct bkey_packed *where = - bch2_btree_node_iter_bset_pos(node_iter, b, t); - struct bkey_packed *prev = bch2_bkey_prev(b, t, where); + bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); + struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where); struct bkey_packed *next_live_key = where; unsigned clobber_u64s; if (prev) where = bkey_next(prev); - while (next_live_key != btree_bkey_last(b, t) && + while (next_live_key != btree_bkey_last(l->b, t) && bkey_deleted(next_live_key)) 
next_live_key = bkey_next(next_live_key); @@ -1127,18 +1126,19 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) goto drop_deleted_keys; - if (next_live_key != btree_bkey_last(b, t) && + if (next_live_key != btree_bkey_last(l->b, t) && bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), next_live_key, false)) goto drop_deleted_keys; - bch2_bset_insert(b, node_iter, where, insert, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, t, where, + bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, clobber_u64s, where->u64s); return; drop_deleted_keys: - bch2_bset_delete(b, where, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0); + bch2_bset_delete(l->b, where, clobber_u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, + where, clobber_u64s, 0); } static void extent_insert_committed(struct extent_insert_state *s) @@ -1181,8 +1181,7 @@ static void extent_insert_committed(struct extent_insert_state *s) } if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->nodes[iter->level], - bkey_i_to_s_c(&split.k)); + bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k)); bch2_btree_journal_key(s->trans, iter, &split.k); @@ -1224,7 +1223,7 @@ __extent_insert_advance_pos(struct extent_insert_state *s, static enum btree_insert_ret extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) { - struct btree *b = s->insert->iter->nodes[0]; + struct btree *b = s->insert->iter->l[0].b; struct bpos next_pos = bpos_min(s->insert->k->k.p, k.k ? k.k->p : b->key.k.p); enum btree_insert_ret ret; @@ -1287,8 +1286,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; enum btree_insert_ret ret; switch (overlap) { @@ -1397,8 +1397,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; struct bkey_packed *_k; struct bkey unpacked; struct bkey_i *insert = s->insert->k; @@ -1544,8 +1545,9 @@ bch2_insert_fixup_extent(struct btree_insert *trans, { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; struct bkey_packed *_k; struct bkey unpacked; enum btree_insert_ret ret = BTREE_INSERT_OK; @@ -2176,8 +2178,7 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter, struct bkey_packed *k, struct bkey uk, bool check, bool could_pack) { - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct btree_iter_level *l = &iter->l[0]; BUG_ON(!bkey_deleted(k)); @@ -2185,10 +2186,10 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter, return !bkey_packed(k) || could_pack; } else { uk.p = 
new_pos; - extent_save(b, node_iter, k, &uk); - bch2_bset_fix_invalidated_key(b, t, k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - k, k->u64s, k->u64s); + extent_save(l->b, &l->iter, k, &uk); + bch2_bset_fix_invalidated_key(l->b, t, k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, + k, k->u64s, k->u64s); return true; } } @@ -2196,8 +2197,9 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter, static bool extent_merge_do_overlapping(struct btree_iter *iter, struct bkey *m, bool back_merge) { - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct btree_iter_level *l = &iter->l[0]; + struct btree *b = l->b; + struct btree_node_iter *node_iter = &l->iter; struct bset_tree *t; struct bkey_packed *k; struct bkey uk; @@ -2288,8 +2290,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, struct bkey_packed *r, bool back_merge) { - struct btree *b = iter->nodes[0]; - struct btree_node_iter *node_iter = &iter->node_iters[0]; + struct btree *b = iter->l[0].b; + struct btree_node_iter *node_iter = &iter->l[0].iter; const struct bkey_format *f = &b->format; struct bset_tree *t = bset_tree_last(b); struct bkey_packed *m; @@ -2332,8 +2334,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, if (back_merge) bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); - bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter, - t, m, m->u64s, m->u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, + t, m, m->u64s, m->u64s); if (!back_merge) bkey_copy(packed_to_bkey(l), &li.k); @@ -2350,8 +2352,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, extent_i_save(b, m, &li.k); bch2_bset_fix_invalidated_key(b, t, m); - bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter, - t, m, m->u64s, m->u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, + t, m, m->u64s, m->u64s); return true; default: BUG(); @@ -2368,7 +2370,7 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) end.offset += size; for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, - BTREE_ITER_WITH_HOLES, k) { + BTREE_ITER_SLOTS, k) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index 08739d26..789ae663 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -56,6 +56,11 @@ do { \ #define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) #define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) +#define fifo_entry_idx_abs(fifo, p) \ + ((((p) >= &fifo_peek_front(fifo) \ + ? 
(fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ + (((p) - (fifo)->data))) + #define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) #define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] @@ -103,13 +108,15 @@ do { \ #define fifo_peek(fifo) fifo_peek_front(fifo) #define fifo_for_each_entry(_entry, _fifo, _iter) \ - for (_iter = (_fifo)->front; \ + for (((void) (&(_iter) == &(_fifo)->front)), \ + _iter = (_fifo)->front; \ ((_iter != (_fifo)->back) && \ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ _iter++) #define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ - for (_iter = (_fifo)->front; \ + for (((void) (&(_iter) == &(_fifo)->front)), \ + _iter = (_fifo)->front; \ ((_iter != (_fifo)->back) && \ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ _iter++) diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 7693520d..1bffddf6 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -400,17 +400,13 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BTREE_ITER_INTENT); bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES, POS(extent_iter.pos.inode, 0), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); hook.op = op; hook.hook.fn = bchfs_extent_update_hook; hook.need_inode_update = false; do { - ret = bch2_btree_iter_traverse(&extent_iter); - if (ret) - goto err; - /* XXX: inode->i_size locking */ k = bch2_keylist_front(keys); if (min(k->k.p.offset << 9, op->new_i_size) > @@ -423,7 +419,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) if (!btree_iter_linked(&inode_iter)) bch2_btree_iter_link(&extent_iter, &inode_iter); - inode = bch2_btree_iter_peek_with_holes(&inode_iter); + inode = bch2_btree_iter_peek_slot(&inode_iter); if ((ret = btree_iter_err(inode))) goto err; @@ -994,7 +990,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector)); - k = bch2_btree_iter_peek_with_holes(iter); + k = bch2_btree_iter_peek_slot(iter); BUG_ON(!k.k); if (IS_ERR(k.k)) { @@ -1067,7 +1063,8 @@ int bch2_readpages(struct file *file, struct address_space *mapping, .mapping = mapping, .nr_pages = nr_pages }; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); INIT_LIST_HEAD(&readpages_iter.pages); list_add(&readpages_iter.pages, pages); @@ -1107,7 +1104,8 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); bio_add_page_contig(&rbio->bio, page); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); bchfs_read(c, &iter, rbio, inum, NULL); } @@ -2236,9 +2234,10 @@ static long bch2_fcollapse(struct bch_inode_info *inode, bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); /* position will be set from dst iter's position: */ - bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, 0); + bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); bch2_btree_iter_link(&src, &dst); /* @@ -2276,11 +2275,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, bch2_btree_iter_set_pos(&src, POS(dst.pos.inode, dst.pos.offset + (len >> 9))); - ret = bch2_btree_iter_traverse(&dst); - if (ret) - goto btree_iter_err; - - k = 
bch2_btree_iter_peek_with_holes(&src); + k = bch2_btree_iter_peek_slot(&src); if ((ret = btree_iter_err(k))) goto btree_iter_err; @@ -2356,7 +2351,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, int ret; bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); inode_lock(&inode->v); inode_dio_wait(&inode->v); @@ -2403,20 +2398,20 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, struct bkey_i_reservation reservation; struct bkey_s_c k; - k = bch2_btree_iter_peek_with_holes(&iter); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = btree_iter_err(k))) goto btree_iter_err; /* already reserved */ if (k.k->type == BCH_RESERVATION && bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_next_slot(&iter); continue; } if (bkey_extent_is_data(k.k)) { if (!(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_next_slot(&iter); continue; } } @@ -2542,9 +2537,8 @@ static loff_t bch2_next_pagecache_data(struct inode *vinode, for (index = start_offset >> PAGE_SHIFT; index < end_offset >> PAGE_SHIFT; index++) { - if (find_get_pages(mapping, index, 1, &page)) { + if (find_get_pages(mapping, &index, 1, &page)) { lock_page(page); - index = page->index; if (page_is_data(page)) end_offset = @@ -2646,7 +2640,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_WITH_HOLES, k) { + BTREE_ITER_SLOTS, k) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_next_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index aba845b2..98f282b2 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -85,10 +85,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c, lockdep_assert_held(&inode->ei_update_lock); bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { - struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); if ((ret = btree_iter_err(k))) goto out; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index ce14aa78..2991a0dd 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -231,7 +231,7 @@ static int hash_check_key(const struct bch_hash_desc desc, return ret; return 1; } - bch2_btree_iter_advance_pos(&h->iter); + bch2_btree_iter_next(&h->iter); } fsck_err: return ret; @@ -1081,7 +1081,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (nlinks_pos == iter.pos.inode) genradix_iter_advance(&nlinks_iter, links); - bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_next(&iter); bch2_btree_iter_cond_resched(&iter); } fsck_err: diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 71a24cc6..797aa2a9 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -303,10 +303,10 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, searched_from_start = true; again: bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(*hint, 0), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); u32 bi_generation = 0; ret = btree_iter_err(k); @@ -322,7 +322,7 @@ again: if (iter.pos.inode == max) goto out; - bch2_btree_iter_advance_pos(&iter); + 
bch2_btree_iter_next_slot(&iter); break; case BCH_INODE_GENERATION: { @@ -415,9 +415,9 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) return ret; bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { - struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); u32 bi_generation = 0; ret = btree_iter_err(k); @@ -474,7 +474,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), - BTREE_ITER_WITH_HOLES, k) { + BTREE_ITER_SLOTS, k) { switch (k.k->type) { case BCH_INODE_FS: ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); @@ -491,38 +491,6 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, return bch2_btree_iter_unlock(&iter) ?: ret; } -int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *c, uuid_le *uuid, - struct bkey_i_inode_blockdev *ret) -{ - struct btree_iter iter; - struct bkey_s_c k; - - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), 0, k) { - if (k.k->p.inode >= BLOCKDEV_INODE_MAX) - break; - - if (k.k->type == BCH_INODE_BLOCKDEV) { - struct bkey_s_c_inode_blockdev inode = - bkey_s_c_to_inode_blockdev(k); - - pr_debug("found inode %llu: %pU (u64s %u)", - inode.k->p.inode, inode.v->i_uuid.b, - inode.k->u64s); - - if (CACHED_DEV(inode.v) && - !memcmp(uuid, &inode.v->i_uuid, 16)) { - bkey_reassemble(&ret->k_i, k); - bch2_btree_iter_unlock(&iter); - return 0; - } - } - - bch2_btree_iter_cond_resched(&iter); - } - bch2_btree_iter_unlock(&iter); - return -ENOENT; -} - #ifdef CONFIG_BCACHEFS_DEBUG void bch2_inode_pack_test(void) { diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 8ebb6fb6..5c7aeadc 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -40,8 +40,6 @@ int bch2_inode_rm(struct bch_fs *, u64); int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); -int bch2_cached_dev_inode_find_by_uuid(struct bch_fs *, uuid_le *, - struct bkey_i_inode_blockdev *); static inline struct timespec bch2_time_to_timespec(struct bch_fs *c, u64 time) { diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 6f6d42fc..7cddbccd 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -179,7 +179,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, bio_sectors(&n->bio)); n->have_io_ref = true; - n->bio.bi_bdev = ca->disk_sb.bdev; + bio_set_dev(&n->bio, ca->disk_sb.bdev); submit_bio(&n->bio); } else { n->have_io_ref = false; @@ -1410,7 +1410,7 @@ noclone: rbio->promote = promote ? 
promote_alloc(rbio) : NULL; INIT_WORK(&rbio->work, NULL); - rbio->bio.bi_bdev = pick->ca->disk_sb.bdev; + bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev); rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick->ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; @@ -1452,9 +1452,9 @@ static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), - BTREE_ITER_WITH_HOLES); + BTREE_ITER_SLOTS); retry: - k = bch2_btree_iter_peek_with_holes(&iter); + k = bch2_btree_iter_peek_slot(&iter); if (btree_iter_err(k)) { bch2_btree_iter_unlock(&iter); goto err; @@ -1525,7 +1525,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, retry: for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), - BTREE_ITER_WITH_HOLES, k) { + BTREE_ITER_SLOTS, k) { BKEY_PADDED(k) tmp; struct extent_pick_ptr pick; struct bvec_iter fragment; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 1952643f..1870534d 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -31,6 +31,12 @@ static void journal_pin_add_entry(struct journal *, struct journal_entry_pin *, journal_pin_flush_fn); +static inline void journal_wake(struct journal *j) +{ + wake_up(&j->wait); + closure_wake_up(&j->async_wait); +} + static inline struct journal_buf *journal_cur_buf(struct journal *j) { return j->buf + j->reservations.idx; @@ -43,15 +49,27 @@ static inline struct journal_buf *journal_prev_buf(struct journal *j) /* Sequence number of oldest dirty journal entry */ -static inline u64 last_seq(struct journal *j) +static inline u64 journal_last_seq(struct journal *j) { - return atomic64_read(&j->seq) - fifo_used(&j->pin) + 1; + return j->pin.front; } static inline u64 journal_pin_seq(struct journal *j, struct journal_entry_pin_list *pin_list) { - return last_seq(j) + fifo_entry_idx(&j->pin, pin_list); + return fifo_entry_idx_abs(&j->pin, pin_list); +} + +u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) +{ + u64 ret = 0; + + spin_lock(&j->lock); + if (journal_pin_active(pin)) + ret = journal_pin_seq(j, pin->pin_list); + spin_unlock(&j->lock); + + return ret; } static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf, @@ -678,7 +696,7 @@ reread: sectors_read = min_t(unsigned, end - offset, buf->size >> 9); bio_reset(bio); - bio->bi_bdev = ca->disk_sb.bdev; + bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = offset; bio->bi_iter.bi_size = sectors_read << 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -968,7 +986,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) struct journal_replay *i; struct journal_entry_pin_list *p; struct bch_dev *ca; - u64 cur_seq, end_seq; + u64 cur_seq, end_seq, seq; unsigned iter, keys = 0, entries = 0; size_t nr; int ret = 0; @@ -1035,11 +1053,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) j->pin.front = le64_to_cpu(i->j.last_seq); j->pin.back = le64_to_cpu(i->j.seq) + 1; - BUG_ON(last_seq(j) != le64_to_cpu(i->j.last_seq)); - BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) != - &fifo_peek_back(&j->pin)); - - fifo_for_each_entry_ptr(p, &j->pin, iter) { + fifo_for_each_entry_ptr(p, &j->pin, seq) { INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->flushed); atomic_set(&p->count, 0); @@ -1062,7 +1076,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) mutex_unlock(&j->blacklist_lock); - cur_seq = last_seq(j); + cur_seq = journal_last_seq(j); 
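The journal pin FIFO is now indexed by absolute sequence number: journal_last_seq() is simply j->pin.front, journal_pin_seq() reduces to fifo_entry_idx_abs(), and the iteration variable of fifo_for_each_entry_ptr() is the sequence itself. A minimal sketch of a caller using that directly (hypothetical helper name, not part of the patch; it mirrors the bch2_journal_flush_device() hunk later in this diff):

/*
 * Sketch only: the fifo iteration variable is the absolute journal
 * sequence, so no journal_pin_seq() lookup is needed inside the loop.
 */
static u64 last_seq_pinning_dev(struct journal *j, unsigned dev_idx)
{
	struct journal_entry_pin_list *p;
	u64 seq, ret = 0;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(p, &j->pin, seq)
		if (bch2_dev_list_has_dev(p->devs, dev_idx))
			ret = seq;
	spin_unlock(&j->lock);

	return ret;
}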
end_seq = le64_to_cpu(list_last_entry(list, struct journal_replay, list)->j.seq); @@ -1087,7 +1101,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, "journal entries %llu-%llu missing! (replaying %llu-%llu)", cur_seq, le64_to_cpu(i->j.seq) - 1, - last_seq(j), end_seq); + journal_last_seq(j), end_seq); cur_seq = le64_to_cpu(i->j.seq) + 1; @@ -1155,10 +1169,10 @@ static void journal_pin_new_entry(struct journal *j, int count) /* * The fifo_push() needs to happen at the same time as j->seq is - * incremented for last_seq() to be calculated correctly + * incremented for journal_last_seq() to be calculated correctly */ - p = fifo_push_ref(&j->pin); atomic64_inc(&j->seq); + p = fifo_push_ref(&j->pin); EBUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) != &fifo_peek_back(&j->pin)); @@ -1237,7 +1251,7 @@ static enum { journal_reclaim_fast(j); /* XXX: why set this here, and not in journal_write()? */ - buf->data->last_seq = cpu_to_le64(last_seq(j)); + buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); journal_pin_new_entry(j, 1); @@ -1272,7 +1286,7 @@ void bch2_journal_halt(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - wake_up(&j->wait); + journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); closure_wake_up(&journal_prev_buf(j)->wait); } @@ -1302,7 +1316,7 @@ static unsigned journal_dev_buckets_available(struct journal *j, * Don't use the last bucket unless writing the new last_seq * will make another bucket available: */ - if (ja->bucket_seq[ja->last_idx] >= last_seq(j)) + if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) available = max((int) available - 1, 0); return available; @@ -1441,7 +1455,7 @@ static int journal_entry_open(struct journal *j) mod_delayed_work(system_freezable_wq, &j->write_work, msecs_to_jiffies(j->write_delay_ms)); - wake_up(&j->wait); + journal_wake(j); return 1; } @@ -1537,7 +1551,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) } if (atomic_dec_and_test(&j->replay_pin_list->count)) - wake_up(&j->wait); + journal_wake(j); } j->replay_pin_list = NULL; @@ -1704,7 +1718,7 @@ static void journal_reclaim_fast(struct journal *j) } if (popped) - wake_up(&j->wait); + journal_wake(j); } /* @@ -1718,6 +1732,7 @@ static inline void __journal_pin_add(struct journal *j, journal_pin_flush_fn flush_fn) { BUG_ON(journal_pin_active(pin)); + BUG_ON(!atomic_read(&pin_list->count)); atomic_inc(&pin_list->count); pin->pin_list = pin_list; @@ -1727,7 +1742,12 @@ static inline void __journal_pin_add(struct journal *j, list_add(&pin->list, &pin_list->list); else INIT_LIST_HEAD(&pin->list); - wake_up(&j->wait); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); } static void journal_pin_add_entry(struct journal *j, @@ -1754,38 +1774,32 @@ void bch2_journal_pin_add(struct journal *j, spin_unlock(&j->lock); } -static inline bool __journal_pin_drop(struct journal *j, +static inline void __journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { struct journal_entry_pin_list *pin_list = pin->pin_list; - pin->pin_list = NULL; + if (!journal_pin_active(pin)) + return; - /* journal_reclaim_work() might have already taken us off the list */ - if (!list_empty_careful(&pin->list)) - list_del_init(&pin->list); + pin->pin_list = NULL; + list_del_init(&pin->list); - return atomic_dec_and_test(&pin_list->count); + /* + * Unpinning a journal entry make make 
journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + */ + if (atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin)) + journal_reclaim_fast(j); } void bch2_journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { - bool wakeup = false; - spin_lock(&j->lock); - if (journal_pin_active(pin)) - wakeup = __journal_pin_drop(j, pin); + __journal_pin_drop(j, pin); spin_unlock(&j->lock); - - /* - * Unpinning a journal entry make make journal_next_bucket() succeed, if - * writing a new last_seq will now make another bucket available: - * - * Nested irqsave is expensive, don't do the wakeup with lock held: - */ - if (wakeup) - wake_up(&j->wait); } void bch2_journal_pin_add_if_older(struct journal *j, @@ -1797,10 +1811,9 @@ void bch2_journal_pin_add_if_older(struct journal *j, if (journal_pin_active(src_pin) && (!journal_pin_active(pin) || - fifo_entry_idx(&j->pin, src_pin->pin_list) < - fifo_entry_idx(&j->pin, pin->pin_list))) { - if (journal_pin_active(pin)) - __journal_pin_drop(j, pin); + journal_pin_seq(j, src_pin->pin_list) < + journal_pin_seq(j, pin->pin_list))) { + __journal_pin_drop(j, pin); __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); } @@ -1812,13 +1825,13 @@ __journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret; - unsigned iter; + u64 iter; /* no need to iterate over empty fifo entries: */ journal_reclaim_fast(j); fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { - if (journal_pin_seq(j, pin_list) > seq_to_flush) + if (iter > seq_to_flush) break; ret = list_first_entry_or_null(&pin_list->list, @@ -1826,7 +1839,7 @@ __journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) if (ret) { /* must be list_del_init(), see bch2_journal_pin_drop() */ list_move(&ret->list, &pin_list->flushed); - *seq = journal_pin_seq(j, pin_list); + *seq = iter; return ret; } } @@ -1863,9 +1876,9 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) || + ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || - last_seq(j) > seq_to_flush || + journal_last_seq(j) > seq_to_flush || (fifo_used(&j->pin) == 1 && atomic_read(&fifo_peek_front(&j->pin).count) == 1); spin_unlock(&j->lock); @@ -1891,7 +1904,7 @@ again: } spin_lock(&j->lock); - flush = last_seq(j) != j->last_seq_ondisk || + flush = journal_last_seq(j) != j->last_seq_ondisk || (seq_to_flush == U64_MAX && c->btree_roots_dirty); spin_unlock(&j->lock); @@ -1984,7 +1997,7 @@ static void journal_reclaim_work(struct work_struct *work) ja->last_idx = (ja->last_idx + 1) % ja->nr; spin_unlock(&j->lock); - wake_up(&j->wait); + journal_wake(j); } /* @@ -2224,7 +2237,7 @@ out: &j->reservations.counter); closure_wake_up(&w->wait); - wake_up(&j->wait); + journal_wake(j); if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) mod_delayed_work(system_freezable_wq, &j->write_work, 0); @@ -2338,8 +2351,8 @@ static void journal_write(struct closure *cl) bio = ca->journal.bio; bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = ptr->offset; - bio->bi_bdev = ca->disk_sb.bdev; bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; @@ -2360,7 +2373,7 @@ static 
void journal_write(struct closure *cl) bio = ca->journal.bio; bio_reset(bio); - bio->bi_bdev = ca->disk_sb.bdev; + bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_opf = REQ_OP_FLUSH; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; @@ -2377,19 +2390,34 @@ err: continue_at(cl, journal_write_done, system_highpri_wq); } -static void journal_write_work(struct work_struct *work) +/* + * returns true if there's nothing to flush and no journal write still in flight + */ +static bool journal_flush_write(struct journal *j) { - struct journal *j = container_of(to_delayed_work(work), - struct journal, write_work); + bool ret; + spin_lock(&j->lock); + ret = !j->reservations.prev_buf_unwritten; + if (!journal_entry_is_open(j)) { spin_unlock(&j->lock); - return; + return ret; } set_bit(JOURNAL_NEED_WRITE, &j->flags); - if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED) + if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) + ret = false; + else spin_unlock(&j->lock); + return ret; +} + +static void journal_write_work(struct work_struct *work) +{ + struct journal *j = container_of(work, struct journal, write_work.work); + + journal_flush_write(j); } /* @@ -2514,6 +2542,43 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret < 0 ? ret : 0; } +u64 bch2_journal_last_unwritten_seq(struct journal *j) +{ + u64 seq; + + spin_lock(&j->lock); + seq = atomic64_read(&j->seq); + if (j->reservations.prev_buf_unwritten) + seq--; + spin_unlock(&j->lock); + + return seq; +} + +int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) +{ + int ret; + + spin_lock(&j->lock); + BUG_ON(seq > atomic64_read(&j->seq)); + + if (seq < atomic64_read(&j->seq) || + journal_entry_is_open(j)) { + spin_unlock(&j->lock); + return 1; + } + + ret = journal_entry_open(j); + if (!ret) + closure_wait(&j->async_wait, parent); + spin_unlock(&j->lock); + + if (!ret) + journal_reclaim_work(&j->reclaim_work.work); + + return ret; +} + void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) { spin_lock(&j->lock); @@ -2737,14 +2802,13 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; struct bch_devs_list devs; - u64 seq = 0; - unsigned iter; + u64 iter, seq = 0; int ret = 0; spin_lock(&j->lock); fifo_for_each_entry_ptr(p, &j->pin, iter) if (bch2_dev_list_has_dev(p->devs, dev_idx)) - seq = journal_pin_seq(j, p); + seq = iter; spin_unlock(&j->lock); ret = bch2_journal_flush_pins(j, seq); @@ -2758,7 +2822,7 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx) spin_lock(&j->lock); while (!ret && seq < atomic64_read(&j->seq)) { - seq = max(seq, last_seq(j)); + seq = max(seq, journal_last_seq(j)); devs = journal_seq_pin(j, seq)->devs; seq++; @@ -2804,15 +2868,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { - if (!test_bit(JOURNAL_STARTED, &j->flags)) - return; - - /* - * Empty out the journal by first flushing everything pinning existing - * journal entries, then force a brand new empty journal entry to be - * written: - */ - bch2_journal_flush_all_pins(j); + wait_event(j->wait, journal_flush_write(j)); cancel_delayed_work_sync(&j->write_work); cancel_delayed_work_sync(&j->reclaim_work); @@ -2914,7 +2970,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) spin_lock(&j->lock); ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "active journal 
entries:\t%zu\n" + "active journal entries:\t%llu\n" "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" @@ -2927,7 +2983,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) "replay done:\t\t%i\n", fifo_used(&j->pin), (u64) atomic64_read(&j->seq), - last_seq(j), + journal_last_seq(j), j->last_seq_ondisk, journal_state_count(*s, s->idx), s->cur_entry_offset, @@ -2965,14 +3021,13 @@ ssize_t bch2_journal_print_pins(struct journal *j, char *buf) struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; ssize_t ret = 0; - unsigned i; + u64 i; spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, i) { ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%llu: count %u\n", - journal_pin_seq(j, pin_list), - atomic_read(&pin_list->count)); + i, atomic_read(&pin_list->count)); list_for_each_entry(pin, &pin_list->list, list) ret += scnprintf(buf + ret, PAGE_SIZE - ret, diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 5abf356e..52d74eec 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -155,9 +155,11 @@ static inline bool journal_pin_active(struct journal_entry_pin *pin) static inline struct journal_entry_pin_list * journal_seq_pin(struct journal *j, u64 seq) { - return &j->pin.data[(size_t) seq & j->pin.mask]; + return &j->pin.data[seq & j->pin.mask]; } +u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); + void bch2_journal_pin_add(struct journal *, struct journal_res *, struct journal_entry_pin *, journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); @@ -195,8 +197,11 @@ static inline void bch2_journal_set_has_inode(struct journal *j, u64 inum) { struct journal_buf *buf = &j->buf[res->idx]; + unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); - set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode); + /* avoid atomic op if possible */ + if (unlikely(!test_bit(bit, buf->has_inode))) + set_bit(bit, buf->has_inode); } /* @@ -234,7 +239,7 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res unsigned actual = jset_u64s(u64s); EBUG_ON(!res->ref); - BUG_ON(actual > res->u64s); + EBUG_ON(actual > res->u64s); bch2_journal_add_entry_at(buf, res->offset, type, id, level, data, u64s); @@ -352,6 +357,9 @@ out: return 0; } +u64 bch2_journal_last_unwritten_seq(struct journal *); +int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); + void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); @@ -370,6 +378,8 @@ static inline int bch2_journal_error(struct journal *j) ? -EIO : 0; } +struct bch_dev; + static inline bool journal_flushes_device(struct bch_dev *ca) { return true; diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 5eea6579..e39b18f2 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -140,6 +140,7 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; + struct closure_waitlist async_wait; struct closure io; struct delayed_work write_work; @@ -166,7 +167,10 @@ struct journal { * needed. When all journal entries in the oldest journal bucket are no * longer needed, the bucket can be discarded and reused. 
*/ - DECLARE_FIFO(struct journal_entry_pin_list, pin); + struct { + u64 front, back, size, mask; + struct journal_entry_pin_list *data; + } pin; struct journal_entry_pin_list *replay_pin_list; struct mutex blacklist_lock; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 2033db81..01c88960 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -162,7 +162,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) bch2_bkey_devs(k)); if (ret) break; - bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_next(&iter); continue; } @@ -265,10 +265,6 @@ retry: } } bch2_btree_iter_unlock(&iter); - - /* btree root */ - mutex_lock(&c->btree_root_lock); - mutex_unlock(&c->btree_root_lock); } ret = 0; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 7c7f436c..a67e7a45 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -49,10 +49,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); struct bkey_i_extent *insert, *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; @@ -143,7 +143,7 @@ nomatch: &m->ctxt->stats->sectors_raced); atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); - bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_next_slot(&iter); goto next; } out: @@ -435,7 +435,7 @@ next: atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k), &stats->sectors_seen); next_nondata: - bch2_btree_iter_advance_pos(&stats->iter); + bch2_btree_iter_next(&stats->iter); bch2_btree_iter_cond_resched(&stats->iter); } diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 854f7f55..6ab2c866 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -245,7 +245,7 @@ int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, memset(&msgs, 0, sizeof(msgs)); for_each_set_qtype(c, i, q, qtypes) - mutex_lock(&q->lock); + mutex_lock_nested(&q->lock, i); for_each_set_qtype(c, i, q, qtypes) { mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); @@ -296,7 +296,7 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, memset(&msgs, 0, sizeof(msgs)); for_each_set_qtype(c, i, q, qtypes) - mutex_lock(&q->lock); + mutex_lock_nested(&q->lock, i); for_each_set_qtype(c, i, q, qtypes) { src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); @@ -735,8 +735,8 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p, - BTREE_ITER_WITH_HOLES|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_with_holes(&iter); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); ret = btree_iter_err(k); if (unlikely(ret)) diff --git a/libbcachefs/six.c b/libbcachefs/six.c index c60a6730..f0ff8d41 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -10,10 +10,6 @@ #define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) #define six_release(l) lock_release(l, 0, _RET_IP_) -#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) - struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ u64 lock_val; @@ -31,6 +27,10 
@@ struct six_lock_vals { enum six_lock_type unlock_wakeup; }; +#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) +#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) +#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) + #define LOCK_VALS { \ [SIX_LOCK_read] = { \ .lock_val = __SIX_VAL(read_lock, 1), \ @@ -55,25 +55,40 @@ struct six_lock_vals { }, \ } -static void six_set_owner(struct six_lock *lock, enum six_lock_type type) +static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, + union six_lock_state old) { - if (type == SIX_LOCK_intent) + if (type != SIX_LOCK_intent) + return; + + if (!old.intent_lock) { + EBUG_ON(lock->owner); lock->owner = current; + } else { + EBUG_ON(lock->owner != current); + } } -static void six_clear_owner(struct six_lock *lock, enum six_lock_type type) +static inline void six_clear_owner(struct six_lock *lock, enum six_lock_type type) { - if (type == SIX_LOCK_intent) + if (type != SIX_LOCK_intent) + return; + + EBUG_ON(lock->owner != current); + + if (lock->state.intent_lock == 1) lock->owner = NULL; } -static inline bool __six_trylock_type(struct six_lock *lock, - enum six_lock_type type) +static __always_inline bool do_six_trylock_type(struct six_lock *lock, + enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old; u64 v = READ_ONCE(lock->state.v); + EBUG_ON(type == SIX_LOCK_write && lock->owner != current); + do { old.v = v; @@ -86,23 +101,24 @@ static inline bool __six_trylock_type(struct six_lock *lock, } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, old.v, old.v + l[type].lock_val)) != old.v); + + six_set_owner(lock, type, old); return true; } -bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +__always_inline __flatten +static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) { - bool ret = __six_trylock_type(lock, type); - - if (ret) { - six_acquire(&lock->dep_map, 1); - six_set_owner(lock, type); - } + if (!do_six_trylock_type(lock, type)) + return false; - return ret; + six_acquire(&lock->dep_map, 1); + return true; } -bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) +__always_inline __flatten +static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old; @@ -117,8 +133,8 @@ bool six_relock_type(struct six_lock *lock, enum six_lock_type type, old.v, old.v + l[type].lock_val)) != old.v); + six_set_owner(lock, type, old); six_acquire(&lock->dep_map, 1); - six_set_owner(lock, type); return true; } @@ -150,7 +166,8 @@ static inline int six_can_spin_on_owner(struct six_lock *lock) return retval; } -static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner) +static inline bool six_spin_on_owner(struct six_lock *lock, + struct task_struct *owner) { bool ret = true; @@ -176,7 +193,7 @@ static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner) return ret; } -static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) { struct task_struct *task = current; @@ -201,7 +218,7 @@ static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) if (owner && !six_spin_on_owner(lock, owner)) break; - if (__six_trylock_type(lock, type)) { + if (do_six_trylock_type(lock, type)) { osq_unlock(&lock->osq); preempt_enable(); return true; @@ -240,20 
+257,16 @@ fail: return false; } -void six_lock_type(struct six_lock *lock, enum six_lock_type type) +noinline +static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old, new; struct six_lock_waiter wait; u64 v; - six_acquire(&lock->dep_map, 0); - - if (__six_trylock_type(lock, type)) - goto done; - if (six_optimistic_spin(lock, type)) - goto done; + return; lock_contended(&lock->dep_map, _RET_IP_); @@ -262,7 +275,9 @@ void six_lock_type(struct six_lock *lock, enum six_lock_type type) while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (list_empty_careful(&wait.list)) { + if (type == SIX_LOCK_write) + EBUG_ON(lock->owner != current); + else if (list_empty_careful(&wait.list)) { raw_spin_lock(&lock->wait_lock); list_add_tail(&wait.list, &lock->wait_list[type]); raw_spin_unlock(&lock->wait_lock); @@ -287,6 +302,8 @@ void six_lock_type(struct six_lock *lock, enum six_lock_type type) schedule(); } + six_set_owner(lock, type, old); + __set_current_state(TASK_RUNNING); if (!list_empty_careful(&wait.list)) { @@ -294,9 +311,17 @@ void six_lock_type(struct six_lock *lock, enum six_lock_type type) list_del_init(&wait.list); raw_spin_unlock(&lock->wait_lock); } -done: +} + +__always_inline +static void __six_lock_type(struct six_lock *lock, enum six_lock_type type) +{ + six_acquire(&lock->dep_map, 0); + + if (!do_six_trylock_type(lock, type)) + __six_lock_type_slowpath(lock, type); + lock_acquired(&lock->dep_map, _RET_IP_); - six_set_owner(lock, type); } static inline void six_lock_wakeup(struct six_lock *lock, @@ -315,6 +340,14 @@ static inline void six_lock_wakeup(struct six_lock *lock, clear_bit(waitlist_bitnr(waitlist_id), (unsigned long *) &lock->state.v); + if (waitlist_id == SIX_LOCK_write) { + struct task_struct *p = READ_ONCE(lock->owner); + + if (p) + wake_up_process(p); + return; + } + raw_spin_lock(&lock->wait_lock); list_for_each_entry_safe(w, next, wait_list, list) { @@ -332,26 +365,87 @@ static inline void six_lock_wakeup(struct six_lock *lock, raw_spin_unlock(&lock->wait_lock); } -void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +__always_inline __flatten +static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state state; - six_clear_owner(lock, type); - EBUG_ON(!(lock->state.v & l[type].held_mask)); EBUG_ON(type == SIX_LOCK_write && !(lock->state.v & __SIX_LOCK_HELD_intent)); + six_clear_owner(lock, type); + state.v = atomic64_add_return_release(l[type].unlock_val, &lock->state.counter); six_release(&lock->dep_map); six_lock_wakeup(lock, state, l[type].unlock_wakeup); } -bool six_trylock_convert(struct six_lock *lock, - enum six_lock_type from, - enum six_lock_type to) +#ifdef SIX_LOCK_SEPARATE_LOCKFNS + +#define __SIX_LOCK(type) \ +bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return __six_trylock_type(lock, SIX_LOCK_##type); \ +} \ + \ +bool six_relock_##type(struct six_lock *lock, u32 seq) \ +{ \ + return __six_relock_type(lock, SIX_LOCK_##type, seq); \ +} \ + \ +void six_lock_##type(struct six_lock *lock) \ +{ \ + __six_lock_type(lock, SIX_LOCK_##type); \ +} \ + \ +void six_unlock_##type(struct six_lock *lock) \ +{ \ + __six_unlock_type(lock, SIX_LOCK_##type); \ +} + +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) + +#undef __SIX_LOCK + +#else + +bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + return __six_trylock_type(lock, 
type); +} + +bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + return __six_relock_type(lock, type, seq); + +} + +void six_lock_type(struct six_lock *lock, enum six_lock_type type) +{ + __six_lock_type(lock, type); +} + +void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + __six_unlock_type(lock, type); +} + +#endif + +/* Convert from intent to read: */ +void six_lock_downgrade(struct six_lock *lock) +{ + six_lock_increment(lock, SIX_LOCK_read); + six_unlock_intent(lock); +} + +bool six_lock_tryupgrade(struct six_lock *lock) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old, new; @@ -359,22 +453,41 @@ bool six_trylock_convert(struct six_lock *lock, do { new.v = old.v = v; - new.v += l[from].unlock_val; - if (new.v & l[to].lock_fail) + EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask)); + + new.v += l[SIX_LOCK_read].unlock_val; + + if (new.v & l[SIX_LOCK_intent].lock_fail) return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, - new.v + l[to].lock_val)) != old.v); - six_clear_owner(lock, from); - six_set_owner(lock, to); + new.v += l[SIX_LOCK_intent].lock_val; + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, new.v)) != old.v); - six_lock_wakeup(lock, new, l[from].unlock_wakeup); + six_set_owner(lock, SIX_LOCK_intent, old); + six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup); return true; } +bool six_trylock_convert(struct six_lock *lock, + enum six_lock_type from, + enum six_lock_type to) +{ + EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); + + if (to == from) + return true; + + if (to == SIX_LOCK_read) { + six_lock_downgrade(lock); + return true; + } else { + return six_lock_tryupgrade(lock); + } +} + /* * Increment read/intent lock count, assuming we already have it read or intent * locked: @@ -390,10 +503,3 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) atomic64_add(l[type].lock_val, &lock->state.counter); } - -/* Convert from intent to read: */ -void six_lock_downgrade(struct six_lock *lock) -{ - six_lock_increment(lock, SIX_LOCK_read); - six_unlock_intent(lock); -} diff --git a/libbcachefs/six.h b/libbcachefs/six.h index 0f319df6..f518c64c 100644 --- a/libbcachefs/six.h +++ b/libbcachefs/six.h @@ -8,6 +8,8 @@ #include "util.h" +#define SIX_LOCK_SEPARATE_LOCKFNS + /* * LOCK STATES: * @@ -68,7 +70,7 @@ struct six_lock { struct optimistic_spin_queue osq; raw_spinlock_t wait_lock; - struct list_head wait_list[3]; + struct list_head wait_list[2]; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -82,7 +84,6 @@ static __always_inline void __six_lock_init(struct six_lock *lock, raw_spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_write]); #ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *) lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); @@ -96,16 +97,60 @@ do { \ __six_lock_init((lock), #lock, &__key); \ } while (0) +#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) + +#ifdef SIX_LOCK_SEPARATE_LOCKFNS + +#define __SIX_LOCK(type) \ +bool six_trylock_##type(struct six_lock *); \ +bool six_relock_##type(struct six_lock *, u32); \ +void six_lock_##type(struct six_lock *); \ +void six_unlock_##type(struct six_lock *); + +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) +#undef __SIX_LOCK + +#define 
SIX_LOCK_DISPATCH(type, fn, ...) \ + switch (type) { \ + case SIX_LOCK_read: \ + return fn##_read(__VA_ARGS__); \ + case SIX_LOCK_intent: \ + return fn##_intent(__VA_ARGS__); \ + case SIX_LOCK_write: \ + return fn##_write(__VA_ARGS__); \ + default: \ + BUG(); \ + } + +static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + SIX_LOCK_DISPATCH(type, six_trylock, lock); +} + +static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + SIX_LOCK_DISPATCH(type, six_relock, lock, seq); +} + +static inline void six_lock_type(struct six_lock *lock, enum six_lock_type type) +{ + SIX_LOCK_DISPATCH(type, six_lock, lock); +} + +static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + SIX_LOCK_DISPATCH(type, six_unlock, lock); +} + +#else + bool six_trylock_type(struct six_lock *, enum six_lock_type); bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned); void six_lock_type(struct six_lock *, enum six_lock_type); void six_unlock_type(struct six_lock *, enum six_lock_type); -bool six_trylock_convert(struct six_lock *, enum six_lock_type, - enum six_lock_type); -void six_lock_increment(struct six_lock *, enum six_lock_type); -void six_lock_downgrade(struct six_lock *); - -#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) #define __SIX_LOCK(type) \ static __always_inline bool six_trylock_##type(struct six_lock *lock) \ @@ -131,5 +176,15 @@ static __always_inline void six_unlock_##type(struct six_lock *lock) \ __SIX_LOCK(read) __SIX_LOCK(intent) __SIX_LOCK(write) +#undef __SIX_LOCK + +#endif + +void six_lock_downgrade(struct six_lock *); +bool six_lock_tryupgrade(struct six_lock *); +bool six_trylock_convert(struct six_lock *, enum six_lock_type, + enum six_lock_type); + +void six_lock_increment(struct six_lock *, enum six_lock_type); #endif /* _BCACHEFS_SIX_H */ diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 530cf0a4..0adb9a1c 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -131,12 +131,11 @@ bch2_hash_lookup_at(const struct bch_hash_desc desc, struct btree_iter *iter, const void *search) { u64 inode = iter->pos.inode; + struct bkey_s_c k; - do { - struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter); - - if (btree_iter_err(k)) - return k; + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + if (iter->pos.inode != inode) + break; if (k.k->type == desc.key_type) { if (!desc.cmp_key(k, search)) @@ -147,11 +146,8 @@ bch2_hash_lookup_at(const struct bch_hash_desc desc, /* hole, not found */ break; } - - bch2_btree_iter_advance_pos(iter); - } while (iter->pos.inode == inode); - - return bkey_s_c_err(-ENOENT); + } + return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT); } static inline struct bkey_s_c @@ -160,12 +156,11 @@ bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc, struct btree_iter *iter, struct bkey_s_c search) { u64 inode = iter->pos.inode; + struct bkey_s_c k; - do { - struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter); - - if (btree_iter_err(k)) - return k; + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + if (iter->pos.inode != inode) + break; if (k.k->type == desc.key_type) { if (!desc.cmp_bkey(k, search)) @@ -176,11 +171,8 @@ bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc, /* hole, not found */ break; } - - bch2_btree_iter_advance_pos(iter); - } while (iter->pos.inode == inode); - - return bkey_s_c_err(-ENOENT); + } + return btree_iter_err(k) ? 
k : bkey_s_c_err(-ENOENT); } static inline struct bkey_s_c @@ -190,7 +182,8 @@ bch2_hash_lookup(const struct bch_hash_desc desc, struct btree_iter *iter, const void *key) { bch2_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), 0); + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS); return bch2_hash_lookup_at(desc, info, iter, key); } @@ -203,7 +196,7 @@ bch2_hash_lookup_intent(const struct bch_hash_desc desc, { bch2_btree_iter_init(iter, c, desc.btree_id, POS(inode, desc.hash_key(info, key)), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); return bch2_hash_lookup_at(desc, info, iter, key); } @@ -211,20 +204,17 @@ bch2_hash_lookup_intent(const struct bch_hash_desc desc, static inline struct bkey_s_c bch2_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter) { - while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter); + u64 inode = iter->pos.inode; + struct bkey_s_c k; - if (btree_iter_err(k)) - return k; + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + if (iter->pos.inode != inode) + break; if (k.k->type != desc.key_type) return k; - - /* hash collision, keep going */ - bch2_btree_iter_advance_pos(iter); - if (iter->pos.inode != k.k->p.inode) - return bkey_s_c_err(-ENOENT); } + return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT); } static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc, @@ -235,7 +225,7 @@ static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc, { bch2_btree_iter_init(iter, c, desc.btree_id, POS(inode, desc.hash_key(info, key)), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); return bch2_hash_hole_at(desc, iter); } @@ -245,16 +235,12 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc, struct btree_iter *iter, struct btree_iter *start) { + struct bkey_s_c k; + bch2_btree_iter_set_pos(iter, btree_type_successor(start->btree_id, start->pos)); - while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_with_holes(iter); - int ret = btree_iter_err(k); - - if (ret) - return ret; - + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { if (k.k->type != desc.key_type && k.k->type != desc.whiteout_type) return false; @@ -262,9 +248,8 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc, if (k.k->type == desc.key_type && desc.hash_bkey(info, k) <= start->pos.offset) return true; - - bch2_btree_iter_advance_pos(iter); } + return btree_iter_err(k); } static inline int bch2_hash_set(const struct bch_hash_desc desc, @@ -279,9 +264,9 @@ static inline int bch2_hash_set(const struct bch_hash_desc desc, bch2_btree_iter_init(&hashed_slot, c, desc.btree_id, POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); bch2_btree_iter_init(&iter, c, desc.btree_id, hashed_slot.pos, - BTREE_ITER_INTENT); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); bch2_btree_iter_link(&hashed_slot, &iter); retry: /* @@ -354,7 +339,7 @@ static inline int bch2_hash_delete_at(const struct bch_hash_desc desc, int ret = -ENOENT; bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id, - iter->pos, 0); + iter->pos, BTREE_ITER_SLOTS); bch2_btree_iter_link(iter, &whiteout_iter); ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter); @@ -385,9 +370,10 @@ static inline int bch2_hash_delete(const struct bch_hash_desc desc, bch2_btree_iter_init(&iter, c, desc.btree_id, POS(inode, desc.hash_key(info, key)), - BTREE_ITER_INTENT); + 
BTREE_ITER_SLOTS|BTREE_ITER_INTENT); bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), 0); + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS); bch2_btree_iter_link(&iter, &whiteout_iter); retry: k = bch2_hash_lookup_at(desc, info, &iter, key); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 8dce7dc1..e3c319fc 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -490,7 +490,7 @@ static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) unsigned order; reread: bio_reset(sb->bio); - sb->bio->bi_bdev = sb->bdev; + bio_set_dev(sb->bio, sb->bdev); sb->bio->bi_iter.bi_sector = offset; sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); @@ -588,7 +588,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts, * superblocks: */ bio_reset(sb->bio); - sb->bio->bi_bdev = sb->bdev; + bio_set_dev(sb->bio, sb->bdev); sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); @@ -667,7 +667,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) null_nonce(), sb); bio_reset(bio); - bio->bi_bdev = ca->disk_sb.bdev; + bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); bio->bi_iter.bi_size = roundup(vstruct_bytes(sb), diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 29ffba65..8c7a147a 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -197,6 +197,29 @@ int bch2_congested(void *data, int bdi_bits) * - allocator depends on the journal (when it rewrites prios and gens) */ +static void bch_fs_mark_clean(struct bch_fs *c) +{ + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb, true); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } +} + +static bool btree_interior_updates_done(struct bch_fs *c) +{ + bool ret; + + mutex_lock(&c->btree_interior_update_lock); + ret = list_empty(&c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + return ret; +} + static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; @@ -213,17 +236,38 @@ static void __bch2_fs_read_only(struct bch_fs *c) * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: */ - bch2_journal_flush_all_pins(&c->journal); + bch2_journal_flush_pins(&c->journal, U64_MAX - 1); for_each_member_device(ca, c, i) bch2_dev_allocator_stop(ca); - bch2_fs_journal_stop(&c->journal); + bch2_journal_flush_all_pins(&c->journal); - if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags)) + /* + * We need to explicitly wait on btree interior updates to complete + * before stopping the journal, flushing all journal pins isn't + * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree + * interior updates have to drop their journal pin before they're + * fully complete: + */ + closure_wait_event(&c->btree_interior_update_wait, + btree_interior_updates_done(c)); + + if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) bch2_btree_verify_flushed(c); + bch2_fs_journal_stop(&c->journal); + + /* + * the journal kicks off btree writes via reclaim - wait for in flight + * writes after stopping journal: + */ + if (test_bit(BCH_FS_EMERGENCY_RO, 
&c->flags)) + bch2_btree_flush_all_writes(c); + + /* + * After stopping journal: + */ for_each_member_device(ca, c, i) bch2_dev_allocator_remove(c, ca); } @@ -274,19 +318,12 @@ void bch2_fs_read_only(struct bch_fs *c) __bch2_fs_read_only(c); + bch_fs_mark_clean(c); + wait_event(bch_read_only_wait, test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - - if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags)) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb, true); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - c->state = BCH_FS_RO; } @@ -400,29 +437,22 @@ static void bch2_fs_free(struct bch_fs *c) module_put(THIS_MODULE); } -static void bch2_fs_exit(struct bch_fs *c) +static void bch2_fs_release(struct kobject *kobj) { - unsigned i; - - cancel_delayed_work_sync(&c->pd_controllers_update); - cancel_work_sync(&c->read_only_work); - - for (i = 0; i < c->sb.nr_devices; i++) - if (c->devs[i]) - bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - closure_debug_destroy(&c->cl); - kobject_put(&c->kobj); + bch2_fs_free(c); } -static void bch2_fs_offline(struct bch_fs *c) +void bch2_fs_stop(struct bch_fs *c) { struct bch_dev *ca; unsigned i; - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); + mutex_lock(&c->state_lock); + BUG_ON(c->state == BCH_FS_STOPPING); + c->state = BCH_FS_STOPPING; + mutex_unlock(&c->state_lock); for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && @@ -440,30 +470,34 @@ static void bch2_fs_offline(struct bch_fs *c) kobject_put(&c->opts_dir); kobject_put(&c->internal); + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + mutex_unlock(&bch_fs_list_lock); + + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + mutex_lock(&c->state_lock); __bch2_fs_read_only(c); mutex_unlock(&c->state_lock); -} -static void bch2_fs_release(struct kobject *kobj) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + bch_fs_mark_clean(c); - bch2_fs_free(c); -} + /* btree prefetch might have kicked off reads in the background: */ + bch2_btree_flush_all_reads(c); -void bch2_fs_stop(struct bch_fs *c) -{ - mutex_lock(&c->state_lock); - BUG_ON(c->state == BCH_FS_STOPPING); - c->state = BCH_FS_STOPPING; - mutex_unlock(&c->state_lock); + for_each_member_device(ca, c, i) + cancel_work_sync(&ca->io_error_work); - bch2_fs_offline(c); + cancel_work_sync(&c->btree_write_error_work); + cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); - closure_sync(&c->cl); + for (i = 0; i < c->sb.nr_devices; i++) + if (c->devs[i]) + bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); - bch2_fs_exit(c); + kobject_put(&c->kobj); } static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) @@ -545,6 +579,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_opts_apply(&c->opts, opts); c->block_bits = ilog2(c->opts.block_size); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); c->opts.nochanges |= c->opts.noreplay; c->opts.read_only |= c->opts.nochanges; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 1e8f8735..597e1f02 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -12,8 +12,10 @@ #include "compress.h" #include "sysfs.h" #include "btree_cache.h" +#include "btree_io.h" #include "btree_iter.h" #include "btree_update.h" +#include 
"btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" #include "inode.h" @@ -146,6 +148,8 @@ read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); read_attribute(journal_pins); +read_attribute(btree_updates); +read_attribute(dirty_btree_nodes); read_attribute(internal_uuid); @@ -343,6 +347,12 @@ SHOW(bch2_fs) if (attr == &sysfs_journal_pins) return bch2_journal_print_pins(&c->journal, buf); + if (attr == &sysfs_btree_updates) + return bch2_btree_updates_print(c, buf); + + if (attr == &sysfs_dirty_btree_nodes) + return bch2_dirty_btree_nodes_print(c, buf); + if (attr == &sysfs_compression_stats) return bch2_compression_stats(c, buf); @@ -474,6 +484,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_alloc_debug, &sysfs_journal_debug, &sysfs_journal_pins, + &sysfs_btree_updates, + &sysfs_dirty_btree_nodes, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, -- cgit v1.2.3