author		Kent Overstreet <kent.overstreet@gmail.com>	2021-03-28 17:38:28 -0400
committer	Kent Overstreet <kent.overstreet@gmail.com>	2021-03-29 00:22:38 -0400
commit		a2094890a90a2f865e49f94e8448deca7e5852ef
tree		11bf5f426509e288b2b3482492c805a26bb1885a /libbcachefs
parent		bb6eccc2ecd4728871bfc70462d3a4a20daa9d68
Update bcachefs sources to 18686af684 bcachefs: Inode backpointers (tag: v0.13)
Diffstat (limited to 'libbcachefs')
36 files changed, 750 insertions, 376 deletions
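Editor's note: the headline change in this commit is that struct bpos is now compared including its snapshot field in most of the btree code — SPOS() becomes the primary position constructor, POS() now means "snapshot 0", and a new bpos_cmp() orders all three fields while bkey_cmp() deliberately ignores the snapshot. Below is a minimal standalone sketch of that split, adapted from the bcachefs_format.h and bkey.h hunks that follow. The cmp_int() macro here and the demo main() are illustrative stand-ins, not part of the patch; the in-tree versions use the kernel's cmp_int() with the GNU `?:` extension and __always_inline.

	#include <stdint.h>
	#include <stdio.h>

	struct bpos { uint64_t inode, offset; uint32_t snapshot; };

	/* stand-in for the kernel's cmp_int(): returns -1, 0 or 1 */
	#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

	/* SPOS() is the new constructor; POS() now means "snapshot 0": */
	static inline struct bpos SPOS(uint64_t inode, uint64_t offset,
				       uint32_t snapshot)
	{
		return (struct bpos) {
			.inode		= inode,
			.offset		= offset,
			.snapshot	= snapshot,
		};
	}
	#define POS(_inode, _offset)	SPOS(_inode, _offset, 0)

	/* bpos_cmp(): total order on positions, snapshot included */
	static inline int bpos_cmp(struct bpos l, struct bpos r)
	{
		int c;
		if ((c = cmp_int(l.inode, r.inode)))
			return c;
		if ((c = cmp_int(l.offset, r.offset)))
			return c;
		return cmp_int(l.snapshot, r.snapshot);
	}

	/* bkey_cmp(): as before, but now ignores the snapshot field */
	static inline int bkey_cmp(struct bpos l, struct bpos r)
	{
		int c;
		if ((c = cmp_int(l.inode, r.inode)))
			return c;
		return cmp_int(l.offset, r.offset);
	}

	int main(void)
	{
		/* same inode:offset, different snapshot: */
		struct bpos a = SPOS(1, 8, 1), b = SPOS(1, 8, 2);

		printf("bpos_cmp: %d\n", bpos_cmp(a, b));	/* -1 */
		printf("bkey_cmp: %d\n", bkey_cmp(a, b));	/*  0 */
		return 0;
	}

Most of the mechanical churn in the diff below is converting call sites from bkey_cmp() to bpos_cmp() wherever the snapshot field must participate in ordering (node bounds, iterator positions, GC positions, the key cache), while extent-overlap logic keeps the snapshot-oblivious bkey_cmp().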
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 532f23b9..cb225951 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -138,19 +138,18 @@ struct bpos { #define KEY_SNAPSHOT_MAX ((__u32)~0U) #define KEY_SIZE_MAX ((__u32)~0U) -static inline struct bpos POS(__u64 inode, __u64 offset) +static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) { - struct bpos ret; - - ret.inode = inode; - ret.offset = offset; - ret.snapshot = 0; - - return ret; + return (struct bpos) { + .inode = inode, + .offset = offset, + .snapshot = snapshot, + }; } -#define POS_MIN POS(0, 0) -#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) +#define POS_MIN SPOS(0, 0, 0) +#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) +#define POS(_inode, _offset) SPOS(_inode, _offset, 0) /* Empty placeholder struct, for container_of() */ struct bch_val { @@ -707,7 +706,9 @@ struct bch_inode_generation { x(bi_foreground_target, 16) \ x(bi_background_target, 16) \ x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -743,6 +744,7 @@ enum { __BCH_INODE_I_SIZE_DIRTY= 5, __BCH_INODE_I_SECTORS_DIRTY= 6, __BCH_INODE_UNLINKED = 7, + __BCH_INODE_BACKPTR_UNTRUSTED = 8, /* bits 20+ reserved for packed fields below: */ }; @@ -755,6 +757,7 @@ enum { #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) +#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); @@ -1204,7 +1207,9 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_new_versioning = 10, bcachefs_metadata_version_bkey_renumber = 10, bcachefs_metadata_version_inode_btree_change = 11, - bcachefs_metadata_version_max = 12, + bcachefs_metadata_version_snapshot = 12, + bcachefs_metadata_version_inode_backpointers = 13, + bcachefs_metadata_version_max = 14, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1736,7 +1741,7 @@ struct btree_node { /* Closed interval: */ struct bpos min_key; struct bpos max_key; - struct bch_extent_ptr ptr; + struct bch_extent_ptr _ptr; /* not used anymore */ struct bkey_format format; union { diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index e1906f25..3af56062 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -614,15 +614,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) return "incorrect number of fields"; for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (f->bits_per_field[i] > 64) + if (f->bits_per_field[i] > unpacked_bits) return "field too large"; - if (field_offset && - (f->bits_per_field[i] == 64 || - (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < - field_offset))) + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return "offset + bits overflow"; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) return "offset + bits overflow"; bits += f->bits_per_field[i]; @@ -1045,7 +1049,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct 
bkey_packed *l, high_word(f, r), b->nr_key_bits); - EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), + EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), bkey_unpack_pos(b, r))); return ret; } @@ -1055,7 +1059,7 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, const struct bkey_packed *l, const struct bpos *r) { - return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); + return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); } __pure __flatten @@ -1076,7 +1080,7 @@ int bch2_bkey_cmp_packed(const struct btree *b, r = (void*) &unpacked; } - return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); + return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); } __pure __flatten @@ -1087,7 +1091,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b, const struct bkey *l_unpacked; return unlikely(l_unpacked = packed_to_bkey_c(l)) - ? bkey_cmp(l_unpacked->p, *r) + ? bpos_cmp(l_unpacked->p, *r) : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } @@ -1123,11 +1127,12 @@ void bch2_bkey_pack_test(void) struct bkey_packed p; struct bkey_format test_format = { - .key_u64s = 2, + .key_u64s = 3, .nr_fields = BKEY_NR_FIELDS, .bits_per_field = { 13, 64, + 32, }, }; diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 629288a6..2e45d88f 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -33,16 +33,6 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) -static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, - struct bkey_packed *end) -{ - k = bkey_next(k); - - while (k != end && !k->u64s) - k = (void *) ((u64 *) k + 1); - return k; -} - #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) @@ -150,29 +140,27 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } -#if 1 +static __always_inline int bpos_cmp(struct bpos l, struct bpos r) +{ + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset) ?: + cmp_int(l.snapshot, r.snapshot); +} + static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { - if (l.inode != r.inode) - return l.inode < r.inode ? -1 : 1; - if (l.offset != r.offset) - return l.offset < r.offset ? -1 : 1; - if (l.snapshot != r.snapshot) - return l.snapshot < r.snapshot ? -1 : 1; - return 0; + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset); } -#else -int bkey_cmp(struct bpos l, struct bpos r); -#endif static inline struct bpos bpos_min(struct bpos l, struct bpos r) { - return bkey_cmp(l, r) < 0 ? l : r; + return bpos_cmp(l, r) < 0 ? l : r; } static inline struct bpos bpos_max(struct bpos l, struct bpos r) { - return bkey_cmp(l, r) > 0 ? l : r; + return bpos_cmp(l, r) > 0 ? 
l : r; } #define sbb(a, b, borrow) \ @@ -200,7 +188,7 @@ static inline struct bpos bpos_sub(struct bpos a, struct bpos b) static inline struct bpos bpos_diff(struct bpos l, struct bpos r) { - if (bkey_cmp(l, r) > 0) + if (bpos_cmp(l, r) > 0) swap(l, r); return bpos_sub(r, l); @@ -262,24 +250,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format) format->bits_per_field[BKEY_FIELD_SNAPSHOT]; } -static inline struct bpos bkey_successor(struct bpos p) +static inline struct bpos bpos_successor(struct bpos p) { - struct bpos ret = p; + if (!++p.snapshot && + !++p.offset && + !++p.inode) + BUG(); - if (!++ret.offset) - BUG_ON(!++ret.inode); + return p; +} - return ret; +static inline struct bpos bpos_predecessor(struct bpos p) +{ + if (!p.snapshot-- && + !p.offset-- && + !p.inode--) + BUG(); + + return p; } -static inline struct bpos bkey_predecessor(struct bpos p) +static inline struct bpos bpos_nosnap_successor(struct bpos p) { - struct bpos ret = p; + p.snapshot = 0; - if (!ret.offset--) - BUG_ON(!ret.inode--); + if (!++p.offset && + !++p.inode) + BUG(); - return ret; + return p; +} + +static inline struct bpos bpos_nosnap_predecessor(struct bpos p) +{ + p.snapshot = 0; + + if (!p.offset-- && + !p.inode--) + BUG(); + + return p; } static inline u64 bkey_start_offset(const struct bkey *k) diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 641169ef..6fe95b80 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -119,10 +119,17 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, return "nonzero size field"; } - if (k.k->p.snapshot) + if (type != BKEY_TYPE_btree && + !btree_type_has_snapshots(type) && + k.k->p.snapshot) return "nonzero snapshot"; if (type != BKEY_TYPE_btree && + btree_type_has_snapshots(type) && + k.k->p.snapshot != U32_MAX) + return "invalid snapshot field"; + + if (type != BKEY_TYPE_btree && !bkey_cmp(k.k->p, POS_MAX)) return "POS_MAX key"; @@ -138,10 +145,10 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) { - if (bkey_cmp(k.k->p, b->data->min_key) < 0) + if (bpos_cmp(k.k->p, b->data->min_key) < 0) return "key before start of btree node"; - if (bkey_cmp(k.k->p, b->data->max_key) > 0) + if (bpos_cmp(k.k->p, b->data->max_key) > 0) return "key past end of btree node"; return NULL; @@ -165,9 +172,9 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { - if (!bkey_cmp(pos, POS_MIN)) + if (!bpos_cmp(pos, POS_MIN)) pr_buf(out, "POS_MIN"); - else if (!bkey_cmp(pos, POS_MAX)) + else if (!bpos_cmp(pos, POS_MAX)) pr_buf(out, "POS_MAX"); else { if (pos.inode == U64_MAX) @@ -256,7 +263,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, !ops->key_merge || l.k->type != r.k->type || bversion_cmp(l.k->version, r.k->version) || - bkey_cmp(l.k->p, bkey_start_pos(r.k))) + bpos_cmp(l.k->p, bkey_start_pos(r.k))) return BCH_MERGE_NOMERGE; ret = ops->key_merge(c, l, r); @@ -310,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, const struct bkey_ops *ops; struct bkey uk; struct bkey_s u; + unsigned nr_compat = 5; int i; /* * Do these operations in reverse order in the write path: */ - for (i = 0; i < 4; i++) - switch (!write ? i : 3 - i) { + for (i = 0; i < nr_compat; i++) + switch (!write ? 
i : nr_compat - 1 - i) { case 0: if (big_endian != CPU_BIG_ENDIAN) bch2_bkey_swab_key(f, k); @@ -351,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, } break; case 3: + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + struct bkey_i *u = packed_to_bkey(k); + + if (u) { + u->k.p.snapshot = write + ? 0 : U32_MAX; + } else { + u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; + u64 max_packed = min_packed + + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + uk = __bch2_bkey_unpack_key(f, k); + uk.p.snapshot = write + ? min_packed : min_t(u64, U32_MAX, max_packed); + + BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); + } + } + + break; + case 4: if (!bkey_packed(k)) { u = bkey_i_to_s(packed_to_bkey(k)); } else { diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index f2507079..537ab791 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -45,7 +45,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) BUG_ON(!iter->used); - i->k = bkey_next_skip_noops(i->k, i->end); + i->k = bkey_next(i->k); BUG_ON(i->k > i->end); diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 87f951e1..3fb9a9ed 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -78,7 +78,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, for (_k = i->start; _k < vstruct_last(i); _k = _n) { - _n = bkey_next_skip_noops(_k, vstruct_last(i)); + _n = bkey_next(_k); k = bkey_disassemble(b, _k, &uk); if (c) @@ -93,13 +93,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, n = bkey_unpack_key(b, _n); - if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { + if (bpos_cmp(n.p, k.k->p) < 0) { printk(KERN_ERR "Key skipped backwards\n"); continue; } if (!bkey_deleted(k.k) && - !bkey_cmp(n.p, k.k->p)) + !bpos_cmp(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } } @@ -534,7 +534,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, goto start; while (1) { if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, + BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, bkey_unpack_pos(b, k))); start: if (++j == t->size) @@ -544,7 +544,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + k = bkey_next(k); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -686,16 +686,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, if (is_power_of_2(j) && !min_key->u64s) { - k = (void *) min_key; - bkey_init(&k->k); - k->k.p = b->data->min_key; + if (!bkey_pack_pos(min_key, b->data->min_key, b)) { + k = (void *) min_key; + bkey_init(&k->k); + k->k.p = b->data->min_key; + } } if (is_power_of_2(j + 1) && !max_key->u64s) { - k = (void *) max_key; - bkey_init(&k->k); - k->k.p = t->max_key; + if (!bkey_pack_pos(max_key, b->data->max_key, b)) { + k = (void *) max_key; + bkey_init(&k->k); + k->k.p = t->max_key; + } } __make_bfloat(b, t, j, min_key, max_key); @@ -759,7 +763,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + prev = k, k = bkey_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -776,14 +780,19 @@ retry: } while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + prev = k, k = bkey_next(k); t->max_key = bkey_unpack_pos(b, prev); - bkey_init(&min_key.k); - min_key.k.p = 
b->data->min_key; - bkey_init(&max_key.k); - max_key.k.p = t->max_key; + if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; + } + + if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { + bkey_init(&max_key.k); + max_key.k.p = t->max_key; + } /* Then we build the tree */ eytzinger1_for_each(j, t->size) @@ -911,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next_skip_noops(i, k)) + for (i = p; i != k; i = bkey_next(i)) if (i->type >= min_key_type) ret = i; @@ -922,10 +931,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, BUG_ON(ret >= orig_k); for (i = ret - ? bkey_next_skip_noops(ret, orig_k) + ? bkey_next(ret) : btree_bkey_first(b, t); i != orig_k; - i = bkey_next_skip_noops(i, orig_k)) + i = bkey_next(i)) BUG_ON(i->type >= min_key_type); } @@ -960,7 +969,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, /* signal to make_bfloat() that they're uninitialized: */ min_key.u64s = max_key.u64s = 0; - if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { + if (bkey_next(k) == btree_bkey_last(b, t)) { t->max_key = bkey_unpack_pos(b, k); for (j = 1; j < t->size; j = j * 2 + 1) @@ -1084,7 +1093,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next_skip_noops(k, end); + k = bkey_next(k); if (k == end) break; @@ -1170,15 +1179,14 @@ void bch2_bset_delete(struct btree *b, __flatten static struct bkey_packed *bset_search_write_set(const struct btree *b, struct bset_tree *t, - struct bpos *search, - const struct bkey_packed *packed_search) + struct bpos *search) { unsigned l = 0, r = t->size; while (l + 1 != r) { unsigned m = (l + r) >> 1; - if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) + if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) l = m; else r = m; @@ -1238,9 +1246,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, prefetch(&base->f[n << 4]); f = &base->f[n]; - - if (!unlikely(packed_search)) - goto slowpath; if (unlikely(f->exponent >= BFLOAT_FAILED)) goto slowpath; @@ -1304,7 +1309,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_NO_AUX_TREE: return btree_bkey_first(b, t); case BSET_RW_AUX_TREE: - return bset_search_write_set(b, t, search, lossy_packed_search); + return bset_search_write_set(b, t, search); case BSET_RO_AUX_TREE: /* * Each node in the auxiliary search tree covers a certain range @@ -1313,7 +1318,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, * start and end - handle that here: */ - if (bkey_cmp(*search, t->max_key) > 0) + if (bpos_cmp(*search, t->max_key) > 0) return btree_bkey_last(b, t); return bset_search_tree(b, t, search, lossy_packed_search); @@ -1334,12 +1339,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, m, lossy_packed_search, search) < 0) - m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + m = bkey_next(m); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + m = bkey_next(m); if (bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); @@ -1403,16 +1408,15 @@ noinline __flatten __attribute__((cold)) static void 
btree_node_iter_init_pack_failed(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { - struct bset_tree *t; + struct bkey_packed *k; trace_bkey_pack_pos_fail(search); - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, NULL, NULL), - btree_bkey_last(b, t)); + bch2_btree_node_iter_init_from_start(iter, b); - bch2_btree_node_iter_sort(iter, b); + while ((k = bch2_btree_node_iter_peek(iter, b)) && + bkey_iter_pos_cmp(b, k, search) < 0) + bch2_btree_node_iter_advance(iter, b); } /** @@ -1446,7 +1450,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * to the search key is going to have 0 sectors after the search key. * * But this does mean that we can't just search for - * bkey_successor(start_of_range) to get the first extent that overlaps with + * bpos_successor(start_of_range) to get the first extent that overlaps with * the range we want - if we're unlucky and there's an extent that ends * exactly where we searched, then there could be a deleted key at the same * position and we'd get that when we search instead of the preceding extent @@ -1464,7 +1468,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct bkey_packed *k[MAX_BSETS]; unsigned i; - EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 54b364c8..506da4e0 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -305,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ _k != btree_bkey_last(_b, _t); \ - _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) + _k = bkey_next(_k)) static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { @@ -378,7 +378,7 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, EBUG_ON(r_packed && !bkey_packed(r_packed)); if (unlikely(!bkey_packed(l))) - return bkey_cmp(packed_to_bkey_c(l)->p, *r); + return bpos_cmp(packed_to_bkey_c(l)->p, *r); if (likely(r_packed)) return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); @@ -403,24 +403,6 @@ bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) return bch2_bkey_prev_filter(b, t, k, 1); } -enum bch_extent_overlap { - BCH_EXTENT_OVERLAP_ALL = 0, - BCH_EXTENT_OVERLAP_BACK = 1, - BCH_EXTENT_OVERLAP_FRONT = 2, - BCH_EXTENT_OVERLAP_MIDDLE = 3, -}; - -/* Returns how k overlaps with m */ -static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) -{ - int cmp1 = bkey_cmp(k->p, m->p) < 0; - int cmp2 = bkey_cmp(bkey_start_pos(k), - bkey_start_pos(m)) > 0; - - return (cmp1 << 1) + cmp2; -} - /* Btree key iteration */ void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index fc76e788..8a4667ba 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -149,7 +149,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, if (level) six_lock_pcpu_alloc(&b->c.lock); else - six_lock_pcpu_free(&b->c.lock); + six_lock_pcpu_free_rcu(&b->c.lock); mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); @@ -814,9 +814,9 @@ lock_node: EBUG_ON(b->c.btree_id != iter->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + 
EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bkey_cmp(b->data->min_key, + bpos_cmp(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; @@ -897,9 +897,9 @@ lock_node: EBUG_ON(b->c.btree_id != btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bkey_cmp(b->data->min_key, + bpos_cmp(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); out: bch2_btree_cache_cannibalize_unlock(c); @@ -1011,7 +1011,7 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - if (bkey_cmp(bkey_successor(n1->key.k.p), + if (bpos_cmp(bpos_successor(n1->key.k.p), n2->data->min_key)) { char buf1[200], buf2[200]; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 6d5ed774..88c549c4 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -64,7 +64,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, struct bpos node_end = b->data->max_key; struct bpos expected_start = bkey_deleted(&prev->k->k) ? node_start - : bkey_successor(prev->k->k.p); + : bpos_successor(prev->k->k.p); char buf1[200], buf2[200]; bool update_min = false; bool update_max = false; @@ -81,7 +81,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); } - if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, + if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " cur %s", @@ -92,7 +92,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, } if (fsck_err_on(is_last && - bkey_cmp(cur.k->k.p, node_end), c, + bpos_cmp(cur.k->k.p, node_end), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", @@ -470,8 +470,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, k, &max_stale, true); @@ -560,13 +560,13 @@ static int bch2_gc_btree_init(struct bch_fs *c, return 0; six_lock_read(&b->c.lock, NULL, NULL); - if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, + if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, "btree root with incorrect min_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { BUG(); } - if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, + if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, "btree root with incorrect max_key: %s", (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { BUG(); @@ -1148,7 +1148,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -1171,6 +1173,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_btree_iter_advance(iter); } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1271,6 +1274,9 @@ static void bch2_coalesce_nodes(struct bch_fs 
*c, struct btree_iter *iter, /* Find a format that all keys in @old_nodes can pack into */ bch2_bkey_format_init(&format_state); + /* + * XXX: this won't correctly take it account the new min/max keys: + */ for (i = 0; i < nr_old_nodes; i++) __bch2_btree_calc_format(&format_state, old_nodes[i]); @@ -1333,7 +1339,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, k < vstruct_last(s2) && vstruct_blocks_plus(n1->data, c->block_bits, u64s + k->u64s) <= blocks; - k = bkey_next_skip_noops(k, vstruct_last(s2))) { + k = bkey_next(k)) { last = k; u64s += k->u64s; } @@ -1362,7 +1368,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, n1->key.k.p = n1->data->max_key = bkey_unpack_pos(n1, last); - n2->data->min_key = bkey_successor(n1->data->max_key); + n2->data->min_key = bpos_successor(n1->data->max_key); memcpy_u64s(vstruct_last(s1), s2->start, u64s); @@ -1405,7 +1411,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, unsigned j; for (j = 0; j < nr_new_nodes; j++) - if (!bkey_cmp(old_nodes[i]->key.k.p, + if (!bpos_cmp(old_nodes[i]->key.k.p, new_nodes[j]->key.k.p)) goto next; diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index c3d02f58..b1362a9f 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -45,13 +45,9 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - if (l.phase != r.phase) - return l.phase < r.phase ? -1 : 1; - if (bkey_cmp(l.pos, r.pos)) - return bkey_cmp(l.pos, r.pos); - if (l.level != r.level) - return l.level < r.level ? -1 : 1; - return 0; + return cmp_int(l.phase, r.phase) ?: + bpos_cmp(l.pos, r.pos) ?: + cmp_int(l.level, r.level); } static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 9b74e799..b43d4468 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -32,13 +32,13 @@ static void verify_no_dups(struct btree *b, if (start == end) return; - for (p = start, k = bkey_next_skip_noops(start, end); + for (p = start, k = bkey_next(start); k != end; - p = k, k = bkey_next_skip_noops(k, end)) { + p = k, k = bkey_next(k)) { struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); + BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); } #endif } @@ -47,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; - k != vstruct_last(i); - k = bkey_next_skip_noops(k, vstruct_last(i))) + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) k->needs_whiteout = v; } @@ -213,7 +211,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next_skip_noops(k, end); + n = bkey_next(k); if (!bkey_deleted(k)) { bkey_copy(out, k); @@ -614,12 +612,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect level"); - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { - u64 *p = (u64 *) &bn->ptr; - - *p = swab64(*p); - } - if (!write) compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); @@ -633,14 +625,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, b->data->max_key = b->key.k.p; } - btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), + btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, 
b, NULL, "incorrect min_key: got %s should be %s", (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); } - btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), + btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); @@ -754,7 +746,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, } prev = k; - k = bkey_next_skip_noops(k, vstruct_last(i)); + k = bkey_next(k); } fsck_err: return ret; @@ -947,7 +939,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bp.v->mem_ptr = 0; } - k = bkey_next_skip_noops(k, vstruct_last(i)); + k = bkey_next(k); } bch2_bset_build_aux_tree(b, b->set, false); @@ -1327,8 +1319,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) return -1; - ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: - validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); + ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: + validate_bset(c, NULL, b, i, sectors, WRITE, false); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -1481,7 +1473,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, validate_before_checksum = true; /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change) + if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) validate_before_checksum = true; /* if we're going to be encrypting, check metadata validity first: */ diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 16ce6dff..9c14cd30 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -189,8 +189,8 @@ void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, struct bkey_format *f) + unsigned version, unsigned big_endian, + int write, struct bkey_format *f) { if (version < bcachefs_metadata_version_inode_btree_change && btree_id == BTREE_ID_inodes) { @@ -199,6 +199,16 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id, swap(f->field_offset[BKEY_FIELD_INODE], f->field_offset[BKEY_FIELD_OFFSET]); } + + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + u64 max_packed = + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + f->field_offset[BKEY_FIELD_SNAPSHOT] = write + ? 
0 + : U32_MAX - max_packed; + } } static inline void compat_bpos(unsigned level, enum btree_id btree_id, @@ -220,18 +230,26 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, { if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bn->min_key, POS_MIN) && + bpos_cmp(bn->min_key, POS_MIN) && write) - bn->min_key = bkey_predecessor(bn->min_key); + bn->min_key = bpos_nosnap_predecessor(bn->min_key); + + if (version < bcachefs_metadata_version_snapshot && + write) + bn->max_key.snapshot = 0; compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + if (version < bcachefs_metadata_version_snapshot && + !write) + bn->max_key.snapshot = U32_MAX; + if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bn->min_key, POS_MIN) && + bpos_cmp(bn->min_key, POS_MIN) && !write) - bn->min_key = bkey_successor(bn->min_key); + bn->min_key = bpos_nosnap_successor(bn->min_key); } #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 459d27ca..8190e73d 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -18,6 +18,36 @@ static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +{ + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_successor(p); + } else { + p = bpos_nosnap_successor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + +static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) +{ + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + + /* Are we iterating over keys in all snapshots? 
*/ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_predecessor(p); + } else { + p = bpos_nosnap_predecessor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { return l < BTREE_MAX_DEPTH && @@ -30,20 +60,20 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) if ((iter->flags & BTREE_ITER_IS_EXTENTS) && bkey_cmp(pos, POS_MAX)) - pos = bkey_successor(pos); + pos = bkey_successor(iter, pos); return pos; } static inline bool btree_iter_pos_before_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(iter->real_pos, b->data->min_key) < 0; + return bpos_cmp(iter->real_pos, b->data->min_key) < 0; } static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(b->key.k.p, iter->real_pos) < 0; + return bpos_cmp(b->key.k.p, iter->real_pos) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -285,7 +315,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && - bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, btree_iter_type(linked))) <= 0) { deadlock_iter = linked; reason = 7; @@ -583,10 +613,24 @@ err: static void bch2_btree_iter_verify(struct btree_iter *iter) { + enum btree_iter_type type = btree_iter_type(iter); unsigned i; EBUG_ON(iter->btree_id >= BTREE_ID_NR); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(type == BTREE_ITER_NODES && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(type != BTREE_ITER_NODES && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + bch2_btree_iter_verify_locks(iter); for (i = 0; i < BTREE_MAX_DEPTH; i++) @@ -597,6 +641,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { enum btree_iter_type type = btree_iter_type(iter); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + BUG_ON((type == BTREE_ITER_KEYS || type == BTREE_ITER_CACHED) && (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || @@ -1384,7 +1431,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) if (!b) return NULL; - BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); iter->pos = iter->real_pos = b->key.k.p; @@ -1421,12 +1468,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (!b) return NULL; - if (bkey_cmp(iter->pos, b->key.k.p) < 0) { + if (bpos_cmp(iter->pos, b->key.k.p) < 0) { /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ - btree_iter_set_search_pos(iter, bkey_successor(iter->pos)); + btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); /* Unlock to avoid screwing up our lock invariants: */ btree_node_unlock(iter, iter->level); @@ -1453,7 +1500,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { - int cmp = bkey_cmp(new_pos, iter->real_pos); + int cmp = bpos_cmp(new_pos, iter->real_pos); unsigned l = iter->level; if (!cmp) @@ -1497,10 +1544,10 @@ out: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - 
bool ret = bkey_cmp(pos, POS_MAX) != 0; + bool ret = bpos_cmp(pos, POS_MAX) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(pos); + pos = bkey_successor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; } @@ -1508,10 +1555,10 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = bkey_cmp(pos, POS_MIN) != 0; + bool ret = bpos_cmp(pos, POS_MIN) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_predecessor(pos); + pos = bkey_predecessor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; } @@ -1519,7 +1566,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { struct bpos next_pos = iter->l[0].b->key.k.p; - bool ret = bkey_cmp(next_pos, POS_MAX) != 0; + bool ret = bpos_cmp(next_pos, POS_MAX) != 0; /* * Typically, we don't want to modify iter->pos here, since that @@ -1527,7 +1574,7 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) * btree, in that case we want iter->pos to reflect that: */ if (ret) - btree_iter_set_search_pos(iter, bkey_successor(next_pos)); + btree_iter_set_search_pos(iter, bpos_successor(next_pos)); else bch2_btree_iter_set_pos(iter, POS_MAX); @@ -1537,10 +1584,10 @@ static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) { struct bpos next_pos = iter->l[0].b->data->min_key; - bool ret = bkey_cmp(next_pos, POS_MIN) != 0; + bool ret = bpos_cmp(next_pos, POS_MIN) != 0; if (ret) - btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); + btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); else bch2_btree_iter_set_pos(iter, POS_MIN); @@ -1586,13 +1633,13 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool wi k = btree_iter_level_peek(iter, &iter->l[0]); if (next_update && - bkey_cmp(next_update->k.p, iter->real_pos) <= 0) + bpos_cmp(next_update->k.p, iter->real_pos) <= 0) k = bkey_i_to_s_c(next_update); if (likely(k.k)) { if (bkey_deleted(k.k)) { btree_iter_set_search_pos(iter, - bkey_successor(k.k->p)); + bkey_successor(iter, k.k->p)); continue; } @@ -1731,7 +1778,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); + bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); } pos = iter->pos; @@ -1965,6 +2012,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, { struct btree_iter *iter, *best = NULL; + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + !btree_type_has_snapshots(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) + pos.snapshot = btree_type_has_snapshots(btree_id) + ? 
U32_MAX : 0; + /* We always want a fresh iterator for node iterators: */ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) goto alloc_iter; @@ -1999,11 +2054,14 @@ alloc_iter: if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NOT_EXTENTS)) + !(flags & BTREE_ITER_NOT_EXTENTS) && + !(flags & BTREE_ITER_ALL_SNAPSHOTS)) flags |= BTREE_ITER_IS_EXTENTS; iter->flags = flags; + iter->snapshot = pos.snapshot; + if (!(iter->flags & BTREE_ITER_INTENT)) bch2_btree_iter_downgrade(iter); else if (!iter->locks_want) @@ -2026,6 +2084,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, __bch2_trans_get_iter(trans, btree_id, pos, BTREE_ITER_NODES| BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| flags); unsigned i; @@ -2127,6 +2186,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->nr_updates2 = 0; trans->mem_top = 0; + trans->hooks = NULL; trans->extra_journal_entries = NULL; trans->extra_journal_entry_u64s = 0; @@ -2137,7 +2197,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) (void *) &trans->fs_usage_deltas->memset_start); } - bch2_trans_cond_resched(trans); + if (!(flags & TRANS_RESET_NOUNLOCK)) + bch2_trans_cond_resched(trans); if (!(flags & TRANS_RESET_NOTRAVERSE)) bch2_btree_iter_traverse_all(trans); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 8768f4cb..7585f989 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -172,6 +172,9 @@ bool bch2_btree_iter_rewind(struct btree_iter *); static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + bkey_init(&iter->k); iter->k.p = iter->pos = new_pos; } @@ -303,6 +306,7 @@ static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btr } #define TRANS_RESET_NOTRAVERSE (1 << 0) +#define TRANS_RESET_NOUNLOCK (1 << 1) void bch2_trans_reset(struct btree_trans *, unsigned); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 0b354563..04354f56 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -21,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct bkey_cached_key *key = arg->key; return cmp_int(ck->key.btree_id, key->btree_id) ?: - bkey_cmp(ck->key.pos, key->pos); + bpos_cmp(ck->key.pos, key->pos); } static const struct rhashtable_params bch2_btree_key_cache_params = { @@ -70,7 +70,7 @@ static void bkey_cached_evict(struct btree_key_cache *c, bch2_btree_key_cache_params)); memset(&ck->key, ~0, sizeof(ck->key)); - c->nr_keys--; + atomic_long_dec(&c->nr_keys); } static void bkey_cached_free(struct btree_key_cache *bc, @@ -99,12 +99,6 @@ bkey_cached_alloc(struct btree_key_cache *c) { struct bkey_cached *ck; - list_for_each_entry_reverse(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) { - c->nr_freed--; - return ck; - } - ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { INIT_LIST_HEAD(&ck->list); @@ -114,11 +108,39 @@ bkey_cached_alloc(struct btree_key_cache *c) return ck; } - list_for_each_entry(ck, &c->clean, list) + return NULL; +} + +static struct bkey_cached * +bkey_cached_reuse(struct btree_key_cache *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct bkey_cached *ck; + unsigned i; + + mutex_lock(&c->lock); + list_for_each_entry_reverse(ck, &c->freed, list) if 
(bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(c, ck); + c->nr_freed--; + list_del(&ck->list); + mutex_unlock(&c->lock); return ck; } + mutex_unlock(&c->lock); + + rcu_read_lock(); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); + rcu_read_unlock(); + return ck; + } + } + rcu_read_unlock(); return NULL; } @@ -129,10 +151,17 @@ btree_key_cache_create(struct btree_key_cache *c, struct bpos pos) { struct bkey_cached *ck; + bool was_new = true; ck = bkey_cached_alloc(c); - if (!ck) - return ERR_PTR(-ENOMEM); + + if (unlikely(!ck)) { + ck = bkey_cached_reuse(c); + if (unlikely(!ck)) + return ERR_PTR(-ENOMEM); + + was_new = false; + } ck->c.level = 0; ck->c.btree_id = btree_id; @@ -141,17 +170,26 @@ btree_key_cache_create(struct btree_key_cache *c, ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (rhashtable_lookup_insert_fast(&c->table, + if (unlikely(rhashtable_lookup_insert_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params)) { + bch2_btree_key_cache_params))) { /* We raced with another fill: */ - bkey_cached_free(c, ck); + + if (likely(was_new)) { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { + mutex_lock(&c->lock); + bkey_cached_free(c, ck); + mutex_unlock(&c->lock); + } + return NULL; } - c->nr_keys++; + atomic_long_inc(&c->nr_keys); - list_move(&ck->list, &c->clean); six_unlock_write(&ck->c.lock); return ck; @@ -213,7 +251,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) const struct btree_iter *iter = p; return ck->key.btree_id == iter->btree_id && - !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; + !bpos_cmp(ck->key.pos, iter->pos) ? 
0 : -1; } __flatten @@ -238,11 +276,8 @@ retry: return 0; } - mutex_lock(&c->btree_key_cache.lock); ck = btree_key_cache_create(&c->btree_key_cache, iter->btree_id, iter->pos); - mutex_unlock(&c->btree_key_cache.lock); - ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; @@ -257,7 +292,7 @@ retry: if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, bkey_cached_check_fn, iter, _THIS_IP_)) { if (ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)) { + bpos_cmp(ck->key.pos, iter->pos)) { goto retry; } @@ -267,7 +302,7 @@ retry: } if (ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)) { + bpos_cmp(ck->key.pos, iter->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } @@ -370,15 +405,13 @@ err: bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); + BUG_ON(!btree_node_locked(c_iter, 0)); + if (!evict) { - mutex_lock(&c->btree_key_cache.lock); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty--; + atomic_long_dec(&c->btree_key_cache.nr_dirty); } - - list_move_tail(&ck->list, &c->btree_key_cache.clean); - mutex_unlock(&c->btree_key_cache.lock); } else { evict: BUG_ON(!btree_node_intent_locked(c_iter, 0)); @@ -388,13 +421,14 @@ evict: six_lock_write(&ck->c.lock, NULL, NULL); - mutex_lock(&c->btree_key_cache.lock); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty--; + atomic_long_dec(&c->btree_key_cache.nr_dirty); } bkey_cached_evict(&c->btree_key_cache, ck); + + mutex_lock(&c->btree_key_cache.lock); bkey_cached_free(&c->btree_key_cache, ck); mutex_unlock(&c->btree_key_cache.lock); } @@ -475,16 +509,11 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - mutex_lock(&c->btree_key_cache.lock); - list_move(&ck->list, &c->btree_key_cache.dirty); - set_bit(BKEY_CACHED_DIRTY, &ck->flags); - c->btree_key_cache.nr_dirty++; + atomic_long_inc(&c->btree_key_cache.nr_dirty); if (bch2_nr_btree_keys_need_flush(c)) kick_reclaim = true; - - mutex_unlock(&c->btree_key_cache.lock); } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, @@ -509,9 +538,11 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct bch_fs *c = container_of(shrink, struct bch_fs, btree_key_cache.shrink); struct btree_key_cache *bc = &c->btree_key_cache; + struct bucket_table *tbl; struct bkey_cached *ck, *t; size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned flags; + unsigned start, flags; + int srcu_idx; /* Return -1 if we can't do anything right now */ if (sc->gfp_mask & __GFP_FS) @@ -519,6 +550,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, else if (!mutex_trylock(&bc->lock)) return -1; + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); /* @@ -540,23 +572,40 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, if (scanned >= nr) goto out; - list_for_each_entry_safe(ck, t, &bc->clean, list) { - if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - else if (bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); - } + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + start = bc->shrink_iter; - scanned++; - if (scanned >= nr) { - if (&t->list != 
&bc->clean) - list_move_tail(&bc->clean, &t->list); - goto out; + do { + struct rhash_head *pos, *next; + + rht_for_each_entry_safe(ck, pos, next, tbl, bc->shrink_iter, hash) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) + continue; + + if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + + scanned++; + if (scanned >= nr) + break; } - } + + bc->shrink_iter++; + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + } while (scanned < nr && bc->shrink_iter != start); + + rcu_read_unlock(); out: memalloc_nofs_restore(flags); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); mutex_unlock(&bc->lock); return freed; @@ -569,41 +618,45 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, btree_key_cache.shrink); struct btree_key_cache *bc = &c->btree_key_cache; - return bc->nr_keys; + return atomic_long_read(&bc->nr_keys); } void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct bucket_table *tbl; struct bkey_cached *ck, *n; + struct rhash_head *pos; + unsigned i; if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); mutex_lock(&bc->lock); - list_splice(&bc->dirty, &bc->clean); - list_for_each_entry_safe(ck, n, &bc->clean, list) { + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed); + } + rcu_read_unlock(); + + list_for_each_entry_safe(ck, n, &bc->freed, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); bch2_journal_preres_put(&c->journal, &ck->res); - kfree(ck->k); list_del(&ck->list); + kfree(ck->k); kmem_cache_free(bch2_key_cache, ck); - bc->nr_keys--; } - BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); - BUG_ON(bc->nr_keys); - - list_for_each_entry_safe(ck, n, &bc->freed, list) { - cond_resched(); + BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal)); + BUG_ON(atomic_long_read(&bc->nr_keys)); - list_del(&ck->list); - kmem_cache_free(bch2_key_cache, ck); - } mutex_unlock(&bc->lock); if (bc->table_init_done) @@ -614,8 +667,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { mutex_init(&c->lock); INIT_LIST_HEAD(&c->freed); - INIT_LIST_HEAD(&c->clean); - INIT_LIST_HEAD(&c->dirty); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) @@ -641,8 +692,8 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); - pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys); - pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty); + pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys)); + pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty)); } void bch2_btree_key_cache_exit(void) diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index 2f8b5521..02715cd2 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -3,8 +3,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { - size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = 
atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 1024 + nr_keys / 2; return max_t(ssize_t, 0, nr_dirty - max_dirty); @@ -12,8 +12,8 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) { - size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); - size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; return nr_dirty > max_dirty && diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 5999044a..1941616f 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -216,6 +216,7 @@ enum btree_iter_type { #define BTREE_ITER_CACHED_NOFILL (1 << 9) #define BTREE_ITER_CACHED_NOCREATE (1 << 10) #define BTREE_ITER_NOT_EXTENTS (1 << 11) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -245,6 +246,8 @@ struct btree_iter { /* what we're searching for/what the iterator actually points to: */ struct bpos real_pos; struct bpos pos_after_commit; + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; u16 flags; u8 idx; @@ -292,13 +295,12 @@ struct btree_key_cache { struct rhashtable table; bool table_init_done; struct list_head freed; - struct list_head clean; - struct list_head dirty; struct shrinker shrink; + unsigned shrink_iter; size_t nr_freed; - size_t nr_keys; - size_t nr_dirty; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; }; struct bkey_cached_key { @@ -330,7 +332,7 @@ struct bkey_cached { struct btree_insert_entry { unsigned trigger_flags; u8 bkey_type; - u8 btree_id; + enum btree_id btree_id:8; u8 level; unsigned trans_triggers_run:1; unsigned is_extent:1; @@ -344,6 +346,14 @@ struct btree_insert_entry { #define BTREE_ITER_MAX 32 #endif +struct btree_trans_commit_hook; +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); + +struct btree_trans_commit_hook { + btree_trans_commit_hook_fn *fn; + struct btree_trans_commit_hook *next; +}; + struct btree_trans { struct bch_fs *c; #ifdef CONFIG_BCACHEFS_DEBUG @@ -378,6 +388,7 @@ struct btree_trans { struct btree_insert_entry *updates2; /* update path: */ + struct btree_trans_commit_hook *hooks; struct jset_entry *extra_journal_entries; unsigned extra_journal_entry_u64s; struct journal_entry_pin *journal_pin; @@ -600,6 +611,17 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ + (1U << BTREE_ID_dirents)| \ + (1U << BTREE_ID_xattrs)) + +static inline bool btree_type_has_snapshots(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; +} + enum btree_trigger_flags { __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index a2513808..4ce12ae2 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -77,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, int bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_trigger_flags); +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); 
/** diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index a661bc0c..19dfc32e 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -50,7 +50,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = bkey_s_c_to_btree_ptr_v2(k); - if (bkey_cmp(next_node, bp.v->min_key)) { + if (bpos_cmp(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); panic("expected next min_key %s got %s\n", (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), @@ -60,7 +60,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - if (bkey_cmp(k.k->p, b->key.k.p)) { + if (bpos_cmp(k.k->p, b->key.k.p)) { bch2_dump_btree_node(c, b); panic("expected end %s got %s\n", (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), @@ -69,7 +69,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; } - next_node = bkey_successor(k.k->p); + next_node = bpos_successor(k.k->p); } #endif } @@ -82,8 +82,6 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) struct bset_tree *t; struct bkey uk; - bch2_bkey_format_add_pos(s, b->data->min_key); - for_each_bset(b, t) bset_tree_for_each_key(b, t, k) if (!bkey_deleted(k)) { @@ -97,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b) struct bkey_format_state s; bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, b->data->min_key); + bch2_bkey_format_add_pos(&s, b->data->max_key); __bch2_btree_calc_format(&s, b); return bch2_bkey_format_done(&s); @@ -289,7 +289,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); @@ -1095,10 +1094,12 @@ static struct btree *__btree_split_node(struct btree_update *as, struct btree *n1, struct btree_iter *iter) { + struct bkey_format_state s; size_t nr_packed = 0, nr_unpacked = 0; struct btree *n2; struct bset *set1, *set2; - struct bkey_packed *k, *prev = NULL; + struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; + struct bpos n1_pos; n2 = bch2_btree_node_alloc(as, n1->c.level); bch2_btree_update_add_new_node(as, n2); @@ -1108,8 +1109,6 @@ static struct btree *__btree_split_node(struct btree_update *as, SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); n2->key.k.p = n1->key.k.p; - btree_node_set_format(n2, n2->data->format); - set1 = btree_bset_first(n1); set2 = btree_bset_first(n2); @@ -1119,7 +1118,7 @@ static struct btree *__btree_split_node(struct btree_update *as, */ k = set1->start; while (1) { - struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); + struct bkey_packed *n = bkey_next(k); if (n == vstruct_last(set1)) break; @@ -1136,33 +1135,53 @@ static struct btree *__btree_split_node(struct btree_update *as, } BUG_ON(!prev); + set2_start = k; + set2_end = vstruct_last(set1); - btree_set_max(n1, bkey_unpack_pos(n1, prev)); - btree_set_min(n2, bkey_successor(n1->key.k.p)); - - set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); - set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); - + set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data); set_btree_bset_end(n1, n1->set); - 
-	set_btree_bset_end(n2, n2->set);
-
-	n2->nr.live_u64s	= le16_to_cpu(set2->u64s);
-	n2->nr.bset_u64s[0]	= le16_to_cpu(set2->u64s);
-	n2->nr.packed_keys	= n1->nr.packed_keys - nr_packed;
-	n2->nr.unpacked_keys	= n1->nr.unpacked_keys - nr_unpacked;
 
 	n1->nr.live_u64s	= le16_to_cpu(set1->u64s);
 	n1->nr.bset_u64s[0]	= le16_to_cpu(set1->u64s);
 	n1->nr.packed_keys	= nr_packed;
 	n1->nr.unpacked_keys	= nr_unpacked;
 
+	n1_pos = bkey_unpack_pos(n1, prev);
+	if (as->c->sb.version < bcachefs_metadata_version_snapshot)
+		n1_pos.snapshot = U32_MAX;
+
+	btree_set_max(n1, n1_pos);
+	btree_set_min(n2, bpos_successor(n1->key.k.p));
+
+	bch2_bkey_format_init(&s);
+	bch2_bkey_format_add_pos(&s, n2->data->min_key);
+	bch2_bkey_format_add_pos(&s, n2->data->max_key);
+
+	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+		struct bkey uk = bkey_unpack_key(n1, k);
+		bch2_bkey_format_add_key(&s, &uk);
+	}
+
+	n2->data->format = bch2_bkey_format_done(&s);
+	btree_node_set_format(n2, n2->data->format);
+
+	out = set2->start;
+	memset(&n2->nr, 0, sizeof(n2->nr));
+
+	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+		BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
+					    ? &n1->format : &bch2_bkey_format_current, k));
+		out->format = KEY_FORMAT_LOCAL_BTREE;
+		btree_keys_account_key_add(&n2->nr, 0, out);
+		out = bkey_next(out);
+	}
+
+	set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
+	set_btree_bset_end(n2, n2->set);
+
 	BUG_ON(!set1->u64s);
 	BUG_ON(!set2->u64s);
 
-	memcpy_u64s(set2->start,
-		    vstruct_end(set1),
-		    le16_to_cpu(set2->u64s));
-
 	btree_node_reset_sib_u64s(n1);
 	btree_node_reset_sib_u64s(n2);
 
@@ -1216,7 +1235,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 	i = btree_bset_first(b);
 	src = dst = i->start;
 	while (src != vstruct_last(i)) {
-		n = bkey_next_skip_noops(src, vstruct_last(i));
+		n = bkey_next(src);
 		if (!bkey_deleted(src)) {
 			memmove_u64s_down(dst, src, src->u64s);
 			dst = bkey_next(dst);
@@ -1563,8 +1582,10 @@ retry:
 	}
 
 	bch2_bkey_format_init(&new_s);
-	__bch2_btree_calc_format(&new_s, b);
-	__bch2_btree_calc_format(&new_s, m);
+	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+	__bch2_btree_calc_format(&new_s, prev);
+	__bch2_btree_calc_format(&new_s, next);
+	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
 	new_f = bch2_bkey_format_done(&new_s);
 
 	sib_u64s = btree_node_u64s_with_format(b, &new_f) +
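[Annotation] A node split now computes a dedicated key format for n2 from its actual bounds and keys, instead of inheriting n1's format, then repacks the second half with bch2_bkey_transform(). The underlying pattern, sketched with a hypothetical helper (the bch2_bkey_format_* calls are the real functions used above; the array-based iteration is schematic):

	static struct bkey_format calc_format_sketch(struct btree *b,
						     struct bkey *keys, unsigned nr)
	{
		struct bkey_format_state s;
		unsigned i;

		bch2_bkey_format_init(&s);
		/* the format must be able to pack the node's bounds... */
		bch2_bkey_format_add_pos(&s, b->data->min_key);
		bch2_bkey_format_add_pos(&s, b->data->max_key);
		/* ...and every key that will live in the node: */
		for (i = 0; i < nr; i++)
			bch2_bkey_format_add_key(&s, &keys[i]);
		return bch2_bkey_format_done(&s);
	}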
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index d9308bd4..67a2c65b 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -26,7 +26,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
 {
 	return   cmp_int(l->btree_id,	r->btree_id) ?:
 		-cmp_int(l->level,	r->level) ?:
-		 bkey_cmp(l->k->k.p,	r->k->k.p);
+		 bpos_cmp(l->k->k.p,	r->k->k.p);
 }
 
 static inline bool same_leaf_as_prev(struct btree_trans *trans,
@@ -70,8 +70,8 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
 	EBUG_ON(btree_node_just_written(b));
 	EBUG_ON(bset_written(b, btree_bset_last(b)));
 	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
-	EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
-	EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
+	EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
+	EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
 	EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->trans->c, b));
 	EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
 
@@ -223,9 +223,17 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 
-	BUG_ON(bch2_debug_check_bkeys &&
-	       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type));
-	BUG_ON(bkey_cmp(i->k->k.p, i->iter->real_pos));
+	if (bch2_debug_check_bkeys) {
+		const char *invalid = bch2_bkey_invalid(c,
+				bkey_i_to_s_c(i->k), i->bkey_type);
+		if (invalid) {
+			char buf[200];
+
+			bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+			panic("invalid bkey %s on insert: %s\n", buf, invalid);
+		}
+	}
+	BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
 	BUG_ON(i->level		!= i->iter->level);
 	BUG_ON(i->btree_id	!= i->iter->btree_id);
 }
@@ -369,6 +377,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct bch_fs_usage *fs_usage = NULL;
 	struct btree_insert_entry *i;
+	struct btree_trans_commit_hook *h;
 	unsigned u64s = 0;
 	bool marking = false;
 	int ret;
@@ -386,6 +395,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 
 	prefetch(&trans->c->journal.flags);
 
+	h = trans->hooks;
+	while (h) {
+		ret = h->fn(trans, h);
+		if (ret)
+			return ret;
+		h = h->next;
+	}
+
 	trans_for_each_update2(trans, i) {
 		/* Multiple inserts might go to same leaf: */
 		if (!same_leaf_as_prev(trans, i))
@@ -556,6 +573,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 	if (trans->flags & BTREE_INSERT_NOUNLOCK)
 		trans->nounlock = true;
 
+	if (!(trans->flags & BTREE_INSERT_NOUNLOCK))
 	trans_for_each_update2(trans, i)
 		if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
 		    !same_leaf_as_prev(trans, i))
@@ -826,7 +844,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
 	struct btree_insert_entry *i = NULL;
 	struct btree_iter *iter;
 	bool trans_trigger_run;
-	unsigned u64s;
+	unsigned u64s, reset_flags = 0;
 	int ret = 0;
 
 	if (!trans->nr_updates)
@@ -940,7 +958,11 @@ out:
 	if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
 		percpu_ref_put(&trans->c->writes);
 out_reset:
-	bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
+	if (!ret)
+		reset_flags |= TRANS_RESET_NOTRAVERSE;
+	if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK))
+		reset_flags |= TRANS_RESET_NOUNLOCK;
+	bch2_trans_reset(trans, reset_flags);
 
 	return ret;
 err:
@@ -1053,6 +1075,13 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
 	return 0;
 }
 
+void bch2_trans_commit_hook(struct btree_trans *trans,
+			    struct btree_trans_commit_hook *h)
+{
+	h->next = trans->hooks;
+	trans->hooks = h;
+}
+
 int __bch2_btree_insert(struct btree_trans *trans,
			enum btree_id id, struct bkey_i *k)
 {
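[Annotation] Many comparisons in this file and below switch from bkey_cmp() to bpos_cmp(). The distinction this series sets up, as I read it: bpos_cmp() compares the full position including the new snapshot field, while bkey_cmp() keeps the old inode:offset ordering. A sketch of the assumed semantics (the real definitions live in the bcachefs headers; cmp_int() is the existing three-way-compare macro):

	/* Sketch only, not the in-tree definitions: */
	static inline int sketch_bpos_cmp(struct bpos l, struct bpos r)
	{
		return  cmp_int(l.inode,    r.inode) ?:
			cmp_int(l.offset,   r.offset) ?:
			cmp_int(l.snapshot, r.snapshot);	/* total order over keys */
	}

	static inline int sketch_bkey_cmp(struct bpos l, struct bpos r)
	{
		return  cmp_int(l.inode,  r.inode) ?:
			cmp_int(l.offset, r.offset);		/* snapshot ignored */
	}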
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index c6d49f44..acf60038 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 
 	bch2_trans_init(&trans, i->c, 0, 0);
 
-	iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+	iter = bch2_trans_get_iter(&trans, i->id, i->from,
+				   BTREE_ITER_PREFETCH|
+				   BTREE_ITER_ALL_SNAPSHOTS);
 	k = bch2_btree_iter_peek(iter);
 
 	while (k.k && !(err = bkey_err(k))) {
@@ -273,7 +275,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
 	if (err)
 		return err;
 
-	if (!i->size || !bkey_cmp(POS_MAX, i->from))
+	if (!i->size || !bpos_cmp(POS_MAX, i->from))
 		return i->ret;
 
 	bch2_trans_init(&trans, i->c, 0, 0);
@@ -289,8 +291,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
 		 * can't easily correctly restart a btree node traversal across
 		 * all nodes, meh
 		 */
-		i->from = bkey_cmp(POS_MAX, b->key.k.p)
-			? bkey_successor(b->key.k.p)
+		i->from = bpos_cmp(POS_MAX, b->key.k.p)
+			? bpos_successor(b->key.k.p)
 			: b->key.k.p;
 
 		if (!i->size)
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 592dd80c..cf4ce2e7 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 int bch2_dirent_create(struct btree_trans *trans,
 		       u64 dir_inum, const struct bch_hash_info *hash_info,
 		       u8 type, const struct qstr *name, u64 dst_inum,
-		       int flags)
+		       u64 *dir_offset, int flags)
 {
 	struct bkey_i_dirent *dirent;
 	int ret;
@@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-			     dir_inum, &dirent->k_i, flags);
+	ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+			    dir_inum, &dirent->k_i, flags);
+	*dir_offset = dirent->k.p.offset;
+
+	return ret;
 }
 
 static void dirent_copy_target(struct bkey_i_dirent *dst,
@@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
 int bch2_dirent_rename(struct btree_trans *trans,
 		       u64 src_dir, struct bch_hash_info *src_hash,
 		       u64 dst_dir, struct bch_hash_info *dst_hash,
-		       const struct qstr *src_name, u64 *src_inum,
-		       const struct qstr *dst_name, u64 *dst_inum,
+		       const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
+		       const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
 		       enum bch_rename_mode mode)
 {
 	struct btree_iter *src_iter = NULL, *dst_iter = NULL;
@@ -255,7 +258,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
 			new_dst->k.p = src_iter->pos;
 			bch2_trans_update(trans, src_iter,
 					  &new_dst->k_i, 0);
-			goto out;
+			goto out_set_offset;
 		} else {
 			/* If we're overwriting, we can't insert new_dst
 			 * at a different slot because it has to
@@ -278,6 +281,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
 	bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+out_set_offset:
+	*src_offset = new_src->k.p.offset;
+	*dst_offset = new_dst->k.p.offset;
 out:
 	bch2_trans_iter_put(trans, src_iter);
 	bch2_trans_iter_put(trans, dst_iter);
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 34769371..e1d8ce37 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
 
 int bch2_dirent_create(struct btree_trans *, u64,
 		       const struct bch_hash_info *, u8,
-		       const struct qstr *, u64, int);
+		       const struct qstr *, u64, u64 *, int);
 
 int bch2_dirent_delete_at(struct btree_trans *,
 			  const struct bch_hash_info *,
@@ -46,8 +46,8 @@ enum bch_rename_mode {
 int bch2_dirent_rename(struct btree_trans *,
 		       u64, struct bch_hash_info *,
 		       u64, struct bch_hash_info *,
-		       const struct qstr *, u64 *,
-		       const struct qstr *, u64 *,
+		       const struct qstr *, u64 *, u64 *,
+		       const struct qstr *, u64 *, u64 *,
 		       enum bch_rename_mode);
 
 struct btree_iter *
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 1dba7e99..f712f685 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -873,6 +873,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
 		if (ret)
 			break;
 	}
+	bch2_trans_iter_put(&trans, iter);
 
 	bch2_trans_exit(&trans);
 	bch2_bkey_buf_exit(&sk, c);
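[Annotation] bch2_dirent_create() now reports, via the new dir_offset out-parameter, the offset the dirent actually hashed to, which on collisions can differ from the initial slot. A usage sketch; DT_REG stands in for the usual mode_to_type() result, and the surrounding variables are assumed to exist in the caller:

	u64 dir_offset = 0;
	int ret;

	ret = bch2_dirent_create(trans, dir_inum, &dir_hash, DT_REG,
				 name, inum, &dir_offset,
				 BCH_HASH_SET_MUST_CREATE);
	if (!ret) {
		/* dir_offset is now the dirent's key offset in the dirents
		 * btree, i.e. what the child inode will record as
		 * bi_dir_offset (see fs-common.c below). */
	}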
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index a7e04082..b07d3955 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -180,7 +180,8 @@ const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
 	if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
 		return "value too big";
 
-	if (bp.v->min_key.snapshot)
+	if (c->sb.version < bcachefs_metadata_version_snapshot &&
+	    bp.v->min_key.snapshot)
 		return "invalid min_key.snapshot";
 
 	return bch2_bkey_ptrs_invalid(c, k);
@@ -212,8 +213,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
 	    btree_node_type_is_extents(btree_id) &&
 	    bkey_cmp(bp.v->min_key, POS_MIN))
 		bp.v->min_key = write
-			? bkey_predecessor(bp.v->min_key)
-			: bkey_successor(bp.v->min_key);
+			? bpos_nosnap_predecessor(bp.v->min_key)
+			: bpos_nosnap_successor(bp.v->min_key);
 }
 
 /* KEY_TYPE_extent: */
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index c8069dfb..ccee43a2 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -582,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s);
 
 /* Generic extent code: */
 
+enum bch_extent_overlap {
+	BCH_EXTENT_OVERLAP_ALL		= 0,
+	BCH_EXTENT_OVERLAP_BACK		= 1,
+	BCH_EXTENT_OVERLAP_FRONT	= 2,
+	BCH_EXTENT_OVERLAP_MIDDLE	= 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+							   const struct bkey *m)
+{
+	int cmp1 = bkey_cmp(k->p, m->p) < 0;
+	int cmp2 = bkey_cmp(bkey_start_pos(k),
+			    bkey_start_pos(m)) > 0;
+
+	return (cmp1 << 1) + cmp2;
+}
+
 int bch2_cut_front_s(struct bpos, struct bkey_s);
 int bch2_cut_back_s(struct bpos, struct bkey_s);
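[Annotation] bch2_extent_overlap() packs two boolean comparisons into the enum value: bit 1 is "k ends before m ends", bit 0 is "k starts after m starts", so the four enum values fall directly out of (cmp1 << 1) + cmp2. A worked table, treating extents as half-open [start, end) ranges with the end stored in k->p:

	/* k = [0,10), m = [5,15): cmp1 = 1 (k ends first), cmp2 = 0 (k starts first)
	 *   -> (1 << 1) + 0 = 2 = BCH_EXTENT_OVERLAP_FRONT:  k covers the front of m
	 * k = [5,15), m = [0,10): cmp1 = 0, cmp2 = 1
	 *   -> 1 = BCH_EXTENT_OVERLAP_BACK:   k covers the back of m
	 * k = [0,20), m = [5,15): cmp1 = 0, cmp2 = 0
	 *   -> 0 = BCH_EXTENT_OVERLAP_ALL:    k spans all of m
	 * k = [6,9),  m = [5,15): cmp1 = 1, cmp2 = 1
	 *   -> 3 = BCH_EXTENT_OVERLAP_MIDDLE: k sits in the middle of m
	 */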
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 503ce192..83c2168c 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter *dir_iter = NULL;
+	struct btree_iter *inode_iter = NULL;
 	struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
-	u64 now = bch2_current_time(trans->c);
+	u64 now = bch2_current_time(c);
+	u64 dir_offset = 0;
 	int ret;
 
 	dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
@@ -34,7 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
 	if (!name)
 		new_inode->bi_flags |= BCH_INODE_UNLINKED;
 
-	ret = bch2_inode_create(trans, new_inode);
+	inode_iter = bch2_inode_create(trans, new_inode);
+	ret = PTR_ERR_OR_ZERO(inode_iter);
 	if (ret)
 		goto err;
 
@@ -66,11 +69,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
 		ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
 					 mode_to_type(new_inode->bi_mode),
 					 name, new_inode->bi_inum,
+					 &dir_offset,
 					 BCH_HASH_SET_MUST_CREATE);
 		if (ret)
 			goto err;
 	}
+
+	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+		new_inode->bi_dir		= dir_u->bi_inum;
+		new_inode->bi_dir_offset	= dir_offset;
+	}
+
+	ret = bch2_inode_write(trans, inode_iter, new_inode);
 err:
+	bch2_trans_iter_put(trans, inode_iter);
 	bch2_trans_iter_put(trans, dir_iter);
 	return ret;
 }
@@ -79,9 +91,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
 		    u64 inum, struct bch_inode_unpacked *dir_u,
 		    struct bch_inode_unpacked *inode_u, const struct qstr *name)
 {
+	struct bch_fs *c = trans->c;
 	struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
 	struct bch_hash_info dir_hash;
-	u64 now = bch2_current_time(trans->c);
+	u64 now = bch2_current_time(c);
+	u64 dir_offset = 0;
 	int ret;
 
 	inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
@@ -92,6 +106,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
 	inode_u->bi_ctime = now;
 	bch2_inode_nlink_inc(inode_u);
 
+	inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+
 	dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
 	ret = PTR_ERR_OR_ZERO(dir_iter);
 	if (ret)
@@ -99,12 +115,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
 
 	dir_u->bi_mtime = dir_u->bi_ctime = now;
 
-	dir_hash = bch2_hash_info_init(trans->c, dir_u);
+	dir_hash = bch2_hash_info_init(c, dir_u);
 
-	ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
-				 mode_to_type(inode_u->bi_mode),
-				 name, inum, BCH_HASH_SET_MUST_CREATE) ?:
-	      bch2_inode_write(trans, dir_iter, dir_u) ?:
+	ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+				 mode_to_type(inode_u->bi_mode),
+				 name, inum, &dir_offset,
+				 BCH_HASH_SET_MUST_CREATE);
+	if (ret)
+		goto err;
+
+	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+		inode_u->bi_dir		= dir_inum;
+		inode_u->bi_dir_offset	= dir_offset;
+	}
+
+	ret =   bch2_inode_write(trans, dir_iter, dir_u) ?:
 		bch2_inode_write(trans, inode_iter, inode_u);
 err:
 	bch2_trans_iter_put(trans, dir_iter);
@@ -117,10 +142,11 @@ int bch2_unlink_trans(struct btree_trans *trans,
 		      struct bch_inode_unpacked *inode_u,
 		      const struct qstr *name)
 {
+	struct bch_fs *c = trans->c;
 	struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
 			  *inode_iter = NULL;
 	struct bch_hash_info dir_hash;
-	u64 inum, now = bch2_current_time(trans->c);
+	u64 inum, now = bch2_current_time(c);
 	struct bkey_s_c k;
 	int ret;
 
@@ -129,7 +155,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	dir_hash = bch2_hash_info_init(trans->c, dir_u);
+	dir_hash = bch2_hash_info_init(c, dir_u);
 
 	dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
 						 name, BTREE_ITER_INTENT);
@@ -195,10 +221,12 @@ int bch2_rename_trans(struct btree_trans *trans,
 		      const struct qstr *dst_name,
 		      enum bch_rename_mode mode)
 {
+	struct bch_fs *c = trans->c;
 	struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
 	struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
 	struct bch_hash_info src_hash, dst_hash;
-	u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
+	u64 src_inode, src_offset, dst_inode, dst_offset;
+	u64 now = bch2_current_time(c);
 	int ret;
 
 	src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
@@ -207,7 +235,7 @@ int bch2_rename_trans(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	src_hash = bch2_hash_info_init(trans->c, src_dir_u);
+	src_hash = bch2_hash_info_init(c, src_dir_u);
 
 	if (dst_dir != src_dir) {
 		dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
@@ -216,7 +244,7 @@ int bch2_rename_trans(struct btree_trans *trans,
 		if (ret)
 			goto err;
 
-		dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
+		dst_hash = bch2_hash_info_init(c, dst_dir_u);
 	} else {
 		dst_dir_u = src_dir_u;
 		dst_hash = src_hash;
@@ -225,8 +253,8 @@ int bch2_rename_trans(struct btree_trans *trans,
 	ret = bch2_dirent_rename(trans,
 				 src_dir, &src_hash,
 				 dst_dir, &dst_hash,
-				 src_name, &src_inode,
-				 dst_name, &dst_inode,
+				 src_name, &src_inode, &src_offset,
+				 dst_name, &dst_inode, &dst_offset,
 				 mode);
 	if (ret)
 		goto err;
@@ -245,6 +273,16 @@ int bch2_rename_trans(struct btree_trans *trans,
 			goto err;
 	}
 
+	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+		src_inode_u->bi_dir		= dst_dir_u->bi_inum;
+		src_inode_u->bi_dir_offset	= dst_offset;
+
+		if (mode == BCH_RENAME_EXCHANGE) {
+			dst_inode_u->bi_dir		= src_dir_u->bi_inum;
+			dst_inode_u->bi_dir_offset	= src_offset;
+		}
+	}
+
 	if (mode == BCH_RENAME_OVERWRITE) {
 		if (S_ISDIR(src_inode_u->bi_mode) !=
 		    S_ISDIR(dst_inode_u->bi_mode)) {
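[Annotation] Note the asymmetry: create and rename record a trustworthy backpointer, while link() first sets BCH_INODE_BACKPTR_UNTRUSTED, since a hardlinked inode has several dirents but only one bi_dir/bi_dir_offset pair. A sketch of the invariant a consumer is apparently expected to rely on, mirroring the fsck condition below (the bi_nlink semantics here are my assumption, not stated by the patch):

	/* Sketch: when is an inode's backpointer usable? */
	static bool backptr_usable(const struct bch_inode_unpacked *u)
	{
		return !u->bi_nlink &&	/* assumed: counts extra hardlinks */
		       !(u->bi_flags & BCH_INODE_BACKPTR_UNTRUSTED);
	}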
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 9dc162f2..62788ae1 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -675,6 +675,39 @@ retry:
 			continue;
 		}
 
+		if (!target.bi_nlink &&
+		    !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+		    (target.bi_dir != k.k->p.inode ||
+		     target.bi_dir_offset != k.k->p.offset) &&
+		    (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+				 "inode %llu has wrong backpointer:\n"
+				 "got       %llu:%llu\n"
+				 "should be %llu:%llu",
+				 d_inum,
+				 target.bi_dir,
+				 target.bi_dir_offset,
+				 k.k->p.inode,
+				 k.k->p.offset) ||
+		     c->opts.version_upgrade)) {
+			struct bkey_inode_buf p;
+
+			target.bi_dir		= k.k->p.inode;
+			target.bi_dir_offset	= k.k->p.offset;
+			bch2_trans_unlock(&trans);
+
+			bch2_inode_pack(c, &p, &target);
+
+			ret = bch2_btree_insert(c, BTREE_ID_inodes,
+						&p.inode.k_i, NULL, NULL,
+						BTREE_INSERT_NOFAIL|
+						BTREE_INSERT_LAZY_RW);
+			if (ret) {
+				bch_err(c, "error in fsck: error %i updating inode", ret);
+				goto err;
+			}
+			continue;
+		}
+
 		if (fsck_err_on(have_target &&
 				d.v->d_type !=
 				mode_to_type(target.bi_mode), c,
@@ -1314,6 +1347,16 @@ static int check_inode(struct btree_trans *trans,
 		do_update = true;
 	}
 
+	if (!S_ISDIR(u.bi_mode) &&
+	    u.bi_nlink &&
+	    !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+	    (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+			 "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") ||
+	     c->opts.version_upgrade)) {
+		u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+		do_update = true;
+	}
+
 	if (do_update) {
 		struct bkey_inode_buf p;
 
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 4559e77f..f1665ca8 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -332,6 +332,7 @@ int bch2_inode_write(struct btree_trans *trans,
 		return PTR_ERR(inode_p);
 
 	bch2_inode_pack(trans->c, inode_p, inode);
+	inode_p->inode.k.p.snapshot = iter->snapshot;
 	bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
 	return 0;
 }
@@ -469,11 +470,10 @@ static inline u32 bkey_generation(struct bkey_s_c k)
 	}
 }
 
-int bch2_inode_create(struct btree_trans *trans,
-		      struct bch_inode_unpacked *inode_u)
+struct btree_iter *bch2_inode_create(struct btree_trans *trans,
+				     struct bch_inode_unpacked *inode_u)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_inode_buf *inode_p;
 	struct btree_iter *iter = NULL;
 	struct bkey_s_c k;
 	u64 min, max, start, *hint;
@@ -493,10 +493,6 @@ int bch2_inode_create(struct btree_trans *trans,
 
 	if (start >= max || start < min)
 		start = min;
-
-	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
-	if (IS_ERR(inode_p))
-		return PTR_ERR(inode_p);
again:
 	for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start),
 			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
@@ -520,7 +516,7 @@ again:
 	bch2_trans_iter_put(trans, iter);
 
 	if (ret)
-		return ret;
+		return ERR_PTR(ret);
 
 	if (start != min) {
 		/* Retry from start */
@@ -528,15 +524,12 @@ again:
 		goto again;
 	}
 
-	return -ENOSPC;
+	return ERR_PTR(-ENOSPC);
found_slot:
 	*hint			= k.k->p.offset;
 	inode_u->bi_inum	= k.k->p.offset;
 	inode_u->bi_generation	= bkey_generation(k);
-
-	ret = bch2_inode_write(trans, iter, inode_u);
-	bch2_trans_iter_put(trans, iter);
-	return ret;
+	return iter;
 }
 
 int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 1caf036a..6bad6dfb 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -69,7 +69,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
 		     uid_t, gid_t, umode_t, dev_t,
 		     struct bch_inode_unpacked *);
 
-int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
+struct btree_iter *bch2_inode_create(struct btree_trans *,
+				     struct bch_inode_unpacked *);
 
 int bch2_inode_rm(struct bch_fs *, u64, bool);
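[Annotation] bch2_inode_create() no longer writes the inode itself; it hands back the iterator pointing at the allocated slot so the caller can fill in more fields (here, the backpointer) before a single bch2_inode_write(). The new calling convention, sketched after bch2_create_trans() above:

	struct btree_iter *inode_iter;
	int ret;

	inode_iter = bch2_inode_create(trans, inode_u);
	ret = PTR_ERR_OR_ZERO(inode_iter);
	if (ret)
		return ret;

	/* caller may now set more fields, e.g. bi_dir/bi_dir_offset... */

	ret = bch2_inode_write(trans, inode_iter, inode_u);
	bch2_trans_iter_put(trans, inode_iter);
	return ret;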
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 284d398b..36b10cb7 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -322,6 +322,9 @@ int bch2_extent_update(struct btree_trans *trans,
 
 	if (i_sectors_delta || new_i_size) {
 		bch2_inode_pack(trans->c, &inode_p, &inode_u);
+
+		inode_p.inode.k.p.snapshot = iter->snapshot;
+
 		bch2_trans_update(trans, inode_iter,
 				  &inode_p.inode.k_i, 0);
 	}
@@ -437,6 +440,8 @@ int bch2_write_index_default(struct bch_write_op *op)
 
 		k = bch2_keylist_front(keys);
 
+		k->k.p.snapshot = iter->snapshot;
+
 		bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
 		bkey_copy(sk.k, k);
 		bch2_cut_front(iter->pos, sk.k);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 1f26139d..69c553a6 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -914,14 +914,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
 	if (dynamic_fault("bcachefs:add:journal_alloc"))
 		return -ENOMEM;
 
+	/* 1/128th of the device by default: */
+	nr = ca->mi.nbuckets >> 7;
+
 	/*
-	 * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+	 * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
 	 * is smaller:
 	 */
-	nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+	nr = clamp_t(unsigned, nr,
 		     BCH_JOURNAL_BUCKETS_MIN,
-		     min(1 << 10,
-			 (1 << 20) / ca->mi.bucket_size));
+		     min(1 << 13,
+			 (1 << 24) / ca->mi.bucket_size));
 
 	return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
 }
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 54f2e205..c7fa03cf 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1452,7 +1452,7 @@ void bch2_journal_write(struct closure *cl)
 	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
 		validate_before_checksum = true;
 
-	if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change)
+	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
 		validate_before_checksum = true;
 
 	if (validate_before_checksum &&
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index bbf8e5ad..4a5b50ed 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -610,8 +610,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
 				j->prereserved.remaining,
 				atomic_read(&c->btree_cache.dirty),
 				c->btree_cache.used,
-				c->btree_key_cache.nr_dirty,
-				c->btree_key_cache.nr_keys);
+				atomic_long_read(&c->btree_key_cache.nr_dirty),
+				atomic_long_read(&c->btree_key_cache.nr_keys));
 
 		nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);
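[Annotation] The journal default grows from 1/256th to 1/128th of the device (nbuckets >> 7), and the cap is raised from 1024 buckets / 512MB to 8192 buckets / 8GB ((1 << 24) sectors). A worked example, assuming 512-byte sectors and a hypothetical device:

	/* e.g. a 1TB device with 256KB buckets (512 sectors per bucket):
	 *   nbuckets = 4194304
	 *   nr       = 4194304 >> 7                       = 32768
	 *   cap      = min(1 << 13, (1 << 24) / 512)      = min(8192, 32768) = 8192
	 *   clamp(32768, BCH_JOURNAL_BUCKETS_MIN, 8192)   = 8192 buckets = 2GB
	 */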
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index f863fd74..3d1bf87e 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -48,14 +48,14 @@ static int __journal_key_cmp(enum btree_id	l_btree_id,
 {
 	return (cmp_int(l_btree_id,	r->btree_id) ?:
 		cmp_int(l_level,	r->level) ?:
-		bkey_cmp(l_pos,	r->k->k.p));
+		bpos_cmp(l_pos,	r->k->k.p));
 }
 
 static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
 {
 	return (cmp_int(l->btree_id,	r->btree_id) ?:
 		cmp_int(l->level,	r->level) ?:
-		bkey_cmp(l->k->k.p, r->k->k.p));
+		bpos_cmp(l->k->k.p, r->k->k.p));
 }
 
 static size_t journal_key_search(struct journal_keys *journal_keys,
@@ -90,7 +90,7 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign
 	if (iter->idx > idx ||
 	    (iter->idx == idx &&
 	     biter->last &&
-	     bkey_cmp(n->k.p, biter->unpacked.p) <= 0))
+	     bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
 		iter->idx++;
 }
 
@@ -238,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
 			bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
 
 		if (btree_k.k && journal_k.k) {
-			int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+			int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
 
 			if (!cmp)
 				bch2_journal_iter_advance_btree(iter);
@@ -256,7 +256,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
 		ret = iter->last == journal ? journal_k : btree_k;
 
 		if (iter->b &&
-		    bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+		    bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
 			iter->journal.idx = iter->journal.keys->nr;
 			iter->last = none;
 			return bkey_s_c_null;
@@ -419,7 +419,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 
 	return  cmp_int(l->btree_id,	r->btree_id) ?:
 		cmp_int(l->level,	r->level) ?:
-		bkey_cmp(l->k->k.p, r->k->k.p) ?:
+		bpos_cmp(l->k->k.p, r->k->k.p) ?:
 		cmp_int(l->journal_seq, r->journal_seq) ?:
 		cmp_int(l->journal_offset, r->journal_offset);
 }
@@ -490,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 		while (src + 1 < keys.d + keys.nr &&
 		       src[0].btree_id	== src[1].btree_id &&
 		       src[0].level	== src[1].level &&
-		       !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+		       !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
 			src++;
 
 		*dst++ = *src++;
@@ -581,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
 	return  cmp_int(r->level,	l->level) ?:
 		cmp_int(l->journal_seq, r->journal_seq) ?:
 		cmp_int(l->btree_id,	r->btree_id) ?:
-		bkey_cmp(l->k->k.p, r->k->k.p);
+		bpos_cmp(l->k->k.p, r->k->k.p);
 }
 
 static int bch2_journal_replay(struct bch_fs *c,
@@ -998,6 +998,13 @@ int bch2_fs_recovery(struct bch_fs *c)
 		goto err;
 	}
 
+	if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) {
+		bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
+		ret = -EINVAL;
+		goto err;
+
+	}
+
 	if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
 		bch_info(c, "alloc_v2 feature bit not set, fsck required");
 		c->opts.fsck = true;
@@ -1338,6 +1345,7 @@ int bch2_fs_initialize(struct bch_fs *c)
 			S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
 	root_inode.bi_inum = BCACHEFS_ROOT_INO;
 	bch2_inode_pack(c, &packed_inode, &root_inode);
+	packed_inode.inode.k.p.snapshot = U32_MAX;
 
 	err = "error creating root directory";
 	ret = bch2_btree_insert(c, BTREE_ID_inodes,
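[Annotation] With the snapshot field now part of the key position, any key written directly into a snapshot-aware btree has to carry an explicit snapshot ID; U32_MAX appears to serve as the default "top" snapshot, both for the root inode above and in the tests below. The pattern, sketched from bch2_fs_initialize():

	struct bkey_inode_buf packed;
	int ret;

	bch2_inode_pack(c, &packed, &inode_u);
	packed.inode.k.p.snapshot = U32_MAX;	/* assumed default snapshot ID */

	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
				NULL, NULL, 0);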
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index dfb12fdd..7507b6bc 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -67,6 +67,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
 		goto err;
 	}
 err:
+	bch2_trans_iter_put(&trans, iter);
 	bch2_trans_exit(&trans);
 	return ret;
 }
@@ -106,6 +107,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
 		goto err;
 	}
 err:
+	bch2_trans_iter_put(&trans, iter);
 	bch2_trans_exit(&trans);
 	return ret;
 }
@@ -113,7 +115,7 @@ err:
 static int test_iterate(struct bch_fs *c, u64 nr)
 {
 	struct btree_trans trans;
-	struct btree_iter *iter;
+	struct btree_iter *iter = NULL;
 	struct bkey_s_c k;
 	u64 i;
 	int ret = 0;
@@ -159,6 +161,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
 	BUG_ON(i);
 err:
+	bch2_trans_iter_put(&trans, iter);
 	bch2_trans_exit(&trans);
 	return ret;
 }
@@ -166,7 +169,7 @@ err:
 static int test_iterate_extents(struct bch_fs *c, u64 nr)
 {
 	struct btree_trans trans;
-	struct btree_iter *iter;
+	struct btree_iter *iter = NULL;
 	struct bkey_s_c k;
 	u64 i;
 	int ret = 0;
@@ -213,6 +216,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
 	BUG_ON(i);
 err:
+	bch2_trans_iter_put(&trans, iter);
 	bch2_trans_exit(&trans);
 	return ret;
 }
@@ -257,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 		BUG_ON(k.k->p.offset != i);
 		i += 2;
 	}
-	bch2_trans_iter_free(&trans, iter);
+	bch2_trans_iter_put(&trans, iter);
 
 	BUG_ON(i != nr * 2);
 
@@ -274,6 +278,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 		if (i == nr * 2)
 			break;
 	}
+	bch2_trans_iter_put(&trans, iter);
err:
 	bch2_trans_exit(&trans);
 	return ret;
@@ -318,7 +323,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 		BUG_ON(k.k->size != 8);
 		i += 16;
 	}
-	bch2_trans_iter_free(&trans, iter);
+	bch2_trans_iter_put(&trans, iter);
 
 	BUG_ON(i != nr);
 
@@ -337,6 +342,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 		if (i == nr)
 			break;
 	}
+	bch2_trans_iter_put(&trans, iter);
err:
 	bch2_trans_exit(&trans);
 	return 0;
@@ -362,6 +368,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
 	k = bch2_btree_iter_peek(iter);
 	BUG_ON(k.k);
 
+	bch2_trans_iter_put(&trans, iter);
+
 	bch2_trans_exit(&trans);
 	return 0;
 }
@@ -382,6 +390,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
 	k = bch2_btree_iter_peek(iter);
 	BUG_ON(k.k);
 
+	bch2_trans_iter_put(&trans, iter);
+
 	bch2_trans_exit(&trans);
 	return 0;
 }
@@ -473,6 +483,7 @@ static int rand_insert(struct bch_fs *c, u64 nr)
 	for (i = 0; i < nr; i++) {
 		bkey_cookie_init(&k.k_i);
 		k.k.p.offset = test_rand();
+		k.k.p.snapshot = U32_MAX;
 
 		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
 			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
@@ -508,7 +519,7 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
 		}
 	}
 
-	bch2_trans_iter_free(&trans, iter);
+	bch2_trans_iter_put(&trans, iter);
 	bch2_trans_exit(&trans);
 	return ret;
 }
@@ -549,7 +560,7 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
 		}
 	}
 
-	bch2_trans_iter_free(&trans, iter);
+	bch2_trans_iter_put(&trans, iter);
 	bch2_trans_exit(&trans);
 	return ret;
 }
@@ -630,6 +641,8 @@ static int seq_insert(struct bch_fs *c, u64 nr)
 		if (++i == nr)
 			break;
 	}
+	bch2_trans_iter_put(&trans, iter);
+
 	bch2_trans_exit(&trans);
 	return ret;
 }
@@ -645,6 +658,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
 	for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
 		;
+	bch2_trans_iter_put(&trans, iter);
+
 	bch2_trans_exit(&trans);
 	return ret;
 }
@@ -671,6 +686,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
 			break;
 		}
 	}
+	bch2_trans_iter_put(&trans, iter);
+
 	bch2_trans_exit(&trans);
 	return ret;
 }
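[Annotation] Most of the churn in tests.c converts bch2_trans_iter_free() to bch2_trans_iter_put() and makes sure every iterator is put before bch2_trans_exit(). The resulting shape of a test, reduced to a sketch:

	struct btree_trans trans;
	struct btree_iter *iter = NULL;	/* NULL-init so error paths can put it */
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
	k = bch2_btree_iter_peek(iter);
	ret = bkey_err(k);

	bch2_trans_iter_put(&trans, iter);	/* drop the ref; trans still owns it */
	bch2_trans_exit(&trans);
	return ret;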