author     Kent Overstreet <kent.overstreet@gmail.com>    2022-04-02 15:40:30 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>    2022-04-02 15:40:30 -0400
commit     3ac04b499779a0ee8873a7014211b40c95eeec49
tree       2d19b8f3b9cd01931c8aa3af877c576d3fb5105c
parent     b034dfb24fece43a7677b9a29781495aeb62767f
Merge with 40a2993bf6 bcachefs: Discard path fixes/improvements
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
98 files changed, 9042 insertions, 5866 deletions
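
This merge removes the in-memory allocator thread and its freelists and moves allocation state on disk: bucket state now lives in the new alloc_v4 key format, discards are driven off a need_discard btree, free buckets are found through a freespace btree, and cached buckets are invalidated via an LRU btree. As a rough orientation before the diff itself, the user-space sketch below (not kernel code; the helper names are invented for the sketch) models the freespace-key encoding the new code relies on: alloc_freespace_pos() stores the bucket number in the low 56 bits of the key offset and a gc-gen-derived "genbits" value in the top 8 bits, which bch2_check_freespace_key() and try_alloc_bucket() later split apart to detect stale index entries.

/*
 * Illustration only: user-space model of the freespace-btree key layout.
 * In the kernel, alloc_freespace_pos() builds the key and
 * bch2_check_freespace_key()/try_alloc_bucket() decode it; the helper
 * names below are invented for this sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define GENBITS_SHIFT	56	/* bits 0-55: bucket number, bits 56-63: genbits */
#define BUCKET_MASK	(~(~0ULL << GENBITS_SHIFT))

/* models alloc_gc_gen(): how far gen has advanced past oldest_gen */
static uint8_t gc_gen(uint8_t gen, uint8_t oldest_gen)
{
	return gen - oldest_gen;
}

/* models alloc_freespace_pos(): bucket in the low bits, genbits on top */
static uint64_t freespace_key_offset(uint64_t bucket, uint8_t gen, uint8_t oldest_gen)
{
	uint64_t genbits = (uint64_t) (gc_gen(gen, oldest_gen) >> 4) << GENBITS_SHIFT;

	return bucket | genbits;
}

/* models the decode in bch2_check_freespace_key()/try_alloc_bucket() */
static void freespace_key_decode(uint64_t offset, uint64_t *bucket, uint64_t *genbits)
{
	*bucket  = offset & BUCKET_MASK;
	*genbits = offset & ~BUCKET_MASK;
}

int main(void)
{
	uint64_t offset = freespace_key_offset(1234, 37, 2);
	uint64_t bucket, genbits;

	freespace_key_decode(offset, &bucket, &genbits);
	printf("bucket %llu genbits %llu\n",
	       (unsigned long long) bucket,
	       (unsigned long long) (genbits >> GENBITS_SHIFT));
	return 0;
}

Because the gc gen is shifted down by four bits before being stored, a bucket's freespace entry only changes once every 16 generations; that is also why bch2_trans_mark_alloc() in the diff only reindexes a free bucket when alloc_freespace_genbits() actually changes.
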
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 71cda24e6d08..7ddae26116a0 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -16,6 +16,7 @@ bcachefs-y := \
 	btree_update_interior.o \
 	btree_update_leaf.o \
 	buckets.o \
+	buckets_waiting_for_journal.o \
 	chardev.o \
 	checksum.o \
 	clock.o \
@@ -37,8 +38,10 @@ bcachefs-y := \
 	journal.o \
 	journal_io.o \
 	journal_reclaim.o \
+	journal_sb.o \
 	journal_seq_blacklist.o \
 	keylist.o \
+	lru.o \
 	migrate.o \
 	move.o \
 	movinggc.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 2a36af5e0220..e8a34eccac25 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -9,10 +9,12 @@
 #include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "ec.h"
 #include "error.h"
+#include "lru.h"
 #include "recovery.h"
 #include "varint.h"
@@ -25,12 +27,7 @@
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>

-const char * const bch2_allocator_states[] = {
-#define x(n)	#n,
-	ALLOC_THREAD_STATES()
-#undef x
-	NULL
-};
+/* Persistent alloc info: */

 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
@@ -38,16 +35,28 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #undef x
 };

-struct bkey_alloc_buf {
-	struct bkey_i k;
-	struct bch_alloc_v3 v;
+const char * const bch2_bucket_states[] = {
+	"free",
+	"need gc gens",
+	"need discard",
+	"cached",
+	"dirty",
+	NULL
+};
-#define x(_name, _bits)	+ _bits / 8
-	u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
+struct bkey_alloc_unpacked {
+	u64 journal_seq;
+	u64 bucket;
+	u8 dev;
+	u8 gen;
+	u8 oldest_gen;
+	u8 data_type;
+	bool need_discard:1;
+	bool need_inc_gen:1;
+#define x(_name, _bits)	u##_bits _name;
+	BCH_ALLOC_FIELDS_V2()
 #undef x
-} __attribute__((packed, aligned(8)));
-
-/* Persistent alloc info: */
+};

 static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
				     const void **p, unsigned field)
@@ -169,6 +178,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
 	out->gen = a.v->gen;
 	out->oldest_gen = a.v->oldest_gen;
 	out->data_type = a.v->data_type;
+	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
 	out->journal_seq = le64_to_cpu(a.v->journal_seq);

 #define x(_name, _bits)	\
@@ -190,47 +201,7 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
 	return 0;
 }

-static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
-			       const struct bkey_alloc_unpacked src)
-{
-	struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
-	unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
-	u8 *out = a->v.data;
-	u8 *end = (void *) &dst[1];
-	u8 *last_nonzero_field = out;
-	unsigned bytes;
-
-	a->k.p = POS(src.dev, src.bucket);
-	a->v.gen = src.gen;
-	a->v.oldest_gen = src.oldest_gen;
-	a->v.data_type = src.data_type;
-	a->v.journal_seq = cpu_to_le64(src.journal_seq);
-
-#define x(_name, _bits)	\
-	nr_fields++;	\
-	\
-	if (src._name) {	\
-		out += bch2_varint_encode_fast(out, src._name);	\
-	\
-		last_nonzero_field = out;	\
-		last_nonzero_fieldnr = nr_fields;	\
-	} else {	\
-		*out++ = 0;	\
-	}
-
-	BCH_ALLOC_FIELDS_V2()
-#undef x
-	BUG_ON(out > end);
-
-	out = last_nonzero_field;
-	a->v.nr_fields = last_nonzero_fieldnr;
-
-	bytes = (u8 *) out - (u8 *) &a->v;
-	set_bkey_val_bytes(&a->k, bytes);
-	memset_u64s_tail(&a->v, 0, bytes);
-}
-
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+static
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { struct bkey_alloc_unpacked ret = { .dev = k.k->p.inode, @@ -253,24 +224,71 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) return ret; } -static void bch2_alloc_pack(struct bch_fs *c, - struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) +void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { - bch2_alloc_pack_v3(dst, src); + if (k.k->type == KEY_TYPE_alloc_v4) { + *out = *bkey_s_c_to_alloc_v4(k).v; + } else { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + *out = (struct bch_alloc_v4) { + .journal_seq = u.journal_seq, + .flags = u.need_discard, + .gen = u.gen, + .oldest_gen = u.oldest_gen, + .data_type = u.data_type, + .stripe_redundancy = u.stripe_redundancy, + .dirty_sectors = u.dirty_sectors, + .cached_sectors = u.cached_sectors, + .io_time[READ] = u.read_time, + .io_time[WRITE] = u.write_time, + .stripe = u.stripe, + }; + } } -int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_alloc_unpacked *u, unsigned trigger_flags) +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { - struct bkey_alloc_buf *a; + struct bkey_i_alloc_v4 *ret; - a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (IS_ERR(a)) - return PTR_ERR(a); + if (k.k->type == KEY_TYPE_alloc_v4) { + ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if (!IS_ERR(ret)) + bkey_reassemble(&ret->k_i, k); + } else { + ret = bch2_trans_kmalloc(trans, sizeof(*ret)); + if (!IS_ERR(ret)) { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; + bch2_alloc_to_v4(k, &ret->v); + } + } + return ret; +} + +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) +{ + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + int ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } - bch2_alloc_pack(trans->c, a, *u); - return bch2_trans_update(trans, iter, &a->k, trigger_flags); + a = bch2_alloc_to_v4_mut(trans, k); + if (IS_ERR(a)) + bch2_trans_iter_exit(trans, iter); + return a; } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -316,629 +334,835 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_alloc_unpacked u; + struct bch_dev *ca; if (k.k->p.inode >= c->sb.nr_devices || !c->devs[k.k->p.inode]) return "invalid device"; + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + if (k.k->p.offset < ca->mi.first_bucket || + k.k->p.offset >= ca->mi.nbuckets) + return "invalid bucket"; + if (bch2_alloc_unpack_v3(&u, k)) return "unpack error"; return NULL; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - - pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", - u.gen, u.oldest_gen, bch2_data_types[u.data_type], - u.journal_seq); -#define x(_name, ...) 
pr_buf(out, " " #_name " %llu", (u64) u._name); - BCH_ALLOC_FIELDS_V2() -#undef x -} - -static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) +const char *bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bch_fs *c = trans->c; struct bch_dev *ca; - struct bucket *g; - struct bkey_alloc_unpacked u; - if (!bkey_is_alloc(k.k)) - return 0; + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = bucket(ca, k.k->p.offset); - u = bch2_alloc_unpack(k); - - *bucket_gen(ca, k.k->p.offset) = u.gen; - g->_mark.gen = u.gen; - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->_mark.stripe = u.stripe != 0; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; - g->gen_valid = 1; - return 0; + if (k.k->p.offset < ca->mi.first_bucket || + k.k->p.offset >= ca->mi.nbuckets) + return "invalid bucket"; + + return NULL; +} + +void bch2_alloc_v4_swab(struct bkey_s k) +{ + struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; + + a->journal_seq = swab64(a->journal_seq); + a->flags = swab32(a->flags); + a->dirty_sectors = swab32(a->dirty_sectors); + a->cached_sectors = swab32(a->cached_sectors); + a->io_time[0] = swab64(a->io_time[0]); + a->io_time[1] = swab64(a->io_time[1]); + a->stripe = swab32(a->stripe); + a->nr_external_backpointers = swab32(a->nr_external_backpointers); +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 a; + + bch2_alloc_to_v4(k, &a); + + pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu", + a.gen, a.oldest_gen, bch2_data_types[a.data_type], + a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a)); + pr_buf(out, " dirty_sectors %u", a.dirty_sectors); + pr_buf(out, " cached_sectors %u", a.cached_sectors); + pr_buf(out, " stripe %u", a.stripe); + pr_buf(out, " stripe_redundancy %u", a.stripe_redundancy); + pr_buf(out, " read_time %llu", a.io_time[READ]); + pr_buf(out, " write_time %llu", a.io_time[WRITE]); } int bch2_alloc_read(struct bch_fs *c) { struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bch_dev *ca; int ret; bch2_trans_init(&trans, c, 0, 0); - down_read(&c->gc_lock); - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); - up_read(&c->gc_lock); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + bch2_alloc_to_v4(k, &a); + + *bucket_gen(ca, k.k->p.offset) = a.gen; + } + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); - if (ret) { + + if (ret) bch_err(c, "error reading alloc info: %i", ret); - return ret; - } - return 0; + return ret; } -static int bch2_alloc_write_key(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) +/* Free space/discard btree: */ + +static int bch2_bucket_do_index(struct btree_trans *trans, + struct bkey_s_c alloc_k, + struct bch_alloc_v4 a, + bool set) { struct bch_fs *c = trans->c; - struct bkey_s_c k; - struct bkey_alloc_unpacked old_u, new_u; + struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + struct btree_iter iter; + struct bkey_s_c old; + struct bkey_i *k; + enum bucket_state state = bucket_state(a); + enum btree_id btree; + 
enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; + enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + struct printbuf buf = PRINTBUF; int ret; -retry: - bch2_trans_begin(trans); - ret = bch2_btree_key_cache_flush(trans, - BTREE_ID_alloc, iter->pos); - if (ret) - goto err; + if (state != BUCKET_free && + state != BUCKET_need_discard) + return 0; - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); - old_u = bch2_alloc_unpack(k); - new_u = alloc_mem_to_key(c, iter); + bkey_init(&k->k); + k->k.type = new_type; - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + switch (state) { + case BUCKET_free: + btree = BTREE_ID_freespace; + k->k.p = alloc_freespace_pos(alloc_k.k->p, a); + bch2_key_resize(&k->k, 1); + break; + case BUCKET_need_discard: + btree = BTREE_ID_need_discard; + k->k.p = alloc_k.k->p; + break; + default: return 0; + } - ret = bch2_alloc_write(trans, iter, &new_u, - BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); + bch2_trans_iter_init(trans, &iter, btree, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + old = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(old); + if (ret) + goto err; + + if (ca->mi.freespace_initialized && + bch2_fs_inconsistent_on(old.k->type != old_type, c, + "incorrect key when %s %s btree (got %s should be %s)\n" + " for %s", + set ? "setting" : "clearing", + bch2_btree_ids[btree], + bch2_bkey_types[old.k->type], + bch2_bkey_types[old_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = -EIO; + goto err; + } + + ret = bch2_trans_update(trans, &iter, k, 0); err: - if (ret == -EINTR) - goto retry; + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } -int bch2_alloc_write_all(struct bch_fs *c, unsigned flags) +int bch2_trans_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { - struct btree_trans trans; - struct btree_iter iter; - struct bch_dev *ca; - unsigned i; + struct bch_fs *c = trans->c; + struct bch_alloc_v4 old_a, *new_a; + u64 old_lru, new_lru; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + /* + * Deletion only happens in the device removal path, with + * BTREE_TRIGGER_NORUN: + */ + BUG_ON(new->k.type != KEY_TYPE_alloc_v4); - for_each_member_device(ca, c, i) { - bch2_btree_iter_set_pos(&iter, - POS(ca->dev_idx, ca->mi.first_bucket)); + bch2_alloc_to_v4(old, &old_a); + new_a = &bkey_i_to_alloc_v4(new)->v; - while (iter.pos.offset < ca->mi.nbuckets) { - ret = bch2_alloc_write_key(&trans, &iter, flags); - if (ret) { - percpu_ref_put(&ca->ref); - goto err; - } - bch2_btree_iter_advance(&iter); - } + if (new_a->dirty_sectors > old_a.dirty_sectors || + new_a->cached_sectors > old_a.cached_sectors) { + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } -err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - return ret; -} -/* Bucket IO clocks: */ - -int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_alloc_unpacked u; - u64 
*time, now; - int ret = 0; + if (old_a.data_type && !new_a->data_type && + old_a.gen == new_a->gen && + !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { + new_a->gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter); - if (ret) - goto out; + if (bucket_state(old_a) != bucket_state(*new_a) || + (bucket_state(*new_a) == BUCKET_free && + alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: + bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true); + if (ret) + return ret; + } - u = alloc_mem_to_key(c, &iter); + old_lru = alloc_lru_idx(old_a); + new_lru = alloc_lru_idx(*new_a); - time = rw == READ ? &u.read_time : &u.write_time; - now = atomic64_read(&c->io_clock[rw].now); - if (*time == now) - goto out; + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, + old_lru, &new_lru); + if (ret) + return ret; - *time = now; + if (new_lru && new_a->io_time[READ] != new_lru) + new_a->io_time[READ] = new_lru; + } - ret = bch2_alloc_write(trans, &iter, &u, 0) ?: - bch2_trans_commit(trans, NULL, NULL, 0); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } -/* Background allocator thread: */ +static int bch2_check_alloc_key(struct btree_trans *trans, + struct btree_iter *alloc_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter discard_iter, freespace_iter, lru_iter; + struct bch_alloc_v4 a; + unsigned discard_key_type, freespace_key_type; + struct bkey_s_c alloc_k, k; + struct printbuf buf = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret; -/* - * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens - * (marking them as invalidated on disk), then optionally issues discard - * commands to the newly free buckets, then puts them on the various freelists. - */ + alloc_k = bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 0; -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, - struct bucket_mark m) -{ - u8 gc_gen; + ret = bkey_err(alloc_k); + if (ret) + return ret; - if (!is_available_bucket(m)) - return false; + bch2_alloc_to_v4(alloc_k, &a); + discard_key_type = bucket_state(a) == BUCKET_need_discard + ? KEY_TYPE_set : 0; + freespace_key_type = bucket_state(a) == BUCKET_free + ? 
KEY_TYPE_set : 0; - if (m.owned_by_allocator) - return false; + bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, + alloc_k.k->p, 0); + bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, + alloc_freespace_pos(alloc_k.k->p, a), 0); + bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, + POS(alloc_k.k->p.inode, a.io_time[READ]), 0); - if (ca->buckets_nouse && - test_bit(b, ca->buckets_nouse)) - return false; + k = bch2_btree_iter_peek_slot(&discard_iter); + ret = bkey_err(k); + if (ret) + goto err; - if (ca->new_fs_bucket_idx) { - /* - * Device or filesystem is still being initialized, and we - * haven't fully marked superblocks & journal: - */ - if (is_superblock_bucket(ca, b)) - return false; + if (fsck_err_on(k.k->type != discard_key_type, c, + "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = discard_key_type; + update->k.p = discard_iter.pos; - if (b < ca->new_fs_bucket_idx) - return false; + ret = bch2_trans_update(trans, &discard_iter, update, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto err; } - gc_gen = bucket_gc_gen(bucket(ca, b)); + k = bch2_btree_iter_peek_slot(&freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; - ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; - ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; + if (fsck_err_on(k.k->type != freespace_key_type, c, + "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; - return gc_gen < BUCKET_GC_GEN_MAX; -} + bkey_init(&update->k); + update->k.type = freespace_key_type; + update->k.p = freespace_iter.pos; + bch2_key_resize(&update->k, 1); -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. 
- */ + ret = bch2_trans_update(trans, &freespace_iter, update, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto err; + } -static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, - u64 now, u64 last_seq_ondisk) -{ - unsigned used = bucket_sectors_used(m); + if (bucket_state(a) == BUCKET_cached) { + k = bch2_btree_iter_peek_slot(&lru_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(!a.io_time[READ], c, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || + fsck_err_on(k.k->type != KEY_TYPE_lru || + le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, + "incorrect/missing lru entry\n" + " %s\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + u64 read_time = a.io_time[READ]; + + if (!a.io_time[READ]) + a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + + ret = bch2_lru_change(trans, + alloc_k.k->p.inode, + alloc_k.k->p.offset, + 0, &a.io_time[READ]); + if (ret) + goto err; - if (used) { - /* - * Prefer to keep buckets that have been read more recently, and - * buckets that have more data in them: - */ - u64 last_read = max_t(s64, 0, now - g->io_time[READ]); - u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); + if (a.io_time[READ] != read_time) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + goto err; + + a_mut->v.io_time[READ] = a.io_time[READ]; + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + } - return -last_read_scaled; - } else { - /* - * Prefer to use buckets with smaller gc_gen so that we don't - * have to walk the btree and recalculate oldest_gen - but shift - * off the low bits so that buckets will still have equal sort - * keys when there's only a small difference, so that we can - * keep sequential buckets together: - */ - return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| - (bucket_gc_gen(g) >> 4); + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto err; + } } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + bch2_trans_iter_exit(trans, &freespace_iter); + bch2_trans_iter_exit(trans, &discard_iter); + printbuf_exit(&buf2); + printbuf_exit(&buf); + return ret; } -static inline int bucket_alloc_cmp(alloc_heap *h, - struct alloc_heap_entry l, - struct alloc_heap_entry r) +static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - return cmp_int(l.key, r.key) ?: - cmp_int(r.nr, l.nr) ?: - cmp_int(l.bucket, r.bucket); -} + struct bch_dev *ca; -static inline int bucket_idx_cmp(const void *_l, const void *_r) -{ - const struct alloc_heap_entry *l = _l, *r = _r; + if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + return false; - return cmp_int(l->bucket, r->bucket); + ca = bch_dev_bkey_exists(c, pos.inode); + return pos.offset >= ca->mi.first_bucket && + pos.offset < ca->mi.nbuckets; } -static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) +static int bch2_check_freespace_key(struct btree_trans *trans, + struct btree_iter *freespace_iter, + bool initial) { - struct bucket_array *buckets; - struct alloc_heap_entry e = { 0 }; - u64 now, last_seq_ondisk; - size_t b, i, nr = 0; + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; + struct bkey_s_c k, freespace_k; + struct bch_alloc_v4 a; + u64 genbits; 
+ struct bpos pos; + struct bkey_i *update; + struct printbuf buf = PRINTBUF; + int ret; - down_read(&ca->bucket_lock); + freespace_k = bch2_btree_iter_peek(freespace_iter); + if (!freespace_k.k) + return 1; - buckets = bucket_array(ca); - ca->alloc_heap.used = 0; - now = atomic64_read(&c->io_clock[READ].now); - last_seq_ondisk = c->journal.flushed_seq_ondisk; + ret = bkey_err(freespace_k); + if (ret) + return ret; - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. - */ - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket *g = &buckets->b[b]; - struct bucket_mark m = READ_ONCE(g->mark); - unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); + pos = freespace_iter->pos; + pos.offset &= ~(~0ULL << 56); + genbits = freespace_iter->pos.offset & (~0ULL << 56); - cond_resched(); + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); - if (!bch2_can_invalidate_bucket(ca, b, m)) - continue; + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, + "%llu:%llu set in freespace btree but device or bucket does not exist", + pos.inode, pos.offset)) + goto delete; - if (e.nr && e.bucket + e.nr == b && e.key == key) { - e.nr++; - } else { - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); - - e = (struct alloc_heap_entry) { - .bucket = b, - .nr = 1, - .key = key, - }; - } - } + k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(k); + if (ret) + goto err; - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); + bch2_alloc_to_v4(k, &a); - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; + if (fsck_err_on(bucket_state(a) != BUCKET_free || + genbits != alloc_freespace_genbits(a), c, + "%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + bucket_state(a) == BUCKET_free, + genbits >> 56, alloc_freespace_genbits(a) >> 56)) + goto delete; +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +delete: + update = bch2_trans_kmalloc(trans, sizeof(*update)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; - while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { - nr -= ca->alloc_heap.data[0].nr; - heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); - } + bkey_init(&update->k); + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, 1); - up_read(&ca->bucket_lock); + ret = bch2_trans_update(trans, freespace_iter, update, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + goto out; } -static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +int bch2_check_alloc_info(struct bch_fs *c, bool initial) { - size_t i, nr = 0; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0, last_dev = -1; - ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; + bch2_trans_init(&trans, c, 0, 0); - find_reclaimable_buckets_lru(c, ca); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->p.inode != last_dev) { + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + + if (!ca->mi.freespace_initialized) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + + last_dev = k.k->p.inode; + } + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_alloc_key(&trans, 
&iter)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); - heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); + if (ret) + goto err; - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_freespace_key(&trans, &iter, initial)); + if (ret) + break; - return nr; + bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; } -static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - struct bkey_alloc_unpacked *u) +static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, + struct bch_dev *ca, bool *discard_done) { struct bch_fs *c = trans->c; struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + struct printbuf buf = PRINTBUF; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(ca->dev_idx, b), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, + BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; - ret = bch2_btree_iter_traverse(&iter); + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); if (ret) - goto err; + goto out; - *u = alloc_mem_to_key(c, &iter); + if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; + } - u->gen++; - u->data_type = 0; - u->dirty_sectors = 0; - u->cached_sectors = 0; - u->read_time = atomic64_read(&c->io_clock[READ].now); - u->write_time = atomic64_read(&c->io_clock[WRITE].now); + BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk); - ret = bch2_alloc_write(trans, &iter, u, - BTREE_TRIGGER_BUCKET_INVALIDATE); -err: + if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c, + "%s\n incorrectly set in need_discard btree", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto out; + } + + if (!*discard_done && ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL, 0); + *discard_done = true; + + ret = bch2_trans_relock(trans) ? 
0 : -EINTR; + if (ret) + goto out; + } + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); +write: + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +out: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } -static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, unsigned flags) +static void bch2_do_discards_work(struct work_struct *work) { - struct bkey_alloc_unpacked u; - size_t b; - int ret = 0; - - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) - return 1; + struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = NULL; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + int ret; - BUG_ON(!ca->alloc_heap.used || - !ca->alloc_heap.data[0].nr); - b = ca->alloc_heap.data[0].bucket; + bch2_trans_init(&trans, c, 0, 0); - /* first, put on free_inc and mark as owned by allocator: */ - percpu_down_read(&c->mark_lock); + for_each_btree_key(&trans, iter, BTREE_ID_need_discard, + POS_MIN, 0, k, ret) { + bool discard_done = false; - bch2_mark_alloc_bucket(c, ca, b, true); + if (ca && k.k->p.inode != ca->dev_idx) { + percpu_ref_put(&ca->io_ref); + ca = NULL; + } - spin_lock(&c->freelist_lock); - verify_not_on_freelist(c, ca, b); - BUG_ON(!fifo_push(&ca->free_inc, b)); - spin_unlock(&c->freelist_lock); + if (!ca) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { + ca = NULL; + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + } - percpu_up_read(&c->mark_lock); + seen++; - ret = bch2_trans_do(c, NULL, journal_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - flags, - bucket_invalidate_btree(&trans, ca, b, &u)); + if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) { + open++; + continue; + } - if (!ret) { - /* remove from alloc_heap: */ - struct alloc_heap_entry e, *top = ca->alloc_heap.data; + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + k.k->p.inode, k.k->p.offset)) { + need_journal_commit++; + continue; + } - top->bucket++; - top->nr--; + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOFAIL, + bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); + if (ret) + break; - if (!top->nr) - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + discarded++; + } + bch2_trans_iter_exit(&trans, &iter); - /* - * Make sure we flush the last journal entry that updated this - * bucket (i.e. deleting the last reference) before writing to - * this bucket again: - */ - *journal_seq = max(*journal_seq, u.journal_seq); - } else { - size_t b2; + if (ca) + percpu_ref_put(&ca->io_ref); - /* remove from free_inc: */ - percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); + bch2_trans_exit(&trans); - bch2_mark_alloc_bucket(c, ca, b, false); + if (need_journal_commit * 2 > seen) + bch2_journal_flush_async(&c->journal, NULL); - BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); - BUG_ON(b != b2); + percpu_ref_put(&c->writes); - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->mark_lock); - } + trace_do_discards(c, seen, open, need_journal_commit, discarded, ret); +} - return ret < 0 ? 
ret : 0; +void bch2_do_discards(struct bch_fs *c) +{ + if (percpu_ref_tryget(&c->writes) && + !queue_work(system_long_wq, &c->discard_work)) + percpu_ref_put(&c->writes); } -/* - * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: - */ -static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) +static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) { - u64 journal_seq = 0; - int ret = 0; + struct bch_fs *c = trans->c; + struct btree_iter lru_iter, alloc_iter = { NULL }; + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + u64 bucket, idx; + int ret; - /* Only use nowait if we've already invalidated at least one bucket: */ - while (!ret && - !fifo_full(&ca->free_inc) && - ca->alloc_heap.used) { - if (kthread_should_stop()) { - ret = 1; - break; - } + bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, + POS(ca->dev_idx, 0), 0); + k = bch2_btree_iter_peek(&lru_iter); + ret = bkey_err(k); + if (ret) + goto out; - ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, - (!fifo_empty(&ca->free_inc) - ? BTREE_INSERT_NOWAIT : 0)); - /* - * We only want to batch up invalidates when they're going to - * require flushing the journal: - */ - if (!journal_seq) - break; - } + if (!k.k || k.k->p.inode != ca->dev_idx) + goto out; - /* If we used NOWAIT, don't return the error: */ - if (!fifo_empty(&ca->free_inc)) - ret = 0; - if (ret < 0) - bch_err(ca, "error invalidating buckets: %i", ret); + if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c, + "non lru key in lru btree")) + goto out; + + idx = k.k->p.offset; + bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + + a = bch2_trans_start_alloc_update(trans, &alloc_iter, + POS(ca->dev_idx, bucket)); + ret = PTR_ERR_OR_ZERO(a); if (ret) - return ret; + goto out; - if (journal_seq) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) { - bch_err(ca, "journal error: %i", ret); - return ret; - } + if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c, + "invalidating bucket with wrong lru idx (got %llu should be %llu", + idx, alloc_lru_idx(a->v))) + goto out; - return 0; -} + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + a->v.gen++; + a->v.data_type = 0; + a->v.dirty_sectors = 0; + a->v.cached_sectors = 0; + a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); -static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) -{ - if (ca->allocator_state != new_state) { - ca->allocator_state = new_state; - closure_wake_up(&ca->fs->freelist_wait); - } + ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE); +out: + bch2_trans_iter_exit(trans, &alloc_iter); + bch2_trans_iter_exit(trans, &lru_iter); + return ret; } -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +static void bch2_do_invalidates_work(struct work_struct *work) { + struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca; + struct btree_trans trans; unsigned i; int ret = 0; - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - /* - * Don't strand buckets on the copygc freelist until - * after recovery is finished: - */ - if (i == RESERVE_MOVINGGC && - !test_bit(BCH_FS_STARTED, &c->flags)) - continue; + bch2_trans_init(&trans, c, 0, 0); - if (fifo_push(&ca->free[i], b)) { - fifo_pop(&ca->free_inc, b); - ret = 1; - break; - } - } - spin_unlock(&c->freelist_lock); + for_each_member_device(ca, c, i) + while (!ret && 
should_invalidate_buckets(ca)) + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOFAIL, + invalidate_one_bucket(&trans, ca)); - ca->allocator_state = ret - ? ALLOCATOR_running - : ALLOCATOR_blocked_full; - closure_wake_up(&c->freelist_wait); - return ret; + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); } -static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +void bch2_do_invalidates(struct bch_fs *c) { - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), - ca->mi.bucket_size, GFP_NOFS, 0); + if (percpu_ref_tryget(&c->writes)) + queue_work(system_long_wq, &c->invalidate_work); } -static bool allocator_thread_running(struct bch_dev *ca) +static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) { - unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) - ? ALLOCATOR_running - : ALLOCATOR_stopped; - alloc_thread_set_state(ca, state); - return state == ALLOCATOR_running; -} + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bch_member *m; + int ret; -static int buckets_available(struct bch_dev *ca, unsigned long gc_count) -{ - s64 available = dev_buckets_reclaimable(ca) - - (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); - bool ret = available > 0; + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS| + BTREE_ITER_PREFETCH, k, ret) { + if (iter.pos.offset >= ca->mi.nbuckets) + break; + + bch2_alloc_to_v4(k, &a); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_bucket_do_index(&trans, k, a, true)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + if (ret) { + bch_err(ca, "error initializing free space: %i", ret); + return ret; + } + + mutex_lock(&c->sb_lock); + m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); + mutex_unlock(&c->sb_lock); - alloc_thread_set_state(ca, ret - ? ALLOCATOR_running - : ALLOCATOR_blocked); return ret; } -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by find_reclaimable_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. 
- */ -static int bch2_allocator_thread(void *arg) +int bch2_fs_freespace_init(struct bch_fs *c) { - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - unsigned long gc_count = c->gc_count; - size_t nr; - int ret; + struct bch_dev *ca; + unsigned i; + int ret = 0; + bool doing_init = false; - set_freezable(); + /* + * We can crash during the device add path, so we need to check this on + * every mount: + */ - while (1) { - ret = kthread_wait_freezable(allocator_thread_running(ca)); - if (ret) - goto stop; + for_each_member_device(ca, c, i) { + if (ca->mi.freespace_initialized) + continue; - while (!ca->alloc_heap.used) { - cond_resched(); + if (!doing_init) { + bch_info(c, "initializing freespace"); + doing_init = true; + } - ret = kthread_wait_freezable(buckets_available(ca, gc_count)); - if (ret) - goto stop; + ret = bch2_dev_freespace_init(c, ca); + if (ret) { + percpu_ref_put(&ca->ref); + return ret; + } + } - gc_count = c->gc_count; - nr = find_reclaimable_buckets(c, ca); + if (doing_init) { + mutex_lock(&c->sb_lock); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); - trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, - ca->inc_gen_really_needs_gc); + bch_verbose(c, "done initializing freespace"); + } - if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || - ca->inc_gen_really_needs_gc) && - c->gc_thread) { - atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); - } - } + return ret; +} - ret = bch2_invalidate_buckets(c, ca); - if (ret) - goto stop; +/* Bucket IO clocks: */ + +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + u64 now; + int ret = 0; + + a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; - while (!fifo_empty(&ca->free_inc)) { - u64 b = fifo_peek(&ca->free_inc); + now = atomic64_read(&c->io_clock[rw].now); + if (a->v.io_time[rw] == now) + goto out; - discard_one_bucket(c, ca, b); + a->v.io_time[rw] = now; - ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); - if (ret) - goto stop; - } - } -stop: - alloc_thread_set_state(ca, ALLOCATOR_stopped); - return 0; + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; } /* Startup/shutdown (ro/rw): */ @@ -949,7 +1173,7 @@ void bch2_recalc_capacity(struct bch_fs *c) u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; - unsigned i, j; + unsigned i; lockdep_assert_held(&c->state_lock); @@ -980,8 +1204,9 @@ void bch2_recalc_capacity(struct bch_fs *c) * allocations for foreground writes must wait - * not -ENOSPC calculations. 
*/ - for (j = 0; j < RESERVE_NONE; j++) - dev_reserve += ca->free[j].size; + + dev_reserve += ca->nr_btree_reserve * 2; + dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ @@ -1037,8 +1262,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { unsigned i; - BUG_ON(ca->alloc_thread); - /* First, remove device from allocation groups: */ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) @@ -1112,62 +1335,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -{ - if (ca->alloc_thread) - closure_wait_event(&c->freelist_wait, - ca->allocator_state != ALLOCATOR_running); -} - -/* stop allocator thread: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) -{ - struct task_struct *p; - - p = rcu_dereference_protected(ca->alloc_thread, 1); - ca->alloc_thread = NULL; - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid bch2_wake_allocator() racing: - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - synchronize_rcu(); - - if (p) { - kthread_stop(p); - put_task_struct(p); - } -} - -/* start allocator thread: */ -int bch2_dev_allocator_start(struct bch_dev *ca) -{ - struct task_struct *p; - - /* - * allocator thread already started? - */ - if (ca->alloc_thread) - return 0; - - p = kthread_create(bch2_allocator_thread, ca, - "bch-alloc/%s", ca->name); - if (IS_ERR(p)) { - bch_err(ca->fs, "error creating allocator thread: %li", - PTR_ERR(p)); - return PTR_ERR(p); - } - - get_task_struct(p); - rcu_assign_pointer(ca->alloc_thread, p); - wake_up_process(p); - return 0; -} - void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); + INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 86b64177b3d0..da1b650e8017 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,90 +8,98 @@ #include "debug.h" #include "super.h" -extern const char * const bch2_allocator_states[]; - -struct bkey_alloc_unpacked { - u64 journal_seq; - u64 bucket; - u8 dev; - u8 gen; - u8 oldest_gen; - u8 data_type; -#define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS_V2() -#undef x -}; - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U -/* returns true if not equal */ -static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, - struct bkey_alloc_unpacked r) +static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) { - return l.gen != r.gen || - l.oldest_gen != r.oldest_gen || - l.data_type != r.data_type -#define x(_name, ...) 
|| l._name != r._name - BCH_ALLOC_FIELDS_V2() -#undef x - ; + return a.gen - a.oldest_gen; } -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -int bch2_alloc_write(struct btree_trans *, struct btree_iter *, - struct bkey_alloc_unpacked *, unsigned); +enum bucket_state { + BUCKET_free, + BUCKET_need_gc_gens, + BUCKET_need_discard, + BUCKET_cached, + BUCKET_dirty, +}; -int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); +extern const char * const bch2_bucket_states[]; + +static inline enum bucket_state bucket_state(struct bch_alloc_v4 a) +{ + if (a.dirty_sectors || a.stripe) + return BUCKET_dirty; + if (a.cached_sectors) + return BUCKET_cached; + BUG_ON(a.data_type); + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BUCKET_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BUCKET_need_gc_gens; + return BUCKET_free; +} + +static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) +{ + return bucket_state(a) == BUCKET_cached ? a.io_time[READ] : 0; +} + +static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) +{ + return ((u64) alloc_gc_gen(a) >> 4) << 56; +} -static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter) +static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) { - struct bch_dev *ca; - struct bucket *g; - struct bkey_alloc_unpacked ret; - - percpu_down_read(&c->mark_lock); - ca = bch_dev_bkey_exists(c, iter->pos.inode); - g = bucket(ca, iter->pos.offset); - ret = (struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, - .gen = g->mark.gen, - .oldest_gen = g->oldest_gen, - .data_type = g->mark.data_type, - .dirty_sectors = g->mark.dirty_sectors, - .cached_sectors = g->mark.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - .stripe = g->stripe, - .stripe_redundancy = g->stripe_redundancy, - }; - percpu_up_read(&c->mark_lock); - - return ret; + pos.offset |= alloc_freespace_genbits(a); + return pos; } +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); + +void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); + +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k); +void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ } #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ +} + +#define bch2_bkey_ops_alloc_v4 
(struct bkey_ops) { \ + .key_invalid = bch2_alloc_v4_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .swab = bch2_alloc_v4_swab, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ } static inline bool bkey_is_alloc(const struct bkey *k) @@ -103,43 +111,29 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); -static inline void bch2_wake_allocator(struct bch_dev *ca) +int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); +int bch2_check_alloc_info(struct bch_fs *, bool); +void bch2_do_discards(struct bch_fs *); + +static inline bool should_invalidate_buckets(struct bch_dev *ca) { - struct task_struct *p; + struct bch_dev_usage u = bch2_dev_usage_read(ca); - rcu_read_lock(); - p = rcu_dereference(ca->alloc_thread); - if (p) - wake_up_process(p); - rcu_read_unlock(); + return u.d[BCH_DATA_cached].buckets && + u.buckets_unavailable + u.d[BCH_DATA_cached].buckets < + ca->mi.nbuckets >> 7; } -static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - if (bch2_expensive_debug_checks) { - size_t iter; - long i; - unsigned j; - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} +void bch2_do_invalidates(struct bch_fs *); + +int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_stop(struct bch_dev *); -int bch2_dev_allocator_start(struct bch_dev *); - -int bch2_alloc_write_all(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 0a634125dc90..4dbab45be5ed 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -14,19 +14,31 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" #include "btree_gc.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" +#include "error.h" #include "io.h" +#include "journal.h" #include <linux/math64.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <trace/events/bcachefs.h> +const char * const bch2_alloc_reserves[] = { +#define x(t) #t, + BCH_ALLOC_RESERVES() +#undef x + NULL +}; + /* * Open buckets represent a bucket that's currently being allocated from. 
They * serve two purposes: @@ -78,7 +90,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, ob->bucket, false); ob->valid = false; ob->data_type = 0; @@ -151,22 +162,6 @@ static void open_bucket_free_unused(struct bch_fs *c, } } -static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct open_bucket *ob; - unsigned i; - - rcu_read_lock(); - open_bucket_for_each(c, obs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - - BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen); - } - rcu_read_unlock(); -#endif -} - /* _only_ for allocating the journal on a new device: */ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) { @@ -184,49 +179,45 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) { switch (reserve) { - case RESERVE_BTREE: - case RESERVE_BTREE_MOVINGGC: + case RESERVE_btree: + case RESERVE_btree_movinggc: return 0; - case RESERVE_MOVINGGC: + case RESERVE_movinggc: return OPEN_BUCKETS_COUNT / 4; default: return OPEN_BUCKETS_COUNT / 2; } } -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, - struct closure *cl) +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 bucket, + enum alloc_reserve reserve, + struct bch_alloc_v4 *a, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) { struct open_bucket *ob; - long b = 0; - spin_lock(&c->freelist_lock); + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + (*skipped_nouse)++; + return NULL; + } - if (may_alloc_partial) { - int i; - - for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { - ob = c->open_buckets + ca->open_buckets_partial[i]; - - if (reserve <= ob->alloc_reserve) { - array_remove_item(ca->open_buckets_partial, - ca->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - ob->alloc_reserve = reserve; - spin_unlock(&c->freelist_lock); - return ob; - } - } + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + (*skipped_open)++; + return NULL; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { + (*skipped_need_journal_commit)++; + return NULL; } + spin_lock(&c->freelist_lock); + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -235,36 +226,18 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, c->blocked_allocate_open_bucket = local_clock(); spin_unlock(&c->freelist_lock); - trace_open_bucket_alloc_fail(ca, reserve); + + trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]); return ERR_PTR(-OPEN_BUCKETS_EMPTY); } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) - goto out; - - switch (reserve) { - case RESERVE_BTREE_MOVINGGC: - case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) - goto out; - break; - default: - break; + /* Recheck under lock: */ + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + spin_unlock(&c->freelist_lock); + (*skipped_open)++; + return NULL; } - if (cl) - closure_wait(&c->freelist_wait, cl); - - if 
(!c->blocked_allocate) - c->blocked_allocate = local_clock(); - - spin_unlock(&c->freelist_lock); - - trace_bucket_alloc_fail(ca, reserve); - return ERR_PTR(-FREELIST_EMPTY); -out: - verify_not_on_freelist(c, ca, b); - ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); @@ -273,8 +246,8 @@ out: ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->dev = ca->dev_idx; - ob->gen = *bucket_gen(ca, b); - ob->bucket = b; + ob->gen = a->gen; + ob->bucket = bucket; spin_unlock(&ob->lock); ca->nr_open_buckets++; @@ -296,9 +269,283 @@ out: spin_unlock(&c->freelist_lock); - bch2_wake_allocator(ca); + trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); + return ob; +} + +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, + enum alloc_reserve reserve, u64 free_entry, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob; + struct bch_alloc_v4 a; + u64 b = free_entry & ~(~0ULL << 56); + unsigned genbits = free_entry >> 56; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + bch2_alloc_to_v4(k, &a); + + if (bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c, + "non free bucket in freespace btree (state %s)\n" + " %s\n" + " at %llu (genbits %u)", + bch2_bucket_states[bucket_state(a)], + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + free_entry, genbits)) { + ob = ERR_PTR(-EIO); + goto err; + } + + if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c, + "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " %s", + genbits, alloc_freespace_genbits(a) >> 56, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ob = ERR_PTR(-EIO); + goto err; + } + + if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c, + "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)", + b, ca->mi.first_bucket, ca->mi.nbuckets)) { + ob = ERR_PTR(-EIO); + goto err; + } + + ob = __try_alloc_bucket(c, ca, b, reserve, &a, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl); +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ob; +} + +static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve) +{ + struct open_bucket *ob; + int i; + + spin_lock(&c->freelist_lock); + + for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { + ob = c->open_buckets + ca->open_buckets_partial[i]; + + if (reserve <= ob->alloc_reserve) { + array_remove_item(ca->open_buckets_partial, + ca->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + ob->alloc_reserve = reserve; + spin_unlock(&c->freelist_lock); + return ob; + } + } + + spin_unlock(&c->freelist_lock); + return NULL; +} + +/* + * This path is for before the freespace btree is initialized: + * + * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & + * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ +static noinline struct open_bucket * +bch2_bucket_alloc_trans_early(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *cur_bucket, + u64 *buckets_seen, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + 
u64 *skipped_nouse, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + int ret; + + *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); + *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); + + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), + BTREE_ITER_SLOTS, k, ret) { + struct bch_alloc_v4 a; + + if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; + + if (ca->new_fs_bucket_idx && + is_superblock_bucket(ca, k.k->p.offset)) + continue; + + bch2_alloc_to_v4(k, &a); + + if (bucket_state(a) != BUCKET_free) + continue; + + (*buckets_seen)++; + + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl); + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + + *cur_bucket = iter.pos.offset; + + return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); +} + +static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *cur_bucket, + u64 *buckets_seen, + u64 *skipped_open, + u64 *skipped_need_journal_commit, + u64 *skipped_nouse, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + int ret; + + if (unlikely(!ca->mi.freespace_initialized)) + return bch2_bucket_alloc_trans_early(trans, ca, reserve, + cur_bucket, + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl); + + BUG_ON(ca->new_fs_bucket_idx); + + for_each_btree_key(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, *cur_bucket), 0, k, ret) { + if (k.k->p.inode != ca->dev_idx) + break; + + for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); + *cur_bucket != k.k->p.offset && !ob; + (*cur_bucket)++) { + if (btree_trans_too_many_iters(trans)) { + ob = ERR_PTR(-EINTR); + break; + } + + (*buckets_seen)++; + + ob = try_alloc_bucket(trans, ca, reserve, + *cur_bucket, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl); + } + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ob ?: ERR_PTR(ret); +} + +/** + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure + * */ +struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ + struct open_bucket *ob = NULL; + u64 avail = dev_buckets_available(ca, reserve); + u64 cur_bucket = 0; + u64 buckets_seen = 0; + u64 skipped_open = 0; + u64 skipped_need_journal_commit = 0; + u64 skipped_nouse = 0; + int ret; + + if (may_alloc_partial) { + ob = try_alloc_partial_bucket(c, ca, reserve); + if (ob) + return ob; + } +again: + if (!avail) { + if (cl) { + closure_wait(&c->freelist_wait, cl); + /* recheck after putting ourself on waitlist */ + avail = dev_buckets_available(ca, reserve); + if (avail) { + closure_wake_up(&c->freelist_wait); + goto again; + } + } + + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + + ob = ERR_PTR(-FREELIST_EMPTY); + goto err; + } + + ret = bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, + &cur_bucket, + &buckets_seen, + &skipped_open, + &skipped_need_journal_commit, + &skipped_nouse, + cl))); + + if (skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); +err: + if (!ob) + ob = ERR_PTR(ret ?: -FREELIST_EMPTY); + + if 
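/*
 * bch2_bucket_alloc() above registers on c->freelist_wait with closure_wait()
 * and then re-reads dev_buckets_available(): the recheck closes the window in
 * which a bucket is freed between the first check and the registration, and if
 * it hits, the thread wakes the waitlist itself and retries.  The same
 * discipline in a userspace analogy (pthreads instead of closures; this is an
 * analogy, not how bcachefs implements it):
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  freelist_wait = PTHREAD_COND_INITIALIZER;
static unsigned long   avail;           /* stand-in for dev_buckets_available() */

static void *allocator(void *arg)
{
        pthread_mutex_lock(&lock);
        /* Re-test the predicate after registering as a waiter, so a bucket
         * freed concurrently is never missed: */
        while (!avail)
                pthread_cond_wait(&freelist_wait, &lock);
        avail--;
        pthread_mutex_unlock(&lock);
        printf("got a bucket\n");
        return NULL;
}

static void free_bucket(void)
{
        pthread_mutex_lock(&lock);
        avail++;
        pthread_cond_broadcast(&freelist_wait);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, allocator, NULL);
        free_bucket();
        pthread_join(t, NULL);
        return 0;
}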
(IS_ERR(ob)) { + trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail, + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl == NULL, PTR_ERR(ob)); + atomic_long_inc(&c->bucket_alloc_fail); + } - trace_bucket_alloc(ca, reserve); return ob; } @@ -329,7 +576,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, struct dev_stripe_state *stripe) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca); + u64 free_space = dev_buckets_available(ca, RESERVE_none); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -380,6 +627,7 @@ int bch2_bucket_alloc_set(struct bch_fs *c, { struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + unsigned dev; struct bch_dev *ca; int ret = -INSUFFICIENT_DEVICES; unsigned i; @@ -389,30 +637,43 @@ int bch2_bucket_alloc_set(struct bch_fs *c, for (i = 0; i < devs_sorted.nr; i++) { struct open_bucket *ob; - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + dev = devs_sorted.devs[i]; + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + if (!ca) continue; - if (!ca->mi.durability && *have_cache) + if (!ca->mi.durability && *have_cache) { + percpu_ref_put(&ca->ref); continue; + } ob = bch2_bucket_alloc(c, ca, reserve, flags & BUCKET_MAY_ALLOC_PARTIAL, cl); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment(ca, stripe); + percpu_ref_put(&ca->ref); + if (IS_ERR(ob)) { ret = PTR_ERR(ob); if (cl) - return ret; + break; continue; } add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); - bch2_dev_stripe_increment(ca, stripe); - - if (*nr_effective >= nr_replicas) - return 0; + if (*nr_effective >= nr_replicas) { + ret = 0; + break; + } } return ret; @@ -580,9 +841,6 @@ static int open_bucket_add_buckets(struct bch_fs *c, if (*nr_effective >= nr_replicas) return 0; - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from @@ -596,9 +854,6 @@ retry_blocking: goto retry_blocking; } - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return ret; } @@ -857,8 +1112,6 @@ alloc_done: BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - verify_not_stale(c, &wp->ptrs); - return wp; err: open_bucket_for_each(c, &wp->ptrs, ob, i) @@ -881,7 +1134,7 @@ err: case -INSUFFICIENT_DEVICES: return ERR_PTR(-EROFS); default: - BUG(); + return ERR_PTR(ret); } } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index d466bda9afc8..8bc78877f0fc 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -12,6 +12,8 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +extern const char * const bch2_alloc_reserves[]; + struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -115,6 +117,20 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke return false; } +static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) +{ + bool ret; + + if (bch2_bucket_is_open(c, dev, bucket)) + return true; + + spin_lock(&c->freelist_lock); + ret = bch2_bucket_is_open(c, dev, bucket); + spin_unlock(&c->freelist_lock); + + return ret; +} + int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h 
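/*
 * bch2_dev_stripe_increment() above turns a device's free space into a
 * fixed-point inverse, 2^48 / free_space, so a device with more room adds a
 * smaller amount to its stripe counter and is preferred by the rotation.  The
 * selection policy itself is not in this hunk; the sketch below only shows the
 * increment arithmetic:
 */
#include <stdint.h>
#include <stdio.h>

/* 2^48 / free_space, saturating when the device has no free buckets at all */
static uint64_t stripe_increment(uint64_t free_space)
{
        return free_space ? (1ULL << 48) / free_space : 1ULL << 48;
}

int main(void)
{
        /* Twice the free space -> half the increment -> chosen about twice
         * as often by a "smallest running total wins" rotation. */
        printf("%llu\n", (unsigned long long) stripe_increment(1000));
        printf("%llu\n", (unsigned long long) stripe_increment(2000));
        return 0;
}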
index 409232e3d998..21b56451bc18 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -10,28 +10,18 @@ struct ec_bucket_buf; -#define ALLOC_THREAD_STATES() \ - x(stopped) \ - x(running) \ - x(blocked) \ - x(blocked_full) - -enum allocator_states { -#define x(n) ALLOCATOR_##n, - ALLOC_THREAD_STATES() -#undef x -}; +#define BCH_ALLOC_RESERVES() \ + x(btree_movinggc) \ + x(btree) \ + x(movinggc) \ + x(none) enum alloc_reserve { - RESERVE_BTREE_MOVINGGC = -2, - RESERVE_BTREE = -1, - RESERVE_MOVINGGC = 0, - RESERVE_NONE = 1, - RESERVE_NR = 2, +#define x(name) RESERVE_##name, + BCH_ALLOC_RESERVES() +#undef x }; -typedef FIFO(long) alloc_fifo; - #define OPEN_BUCKETS_COUNT 1024 #define WRITE_POINT_HASH_NR 32 @@ -94,12 +84,4 @@ struct write_point_specifier { unsigned long v; }; -struct alloc_heap_entry { - size_t bucket; - size_t nr; - unsigned long key; -}; - -typedef HEAP(struct alloc_heap_entry) alloc_heap; - #endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3ada85ac09c6..a13845a23387 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -177,7 +177,11 @@ */ #undef pr_fmt +#ifdef __KERNEL__ #define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ +#else +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ +#endif #include <linux/backing-dev-defs.h> #include <linux/bug.h> @@ -219,8 +223,8 @@ #define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else -#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) -#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) +#define bch2_fmt(_c, fmt) fmt "\n" +#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) #endif #define bch_info(c, fmt, ...) 
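/*
 * The alloc_types.h hunk below replaces the hand-numbered reserve enum with a
 * BCH_ALLOC_RESERVES() x-macro, and alloc_foreground.h (earlier in this patch)
 * exports a matching bch2_alloc_reserves[] name table.  A standalone sketch of
 * the pattern; RESERVE_NR and the local array name are additions for the demo,
 * only the enum expansion appears in the patch itself:
 */
#include <stdio.h>

#define BCH_ALLOC_RESERVES()    \
        x(btree_movinggc)       \
        x(btree)                \
        x(movinggc)             \
        x(none)

/* One list, expanded twice: once into an enum, once into a string table */
enum alloc_reserve {
#define x(name) RESERVE_##name,
        BCH_ALLOC_RESERVES()
#undef x
        RESERVE_NR
};

static const char * const alloc_reserve_names[] = {
#define x(name) #name,
        BCH_ALLOC_RESERVES()
#undef x
        NULL
};

int main(void)
{
        printf("%s = %d\n", alloc_reserve_names[RESERVE_movinggc], RESERVE_movinggc);
        return 0;
}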
\ @@ -277,9 +281,6 @@ do { \ "significantly affect performance") \ BCH_DEBUG_PARAM(debug_check_iterators, \ "Enables extra verification for btree iterators") \ - BCH_DEBUG_PARAM(debug_check_bkeys, \ - "Run bkey_debugcheck (primarily checking GC/allocation "\ - "information) when iterating over keys") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(journal_seq_verify, \ @@ -351,6 +352,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" #include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" #include "clock_types.h" #include "ec_types.h" #include "journal_types.h" @@ -389,6 +391,10 @@ enum gc_phase { GC_PHASE_BTREE_reflink, GC_PHASE_BTREE_subvolumes, GC_PHASE_BTREE_snapshots, + GC_PHASE_BTREE_lru, + GC_PHASE_BTREE_freespace, + GC_PHASE_BTREE_need_discard, + GC_PHASE_BTREE_backpointers, GC_PHASE_PENDING_DELETE, }; @@ -432,6 +438,7 @@ struct bch_dev { struct bch_sb_handle disk_sb; struct bch_sb *sb_read_scratch; int sb_write_error; + dev_t dev; struct bch_devs_mask self; @@ -444,8 +451,9 @@ struct bch_dev { * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ - struct bucket_array __rcu *buckets[2]; - struct bucket_gens *bucket_gens; + struct bucket_array __rcu *buckets_gc; + struct bucket_gens __rcu *bucket_gens; + u8 *oldest_gen; unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; @@ -455,32 +463,16 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - struct task_struct __rcu *alloc_thread; - /* - * free: Buckets that are ready to be used - * - * free_inc: Incoming buckets - these are buckets that currently have - * cached data in them, and we can't reuse them until after we write - * their new gen to disk. 
After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) - */ - alloc_fifo free[RESERVE_NR]; - alloc_fifo free_inc; unsigned nr_open_buckets; + unsigned nr_btree_reserve; open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_partial_nr; - size_t fifo_last_bucket; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; - - enum allocator_states allocator_state; - - alloc_heap alloc_heap; + size_t buckets_waiting_on_journal; atomic64_t rebalance_work; @@ -502,17 +494,13 @@ struct bch_dev { enum { /* startup: */ - BCH_FS_INITIALIZED, - BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOC_CLEAN, - BCH_FS_ALLOCATOR_RUNNING, - BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, + BCH_FS_MAY_GO_RW, BCH_FS_RW, BCH_FS_WAS_RW, @@ -530,16 +518,11 @@ enum { /* misc: */ BCH_FS_NEED_ANOTHER_GC, BCH_FS_DELETED_NODES, - BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, - BCH_FS_HOLD_BTREE_WRITES, }; struct btree_debug { unsigned id; - struct dentry *btree; - struct dentry *btree_format; - struct dentry *failed; }; struct bch_fs_pcpu { @@ -560,6 +543,7 @@ struct journal_keys { enum btree_id btree_id:8; unsigned level:8; bool allocated; + bool overwritten; struct bkey_i *k; u32 journal_seq; u32 journal_offset; @@ -666,7 +650,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct work_struct snapshot_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work; - struct snapshot_id_list snapshots_unlinked; + snapshot_id_list snapshots_unlinked; struct mutex snapshots_unlinked_lock; /* BTREE CACHE */ @@ -709,6 +693,7 @@ struct bch_fs { bool btree_trans_barrier_initialized; struct btree_key_cache btree_key_cache; + unsigned btree_key_cache_btrees; struct workqueue_struct *btree_update_wq; struct workqueue_struct *btree_io_complete_wq; @@ -750,6 +735,7 @@ struct bch_fs { /* JOURNAL SEQ BLACKLIST */ struct journal_seq_blacklist_table * journal_seq_blacklist_table; + struct work_struct journal_seq_blacklist_gc_work; /* ALLOCATOR */ spinlock_t freelist_lock; @@ -771,6 +757,10 @@ struct bch_fs { struct mutex write_points_hash_lock; unsigned write_points_nr; + struct buckets_waiting_for_journal buckets_waiting_for_journal; + struct work_struct discard_work; + struct work_struct invalidate_work; + /* GARBAGE COLLECTION */ struct task_struct *gc_thread; atomic_t kick_gc; @@ -796,6 +786,7 @@ struct bch_fs { * it's not while a gc is in progress. 
*/ struct rw_semaphore gc_lock; + struct mutex gc_gens_lock; /* IO PATH */ struct semaphore io_in_flight; @@ -858,7 +849,6 @@ struct bch_fs { u64 reflink_hint; reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; - size_t reflink_gc_idx; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; @@ -879,7 +869,8 @@ struct bch_fs { struct bch_memquota_type quotas[QTYP_NR]; /* DEBUG JUNK */ - struct dentry *debug; + struct dentry *fs_debug_dir; + struct dentry *btree_debug_dir; struct btree_debug btree_debug[BTREE_ID_NR]; struct btree *verify_data; struct btree_node *verify_ondisk; @@ -907,6 +898,7 @@ struct bch_fs { atomic_long_t read_realloc_races; atomic_long_t extent_migrate_done; atomic_long_t extent_migrate_raced; + atomic_long_t bucket_alloc_fail; unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; @@ -943,6 +935,11 @@ static inline size_t btree_sectors(const struct bch_fs *c) return c->opts.btree_node_size >> 9; } +static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) +{ + return c->btree_key_cache_btrees & (1U << btree); +} + static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { struct timespec64 t; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a053fca7886d..8312018e1ed5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -76,6 +76,22 @@ #include <asm/byteorder.h> #include <linux/kernel.h> #include <linux/uuid.h> +#include "vstructs.h" + +#define BITMASK(name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ +static const unsigned name##_BITS = (end - offset); \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (k->field >> offset) & ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << (end - offset)) << offset); \ + k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ +} #define LE_BITMASK(_bits, name, type, field, offset, end) \ static const unsigned name##_OFFSET = offset; \ @@ -346,7 +362,10 @@ static inline void bkey_init(struct bkey *k) x(subvolume, 21) \ x(snapshot, 22) \ x(inode_v2, 23) \ - x(alloc_v3, 24) + x(alloc_v3, 24) \ + x(set, 25) \ + x(lru, 26) \ + x(alloc_v4, 27) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -376,6 +395,10 @@ struct bch_hash_whiteout { struct bch_val v; }; +struct bch_set { + struct bch_val v; +}; + /* Extents */ /* @@ -876,8 +899,8 @@ struct bch_alloc_v2 { #define BCH_ALLOC_FIELDS_V2() \ x(read_time, 64) \ x(write_time, 64) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ x(stripe, 32) \ x(stripe_redundancy, 8) @@ -892,11 +915,34 @@ struct bch_alloc_v3 { __u8 data[]; } __attribute__((packed, aligned(8))); +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + struct bpos backpointers[0]; +} __attribute__((packed, aligned(8))); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) 
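/*
 * bcachefs_format.h above adds a BITMASK() macro for bitfields stored in
 * native-endian words (used here for bch_alloc_v4.flags), alongside the
 * existing LE_BITMASK() variants.  A standalone copy with __u64 spelled as
 * uint64_t and a made-up demo struct:
 */
#include <stdint.h>
#include <stdio.h>

#define BITMASK(name, type, field, offset, end)                         \
static const unsigned name##_OFFSET = offset;                           \
static const unsigned name##_BITS = (end - offset);                     \
                                                                        \
static inline uint64_t name(const type *k)                              \
{                                                                       \
        return (k->field >> offset) & ~(~0ULL << (end - offset));       \
}                                                                       \
                                                                        \
static inline void SET_##name(type *k, uint64_t v)                      \
{                                                                       \
        k->field &= ~(~(~0ULL << (end - offset)) << offset);            \
        k->field |= (v & ~(~0ULL << (end - offset))) << offset;         \
}

struct demo_alloc { uint64_t flags; };

BITMASK(DEMO_NEED_DISCARD,       struct demo_alloc, flags, 0, 1)
BITMASK(DEMO_BACKPOINTERS_START, struct demo_alloc, flags, 2, 8)

int main(void)
{
        struct demo_alloc a = { 0 };

        SET_DEMO_BACKPOINTERS_START(&a, 5);
        SET_DEMO_NEED_DISCARD(&a, 1);
        printf("flags=%#llx start=%llu\n",
               (unsigned long long) a.flags,
               (unsigned long long) DEMO_BACKPOINTERS_START(&a));
        return 0;
}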
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + enum { #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, BCH_ALLOC_FIELDS_V1() #undef x - BCH_ALLOC_FIELD_NR }; /* Quotas: */ @@ -1014,6 +1060,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +/* LRU btree: */ + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __attribute__((packed, aligned(8))); + +#define LRU_ID_STRIPES (1U << 16) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1022,16 +1077,17 @@ struct bch_sb_field { __le32 type; }; -#define BCH_SB_FIELDS() \ - x(journal, 0) \ - x(members, 1) \ - x(crypt, 2) \ - x(replicas_v0, 3) \ - x(quota, 4) \ - x(disk_groups, 5) \ - x(clean, 6) \ - x(replicas, 7) \ - x(journal_seq_blacklist, 8) +#define BCH_SB_FIELDS() \ + x(journal, 0) \ + x(members, 1) \ + x(crypt, 2) \ + x(replicas_v0, 3) \ + x(quota, 4) \ + x(disk_groups, 5) \ + x(clean, 6) \ + x(replicas, 7) \ + x(journal_seq_blacklist, 8) \ + x(journal_v2, 9) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1040,6 +1096,14 @@ enum bch_sb_field_type { BCH_SB_FIELD_NR }; +/* + * Most superblock fields are replicated in all device's superblocks - a few are + * not: + */ +#define BCH_SINGLE_DEVICE_SB_FIELDS \ + ((1U << BCH_SB_FIELD_journal)| \ + (1U << BCH_SB_FIELD_journal_v2)) + /* BCH_SB_FIELD_journal: */ struct bch_sb_field_journal { @@ -1047,6 +1111,15 @@ struct bch_sb_field_journal { __le64 buckets[0]; }; +struct bch_sb_field_journal_v2 { + struct bch_sb_field field; + + struct bch_sb_field_journal_v2_entry { + __le64 start; + __le64 nr; + } d[0]; +}; + /* BCH_SB_FIELD_members: */ #define BCH_MIN_NR_NBUCKETS (1 << 6) @@ -1068,6 +1141,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags[0], 30, 31) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); @@ -1274,19 +1349,25 @@ struct bch_sb_field_journal_seq_blacklist { #define BCH_JSET_VERSION_OLD 2 #define BCH_BSET_VERSION_OLD 3 +#define BCH_METADATA_VERSIONS() \ + x(bkey_renumber, 10) \ + x(inode_btree_change, 11) \ + x(snapshot, 12) \ + x(inode_backpointers, 13) \ + x(btree_ptr_sectors_written, 14) \ + x(snapshot_2, 15) \ + x(reflink_p_fix, 16) \ + x(subvol_dirent, 17) \ + x(inode_v2, 18) \ + x(freespace, 19) \ + x(alloc_v4, 20) + enum bcachefs_metadata_version { - bcachefs_metadata_version_min = 9, - bcachefs_metadata_version_new_versioning = 10, - bcachefs_metadata_version_bkey_renumber = 10, - bcachefs_metadata_version_inode_btree_change = 11, - bcachefs_metadata_version_snapshot = 12, - bcachefs_metadata_version_inode_backpointers = 13, - bcachefs_metadata_version_btree_ptr_sectors_written = 14, - bcachefs_metadata_version_snapshot_2 = 15, - bcachefs_metadata_version_reflink_p_fix = 16, - bcachefs_metadata_version_subvol_dirent = 17, - bcachefs_metadata_version_inode_v2 = 18, - bcachefs_metadata_version_max = 19, + bcachefs_metadata_version_min = 9, +#define x(t, n) bcachefs_metadata_version_##t = n, + BCH_METADATA_VERSIONS() +#undef x + bcachefs_metadata_version_max }; #define bcachefs_metadata_version_current 
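/*
 * The new journal_v2 superblock field above describes journal buckets as
 * (start, nr) extents instead of one __le64 per bucket, which is much more
 * compact when the journal occupies contiguous runs.  A userspace sketch of
 * expanding such ranges (plain uint64_t here; the on-disk struct is
 * little-endian and lives inside a superblock field):
 */
#include <stdint.h>
#include <stdio.h>

struct journal_v2_entry {
        uint64_t start;
        uint64_t nr;
};

static void print_journal_buckets(const struct journal_v2_entry *d, unsigned nr_entries)
{
        unsigned i;
        uint64_t j;

        for (i = 0; i < nr_entries; i++)
                for (j = 0; j < d[i].nr; j++)
                        printf("journal bucket %llu\n",
                               (unsigned long long) (d[i].start + j));
}

int main(void)
{
        /* Two extents stand in for seven buckets the old format listed individually */
        struct journal_v2_entry d[] = { { 128, 4 }, { 512, 3 } };

        print_journal_buckets(d, 2);
        return 0;
}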
(bcachefs_metadata_version_max - 1) @@ -1426,6 +1507,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); +LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); /* * Features: @@ -1660,7 +1742,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(usage, 5) \ x(data_usage, 6) \ x(clock, 7) \ - x(dev_usage, 8) + x(dev_usage, 8) \ + x(log, 9) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1690,11 +1773,16 @@ struct jset_entry_blacklist_v2 { __le64 end; }; +#define BCH_FS_USAGE_TYPES() \ + x(reserved, 0) \ + x(inodes, 1) \ + x(key_version, 2) + enum { - FS_USAGE_RESERVED = 0, - FS_USAGE_INODES = 1, - FS_USAGE_KEY_VERSION = 2, - FS_USAGE_NR = 3 +#define x(f, nr) BCH_FS_USAGE_##f = nr, + BCH_FS_USAGE_TYPES() +#undef x + BCH_FS_USAGE_NR }; struct jset_entry_usage { @@ -1732,6 +1820,17 @@ struct jset_entry_dev_usage { struct jset_entry_dev_usage_type d[]; } __attribute__((packed)); +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ + return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); +} + +struct jset_entry_log { + struct jset_entry entry; + u8 d[]; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1785,7 +1884,11 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(stripes, 6) \ x(reflink, 7) \ x(subvolumes, 8) \ - x(snapshots, 9) + x(snapshots, 9) \ + x(lru, 10) \ + x(freespace, 11) \ + x(need_discard, 12) \ + x(backpointers, 13) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 946dd27f09fc..4b01ab3029a2 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -57,11 +57,12 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, tmp = __bch2_bkey_unpack_key(format, packed); if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { - char buf1[160], buf2[160]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; char buf3[160], buf4[160]; - bch2_bkey_to_text(&PBUF(buf1), unpacked); - bch2_bkey_to_text(&PBUF(buf2), &tmp); + bch2_bkey_to_text(&buf1, unpacked); + bch2_bkey_to_text(&buf2, &tmp); bch2_to_binary(buf3, (void *) unpacked, 80); bch2_to_binary(buf4, high_word(format, packed), 80); @@ -72,7 +73,7 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, format->bits_per_field[2], format->bits_per_field[3], format->bits_per_field[4], - buf1, buf2, buf3, buf4); + buf1.buf, buf2.buf, buf3, buf4); } } diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5c900cf8a8a2..0eac86e5e776 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -9,6 +9,7 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "lru.h" #include "quota.h" #include "reflink.h" #include "subvolume.h" @@ -85,6 +86,24 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ } +static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_bytes(k.k)) + return "nonempty value"; + return NULL; +} + +static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, 
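/*
 * jset_entry_dev_usage_nr_types() above derives the number of trailing array
 * elements from the entry's total size: (total - header) / element.  The same
 * arithmetic on a made-up struct, with an ordinary length field standing in
 * for vstruct_bytes():
 */
#include <stdint.h>
#include <stdio.h>

struct demo_entry {
        uint32_t total_bytes;   /* stand-in for vstruct_bytes() */
        uint32_t hdr;
        uint64_t d[];           /* flexible array member */
};

static unsigned demo_entry_nr(const struct demo_entry *e)
{
        return (e->total_bytes - sizeof(struct demo_entry)) / sizeof(e->d[0]);
}

int main(void)
{
        struct demo_entry e = {
                .total_bytes = sizeof(struct demo_entry) + 3 * sizeof(uint64_t),
        };

        printf("%u entries\n", demo_entry_nr(&e));      /* prints 3 */
        return 0;
}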
struct bkey_s_c r) +{ + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +#define bch2_bkey_ops_set (struct bkey_ops) { \ + .key_invalid = key_type_set_invalid, \ + .key_merge = key_type_set_merge, \ +} + const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() @@ -130,7 +149,8 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_alloc)| (1U << KEY_TYPE_alloc_v2)| - (1U << KEY_TYPE_alloc_v3), + (1U << KEY_TYPE_alloc_v3)| + (1U << KEY_TYPE_alloc_v4), [BKEY_TYPE_quotas] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_quota), @@ -147,6 +167,15 @@ static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_snapshots] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_snapshot), + [BKEY_TYPE_lru] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_lru), + [BKEY_TYPE_freespace] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), + [BKEY_TYPE_need_discard] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), [BKEY_TYPE_btree] = (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| @@ -212,22 +241,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) return NULL; } -void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -{ - const char *invalid; - - BUG_ON(!k.k->u64s); - - invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: - bch2_bkey_in_btree_node(b, k); - if (invalid) { - char buf[160]; - - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); - } -} - void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { if (!bpos_cmp(pos, POS_MIN)) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 3012035db1a3..2289a09d98fc 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -6,6 +6,7 @@ struct bch_fs; struct btree; +struct btree_trans; struct bkey; enum btree_node_type; @@ -20,6 +21,10 @@ struct bkey_ops { void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); + int (*trans_trigger)(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); + int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -34,8 +39,6 @@ const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type); const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); -void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); - void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); void bch2_val_to_text(struct printbuf *, struct bch_fs *, @@ -59,6 +62,28 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); +static inline int bch2_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, + unsigned flags) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; + + return ops->atomic_trigger + ? ops->atomic_trigger(trans, old, new, flags) + : 0; +} + +static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, + struct bkey_i *new, unsigned flags) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; + + return ops->trans_trigger + ? 
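/*
 * bkey_methods.h above routes marking through optional per-key-type trigger
 * hooks in bch2_bkey_ops[], falling back to a no-op when a type has none, and
 * picks the type as old.k->type ?: new.k->type.  A sketch of that dispatch
 * with made-up names and the GNU ?: shorthand written out longhand:
 */
#include <stdio.h>

struct demo_key { unsigned type; };

struct demo_ops {
        int (*atomic_trigger)(const struct demo_key *);
};

static int alloc_trigger(const struct demo_key *k)
{
        printf("trigger for key type %u\n", k->type);
        return 0;
}

static const struct demo_ops demo_bkey_ops[] = {
        [1] = { .atomic_trigger = alloc_trigger },
        [2] = { /* this type has no accounting side effects */ },
};

static int demo_mark_key(const struct demo_key *old, const struct demo_key *new)
{
        unsigned type = old->type ? old->type : new->type;
        const struct demo_ops *ops = &demo_bkey_ops[type];

        return ops->atomic_trigger ? ops->atomic_trigger(old) : 0;
}

int main(void)
{
        struct demo_key old = { 1 }, new = { 1 };

        return demo_mark_key(&old, &new);
}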
ops->trans_trigger(trans, old, new, flags) + : 0; +} + void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 59e4c1d1a2a5..c7a41d0dc781 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -70,7 +70,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, struct bkey_packed *_k, *_n; struct bkey uk, n; struct bkey_s_c k; - char buf[200]; + struct printbuf buf = PRINTBUF; if (!i->u64s) return; @@ -81,12 +81,14 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, _n = bkey_next(_k); k = bkey_disassemble(b, _k, &uk); + + printbuf_reset(&buf); if (c) - bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_bkey_val_to_text(&buf, c, k); else - bch2_bkey_to_text(&PBUF(buf), k.k); + bch2_bkey_to_text(&buf, k.k); printk(KERN_ERR "block %u key %5zu: %s\n", set, - _k->_data - i->_data, buf); + _k->_data - i->_data, buf.buf); if (_n == vstruct_last(i)) continue; @@ -102,6 +104,8 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, !bpos_cmp(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } + + printbuf_exit(&buf); } void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) @@ -118,6 +122,7 @@ void bch2_dump_btree_node_iter(struct btree *b, struct btree_node_iter *iter) { struct btree_node_iter_set *set; + struct printbuf buf = PRINTBUF; printk(KERN_ERR "btree node iter with %u/%u sets:\n", __btree_node_iter_used(iter), b->nsets); @@ -126,12 +131,14 @@ void bch2_dump_btree_node_iter(struct btree *b, struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); struct bset_tree *t = bch2_bkey_to_bset(b, k); struct bkey uk = bkey_unpack_key(b, k); - char buf[100]; - bch2_bkey_to_text(&PBUF(buf), &uk); + printbuf_reset(&buf); + bch2_bkey_to_text(&buf, &uk); printk(KERN_ERR "set %zu key %u: %s\n", - t - b->set, set->k, buf); + t - b->set, set->k, buf.buf); } + + printbuf_exit(&buf); } #ifdef CONFIG_BCACHEFS_DEBUG @@ -167,13 +174,14 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, struct btree_node_iter_set *set; struct bkey ku = bkey_unpack_key(b, k); struct bkey nu = bkey_unpack_key(b, n); - char buf1[80], buf2[80]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &ku); - bch2_bkey_to_text(&PBUF(buf2), &nu); + bch2_bkey_to_text(&buf1, &ku); + bch2_bkey_to_text(&buf2, &nu); printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", - buf1, buf2); + buf1.buf, buf2.buf); printk(KERN_ERR "iter was:"); btree_node_iter_for_each(_iter, set) { @@ -238,6 +246,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); struct bkey_packed *next = (void *) (where->_data + clobber_u64s); + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; #if 0 BUG_ON(prev && bkey_iter_cmp(b, prev, insert) > 0); @@ -246,17 +256,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bkey_iter_cmp(b, prev, insert) > 0) { struct bkey k1 = bkey_unpack_key(b, prev); struct bkey k2 = bkey_unpack_key(b, insert); - char buf1[100]; - char buf2[100]; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &k1); - bch2_bkey_to_text(&PBUF(buf2), &k2); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); panic("prev > insert:\n" "prev key %s\n" "insert key %s\n", - buf1, buf2); + 
buf1.buf, buf2.buf); } #endif #if 0 @@ -267,17 +275,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bkey_iter_cmp(b, insert, next) > 0) { struct bkey k1 = bkey_unpack_key(b, insert); struct bkey k2 = bkey_unpack_key(b, next); - char buf1[100]; - char buf2[100]; bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&PBUF(buf1), &k1); - bch2_bkey_to_text(&PBUF(buf2), &k2); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); panic("insert > next:\n" "insert key %s\n" "next key %s\n", - buf1, buf2); + buf1.buf, buf2.buf); } #endif } @@ -473,7 +479,7 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b, unsigned j) { return cacheline_to_bkey(b, t, - __eytzinger1_to_inorder(j, t->size, t->extra), + __eytzinger1_to_inorder(j, t->size - 1, t->extra), bkey_float(b, t, j)->key_offset); } @@ -607,10 +613,10 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, } __always_inline -static inline void __make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) +static inline void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); @@ -679,34 +685,6 @@ static inline void __make_bfloat(struct btree *b, struct bset_tree *t, f->mantissa = mantissa; } -static void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) -{ - struct bkey_i *k; - - if (is_power_of_2(j) && - !min_key->u64s) { - if (!bkey_pack_pos(min_key, b->data->min_key, b)) { - k = (void *) min_key; - bkey_init(&k->k); - k->k.p = b->data->min_key; - } - } - - if (is_power_of_2(j + 1) && - !max_key->u64s) { - if (!bkey_pack_pos(max_key, b->data->max_key, b)) { - k = (void *) max_key; - bkey_init(&k->k); - k->k.p = b->data->max_key; - } - } - - __make_bfloat(b, t, j, min_key, max_key); -} - /* bytes remaining - only valid for last bset: */ static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { @@ -763,7 +741,7 @@ retry: t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; /* First we figure out where the first key in each cacheline is */ - eytzinger1_for_each(j, t->size) { + eytzinger1_for_each(j, t->size - 1) { while (bkey_to_cacheline(b, t, k) < cacheline) prev = k, k = bkey_next(k); @@ -795,10 +773,10 @@ retry: } /* Then we build the tree */ - eytzinger1_for_each(j, t->size) - __make_bfloat(b, t, j, - bkey_to_packed(&min_key), - bkey_to_packed(&max_key)); + eytzinger1_for_each(j, t->size - 1) + make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); } static void bset_alloc_tree(struct btree *b, struct bset_tree *t) @@ -897,7 +875,7 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, do { p = j ? 
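/*
 * Several bset.c hunks in this patch adjust the size argument passed to the
 * eytzinger helpers.  The auxiliary search tree is a binary search tree stored
 * in 1-indexed BFS ("eytzinger") order: node j's children are 2j and 2j+1,
 * which is what the n = n * 2 + (cmp < 0) descent further down relies on.  A
 * simplified standalone version for a full tree of 7 keys; the real helpers
 * also handle incomplete trees (the "extra" parameter) and compressed
 * bkey_float keys, which this sketch sidesteps:
 */
#include <stddef.h>
#include <stdio.h>

/* Fill eytz[1..n] from sorted[0..n-1] by in-order traversal of the implicit tree */
static size_t eytzinger1_fill(const int *sorted, size_t i, int *eytz, size_t j, size_t n)
{
        if (j <= n) {
                i = eytzinger1_fill(sorted, i, eytz, 2 * j, n);
                eytz[j] = sorted[i++];
                i = eytzinger1_fill(sorted, i, eytz, 2 * j + 1, n);
        }
        return i;
}

/* Descend the implicit tree looking for the smallest key >= the search key */
static size_t eytzinger1_search_ge(const int *eytz, size_t n, int key)
{
        size_t j = 1, found = 0;

        while (j <= n) {
                if (eytz[j] >= key) {
                        found = j;              /* candidate; try to go smaller */
                        j = 2 * j;
                } else {
                        j = 2 * j + 1;
                }
        }
        return found;                           /* 0 if every key is < key */
}

int main(void)
{
        int sorted[] = { 1, 3, 5, 7, 9, 11, 13 };
        size_t n = sizeof(sorted) / sizeof(sorted[0]);
        int eytz[8];
        size_t j;

        eytzinger1_fill(sorted, 0, eytz, 1, n);

        j = eytzinger1_search_ge(eytz, n, 6);
        printf("smallest key >= 6 is %d at node %zu\n", eytz[j], j);
        return 0;
}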
tree_to_bkey(b, t, __inorder_to_eytzinger1(j--, - t->size, t->extra)) + t->size - 1, t->extra)) : btree_bkey_first(b, t); } while (p >= k); break; @@ -943,91 +921,6 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, /* Insert */ -static void rw_aux_tree_fix_invalidated_key(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - unsigned offset = __btree_node_key_to_offset(b, k); - unsigned j = rw_aux_tree_bsearch(b, t, offset); - - if (j < t->size && - rw_aux_tree(b, t)[j].offset == offset) - rw_aux_tree_set(b, t, j, k); - - bch2_bset_verify_rw_aux_tree(b, t); -} - -static void ro_aux_tree_fix_invalidated_key(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed min_key, max_key; - unsigned inorder, j; - - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - /* signal to make_bfloat() that they're uninitialized: */ - min_key.u64s = max_key.u64s = 0; - - if (bkey_next(k) == btree_bkey_last(b, t)) { - for (j = 1; j < t->size; j = j * 2 + 1) - make_bfloat(b, t, j, &min_key, &max_key); - } - - inorder = bkey_to_cacheline(b, t, k); - - if (inorder && - inorder < t->size) { - j = __inorder_to_eytzinger1(inorder, t->size, t->extra); - - if (k == tree_to_bkey(b, t, j)) { - /* Fix the node this key corresponds to */ - make_bfloat(b, t, j, &min_key, &max_key); - - /* Children for which this key is the right boundary */ - for (j = eytzinger1_left_child(j); - j < t->size; - j = eytzinger1_right_child(j)) - make_bfloat(b, t, j, &min_key, &max_key); - } - } - - if (inorder + 1 < t->size) { - j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); - - if (k == tree_to_prev_bkey(b, t, j)) { - make_bfloat(b, t, j, &min_key, &max_key); - - /* Children for which this key is the left boundary */ - for (j = eytzinger1_right_child(j); - j < t->size; - j = eytzinger1_left_child(j)) - make_bfloat(b, t, j, &min_key, &max_key); - } - } -} - -/** - * bch2_bset_fix_invalidated_key() - given an existing key @k that has been - * modified, fix any auxiliary search tree by remaking all the nodes in the - * auxiliary search tree that @k corresponds to - */ -void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, k); - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - break; - case BSET_RO_AUX_TREE: - ro_aux_tree_fix_invalidated_key(b, t, k); - break; - case BSET_RW_AUX_TREE: - rw_aux_tree_fix_invalidated_key(b, t, k); - break; - } -} - static void bch2_bset_fix_lookup_table(struct btree *b, struct bset_tree *t, struct bkey_packed *_where, @@ -1262,7 +1155,7 @@ slowpath: n = n * 2 + (cmp < 0); } while (n < t->size); - inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); + inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); /* * n would have been the node we recursed to - the low bit tells us if @@ -1273,7 +1166,7 @@ slowpath: if (unlikely(!inorder)) return btree_bkey_first(b, t); - f = &base->f[eytzinger1_prev(n >> 1, t->size)]; + f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; } return cacheline_to_bkey(b, t, inorder, f->key_offset); @@ -1547,10 +1440,6 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, EBUG_ON(iter->data->k > iter->data->end); - while (!__btree_node_iter_set_end(iter, 0) && - !__bch2_btree_node_iter_peek_all(iter, b)->u64s) - iter->data->k++; - if (unlikely(__btree_node_iter_set_end(iter, 0))) { bch2_btree_node_iter_set_drop(iter, iter->data); return; @@ -1684,9 +1573,6 @@ void 
bch2_bfloat_to_text(struct printbuf *out, struct btree *b, struct bkey uk; unsigned j, inorder; - if (out->pos != out->end) - *out->pos = '\0'; - if (!bset_has_ro_aux_tree(t)) return; @@ -1694,7 +1580,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, if (!inorder || inorder >= t->size) return; - j = __inorder_to_eytzinger1(inorder, t->size, t->extra); + j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); if (k != tree_to_bkey(b, t, j)) return; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index e42f866cf2ec..0d46534c3dcd 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -361,7 +361,6 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); void bch2_bset_insert(struct btree *, struct btree_node_iter *, struct bkey_packed *, struct bkey_i *, unsigned); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 3411d5a02203..0dcdc30c6888 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -15,6 +15,13 @@ struct lock_class_key bch2_btree_node_lock_key; +const char * const bch2_btree_node_flags[] = { +#define x(f) #f, + BTREE_FLAGS() +#undef x + NULL +}; + void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; @@ -35,6 +42,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc) return max_t(int, 0, bc->used - bc->reserve); } +static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) +{ + if (b->c.lock.readers) + list_move(&b->list, &bc->freed_pcpu); + else + list_move(&b->list, &bc->freed_nonpcpu); +} + static void btree_node_data_free(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; @@ -51,7 +66,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) b->aux_data = NULL; bc->used--; - list_move(&b->list, &bc->freed); + + btree_node_to_freedlist(bc, b); } static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -83,6 +99,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = mmap(NULL, btree_aux_data_bytes(b), PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); + if (b->aux_data == MAP_FAILED) + b->aux_data = NULL; #endif if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); @@ -154,11 +172,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, b->c.level = level; b->c.btree_id = id; - if (level) - six_lock_pcpu_alloc(&b->c.lock); - else - six_lock_pcpu_free_rcu(&b->c.lock); - mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); if (!ret) @@ -215,15 +228,13 @@ wait_on_io: goto wait_on_io; } - if (btree_node_noevict(b)) - goto out_unlock; - - if (!btree_node_may_write(b)) + if (btree_node_noevict(b) || + btree_node_write_blocked(b) || + btree_node_will_make_reachable(b)) goto out_unlock; if (btree_node_dirty(b)) { - if (!flush || - test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + if (!flush) goto out_unlock; /* * Using the underscore version because we don't want to compact @@ -232,9 +243,9 @@ wait_on_io: * the post write cleanup: */ if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent); + bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); else - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, 0); six_unlock_write(&b->c.lock); 
six_unlock_intent(&b->c.lock); @@ -274,6 +285,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long touched = 0; unsigned long freed = 0; unsigned i, flags; + unsigned long ret = SHRINK_STOP; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; @@ -282,7 +294,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) - return -1; + goto out_norestore; flags = memalloc_nofs_save(); @@ -299,13 +311,19 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, i = 0; list_for_each_entry_safe(b, t, &bc->freeable, list) { + /* + * Leave a few nodes on the freeable list, so that a btree split + * won't have to hit the system allocator: + */ + if (++i <= 3) + continue; + touched++; if (touched >= nr) break; - if (++i > 3 && - !btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -314,17 +332,13 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, } restart: list_for_each_entry_safe(b, t, &bc->live, list) { - touched++; - - if (touched >= nr) { - /* Save position */ - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); - break; + /* tweak this */ + if (btree_node_accessed(b)) { + clear_btree_node_accessed(b); + goto touched; } - if (!btree_node_accessed(b) && - !btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b)) { /* can't call bch2_btree_node_hash_remove under lock */ freed++; if (&t->list != &bc->live) @@ -345,14 +359,30 @@ restart: else if (!mutex_trylock(&bc->lock)) goto out; goto restart; - } else - clear_btree_node_accessed(b); + } else { + continue; + } +touched: + touched++; + + if (touched >= nr) { + /* Save position */ + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); + break; + } } mutex_unlock(&bc->lock); out: + ret = (unsigned long) freed * btree_pages(c); memalloc_nofs_restore(flags); - return (unsigned long) freed * btree_pages(c); +out_norestore: + trace_btree_cache_scan(sc->nr_to_scan, + sc->nr_to_scan / btree_pages(c), + btree_cache_can_free(bc), + ret); + return ret; } static unsigned long bch2_btree_cache_count(struct shrinker *shrink, @@ -400,15 +430,17 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(c, b); + clear_btree_node_dirty_acct(c, b); btree_node_data_free(c, b); } BUG_ON(atomic_read(&c->btree_cache.dirty)); - while (!list_empty(&bc->freed)) { - b = list_first_entry(&bc->freed, struct btree, list); + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + + while (!list_empty(&bc->freed_nonpcpu)) { + b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); list_del(&b->list); six_lock_pcpu_free(&b->c.lock); kfree(b); @@ -462,7 +494,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) mutex_init(&bc->lock); INIT_LIST_HEAD(&bc->live); INIT_LIST_HEAD(&bc->freeable); - INIT_LIST_HEAD(&bc->freed); + INIT_LIST_HEAD(&bc->freed_pcpu); + INIT_LIST_HEAD(&bc->freed_nonpcpu); } /* @@ -537,10 +570,13 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) } } -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) { struct btree_cache *bc = &c->btree_cache; - struct btree *b; + struct list_head *freed = pcpu_read_locks + ? 
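/*
 * The rewritten shrinker scan above gives recently used nodes a second chance:
 * an accessed node only loses its accessed bit on this pass, and is reclaimed
 * on a later pass if it hasn't been touched again.  A minimal sketch of that
 * clock-style policy; the real scan also checks locks, dirtiness and whether
 * the node is reclaimable at all:
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_node {
        bool accessed;
        bool freed;
};

static unsigned demo_scan(struct demo_node *nodes, unsigned nr, unsigned want)
{
        unsigned i, freed = 0;

        for (i = 0; i < nr && freed < want; i++) {
                if (nodes[i].freed)
                        continue;

                if (nodes[i].accessed) {
                        nodes[i].accessed = false;      /* second chance */
                        continue;
                }

                nodes[i].freed = true;
                freed++;
        }
        return freed;
}

int main(void)
{
        struct demo_node nodes[4] = {
                { .accessed = true  }, { .accessed = false },
                { .accessed = true  }, { .accessed = false },
        };

        printf("freed %u on first pass\n",  demo_scan(nodes, 4, 4));    /* 2 */
        printf("freed %u on second pass\n", demo_scan(nodes, 4, 4));    /* 2 */
        return 0;
}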
&bc->freed_pcpu + : &bc->freed_nonpcpu; + struct btree *b, *b2; u64 start_time = local_clock(); unsigned flags; @@ -548,44 +584,49 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) mutex_lock(&bc->lock); /* - * btree_free() doesn't free memory; it sticks the node on the end of - * the list. Check if there's any freed nodes there: - */ - list_for_each_entry(b, &bc->freeable, list) - if (!btree_node_reclaim(c, b)) - goto got_node; - - /* * We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ - list_for_each_entry(b, &bc->freed, list) - if (!btree_node_reclaim(c, b)) + list_for_each_entry(b, freed, list) + if (!btree_node_reclaim(c, b)) { + list_del_init(&b->list); goto got_node; + } + + b = __btree_node_mem_alloc(c); + if (!b) + goto err_locked; + + if (pcpu_read_locks) + six_lock_pcpu_alloc(&b->c.lock); - b = NULL; + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); got_node: - if (b) - list_del_init(&b->list); - mutex_unlock(&bc->lock); - if (!b) { - b = __btree_node_mem_alloc(c); - if (!b) - goto err; + /* + * btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b2, &bc->freeable, list) + if (!btree_node_reclaim(c, b2)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + goto got_mem; + } - BUG_ON(!six_trylock_intent(&b->c.lock)); - BUG_ON(!six_trylock_write(&b->c.lock)); - } + mutex_unlock(&bc->lock); - if (!b->data) { - if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) - goto err; + if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + goto err; - mutex_lock(&bc->lock); - bc->used++; - mutex_unlock(&bc->lock); - } + mutex_lock(&bc->lock); + bc->used++; +got_mem: + mutex_unlock(&bc->lock); BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_dirty(b)); @@ -607,20 +648,24 @@ out: return b; err: mutex_lock(&bc->lock); - - if (b) { - list_add(&b->list, &bc->freed); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - } - +err_locked: /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { - b = btree_node_cannibalize(c); - list_del_init(&b->list); - mutex_unlock(&bc->lock); + b2 = btree_node_cannibalize(c); + bch2_btree_node_hash_remove(bc, b2); + + if (b) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + } else { + b = b2; + list_del_init(&b->list); + } - bch2_btree_node_hash_remove(bc, b); + mutex_unlock(&bc->lock); trace_btree_node_cannibalize(c); goto out; @@ -651,11 +696,22 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * been freed: */ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + trace_trans_restart_relock_parent_for_fill(trans->fn, + _THIS_IP_, btree_id, &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } + + b = bch2_btree_node_mem_alloc(c, level != 0); + + if (trans && b == ERR_PTR(-ENOMEM)) { + trans->memory_allocation_failure = true; + trace_trans_restart_memory_allocation_failure(trans->fn, + _THIS_IP_, btree_id, &path->pos); btree_trans_restart(trans); return ERR_PTR(-EINTR); } - b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) return b; @@ -698,6 +754,8 @@ static noinline struct btree *bch2_btree_node_fill(struct 
bch_fs *c, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { + trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, + btree_id, &path->pos); btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -715,14 +773,16 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { - char buf1[200], buf2[100], buf3[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); - bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); - bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&b->key)); + bch2_bpos_to_text(&buf2, b->data->min_key); + bch2_bpos_to_text(&buf3, b->data->max_key); bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" "btree %s level %u\n" @@ -730,10 +790,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) "header: btree %s level %llu\n" "min %s max %s\n", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, + buf1.buf, bch2_btree_ids[BTREE_NODE_ID(b->data)], BTREE_NODE_LEVEL(b->data), - buf2, buf3); + buf2.buf, buf3.buf); + + printbuf_exit(&buf3); + printbuf_exit(&buf2); + printbuf_exit(&buf1); } static inline void btree_check_header(struct bch_fs *c, struct btree *b) @@ -843,7 +907,7 @@ lock_node: if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(trans->ip, + trace_trans_restart_btree_node_reused(trans->fn, trace_ip, path->btree_id, &path->pos); @@ -1025,7 +1089,7 @@ wait_on_io: six_lock_write(&b->c.lock, NULL, NULL); if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, 0); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index f7e10986f317..25906127c023 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -7,6 +7,8 @@ extern struct lock_class_key bch2_btree_node_lock_key; +extern const char * const bch2_btree_node_flags[]; + struct btree_iter; void bch2_recalc_btree_reserve(struct bch_fs *); @@ -20,7 +22,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); +struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d1883701afc3..e19991796c82 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -69,23 +70,23 @@ static int bch2_gc_check_topology(struct bch_fs *c, struct bpos expected_start = bkey_deleted(&prev->k->k) ? 
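/*
 * Throughout this patch, fixed char buf[200]/PBUF() pairs become heap-backed
 * printbufs that are reset between uses and released with printbuf_exit(), as
 * in btree_bad_header() above.  A small userspace analogue of that usage shape
 * (declare, print into it, read .buf, reset, exit); this is not the bcachefs
 * implementation, only the pattern:
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_printbuf {
        char    *buf;
        size_t  size;
        size_t  pos;
};

#define DEMO_PRINTBUF ((struct demo_printbuf) { NULL, 0, 0 })

static void demo_pr_buf(struct demo_printbuf *out, const char *fmt, ...)
{
        va_list args;
        int len;

        va_start(args, fmt);
        len = vsnprintf(NULL, 0, fmt, args);
        va_end(args);

        if (out->pos + len + 1 > out->size) {
                char *n = realloc(out->buf, out->pos + len + 1);

                if (!n)
                        abort();        /* allocation failure handling elided */
                out->buf = n;
                out->size = out->pos + len + 1;
        }

        va_start(args, fmt);
        vsnprintf(out->buf + out->pos, out->size - out->pos, fmt, args);
        va_end(args);
        out->pos += len;
}

static void demo_printbuf_reset(struct demo_printbuf *out)
{
        out->pos = 0;
        if (out->buf)
                out->buf[0] = '\0';
}

static void demo_printbuf_exit(struct demo_printbuf *out)
{
        free(out->buf);
        out->buf = NULL;
}

int main(void)
{
        struct demo_printbuf buf = DEMO_PRINTBUF;

        demo_pr_buf(&buf, "btree %s level %u", "alloc", 0);
        printf("%s\n", buf.buf);

        demo_printbuf_reset(&buf);
        demo_pr_buf(&buf, "reused after reset");
        printf("%s\n", buf.buf);

        demo_printbuf_exit(&buf);
        return 0;
}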
node_start : bpos_successor(prev->k->k.p); - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - if (bkey_deleted(&prev->k->k)) { - struct printbuf out = PBUF(buf1); - pr_buf(&out, "start of node: "); - bch2_bpos_to_text(&out, node_start); - } else { - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); - } - if (bpos_cmp(expected_start, bp->v.min_key)) { bch2_topology_error(c); + if (bkey_deleted(&prev->k->k)) { + pr_buf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, node_start); + } else { + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); + } + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); + if (__fsck_err(c, FSCK_CAN_FIX| FSCK_CAN_IGNORE| @@ -94,11 +95,11 @@ static int bch2_gc_check_topology(struct bch_fs *c, " prev %s\n" " cur %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, - (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) && + buf1.buf, buf2.buf) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - return FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); } @@ -108,6 +109,12 @@ static int bch2_gc_check_topology(struct bch_fs *c, if (is_last && bpos_cmp(cur.k->k.p, node_end)) { bch2_topology_error(c); + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); + bch2_bpos_to_text(&buf2, node_end); + if (__fsck_err(c, FSCK_CAN_FIX| FSCK_CAN_IGNORE| @@ -116,18 +123,21 @@ static int bch2_gc_check_topology(struct bch_fs *c, " %s\n" " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) && + buf1.buf, buf2.buf) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - return FSCK_ERR_START_TOPOLOGY_REPAIR; + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); } } bch2_bkey_buf_copy(prev, c, cur.k); +err: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -155,6 +165,34 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } +static void bch2_btree_node_update_key_early(struct bch_fs *c, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) { struct bkey_i_btree_ptr_v2 *new; @@ -169,11 +207,11 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) new->v.min_key = new_min; SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); - kfree(new); - - if (ret) + 
ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); return ret; + } bch2_btree_node_drop_keys_outside_node(b); @@ -198,11 +236,11 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new->k.p = new_max; SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); - kfree(new); - - if (ret) + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); return ret; + } bch2_btree_node_drop_keys_outside_node(b); @@ -222,18 +260,17 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, struct bpos expected_start = !prev ? b->data->min_key : bpos_successor(prev->key.k.p); - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; if (!prev) { - struct printbuf out = PBUF(buf1); - pr_buf(&out, "start of node: "); - bch2_bpos_to_text(&out, b->data->min_key); + pr_buf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, b->data->min_key); } else { - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); } - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)); + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); if (prev && bpos_cmp(expected_start, cur->data->min_key) > 0 && @@ -246,8 +283,10 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " node %s\n" " next %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) - return DROP_PREV_NODE; + buf1.buf, buf2.buf)) { + ret = DROP_PREV_NODE; + goto out; + } if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, bpos_predecessor(cur->data->min_key)), c, @@ -255,7 +294,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " node %s\n" " next %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) + buf1.buf, buf2.buf)) ret = set_node_max(c, prev, bpos_predecessor(cur->data->min_key)); } else { @@ -267,39 +306,49 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, " prev %s\n" " node %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) - return DROP_THIS_NODE; + buf1.buf, buf2.buf)) { + ret = DROP_THIS_NODE; + goto out; + } if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " node %s", bch2_btree_ids[b->c.btree_id], b->c.level, - buf1, buf2)) + buf1.buf, buf2.buf)) ret = set_node_min(c, cur, expected_start); } +out: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } static int btree_repair_node_end(struct bch_fs *c, struct btree *b, struct btree *child) { - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; int ret = 0; + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); + bch2_bpos_to_text(&buf2, b->key.k.p); + if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1), - (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) { + buf1.buf, buf2.buf)) { ret = set_node_max(c, child, b->key.k.p); if (ret) - return ret; + goto err; } +err: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -310,7 +359,7 @@ static int 
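/*
 * set_node_min()/set_node_max() above switch to bch2_journal_key_insert_take(),
 * which keeps the caller's allocation on success instead of copying it, so the
 * kfree() moves into the error path.  A sketch of that ownership-transfer
 * convention with made-up names:
 */
#include <stdlib.h>
#include <string.h>

struct key_list {
        char    **keys;
        size_t  nr, size;
};

/* On success the list owns @key; on failure the caller still does. */
static int key_list_insert_take(struct key_list *l, char *key)
{
        if (l->nr == l->size) {
                size_t new_size = l->size ? l->size * 2 : 8;
                char **n = realloc(l->keys, new_size * sizeof(*n));

                if (!n)
                        return -1;
                l->keys = n;
                l->size = new_size;
        }

        l->keys[l->nr++] = key;         /* ownership transferred */
        return 0;
}

int main(void)
{
        struct key_list l = { 0 };
        char *key = strdup("example");
        int ret = key_list_insert_take(&l, key);

        if (ret)
                free(key);              /* only free what we still own */

        while (l.nr)
                free(l.keys[--l.nr]);
        free(l.keys);
        return 0;
}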
bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; bool have_child, dropped_children = false; - char buf[200]; + struct printbuf buf; int ret = 0; if (!b->c.level) @@ -334,12 +383,15 @@ again: false); ret = PTR_ERR_OR_ZERO(cur); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); + if (mustfix_fsck_err_on(ret == -EIO, c, "Unreadable btree node at btree %s level %u:\n" " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { + buf.buf)) { bch2_btree_node_evict(c, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); @@ -439,12 +491,14 @@ again: have_child = true; } + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + if (mustfix_fsck_err_on(!have_child, c, "empty interior btree node at btree %s level %u\n" " %s", bch2_btree_ids[b->c.btree_id], - b->c.level, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf))) + b->c.level, buf.buf)) ret = DROP_THIS_NODE; err: fsck_err: @@ -460,6 +514,7 @@ fsck_err: if (!ret && dropped_children) goto again; + printbuf_exit(&buf); return ret; } @@ -495,7 +550,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, const union bch_extent_entry *entry; struct extent_ptr_decoded p = { 0 }; bool do_update = false; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; /* @@ -505,7 +560,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - struct bucket *g2 = PTR_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); if (fsck_err_on(!g->gen_valid, c, @@ -514,83 +568,72 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->gen_valid = true; + g->gen = p.ptr.gen; } else { do_update = true; } } - if (fsck_err_on(data_type == BCH_DATA_btree && - g->mark.gen != p.ptr.gen, c, - "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - g2->_mark.data_type = g->_mark.data_type = data_type; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); - } - - if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - 
g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { do_update = true; } } - if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)], p.ptr.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (fsck_err_on(!p.ptr.cached && - gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, + gen_cmp(p.ptr.gen, g->gen) < 0, c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (p.ptr.gen != g->mark.gen) + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) continue; - if (fsck_err_on(g->mark.data_type && - g->mark.data_type != data_type, c, + if (fsck_err_on(g->data_type && + g->data_type != data_type, c, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->mark.data_type], + bch2_data_types[g->data_type], bch2_data_types[data_type], - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { - g2->_mark.data_type = g->_mark.data_type = data_type; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->data_type = data_type; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); } else { do_update = true; } @@ -603,14 +646,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; } } @@ -623,13 +668,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, if (is_root) { bch_err(c, "cannot update btree roots yet"); - return -EINVAL; + ret = -EINVAL; + goto err; } new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { bch_err(c, "%s: error allocating new key", __func__); - return -ENOMEM; + ret = -ENOMEM; + goto err; } bkey_reassemble(new, *k); @@ -645,7 +692,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_GC_BUCKET(ca, ptr); - ptr->gen = g->mark.gen; + ptr->gen = g->gen; } } else { bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ @@ -654,12 +701,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, enum bch_data_type 
data_type = bch2_bkey_ptr_data_type(*k, ptr); (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || (!ptr->cached && - gen_cmp(ptr->gen, g->mark.gen) < 0) || - gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->mark.data_type && - g->mark.data_type != data_type); + gen_cmp(ptr->gen, g->gen) < 0) || + gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type); })); again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); @@ -690,13 +737,28 @@ found: } } - ret = bch2_journal_key_insert(c, btree_id, level, new); - kfree(new); + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { + kfree(new); + goto err; + } + + if (level) + bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); - if (!ret) - *k = bkey_i_to_s_c(new); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, *k); + bch_info(c, "updated %s", buf.buf); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + + *k = bkey_i_to_s_c(new); } +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -705,11 +767,9 @@ fsck_err: static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, unsigned level, bool is_root, struct bkey_s_c *k, - u8 *max_stale, bool initial) + bool initial) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; unsigned flags = @@ -721,7 +781,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (initial) { BUG_ON(bch2_journal_seq_verify && - k->k->version.lo > journal_cur_seq(&c->journal)); + k->k->version.lo > atomic64_read(&c->journal.seq)); ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); if (ret) @@ -734,18 +794,8 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } - ptrs = bch2_bkey_ptrs_c(*k); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - if (gen_after(g->oldest_gen, ptr->gen)) - g->oldest_gen = ptr->gen; - - *max_stale = max(*max_stale, ptr_stale(ca, ptr)); - } - - ret = bch2_mark_key(trans, old, *k, flags); + ret = __bch2_trans_do(trans, NULL, NULL, 0, + bch2_mark_key(trans, old, *k, flags)); fsck_err: err: if (ret) @@ -753,8 +803,7 @@ err: return ret; } -static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale, - bool initial) +static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) { struct bch_fs *c = trans->c; struct btree_node_iter iter; @@ -763,8 +812,6 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma struct bkey_buf prev, cur; int ret = 0; - *max_stale = 0; - if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; @@ -775,7 +822,7 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, - &k, max_stale, initial); + &k, initial); if (ret) break; @@ -806,7 +853,6 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; - u8 max_stale = 0; int ret = 0; gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -817,21 +863,9 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree_node(b)); - ret = btree_gc_mark_node(trans, b, &max_stale, initial); + ret = btree_gc_mark_node(trans, b, initial); if (ret) break; - - if (!initial) { - if (max_stale > 64) - bch2_btree_node_rewrite(trans, &iter, b, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - else if (!bch2_btree_gc_rewrite_disabled && - (bch2_btree_gc_always_rewrite || max_stale > 16)) - bch2_btree_node_rewrite(trans, &iter, - b, BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - } } bch2_trans_iter_exit(trans, &iter); @@ -843,8 +877,8 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, if (!btree_node_fake(b)) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, - &k, &max_stale, initial); + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + true, &k, initial); } gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -859,8 +893,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - u8 max_stale = 0; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); @@ -872,8 +905,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, - &k, &max_stale, true); + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + false, &k, true); if (ret) { bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); goto fsck_err; @@ -921,7 +954,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b " %s", bch2_btree_ids[b->c.btree_id], b->c.level - 1, - (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) && + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ret = FSCK_ERR_START_TOPOLOGY_REPAIR; bch_info(c, "Halting mark and sweep to start topology repair pass"); @@ -951,6 +985,7 @@ fsck_err: bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); bch2_btree_and_journal_iter_exit(&iter); + printbuf_exit(&buf); return ret; } @@ -964,8 +999,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; - u8 max_stale = 0; - char buf[100]; + struct printbuf buf = PRINTBUF; int ret = 0; b = c->btree_roots[btree_id].b; @@ -974,17 +1008,19 @@ static int bch2_gc_btree_init(struct btree_trans *trans, return 0; six_lock_read(&b->c.lock, NULL, NULL); + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, - "btree root with incorrect min_key: %s", - (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { + "btree root with incorrect min_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = FSCK_ERR_EXIT; goto fsck_err; } + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, - "btree root with incorrect max_key: %s", - (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { + "btree root with incorrect max_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = FSCK_ERR_EXIT; goto fsck_err; @@ -997,13 +1033,14 @@ static int bch2_gc_btree_init(struct btree_trans *trans, struct bkey_s_c k = bkey_i_to_s_c(&b->key); ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, - &k, &max_stale, true); + &k, true); } fsck_err: six_unlock_read(&b->c.lock); if (ret < 0) bch_err(c, "%s: ret %i", __func__, ret); + printbuf_exit(&buf); return ret; } @@ -1022,6 +1059,9 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) bch2_trans_init(&trans, c, 0, 0); + if (initial) + trans.is_initial_gc = true; + for (i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); @@ -1123,10 +1163,10 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(ca, c, i) { - kvpfree(rcu_dereference_protected(ca->buckets[1], 1), + kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - ca->buckets[1] = NULL; + ca->buckets_gc = NULL; free_percpu(ca->usage_gc); ca->usage_gc = NULL; @@ -1140,18 +1180,20 @@ static int bch2_gc_done(struct bch_fs *c, bool initial, bool metadata_only) { struct bch_dev *ca = NULL; + struct printbuf buf = PRINTBUF; bool verify = !metadata_only && (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; int ret = 0; + percpu_down_write(&c->mark_lock); + #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ @@ -1161,18 +1203,6 @@ static int bch2_gc_done(struct bch_fs *c, iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ - } -#define copy_bucket_field(_f) \ - if (dst->b[b]._f != src->b[b]._f) { \ - if (verify) \ - fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", dev, b, \ - dst->b[b].mark.gen, \ - bch2_data_types[dst->b[b].mark.data_type],\ - dst->b[b]._f, src->b[b]._f); \ - dst->b[b]._f = src->b[b]._f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) 
\ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) @@ -1183,36 +1213,18 @@ static int bch2_gc_done(struct bch_fs *c, bch2_fs_usage_acc_to_base(c, i); for_each_member_device(ca, c, dev) { - struct bucket_array *dst = __bucket_array(ca, 0); - struct bucket_array *src = __bucket_array(ca, 1); - size_t b; - - for (b = 0; b < src->nbuckets; b++) { - copy_bucket_field(_mark.gen); - copy_bucket_field(_mark.data_type); - copy_bucket_field(_mark.stripe); - copy_bucket_field(_mark.dirty_sectors); - copy_bucket_field(_mark.cached_sectors); - copy_bucket_field(stripe_redundancy); - copy_bucket_field(stripe); - - dst->b[b].oldest_gen = src->b[b].oldest_gen; - } - - { - struct bch_dev_usage *dst = ca->usage_base; - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage_gc, - dev_usage_u64s()); - - copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); - - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); - copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); - copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); - } + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); + + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); } }; @@ -1239,22 +1251,21 @@ static int bch2_gc_done(struct bch_fs *c, for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - char buf[80]; if (metadata_only && (e->data_type == BCH_DATA_user || e->data_type == BCH_DATA_cached)) continue; - bch2_replicas_entry_to_text(&PBUF(buf), e); + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, e); - copy_fs_field(replicas[i], "%s", buf); + copy_fs_field(replicas[i], "%s", buf.buf); } } #undef copy_fs_field #undef copy_dev_field -#undef copy_bucket_field #undef copy_stripe_field #undef copy_field fsck_err: @@ -1262,6 +1273,9 @@ fsck_err: percpu_ref_put(&ca->ref); if (ret) bch_err(c, "%s: ret %i", __func__, ret); + + percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); return ret; } @@ -1281,18 +1295,9 @@ static int bch2_gc_start(struct bch_fs *c, } for_each_member_device(ca, c, i) { - BUG_ON(ca->buckets[1]); + BUG_ON(ca->buckets_gc); BUG_ON(ca->usage_gc); - ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets[1]) { - percpu_ref_put(&ca->ref); - bch_err(c, "error allocating ca->buckets[gc]"); - return -ENOMEM; - } - ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); @@ -1301,103 +1306,215 @@ static int bch2_gc_start(struct bch_fs *c, } } - percpu_down_write(&c->mark_lock); + return 0; +} - /* - * indicate to stripe code that we need to allocate for the gc stripes - * radix tree, too - */ - gc_pos_set(c, gc_phase(GC_PHASE_START)); +/* returns true if not equal */ +static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, + struct bch_alloc_v4 r) +{ + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type || + l.dirty_sectors != r.dirty_sectors 
|| + l.cached_sectors != r.cached_sectors || + l.stripe_redundancy != r.stripe_redundancy || + l.stripe != r.stripe; +} - for_each_member_device(ca, c, i) { - struct bucket_array *dst = __bucket_array(ca, 1); - struct bucket_array *src = __bucket_array(ca, 0); - size_t b; +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket gc; + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + struct bch_alloc_v4 old, new; + int ret; - dst->first_bucket = src->first_bucket; - dst->nbuckets = src->nbuckets; + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; - for (b = 0; b < src->nbuckets; b++) { - struct bucket *d = &dst->b[b]; - struct bucket *s = &src->b[b]; + bch2_alloc_to_v4(k, &old); + new = old; - d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; - d->gen_valid = s->gen_valid; + percpu_down_read(&c->mark_lock); + gc = *gc_bucket(ca, iter->pos.offset); + percpu_up_read(&c->mark_lock); - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) - d->_mark = s->mark; - } - }; + if (metadata_only && + gc.data_type != BCH_DATA_sb && + gc.data_type != BCH_DATA_journal && + gc.data_type != BCH_DATA_btree) + return 0; - percpu_up_write(&c->mark_lock); + if (gen_after(old.gen, gc.gen)) + return 0; - return 0; +#define copy_bucket_field(_f) \ + if (fsck_err_on(new._f != gc._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ + gc.gen, \ + bch2_data_types[gc.data_type], \ + new._f, gc._f)) \ + new._f = gc._f; \ + + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); +#undef copy_bucket_field + + if (!bch2_alloc_v4_cmp(old, new)) + return 0; + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + a->v = new; + + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); +fsck_err: + return ret; } -static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) +static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { - struct bch_fs *c = trans->c; - struct reflink_gc *r; - const __le64 *refcount = bkey_refcount_c(k); - char buf[200]; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; int ret = 0; - if (!refcount) - return 0; + bch2_trans_init(&trans, c, 0, 0); - r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); - if (!r) - return -ENOMEM; + for_each_member_device(ca, c, i) { + for_each_btree_key(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS| + BTREE_ITER_PREFETCH, k, ret) { + if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, + metadata_only)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; + if (ret) { + bch_err(c, "error writing alloc info: %i", ret); + percpu_ref_put(&ca->ref); + break; + } } - if (fsck_err_on(r->refcount != 
le64_to_cpu(*refcount), c, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - r->refcount)) { - struct bkey_i *new; + bch2_trans_exit(&trans); + return ret; +} - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto fsck_err; +static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) +{ + struct bch_dev *ca; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bucket *g; + struct bch_alloc_v4 a; + unsigned i; + int ret; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!buckets) { + percpu_ref_put(&ca->ref); + bch_err(c, "error allocating ca->buckets[gc]"); + return -ENOMEM; } - bkey_reassemble(new, k); + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = ca->mi.nbuckets; + rcu_assign_pointer(ca->buckets_gc, buckets); + }; - if (!r->refcount) { - new->k.type = KEY_TYPE_deleted; - new->k.size = 0; - } else { - *bkey_refcount(new) = cpu_to_le64(r->refcount); - } + bch2_trans_init(&trans, c, 0, 0); - ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); - kfree(new); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = gc_bucket(ca, k.k->p.offset); + + bch2_alloc_to_v4(k, &a); + + g->gen_valid = 1; + g->gen = a.gen; + + if (metadata_only && + (a.data_type == BCH_DATA_user || + a.data_type == BCH_DATA_cached || + a.data_type == BCH_DATA_parity)) { + g->data_type = a.data_type; + g->dirty_sectors = a.dirty_sectors; + g->cached_sectors = a.cached_sectors; + g->stripe = a.stripe; + g->stripe_redundancy = a.stripe_redundancy; + } } -fsck_err: + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error reading alloc info at gc start: %i", ret); + return ret; } -static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, - bool metadata_only) +static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = gc_bucket_array(ca); + struct bucket *g; + + for_each_bucket(g, buckets) { + if (metadata_only && + (g->data_type == BCH_DATA_user || + g->data_type == BCH_DATA_cached || + g->data_type == BCH_DATA_parity)) + continue; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } + }; +} + +static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; size_t idx = 0; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; if (metadata_only) @@ -1405,14 +1522,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bch2_trans_init(&trans, c, 0, 0); - if (initial) { - c->reflink_gc_idx = 0; - - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, - bch2_gc_reflink_done_initial_fn); - goto out; - } - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1420,7 +1529,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, if (!refcount) continue; - r = genradix_ptr(&c->reflink_gc_table, idx); + r = genradix_ptr(&c->reflink_gc_table, idx++); if (!r || r->offset != k.k->p.offset || r->size != k.k->size) { @@ -1433,7 +1542,8 @@ static int 
bch2_gc_reflink_done(struct bch_fs *c, bool initial, "reflink key has wrong refcount:\n" " %s\n" " should be %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { struct bkey_i *new; @@ -1451,7 +1561,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, *bkey_refcount(new) = cpu_to_le64(r->refcount); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); + __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); kfree(new); if (ret) @@ -1460,149 +1570,128 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, } fsck_err: bch2_trans_iter_exit(&trans, &iter); -out: c->reflink_gc_nr = 0; bch2_trans_exit(&trans); + printbuf_exit(&buf); return ret; } -static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) +static int bch2_gc_reflink_start(struct bch_fs *c, + bool metadata_only) { - struct bch_fs *c = trans->c; - struct gc_stripe *m; - const struct bch_stripe *s; - char buf[200]; - unsigned i; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; int ret = 0; - if (k.k->type != KEY_TYPE_stripe) + if (metadata_only) return 0; - s = bkey_s_c_to_stripe(k).v; + bch2_trans_init(&trans, c, 0, 0); + c->reflink_gc_nr = 0; - m = genradix_ptr(&c->gc_stripes, k.k->p.offset); + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); - for (i = 0; i < s->nr_blocks; i++) - if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) - goto inconsistent; - return 0; -inconsistent: - if (fsck_err_on(true, c, - "stripe has wrong block sector count %u:\n" - " %s\n" - " should be %u", i, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - m ? m->block_sectors[i] : 0)) { - struct bkey_i_stripe *new; + if (!refcount) + continue; - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { + r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, + GFP_KERNEL); + if (!r) { ret = -ENOMEM; - goto fsck_err; + break; } - bkey_reassemble(&new->k_i, k); - - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); - - ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i); - kfree(new); - } -fsck_err: - return ret; -} - -static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, - bool metadata_only) -{ - struct btree_trans trans; - int ret = 0; - - if (metadata_only) - return 0; - - bch2_trans_init(&trans, c, 0, 0); - - if (initial) { - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, - bch2_gc_stripes_done_initial_fn); - } else { - BUG(); + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; } + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } -static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, - struct bkey_s_c k) +static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) { - - struct bch_fs *c = trans->c; + struct genradix_iter iter; struct reflink_gc *r; - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - return 0; - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - return 0; + genradix_for_each(&c->reflink_gc_table, iter, r) + r->refcount = 0; } -static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, - bool metadata_only) +static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct reflink_gc *r; + struct gc_stripe *m; + const struct bch_stripe *s; + struct printbuf buf = PRINTBUF; + unsigned i; int ret = 0; if (metadata_only) return 0; bch2_trans_init(&trans, c, 0, 0); - c->reflink_gc_nr = 0; - if (initial) { - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, - bch2_gc_reflink_start_initial_fn); - goto out; - } - - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) + if (k.k->type != KEY_TYPE_stripe) continue; - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) { - ret = -ENOMEM; - break; - } + s = bkey_s_c_to_stripe(k).v; + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; + for (i = 0; i < s->nr_blocks; i++) + if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) + goto inconsistent; + continue; +inconsistent: + if (fsck_err_on(true, c, + "stripe has wrong block sector count %u:\n" + " %s\n" + " should be %u", i, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf), + m ? m->block_sectors[i] : 0)) { + struct bkey_i_stripe *new; + + new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + break; + } + + bkey_reassemble(&new->k_i, k); + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); + kfree(new); + } } +fsck_err: bch2_trans_iter_exit(&trans, &iter); -out: + bch2_trans_exit(&trans); + + printbuf_exit(&buf); return ret; } +static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) +{ + genradix_free(&c->gc_stripes); +} + /** * bch2_gc - walk _all_ references to buckets, and recompute them: * @@ -1623,9 +1712,8 @@ out: */ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) { - struct bch_dev *ca; u64 start_time = local_clock(); - unsigned i, iter = 0; + unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); @@ -1636,11 +1724,14 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) /* flush interior btree updates: */ closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); -again: + ret = bch2_gc_start(c, metadata_only) ?: - bch2_gc_reflink_start(c, initial, metadata_only); + bch2_gc_alloc_start(c, metadata_only) ?: + bch2_gc_reflink_start(c, metadata_only); if (ret) goto out; +again: + gc_pos_set(c, gc_phase(GC_PHASE_START)); bch2_mark_superblocks(c); @@ -1678,40 +1769,40 @@ again: if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || (!iter && bch2_test_restart_gc)) { + if (iter++ > 2) { + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + goto out; + } + /* * XXX: make sure gens we fixed got saved */ - if (iter++ <= 2) { - bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - - percpu_down_write(&c->mark_lock); - bch2_gc_free(c); - percpu_up_write(&c->mark_lock); - /* flush fsck errors, reset counters */ - bch2_flush_fsck_errs(c); + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - goto again; - } + bch2_gc_stripes_reset(c, metadata_only); + bch2_gc_alloc_reset(c, metadata_only); + bch2_gc_reflink_reset(c, metadata_only); - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); + goto again; } out: if (!ret) { bch2_journal_block(&c->journal); - percpu_down_write(&c->mark_lock); - ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: - bch2_gc_stripes_done(c, initial, metadata_only) ?: + ret = bch2_gc_stripes_done(c, metadata_only) ?: + bch2_gc_reflink_done(c, metadata_only) ?: + bch2_gc_alloc_done(c, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); - } else { - percpu_down_write(&c->mark_lock); } + percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); @@ -1724,13 +1815,6 @@ out: bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); /* - * Wake up allocator in case it was waiting for buckets - * because of not being able to inc gens - */ - for_each_member_device(ca, c, i) - bch2_wake_allocator(ca); - - /* * At startup, allocations can happen directly instead of via the * allocator thread - issue wakeup in case they blocked on gc_lock: */ @@ -1746,9 +1830,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) percpu_down_read(&c->mark_lock); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); - if (gen_after(g->mark.gen, ptr->gen) > 
16) { + if (ptr_stale(ca, ptr) > 16) { percpu_up_read(&c->mark_lock); return true; } @@ -1756,10 +1839,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(g->gc_gen, ptr->gen)) - g->gc_gen = ptr->gen; + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; } percpu_up_read(&c->mark_lock); @@ -1770,23 +1853,22 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree * node pointers currently never have cached pointers that can become stale: */ -static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) +static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) { - struct btree_trans trans; + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bkey_buf sk; int ret = 0, commit_err = 0; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, + bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, BTREE_ITER_PREFETCH| BTREE_ITER_NOT_EXTENTS| BTREE_ITER_ALL_SNAPSHOTS); - while ((bch2_trans_begin(&trans), + while ((bch2_trans_begin(trans), k = bch2_btree_iter_peek(&iter)).k) { ret = bkey_err(k); @@ -1802,10 +1884,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_extent_normalize(c, bkey_i_to_s(sk.k)); commit_err = - bch2_trans_update(&trans, &iter, sk.k, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_NOFAIL); + bch2_trans_update(trans, &iter, sk.k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_NOFAIL); if (commit_err == -EINTR) { commit_err = 0; continue; @@ -1814,20 +1896,48 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) bch2_btree_iter_advance(&iter); } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; } +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bkey_i_alloc_v4 *a_mut; + int ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + bch2_alloc_to_v4(k, &a); + + if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) + return 0; + + a_mut = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + return ret; + + a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; + + return bch2_trans_update(trans, iter, &a_mut->k_i, 0); +} + int bch2_gc_gens(struct bch_fs *c) { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket *g; - u64 start_time = local_clock(); + u64 b, start_time = local_clock(); unsigned i; int ret; @@ -1836,36 +1946,53 @@ int bch2_gc_gens(struct bch_fs *c) * introduces a deadlock in the RO path - we currently take the state * lock at the start of going RO, thus the gc thread may get stuck: */ + if (!mutex_trylock(&c->gc_gens_lock)) + return 0; + down_read(&c->gc_lock); + bch2_trans_init(&trans, c, 0, 0); for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); + struct 
bucket_gens *gens; + + BUG_ON(ca->oldest_gen); + + ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); + if (!ca->oldest_gen) { + percpu_ref_put(&ca->ref); + ret = -ENOMEM; + goto err; + } + + gens = bucket_gens(ca); - for_each_bucket(g, buckets) - g->gc_gen = g->mark.gen; - up_read(&ca->bucket_lock); + for (b = gens->first_bucket; + b < gens->nbuckets; b++) + ca->oldest_gen[b] = gens->b[b]; } for (i = 0; i < BTREE_ID_NR; i++) if ((1 << i) & BTREE_ID_HAS_PTRS) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = bch2_gc_btree_gens(c, i); + ret = bch2_gc_btree_gens(&trans, i); if (ret) { bch_err(c, "error recalculating oldest_gen: %i", ret); goto err; } } - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->oldest_gen = g->gc_gen; - up_read(&ca->bucket_lock); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter)); + if (ret) { + bch_err(c, "error writing oldest_gen: %i", ret); + break; + } } + bch2_trans_iter_exit(&trans, &iter); c->gc_gens_btree = 0; c->gc_gens_pos = POS_MIN; @@ -1874,7 +2001,14 @@ int bch2_gc_gens(struct bch_fs *c) bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); err: + for_each_member_device(ca, c, i) { + kvfree(ca->oldest_gen); + ca->oldest_gen = NULL; + } + + bch2_trans_exit(&trans); up_read(&c->gc_lock); + mutex_unlock(&c->gc_gens_lock); return ret; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1455dc787190..4b880ea59cad 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -477,7 +477,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) }; if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { - bch2_btree_node_write(c, b, SIX_LOCK_write); + bch2_btree_node_write(c, b, SIX_LOCK_write, 0); reinit_iter = true; } } @@ -540,13 +540,7 @@ enum btree_validate_ret { #define btree_err(type, c, ca, b, i, msg, ...) 
\ ({ \ __label__ out; \ - char _buf[300]; \ - char *_buf2 = _buf; \ - struct printbuf out = PBUF(_buf); \ - \ - _buf2 = kmalloc(4096, GFP_ATOMIC); \ - if (_buf2) \ - out = _PBUF(_buf2, 4986); \ + struct printbuf out = PRINTBUF; \ \ btree_err_msg(&out, c, ca, b, i, b->written, write); \ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ @@ -554,14 +548,13 @@ enum btree_validate_ret { if (type == BTREE_ERR_FIXABLE && \ write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", _buf2); \ + mustfix_fsck_err(c, "%s", out.buf); \ goto out; \ } \ \ switch (write) { \ case READ: \ - if (_buf2) \ - bch_err(c, "%s", _buf2); \ + bch_err(c, "%s", out.buf); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -582,7 +575,7 @@ enum btree_validate_ret { } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", _buf2); \ + bch_err(c, "corrupt metadata before write: %s", out.buf);\ \ if (bch2_fs_inconsistent(c)) { \ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ @@ -591,8 +584,7 @@ enum btree_validate_ret { break; \ } \ out: \ - if (_buf2 != _buf) \ - kfree(_buf2); \ + printbuf_exit(&out); \ true; \ }) @@ -653,8 +645,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, { unsigned version = le16_to_cpu(i->version); const char *err; - char buf1[100]; - char buf2[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; int ret = 0; btree_err_on((version != BCH_BSET_VERSION_OLD && @@ -691,7 +683,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; - return 0; + ret = 0; + goto out; } btree_err_on(offset && !i->u64s, @@ -742,14 +735,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %s should be %s", - (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), - (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), + (printbuf_reset(&buf2), + bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); } btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", - (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); if (write) compat_btree_node(b->c.level, b->c.btree_id, version, @@ -764,7 +760,10 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, &bn->format); } +out: fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -774,6 +773,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); int ret = 0; @@ -812,11 +813,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: (write ? 
bch2_bkey_val_invalid(c, u.s_c) : NULL); if (invalid) { - char buf[160]; - - bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + printbuf_reset(&buf1); + bch2_bkey_val_to_text(&buf1, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "invalid bkey: %s\n%s", invalid, buf); + "invalid bkey: %s\n%s", invalid, buf1.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -830,18 +830,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, &b->format, k); if (prev && bkey_iter_cmp(b, prev, k) > 0) { - char buf1[80]; - char buf2[80]; struct bkey up = bkey_unpack_key(b, prev); - bch2_bkey_to_text(&PBUF(buf1), &up); - bch2_bkey_to_text(&PBUF(buf2), u.k); + printbuf_reset(&buf1); + bch2_bkey_to_text(&buf1, &up); + printbuf_reset(&buf2); + bch2_bkey_to_text(&buf2, u.k); bch2_dump_bset(c, b, i, 0); if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "keys out of order: %s > %s", - buf1, buf2)) { + buf1.buf, buf2.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), (u64 *) vstruct_end(i) - (u64 *) k); @@ -853,6 +853,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k = bkey_next(k); } fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -885,11 +887,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, - "bad magic"); + "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(b->data->magic)); btree_err_on(!b->data->keys.seq, BTREE_ERR_MUST_RETRY, c, ca, b, NULL, - "bad btree header"); + "bad btree header: seq 0"); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bch_btree_ptr_v2 *bp = @@ -922,9 +925,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i", ret)) + goto fsck_err; - btree_err_on(btree_node_is_extents(b) && + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), BTREE_ERR_FATAL, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); @@ -949,7 +955,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i\n", ret)) + goto fsck_err; sectors = vstruct_sectors(bne, c->block_bits); } @@ -972,19 +981,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - b->written += sectors; - blacklisted = bch2_journal_seq_is_blacklisted(c, le64_to_cpu(i->journal_seq), true); btree_err_on(blacklisted && first, BTREE_ERR_FIXABLE, c, ca, b, i, - "first btree node bset has blacklisted journal seq"); + "first btree node bset has blacklisted journal seq (%llu)", + le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, BTREE_ERR_FIXABLE, c, ca, b, i, - "found blacklisted bset in btree node with sectors_written"); + "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", + le64_to_cpu(i->journal_seq), + b->written, b->written + sectors, ptr_written); + + b->written += sectors; + if (blacklisted && !first) continue; @@ -1057,11 +1070,12 @@ 
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (invalid || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { - char buf[160]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, - "invalid bkey %s: %s", buf, invalid); + "invalid bkey %s: %s", buf.buf, invalid); + printbuf_exit(&buf); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1118,8 +1132,7 @@ static void btree_node_read_work(struct work_struct *work) struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; - char buf[200]; - struct printbuf out; + struct printbuf buf = PRINTBUF; bool saw_error = false; bool can_retry; @@ -1140,10 +1153,10 @@ static void btree_node_read_work(struct work_struct *work) bio->bi_status = BLK_STS_REMOVED; } start: - out = PBUF(buf); - btree_pos_to_text(&out, c, b); + printbuf_reset(&buf); + btree_pos_to_text(&buf, c, b); bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf); + bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1169,6 +1182,7 @@ start: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); + printbuf_exit(&buf); if (saw_error && !btree_node_read_error(b)) bch2_btree_node_rewrite_async(c, b); @@ -1249,6 +1263,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) container_of(cl, struct btree_node_read_all, cl); struct bch_fs *c = ra->c; struct btree *b = ra->b; + struct printbuf buf = PRINTBUF; bool dump_bset_maps = false; bool have_retry = false; int ret = 0, best = -1, write = READ; @@ -1292,8 +1307,6 @@ static void btree_node_read_all_replicas_done(struct closure *cl) fsck_err: if (dump_bset_maps) { for (i = 0; i < ra->nr; i++) { - char buf[200]; - struct printbuf out = PBUF(buf); struct btree_node *bn = ra->buf[i]; struct btree_node_entry *bne = NULL; unsigned offset = 0, sectors; @@ -1302,6 +1315,8 @@ fsck_err: if (ra->err[i]) continue; + printbuf_reset(&buf); + while (offset < btree_sectors(c)) { if (!offset) { sectors = vstruct_sectors(bn, c->block_bits); @@ -1312,10 +1327,10 @@ fsck_err: sectors = vstruct_sectors(bne, c->block_bits); } - pr_buf(&out, " %u-%u", offset, offset + sectors); + pr_buf(&buf, " %u-%u", offset, offset + sectors); if (bne && bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), false)) - pr_buf(&out, "*"); + pr_buf(&buf, "*"); offset += sectors; } @@ -1323,19 +1338,19 @@ fsck_err: bne = ra->buf[i] + (offset << 9); if (bne->keys.seq == bn->keys.seq) { if (!gap) - pr_buf(&out, " GAP"); + pr_buf(&buf, " GAP"); gap = true; sectors = vstruct_sectors(bne, c->block_bits); - pr_buf(&out, " %u-%u", offset, offset + sectors); + pr_buf(&buf, " %u-%u", offset, offset + sectors); if (bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), false)) - pr_buf(&out, "*"); + pr_buf(&buf, "*"); } offset++; } - bch_err(c, "replica %u:%s", i, buf); + bch_err(c, "replica %u:%s", i, buf.buf); } } @@ -1356,6 +1371,7 @@ fsck_err: closure_debug_destroy(&ra->cl); kfree(ra); + printbuf_exit(&buf); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -1455,23 +1471,23 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; 
- char buf[200]; + struct printbuf buf = PRINTBUF; int ret; - btree_pos_to_text(&PBUF(buf), c, b); + btree_pos_to_text(&buf, c, b); trace_btree_read(c, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) - return; + goto out; ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick); if (bch2_fs_fatal_err_on(ret <= 0, c, "btree node read error: no device to read from\n" - " at %s", buf)) { + " at %s", buf.buf)) { set_btree_node_read_error(b); - return; + goto out; } ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -1512,6 +1528,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, else queue_work(c->io_complete_wq, &rb->work); } +out: + printbuf_exit(&buf); } int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, @@ -1528,7 +1546,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, level != 0); bch2_btree_cache_cannibalize_unlock(c); BUG_ON(IS_ERR(b)); @@ -1578,7 +1596,7 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, bch2_journal_pin_drop(&c->journal, &w->journal); } -static void btree_node_write_done(struct bch_fs *c, struct btree *b) +static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); unsigned long old, new, v; @@ -1589,26 +1607,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) do { old = new = v; - if (old & (1U << BTREE_NODE_need_write)) - goto do_write; - - new &= ~(1U << BTREE_NODE_write_in_flight); - new &= ~(1U << BTREE_NODE_write_in_flight_inner); - } while ((v = cmpxchg(&b->flags, old, new)) != old); - - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); - return; - -do_write: - six_lock_read(&b->c.lock, NULL, NULL); - v = READ_ONCE(b->flags); - do { - old = new = v; - if ((old & (1U << BTREE_NODE_dirty)) && (old & (1U << BTREE_NODE_need_write)) && !(old & (1U << BTREE_NODE_never_write)) && - btree_node_may_write(b)) { + !(old & (1U << BTREE_NODE_write_blocked)) && + !(old & (1U << BTREE_NODE_will_make_reachable))) { new &= ~(1U << BTREE_NODE_dirty); new &= ~(1U << BTREE_NODE_need_write); new |= (1U << BTREE_NODE_write_in_flight); @@ -1622,8 +1625,15 @@ do_write: } while ((v = cmpxchg(&b->flags, old, new)) != old); if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, true); + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); + else + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} +static void btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + six_lock_read(&b->c.lock, NULL, NULL); + __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); } @@ -1738,7 +1748,7 @@ static void btree_write_submit(struct work_struct *work) bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); } -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) { struct btree_write_bio *wbio; struct bset_tree *t; @@ -1753,13 +1763,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta unsigned long old, new; bool validate_before_checksum = false; void *data; + int ret; - if (already_started) + if (flags & BTREE_WRITE_ALREADY_STARTED) goto do_write; - if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) - return; - /* * We may only have a read lock on the btree node - the dirty bit is our * "lock" against 
racing with other threads that may be trying to start @@ -1773,13 +1781,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta if (!(old & (1 << BTREE_NODE_dirty))) return; - if (!btree_node_may_write(b)) + if ((flags & BTREE_WRITE_ONLY_IF_NEED) && + !(old & (1 << BTREE_NODE_need_write))) return; - if (old & (1 << BTREE_NODE_never_write)) + if (old & + ((1 << BTREE_NODE_never_write)| + (1 << BTREE_NODE_write_blocked))) return; - BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); + if (b->written && + (old & (1 << BTREE_NODE_will_make_reachable))) + return; + + if (old & (1 << BTREE_NODE_write_in_flight)) + return; new &= ~(1 << BTREE_NODE_dirty); new &= ~(1 << BTREE_NODE_need_write); @@ -1875,7 +1891,7 @@ do_write: BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); - i->version = c->sb.version < bcachefs_metadata_version_new_versioning + i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ? cpu_to_le16(BCH_BSET_VERSION_OLD) : cpu_to_le16(c->sb.version); SET_BSET_OFFSET(i, b->written); @@ -1893,7 +1909,10 @@ do_write: validate_bset_for_write(c, b, i, sectors_to_write)) goto err; - bset_encrypt(c, i, b->written << 9); + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error encrypting btree node: %i\n", ret)) + goto err; nonce = btree_nonce(i, b->written << 9); @@ -1976,7 +1995,7 @@ err: b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); - btree_node_write_done(c, b); + __btree_node_write_done(c, b); } /* @@ -2039,12 +2058,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * Use this one if the node is intent locked: */ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - enum six_lock_type lock_type_held) + enum six_lock_type lock_type_held, + unsigned flags) { if (lock_type_held == SIX_LOCK_intent || (lock_type_held == SIX_LOCK_read && six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, flags); /* don't cycle lock unnecessarily: */ if (btree_node_just_written(b) && @@ -2056,7 +2076,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (lock_type_held == SIX_LOCK_read) six_lock_downgrade(&b->c.lock); } else { - __bch2_btree_node_write(c, b, false); + __bch2_btree_node_write(c, b, flags); if (lock_type_held == SIX_LOCK_write && btree_node_just_written(b)) bch2_btree_post_write_cleanup(c, b); @@ -2076,7 +2096,6 @@ restart: rcu_read_unlock(); wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); goto restart; - } rcu_read_unlock(); } @@ -2090,30 +2109,3 @@ void bch2_btree_flush_all_writes(struct bch_fs *c) { __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } - -void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) { - unsigned long flags = READ_ONCE(b->flags); - - if (!(flags & (1 << BTREE_NODE_dirty))) - continue; - - pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", - b, - (flags & (1 << BTREE_NODE_dirty)) != 0, - (flags & (1 << BTREE_NODE_need_write)) != 0, - b->c.level, - b->written, - !list_empty_careful(&b->write_blocked), - b->will_make_reachable != 0, - b->will_make_reachable & 1); - } - rcu_read_unlock(); -} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 0f20224e2a77..d818d87661e8 100644 --- a/fs/bcachefs/btree_io.h +++ 
b/fs/bcachefs/btree_io.h @@ -15,18 +15,13 @@ struct btree; struct btree_iter; struct btree_node_read_all; -static inline bool btree_node_dirty(struct btree *b) -{ - return test_bit(BTREE_NODE_dirty, &b->flags); -} - -static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) +static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) atomic_inc(&c->btree_cache.dirty); } -static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) +static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) atomic_dec(&c->btree_cache.dirty); @@ -67,12 +62,6 @@ void __bch2_btree_node_wait_on_write(struct btree *); void bch2_btree_node_wait_on_read(struct btree *); void bch2_btree_node_wait_on_write(struct btree *); -static inline bool btree_node_may_write(struct btree *b) -{ - return list_empty_careful(&b->write_blocked) && - (!b->written || !b->will_make_reachable); -} - enum compact_mode { COMPACT_LAZY, COMPACT_ALL, @@ -111,22 +100,25 @@ static inline struct nonce btree_nonce(struct bset *i, unsigned offset) }}; } -static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) { struct nonce nonce = btree_nonce(i, offset); + int ret; if (!offset) { struct btree_node *bn = container_of(i, struct btree_node, keys); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, - bytes); + ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &bn->flags, bytes); + if (ret) + return ret; nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); + return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); } void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); @@ -145,41 +137,23 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id, void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); -void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); +#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) +#define BTREE_WRITE_ALREADY_STARTED (1U << 1) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, - enum six_lock_type); + enum six_lock_type, unsigned); static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, enum six_lock_type lock_held) { - if (b->written && - btree_node_need_write(b) && - btree_node_may_write(b) && - !btree_node_write_in_flight(b)) - bch2_btree_node_write(c, b, lock_held); + bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } -#define bch2_btree_node_write_cond(_c, _b, cond) \ -do { \ - unsigned long old, new, v = READ_ONCE((_b)->flags); \ - \ - do { \ - old = new = v; \ - \ - if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ - break; \ - \ - new |= (1 << BTREE_NODE_need_write); \ - } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ - \ - btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ -} while (0) - void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); -void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); 
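The btree_io.h hunks above make two API changes: the bool argument of __bch2_btree_node_write() becomes a flags word (BTREE_WRITE_ONLY_IF_NEED, BTREE_WRITE_ALREADY_STARTED), and bset_encrypt() now returns an error code instead of void. A minimal call-site sketch, assuming the usual bcachefs build context; the demo_* helpers are hypothetical, only the functions, flags, and lock types named in the hunks are real:

/*
 * Sketch only (not part of the patch): call-site patterns for the
 * flags-based node write API and the int-returning bset_encrypt().
 */
static void demo_write_if_needed(struct bch_fs *c, struct btree *b)
{
	/* request a write only if the node was marked as needing one: */
	bch2_btree_node_write(c, b, SIX_LOCK_read, BTREE_WRITE_ONLY_IF_NEED);
}

static int demo_encrypt_bset(struct bch_fs *c, struct btree *b, struct bset *i)
{
	/* encryption can now fail, so the result must be checked: */
	int ret = bset_encrypt(c, i, b->written << 9);

	if (ret)
		bch_err(c, "error encrypting btree node: %i", ret);
	return ret;
}

btree_node_write_if_need() in the same hunk is exactly the first pattern, passing the caller's lock type through unchanged.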
static inline void compat_bformat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 65ab2cd64dde..25d254ee9eac 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "recovery.h" #include "replicas.h" #include "subvolume.h" @@ -19,7 +20,7 @@ #include <trace/events/bcachefs.h> static void btree_trans_verify_sorted(struct btree_trans *); -static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int); +inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, @@ -57,6 +58,9 @@ static inline int __btree_path_cmp(const struct btree_path *l, struct bpos r_pos, unsigned r_level) { + /* + * Must match lock ordering as defined by __bch2_btree_node_lock: + */ return cmp_int(l->btree_id, r_btree_id) ?: cmp_int((int) l->cached, (int) r_cached) ?: bpos_cmp(l->pos, r_pos) ?: @@ -161,7 +165,7 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) else this_cpu_sub(*b->c.lock.readers, readers); - btree_node_lock_type(trans->c, b, SIX_LOCK_write); + six_lock_write(&b->c.lock, NULL, NULL); if (!b->c.lock.readers) atomic64_add(__SIX_VAL(read_lock, readers), @@ -177,19 +181,25 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, int want = __btree_lock_want(path, level); if (!is_btree_node(path, level)) - return false; + goto fail; if (race_fault()) - return false; + goto fail; if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(path, level, want); + mark_btree_node_locked(trans, path, level, want); return true; - } else { - return false; } +fail: + trace_btree_node_relock_fail(trans->fn, _RET_IP_, + path->btree_id, + &path->pos, + (unsigned long) b, + path->l[level].lock_seq, + is_btree_node(path, level) ? 
b->c.lock.state.seq : 0); + return false; } bool bch2_btree_node_upgrade(struct btree_trans *trans, @@ -230,13 +240,13 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, return false; success: - mark_btree_node_intent_locked(path, level); + mark_btree_node_intent_locked(trans, path, level); return true; } static inline bool btree_path_get_locks(struct btree_trans *trans, struct btree_path *path, - bool upgrade, unsigned long trace_ip) + bool upgrade) { unsigned l = path->level; int fail_idx = -1; @@ -293,10 +303,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_path *linked, *deadlock_path = NULL; - u64 start_time = local_clock(); - unsigned reason = 9; - bool ret; + struct btree_path *linked; + unsigned reason; /* Check if it's safe to block: */ trans_for_each_path(trans, linked) { @@ -317,28 +325,28 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - deadlock_path = linked; reason = 1; + goto deadlock; } if (linked->btree_id != path->btree_id) { - if (linked->btree_id > path->btree_id) { - deadlock_path = linked; - reason = 3; - } - continue; + if (linked->btree_id < path->btree_id) + continue; + + reason = 3; + goto deadlock; } /* - * Within the same btree, cached paths come before non - * cached paths: + * Within the same btree, non-cached paths come before cached + * paths: */ if (linked->cached != path->cached) { - if (path->cached) { - deadlock_path = linked; - reason = 4; - } - continue; + if (!linked->cached) + continue; + + reason = 4; + goto deadlock; } /* @@ -347,50 +355,33 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - deadlock_path = linked; reason = 5; + goto deadlock; } /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, linked->cached)) <= 0) { - deadlock_path = linked; - reason = 7; BUG_ON(trans->in_traverse_all); + reason = 7; + goto deadlock; } } - if (unlikely(deadlock_path)) { - trace_trans_restart_would_deadlock(trans->ip, ip, - trans->in_traverse_all, reason, - deadlock_path->btree_id, - deadlock_path->cached, - &deadlock_path->pos, - path->btree_id, - path->cached, - &pos); - btree_trans_restart(trans); - return false; - } - - if (six_trylock_type(&b->c.lock, type)) - return true; - - trans->locking_path_idx = path->idx; - trans->locking_pos = pos; - trans->locking_btree_id = path->btree_id; - trans->locking_level = level; - trans->locking = b; - - ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; - - trans->locking = NULL; - - if (ret) - bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], - start_time); - return ret; + return btree_node_lock_type(trans, path, b, pos, level, + type, should_sleep_fn, p); +deadlock: + trace_trans_restart_would_deadlock(trans->fn, ip, + trans->in_traverse_all, reason, + linked->btree_id, + linked->cached, + &linked->pos, + path->btree_id, + path->cached, + &pos); + btree_trans_restart(trans); + return false; } /* Btree iterator locking: */ @@ -439,6 +430,8 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans, if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_path_intent(trans->fn, 
_RET_IP_, + path->btree_id, &path->pos); btree_trans_restart(trans); return false; } @@ -451,10 +444,13 @@ __flatten static bool bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - bool ret = btree_path_get_locks(trans, path, false, trace_ip); + bool ret = btree_path_get_locks(trans, path, false); - if (!ret) + if (!ret) { + trace_trans_restart_relock_path(trans->fn, trace_ip, + path->btree_id, &path->pos); btree_trans_restart(trans); + } return ret; } @@ -468,7 +464,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, path->locks_want = new_locks_want; - if (btree_path_get_locks(trans, path, true, _THIS_IP_)) + if (btree_path_get_locks(trans, path, true)) return true; /* @@ -490,14 +486,15 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, * before interior nodes - now that's handled by * bch2_btree_path_traverse_all(). */ - trans_for_each_path(trans, linked) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, _THIS_IP_); - } + if (!path->cached && !trans->in_traverse_all) + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_path_get_locks(trans, linked, true); + } return false; } @@ -547,7 +544,7 @@ bool bch2_trans_relock(struct btree_trans *trans) trans_for_each_path(trans, path) if (path->should_be_locked && !bch2_btree_path_relock(trans, path, _RET_IP_)) { - trace_trans_restart_relock(trans->ip, _RET_IP_, + trace_trans_restart_relock(trans->fn, _RET_IP_, path->btree_id, &path->pos); BUG_ON(!trans->restarted); return false; @@ -562,7 +559,12 @@ void bch2_trans_unlock(struct btree_trans *trans) trans_for_each_path(trans, path) __bch2_btree_path_unlock(path); - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + /* + * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking + * btree nodes, it implements its own walking: + */ + BUG_ON(!trans->is_initial_gc && + lock_class_is_held(&bch2_btree_node_lock_key)); } /* Btree iterator: */ @@ -593,7 +595,9 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_node_iter tmp; bool locked; struct bkey_packed *p, *k; - char buf1[100], buf2[100], buf3[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; const char *msg; if (!bch2_debug_check_iterators) @@ -641,26 +645,27 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, btree_node_unlock(path, level); return; err: - strcpy(buf2, "(none)"); - strcpy(buf3, "(none)"); - - bch2_bpos_to_text(&PBUF(buf1), path->pos); + bch2_bpos_to_text(&buf1, path->pos); if (p) { struct bkey uk = bkey_unpack_key(l->b, p); - bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bkey_to_text(&buf2, &uk); + } else { + pr_buf(&buf2, "(none)"); } if (k) { struct bkey uk = bkey_unpack_key(l->b, k); - bch2_bkey_to_text(&PBUF(buf3), &uk); + bch2_bkey_to_text(&buf3, &uk); + } else { + pr_buf(&buf3, "(none)"); } panic("path should be %s key at level %u:\n" "path pos %s\n" "prev key %s\n" "cur key %s\n", - msg, level, buf1, buf2, buf3); + msg, level, buf1.buf, buf2.buf, buf3.buf); } static void bch2_btree_path_verify(struct btree_trans *trans, @@ -700,9 +705,6 @@ static void bch2_btree_iter_verify(struct 
btree_iter *iter) BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && - iter->pos.snapshot != iter->snapshot); - BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); @@ -710,6 +712,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); + if (iter->update_path) + bch2_btree_path_verify(trans, iter->update_path); bch2_btree_path_verify(trans, iter->path); } @@ -759,16 +763,16 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (!bkey_cmp(prev.k->p, k.k->p) && bch2_snapshot_is_ancestor(trans->c, iter->snapshot, prev.k->p.snapshot) > 0) { - char buf1[100], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - bch2_bkey_to_text(&PBUF(buf1), k.k); - bch2_bkey_to_text(&PBUF(buf2), prev.k); + bch2_bkey_to_text(&buf1, k.k); + bch2_bkey_to_text(&buf2, prev.k); panic("iter snap %u\n" "k %s\n" "prev %s\n", iter->snapshot, - buf1, buf2); + buf1.buf, buf2.buf); } out: bch2_trans_iter_exit(trans, ©); @@ -780,7 +784,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, { struct btree_path *path; unsigned idx; - char buf[100]; + struct printbuf buf = PRINTBUF; trans_for_each_path_inorder(trans, path, idx) { int cmp = cmp_int(path->btree_id, id) ?: @@ -806,9 +810,10 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, } bch2_dump_trans_paths_updates(trans); + bch2_bpos_to_text(&buf, pos); + panic("not locked: %s %s%s\n", - bch2_btree_ids[id], - (bch2_bpos_to_text(&PBUF(buf), pos), buf), + bch2_btree_ids[id], buf.buf, key_cache ? " cached" : ""); } @@ -994,8 +999,6 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, struct bkey *u, struct bkey_packed *k) { - struct bkey_s_c ret; - if (unlikely(!k)) { /* * signal to bch2_btree_iter_peek_slot() that we're currently at @@ -1005,19 +1008,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, return bkey_s_c_null; } - ret = bkey_disassemble(l->b, k, u); - - /* - * XXX: bch2_btree_bset_insert_key() generates invalid keys when we - * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key - * being overwritten but doesn't change k->size. 
But this is ok, because - * those keys are never written out, we just have to avoid a spurious - * assertion here: - */ - if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(c, l->b, ret); - - return ret; + return bkey_disassemble(l->b, k, u); } static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, @@ -1077,6 +1068,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path, static void btree_path_verify_new_node(struct btree_trans *trans, struct btree_path *path, struct btree *b) { + struct bch_fs *c = trans->c; struct btree_path_level *l; unsigned plevel; bool parent_locked; @@ -1085,6 +1077,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans, if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) return; + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + plevel = b->c.level + 1; if (!btree_path_node(path, plevel)) return; @@ -1099,23 +1094,23 @@ static void btree_path_verify_new_node(struct btree_trans *trans, if (!k || bkey_deleted(k) || bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { - char buf1[100]; - char buf2[100]; - char buf3[100]; - char buf4[100]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; + struct printbuf buf4 = PRINTBUF; struct bkey uk = bkey_unpack_key(b, k); - bch2_dump_btree_node(trans->c, l->b); - bch2_bpos_to_text(&PBUF(buf1), path->pos); - bch2_bkey_to_text(&PBUF(buf2), &uk); - bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); - bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); + bch2_dump_btree_node(c, l->b); + bch2_bpos_to_text(&buf1, path->pos); + bch2_bkey_to_text(&buf2, &uk); + bch2_bpos_to_text(&buf3, b->data->min_key); + bch2_bpos_to_text(&buf3, b->data->max_key); panic("parent iter doesn't point to new node:\n" "iter pos %s %s\n" "iter key %s\n" "new node %s-%s\n", - bch2_btree_ids[path->btree_id], buf1, - buf2, buf3, buf4); + bch2_btree_ids[path->btree_id], + buf1.buf, buf2.buf, buf3.buf, buf4.buf); } if (!parent_locked) @@ -1173,7 +1168,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) t != BTREE_NODE_UNLOCKED) { btree_node_unlock(path, b->c.level); six_lock_increment(&b->c.lock, t); - mark_btree_node_locked(path, b->c.level, t); + mark_btree_node_locked(trans, path, b->c.level, t); } btree_path_level_init(trans, path, b); @@ -1250,7 +1245,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(path, path->level, lock_type); + mark_btree_node_locked(trans, path, path->level, lock_type); btree_path_level_init(trans, path, b); return 0; } @@ -1296,6 +1291,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat return ret; } +static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, + struct btree_and_journal_iter *jiter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 
1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_and_journal_iter_advance(jiter); + k = bch2_btree_and_journal_iter_peek(jiter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + path->level - 1); + } + + if (!was_locked) + btree_node_unlock(path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, struct btree_path *path, unsigned plevel, struct btree *b) @@ -1318,6 +1348,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, btree_node_unlock(path, plevel); } +static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + struct bkey_buf *out) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree_and_journal_iter jiter; + struct bkey_s_c k; + int ret = 0; + + __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + + k = bch2_btree_and_journal_iter_peek(&jiter); + + bch2_bkey_buf_reassemble(out, c, k); + + if (flags & BTREE_ITER_PREFETCH) + ret = btree_path_prefetch_j(trans, path, &jiter); + + bch2_btree_and_journal_iter_exit(&jiter); + return ret; +} + static __always_inline int btree_path_down(struct btree_trans *trans, struct btree_path *path, unsigned flags, @@ -1328,30 +1382,41 @@ static __always_inline int btree_path_down(struct btree_trans *trans, struct btree *b; unsigned level = path->level - 1; enum six_lock_type lock_type = __btree_lock_want(path, level); + bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); struct bkey_buf tmp; int ret; EBUG_ON(!btree_node_locked(path, path->level)); bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (unlikely(!replay_done)) { + ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + if (ret) + goto err; + } else { + bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (flags & BTREE_ITER_PREFETCH) { + ret = btree_path_prefetch(trans, path); + if (ret) + goto err; + } + } b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) goto err; - mark_btree_node_locked(path, level, lock_type); + mark_btree_node_locked(trans, path, level, lock_type); btree_path_level_init(trans, path, b); - if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(trans, path, level + 1, b); - if (flags & BTREE_ITER_PREFETCH) - ret = btree_path_prefetch(trans, path); - if (btree_node_read_locked(path, level + 1)) btree_node_unlock(path, level + 1); path->level = level; @@ -1365,12 +1430,12 @@ err: static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, unsigned, unsigned long); -static int __btree_path_traverse_all(struct btree_trans *trans, int ret, - unsigned long trace_ip) +static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_path *path; - int i; + unsigned long trace_ip = _RET_IP_; + int i, ret = 0; if (trans->in_traverse_all) return -EINTR; @@ -1378,6 +1443,7 @@ static int 
__btree_path_traverse_all(struct btree_trans *trans, int ret, trans->in_traverse_all = true; retry_all: trans->restarted = false; + trans->traverse_all_idx = U8_MAX; trans_for_each_path(trans, path) path->should_be_locked = false; @@ -1398,7 +1464,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); - if (unlikely(ret == -ENOMEM)) { + if (unlikely(trans->memory_allocation_failure)) { struct closure cl; closure_init_stack(&cl); @@ -1409,27 +1475,25 @@ retry_all: } while (ret); } - if (unlikely(ret == -EIO)) - goto out; - - BUG_ON(ret && ret != -EINTR); - /* Now, redo traversals in correct order: */ - i = 0; - while (i < trans->nr_sorted) { - path = trans->paths + trans->sorted[i]; + trans->traverse_all_idx = 0; + while (trans->traverse_all_idx < trans->nr_sorted) { + path = trans->paths + trans->sorted[trans->traverse_all_idx]; - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - - ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); - if (ret) - goto retry_all; - - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - - if (path->nodes_locked || - !btree_path_node(path, path->level)) - i++; + /* + * Traversing a path can cause another path to be added at about + * the same position: + */ + if (path->uptodate) { + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + if (ret == -EINTR || ret == -ENOMEM) + goto retry_all; + if (ret) + goto err; + BUG_ON(path->uptodate); + } else { + trans->traverse_all_idx++; + } } /* @@ -1439,20 +1503,15 @@ retry_all: */ trans_for_each_path(trans, path) BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); -out: +err: bch2_btree_cache_cannibalize_unlock(c); trans->in_traverse_all = false; - trace_trans_traverse_all(trans->ip, trace_ip); + trace_trans_traverse_all(trans->fn, trace_ip); return ret; } -static int bch2_btree_path_traverse_all(struct btree_trans *trans) -{ - return __btree_path_traverse_all(trans, 0, _RET_IP_); -} - static inline bool btree_path_good_node(struct btree_trans *trans, struct btree_path *path, unsigned l, int check_pos) @@ -1576,8 +1635,6 @@ out: return ret; } -static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); - int __must_check bch2_btree_path_traverse(struct btree_trans *trans, struct btree_path *path, unsigned flags) { @@ -1601,7 +1658,7 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, six_lock_increment(&dst->l[i].b->c.lock, __btree_lock_want(dst, i)); - btree_path_check_sort(trans, dst, 0); + bch2_btree_path_check_sort(trans, dst, 0); } static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, @@ -1629,11 +1686,12 @@ bch2_btree_path_make_mut(struct btree_trans *trans, btree_trans_verify_sorted(trans); } + path->should_be_locked = false; return path; } -static struct btree_path * __must_check -btree_path_set_pos(struct btree_trans *trans, +struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, bool intent, unsigned long ip) { @@ -1648,10 +1706,9 @@ btree_path_set_pos(struct btree_trans *trans, path = bch2_btree_path_make_mut(trans, path, intent, ip); - path->pos = new_pos; - path->should_be_locked = false; + path->pos = new_pos; - btree_path_check_sort(trans, path, cmp); + bch2_btree_path_check_sort(trans, path, cmp); if (unlikely(path->cached)) { btree_node_unlock(path, 0); @@ -1663,6 +1720,7 @@ btree_path_set_pos(struct btree_trans *trans, l = btree_path_up_until_good_node(trans, path, cmp); if (btree_path_node(path, l)) { + 
BUG_ON(!btree_node_locked(path, l)); /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1755,23 +1813,64 @@ free: __bch2_path_free(trans, path); } +void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + pr_buf(buf, "transaction updates for %s journal seq %llu", + trans->fn, trans->journal_res.seq); + pr_newline(buf); + pr_indent_push(buf, 2); + + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + + pr_buf(buf, "update: btree %s %pS", + bch2_btree_ids[i->btree_id], + (void *) i->ip_allocated); + pr_newline(buf); + + pr_buf(buf, " old "); + bch2_bkey_val_to_text(buf, trans->c, old); + pr_newline(buf); + + pr_buf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); + pr_newline(buf); + } + + pr_indent_pop(buf, 2); +} + +noinline __cold +void bch2_dump_trans_updates(struct btree_trans *trans) +{ + struct printbuf buf = PRINTBUF; + + bch2_trans_updates_to_text(&buf, trans); + bch_err(trans->c, "%s", buf.buf); + printbuf_exit(&buf); +} + noinline __cold void bch2_dump_trans_paths_updates(struct btree_trans *trans) { struct btree_path *path; - struct btree_insert_entry *i; + struct printbuf buf = PRINTBUF; unsigned idx; - char buf1[300], buf2[300]; - btree_trans_verify_sorted(trans); + trans_for_each_path_inorder(trans, path, idx) { + printbuf_reset(&buf); - trans_for_each_path_inorder(trans, path, idx) - printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", + bch2_bpos_to_text(&buf, path->pos); + + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", path->idx, path->ref, path->intent_ref, path->should_be_locked ? " S" : "", path->preserve ? 
" P" : "", bch2_btree_ids[path->btree_id], - (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), + path->level, + buf.buf, path->nodes_locked, #ifdef CONFIG_BCACHEFS_DEBUG (void *) path->ip_allocated @@ -1779,17 +1878,11 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) NULL #endif ); + } - trans_for_each_update(trans, i) { - struct bkey u; - struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); + printbuf_exit(&buf); - printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", - bch2_btree_ids[i->btree_id], - (void *) i->ip_allocated, - (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), - (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); - } + bch2_dump_trans_updates(trans); } static struct btree_path *btree_path_alloc(struct btree_trans *trans, @@ -1830,6 +1923,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, int i; BUG_ON(trans->restarted); + btree_trans_verify_sorted(trans); + bch2_trans_verify_locks(trans); trans_for_each_path_inorder(trans, path, i) { if (__btree_path_cmp(path, @@ -1847,7 +1942,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path_pos->btree_id == btree_id && path_pos->level == level) { __btree_path_get(path_pos, intent); - path = btree_path_set_pos(trans, path_pos, pos, intent, ip); + path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); } else { path = btree_path_alloc(trans, path_pos); path_pos = NULL; @@ -1887,7 +1982,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > path->locks_want) { path->locks_want = locks_want; - btree_path_get_locks(trans, path, true, _THIS_IP_); + btree_path_get_locks(trans, path, true); } return path; @@ -1898,13 +1993,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct struct bkey_s_c k; - BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); - if (!path->cached) { struct btree_path_level *l = path_l(path); - struct bkey_packed *_k = - bch2_btree_node_iter_peek_all(&l->iter, l->b); + struct bkey_packed *_k; + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); @@ -1914,13 +2009,17 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct } else { struct bkey_cached *ck = (void *) path->l[0].b; - EBUG_ON(path->btree_id != ck->key.btree_id || - bkey_cmp(path->pos, ck->key.pos)); + EBUG_ON(ck && + (path->btree_id != ck->key.btree_id || + bkey_cmp(path->pos, ck->key.pos))); - /* BTREE_ITER_CACHED_NOFILL? */ - if (unlikely(!ck->valid)) - goto hole; + /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? 
*/ + if (unlikely(!ck || !ck->valid)) + return bkey_s_c_null; + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + + *u = ck->k->k; k = bkey_i_to_s_c(ck->k); } @@ -1944,7 +2043,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - iter->path = btree_path_set_pos(iter->trans, iter->path, + iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, btree_iter_search_key(iter), iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -1981,7 +2080,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; - iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); iter->path->should_be_locked = true; @@ -2017,6 +2116,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) btree_node_unlock(path, path->level); path->l[path->level].b = BTREE_ITER_NO_NODE_UP; path->level++; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); return NULL; } @@ -2024,6 +2124,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) __bch2_btree_path_unlock(path); path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, + path->btree_id, &path->pos); btree_trans_restart(trans); ret = -EINTR; goto err; @@ -2041,7 +2144,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * the next child node */ path = iter->path = - btree_path_set_pos(trans, path, bpos_successor(iter->pos), + bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2064,7 +2167,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; - iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); iter->path->should_be_locked = true; @@ -2107,24 +2210,88 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } -/** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position +static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if ((cmp_int(btree_id, i->btree_id) ?: + bpos_cmp(pos, i->k->k.p)) <= 0) { + if (btree_id == i->btree_id) + return i->k; + break; + } + + return NULL; +} + +static noinline +struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *next_journal = + bch2_journal_keys_peek(trans->c, iter->btree_id, 0, + iter->path->pos); + + if (next_journal && + bpos_cmp(next_journal->k.p, + k.k ? 
k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + iter->k = next_journal->k; + k = bkey_i_to_s_c(next_journal); + } + + return k; +} + +/* + * Checks btree key cache for key at iter->pos and returns it if present, or + * bkey_s_c_null: */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey u; + int ret; + + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) + return bkey_s_c_null; + + if (!iter->key_cache_path) + iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, + iter->flags & BTREE_ITER_INTENT, 0, + iter->flags|BTREE_ITER_CACHED, + _THIS_IP_); + + iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + iter->key_cache_path->should_be_locked = true; + + return bch2_btree_path_peek_slot(iter->key_cache_path, &u); +} + +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; - struct bkey_s_c k; - int ret, cmp; + struct bkey_s_c k, k2; + int ret; EBUG_ON(iter->path->cached || iter->path->level); bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); while (1) { - iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2136,19 +2303,30 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) goto out; } - next_update = iter->flags & BTREE_ITER_WITH_UPDATES - ? btree_trans_peek_updates(trans, iter->btree_id, search_key) - : NULL; + iter->path->should_be_locked = true; + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); - /* * In the btree, deleted keys sort before non deleted: */ - if (k.k && bkey_deleted(k.k) && - (!next_update || - bpos_cmp(k.k->p, next_update->k.p) <= 0)) { - search_key = k.k->p; - continue; + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + ret = bkey_err(k2); + if (ret) { + k = k2; + bch2_btree_iter_set_pos(iter, iter->pos); + goto out; + } + + k = k2; + iter->k = *k.k; } + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + k = btree_trans_peek_journal(trans, iter, k); + + next_update = iter->flags & BTREE_ITER_WITH_UPDATES + ? btree_trans_peek_updates(trans, iter->btree_id, search_key) + : NULL; if (next_update && bpos_cmp(next_update->k.p, k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { @@ -2156,25 +2334,21 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) k = bkey_i_to_s_c(next_update); } - if (likely(k.k)) { + if (k.k && bkey_deleted(k.k)) { /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: + * If we've got a whiteout, and it's after the search + * key, advance the search key to the whiteout instead + * of just after the whiteout - it might be a btree + * whiteout, with a real key at the same position, since + * in the btree deleted keys sort before non deleted. 
*/ - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && - !bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - search_key = bpos_successor(k.k->p); - continue; - } - - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { - search_key = bkey_successor(iter, k.k->p); - continue; - } + search_key = bpos_cmp(search_key, k.k->p) + ? k.k->p + : bpos_successor(k.k->p); + continue; + } + if (likely(k.k)) { break; } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ @@ -2186,35 +2360,137 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) goto out; } } +out: + bch2_btree_iter_verify(iter); - /* - * iter->pos should be mononotically increasing, and always be equal to - * the key we just returned - except extents can straddle iter->pos: - */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) - iter->pos = k.k->p; - else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - iter->pos = bkey_start_pos(k.k); + return k; +} - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) - iter->pos.snapshot = iter->snapshot; +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; + struct bpos iter_pos; + int ret; - cmp = bpos_cmp(k.k->p, iter->path->pos); - if (cmp) { - iter->path = bch2_btree_path_make_mut(trans, iter->path, - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); - iter->path->pos = k.k->p; - btree_path_check_sort(trans, iter->path, cmp); + if (iter->update_path) { + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + + bch2_btree_iter_verify_entry_exit(iter); + + while (1) { + k = __bch2_btree_iter_peek(iter, search_key); + if (!k.k || bkey_err(k)) + goto out; + + /* + * iter->pos should be mononotically increasing, and always be + * equal to the key we just returned - except extents can + * straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter_pos = k.k->p; + else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter_pos = bkey_start_pos(k.k); + else + iter_pos = iter->pos; + + if (bkey_cmp(iter_pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out; + } + + if (iter->update_path && + bkey_cmp(iter->update_path->pos, k.k->p)) { + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + (iter->flags & BTREE_ITER_INTENT) && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && + !iter->update_path) { + struct bpos pos = k.k->p; + + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } + + pos.snapshot = iter->snapshot; + + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + iter->update_path = iter->path; + + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + } + + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + 
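These hunks rework bch2_btree_iter_peek() into bch2_btree_iter_peek_upto(), which stops at a caller-supplied end position; the old name becomes an inline wrapper that passes SPOS_MAX, and a matching for_each_btree_key_upto_norestart() macro is added in btree_iter.h. A minimal sketch of iterating a key range with the new interface, assuming the usual bcachefs context; demo_count_extents() and its parameters are hypothetical, the iterator calls and macro are the ones added by this series:

/*
 * Sketch only (not part of the patch): walk every extent key of one
 * inode/snapshot using the ranged peek.  The _norestart variant does not
 * retry internally, so the caller handles -EINTR (transaction restart).
 */
static int demo_count_extents(struct btree_trans *trans,
			      u64 inum, u32 snapshot, u64 *nr)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	*nr = 0;

	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
					  SPOS(inum, 0, snapshot),
					  SPOS(inum, U64_MAX, snapshot),
					  0, k, ret)
		(*nr)++;

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

Because this uses the _norestart form, a restart surfaces to the caller as ret == -EINTR rather than being retried by the loop itself.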
search_key = bpos_successor(k.k->p); + continue; + } + + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } + + break; } + + iter->pos = iter_pos; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + BUG_ON(!iter->path->nodes_locked); out: + if (iter->update_path) { + if (iter->update_path->uptodate && + !bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) { + k = bkey_s_c_err(-EINTR); + } else { + BUG_ON(!(iter->update_path->nodes_locked & 1)); + iter->update_path->should_be_locked = true; + } + } iter->path->should_be_locked = true; - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + iter->pos.snapshot = iter->snapshot; + ret = bch2_btree_iter_verify_ret(iter, k); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (unlikely(ret)) { + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + } + + bch2_btree_iter_verify_entry_exit(iter); return k; } @@ -2247,6 +2523,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) EBUG_ON(iter->path->cached || iter->path->level); EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + + if (iter->flags & BTREE_ITER_WITH_JOURNAL) + return bkey_s_c_err(-EIO); + bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -2254,7 +2534,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) search_key.snapshot = U32_MAX; while (1) { - iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2275,7 +2555,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k = btree_path_level_prev(trans->c, iter->path, &iter->path->l[0], &iter->k); - btree_path_check_sort(trans, iter->path, 0); + bch2_btree_path_check_sort(trans, iter->path, 0); if (likely(k.k)) { if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { @@ -2385,7 +2665,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } search_key = btree_iter_search_key(iter); - iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2397,25 +2677,44 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { struct bkey_i *next_update; - next_update = iter->flags & BTREE_ITER_WITH_UPDATES - ? 
btree_trans_peek_updates(trans, iter->btree_id, search_key) - : NULL; + if ((iter->flags & BTREE_ITER_WITH_UPDATES) && + (next_update = btree_trans_peek_updates(trans, + iter->btree_id, search_key)) && + !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + goto out; + } - if (next_update && + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + (next_update = bch2_journal_keys_peek(trans->c, iter->btree_id, + 0, iter->pos)) && !bpos_cmp(next_update->k.p, iter->pos)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); - } else { - k = bch2_btree_path_peek_slot(iter->path, &iter->k); + goto out; + } + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + if (!bkey_err(k)) + iter->k = *k.k; + goto out; } + + k = bch2_btree_path_peek_slot(iter->path, &iter->k); } else { struct bpos next; if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; + struct bpos end = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + end.offset = U64_MAX; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek(&iter2); + k = bch2_btree_iter_peek_upto(&iter2, end); if (k.k && !bkey_err(k)) { iter->k = iter2.k; @@ -2437,18 +2736,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (bkey_cmp(iter->pos, next) < 0) { bkey_init(&iter->k); iter->k.p = iter->pos; - bch2_key_resize(&iter->k, - min_t(u64, KEY_SIZE_MAX, - (next.inode == iter->pos.inode - ? next.offset - : KEY_OFFSET_MAX) - - iter->pos.offset)); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, + (next.inode == iter->pos.inode + ? next.offset + : KEY_OFFSET_MAX) - + iter->pos.offset)); + EBUG_ON(!iter->k.size); + } k = (struct bkey_s_c) { &iter->k, NULL }; - EBUG_ON(!k.k->size); } } - +out: iter->path->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); @@ -2503,7 +2805,10 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) unsigned i; trans_for_each_path_inorder(trans, path, i) { - BUG_ON(prev && btree_path_cmp(prev, path) > 0); + if (prev && btree_path_cmp(prev, path) > 0) { + bch2_dump_trans_paths_updates(trans); + panic("trans paths out of order!\n"); + } prev = path; } #endif @@ -2520,8 +2825,8 @@ static inline void btree_path_swap(struct btree_trans *trans, btree_path_verify_sorted_ref(trans, r); } -static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, - int cmp) +inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, + int cmp) { struct btree_path *n; @@ -2577,6 +2882,11 @@ static inline void btree_path_list_add(struct btree_trans *trans, path->sorted_idx = pos ? 
pos->sorted_idx + 1 : 0; + if (trans->in_traverse_all && + trans->traverse_all_idx != U8_MAX && + trans->traverse_all_idx >= path->sorted_idx) + trans->traverse_all_idx++; + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); for (i = path->sorted_idx; i < trans->nr_sorted; i++) @@ -2590,7 +2900,15 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) if (iter->path) bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_INTENT); + if (iter->update_path) + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + if (iter->key_cache_path) + bch2_path_put(trans, iter->key_cache_path, + iter->flags & BTREE_ITER_INTENT); iter->path = NULL; + iter->update_path = NULL; + iter->key_cache_path = NULL; } static void __bch2_trans_iter_init(struct btree_trans *trans, @@ -2615,8 +2933,19 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, btree_type_has_snapshots(btree_id)) flags |= BTREE_ITER_FILTER_SNAPSHOTS; + if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) + flags |= BTREE_ITER_WITH_JOURNAL; + + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + iter->trans = trans; iter->path = NULL; + iter->update_path = NULL; + iter->key_cache_path = NULL; iter->btree_id = btree_id; iter->min_depth = depth; iter->flags = flags; @@ -2665,6 +2994,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) *dst = *src; if (src->path) __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + if (src->update_path) + __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = NULL; } void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) @@ -2693,7 +3025,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); + trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -2727,8 +3059,7 @@ void bch2_trans_begin(struct btree_trans *trans) trans->mem_top = 0; trans->hooks = NULL; - trans->extra_journal_entries = NULL; - trans->extra_journal_entry_u64s = 0; + trans->extra_journal_entries.nr = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; @@ -2741,13 +3072,21 @@ void bch2_trans_begin(struct btree_trans *trans) path->should_be_locked = false; /* + * If the transaction wasn't restarted, we're presuming to be + * doing something new: dont keep iterators excpt the ones that + * are in use - except for the subvolumes btree: + */ + if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) + path->preserve = false; + + /* * XXX: we probably shouldn't be doing this if the transaction * was restarted, but currently we still overflow transaction * iterators if we do that */ if (!path->ref && !path->preserve) __bch2_path_free(trans, path); - else if (!path->ref) + else path->preserve = false; } @@ -2777,14 +3116,17 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) trans->updates = p; p += updates_bytes; } -void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, - unsigned expected_nr_iters, - size_t expected_mem_bytes) +void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes, + const char 
*fn) __acquires(&c->btree_trans_barrier) { + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + memset(trans, 0, sizeof(*trans)); trans->c = c; - trans->ip = _RET_IP_; + trans->fn = fn; bch2_trans_alloc_paths(trans, c); @@ -2817,7 +3159,7 @@ static void check_btree_paths_leaked(struct btree_trans *trans) goto leaked; return; leaked: - bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip); + bch_err(c, "btree paths leaked from %s!", trans->fn); trans_for_each_path(trans, path) if (path->ref) printk(KERN_ERR " btree %s %pS\n", @@ -2850,6 +3192,8 @@ void bch2_trans_exit(struct btree_trans *trans) bch2_journal_preres_put(&c->journal, &trans->journal_preres); + kfree(trans->extra_journal_entries.data); + if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) @@ -2903,6 +3247,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) struct btree_trans *trans; struct btree_path *path; struct btree *b; + static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; mutex_lock(&c->btree_trans_lock); @@ -2910,7 +3255,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) if (!trans_has_locks(trans)) continue; - pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); + pr_buf(out, "%i %s\n", trans->pid, trans->fn); trans_for_each_path(trans, path) { if (!path->nodes_locked) @@ -2939,10 +3284,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { path = &trans->paths[trans->locking_path_idx]; - pr_buf(out, " locking path %u %c l=%u %s:", + pr_buf(out, " locking path %u %c l=%u %c %s:", trans->locking_path_idx, path->cached ? 'c' : 'b', trans->locking_level, + lock_types[trans->locking_lock_type], bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4c903b9dd716..f6700295e1a7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -50,11 +50,6 @@ static inline struct btree *btree_node_parent(struct btree_path *path, return btree_path_node(path, b->c.level + 1); } -static inline int btree_iter_err(const struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; -} - /* Iterate over paths within a transaction: */ static inline struct btree_path * @@ -75,6 +70,8 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) return &trans->paths[idx]; } +void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); + #define trans_for_each_path(_trans, _path) \ for (_path = __trans_next_path((_trans), 0); \ (_path); \ @@ -132,6 +129,9 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, struct btree_path * __must_check bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool, unsigned long); +struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, + struct bpos, bool, unsigned long); int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, @@ -209,9 +209,14 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_upto(iter, SPOS_MAX); +} + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); @@ -222,11 +227,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); bool bch2_btree_iter_advance(struct btree_iter *); bool bch2_btree_iter_rewind(struct btree_iter *); -static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) - new_pos.snapshot = iter->snapshot; - iter->k.type = KEY_TYPE_deleted; iter->k.p.inode = iter->pos.inode = new_pos.inode; iter->k.p.offset = iter->pos.offset = new_pos.offset; @@ -234,6 +236,19 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->k.size = 0; } +static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + if (unlikely(iter->update_path)) + bch2_path_put(iter->trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + + __bch2_btree_iter_set_pos(iter, new_pos); +} + static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) { BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); @@ -295,14 +310,27 @@ static inline int bkey_err(struct bkey_s_c k) return PTR_ERR_OR_ZERO(k.k); } -static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, - unsigned flags) +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, + unsigned flags) { return flags & BTREE_ITER_SLOTS ? 
bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } +static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + if (!(flags & BTREE_ITER_SLOTS)) + return bch2_btree_iter_peek_upto(iter, end); + + if (bkey_cmp(iter->pos, end) > 0) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + static inline int btree_trans_too_many_iters(struct btree_trans *trans) { return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 @@ -316,7 +344,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, struct bkey_s_c k; while (btree_trans_too_many_iters(trans) || - (k = __bch2_btree_iter_peek(iter, flags), + (k = bch2_btree_iter_peek_type(iter, flags), bkey_err(k) == -EINTR)) bch2_trans_begin(trans); @@ -335,7 +363,15 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) @@ -347,16 +383,21 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for (; \ - (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) /* new multiple iterator interface: */ +void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); -void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); +void __bch2_trans_init(struct btree_trans *, struct bch_fs *, + unsigned, size_t, const char *); void bch2_trans_exit(struct btree_trans *); +#define bch2_trans_init(...) 
__bch2_trans_init(__VA_ARGS__, __func__) + void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_btree_iter_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 230a920ae32a..f5a942b6bbf7 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -146,28 +146,32 @@ bkey_cached_reuse(struct btree_key_cache *c) } static struct bkey_cached * -btree_key_cache_create(struct btree_key_cache *c, +btree_key_cache_create(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) { + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; bool was_new = true; - ck = bkey_cached_alloc(c); + ck = bkey_cached_alloc(bc); if (unlikely(!ck)) { - ck = bkey_cached_reuse(c); - if (unlikely(!ck)) + ck = bkey_cached_reuse(bc); + if (unlikely(!ck)) { + bch_err(c, "error allocating memory for key cache item, btree %s", + bch2_btree_ids[btree_id]); return ERR_PTR(-ENOMEM); + } was_new = false; + } else { + if (btree_id == BTREE_ID_subvolumes) + six_lock_pcpu_alloc(&ck->c.lock); + else + six_lock_pcpu_free(&ck->c.lock); } - if (btree_id == BTREE_ID_subvolumes) - six_lock_pcpu_alloc(&ck->c.lock); - else - six_lock_pcpu_free(&ck->c.lock); - ck->c.level = 0; ck->c.btree_id = btree_id; ck->key.btree_id = btree_id; @@ -175,7 +179,7 @@ btree_key_cache_create(struct btree_key_cache *c, ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (unlikely(rhashtable_lookup_insert_fast(&c->table, + if (unlikely(rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params))) { /* We raced with another fill: */ @@ -185,15 +189,15 @@ btree_key_cache_create(struct btree_key_cache *c, six_unlock_intent(&ck->c.lock); kfree(ck); } else { - mutex_lock(&c->lock); - bkey_cached_free(c, ck); - mutex_unlock(&c->lock); + mutex_lock(&bc->lock); + bkey_cached_free(bc, ck); + mutex_unlock(&bc->lock); } return NULL; } - atomic_long_inc(&c->nr_keys); + atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); @@ -204,21 +208,24 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_iter iter; + struct btree_path *path; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; + struct bkey u; int ret; - bch2_trans_iter_init(trans, &iter, ck->key.btree_id, - ck->key.pos, BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + path = bch2_path_get(trans, ck->key.btree_id, + ck->key.pos, 0, 0, 0, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, path, 0); if (ret) goto err; + k = bch2_btree_path_peek_slot(path, &u); + if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); + trace_trans_restart_relock_key_cache_fill(trans->fn, + _THIS_IP_, ck_path->btree_id, &ck_path->pos); ret = btree_trans_restart(trans); goto err; } @@ -233,6 +240,8 @@ static int btree_key_cache_fill(struct btree_trans *trans, new_u64s = roundup_pow_of_two(new_u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[ck->key.btree_id], new_u64s); ret = -ENOMEM; goto err; } @@ -254,9 +263,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(&iter); + path->preserve = false; err: - bch2_trans_iter_exit(trans, 
&iter); + bch2_path_put(trans, path, 0); return ret; } @@ -293,15 +302,14 @@ retry: return 0; } - ck = btree_key_cache_create(&c->btree_key_cache, - path->btree_id, path->pos); + ck = btree_key_cache_create(c, path->btree_id, path->pos); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; if (!ck) goto retry; - mark_btree_node_locked(path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -312,7 +320,6 @@ retry: if (!trans->restarted) goto retry; - trace_transaction_restart_ip(trans->ip, _THIS_IP_); ret = -EINTR; goto err; } @@ -323,7 +330,7 @@ retry: goto retry; } - mark_btree_node_locked(path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, lock_want); } path->l[0].lock_seq = ck->c.lock.state.seq; @@ -332,7 +339,7 @@ fill: if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); + trace_transaction_restart_ip(trans->fn, _THIS_IP_); ret = btree_trans_restart(trans); goto err; } @@ -378,21 +385,27 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_CACHED_NOFILL| BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); + b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + ret = bch2_btree_iter_traverse(&c_iter); if (ret) goto out; ck = (void *) c_iter.path->l[0].b; - if (!ck || - (journal_seq && ck->journal.seq != journal_seq)) + if (!ck) goto out; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - if (!evict) - goto out; - goto evict; + if (evict) + goto evict; + goto out; } + BUG_ON(!ck->valid); + + if (journal_seq && ck->journal.seq != journal_seq) + goto out; + /* * Since journal reclaim depends on us making progress here, and the * allocator/copygc depend on journal reclaim making progress, we need @@ -400,6 +413,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * */ ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_KEY_CACHE_RECLAIM| BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, @@ -407,7 +421,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) - ? BTREE_INSERT_JOURNAL_RESERVED + ? 
JOURNAL_WATERMARK_reserved : 0)| commit_flags); if (ret) { @@ -541,14 +555,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, return true; } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, - enum btree_id id, struct bpos pos) -{ - BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); -} -#endif - static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 0768ef3ca776..fd29c14c5626 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty && - test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); + return nr_dirty > max_dirty; } int bch2_btree_key_cache_journal_flush(struct journal *, @@ -33,14 +32,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *, struct btree_path *, struct bkey_i *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_key_cache_verify_clean(struct btree_trans *, - enum btree_id, struct bpos); -#else -static inline void -bch2_btree_key_cache_verify_clean(struct btree_trans *trans, - enum btree_id id, struct bpos pos) {} -#endif void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index d599008c5fc1..67c970d727ac 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -58,7 +58,8 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, path->nodes_intent_locked &= ~(1 << level); } -static inline void mark_btree_node_locked(struct btree_path *path, +static inline void mark_btree_node_locked(struct btree_trans *trans, + struct btree_path *path, unsigned level, enum six_lock_type type) { @@ -66,14 +67,17 @@ static inline void mark_btree_node_locked(struct btree_path *path, BUILD_BUG_ON(SIX_LOCK_read != 0); BUILD_BUG_ON(SIX_LOCK_intent != 1); + BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx); + path->nodes_locked |= 1 << level; path->nodes_intent_locked |= type << level; } -static inline void mark_btree_node_intent_locked(struct btree_path *path, +static inline void mark_btree_node_intent_locked(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - mark_btree_node_locked(path, level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); } static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -128,23 +132,35 @@ static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) } } -/* - * wrapper around six locks that just traces lock contended time - */ -static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, - enum six_lock_type type) +static inline bool btree_node_lock_type(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) { - u64 start_time = local_clock(); + struct bch_fs *c = trans->c; + u64 start_time; + bool ret; - six_lock_type(&b->c.lock, type, NULL, NULL); - 
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); -} + if (six_trylock_type(&b->c.lock, type)) + return true; -static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, - enum six_lock_type type) -{ - if (!six_trylock_type(&b->c.lock, type)) - __btree_node_lock_type(c, b, type); + start_time = local_clock(); + + trans->locking_path_idx = path->idx; + trans->locking_pos = pos; + trans->locking_btree_id = path->btree_id; + trans->locking_level = level; + trans->locking_lock_type = type; + trans->locking = b; + ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; + trans->locking = NULL; + + if (ret) + bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); + + return ret; } /* diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c84bba7bcda5..3438e089dba0 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -8,6 +8,7 @@ #include "bkey_methods.h" #include "buckets_types.h" +#include "darray.h" #include "journal_types.h" struct open_bucket; @@ -152,7 +153,8 @@ struct btree_cache { struct mutex lock; struct list_head live; struct list_head freeable; - struct list_head freed; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; /* Number of elements in live + freeable lists */ unsigned used; @@ -202,15 +204,16 @@ struct btree_node_iter { */ #define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_NOT_EXTENTS (1 << 5) -#define BTREE_ITER_ERROR (1 << 6) -#define BTREE_ITER_CACHED (1 << 7) -#define BTREE_ITER_CACHED_NOFILL (1 << 8) -#define BTREE_ITER_CACHED_NOCREATE (1 << 9) +#define BTREE_ITER_CACHED (1 << 6) +#define BTREE_ITER_CACHED_NOFILL (1 << 7) +#define BTREE_ITER_CACHED_NOCREATE (1 << 8) +#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) #define BTREE_ITER_WITH_UPDATES (1 << 10) -#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) -#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) -#define BTREE_ITER_NOPRESERVE (1 << 14) +#define BTREE_ITER_WITH_JOURNAL (1 << 11) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) +#define BTREE_ITER_NOPRESERVE (1 << 15) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -275,6 +278,8 @@ static inline struct btree_path_level *path_l(struct btree_path *path) struct btree_iter { struct btree_trans *trans; struct btree_path *path; + struct btree_path *update_path; + struct btree_path *key_cache_path; enum btree_id btree_id:4; unsigned min_depth:4; @@ -322,7 +327,7 @@ struct bkey_cached { struct btree_bkey_cached_common c; unsigned long flags; - u8 u64s; + u16 u64s; bool valid; u32 btree_trans_barrier_seq; struct bkey_cached_key key; @@ -340,12 +345,20 @@ struct btree_insert_entry { unsigned flags; u8 bkey_type; enum btree_id btree_id:8; - u8 level; + u8 level:4; bool cached:1; bool insert_trigger_run:1; bool overwrite_trigger_run:1; + /* + * @old_k may be a key from the journal; @old_btree_u64s always refers + * to the size of the key being overwritten in the btree: + */ + u8 old_btree_u64s; struct bkey_i *k; struct btree_path *path; + /* key being overwritten: */ + struct bkey old_k; + const struct bch_val *old_v; unsigned long ip_allocated; }; @@ -367,21 +380,26 @@ struct btree_trans_commit_hook { struct btree_trans { struct bch_fs *c; + const char *fn; struct list_head list; struct btree *locking; unsigned locking_path_idx; struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; + u8 locking_lock_type; pid_t pid; - 
unsigned long ip; int srcu_idx; u8 nr_sorted; u8 nr_updates; + u8 traverse_all_idx; bool used_mempool:1; bool in_traverse_all:1; bool restarted:1; + bool memory_allocation_failure:1; + bool journal_transaction_names:1; + bool is_initial_gc:1; /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: @@ -400,8 +418,7 @@ struct btree_trans { /* update path: */ struct btree_trans_commit_hook *hooks; - struct jset_entry *extra_journal_entries; - unsigned extra_journal_entry_u64s; + DARRAY(u64) extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; @@ -414,7 +431,31 @@ struct btree_trans { struct replicas_delta_list *fs_usage_deltas; }; -#define BTREE_FLAG(flag) \ +#define BTREE_FLAGS() \ + x(read_in_flight) \ + x(read_error) \ + x(dirty) \ + x(need_write) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(noevict) \ + x(write_idx) \ + x(accessed) \ + x(write_in_flight) \ + x(write_in_flight_inner) \ + x(just_written) \ + x(dying) \ + x(fake) \ + x(need_rewrite) \ + x(never_write) + +enum btree_flags { +#define x(flag) BTREE_NODE_##flag, + BTREE_FLAGS() +#undef x +}; + +#define x(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ \ @@ -424,36 +465,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ static inline void clear_btree_node_ ## flag(struct btree *b) \ { clear_bit(BTREE_NODE_ ## flag, &b->flags); } -enum btree_flags { - BTREE_NODE_read_in_flight, - BTREE_NODE_read_error, - BTREE_NODE_dirty, - BTREE_NODE_need_write, - BTREE_NODE_noevict, - BTREE_NODE_write_idx, - BTREE_NODE_accessed, - BTREE_NODE_write_in_flight, - BTREE_NODE_write_in_flight_inner, - BTREE_NODE_just_written, - BTREE_NODE_dying, - BTREE_NODE_fake, - BTREE_NODE_need_rewrite, - BTREE_NODE_never_write, -}; - -BTREE_FLAG(read_in_flight); -BTREE_FLAG(read_error); -BTREE_FLAG(need_write); -BTREE_FLAG(noevict); -BTREE_FLAG(write_idx); -BTREE_FLAG(accessed); -BTREE_FLAG(write_in_flight); -BTREE_FLAG(write_in_flight_inner); -BTREE_FLAG(just_written); -BTREE_FLAG(dying); -BTREE_FLAG(fake); -BTREE_FLAG(need_rewrite); -BTREE_FLAG(never_write); +BTREE_FLAGS() +#undef x static inline struct btree_write *btree_current_write(struct btree *b) { @@ -583,24 +596,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b) return __btree_node_type(b->c.level, b->c.btree_id); } -static inline bool btree_node_type_is_extents(enum btree_node_type type) -{ - switch (type) { - case BKEY_TYPE_extents: - case BKEY_TYPE_reflink: - return true; - default: - return false; - } -} - -static inline bool btree_node_is_extents(struct btree *b) -{ - return btree_node_type_is_extents(btree_node_type(b)); -} - #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_alloc)| \ (1U << BKEY_TYPE_inodes)| \ (1U << BKEY_TYPE_stripes)| \ (1U << BKEY_TYPE_reflink)| \ @@ -616,6 +614,16 @@ static inline bool btree_node_is_extents(struct btree *b) (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) +#define BTREE_ID_IS_EXTENTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)| \ + (1U << BTREE_ID_freespace)) + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + return (1U << type) & BTREE_ID_IS_EXTENTS; +} + #define BTREE_ID_HAS_SNAPSHOTS \ ((1U << BTREE_ID_extents)| \ (1U << BTREE_ID_inodes)| \ @@ -633,6 +641,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id) enum btree_update_flags { 
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -645,6 +654,7 @@ enum btree_update_flags { }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) @@ -659,6 +669,7 @@ enum btree_update_flags { ((1U << KEY_TYPE_alloc)| \ (1U << KEY_TYPE_alloc_v2)| \ (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_alloc_v4)| \ (1U << KEY_TYPE_stripe)| \ (1U << KEY_TYPE_inode)| \ (1U << KEY_TYPE_inode_v2)| \ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 89f07e58f61b..ad13b0739a68 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -16,12 +16,12 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { - __BTREE_INSERT_NOFAIL, + /* First two bits for journal watermark: */ + __BTREE_INSERT_NOFAIL = 2, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, - __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_JOURNAL_RECLAIM, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, @@ -41,9 +41,6 @@ enum btree_insert_flags { /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) -/* Indicates that we have pre-reserved space in the journal: */ -#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) - /* Insert is being called from journal reclaim path: */ #define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) @@ -63,7 +60,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, u64 *); + struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); @@ -73,12 +70,18 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, bool); +int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); + int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); + void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); +int bch2_trans_log_msg(struct btree_trans *, const char *); + /** * bch2_trans_commit - insert keys at given iterator positions * @@ -135,21 +138,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) - if ((cmp_int(btree_id, i->btree_id) ?: - bpos_cmp(pos, i->k->k.p)) <= 0) { - if (btree_id == i->btree_id) - return i->k; - break; - } - - return NULL; -} - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c 
b/fs/bcachefs/btree_update_interior.c index 6872e56b5c41..42ae3b0c5839 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -16,6 +16,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -40,11 +41,11 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) struct bkey_s_c k; struct bkey_s_c_btree_ptr_v2 bp; struct bkey unpacked; - char buf1[100], buf2[100]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; BUG_ON(!b->c.level); - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) return; bch2_btree_node_iter_init_from_start(&iter, b); @@ -57,9 +58,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) if (bpos_cmp(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); - panic("expected next min_key %s got %s\n", - (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), - (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); + bch2_bpos_to_text(&buf1, next_node); + bch2_bpos_to_text(&buf2, bp.v->min_key); + panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); } bch2_btree_node_iter_advance(&iter, b); @@ -67,9 +68,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) if (bch2_btree_node_iter_end(&iter)) { if (bpos_cmp(k.k->p, b->key.k.p)) { bch2_dump_btree_node(c, b); - panic("expected end %s got %s\n", - (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), - (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); + bch2_bpos_to_text(&buf1, b->key.k.p); + bch2_bpos_to_text(&buf2, k.k->p); + panic("expected end %s got %s\n", buf1.buf, buf2.buf); } break; } @@ -180,6 +181,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct disk_reservation *res, struct closure *cl, + bool interior_node, unsigned flags) { struct write_point *wp; @@ -192,10 +194,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; - alloc_reserve = RESERVE_BTREE_MOVINGGC; + alloc_reserve = RESERVE_btree_movinggc; } else { nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = RESERVE_BTREE; + alloc_reserve = RESERVE_btree; } mutex_lock(&c->btree_reserve_cache_lock); @@ -241,7 +243,9 @@ retry: bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); mem_alloc: - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, interior_node); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); /* we hold cannibalize_lock: */ BUG_ON(IS_ERR(b)); @@ -257,15 +261,19 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev { struct bch_fs *c = as->c; struct btree *b; + struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; int ret; BUG_ON(level >= BTREE_MAX_DEPTH); - BUG_ON(!as->nr_prealloc_nodes); + BUG_ON(!p->nr); + + b = p->b[--p->nr]; - b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); set_btree_node_accessed(b); - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); @@ -371,70 +379,94 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) static void bch2_btree_reserve_put(struct btree_update *as) { struct bch_fs *c = as->c; + struct prealloc_nodes *p; 
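/*
 * [Editorial aside - illustrative sketch, not part of the commit above.]
 * The update reserve is now split into two pools, indexed by whether the node
 * being allocated is a leaf (index 0) or an interior node (index 1); that is
 * what the as->prealloc_nodes[!!level] accesses in this hunk rely on. A
 * simplified, self-contained model of the pattern (all names and the capacity
 * are hypothetical):
 */
struct node;

struct prealloc_pool {
	struct node	*b[8];	/* arbitrary capacity for the sketch */
	unsigned	nr;
};

/* pools[0] holds preallocated leaf nodes, pools[1] interior nodes */
static struct node *pool_pop(struct prealloc_pool pools[2], unsigned level)
{
	struct prealloc_pool *p = &pools[!!level];

	return p->nr ? p->b[--p->nr] : NULL;
}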
mutex_lock(&c->btree_reserve_cache_lock); - while (as->nr_prealloc_nodes) { - struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + for (p = as->prealloc_nodes; + p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); + p++) { + while (p->nr) { + struct btree *b = p->b[--p->nr]; - six_unlock_write(&b->c.lock); + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); - if (c->btree_reserve_cache_nr < - ARRAY_SIZE(c->btree_reserve_cache)) { - struct btree_alloc *a = - &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { + struct btree_alloc *a = + &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; - a->ob = b->ob; - b->ob.nr = 0; - bkey_copy(&a->k, &b->key); - } else { - bch2_open_buckets_put(c, &b->ob); - } - - btree_node_lock_type(c, b, SIX_LOCK_write); - __btree_node_free(c, b); - six_unlock_write(&b->c.lock); + a->ob = b->ob; + b->ob.nr = 0; + bkey_copy(&a->k, &b->key); + } else { + bch2_open_buckets_put(c, &b->ob); + } - six_unlock_intent(&b->c.lock); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } } mutex_unlock(&c->btree_reserve_cache_lock); } -static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, - unsigned flags, struct closure *cl) +static int bch2_btree_reserve_get(struct btree_update *as, + unsigned nr_nodes[2], + unsigned flags) { struct bch_fs *c = as->c; + struct closure cl; struct btree *b; + unsigned interior; int ret; - BUG_ON(nr_nodes > BTREE_RESERVE_MAX); + closure_init_stack(&cl); +retry: + + BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: + * + * BTREE_INSERT_NOWAIT only applies to btree node allocation, not + * blocking on this lock: */ - ret = bch2_btree_cache_cannibalize_lock(c, cl); + ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) - return ret; + goto err; - while (as->nr_prealloc_nodes < nr_nodes) { - b = __bch2_btree_node_alloc(c, &as->disk_res, - flags & BTREE_INSERT_NOWAIT - ? NULL : cl, flags); - if (IS_ERR(b)) { - ret = PTR_ERR(b); - goto err_free; - } + for (interior = 0; interior < 2; interior++) { + struct prealloc_nodes *p = as->prealloc_nodes + interior; + + while (p->nr < nr_nodes[interior]) { + b = __bch2_btree_node_alloc(c, &as->disk_res, + flags & BTREE_INSERT_NOWAIT + ? 
NULL : &cl, + interior, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err; + } - as->prealloc_nodes[as->nr_prealloc_nodes++] = b; + p->b[p->nr++] = b; + } } bch2_btree_cache_cannibalize_unlock(c); + closure_sync(&cl); return 0; -err_free: +err: bch2_btree_cache_cannibalize_unlock(c); - trace_btree_reserve_get_fail(c, nr_nodes, cl); + closure_sync(&cl); + + if (ret == -EAGAIN) + goto retry; + + trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], &cl); return ret; } @@ -500,24 +532,25 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, struct bkey_i *k; int ret; - trans->extra_journal_entries = (void *) &as->journal_entries[0]; - trans->extra_journal_entry_u64s = as->journal_u64s; + ret = darray_make_room(trans->extra_journal_entries, as->journal_u64s); + if (ret) + return ret; + + memcpy(&darray_top(trans->extra_journal_entries), + as->journal_entries, + as->journal_u64s * sizeof(u64)); + trans->extra_journal_entries.nr += as->journal_u64s; + trans->journal_pin = &as->journal; for_each_keylist_key(&as->new_keys, k) { - ret = bch2_trans_mark_key(trans, - bkey_s_c_null, - bkey_i_to_s_c(k), - BTREE_TRIGGER_INSERT); + ret = bch2_trans_mark_new(trans, k, 0); if (ret) return ret; } for_each_keylist_key(&as->old_keys, k) { - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(k), - bkey_s_c_null, - BTREE_TRIGGER_OVERWRITE); + ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0); if (ret) return ret; } @@ -545,8 +578,6 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; - BUG_ON(!journal_pin_active(&as->journal)); - /* * Wait for any in flight writes to finish before we free the old nodes * on disk: @@ -582,7 +613,7 @@ static void btree_update_nodes_written(struct btree_update *as) BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED, + JOURNAL_WATERMARK_reserved, btree_update_nodes_written_trans(&trans, as)); bch2_trans_exit(&trans); @@ -602,11 +633,13 @@ err: * we're in journal error state: */ - btree_node_lock_type(c, b, SIX_LOCK_intent); - btree_node_lock_type(c, b, SIX_LOCK_write); + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); + if (list_empty(&b->write_blocked)) + clear_btree_node_write_blocked(b); /* * Node might have been freed, recheck under @@ -651,13 +684,14 @@ err: BUG_ON(b->will_make_reachable != (unsigned long) as); b->will_make_reachable = 0; + clear_btree_node_will_make_reachable(b); } mutex_unlock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - btree_node_lock_type(c, b, SIX_LOCK_read); + six_lock_read(&b->c.lock, NULL, NULL); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -717,6 +751,8 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) as->mode = BTREE_INTERIOR_UPDATING_NODE; as->b = b; + + set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); mutex_unlock(&c->btree_interior_update_lock); @@ -782,6 +818,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree as->new_nodes[as->nr_new_nodes++] = b; b->will_make_reachable = 1UL|(unsigned long) as; + set_btree_node_will_make_reachable(b); mutex_unlock(&c->btree_interior_update_lock); @@ -804,6 +841,7 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) * xchg() is for synchronization with 
bch2_btree_complete_write: */ v = xchg(&b->will_make_reachable, 0); + clear_btree_node_will_make_reachable(b); as = (struct btree_update *) (v & ~1UL); if (!as) { @@ -869,7 +907,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, closure_wake_up(&c->btree_interior_update_wait); } - clear_btree_node_dirty(c, b); + clear_btree_node_dirty_acct(c, b); clear_btree_node_need_write(b); /* @@ -930,31 +968,43 @@ static void bch2_btree_update_done(struct btree_update *as) static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, unsigned nr_nodes, unsigned flags) + unsigned level, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; - struct closure cl; u64 start_time = local_clock(); int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; - int journal_flags = 0; + unsigned nr_nodes[2] = { 0, 0 }; + unsigned update_level = level; + int journal_flags = flags & JOURNAL_WATERMARK_MASK; int ret = 0; BUG_ON(!path->should_be_locked); - if (flags & BTREE_INSERT_JOURNAL_RESERVED) - journal_flags |= JOURNAL_RES_GET_RESERVED; + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; - closure_init_stack(&cl); -retry: + while (1) { + nr_nodes[!!update_level] += 1 + split; + update_level++; + + if (!btree_path_node(path, update_level)) + break; + + /* + * XXX: figure out how far we might need to split, + * instead of locking/reserving all the way to the root: + */ + split = update_level + 1 < BTREE_MAX_DEPTH; + } + + /* Might have to allocate a new root: */ + if (update_level < BTREE_MAX_DEPTH) + nr_nodes[1] += 1; - /* - * XXX: figure out how far we might need to split, - * instead of locking/reserving all the way to the root: - */ if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, + trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, path->btree_id, &path->pos); ret = btree_trans_restart(trans); return ERR_PTR(ret); @@ -1002,60 +1052,37 @@ retry: if (ret) goto err; + bch2_trans_unlock(trans); + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret == -EAGAIN) { - bch2_trans_unlock(trans); - - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - bch2_btree_update_free(as); - btree_trans_restart(trans); - return ERR_PTR(ret); - } - - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags); - if (ret) { - trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); - goto err; - } - - if (!bch2_trans_relock(trans)) { - ret = -EINTR; - goto err; - } + journal_flags); + if (ret) { + bch2_btree_update_free(as); + trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); + btree_trans_restart(trans); + return ERR_PTR(ret); } ret = bch2_disk_reservation_get(c, &as->disk_res, - nr_nodes * btree_sectors(c), + (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, disk_res_flags); if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(as, nr_nodes, flags); if (ret) goto err; - bch2_journal_pin_add(&c->journal, - atomic64_read(&c->journal.seq), - &as->journal, NULL); + if (!bch2_trans_relock(trans)) { + ret = -EINTR; + goto err; + } return as; err: bch2_btree_update_free(as); - - if (ret == -EAGAIN) { - bch2_trans_unlock(trans); - closure_sync(&cl); - ret = -EINTR; - } 
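/*
 * [Editorial aside - illustrative sketch, not part of the commit above.]
 * bch2_btree_update_start() now sizes its own reserve instead of taking a
 * caller-supplied node count: every level that may be touched needs one node,
 * plus one more if that level may split; interior levels are counted
 * separately from the leaf level, and a possible new root adds one interior
 * node. A simplified standalone model of the counting loop (hypothetical
 * signature; the real code walks the btree_path and, as its XXX comment
 * notes, conservatively assumes every touched level may split):
 */
static void estimate_reserve(unsigned level, unsigned depth, bool split,
			     unsigned nr_nodes[2])
{
	unsigned l = level;

	do {
		/* leaf level counts into nr_nodes[0], interior levels into nr_nodes[1] */
		nr_nodes[l > 0] += 1 + split;
		l++;

		/* conservative: a split here may cascade into the parent */
		split = true;
	} while (l <= depth);

	/* the update might have to allocate a new root */
	nr_nodes[1] += 1;
}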
- - if (ret == -EINTR && bch2_trans_relock(trans)) - goto retry; - return ERR_PTR(ret); } @@ -1105,8 +1132,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *old; trace_btree_set_root(c, b); - BUG_ON(!b->written && - !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); + BUG_ON(!b->written); old = btree_node_root(c, b); @@ -1146,13 +1172,17 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); if (invalid) { - char buf[160]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); - bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf.buf, invalid); + printbuf_exit(&buf); dump_stack(); } @@ -1170,7 +1200,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_node_iter_advance(node_iter, b); bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); } @@ -1391,8 +1421,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - bch2_btree_node_write(c, n1, SIX_LOCK_intent); - bch2_btree_node_write(c, n2, SIX_LOCK_intent); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); /* * Note that on recursive parent_keys == keys, so we @@ -1411,7 +1441,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); - bch2_btree_node_write(c, n3, SIX_LOCK_intent); + bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); } } else { trace_btree_compact(c, b); @@ -1419,7 +1449,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); - bch2_btree_node_write(c, n1, SIX_LOCK_intent); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); @@ -1556,14 +1586,13 @@ int bch2_btree_split_leaf(struct btree_trans *trans, struct btree_path *path, unsigned flags) { - struct bch_fs *c = trans->c; struct btree *b = path_l(path)->b; struct btree_update *as; unsigned l; int ret = 0; as = bch2_btree_update_start(trans, path, path->level, - btree_update_reserve_required(c, b), flags); + true, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1634,15 +1663,17 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, } if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { - char buf1[100], buf2[100]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); - bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); + bch2_bpos_to_text(&buf1, prev->data->max_key); + bch2_bpos_to_text(&buf2, next->data->min_key); bch_err(c, "btree topology error in btree merge:\n" " prev ends at %s\n" " next starts at %s", - buf1, buf2); + buf1.buf, buf2.buf); + printbuf_exit(&buf1); + printbuf_exit(&buf2); bch2_topology_error(c); ret = -EIO; 
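/*
 * [Editorial aside - descriptive note, not part of the commit above.]
 * A recurring change in this commit: fixed on-stack buffers
 * ("char buf[100]" formatted through PBUF()) are replaced by dynamically
 * sized printbufs. As the hunks here show, the usage pattern is
 *
 *	struct printbuf buf = PRINTBUF;
 *
 *	bch2_bpos_to_text(&buf, pos);
 *	...use buf.buf as the formatted string...
 *	printbuf_exit(&buf);
 *
 * i.e. the buffer grows as needed and is released explicitly with
 * printbuf_exit(), avoiding the truncation risk of the old fixed-size arrays.
 */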
goto err; @@ -1672,11 +1703,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, goto out; parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, level, - btree_update_reserve_required(c, parent) + 1, - flags| + as = bch2_btree_update_start(trans, path, level, false, BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + BTREE_INSERT_USE_RESERVE| + flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -1689,6 +1719,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, n = bch2_btree_node_alloc(as, b->c.level); bch2_btree_update_add_new_node(as, n); + SET_BTREE_NODE_SEQ(n->data, + max(BTREE_NODE_SEQ(b->data), + BTREE_NODE_SEQ(m->data)) + 1); + btree_set_min(n, prev->data->min_key); btree_set_max(n, next->data->max_key); n->data->format = new_f; @@ -1701,7 +1735,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); - bch2_btree_node_write(c, n, SIX_LOCK_intent); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bkey_init(&delete.k); delete.k.p = prev->key.k.p; @@ -1755,10 +1789,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, parent = btree_node_parent(iter->path, b); as = bch2_btree_update_start(trans, iter->path, b->c.level, - (parent - ? btree_update_reserve_required(c, parent) - : 0) + 1, - flags); + false, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) { trace_btree_gc_rewrite_node_fail(c, b); @@ -1775,7 +1806,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, trace_btree_gc_rewrite_node(c, b); - bch2_btree_node_write(c, n, SIX_LOCK_intent); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); @@ -1847,9 +1878,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) - return; - if (!percpu_ref_tryget(&c->writes)) return; @@ -1878,21 +1906,14 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter2 = { NULL }; struct btree *parent; - u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; int ret; if (!skip_triggers) { - ret = bch2_trans_mark_key(trans, - bkey_s_c_null, - bkey_i_to_s_c(new_key), - BTREE_TRIGGER_INSERT); + ret = bch2_trans_mark_new(trans, new_key, 0); if (ret) return ret; - ret = bch2_trans_mark_key(trans, - bkey_i_to_s_c(&b->key), - bkey_s_c_null, - BTREE_TRIGGER_OVERWRITE); + ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0); if (ret) return ret; } @@ -1918,6 +1939,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, btree_node_unlock(iter2.path, iter2.path->level); path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; iter2.path->level++; + btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); + + bch2_btree_path_check_sort(trans, iter2.path, 0); ret = bch2_btree_iter_traverse(&iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); @@ -1926,19 +1950,24 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, } else { BUG_ON(btree_node_root(c, b) != b); - trans->extra_journal_entries = (void *) &journal_entries[0]; - trans->extra_journal_entry_u64s = - journal_entry_set((void *) &journal_entries[0], - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - new_key, new_key->k.u64s); + ret = darray_make_room(trans->extra_journal_entries, + jset_u64s(new_key->k.u64s)); + if (ret) + return ret; + + journal_entry_set((void *) 
&darray_top(trans->extra_journal_entries), + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + new_key, new_key->k.u64s); + trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED); + JOURNAL_WATERMARK_reserved); if (ret) goto err; @@ -2001,7 +2030,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite return -EINTR; } - new_hash = bch2_btree_node_mem_alloc(c); + new_hash = bch2_btree_node_mem_alloc(c, false); } path->intent_ref++; @@ -2077,7 +2106,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c); + b = bch2_btree_node_mem_alloc(c, false); bch2_btree_cache_cannibalize_unlock(c); set_btree_node_fake(b); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 8cf59cee6e4e..e72eb8795616 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -76,18 +76,20 @@ struct btree_update { struct journal_entry_pin journal; /* Preallocated nodes we reserve when we start the update: */ - struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; - unsigned nr_prealloc_nodes; + struct prealloc_nodes { + struct btree *b[BTREE_UPDATE_NODES_MAX]; + unsigned nr; + } prealloc_nodes[2]; /* Nodes being freed: */ struct keylist old_keys; u64 _old_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_VAL_U64s_MAX]; + BKEY_BTREE_PTR_U64s_MAX]; /* Nodes being added: */ struct keylist new_keys; u64 _new_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_VAL_U64s_MAX]; + BKEY_BTREE_PTR_U64s_MAX]; /* New nodes, that will be made reachable by this update: */ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 1966441b1a62..a0480c63dd81 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -15,6 +15,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery.h" #include "subvolume.h" #include "replicas.h" @@ -22,6 +23,10 @@ #include <linux/sort.h> #include <trace/events/bcachefs.h> +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags); + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { @@ -162,10 +167,24 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); + unsigned long old, new, v; + unsigned idx = w - b->writes; + + six_lock_read(&b->c.lock, NULL, NULL); + v = READ_ONCE(b->flags); + + do { + old = new = v; + + if (!(old & (1 << BTREE_NODE_dirty)) || + !!(old & (1 << BTREE_NODE_write_idx)) != idx || + w->journal.seq != seq) + break; + + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write_cond(c, b, - (btree_current_write(b) == w && w->journal.seq == seq)); + btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); return 0; } @@ -194,7 +213,7 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, /** * 
btree_insert_key - insert a key one key into a leaf node */ -static bool btree_insert_key_leaf(struct btree_trans *trans, +static void btree_insert_key_leaf(struct btree_trans *trans, struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; @@ -205,12 +224,9 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - EBUG_ON(!insert->level && - !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); - if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, &insert_l(insert)->iter, insert->k))) - return false; + return; i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, le64_to_cpu(i->journal_seq))); @@ -218,7 +234,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); if (unlikely(!btree_node_dirty(b))) - set_btree_node_dirty(c, b); + set_btree_node_dirty_acct(c, b); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -231,8 +247,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); - - return true; } /* Cached btree updates: */ @@ -268,7 +282,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, return ret; if (!bch2_trans_relock(trans)) { - trace_trans_restart_journal_preres_get(trans->ip, trace_ip); + trace_trans_restart_journal_preres_get(trans->fn, trace_ip); return -EINTR; } @@ -281,15 +295,40 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; - if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - flags |= JOURNAL_RES_GET_RESERVED; - ret = bch2_journal_res_get(&c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, + flags| + (trans->flags & JOURNAL_WATERMARK_MASK)); return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; } +#define JSET_ENTRY_LOG_U64s 4 + +static noinline void journal_transaction_name(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res); + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned u64s = JSET_ENTRY_LOG_U64s - 1; + unsigned b, buflen = u64s * sizeof(u64); + + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 0; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + b = min_t(unsigned, strlen(trans->fn), buflen); + memcpy(l->d, trans->fn, b); + while (b < buflen) + l->d[b++] = '\0'; + + trans->journal_res.offset += JSET_ENTRY_LOG_U64s; + trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s; +} + static inline enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, struct btree *b, @@ -308,14 +347,15 @@ btree_key_can_insert_cached(struct btree_trans *trans, struct btree_path *path, unsigned u64s) { + struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; - unsigned new_u64s; + unsigned old_u64s = ck->u64s, new_u64s; struct bkey_i *new_k; EBUG_ON(path->level); if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(trans->c) && + bch2_btree_key_cache_must_wait(c) && !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) return BTREE_INSERT_NEED_JOURNAL_RECLAIM; @@ -330,12 +370,27 @@ btree_key_can_insert_cached(struct btree_trans *trans, new_u64s = roundup_pow_of_two(u64s); new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[path->btree_id], new_u64s); return -ENOMEM; + } ck->u64s = new_u64s; ck->k = new_k; - return BTREE_INSERT_OK; + /* + * Keys returned by peek() are no longer valid pointers, so we need a + * transaction restart: + */ + trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, + path->btree_id, &path->pos, + old_u64s, new_u64s); + /* + * Not using btree_trans_restart() because we can't unlock here, we have + * write locks held: + */ + trans->restarted = true; + return -EINTR; } static inline void do_btree_insert_one(struct btree_trans *trans, @@ -343,18 +398,16 @@ static inline void do_btree_insert_one(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - bool did_work; EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); i->k->k.needs_whiteout = false; - did_work = !i->cached - ? 
btree_insert_key_leaf(trans, i) - : bch2_btree_insert_key_cached(trans, i->path, i->k); - if (!did_work) - return; + if (!i->cached) + btree_insert_key_leaf(trans, i); + else + bch2_btree_insert_key_cached(trans, i->path, i->k); if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { bch2_journal_add_keys(j, &trans->journal_res, @@ -367,10 +420,163 @@ static inline void do_btree_insert_one(struct btree_trans *trans, } } -static noinline void bch2_trans_mark_gc(struct btree_trans *trans) +/* Triggers: */ + +static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) +{ + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + int ret; + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + + if (!btree_node_type_needs_gc(i->btree_id)) + return 0; + + if (bch2_bkey_ops[old.k->type].atomic_trigger == + bch2_bkey_ops[i->k->k.type].atomic_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + + _deleted.p = i->path->pos; + + ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +} + +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) +{ + /* + * Transactional triggers create new btree_insert_entries, so we can't + * pass them a pointer to a btree_insert_entry, that memory is going to + * move: + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && + bch2_bkey_ops[old.k->type].trans_trigger == + bch2_bkey_ops[i->k->k.type].trans_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, old, i->k, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE| + i->flags) ?: 1; + } else if (overwrite && !i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return bch2_trans_mark_old(trans, old, i->flags) ?: 1; + } else if (!i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_trans_mark_new(trans, i->k, i->flags) ?: 1; + } else { + return 0; + } +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 1; overwrite >= 0; --overwrite) { + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->btree_id != btree_id) + continue; + + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + } + + return 0; +} + +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; + 
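/*
 * [Editorial aside - self-contained illustrative sketch with hypothetical
 * names, not bcachefs API.]
 * run_btree_triggers() above keeps re-walking the update list until no
 * trigger fires, because a transactional trigger may append further updates
 * while the list is being walked (which is also why run_one_trans_trigger()
 * copies old_k by value). The same fixed-point shape in miniature:
 */
struct toy_update {
	int	trigger_run;
};

struct toy_update_list {
	struct toy_update	u[32];
	unsigned		nr;
};

/* returns 1 if it did work (and may have grown the list), 0 if nothing to do */
static int toy_run_one(struct toy_update_list *list, unsigned idx)
{
	if (list->u[idx].trigger_run)
		return 0;

	list->u[idx].trigger_run = 1;
	return 1;
}

static int toy_run_until_settled(struct toy_update_list *list)
{
	int again;
	unsigned idx;

	do {
		again = 0;

		/* list->nr may grow underneath us, so re-check it every pass */
		for (idx = 0; idx < list->nr; idx++)
			if (toy_run_one(list, idx))
				again = 1;
	} while (again);

	return 0;
}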
+ /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + if (btree_id == BTREE_ID_alloc) + continue; + + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; + } + + trans_for_each_update(trans, i) { + if (i->btree_id > BTREE_ID_alloc) + break; + if (i->btree_id == BTREE_ID_alloc) { + ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + if (ret) + return ret; + break; + } + } + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); + + return 0; +} + +static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + int ret = 0; trans_for_each_update(trans, i) { /* @@ -379,10 +585,14 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) - bch2_mark_update(trans, i->path, i->k, - i->flags|BTREE_TRIGGER_GC); + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } } + + return ret; } static inline int @@ -398,7 +608,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, int ret; if (race_fault()) { - trace_trans_restart_fault_inject(trans->ip, trace_ip); + trace_trans_restart_fault_inject(trans->fn, trace_ip); trans->restarted = true; return -EINTR; } @@ -435,6 +645,32 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (btree_node_type_needs_gc(i->bkey_type)) marking = true; + + /* + * Revalidate before calling mem triggers - XXX, ugly: + * + * - successful btree node splits don't cause transaction + * restarts and will have invalidated the pointer to the bkey + * value + * - btree_node_lock_for_insert() -> btree_node_prep_for_write() + * when it has to resort + * - btree_key_can_insert_cached() when it has to reallocate + * + * Ugly because we currently have no way to tell if the + * pointer's been invalidated, which means it's debatabale + * whether we should be stashing the old key at all. 
+ */ + i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { + struct bkey_i *j_k = + bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p); + + if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } } /* @@ -446,17 +682,20 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, JOURNAL_RES_GET_NONBLOCK); if (ret) return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); } else { trans->journal_res.seq = c->journal.replay_journal_seq; } - if (unlikely(trans->extra_journal_entry_u64s)) { + if (unlikely(trans->extra_journal_entries.nr)) { memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries, - trans->extra_journal_entry_u64s); + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); - trans->journal_res.offset += trans->extra_journal_entry_u64s; - trans->journal_res.u64s -= trans->extra_journal_entry_u64s; + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; } /* @@ -478,11 +717,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return BTREE_INSERT_NEED_MARK_REPLICAS; trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) - bch2_mark_update(trans, i->path, i->k, i->flags); + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, i->flags); + if (ret) + return ret; + } - if (unlikely(c->gc_pos.phase)) - bch2_trans_mark_gc(trans); + if (unlikely(c->gc_pos.phase)) { + ret = bch2_trans_commit_run_gc_triggers(trans); + if (ret) + return ret; + } trans_for_each_update(trans, i) do_btree_insert_one(trans, i); @@ -572,8 +817,10 @@ static inline int trans_lock_write(struct btree_trans *trans) if (have_conflicting_read_lock(trans, i->path)) goto fail; - __btree_node_lock_type(trans->c, insert_l(i)->b, - SIX_LOCK_write); + btree_node_lock_type(trans, i->path, + insert_l(i)->b, + i->path->pos, i->level, + SIX_LOCK_write, NULL, NULL); } bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); @@ -588,10 +835,18 @@ fail: bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); } - trace_trans_restart_would_deadlock_write(trans->ip); + trace_trans_restart_would_deadlock_write(trans->fn); return btree_trans_restart(trans); } +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -601,42 +856,29 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct bkey_s_c old; int ret, u64s_delta = 0; trans_for_each_update(trans, i) { const char *invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type); if (invalid) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", - buf, (void *) trans->ip, - (void *) i->ip_allocated, invalid); - bch2_fatal_error(c); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", + buf.buf, trans->fn, (void *) 
i->ip_allocated, invalid); + printbuf_exit(&buf); return -EINVAL; } btree_insert_entry_checks(trans, i); } trans_for_each_update(trans, i) { - struct bkey u; - - /* - * peek_slot() doesn't yet work on iterators that point to - * interior nodes: - */ - if (i->cached || i->level) + if (i->cached) continue; - old = bch2_btree_path_peek_slot(i->path, &u); - ret = bkey_err(old); - if (unlikely(ret)) - return ret; - u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; - u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + u64s_delta -= i->old_btree_u64s; if (!same_leaf_as_next(trans, i)) { if (u64s_delta <= 0) { @@ -653,8 +895,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| - ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - ? JOURNAL_RES_GET_RESERVED : 0)); + (trans->flags & JOURNAL_WATERMARK_MASK)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s, trace_ip); @@ -669,6 +910,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_drop_overwrites_from_journal(trans); + trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(trans, i->path, @@ -716,7 +960,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, return 0; if (ret == -EINTR) - trace_trans_restart_btree_node_split(trans->ip, trace_ip, + trace_trans_restart_btree_node_split(trans->fn, trace_ip, i->btree_id, &i->path->pos); break; case BTREE_INSERT_NEED_MARK_REPLICAS: @@ -729,14 +973,14 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_mark_replicas(trans->ip, trace_ip); + trace_trans_restart_mark_replicas(trans->fn, trace_ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { + !(trans->flags & JOURNAL_WATERMARK_reserved)) { trans->restarted = true; ret = -EAGAIN; break; @@ -749,13 +993,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_journal_res_get(trans->ip, trace_ip); + trace_trans_restart_journal_res_get(trans->fn, trace_ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - trace_trans_blocked_journal_reclaim(trans->ip, trace_ip); + trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); @@ -765,7 +1009,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_journal_reclaim(trans->ip, trace_ip); + trace_trans_restart_journal_reclaim(trans->fn, trace_ip); ret = -EINTR; break; default: @@ -774,7 +1018,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, } BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); - BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); + BUG_ON(ret == -ENOSPC && + !(trans->flags & BTREE_INSERT_NOWAIT) && + (trans->flags & BTREE_INSERT_NOFAIL)); return ret; } @@ -785,7 +1031,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) struct bch_fs *c = trans->c; int ret; - if 
(likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || + test_bit(BCH_FS_STARTED, &c->flags)) return -EROFS; bch2_trans_unlock(trans); @@ -801,155 +1048,72 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +/* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to + * do. + */ +static noinline int +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - struct bkey unpacked; - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; - bool trans_trigger_run; - unsigned btree_id = 0; + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; int ret = 0; - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) - btree_id_start++; - - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - if (i->insert_trigger_run || - (i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - continue; - - BUG_ON(i->overwrite_trigger_run); - - i->insert_trigger_run = true; - trans_trigger_run = true; - - old = bch2_btree_path_peek_slot(i->path, &unpacked); - _deleted.p = i->path->pos; - - if (old.k->type == i->k->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - i->overwrite_trigger_run = true; - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); - } else { - ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), - BTREE_TRIGGER_INSERT|i->flags); - } - - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip, _RET_IP_, - i->btree_id, &i->path->pos); - if (ret) - return ret; - } - } while (trans_trigger_run); - - do { - trans_trigger_run = false; - - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - if (i->overwrite_trigger_run || - (i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - continue; - - BUG_ON(!i->insert_trigger_run); - - i->overwrite_trigger_run = true; - trans_trigger_run = true; - - old = bch2_btree_path_peek_slot(i->path, &unpacked); - _deleted.p = i->path->pos; - - ret = bch2_trans_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|i->flags); - - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip, _RET_IP_, - i->btree_id, &i->path->pos); - if (ret) - return ret; - } - } while (trans_trigger_run); + trans_for_each_update(trans, i) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + if (ret) + break; } - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - (!i->insert_trigger_run || 
!i->overwrite_trigger_run)); - - return 0; + return ret; } int __bch2_trans_commit(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; unsigned u64s; int ret = 0; if (!trans->nr_updates && - !trans->extra_journal_entry_u64s) + !trans->extra_journal_entries.nr) goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&trans->c->gc_lock); + lockdep_assert_held(&c->gc_lock); - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out_reset; - trans->journal_u64s = trans->extra_journal_entry_u64s; - trans->journal_preres_u64s = 0; + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + ret = do_bch2_trans_commit_to_journal_replay(trans); + goto out_reset; + } if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget(&trans->c->writes))) { + unlikely(!percpu_ref_tryget(&c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) goto out_reset; } -#ifdef CONFIG_BCACHEFS_DEBUG - /* - * if BTREE_TRIGGER_NORUN is set, it means we're probably being called - * from the key cache flush code: - */ - trans_for_each_update(trans, i) - if (!i->cached && - !(i->flags & BTREE_TRIGGER_NORUN)) - bch2_btree_key_cache_verify_clean(trans, - i->btree_id, i->k->k.p); -#endif + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out; + trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_preres_u64s = 0; + + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + + if (trans->journal_transaction_names) + trans->journal_u64s += JSET_ENTRY_LOG_U64s; trans_for_each_update(trans, i) { BUG_ON(!i->path->should_be_locked); if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { - trace_trans_restart_upgrade(trans->ip, _RET_IP_, + trace_trans_restart_upgrade(trans->fn, _RET_IP_, i->btree_id, &i->path->pos); ret = btree_trans_restart(trans); goto out; @@ -965,7 +1129,7 @@ int __bch2_trans_commit(struct btree_trans *trans) } if (trans->extra_journal_res) { - ret = bch2_disk_reservation_add(trans->c, trans->disk_res, + ret = bch2_disk_reservation_add(c, trans->disk_res, trans->extra_journal_res, (trans->flags & BTREE_INSERT_NOFAIL) ? 
BCH_DISK_RESERVATION_NOFAIL : 0); @@ -984,10 +1148,10 @@ retry: if (ret) goto err; out: - bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&trans->c->writes); + percpu_ref_put(&c->writes); out_reset: trans_for_each_update(trans, i) bch2_path_put(trans, i->path, true); @@ -995,8 +1159,7 @@ out_reset: trans->extra_journal_res = 0; trans->nr_updates = 0; trans->hooks = NULL; - trans->extra_journal_entries = NULL; - trans->extra_journal_entry_u64s = 0; + trans->extra_journal_entries.nr = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; @@ -1023,6 +1186,9 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, struct bkey_s_c k; int ret; + if (!btree_type_has_snapshots(id)) + return 0; + if (!snapshot_t(c, pos.snapshot)->children[0]) return 0; @@ -1051,10 +1217,10 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, return ret; } -static int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_update_flags flags) +int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) { struct bch_fs *c = trans->c; struct btree_iter iter, update_iter; @@ -1068,7 +1234,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_INTENT| BTREE_ITER_WITH_UPDATES| BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -1212,19 +1378,16 @@ nomerge1: bkey_reassemble(update, k); bch2_cut_front(insert->k.p, update); - bch2_trans_copy_iter(&update_iter, &iter); - update_iter.pos = update->k.p; - ret = bch2_trans_update(trans, &update_iter, update, + ret = bch2_trans_update_by_path(trans, iter.path, update, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| flags); - bch2_trans_iter_exit(trans, &update_iter); - if (ret) goto err; goto out; } next: - k = bch2_btree_iter_next(&iter); + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -1301,26 +1464,25 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, return ret; } -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; - BUG_ON(!iter->path->should_be_locked); - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return bch2_trans_update_extent(trans, iter, k, flags); + BUG_ON(!path->should_be_locked); BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); + BUG_ON(bpos_cmp(k->k.p, path->pos)); n = (struct btree_insert_entry) { .flags = flags, - .bkey_type = __btree_node_type(iter->path->level, iter->btree_id), - .btree_id = iter->btree_id, - .level = iter->path->level, - .cached = iter->flags & BTREE_ITER_CACHED, - .path = iter->path, + .bkey_type = __btree_node_type(path->level, path->btree_id), + .btree_id = path->btree_id, + .level = path->level, + .cached = path->cached, + .path = path, .k = k, .ip_allocated = _RET_IP_, }; @@ -1331,16 
+1493,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter btree_insert_entry_cmp(i - 1, i) >= 0); #endif - if (bkey_deleted(&n.k->k) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); - if (unlikely(ret < 0)) - return ret; - - if (ret) - n.k->k.type = KEY_TYPE_whiteout; - } - /* * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: @@ -1353,28 +1505,95 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter !btree_insert_entry_cmp(&n, i)) { BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - /* - * This is a hack to ensure that inode creates update the btree, - * not the key cache, which helps with cache coherency issues in - * other areas: - */ - if (n.cached && !i->cached) { - i->k = n.k; - i->flags = n.flags; - return 0; - } - bch2_path_put(trans, i->path, true); - *i = n; - } else + i->flags = n.flags; + i->cached = n.cached; + i->k = n.k; + i->path = n.path; + i->ip_allocated = n.ip_allocated; + } else { array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); - __btree_path_get(n.path, true); + i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; + + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { + struct bkey_i *j_k = + bch2_journal_keys_peek(c, n.btree_id, n.level, k->k.p); + if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } + + __btree_path_get(n.path, true); return 0; } +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_path *path = iter->update_path ?: iter->path; + struct bkey_cached *ck; + int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + bpos_cmp(iter->key_cache_path->pos, k->k.p)) { + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); + btree_trans_restart(trans); + return -EINTR; + } + + iter->key_cache_path->should_be_locked = true; + } + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags); +} + void bch2_trans_commit_hook(struct btree_trans *trans, struct btree_trans_commit_hook *h) { @@ -1428,14 +1647,14 @@ int bch2_btree_delete_at(struct btree_trans *trans, int 
bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, - unsigned iter_flags, + unsigned update_flags, u64 *journal_seq) { struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); retry: while ((bch2_trans_begin(trans), (k = bch2_btree_iter_peek(&iter)).k) && @@ -1463,7 +1682,7 @@ retry: */ delete.k.p = iter.pos; - if (btree_node_type_is_extents(id)) { + if (iter.flags & BTREE_ITER_IS_EXTENTS) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << trans->c->block_bits); @@ -1478,7 +1697,8 @@ retry: ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + update_flags); bch2_disk_reservation_put(trans->c, &disk_res); if (ret) break; @@ -1500,8 +1720,37 @@ retry: */ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, + unsigned update_flags, u64 *journal_seq) { return bch2_trans_do(c, NULL, journal_seq, 0, - bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); + bch2_btree_delete_range_trans(&trans, id, start, end, + update_flags, journal_seq)); +} + +int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) +{ + unsigned len = strlen(msg); + unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); + struct jset_entry_log *l; + int ret; + + ret = darray_make_room(trans->extra_journal_entries, jset_u64s(u64s)); + if (ret) + return ret; + + l = (void *) &darray_top(trans->extra_journal_entries); + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 1; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + memcpy(l->d, msg, len); + while (len & 7) + l->d[len++] = '\0'; + + trans->extra_journal_entries.nr += jset_u64s(u64s); + return 0; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f7d4a0678e39..7654ab24a909 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -11,6 +11,7 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "ec.h" #include "error.h" #include "inode.h" @@ -43,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, } } -/* - * Clear journal_seq_valid for buckets for which it's not needed, to prevent - * wraparound: - */ -void bch2_bucket_seq_cleanup(struct bch_fs *c) -{ - u64 journal_seq = atomic64_read(&c->journal.seq); - u16 last_seq_ondisk = c->journal.flushed_seq_ondisk; - struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket *g; - struct bucket_mark m; - unsigned i; - - if (journal_seq - c->last_bucket_seq_cleanup < - (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) - return; - - c->last_bucket_seq_cleanup = journal_seq; - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) { - bucket_cmpxchg(g, m, ({ - if (!m.journal_seq_valid || - bucket_needs_journal_commit(m, last_seq_ondisk)) - break; - - m.journal_seq_valid = 0; - })); - } - up_read(&ca->bucket_lock); - } -} - void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; @@ -315,29 +279,24 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -static inline int is_unavailable_bucket(struct bucket_mark m) +static inline int is_unavailable_bucket(struct 
bch_alloc_v4 a) { - return !is_available_bucket(m); + return a.dirty_sectors || a.stripe; } static inline int bucket_sectors_fragmented(struct bch_dev *ca, - struct bucket_mark m) + struct bch_alloc_v4 a) { - return bucket_sectors_used(m) - ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) + return a.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) : 0; } -static inline int is_stripe_data_bucket(struct bucket_mark m) +static inline enum bch_data_type bucket_type(struct bch_alloc_v4 a) { - return m.stripe && m.data_type != BCH_DATA_parity; -} - -static inline enum bch_data_type bucket_type(struct bucket_mark m) -{ - return m.cached_sectors && !m.dirty_sectors + return a.cached_sectors && !a.dirty_sectors ? BCH_DATA_cached - : m.data_type; + : a.data_type; } static inline void account_bucket(struct bch_fs_usage *fs_usage, @@ -352,19 +311,13 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new, + struct bch_alloc_v4 old, + struct bch_alloc_v4 new, u64 journal_seq, bool gc) { struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; - /* - * Hack for bch2_fs_initialize path, where we're first marking sb and - * journal non-transactionally: - */ - if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) - journal_seq = 1; - preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); @@ -390,9 +343,28 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); preempt_enable(); +} + +static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket old, struct bucket new, + u64 journal_seq, bool gc) +{ + struct bch_alloc_v4 old_a = { + .gen = old.gen, + .data_type = old.data_type, + .dirty_sectors = old.dirty_sectors, + .cached_sectors = old.cached_sectors, + .stripe = old.stripe, + }; + struct bch_alloc_v4 new_a = { + .gen = new.gen, + .data_type = new.data_type, + .dirty_sectors = new.dirty_sectors, + .cached_sectors = new.cached_sectors, + .stripe = new.stripe, + }; - if (!is_available_bucket(old) && is_available_bucket(new)) - bch2_wake_allocator(ca); + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); } static inline int __update_replicas(struct bch_fs *c, @@ -416,22 +388,23 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, { struct bch_fs_usage __percpu *fs_usage; int idx, ret = 0; - char buf[200]; + struct printbuf buf = PRINTBUF; percpu_down_read(&c->mark_lock); + buf.atomic++; idx = bch2_replicas_entry_idx(c, r); if (idx < 0 && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err(c, "no replicas entry\n" " while marking %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) { + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { percpu_up_read(&c->mark_lock); ret = bch2_mark_replicas(c, r); - if (ret) - return ret; - percpu_down_read(&c->mark_lock); + + if (ret) + goto err; idx = bch2_replicas_entry_idx(c, r); } if (idx < 0) { @@ -447,6 +420,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, err: fsck_err: percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); return ret; } @@ -525,49 +499,21 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool 
owned_by_allocator) -{ - struct bucket *g = bucket(ca, b); - struct bucket_mark old, new; - - old = bucket_cmpxchg(g, new, ({ - new.owned_by_allocator = owned_by_allocator; - })); - - BUG_ON(owned_by_allocator == old.owned_by_allocator); -} - -static inline u8 bkey_alloc_gen(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_alloc: - return bkey_s_c_to_alloc(k).v->gen; - case KEY_TYPE_alloc_v2: - return bkey_s_c_to_alloc_v2(k).v->gen; - case KEY_TYPE_alloc_v3: - return bkey_s_c_to_alloc_v3(k).v->gen; - default: - return 0; - } -} - -static int bch2_mark_alloc(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_alloc(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_alloc_unpacked u; - struct bch_dev *ca; - struct bucket *g; - struct bucket_mark old_m, m; + struct bch_alloc_v4 old_a, new_a; + struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); int ret = 0; - /* We don't do anything for deletions - do we?: */ - if (!bkey_is_alloc(new.k)) - return 0; + if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket || + new.k->p.offset >= ca->mi.nbuckets, trans, + "alloc key outside range of device's buckets")) + return -EIO; /* * alloc btree is read in by bch2_alloc_read, not gc: @@ -576,49 +522,80 @@ static int bch2_mark_alloc(struct btree_trans *trans, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; + bch2_alloc_to_v4(old, &old_a); + bch2_alloc_to_v4(new, &new_a); + + if ((flags & BTREE_TRIGGER_INSERT) && + !old_a.data_type != !new_a.data_type && + new.k->type == KEY_TYPE_alloc_v4) { + struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_alloc_v3); - v->journal_seq = cpu_to_le64(journal_seq); + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + new_a.journal_seq = !new_a.data_type && + (journal_seq == v->journal_seq || + bch2_journal_noflush_seq(&c->journal, v->journal_seq)) + ? 
0 : journal_seq; + v->journal_seq = new_a.journal_seq; } - ca = bch_dev_bkey_exists(c, new.k->p.inode); + if (old_a.data_type && !new_a.data_type && new_a.journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + new_a.journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); + return ret; + } + } - if (new.k->p.offset >= ca->mi.nbuckets) - return 0; + if (!new_a.data_type && + (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); - u = bch2_alloc_unpack(new); + if ((flags & BTREE_TRIGGER_INSERT) && + BCH_ALLOC_V4_NEED_DISCARD(&new_a) && + !new_a.journal_seq) + bch2_do_discards(c); + + if (!old_a.data_type && + new_a.data_type && + should_invalidate_buckets(ca)) + bch2_do_invalidates(c); + + if (bucket_state(new_a) == BUCKET_need_gc_gens) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } percpu_down_read(&c->mark_lock); - if (!gc && u.gen != bkey_alloc_gen(old)) - *bucket_gen(ca, new.k->p.offset) = u.gen; + if (!gc && new_a.gen != old_a.gen) + *bucket_gen(ca, new.k->p.offset) = new_a.gen; - g = __bucket(ca, new.k->p.offset, gc); + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); - old_m = bucket_cmpxchg(g, m, ({ - m.gen = u.gen; - m.data_type = u.data_type; - m.dirty_sectors = u.dirty_sectors; - m.cached_sectors = u.cached_sectors; - m.stripe = u.stripe != 0; + if (gc) { + struct bucket *g = gc_bucket(ca, new.k->p.offset); - if (journal_seq) { - m.journal_seq_valid = 1; - m.journal_seq = journal_seq; - } - })); + bucket_lock(g); - bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); + g->gen_valid = 1; + g->gen = new_a.gen; + g->data_type = new_a.data_type; + g->stripe = new_a.stripe; + g->stripe_redundancy = new_a.stripe_redundancy; + g->dirty_sectors = new_a.dirty_sectors; + g->cached_sectors = new_a.cached_sectors; - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; - g->gen_valid = 1; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; + bucket_unlock(g); + } percpu_up_read(&c->mark_lock); /* @@ -627,9 +604,9 @@ static int bch2_mark_alloc(struct btree_trans *trans, */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_m.cached_sectors) { + old_a.cached_sectors) { ret = update_cached_sectors(c, new, ca->dev_idx, - -old_m.cached_sectors, + -old_a.cached_sectors, journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); @@ -637,29 +614,18 @@ static int bch2_mark_alloc(struct btree_trans *trans, } trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), - old_m.cached_sectors); + old_a.cached_sectors); } return 0; } -#define checked_add(a, b) \ -({ \ - unsigned _res = (unsigned) (a) + (b); \ - bool overflow = _res > U16_MAX; \ - if (overflow) \ - _res = U16_MAX; \ - (a) = _res; \ - overflow; \ -}) - void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type data_type, unsigned sectors, struct gc_pos pos, unsigned flags) { - struct bucket *g; - struct bucket_mark old, new; + struct bucket old, new, *g; bool overflow; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -674,10 +640,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, percpu_down_read(&c->mark_lock); g = gc_bucket(ca, b); - old = bucket_cmpxchg(g, new, ({ - new.data_type = data_type; - overflow = 
checked_add(new.dirty_sectors, sectors); - })); + + bucket_lock(g); + old = *g; + + g->data_type = data_type; + g->dirty_sectors += sectors; + overflow = g->dirty_sectors < sectors; + + new = *g; + bucket_unlock(g); bch2_fs_inconsistent_on(old.data_type && old.data_type != data_type, c, @@ -691,7 +663,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type ?: data_type], old.dirty_sectors, sectors); - bch2_dev_usage_update(c, ca, old, new, 0, true); + bch2_dev_usage_update_m(c, ca, old, new, 0, true); percpu_up_read(&c->mark_lock); } @@ -710,83 +682,99 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 bucket_data_type, - u16 dirty_sectors, u16 cached_sectors) + u8 b_gen, u8 bucket_data_type, + u32 dirty_sectors, u32 cached_sectors) { - size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); u16 bucket_sectors = !ptr->cached ? dirty_sectors : cached_sectors; - char buf[200]; + struct printbuf buf = PRINTBUF; + int ret = 0; - if (gen_after(ptr->gen, bucket_gen)) { + if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - if (bucket_gen != ptr->gen && !ptr->cached) { + if (b_gen != ptr->gen && !ptr->cached) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, + *bucket_gen(ca, bucket_nr), bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - if (bucket_gen != ptr->gen) - return 1; + if (b_gen != ptr->gen) { + ret = 1; + goto err; + } if (bucket_data_type && ptr_data_type && bucket_data_type != ptr_data_type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type], bch2_data_types[ptr_data_type], - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { + if 
((unsigned) (bucket_sectors + sectors) > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], bucket_sectors, sectors, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EIO; + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; } - - return 0; +err: + printbuf_exit(&buf); + return ret; } static int mark_stripe_bucket(struct btree_trans *trans, struct bkey_s_c k, unsigned ptr_idx, - u64 journal_seq, unsigned flags) + unsigned flags) { struct bch_fs *c = trans->c; + u64 journal_seq = trans->journal_res.seq; const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; @@ -794,9 +782,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g; - struct bucket_mark new, old; - char buf[200]; + struct bucket old, new, *g; + struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -804,45 +791,45 @@ static int mark_stripe_bucket(struct btree_trans *trans, /* * XXX doesn't handle deletion */ percpu_down_read(&c->mark_lock); + buf.atomic++; g = PTR_GC_BUCKET(ca, ptr); - if (g->mark.dirty_sectors || + if (g->dirty_sectors || (g->stripe && g->stripe != k.k->p.offset)) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EINVAL; goto err; } - old = bucket_cmpxchg(g, new, ({ - ret = check_bucket_ref(c, k, ptr, sectors, data_type, - new.gen, new.data_type, - new.dirty_sectors, new.cached_sectors); - if (ret) - goto err; - - new.dirty_sectors += sectors; - if (data_type) - new.data_type = data_type; + bucket_lock(g); + old = *g; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } + ret = check_bucket_ref(c, k, ptr, sectors, data_type, + new.gen, new.data_type, + new.dirty_sectors, new.cached_sectors); + if (ret) { + bucket_unlock(g); + goto err; + } - new.stripe = true; - })); + new.dirty_sectors += sectors; + if (data_type) + new.data_type = data_type; g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + new = *g; + bucket_unlock(g); + + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); - - return 0; + printbuf_exit(&buf); + return ret; } static int __mark_pointer(struct btree_trans *trans, @@ -850,9 +837,9 @@ static int __mark_pointer(struct btree_trans *trans, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 bucket_gen, u8 *bucket_data_type, - u16 *dirty_sectors, u16 *cached_sectors) + u32 *dirty_sectors, u32 *cached_sectors) { - u16 *dst_sectors = !ptr->cached + u32 *dst_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, @@ -876,11 +863,9 @@ static int bch2_mark_pointer(struct btree_trans *trans, { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g; + struct bucket old, new, *g; u8 bucket_data_type; - u64 v; int ret = 0; BUG_ON(!(flags & BTREE_TRIGGER_GC)); @@ -888,35 +873,27 @@ static int bch2_mark_pointer(struct btree_trans *trans, percpu_down_read(&c->mark_lock); g = PTR_GC_BUCKET(ca, &p.ptr); - v = atomic64_read(&g->_mark.v); - do { - new.v.counter = old.v.counter = v; - bucket_data_type = new.data_type; - - ret = __mark_pointer(trans, k, &p.ptr, sectors, - data_type, new.gen, - &bucket_data_type, - &new.dirty_sectors, - &new.cached_sectors); - if (ret) - goto err; + bucket_lock(g); + old = *g; - new.data_type = bucket_data_type; + bucket_data_type = g->data_type; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (ret) { + bucket_unlock(g); + goto err; + } - if (flags & BTREE_TRIGGER_NOATOMIC) { - g->_mark = new; - break; - } - } while ((v = atomic64_cmpxchg(&g->_mark.v, - old.v.counter, - new.v.counter)) != old.v.counter); + g->data_type = bucket_data_type; + + new = *g; + bucket_unlock(g); - bch2_dev_usage_update(c, ca, old, new, journal_seq, true); + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); err: percpu_up_read(&c->mark_lock); @@ -937,9 +914,11 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, BUG_ON(!(flags & BTREE_TRIGGER_GC)); m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - - if (!m) + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.idx); return -ENOMEM; + } spin_lock(&c->ec_stripes_heap_lock); @@ -962,9 +941,9 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, return 0; } -static int bch2_mark_extent(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_extent(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; @@ -1032,10 +1011,11 @@ static int bch2_mark_extent(struct btree_trans *trans, if (r.e.nr_devs) { ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); if (ret) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); return ret; } } @@ -1043,14 +1023,14 @@ static int bch2_mark_extent(struct btree_trans *trans, return 0; } -static int bch2_mark_stripe(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - size_t idx = new.k->p.offset; + u64 idx = new.k->p.offset; const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ? 
bkey_s_c_to_stripe(old).v : NULL; const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe @@ -1064,13 +1044,16 @@ static int bch2_mark_stripe(struct btree_trans *trans, struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m || (old_s && !m->alive)) { - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf1), c, old); - bch2_bkey_val_to_text(&PBUF(buf2), c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" + bch2_bkey_val_to_text(&buf1, c, old); + bch2_bkey_val_to_text(&buf2, c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" "old %s\n" - "new %s", idx, buf1, buf2); + "new %s", idx, buf1.buf, buf2.buf); + printbuf_exit(&buf2); + printbuf_exit(&buf1); bch2_inconsistent_error(c); return -1; } @@ -1100,9 +1083,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - if (!m) + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + idx); return -ENOMEM; - + } /* * This will be wrong when we bring back runtime gc: we should * be unmarking the old key and then marking the new key @@ -1124,7 +1109,7 @@ static int bch2_mark_stripe(struct btree_trans *trans, memset(m->block_sectors, 0, sizeof(m->block_sectors)); for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); + ret = mark_stripe_bucket(trans, new, i, flags); if (ret) return ret; } @@ -1133,10 +1118,11 @@ static int bch2_mark_stripe(struct btree_trans *trans, ((s64) m->sectors * m->nr_redundant), journal_seq, gc); if (ret) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, new); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + bch2_bkey_val_to_text(&buf, c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); return ret; } } @@ -1144,9 +1130,9 @@ static int bch2_mark_stripe(struct btree_trans *trans, return 0; } -static int bch2_mark_inode(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; struct bch_fs_usage __percpu *fs_usage; @@ -1175,9 +1161,9 @@ static int bch2_mark_inode(struct btree_trans *trans, return 0; } -static int bch2_mark_reservation(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_reservation(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; @@ -1207,18 +1193,24 @@ static int bch2_mark_reservation(struct btree_trans *trans, return 0; } -static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, +static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 start, u64 end, u64 *idx, unsigned flags, size_t r_idx) { + struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + u64 next_idx = end; s64 ret = 0; + struct printbuf buf = PRINTBUF; if (r_idx >= c->reflink_gc_nr) goto not_found; r = genradix_ptr(&c->reflink_gc_table, r_idx); - if (*idx < r->offset - r->size) + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) goto not_found; BUG_ON((s64) r->refcount + add < 0); @@ -1227,37 +1219,37 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, *idx = r->offset; return 0; not_found: - *idx = U64_MAX; - ret = -EIO; - - /* - * XXX: we're replacing the entire reflink pointer with an error - * key, we should just be replacing the part that was missing: - */ - if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { + if (fsck_err(c, "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { struct bkey_i_error new; bkey_init(&new.k); new.k.type = KEY_TYPE_error; - new.k.p = p.k->p; - new.k.size = p.k->size; - ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i); + new.k.p = bkey_start_pos(p.k); + new.k.p.offset += *idx - start; + bch2_key_resize(&new.k, next_idx - *idx); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); } + + *idx = next_idx; fsck_err: + printbuf_exit(&buf); return ret; } -static int bch2_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx); + u64 idx = le64_to_cpu(p.v->idx), start = idx; u64 end = le64_to_cpu(p.v->idx) + p.k->size; int ret = 0; @@ -1281,73 +1273,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans, } while (idx < end && !ret) - ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); - - return ret; -} - -int bch2_mark_key(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) -{ - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; - - switch (k.k->type) { - case KEY_TYPE_alloc: - case KEY_TYPE_alloc_v2: - case KEY_TYPE_alloc_v3: - return bch2_mark_alloc(trans, old, new, flags); - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return bch2_mark_extent(trans, old, new, flags); - case KEY_TYPE_stripe: - return bch2_mark_stripe(trans, old, new, flags); - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - return bch2_mark_inode(trans, old, new, flags); - case KEY_TYPE_reservation: - return bch2_mark_reservation(trans, old, new, flags); - case KEY_TYPE_reflink_p: - return bch2_mark_reflink_p(trans, old, new, flags); - case KEY_TYPE_snapshot: - return bch2_mark_snapshot(trans, old, new, flags); - default: - return 0; - } -} - -int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *new, unsigned flags) -{ - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - struct bkey unpacked; - int ret; - - _deleted.p = path->pos; - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc(path->btree_id)) - return 0; - - old = bch2_btree_path_peek_slot(path, &unpacked); - - if (old.k->type == new->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } + ret = __bch2_mark_reflink_p(trans, p, start, end, + &idx, flags, l++); return ret; } @@ -1359,33 +1286,26 @@ void fs_usage_apply_warn(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - char buf[200]; + struct printbuf buf = PRINTBUF; bch_err(c, "disk usage increased %lli more than %u sectors reserved", should_not_have_added, disk_res_sectors); trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + pr_err("while inserting"); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - pr_err("%s", buf); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err(" %s", buf.buf); pr_err("overlapping with"); - - if (!i->cached) { - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); - - bch2_bkey_val_to_text(&PBUF(buf), c, k); - pr_err("%s", buf); - } else { - struct bkey_cached *ck = (void *) i->path->l[0].b; - - if (ck->valid) { - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); - pr_err("%s", buf); - } - } + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, old); + pr_err(" %s", buf.buf); } + __WARN(); + printbuf_exit(&buf); } int bch2_trans_fs_usage_apply(struct btree_trans *trans, @@ -1466,52 +1386,25 @@ need_mark: /* trans_mark: */ -static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, - const struct bch_extent_ptr *ptr, - struct bkey_alloc_unpacked *u) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); - struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); - int ret; - - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); - if (ret) { - bch2_trans_iter_exit(trans, 
iter); - return ret; - } - - *u = update && !bpos_cmp(update->k.p, pos) - ? bch2_alloc_unpack(bkey_i_to_s_c(update)) - : alloc_mem_to_key(c, iter); - - return 0; -} - static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { struct btree_iter iter; - struct bkey_alloc_unpacked u; + struct bkey_i_alloc_v4 *a; int ret; - ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, - u.gen, &u.data_type, - &u.dirty_sectors, &u.cached_sectors); + a->v.gen, &a->v.data_type, + &a->v.dirty_sectors, &a->v.cached_sectors); if (ret) goto out; - ret = bch2_alloc_write(trans, &iter, &u, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto out; out: @@ -1523,7 +1416,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { - struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bkey_i_stripe *s; @@ -1539,16 +1431,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto err; if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); - bch2_inconsistent_error(c); ret = -EIO; goto err; } if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { - bch2_fs_inconsistent(c, + bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); ret = -EIO; @@ -1577,10 +1468,14 @@ err: return ret; } -static int bch2_trans_mark_extent(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_extent(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1642,7 +1537,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; struct btree_iter iter; - struct bkey_alloc_unpacked u; + struct bkey_i_alloc_v4 *a; enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant ? BCH_DATA_parity : 0; s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; @@ -1651,59 +1546,59 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (deleting) sectors = -sectors; - ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, - u.gen, u.data_type, - u.dirty_sectors, u.cached_sectors); + a->v.gen, a->v.data_type, + a->v.dirty_sectors, a->v.cached_sectors); if (ret) goto err; if (!deleting) { - if (bch2_fs_inconsistent_on(u.stripe || - u.stripe_redundancy, c, + if (bch2_trans_inconsistent_on(a->v.stripe || + a->v.stripe_redundancy, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], - u.dirty_sectors, - u.stripe, s.k->p.offset)) { + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + a->v.stripe, s.k->p.offset)) { ret = -EIO; goto err; } - if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c, + if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], - u.dirty_sectors, + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, s.k->p.offset)) { ret = -EIO; goto err; } - u.stripe = s.k->p.offset; - u.stripe_redundancy = s.v->nr_redundant; + a->v.stripe = s.k->p.offset; + a->v.stripe_redundancy = s.v->nr_redundant; } else { - if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset || - u.stripe_redundancy != s.v->nr_redundant, c, + if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || + a->v.stripe_redundancy != s.v->nr_redundant, trans, "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, u.gen, - s.k->p.offset, u.stripe)) { + iter.pos.inode, iter.pos.offset, a->v.gen, + s.k->p.offset, a->v.stripe)) { ret = -EIO; goto err; } - u.stripe = 0; - u.stripe_redundancy = 0; + a->v.stripe = 0; + a->v.stripe_redundancy = 0; } - u.dirty_sectors += sectors; + a->v.dirty_sectors += sectors; if (data_type) - u.data_type = !deleting ? data_type : 0; + a->v.data_type = !deleting ? 
data_type : 0; - ret = bch2_alloc_write(trans, &iter, &u, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto err; err: @@ -1711,66 +1606,68 @@ err: return ret; } -static int bch2_trans_mark_stripe(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +int bch2_trans_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { - struct bkey_s_c_stripe old_s = { .k = NULL }; - struct bkey_s_c_stripe new_s = { .k = NULL }; + const struct bch_stripe *old_s = NULL; + struct bch_stripe *new_s = NULL; struct bch_replicas_padded r; unsigned i, nr_blocks; int ret = 0; if (old.k->type == KEY_TYPE_stripe) - old_s = bkey_s_c_to_stripe(old); - if (new.k->type == KEY_TYPE_stripe) - new_s = bkey_s_c_to_stripe(new); + old_s = bkey_s_c_to_stripe(old).v; + if (new->k.type == KEY_TYPE_stripe) + new_s = &bkey_i_to_stripe(new)->v; /* * If the pointers aren't changing, we don't need to do anything: */ - if (new_s.k && old_s.k && - new_s.v->nr_blocks == old_s.v->nr_blocks && - new_s.v->nr_redundant == old_s.v->nr_redundant && - !memcmp(old_s.v->ptrs, new_s.v->ptrs, - new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) + if (new_s && old_s && + new_s->nr_blocks == old_s->nr_blocks && + new_s->nr_redundant == old_s->nr_redundant && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - BUG_ON(new_s.k && old_s.k && - (new_s.v->nr_blocks != old_s.v->nr_blocks || - new_s.v->nr_redundant != old_s.v->nr_redundant)); + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); - nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks; + nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - if (new_s.k) { - s64 sectors = le16_to_cpu(new_s.v->sectors); + if (new_s) { + s64 sectors = le16_to_cpu(new_s->sectors); - bch2_bkey_to_replicas(&r.e, new); - update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); + update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); } - if (old_s.k) { - s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); + if (old_s) { + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); bch2_bkey_to_replicas(&r.e, old); - update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); + update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); } for (i = 0; i < nr_blocks; i++) { - if (new_s.k && old_s.k && - !memcmp(&new_s.v->ptrs[i], - &old_s.v->ptrs[i], - sizeof(new_s.v->ptrs[i]))) + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) continue; - if (new_s.k) { - ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false); + if (new_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_i_to_s_c_stripe(new), i, false); if (ret) break; } - if (old_s.k) { - ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true); + if (old_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true); if (ret) break; } @@ -1779,12 +1676,12 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -static int bch2_trans_mark_inode(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) +int bch2_trans_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { - int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); + int nr = bkey_is_inode(&new->k) - 
bkey_is_inode(old.k); if (nr) { struct replicas_delta_list *d = @@ -1795,9 +1692,14 @@ static int bch2_trans_mark_inode(struct btree_trans *trans, return 0; } -static int bch2_trans_mark_reservation(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_reservation(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; @@ -1825,7 +1727,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), @@ -1845,19 +1747,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, refcount = bkey_refcount(n); if (!refcount) { - bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); - bch2_fs_inconsistent(c, + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf); + *idx, buf.buf); ret = -EIO; goto err; } if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { - bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); - bch2_fs_inconsistent(c, + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf); + *idx, buf.buf); ret = -EIO; goto err; } @@ -1879,11 +1781,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, le64_add_cpu(refcount, add); - if (!*refcount) { - n->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&n->k, 0); - } - bch2_btree_iter_set_pos_to_extent_start(&iter); ret = bch2_trans_update(trans, &iter, n, 0); if (ret) @@ -1892,12 +1789,18 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, *idx = k.k->p.offset; err: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } -static int bch2_trans_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c k, unsigned flags) +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); u64 idx, end_idx; int ret = 0; @@ -1918,31 +1821,6 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, return ret; } -int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, - struct bkey_s_c new, unsigned flags) -{ - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
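The reflink hunk above is one of many in this series that swap the old on-stack `char buf[200]` + `PBUF(buf)` idiom for the heap-backed `struct printbuf`. A minimal sketch of the new pattern as it recurs throughout the diff, assuming the usual bcachefs kernel context (the surrounding error message here is illustrative, not from the patch):

	struct printbuf buf = PRINTBUF;		/* empty; grows on demand */

	bch2_bkey_val_to_text(&buf, c, k);	/* appends formatted text */
	bch2_trans_inconsistent(trans,
				"unexpected key while marking\n  %s", buf.buf);
	printbuf_exit(&buf);			/* frees the allocation */

Callers that stream output to userspace (see the debug.c hunks further down) additionally check buf.allocation_failure before trusting the contents.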
old: new; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return bch2_trans_mark_extent(trans, k, flags); - case KEY_TYPE_stripe: - return bch2_trans_mark_stripe(trans, old, new, flags); - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - return bch2_trans_mark_inode(trans, old, new, flags); - case KEY_TYPE_reservation: - return bch2_trans_mark_reservation(trans, k, flags); - case KEY_TYPE_reflink_p: - return bch2_trans_mark_reflink_p(trans, k, flags); - default: - return 0; - } -} - static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, @@ -1950,11 +1828,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_alloc_unpacked u; - struct bch_extent_ptr ptr = { - .dev = ca->dev_idx, - .offset = bucket_to_sector(ca, b), - }; + struct bkey_i_alloc_v4 *a; int ret = 0; /* @@ -1963,26 +1837,26 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (b >= ca->mi.nbuckets) return 0; - ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + if (IS_ERR(a)) + return PTR_ERR(a); - if (u.data_type && u.data_type != type) { + if (a->v.data_type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - iter.pos.inode, iter.pos.offset, u.gen, - bch2_data_types[u.data_type], + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], bch2_data_types[type], bch2_data_types[type]); ret = -EIO; goto out; } - u.data_type = type; - u.dirty_sectors = sectors; + a->v.data_type = type; + a->v.dirty_sectors = sectors; - ret = bch2_alloc_write(trans, &iter, &u, 0); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); if (ret) goto out; out: @@ -2145,65 +2019,29 @@ recalculate: /* Startup/shutdown: */ -static void buckets_free_rcu(struct rcu_head *rcu) -{ - struct bucket_array *buckets = - container_of(rcu, struct bucket_array, rcu); - - kvpfree(buckets, - sizeof(struct bucket_array) + - buckets->nbuckets * sizeof(struct bucket)); -} - static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets); + kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bucket_array *buckets = NULL, *old_buckets = NULL; struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; - alloc_fifo free[RESERVE_NR]; - alloc_fifo free_inc; - alloc_heap alloc_heap; - - size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / btree_sectors(c)); - /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); - size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), - btree_reserve * 2); - bool resize = ca->buckets[0] != NULL; + bool resize = ca->bucket_gens != NULL; int ret = -ENOMEM; - unsigned i; - memset(&free, 0, sizeof(free)); - memset(&free_inc, 0, sizeof(free_inc)); - memset(&alloc_heap, 0, sizeof(alloc_heap)); - - if (!(buckets = kvpmalloc(sizeof(struct 
bucket_array) + - nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO)) || - !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO)) || - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + (c->opts.buckets_nouse && + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)) || - !init_fifo(&free[RESERVE_MOVINGGC], - copygc_reserve, GFP_KERNEL) || - !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || - !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) + GFP_KERNEL|__GFP_ZERO)))) goto err; - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = nbuckets; bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; @@ -2215,64 +2053,39 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) percpu_down_write(&c->mark_lock); } - old_buckets = bucket_array(ca); old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - size_t n = min(buckets->nbuckets, old_buckets->nbuckets); + size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); - memcpy(buckets->b, - old_buckets->b, - n * sizeof(struct bucket)); memcpy(bucket_gens->b, old_bucket_gens->b, n); - memcpy(buckets_nouse, - ca->buckets_nouse, - BITS_TO_LONGS(n) * sizeof(unsigned long)); + if (buckets_nouse) + memcpy(buckets_nouse, + ca->buckets_nouse, + BITS_TO_LONGS(n) * sizeof(unsigned long)); } - rcu_assign_pointer(ca->buckets[0], buckets); rcu_assign_pointer(ca->bucket_gens, bucket_gens); - buckets = old_buckets; bucket_gens = old_bucket_gens; swap(ca->buckets_nouse, buckets_nouse); + nbuckets = ca->mi.nbuckets; + if (resize) { percpu_up_write(&c->mark_lock); + up_write(&ca->bucket_lock); up_write(&c->gc_lock); } - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - fifo_move(&free[i], &ca->free[i]); - swap(ca->free[i], free[i]); - } - fifo_move(&free_inc, &ca->free_inc); - swap(ca->free_inc, free_inc); - spin_unlock(&c->freelist_lock); - - /* with gc lock held, alloc_heap can't be in use: */ - swap(ca->alloc_heap, alloc_heap); - - nbuckets = ca->mi.nbuckets; - - if (resize) - up_write(&ca->bucket_lock); - ret = 0; err: - free_heap(&alloc_heap); - free_fifo(&free_inc); - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); if (bucket_gens) - call_rcu(&old_buckets->rcu, bucket_gens_free_rcu); - if (buckets) - call_rcu(&old_buckets->rcu, buckets_free_rcu); + call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); return ret; } @@ -2281,15 +2094,10 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { unsigned i; - free_heap(&ca->alloc_heap); - free_fifo(&ca->free_inc); - for (i = 0; i < RESERVE_NR; i++) - free_fifo(&ca->free[i]); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(rcu_dereference_protected(ca->buckets[0], 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); + kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), + sizeof(struct bucket_gens) + ca->mi.nbuckets); for (i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 45c6d230f242..853bc9dd1294 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -15,52 +15,32 @@ for (_b = (_buckets)->b + (_buckets)->first_bucket; 
\ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -#define bucket_cmpxchg(g, new, expr) \ -({ \ - struct bucket *_g = g; \ - u64 _v = atomic64_read(&(g)->_mark.v); \ - struct bucket_mark _old; \ - \ - do { \ - (new).v.counter = _old.v.counter = _v; \ - expr; \ - } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ - _old.v.counter, \ - (new).v.counter)) != _old.v.counter);\ - _old; \ -}) - -static inline struct bucket_array *__bucket_array(struct bch_dev *ca, - bool gc) +static inline void bucket_unlock(struct bucket *b) { - return rcu_dereference_check(ca->buckets[gc], - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->gc_lock) || - lockdep_is_held(&ca->bucket_lock)); + smp_store_release(&b->lock, 0); } -static inline struct bucket_array *bucket_array(struct bch_dev *ca) +static inline void bucket_lock(struct bucket *b) { - return __bucket_array(ca, false); + while (xchg(&b->lock, 1)) + cpu_relax(); } -static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) +static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) { - struct bucket_array *buckets = __bucket_array(ca, gc); - - BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); - return buckets->b + b; + return rcu_dereference_check(ca->buckets_gc, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); } static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - return __bucket(ca, b, true); -} + struct bucket_array *buckets = gc_bucket_array(ca); -static inline struct bucket *bucket(struct bch_dev *ca, size_t b) -{ - return __bucket(ca, b, false); + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) @@ -70,7 +50,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); - } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) @@ -81,26 +60,18 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) return gens->b + b; } -/* - * bucket_gc_gen() returns the difference between the bucket's current gen and - * the oldest gen of any pointer into that bucket in the btree. 
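The buckets.h changes above drop the lockless `bucket_cmpxchg()` loop in favour of a one-byte per-bucket lock: `bucket_lock()` spins on `xchg()` and `bucket_unlock()` publishes with `smp_store_release()`. A rough userspace analogue of that idea using C11 atomics, for illustration only (the `toy_*` names are not from the patch):

#include <stdatomic.h>

struct toy_bucket {
	atomic_uchar	lock;		/* 0 = unlocked, 1 = locked */
	unsigned	gen;
	unsigned	dirty_sectors;
};

static void toy_bucket_lock(struct toy_bucket *b)
{
	/* test-and-set with acquire ordering; spin until we observed 0 */
	while (atomic_exchange_explicit(&b->lock, 1, memory_order_acquire))
		;			/* the kernel version calls cpu_relax() here */
}

static void toy_bucket_unlock(struct toy_bucket *b)
{
	/* release store pairs with the acquire exchange in toy_bucket_lock() */
	atomic_store_explicit(&b->lock, 0, memory_order_release);
}

/* usage:
 *	toy_bucket_lock(b);
 *	b->dirty_sectors += 8;
 *	toy_bucket_unlock(b);
 */

The motivation visible in the buckets_types.h hunk below is that the atomic 64-bit struct bucket_mark is gone, so a plain byte lock around the shrunken struct bucket is sufficient.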
- */ - -static inline u8 bucket_gc_gen(struct bucket *g) -{ - return g->mark.gen - g->oldest_gen; -} - static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, const struct bch_extent_ptr *ptr) { return sector_to_bucket(ca, ptr->offset); } -static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) +static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, + const struct bch_extent_ptr *ptr) { - return bucket(ca, PTR_BUCKET_NR(ca, ptr)); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); } static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, @@ -147,74 +118,55 @@ static inline u8 ptr_stale(struct bch_dev *ca, return ret; } -/* bucket gc marks */ - -static inline unsigned bucket_sectors_used(struct bucket_mark mark) -{ - return mark.dirty_sectors + mark.cached_sectors; -} - -static inline bool is_available_bucket(struct bucket_mark mark) -{ - return !mark.dirty_sectors && !mark.stripe; -} - -static inline bool bucket_needs_journal_commit(struct bucket_mark m, - u16 last_seq_ondisk) -{ - return m.journal_seq_valid && - ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); -} - /* Device usage: */ struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage stats) + struct bch_dev_usage stats, + enum alloc_reserve reserve) { - u64 total = ca->mi.nbuckets - ca->mi.first_bucket; + s64 total = ca->mi.nbuckets - ca->mi.first_bucket; + s64 reserved = 0; + + switch (reserve) { + case RESERVE_none: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case RESERVE_movinggc: + reserved += ca->nr_btree_reserve; + fallthrough; + case RESERVE_btree: + reserved += ca->nr_btree_reserve; + fallthrough; + case RESERVE_btree_movinggc: + break; + default: + BUG(); + } if (WARN_ONCE(stats.buckets_unavailable > total, "buckets_unavailable overflow (%llu > %llu)\n", stats.buckets_unavailable, total)) return 0; - return total - stats.buckets_unavailable; -} - -static inline u64 dev_buckets_available(struct bch_dev *ca) -{ - return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); + return max_t(s64, 0, + total - + stats.buckets_unavailable - + ca->nr_open_buckets - + reserved); } -static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, - struct bch_dev_usage stats) +static inline u64 dev_buckets_available(struct bch_dev *ca, + enum alloc_reserve reserve) { - struct bch_fs *c = ca->fs; - s64 available = __dev_buckets_available(ca, stats); - unsigned i; - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - available -= fifo_used(&ca->free[i]); - available -= fifo_used(&ca->free_inc); - available -= ca->nr_open_buckets; - spin_unlock(&c->freelist_lock); - - return max(available, 0LL); -} - -static inline u64 dev_buckets_reclaimable(struct bch_dev *ca) -{ - return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); } /* Filesystem usage: */ static inline unsigned fs_usage_u64s(struct bch_fs *c) { - return sizeof(struct bch_fs_usage) / sizeof(u64) + READ_ONCE(c->replicas.nr); } @@ -240,21 +192,54 @@ bch2_fs_usage_read_short(struct bch_fs *); /* key/bucket marking: */ -void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum 
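The new `__dev_buckets_available()` above computes how many buckets an allocation is allowed to see by accumulating reserves as the switch falls through: a normal `RESERVE_none` allocation must leave room for the copygc and btree reserves, while `RESERVE_btree_movinggc` may dig into all of them. A simplified, self-contained model of that arithmetic; the numbers, the `toy_*` names and the flat parameter list are illustrative, not from the patch:

#include <stdio.h>

enum toy_reserve { R_none, R_movinggc, R_btree, R_btree_movinggc };

static long long toy_buckets_available(long long total, long long unavailable,
				       long long open, long long btree_reserve,
				       enum toy_reserve reserve)
{
	long long reserved = 0;

	switch (reserve) {
	case R_none:		reserved += total >> 6;	/* 1/64 kept back */
		/* fall through */
	case R_movinggc:	reserved += btree_reserve;
		/* fall through */
	case R_btree:		reserved += btree_reserve;
		/* fall through */
	case R_btree_movinggc:	break;
	}

	long long avail = total - unavailable - open - reserved;
	return avail > 0 ? avail : 0;
}

int main(void)
{
	/* 65536 buckets, 60000 unavailable, 16 open, btree reserve of 512 */
	for (int r = R_none; r <= R_btree_movinggc; r++)
		printf("reserve %d -> %lld buckets available\n", r,
		       toy_buckets_available(65536, 60000, 16, 512, r));
	return 0;
}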
bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_update(struct btree_trans *, struct btree_path *, - struct bkey_i *, unsigned); +int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); + +int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, - struct bkey_s_c, unsigned); + struct bkey_i *, unsigned); + +static inline int bch2_trans_mark_old(struct btree_trans *trans, + struct bkey_s_c old, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = old.k->p; + + return bch2_trans_mark_key(trans, old, &deleted, + BTREE_TRIGGER_OVERWRITE|flags); +} + +static inline int bch2_trans_mark_new(struct btree_trans *trans, + struct bkey_i *new, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = new->k.p; + + return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); +} + int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 18bca269b750..e79a33795bf9 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -7,42 +7,15 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 -struct bucket_mark { - union { - atomic64_t v; - - struct { - u8 gen; - u8 data_type:3, - owned_by_allocator:1, - journal_seq_valid:1, - stripe:1; - u16 dirty_sectors; - u16 cached_sectors; - - /* - * low bits of journal sequence number when this bucket was most - * recently modified: if journal_seq_valid is set, this bucket can't be - * reused until the journal sequence number written to disk is >= the - * bucket's journal sequence number: - */ - u16 journal_seq; - }; - }; -}; - struct bucket { - union { - struct bucket_mark _mark; - const struct bucket_mark mark; - }; - - u64 io_time[2]; - u8 oldest_gen; - u8 gc_gen; - unsigned gen_valid:1; - u8 stripe_redundancy; - u32 stripe; + u8 lock; + u8 gen_valid:1; + u8 data_type:7; + u8 gen; + u8 stripe_redundancy; + u32 stripe; + u32 dirty_sectors; + u32 cached_sectors; }; struct bucket_array { @@ -121,7 +94,7 @@ struct copygc_heap_entry { u8 dev; u8 gen; u8 replicas; - u16 fragmentation; + u32 fragmentation; u32 sectors; u64 offset; }; diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c new file mode 100644 index 
000000000000..2e5b955080de --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "buckets_waiting_for_journal.h" +#include <linux/random.h> + +static inline struct bucket_hashed * +bucket_hash(struct buckets_waiting_for_journal_table *t, + unsigned hash_seed_idx, u64 dev_bucket) +{ + unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); + + BUG_ON(!is_power_of_2(t->size)); + + return t->d + (h & (t->size - 1)); +} + +static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) +{ + unsigned i; + + t->size = size; + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) + get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); + memset(t->d, 0, sizeof(t->d[0]) * size); +} + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket) +{ + struct buckets_waiting_for_journal_table *t; + u64 dev_bucket = (u64) dev << 56 | bucket; + bool ret = false; + unsigned i; + + mutex_lock(&b->lock); + t = b->t; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); + + if (h->dev_bucket == dev_bucket) { + ret = h->journal_seq > flushed_seq; + break; + } + } + + mutex_unlock(&b->lock); + + return ret; +} + +static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, + struct bucket_hashed *new, + u64 flushed_seq) +{ + struct bucket_hashed *last_evicted = NULL; + unsigned tries, i; + + for (tries = 0; tries < 10; tries++) { + struct bucket_hashed *old, *victim = NULL; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + old = bucket_hash(t, i, new->dev_bucket); + + if (old->dev_bucket == new->dev_bucket || + old->journal_seq <= flushed_seq) { + *old = *new; + return true; + } + + if (last_evicted != old) + victim = old; + } + + /* hashed to same slot 3 times: */ + if (!victim) + break; + + /* Failed to find an empty slot: */ + swap(*new, *victim); + last_evicted = victim; + } + + return false; +} + +int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket, + u64 journal_seq) +{ + struct buckets_waiting_for_journal_table *t, *n; + struct bucket_hashed tmp, new = { + .dev_bucket = (u64) dev << 56 | bucket, + .journal_seq = journal_seq, + }; + size_t i, new_size, nr_elements = 1, nr_rehashes = 0; + int ret = 0; + + mutex_lock(&b->lock); + + if (likely(bucket_table_insert(b->t, &new, flushed_seq))) + goto out; + + t = b->t; + for (i = 0; i < t->size; i++) + nr_elements += t->d[i].journal_seq > flushed_seq; + + new_size = nr_elements < t->size / 3 ? 
t->size : t->size * 2; + + n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + +retry_rehash: + nr_rehashes++; + bucket_table_init(n, new_size); + + tmp = new; + BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); + + for (i = 0; i < t->size; i++) { + if (t->d[i].journal_seq <= flushed_seq) + continue; + + tmp = t->d[i]; + if (!bucket_table_insert(n, &tmp, flushed_seq)) + goto retry_rehash; + } + + b->t = n; + kvfree(t); + + pr_debug("took %zu rehashes, table at %zu/%zu elements", + nr_rehashes, nr_elements, b->t->size); +out: + mutex_unlock(&b->lock); + + return ret; +} + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + kvfree(b->t); +} + +#define INITIAL_TABLE_SIZE 8 + +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + mutex_init(&b->lock); + + b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); + if (!b->t) + return -ENOMEM; + + bucket_table_init(b->t, INITIAL_TABLE_SIZE); + return 0; +} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h new file mode 100644 index 000000000000..d2ae19cbe18c --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H +#define _BUCKETS_WAITING_FOR_JOURNAL_H + +#include "buckets_waiting_for_journal_types.h" + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64); +int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64, u64); + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h new file mode 100644 index 000000000000..fea7f944d0ed --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H +#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H + +#include <linux/siphash.h> + +struct bucket_hashed { + u64 dev_bucket; + u64 journal_seq; +}; + +struct buckets_waiting_for_journal_table { + size_t size; + siphash_key_t hash_seeds[3]; + struct bucket_hashed d[]; +}; + +struct buckets_waiting_for_journal { + struct mutex lock; + struct buckets_waiting_for_journal_table *t; +}; + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index db68a78276cf..aa26588ed5ed 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -568,8 +568,11 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!dev) + return -EINVAL; + for_each_online_member(ca, c, i) - if (ca->disk_sb.bdev->bd_dev == dev) { + if (ca->dev == dev) { percpu_ref_put(&ca->io_ref); return i; } diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index fbe8603cfb30..425582f60d7a 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -93,9 +93,9 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * } } -static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, - struct nonce nonce, - 
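`bucket_table_insert()` in the new buckets_waiting_for_journal.c above is a small cuckoo-style hash insert: each key has one candidate slot per hash seed, entries whose journal_seq has already been flushed may simply be overwritten, and on a collision the displaced entry is carried to one of its alternate slots, giving up after a bounded number of evictions so the caller can grow and rehash. A compact userspace sketch of the same scheme; two trivial hash functions stand in for the three siphash seeds in the real code, and TOY_SLOTS and the toy_* names are mine:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_entry { uint64_t key, seq; };	/* key 0 = empty slot */

#define TOY_SLOTS 16	/* power of two */

static struct toy_entry *toy_hash(struct toy_entry *t, unsigned fn, uint64_t key)
{
	/* stand-ins for the siphash_1u64() calls in the patch */
	uint64_t h = fn ? key * 0x9e3779b97f4a7c15ULL : key ^ (key >> 33);

	return t + (h & (TOY_SLOTS - 1));
}

/* Returns false if the caller needs to grow and rehash the table */
static bool toy_insert(struct toy_entry *t, struct toy_entry new, uint64_t flushed_seq)
{
	struct toy_entry *last_evicted = NULL;

	for (unsigned tries = 0; tries < 10; tries++) {
		struct toy_entry *victim = NULL;

		for (unsigned fn = 0; fn < 2; fn++) {
			struct toy_entry *old = toy_hash(t, fn, new.key);

			/* same key, empty, or already flushed: overwrite in place */
			if (old->key == new.key || !old->key || old->seq <= flushed_seq) {
				*old = new;
				return true;
			}

			/* don't immediately bounce the entry we just displaced */
			if (last_evicted != old)
				victim = old;
		}

		if (!victim)	/* every candidate slot was the one we came from */
			break;

		/* displace the victim and try to reinsert it next iteration */
		struct toy_entry tmp = *victim;
		*victim = new;
		new = tmp;
		last_evicted = victim;
	}

	return false;
}

int main(void)
{
	struct toy_entry table[TOY_SLOTS];

	memset(table, 0, sizeof(table));

	for (uint64_t k = 1; k <= 12; k++)
		if (!toy_insert(table, (struct toy_entry){ .key = k, .seq = 100 + k }, 0))
			printf("table full at key %llu: grow and rehash\n",
			       (unsigned long long) k);
	return 0;
}

bch2_set_bucket_needs_journal_commit() above uses the same boolean to decide when to allocate a larger table and rehash (keeping the old size if most entries turned out to be stale), retrying the rehash with fresh random seeds if the new table also overflows.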
struct scatterlist *sg, size_t len) +static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + struct scatterlist *sg, size_t len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; @@ -104,17 +104,20 @@ static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); - BUG_ON(ret); + if (ret) + pr_err("got error %i from crypto_skcipher_encrypt()", ret); + + return ret; } -static inline void do_encrypt(struct crypto_sync_skcipher *tfm, +static inline int do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { struct scatterlist sg; sg_init_one(&sg, buf, len); - do_encrypt_sg(tfm, nonce, &sg, len); + return do_encrypt_sg(tfm, nonce, &sg, len); } int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, @@ -136,25 +139,29 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, goto err; } - do_encrypt(chacha20, nonce, buf, len); + ret = do_encrypt(chacha20, nonce, buf, len); err: crypto_free_sync_skcipher(chacha20); return ret; } -static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, - struct nonce nonce) +static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, + struct nonce nonce) { u8 key[POLY1305_KEY_SIZE]; + int ret; nonce.d[3] ^= BCH_NONCE_POLY; memset(key, 0, sizeof(key)); - do_encrypt(c->chacha20, nonce, key, sizeof(key)); + ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); + if (ret) + return ret; desc->tfm = c->poly1305; crypto_shash_init(desc); crypto_shash_update(desc, key, sizeof(key)); + return 0; } struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, @@ -196,13 +203,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, } } -void bch2_encrypt(struct bch_fs *c, unsigned type, +int bch2_encrypt(struct bch_fs *c, unsigned type, struct nonce nonce, void *data, size_t len) { if (!bch2_csum_type_is_encryption(type)) - return; + return 0; - do_encrypt(c->chacha20, nonce, data, len); + return do_encrypt(c->chacha20, nonce, data, len); } static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, @@ -277,23 +284,27 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, return __bch2_checksum_bio(c, type, nonce, bio, &iter); } -void bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) +int bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) { struct bio_vec bv; struct bvec_iter iter; struct scatterlist sgl[16], *sg = sgl; size_t bytes = 0; + int ret = 0; if (!bch2_csum_type_is_encryption(type)) - return; + return 0; sg_init_table(sgl, ARRAY_SIZE(sgl)); bio_for_each_segment(bv, bio, iter) { if (sg == sgl + ARRAY_SIZE(sgl)) { sg_mark_end(sg - 1); - do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + + ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + if (ret) + return ret; nonce = nonce_add(nonce, bytes); bytes = 0; @@ -307,7 +318,7 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, } sg_mark_end(sg - 1); - do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, @@ -407,16 +418,12 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, } #ifdef __KERNEL__ -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +static int __bch2_request_key(char *key_description, struct bch_key *key) { - char 
key_description[60]; struct key *keyring_key; const struct user_key_payload *ukp; int ret; - snprintf(key_description, sizeof(key_description), - "bcachefs:%pUb", &sb->user_uuid); - keyring_key = request_key(&key_type_logon, key_description, NULL); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); @@ -436,16 +443,10 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) } #else #include <keyutils.h> -#include <uuid/uuid.h> -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +static int __bch2_request_key(char *key_description, struct bch_key *key) { key_serial_t key_id; - char key_description[60]; - char uuid[40]; - - uuid_unparse_lower(sb->user_uuid.b, uuid); - sprintf(key_description, "bcachefs:%s", uuid); key_id = request_key("user", key_description, NULL, KEY_SPEC_USER_KEYRING); @@ -459,6 +460,17 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) } #endif +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + char key_description[60]; + char uuid[40]; + + uuid_unparse_lower(sb->user_uuid.b, uuid); + sprintf(key_description, "bcachefs:%s", uuid); + + return __bch2_request_key(key_description, key); +} + int bch2_decrypt_sb_key(struct bch_fs *c, struct bch_sb_field_crypt *crypt, struct bch_key *key) diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index f5c1a609c5c4..c86c3c05d620 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -49,7 +49,7 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); -void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, +int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, @@ -61,8 +61,8 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, struct bch_extent_crc_unpacked *, unsigned, unsigned, unsigned); -void bch2_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); +int bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 8e4179d8dc27..7d9ebcc9a445 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -197,9 +197,11 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, goto err; workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); - ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); + ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - ret = ZSTD_decompressDCtx(ctx, + src_len = le32_to_cpup(src_data.b); + + ret = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); @@ -333,8 +335,8 @@ static int attempt_compress(struct bch_fs *c, return strm.total_out; } case BCH_COMPRESSION_TYPE_zstd: { - ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, - ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, + zstd_cctx_workspace_bound(&c->zstd_params.cParams)); /* * ZSTD requires that when we decompress we pass in the exact @@ -347,11 +349,11 @@ static int attempt_compress(struct bch_fs *c, * factor (7 bytes) from the dst buffer size to account for * that. 
*/ - size_t len = ZSTD_compressCCtx(ctx, + size_t len = zstd_compress_cctx(ctx, dst + 4, dst_len - 4 - 7, src, src_len, - c->zstd_params); - if (ZSTD_isError(len)) + &c->zstd_params); + if (zstd_is_error(len)) return 0; *((__le32 *) dst) = cpu_to_le32(len); @@ -546,7 +548,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; bool decompress_workspace_needed; - ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0); + ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); struct { unsigned feature; unsigned type; @@ -558,8 +560,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, - ZSTD_CCtxWorkspaceBound(params.cParams), - ZSTD_DCtxWorkspaceBound() }, + zstd_cctx_workspace_bound(¶ms.cParams), + zstd_dctx_workspace_bound() }, }, *i; int ret = 0; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 index 000000000000..745b1cdb0d17 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DARRAY_H +#define _BCACHEFS_DARRAY_H + +/* + * Dynamic arrays: + * + * Inspired by CCAN's darray + */ + +#include "util.h" +#include <linux/slab.h> + +#define DARRAY(type) \ +struct { \ + size_t nr, size; \ + type *data; \ +} + +typedef DARRAY(void) darray_void; + +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) +{ + if (d->nr + more > d->size) { + size_t new_size = roundup_pow_of_two(d->nr + more); + void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); + + if (!data) + return -ENOMEM; + + d->data = data; + d->size = new_size; + } + + return 0; +} + +#define darray_make_room(_d, _more) \ + __darray_make_room((darray_void *) &(_d), sizeof((_d).data[0]), (_more)) + +#define darray_top(_d) ((_d).data[(_d).nr]) + +#define darray_push(_d, _item) \ +({ \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + (_d).data[(_d).nr++] = (_item); \ + _ret; \ +}) + +#define darray_insert_item(_d, _pos, _item) \ +({ \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + array_insert_item((_d).data, (_d).nr, (_pos), (_item)); \ + _ret; \ +}) + +#define darray_for_each(_d, _i) \ + for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + +#define darray_init(_d) \ +do { \ + (_d).data = NULL; \ + (_d).nr = (_d).size = 0; \ +} while (0) + +#define darray_exit(_d) \ +do { \ + kfree((_d).data); \ + darray_init(_d); \ +} while (0) + +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ee5b7f696796..2d65ae370931 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -169,10 +169,11 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) failed |= bch2_btree_verify_replica(c, b, p); if (failed) { - char buf[200]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); - bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); + printbuf_exit(&buf); } out: mutex_unlock(&c->verify_lock); @@ -184,12 +185,12 @@ out: /* XXX: bch_fs refcounting */ struct dump_iter { - struct bpos from; - struct bch_fs *c; + struct bch_fs *c; enum btree_id id; + struct bpos from; + u64 iter; - char buf[1 << 12]; - size_t bytes; /* 
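darray.h above is a small generic growable array: a size/capacity pair plus a flat allocation. A hypothetical usage sketch in kernel context, assuming only the header added by this patch; the type name, the function and the pr_info() output are mine, not something this commit introduces:

#include "darray.h"

typedef DARRAY(u32) toy_u32_list;	/* struct { size_t nr, size; u32 *data; } */

static int toy_collect(void)
{
	toy_u32_list ids;
	u32 *id, i;
	int ret = 0;

	darray_init(ids);

	for (i = 0; i < 8; i++) {
		ret = darray_push(ids, i * i);	/* -ENOMEM if krealloc_array() fails */
		if (ret)
			goto out;
	}

	darray_for_each(ids, id)
		pr_info("collected %u\n", *id);
out:
	darray_exit(ids);			/* kfree()s the backing array */
	return ret;
}

Growth is by powers of two (roundup_pow_of_two() in __darray_make_room()), so repeated darray_push() calls are amortized O(1).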
what's currently in buf */ + struct printbuf buf; char __user *ubuf; /* destination user buffer */ size_t size; /* size of requested read */ @@ -198,9 +199,9 @@ struct dump_iter { static int flush_buf(struct dump_iter *i) { - if (i->bytes) { - size_t bytes = min(i->bytes, i->size); - int err = copy_to_user(i->ubuf, i->buf, bytes); + if (i->buf.pos) { + size_t bytes = min_t(size_t, i->buf.pos, i->size); + int err = copy_to_user(i->ubuf, i->buf.buf, bytes); if (err) return err; @@ -208,8 +209,8 @@ static int flush_buf(struct dump_iter *i) i->ret += bytes; i->ubuf += bytes; i->size -= bytes; - i->bytes -= bytes; - memmove(i->buf, i->buf + bytes, i->bytes); + i->buf.pos -= bytes; + memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); } return 0; @@ -226,15 +227,20 @@ static int bch2_dump_open(struct inode *inode, struct file *file) file->private_data = i; i->from = POS_MIN; + i->iter = 0; i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); i->id = bd->id; + i->buf = PRINTBUF; return 0; } static int bch2_dump_release(struct inode *inode, struct file *file) { - kfree(file->private_data); + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); return 0; } @@ -266,11 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, k = bch2_btree_iter_peek(&iter); while (k.k && !(err = bkey_err(k))) { - bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); - i->bytes = strlen(i->buf); - BUG_ON(i->bytes >= sizeof(i->buf)); - i->buf[i->bytes] = '\n'; - i->bytes++; + bch2_bkey_val_to_text(&i->buf, i->c, k); + pr_char(&i->buf, '\n'); k = bch2_btree_iter_next(&iter); i->from = iter.pos; @@ -319,8 +322,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { - bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); - i->bytes = strlen(i->buf); + bch2_btree_node_to_text(&i->buf, i->c, b); err = flush_buf(i); if (err) break; @@ -384,16 +386,14 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_btree_node_iter_peek(&l->iter, l->b); if (l->b != prev_node) { - bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); - i->bytes = strlen(i->buf); + bch2_btree_node_to_text(&i->buf, i->c, l->b); err = flush_buf(i); if (err) break; } prev_node = l->b; - bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); - i->bytes = strlen(i->buf); + bch2_bfloat_to_text(&i->buf, l->b, _k); err = flush_buf(i); if (err) break; @@ -422,10 +422,148 @@ static const struct file_operations bfloat_failed_debug_ops = { .read = bch2_read_bfloat_failed, }; +static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) +{ + out->tabstops[0] = 32; + + pr_buf(out, "%px btree=%s l=%u ", + b, + bch2_btree_ids[b->c.btree_id], + b->c.level); + pr_newline(out); + + pr_indent_push(out, 2); + + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + pr_newline(out); + + pr_buf(out, "flags: "); + pr_tab(out); + bch2_flags_to_text(out, bch2_btree_node_flags, b->flags); + pr_newline(out); + + pr_buf(out, "written:"); + pr_tab(out); + pr_buf(out, "%u", b->written); + pr_newline(out); + + pr_buf(out, "writes blocked:"); + pr_tab(out); + pr_buf(out, "%u", !list_empty_careful(&b->write_blocked)); + pr_newline(out); + + pr_buf(out, "will make reachable:"); + pr_tab(out); + pr_buf(out, "%lx", b->will_make_reachable); + pr_newline(out); + + pr_buf(out, "journal pin %px:", &b->writes[0].journal); + pr_tab(out); + pr_buf(out, "%llu", 
b->writes[0].journal.seq); + pr_newline(out); + + pr_buf(out, "journal pin %px:", &b->writes[1].journal); + pr_tab(out); + pr_buf(out, "%llu", b->writes[1].journal.seq); + pr_newline(out); + + pr_indent_pop(out, 2); +} + +static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + rcu_read_lock(); + i->buf.atomic++; + tbl = rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + if (i->iter < tbl->size) { + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++;; + } else { + done = true; + } + --i->buf.atomic; + rcu_read_unlock(); + } while (!done); + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations cached_btree_nodes_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_cached_btree_nodes_read, +}; + +static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); + i->iter++; + } while (!done); + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations journal_pins_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_journal_pins_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { - if (!IS_ERR_OR_NULL(c->debug)) - debugfs_remove_recursive(c->debug); + if (!IS_ERR_OR_NULL(c->fs_debug_dir)) + debugfs_remove_recursive(c->fs_debug_dir); } void bch2_fs_debug_init(struct bch_fs *c) @@ -437,29 +575,39 @@ void bch2_fs_debug_init(struct bch_fs *c) return; snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); - c->debug = debugfs_create_dir(name, bch_debug); - if (IS_ERR_OR_NULL(c->debug)) + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); + if (IS_ERR_OR_NULL(c->fs_debug_dir)) + return; + + debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, + c->btree_debug, &cached_btree_nodes_ops); + + debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, + c->btree_debug, &journal_pins_ops); + + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; for (bd = c->btree_debug; bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], - 0400, c->debug, bd, - &btree_debug_ops); + debugfs_create_file(bch2_btree_ids[bd->id], + 0400, c->btree_debug_dir, bd, + &btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", bch2_btree_ids[bd->id]); - bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, - &btree_format_debug_ops); + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", bch2_btree_ids[bd->id]); - bd->failed = debugfs_create_file(name, 
0400, c->debug, bd, - &bfloat_failed_debug_ops); + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &bfloat_failed_debug_ops); } } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 6f699b736b34..760e4f74715f 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -122,9 +122,9 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - bch_scnmemcpy(out, d.v->d_name, - bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %s", + pr_buf(out, "%.*s -> %llu type %s", + bch2_dirent_name_bytes(d), + d.v->d_name, d.v->d_type != DT_SUBVOL ? le64_to_cpu(d.v->d_inum) : le32_to_cpu(d.v->d_child_subvol), @@ -470,16 +470,13 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) if (ret) return ret; - for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir.inum, 0, snapshot), 0, k, ret) { - if (k.k->p.inode > dir.inum) - break; - + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + SPOS(dir.inum, 0, snapshot), + POS(dir.inum, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { ret = -ENOTEMPTY; break; } - } bch2_trans_iter_exit(trans, &iter); return ret; @@ -503,11 +500,9 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, - SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { - if (k.k->p.inode > inum.inum) - break; - + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, + SPOS(inum.inum, ctx->pos, snapshot), + POS(inum.inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index c52b6faac9b4..81b41b07c24b 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -17,24 +17,20 @@ static int group_cmp(const void *_l, const void *_r) strncmp(l->label, r->label, sizeof(l->label)); } -static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g, *sorted = NULL; - struct bch_sb_field_members *mi; - struct bch_member *m; - unsigned i, nr_groups, len; - const char *err = NULL; - - mi = bch2_sb_get_members(sb); - groups = bch2_sb_get_disk_groups(sb); - nr_groups = disk_groups_nr(groups); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + unsigned nr_groups = disk_groups_nr(groups); + unsigned i, len; + int ret = -EINVAL; - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) { + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; unsigned g; if (!BCH_MEMBER_GROUP(m)) @@ -42,45 +38,54 @@ static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, g = BCH_MEMBER_GROUP(m) - 1; - if (g >= nr_groups || - BCH_GROUP_DELETED(&groups->entries[g])) - return "disk has invalid group"; + if (g >= nr_groups) { + pr_buf(err, "disk %u has invalid label %u (have %u)", + i, g, nr_groups); + return -EINVAL; + } + + if (BCH_GROUP_DELETED(&groups->entries[g])) { + pr_buf(err, "disk %u has deleted label %u", i, g); + return -EINVAL; + } } if (!nr_groups) - return NULL; + return 0; + + for (i = 0; i < nr_groups; i++) { + g = groups->entries + i; - for (g = groups->entries; - g < groups->entries + nr_groups; - g++) { if (BCH_GROUP_DELETED(g)) continue; len = strnlen(g->label, sizeof(g->label)); if (!len) { - err = "group with empty label"; - 
goto err; + pr_buf(err, "label %u empty", i); + return -EINVAL; } } sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); if (!sorted) - return "cannot allocate memory"; + return -ENOMEM; memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); - for (i = 0; i + 1 < nr_groups; i++) - if (!BCH_GROUP_DELETED(sorted + i) && - !group_cmp(sorted + i, sorted + i + 1)) { - err = "duplicate groups"; + for (g = sorted; g + 1 < sorted + nr_groups; g++) + if (!BCH_GROUP_DELETED(g) && + !group_cmp(&g[0], &g[1])) { + pr_buf(err, "duplicate label %llu.%.*s", + BCH_GROUP_PARENT(g), + (int) sizeof(g->label), g->label); goto err; } - err = NULL; + ret = 0; err: kfree(sorted); - return err; + return 0; } static void bch2_sb_disk_groups_to_text(struct printbuf *out, @@ -338,12 +343,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, - struct bch_sb_handle *sb, - unsigned v) +void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct bch_sb_field_disk_groups *groups = - bch2_sb_get_disk_groups(sb->sb); + bch2_sb_get_disk_groups(sb); struct bch_disk_group *g; unsigned nr = 0; u16 path[32]; @@ -372,15 +375,13 @@ void bch2_disk_path_to_text(struct printbuf *out, v = path[--nr]; g = groups->entries + v; - bch_scnmemcpy(out, g->label, - strnlen(g->label, sizeof(g->label))); - + pr_buf(out, "%.*s", (int) sizeof(g->label), g->label); if (nr) pr_buf(out, "."); } return; inval: - pr_buf(out, "invalid group %u", v); + pr_buf(out, "invalid label %u", v); } int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) @@ -444,7 +445,10 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) return -EINVAL; } -void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) +void bch2_opt_target_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) { struct target t = target_decode(v); @@ -452,33 +456,49 @@ void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) case TARGET_NULL: pr_buf(out, "none"); break; - case TARGET_DEV: { - struct bch_dev *ca; - - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && percpu_ref_tryget(&ca->io_ref)) { - char b[BDEVNAME_SIZE]; - - pr_buf(out, "/dev/%s", - bdevname(ca->disk_sb.bdev, b)); - percpu_ref_put(&ca->io_ref); - } else if (ca) { - pr_buf(out, "offline device %u", t.dev); + case TARGET_DEV: + if (c) { + struct bch_dev *ca; + + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? 
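The rewritten bch2_sb_disk_groups_validate() above reports duplicate labels by sorting a scratch copy of the group table and comparing neighbours, naming the offending label instead of the old generic "duplicate groups" string. The same technique in self-contained userspace form (deleted-entry handling omitted; the toy_* names and sample labels are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define LABEL_LEN 16

struct toy_group { char label[LABEL_LEN]; };

static int toy_group_cmp(const void *l, const void *r)
{
	return strncmp(((const struct toy_group *) l)->label,
		       ((const struct toy_group *) r)->label, LABEL_LEN);
}

/* Returns 0 if all labels are unique, -1 (and prints the offender) otherwise */
static int toy_check_duplicates(const struct toy_group *groups, size_t nr)
{
	struct toy_group *sorted;
	size_t i;
	int ret = 0;

	sorted = malloc(nr * sizeof(*sorted));
	if (!sorted)
		return -1;

	memcpy(sorted, groups, nr * sizeof(*sorted));
	qsort(sorted, nr, sizeof(*sorted), toy_group_cmp);

	for (i = 0; i + 1 < nr; i++)
		if (!toy_group_cmp(&sorted[i], &sorted[i + 1])) {
			printf("duplicate label %.*s\n", LABEL_LEN, sorted[i].label);
			ret = -1;
			break;
		}

	free(sorted);
	return ret;
}

int main(void)
{
	struct toy_group g[] = { {"ssd"}, {"hdd"}, {"ssd"}, {"archive"} };

	return toy_check_duplicates(g, sizeof(g) / sizeof(g[0])) ? 1 : 0;
}

Sorting a scratch copy rather than the table itself keeps the superblock field untouched during validation.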
rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + char b[BDEVNAME_SIZE]; + + pr_buf(out, "/dev/%s", + bdevname(ca->disk_sb.bdev, b)); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + pr_buf(out, "offline device %u", t.dev); + } else { + pr_buf(out, "invalid device %u", t.dev); + } + + rcu_read_unlock(); } else { - pr_buf(out, "invalid device %u", t.dev); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_member *m = mi->members + t.dev; + + if (bch2_dev_exists(sb, mi, t.dev)) { + pr_buf(out, "Device "); + pr_uuid(out, m->uuid.b); + pr_buf(out, " (%u)", t.dev); + } else { + pr_buf(out, "Bad device %u", t.dev); + } } - - rcu_read_unlock(); break; - } case TARGET_GROUP: - mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(out, &c->disk_sb, t.group); - mutex_unlock(&c->sb_lock); + if (c) { + mutex_lock(&c->sb_lock); + bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); + mutex_unlock(&c->sb_lock); + } else { + bch2_disk_path_to_text(out, sb, t.group); + } break; default: BUG(); diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index 3d84f23c34ed..de915480514b 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -75,11 +75,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *); /* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, - unsigned); +void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); int bch2_sb_disk_groups_to_cpu(struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 3cccd1faade5..616a551265e0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -286,14 +286,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - char buf2[200]; + struct printbuf buf2 = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); bch_err_ratelimited(c, "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", (void *) _RET_IP_, i, j, v->csum_type, - want.lo, got.lo, buf2); + want.lo, got.lo, buf2.buf); + printbuf_exit(&buf2); clear_bit(i, buf->valid); break; } @@ -677,7 +678,7 @@ static int ec_stripe_delete(struct bch_fs *c, size_t idx) return bch2_btree_delete_range(c, BTREE_ID_stripes, POS(0, idx), POS(0, idx + 1), - NULL); + 0, NULL); } static void ec_stripe_delete_work(struct work_struct *work) @@ -1294,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(nr_have_data > h->s->nr_data); BUG_ON(nr_have_parity > h->s->nr_parity); - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - buckets.nr = 0; if (nr_have_parity < h->s->nr_parity) { ret = bch2_bucket_alloc_set(c, &buckets, @@ -1306,8 +1304,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, &nr_have_parity, &have_cache, h->copygc - ? RESERVE_MOVINGGC - : RESERVE_NONE, + ? 
RESERVE_movinggc + : RESERVE_none, 0, cl); @@ -1323,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, } if (ret) - goto err; + return ret; } buckets.nr = 0; @@ -1335,8 +1333,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, &nr_have_data, &have_cache, h->copygc - ? RESERVE_MOVINGGC - : RESERVE_NONE, + ? RESERVE_movinggc + : RESERVE_none, 0, cl); @@ -1351,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, } if (ret) - goto err; + return ret; } -err: - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return ret; + + return 0; } /* XXX: doesn't obey target: */ @@ -1558,50 +1554,48 @@ void bch2_stripes_heap_start(struct bch_fs *c) bch2_stripes_heap_insert(c, m, iter.pos); } -static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) +int bch2_stripes_read(struct bch_fs *c) { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; const struct bch_stripe *s; - struct bch_fs *c = trans->c; struct stripe *m; unsigned i; - int ret = 0; - - if (k.k->type != KEY_TYPE_stripe) - return 0; + int ret; - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - return ret; + bch2_trans_init(&trans, c, 0, 0); - s = bkey_s_c_to_stripe(k).v; + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) + continue; - m = genradix_ptr(&c->stripes, k.k->p.offset); - m->alive = true; - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; - for (i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + s = bkey_s_c_to_stripe(k).v; - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_update(c, m, k.k->p.offset); - spin_unlock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes, k.k->p.offset); + m->alive = true; + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; - return ret; -} + for (i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); -int bch2_stripes_read(struct bch_fs *c) -{ - struct btree_trans trans; - int ret; + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); + } + bch2_trans_iter_exit(&trans, &iter); - bch2_trans_init(&trans, c, 0, 0); - ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, - bch2_stripes_read_fn); bch2_trans_exit(&trans); + if (ret) bch_err(c, "error reading stripes: %i", ret); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 78d468c7680a..9d508a2f3bbc 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -14,6 +14,8 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_stripe, \ + .atomic_trigger = bch2_mark_stripe, \ } static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 2cea694575e9..8279a9ba76a5 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_ro: 
if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); + bch_err(c, "inconsistency detected - emergency read only"); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); @@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c) void bch2_fatal_error(struct bch_fs *c) { if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); + bch_err(c, "fatal error - emergency read only"); } void bch2_io_error_work(struct work_struct *work) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 986938298adc..6e63c38186f3 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -39,7 +39,7 @@ void bch2_topology_error(struct bch_fs *); #define bch2_fs_inconsistent_on(cond, c, ...) \ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_fs_inconsistent(c, __VA_ARGS__); \ @@ -59,7 +59,7 @@ do { \ #define bch2_dev_inconsistent_on(cond, ca, ...) \ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_dev_inconsistent(ca, __VA_ARGS__); \ @@ -67,6 +67,26 @@ do { \ }) /* + * When a transaction update discovers or is causing a fs inconsistency, it's + * helpful to also dump the pending updates: + */ +#define bch2_trans_inconsistent(trans, ...) \ +({ \ + bch_err(trans->c, __VA_ARGS__); \ + bch2_inconsistent_error(trans->c); \ + bch2_dump_trans_updates(trans); \ +}) + +#define bch2_trans_inconsistent_on(cond, trans, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_trans_inconsistent(trans, __VA_ARGS__); \ + _ret; \ +}) + +/* * Fsck errors: inconsistency errors we detect at mount time, and should ideally * be able to repair: */ @@ -129,7 +149,7 @@ void bch2_flush_fsck_errs(struct bch_fs *); /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ #define __fsck_err_on(cond, c, _flags, ...) \ - ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) + (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) #define need_fsck_err_on(cond, c, ...) \ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) @@ -164,7 +184,7 @@ do { \ #define bch2_fs_fatal_err_on(cond, c, ...) 
\ ({ \ - int _ret = !!(cond); \ + bool _ret = unlikely(!!(cond)); \ \ if (_ret) \ bch2_fs_fatal_error(c, __VA_ARGS__); \ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 58b2c96f450c..2fd5d9672a44 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - unsigned ret = 0; + unsigned ret = 0, lru = 0; bkey_extent_entry_for_each(ptrs, entry) { switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: + /* Might also be updating LRU btree */ + if (entry->ptr.cached) + lru++; + + fallthrough; case BCH_EXTENT_ENTRY_stripe_ptr: ret++; } } - return ret; + /* + * Updating keys in the alloc btree may also update keys in the + * freespace or discard btrees: + */ + return lru + ret * 2; } static int count_iters_for_insert(struct btree_trans *trans, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 44c584e9adaa..77a0d49a2372 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -954,15 +954,25 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ? bch_dev_bkey_exists(c, ptr->dev) : NULL; - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); + if (!ca) { + pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + pr_buf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, + b, offset, ptr->gen, + ptr->cached ? 
" cached" : ""); + + if (ca && ptr_stale(ca, ptr)) + pr_buf(out, " stale"); + } break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9c2567274a2b..ae650849d98a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -381,6 +381,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .key_invalid = bch2_btree_ptr_invalid, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ @@ -388,6 +390,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } /* KEY_TYPE_extent: */ @@ -402,6 +406,8 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ .key_merge = bch2_extent_merge, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ } /* KEY_TYPE_reservation: */ @@ -414,6 +420,8 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_invalid = bch2_reservation_invalid, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ + .trans_trigger = bch2_trans_mark_reservation, \ + .atomic_trigger = bch2_mark_reservation, \ } /* Extent checksum entries: */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 26d5cad7e6a5..05429c9631cd 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -17,10 +17,6 @@ * * With one based indexing each level of the tree starts at a power of two - * good for cacheline alignment: - * - * Size parameter is treated as if we were using 0 based indexing, however: - * valid nodes, and inorder indices, are in the range [1..size) - that is, there - * are actually size - 1 elements */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) @@ -42,12 +38,12 @@ static inline unsigned eytzinger1_right_child(unsigned i) static inline unsigned eytzinger1_first(unsigned size) { - return rounddown_pow_of_two(size - 1); + return rounddown_pow_of_two(size); } static inline unsigned eytzinger1_last(unsigned size) { - return rounddown_pow_of_two(size) - 1; + return rounddown_pow_of_two(size + 1) - 1; } /* @@ -62,13 +58,13 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i >= size); + EBUG_ON(i > size); - if (eytzinger1_right_child(i) < size) { + if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); - i <<= __fls(size) - __fls(i); - i >>= i >= size; + i <<= __fls(size + 1) - __fls(i); + i >>= i > size; } else { i >>= ffz(i) + 1; } @@ -78,14 +74,14 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i >= size); + EBUG_ON(i > size); - if (eytzinger1_left_child(i) < size) { + if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; - i <<= __fls(size) - __fls(i); + i <<= __fls(size + 1) - __fls(i); i -= 1; - i >>= i >= size; + i >>= i > size; } else { i >>= __ffs(i) + 1; } @@ -95,17 +91,17 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_extra(unsigned size) 
{ - return (size - rounddown_pow_of_two(size - 1)) << 1; + return (size + 1 - rounddown_pow_of_two(size)) << 1; } static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned extra) { unsigned b = __fls(i); - unsigned shift = __fls(size - 1) - b; + unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i >= size); + EBUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -130,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i >= size); + EBUG_ON(!i || i > size); /* * sign bit trick: @@ -144,7 +140,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, shift = __ffs(i); i >>= shift + 1; - i |= 1U << (__fls(size - 1) - shift); + i |= 1U << (__fls(size) - shift); return i; } @@ -185,39 +181,39 @@ static inline unsigned eytzinger0_right_child(unsigned i) static inline unsigned eytzinger0_first(unsigned size) { - return eytzinger1_first(size + 1) - 1; + return eytzinger1_first(size) - 1; } static inline unsigned eytzinger0_last(unsigned size) { - return eytzinger1_last(size + 1) - 1; + return eytzinger1_last(size) - 1; } static inline unsigned eytzinger0_next(unsigned i, unsigned size) { - return eytzinger1_next(i + 1, size + 1) - 1; + return eytzinger1_next(i + 1, size) - 1; } static inline unsigned eytzinger0_prev(unsigned i, unsigned size) { - return eytzinger1_prev(i + 1, size + 1) - 1; + return eytzinger1_prev(i + 1, size) - 1; } static inline unsigned eytzinger0_extra(unsigned size) { - return eytzinger1_extra(size + 1); + return eytzinger1_extra(size); } static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, unsigned extra) { - return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; + return __eytzinger1_to_inorder(i + 1, size, extra) - 1; } static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, unsigned extra) { - return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; + return __inorder_to_eytzinger1(i + 1, size, extra) - 1; } static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9cdd03f3eeb0..051372b88347 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -35,6 +35,15 @@ #include <trace/events/bcachefs.h> #include <trace/events/writeback.h> +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + static inline struct address_space *faults_disabled_mapping(void) { return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); @@ -1024,7 +1033,7 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); + BTREE_ITER_SLOTS); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -1062,8 +1071,6 @@ retry: sectors = min(sectors, k.k->size - offset_into_extent); - bch2_trans_unlock(trans); - if (readpages_iter) readpage_bio_extend(readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); @@ -1280,7 +1287,7 @@ static void bch2_writepage_io_done(struct closure *cl) * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ - WARN_ON(io->op.i_sectors_delta > 0); + WARN_ON_ONCE(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up @@ 
-1466,8 +1473,8 @@ do_io: sectors << 9, offset << 9)); /* Check for writing past i_size: */ - WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c))); + WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c))); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; @@ -1810,11 +1817,11 @@ again: * to check that the address is actually valid, when atomic * usercopies are used, below. */ - if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { bytes = min_t(unsigned long, iov_iter_count(iter), PAGE_SIZE - offset); - if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ret = -EFAULT; break; } @@ -1872,7 +1879,7 @@ static void bch2_dio_read_complete(struct closure *cl) { struct dio_read *dio = container_of(cl, struct dio_read, cl); - dio->req->ki_complete(dio->req, dio->ret, 0); + dio->req->ki_complete(dio->req, dio->ret); bio_check_or_release(&dio->rbio.bio, dio->should_dirty); } @@ -1921,7 +1928,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) iter->count -= shorten; bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_VECS), + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), &c->dio_read_bioset); bio->bi_end_io = bch2_direct_IO_read_endio; @@ -1956,7 +1963,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_VECS), + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), &c->bio_read); bio->bi_end_io = bch2_direct_IO_read_split_endio; start: @@ -2103,7 +2110,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) while (1) { iter_count = dio->iter.count; - if (kthread) + if (kthread && dio->mm) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -2113,7 +2120,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) dropped_locks = fdm_dropped_locks(); current->faults_disabled_mapping = NULL; - if (kthread) + if (kthread && dio->mm) kthread_unuse_mm(dio->mm); /* @@ -2246,7 +2253,7 @@ err: inode_dio_end(&inode->v); if (!sync) { - req->ki_complete(req, ret, 0); + req->ki_complete(req, ret); ret = -EIOCBQUEUED; } return ret; @@ -2306,9 +2313,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_is_bvec(iter) - ? 
0 - : iov_iter_npages(iter, BIO_MAX_VECS), + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); init_completion(&dio->done); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 2d2ad7f768c0..d462c06899d6 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -30,6 +30,7 @@ #include <linux/pagemap.h> #include <linux/posix_acl.h> #include <linux/random.h> +#include <linux/seq_file.h> #include <linux/statfs.h> #include <linux/string.h> #include <linux/xattr.h> @@ -104,7 +105,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum), - 0 && c->opts.inodes_use_key_cache); + c->opts.inodes_use_key_cache); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); @@ -134,7 +135,6 @@ int __must_check bch2_write_inode(struct bch_fs *c, int ret; bch2_trans_init(&trans, c, 0, 512); - trans.ip = _RET_IP_; retry: bch2_trans_begin(&trans); @@ -934,9 +934,9 @@ retry: bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(ei->v.i_ino, start, snapshot), 0); - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k)) && - bkey_cmp(iter.pos, end) < 0) { + while (!(ret = btree_trans_too_many_iters(&trans)) && + (k = bch2_btree_iter_peek_upto(&iter, end)).k && + !(ret = bkey_err(k))) { enum btree_id data_btree = BTREE_ID_extents; if (!bkey_extent_is_data(k.k) && @@ -1472,12 +1472,12 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode_inum(inode), true); + bch2_inode_rm(c, inode_inum(inode)); } } void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) + snapshot_id_list *s) { struct super_block *sb = c->vfs_sb; struct inode *inode; @@ -1675,7 +1675,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; enum bch_opt_id i; - char buf[512]; + struct printbuf buf = PRINTBUF; + int ret = 0; for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; @@ -1687,13 +1688,17 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - bch2_opt_to_text(&PBUF(buf), c, opt, v, + printbuf_reset(&buf); + bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); seq_putc(seq, ','); - seq_puts(seq, buf); + seq_puts(seq, buf.buf); } - return 0; + if (buf.allocation_failure) + ret = -ENOMEM; + printbuf_exit(&buf); + return ret; } static void bch2_put_super(struct super_block *sb) diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index b2211ec7f302..9f4b57e30e2a 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -191,7 +191,7 @@ int bch2_setattr_nonsize(struct user_namespace *, struct iattr *); int __bch2_unlink(struct inode *, struct dentry *, bool); -void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); +void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); void bch2_vfs_exit(void); int bch2_vfs_init(void); @@ -199,7 +199,7 @@ int bch2_vfs_init(void); #else static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) {} + snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 361dbf338023..2582ddf14803 100644 --- 
a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "bkey_buf.h" #include "btree_update.h" +#include "darray.h" #include "dirent.h" #include "error.h" #include "fs-common.h" @@ -471,11 +472,11 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, str pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; if (bkey_cmp(s->pos, pos)) - s->nr = 0; + s->ids.nr = 0; s->pos = pos; /* Might get called multiple times due to lock restarts */ - if (s->nr && s->d[s->nr - 1] == pos.snapshot) + if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) return 0; return snapshots_seen_add(c, s, pos.snapshot); @@ -498,7 +499,7 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see ancestor = snapshot_t(c, ancestor)->equiv; /* @ancestor should be the snapshot most recently added to @seen */ - BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); + BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); BUG_ON(seen->pos.snapshot != ancestor); if (id == ancestor) @@ -507,11 +508,11 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see if (!bch2_snapshot_is_ancestor(c, id, ancestor)) return false; - for (i = seen->nr - 2; - i >= 0 && seen->d[i] >= id; + for (i = seen->ids.nr - 2; + i >= 0 && seen->ids.data[i] >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && - bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && + bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) return false; return true; @@ -537,26 +538,25 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, } #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ + for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) +struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; +}; + struct inode_walker { bool first_this_inode; u64 cur_inum; - size_t nr; - size_t size; - struct inode_walker_entry { - struct bch_inode_unpacked inode; - u32 snapshot; - u64 count; - } *d; + DARRAY(struct inode_walker_entry) inodes; }; static void inode_walker_exit(struct inode_walker *w) { - kfree(w->d); - w->d = NULL; + darray_exit(w->inodes); } static struct inode_walker inode_walker_init(void) @@ -564,40 +564,17 @@ static struct inode_walker inode_walker_init(void) return (struct inode_walker) { 0, }; } -static int inode_walker_realloc(struct inode_walker *w) -{ - if (w->nr == w->size) { - size_t new_size = max_t(size_t, 8UL, w->size * 2); - void *d = krealloc(w->d, new_size * sizeof(w->d[0]), - GFP_KERNEL); - if (!d) - return -ENOMEM; - - w->d = d; - w->size = new_size; - } - - return 0; -} - static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c inode) { struct bch_inode_unpacked u; - int ret; - - ret = inode_walker_realloc(w); - if (ret) - return ret; BUG_ON(bch2_inode_unpack(inode, &u)); - w->d[w->nr++] = (struct inode_walker_entry) { + return darray_push(w->inodes, ((struct inode_walker_entry) { .inode = u, .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, - }; - - return 0; + })); } static int __walk_inode(struct btree_trans *trans, @@ -616,7 +593,7 @@ static int __walk_inode(struct btree_trans *trans, goto lookup_snapshot; } - w->nr = 0; + w->inodes.nr = 0; 
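A note on the fsck conversion above: these hunks drop the hand-rolled krealloc-grown arrays (inode_walker, pathbuf) in favour of the generic darray type, which carries nr/size/data fields plus push/for_each/insert/exit operations. The following is only a rough userspace sketch of that pattern, with invented sketch_* names and kernel-style statement-expression macros (GCC/Clang); it is not the actual fs/bcachefs/darray.h.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative typed growable array: nr elements in use, size allocated. */
#define SKETCH_DARRAY(type)	struct { size_t nr, size; type *data; }

/* Push one element, doubling the allocation as needed; returns 0 or -ENOMEM. */
#define sketch_darray_push(d, item)					\
({									\
	int _ret = 0;							\
									\
	if ((d).nr == (d).size) {					\
		size_t _new_size = (d).size ? (d).size * 2 : 8;		\
		void *_p = realloc((d).data,				\
				   _new_size * sizeof((d).data[0]));	\
									\
		if (_p) {						\
			(d).data = _p;					\
			(d).size = _new_size;				\
		} else {						\
			_ret = -ENOMEM;					\
		}							\
	}								\
	if (!_ret)							\
		(d).data[(d).nr++] = (item);				\
	_ret;								\
})

#define sketch_darray_for_each(d, i)					\
	for ((i) = (d).data; (i) < (d).data + (d).nr; (i)++)

#define sketch_darray_exit(d)						\
do {									\
	free((d).data);							\
	(d).data = NULL;						\
	(d).nr = (d).size = 0;						\
} while (0)

typedef SKETCH_DARRAY(int) sketch_int_darray;

int main(void)
{
	sketch_int_darray nums = { 0 };
	int v, *i;

	for (v = 1; v <= 3; v++)
		if (sketch_darray_push(nums, v * 10))
			return 1;

	sketch_darray_for_each(nums, i)
		printf("%d\n", *i);	/* prints 10 20 30 */

	sketch_darray_exit(nums);
	return 0;
}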
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -634,26 +611,25 @@ static int __walk_inode(struct btree_trans *trans, w->cur_inum = pos.inode; w->first_this_inode = true; lookup_snapshot: - for (i = 0; i < w->nr; i++) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) + for (i = 0; i < w->inodes.nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) goto found; return INT_MAX; found: - BUG_ON(pos.snapshot > w->d[i].snapshot); + BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); - if (pos.snapshot != w->d[i].snapshot) { + if (pos.snapshot != w->inodes.data[i].snapshot) { ancestor_pos = i; - while (i && w->d[i - 1].snapshot > pos.snapshot) + while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) --i; - ret = inode_walker_realloc(w); + ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]); if (ret) return ret; - array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); - w->d[i].snapshot = pos.snapshot; - w->d[i].count = 0; + w->inodes.data[i].snapshot = pos.snapshot; + w->inodes.data[i].count = 0; } return i; @@ -669,7 +645,7 @@ static int __get_visible_inodes(struct btree_trans *trans, struct bkey_s_c k; int ret; - w->nr = 0; + w->inodes.nr = 0; for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -695,15 +671,16 @@ static int check_key_has_snapshot(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; fsck_err: + printbuf_exit(&buf); return ret; } @@ -743,7 +720,7 @@ static int hash_check_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; - char buf[200]; + struct printbuf buf = PRINTBUF; struct bkey_s_c k; u64 hash; int ret = 0; @@ -767,8 +744,9 @@ static int hash_check_key(struct btree_trans *trans, if (fsck_err_on(k.k->type == desc.key_type && !desc.cmp_bkey(k, hash_k), c, "duplicate hash table keys:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - hash_k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + buf.buf))) { ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; break; } @@ -779,13 +757,16 @@ static int hash_check_key(struct btree_trans *trans, } } +out: bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; bad_hash: if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " "hashed to %llu\n%s", desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, - (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) return 0; ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); @@ -793,9 +774,9 @@ bad_hash: bch_err(c, "hash_redo_key err %i", ret); return ret; } - return -EINTR; + ret = -EINTR; fsck_err: - return ret; + goto out; } static int check_inode(struct btree_trans *trans, @@ -1125,7 +1106,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) int ret = 0, ret2 = 0; s64 count2; - for (i = w->d; i < w->d + w->nr; i++) { + 
darray_for_each(w->inodes, i) { if (i->inode.bi_sectors == i->count) continue; @@ -1163,32 +1144,34 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct bkey_s_c k; struct inode_walker_entry *i; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; k = bch2_btree_iter_peek(iter); if (!k.k) - return 0; + goto out; ret = bkey_err(k); if (ret) - return ret; + goto err; ret = check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; + if (ret) { + ret = ret < 0 ? ret : 0; + goto out; + } ret = snapshots_seen_update(c, s, k.k->p); if (ret) - return ret; + goto err; if (k.k->type == KEY_TYPE_whiteout) - return 0; + goto out; if (inode->cur_inum != k.k->p.inode) { ret = check_i_sectors(trans, inode); if (ret) - return ret; + goto err; } #if 0 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { @@ -1198,33 +1181,43 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); bch2_bkey_val_to_text(&PBUF(buf2), c, k); - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) - return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { + ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + goto out; + } } #endif ret = __walk_inode(trans, inode, k.k->p); if (ret < 0) - return ret; + goto err; if (fsck_err_on(ret == INT_MAX, c, "extent in missing inode:\n %s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } - if (ret == INT_MAX) - return 0; + if (ret == INT_MAX) { + ret = 0; + goto out; + } - i = inode->d + ret; + i = inode->inodes.data + ret; ret = 0; if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && !S_ISLNK(i->inode.bi_mode), c, "extent in non regular inode mode %o:\n %s", i->inode.bi_mode, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { @@ -1234,11 +1227,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, "extent type %u offset %llu past end of inode %llu, i_size %llu", k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, k.k->p.snapshot), POS(k.k->p.inode, U64_MAX), 0, NULL) ?: -EINTR; + goto out; } } } @@ -1250,7 +1244,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_reassemble(&prev, c, k); #endif +out: +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1309,12 +1306,13 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) int ret = 0, ret2 = 0; s64 count2; - for (i = w->d; i < w->d + w->nr; i++) { + darray_for_each(w->inodes, i) { if (i->inode.bi_nlink == i->count) continue; - count2 = lockrestart_do(trans, - bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); + count2 = 
bch2_count_subdirs(trans, w->cur_inum, i->snapshot); + if (count2 < 0) + return count2; if (i->count != count2) { bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", @@ -1347,7 +1345,7 @@ static int check_dirent_target(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_i_dirent *n; bool backpointer_exists = true; - char buf[200]; + struct printbuf buf = PRINTBUF; int ret = 0; if (!target->bi_dir && @@ -1373,9 +1371,7 @@ static int check_dirent_target(struct btree_trans *trans, "directory %llu with multiple links", target->bi_inum)) { ret = __remove_dirent(trans, d.k->p); - if (ret) - goto err; - return 0; + goto out; } if (fsck_err_on(backpointer_exists && @@ -1412,18 +1408,19 @@ static int check_dirent_target(struct btree_trans *trans, "incorrect d_type: got %s, should be %s:\n%s", bch2_d_type_str(d.v->d_type), bch2_d_type_str(inode_d_type(target)), - (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) - return ret; + goto err; d = dirent_i_to_s_c(n); } @@ -1437,19 +1434,21 @@ static int check_dirent_target(struct btree_trans *trans, n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_reassemble(&n->k_i, d.s_c); n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) - return ret; + goto err; d = dirent_i_to_s_c(n); } +out: err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1463,68 +1462,81 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k; struct bkey_s_c_dirent d; struct inode_walker_entry *i; - char buf[200]; - int ret; + struct printbuf buf = PRINTBUF; + int ret = 0; k = bch2_btree_iter_peek(iter); if (!k.k) - return 0; + goto out; ret = bkey_err(k); if (ret) - return ret; + goto err; ret = check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; + if (ret) { + ret = ret < 0 ? 
ret : 0; + goto out; + } ret = snapshots_seen_update(c, s, k.k->p); if (ret) - return ret; + goto err; if (k.k->type == KEY_TYPE_whiteout) - return 0; + goto out; if (dir->cur_inum != k.k->p.inode) { ret = check_subdir_count(trans, dir); if (ret) - return ret; + goto err; } ret = __walk_inode(trans, dir, k.k->p); if (ret < 0) - return ret; + goto err; if (fsck_err_on(ret == INT_MAX, c, "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } - if (ret == INT_MAX) - return 0; + if (ret == INT_MAX) { + ret = 0; + goto out; + } - i = dir->d + ret; + i = dir->inodes.data + ret; ret = 0; if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, "dirent in non directory inode type %s:\n%s", bch2_d_type_str(inode_d_type(&i->inode)), - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return bch2_btree_delete_at(trans, iter, 0); + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } if (dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) - return ret; - if (ret) /* dirent has been deleted */ - return 0; + goto err; + if (ret) { + /* dirent has been deleted */ + ret = 0; + goto out; + } if (k.k->type != KEY_TYPE_dirent) - return 0; + goto out; d = bkey_s_c_to_dirent(k); @@ -1537,24 +1549,27 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = __subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); if (ret && ret != -ENOENT) - return ret; + goto err; if (fsck_err_on(ret, c, "dirent points to missing subvolume %llu", - le64_to_cpu(d.v->d_child_subvol))) - return __remove_dirent(trans, d.k->p); + le64_to_cpu(d.v->d_child_subvol))) { + ret = __remove_dirent(trans, d.k->p); + goto err; + } ret = __lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && ret != -ENOENT) - return ret; + goto err; if (fsck_err_on(ret, c, "subvolume %u points to missing subvolume root %llu", target_subvol, target_inum)) { bch_err(c, "repair not implemented yet"); - return -EINVAL; + ret = -EINVAL; + goto err; } if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, @@ -1564,32 +1579,33 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, subvol_root.bi_subvol = target_subvol; ret = __write_inode(trans, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } else { ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) - return ret; + goto err; - if (fsck_err_on(!target->nr, c, + if (fsck_err_on(!target->inodes.nr, c, "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { ret = __remove_dirent(trans, d.k->p); if (ret) - return ret; + goto err; } - for (i = target->d; i < target->d + target->nr; i++) { + darray_for_each(target->inodes, i) { ret = check_dirent_target(trans, iter, d, &i->inode, i->snapshot); if (ret) - return ret; + goto err; } } @@ -1597,7 +1613,10 @@ static int 
check_dirent(struct btree_trans *trans, struct btree_iter *iter, for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) i->count++; +out: +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1680,7 +1699,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = 0; if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, &inode->d[0].inode); + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: @@ -1790,21 +1809,18 @@ static int check_root(struct bch_fs *c) check_root_trans(&trans)); } -struct pathbuf { - size_t nr; - size_t size; - - struct pathbuf_entry { - u64 inum; - u32 snapshot; - } *entries; +struct pathbuf_entry { + u64 inum; + u32 snapshot; }; -static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) +typedef DARRAY(struct pathbuf_entry) pathbuf; + +static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { struct pathbuf_entry *i; - for (i = p->entries; i < p->entries + p->nr; i++) + darray_for_each(*p, i) if (i->inum == inum && i->snapshot == snapshot) return true; @@ -1812,26 +1828,18 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) +static int path_down(struct bch_fs *c, pathbuf *p, + u64 inum, u32 snapshot) { - if (p->nr == p->size) { - size_t new_size = max_t(size_t, 256UL, p->size * 2); - void *n = krealloc(p->entries, - new_size * sizeof(p->entries[0]), - GFP_KERNEL); - if (!n) { - return -ENOMEM; - } - - p->entries = n; - p->size = new_size; - }; - - p->entries[p->nr++] = (struct pathbuf_entry) { + int ret = darray_push(*p, ((struct pathbuf_entry) { .inum = inum, .snapshot = snapshot, - }; - return 0; + })); + + if (ret) + bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", + p->size); + return ret; } /* @@ -1840,7 +1848,7 @@ static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) * XXX: we should also be verifying that inodes are in the right subvolumes */ static int check_path(struct btree_trans *trans, - struct pathbuf *p, + pathbuf *p, struct bch_inode_unpacked *inode, u32 snapshot) { @@ -1893,7 +1901,7 @@ static int check_path(struct btree_trans *trans, if (!S_ISDIR(inode->bi_mode)) break; - ret = path_down(p, inode->bi_inum, snapshot); + ret = path_down(c, p, inode->bi_inum, snapshot); if (ret) { bch_err(c, "memory allocation failure"); return ret; @@ -1914,7 +1922,7 @@ static int check_path(struct btree_trans *trans, /* XXX print path */ bch_err(c, "directory structure loop"); - for (i = p->entries; i < p->entries + p->nr; i++) + darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); @@ -1951,7 +1959,7 @@ static int check_directory_structure(struct bch_fs *c) struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; - struct pathbuf path = { 0, 0, NULL }; + pathbuf path = { 0, }; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -1981,7 +1989,7 @@ static int check_directory_structure(struct bch_fs *c) BUG_ON(ret == -EINTR); - kfree(path.entries); + darray_exit(path); bch2_trans_exit(&trans); return ret; @@ -1998,12 +2006,15 @@ struct nlink_table { } *d; }; -static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) +static int add_nlink(struct bch_fs *c, struct nlink_table *t, + u64 inum, u32 snapshot) { if (t->nr == t->size) { size_t new_size = max_t(size_t, 128UL, t->size * 2); void *d = kvmalloc(new_size * 
sizeof(t->d[0]), GFP_KERNEL); if (!d) { + bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", + new_size); return -ENOMEM; } @@ -2093,7 +2104,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, if (!u.bi_nlink) continue; - ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot); + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); if (ret) { *end = k.k->p.offset; ret = 0; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index ef6da53567b8..14b0b595202d 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -252,15 +252,13 @@ int bch2_inode_peek(struct btree_trans *trans, u32 snapshot; int ret; - if (0 && trans->c->opts.inodes_use_key_cache) - flags |= BTREE_ITER_CACHED; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) return ret; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), flags); + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -585,79 +583,62 @@ found_slot: static int bch2_inode_delete_keys(struct btree_trans *trans, subvol_inum inum, enum btree_id id) { - u64 offset = 0; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + u32 snapshot; int ret = 0; - while (!ret || ret == -EINTR) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i delete; - u32 snapshot; + /* + * We're never going to be deleting extents, no need to use an extent + * iterator: + */ + bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + while (1) { bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - continue; + goto err; - bch2_trans_iter_init(trans, &iter, id, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); - - if (!k.k || iter.pos.inode != inum.inum) { - bch2_trans_iter_exit(trans, &iter); - break; - } + bch2_btree_iter_set_snapshot(&iter, snapshot); + k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); ret = bkey_err(k); if (ret) goto err; + if (!k.k) + break; + bkey_init(&delete.k); delete.k.p = iter.pos; - if (btree_node_type_is_extents(iter.btree_id)) { - unsigned max_sectors = - min_t(u64, U64_MAX - iter.pos.offset, - KEY_SIZE_MAX & (~0 << trans->c->block_bits)); - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - - ret = bch2_extent_trim_atomic(trans, &iter, &delete); - if (ret) - goto err; - } - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: - bch2_trans_commit(trans, &disk_res, NULL, + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); - bch2_disk_reservation_put(trans->c, &disk_res); err: - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); + if (ret && ret != -EINTR) + break; } + bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) +int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans trans; struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; struct bkey_s_c k; - unsigned iter_flags = BTREE_ITER_INTENT; u32 snapshot; int ret; - if (0 && cached && c->opts.inodes_use_key_cache) - iter_flags |= BTREE_ITER_CACHED; - bch2_trans_init(&trans, c, 0, 1024); /* @@ -681,7 +662,8 @@ retry: goto err; bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - SPOS(0, 
inum.inum, snapshot), iter_flags); + SPOS(0, inum.inum, snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 723186d8afb6..2337ecfc600e 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -13,11 +13,15 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode (struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ } #define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ .key_invalid = bch2_inode_v2_invalid, \ .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ } static inline bool bkey_is_inode(const struct bkey *k) @@ -87,7 +91,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, u32, u64); -int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); +int bch2_inode_rm(struct bch_fs *, subvol_inum); int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 50b90b728a6d..36929451af2c 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -764,6 +764,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) struct bch_fs *c = op->c; struct nonce nonce = extent_nonce(op->version, op->crc); struct bch_csum csum; + int ret; if (!bch2_csum_type_is_encryption(op->crc.csum_type)) return 0; @@ -778,10 +779,10 @@ static int bch2_write_decrypt(struct bch_write_op *op) if (bch2_crc_cmp(op->crc.csum, csum)) return -EIO; - bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); op->crc.csum_type = 0; op->crc.csum = (struct bch_csum) { 0, 0 }; - return 0; + return ret; } static enum prep_encoded_ret { @@ -996,8 +997,11 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.live_size = src_len >> 9; swap(dst->bi_iter.bi_size, dst_len); - bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); + ret = bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + if (ret) + goto err; + crc.csum = bch2_checksum_bio(c, op->csum_type, extent_nonce(version, crc), dst); crc.csum_type = op->csum_type; @@ -1055,7 +1059,7 @@ static void __bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct write_point *wp; - struct bio *bio; + struct bio *bio = NULL; bool skip_put = true; unsigned nofs_flags; int ret; @@ -1772,6 +1776,7 @@ static void __bch2_read_endio(struct work_struct *work) struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; struct bch_csum csum; + int ret; nofs_flags = memalloc_nofs_save(); @@ -1806,7 +1811,10 @@ static void __bch2_read_endio(struct work_struct *work) crc.live_size = bvec_iter_sectors(rbio->bvec_iter); if (crc_is_compressed(crc)) { - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; } else { @@ -1817,7 +1825,9 @@ static void __bch2_read_endio(struct work_struct *work) BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); src->bi_iter.bi_size 
= dst_iter.bi_size; - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; @@ -1830,7 +1840,10 @@ static void __bch2_read_endio(struct work_struct *work) * Re encrypt data we decrypted, so it's consistent with * rbio->crc: */ - bch2_encrypt_bio(c, crc.csum_type, nonce, src); + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + promote_start(rbio->promote, rbio); rbio->promote = NULL; } @@ -1865,6 +1878,11 @@ decompression_err: "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); goto out; +decrypt_err: + bch_err_inum_ratelimited(c, rbio->read_pos.inode, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; } static void bch2_read_endio(struct bio *bio) @@ -1893,9 +1911,8 @@ static void bch2_read_endio(struct bio *bio) return; } - if (rbio->pick.ptr.cached && - (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr))) { + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -1954,6 +1971,35 @@ err: return ret; } +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf.buf); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), + BTREE_ITER_CACHED); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) + goto out; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s", buf.buf); + bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + bch2_trans_iter_exit(trans, &iter); +out: + printbuf_exit(&buf); +} + int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, @@ -1963,7 +2009,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct bch_dev *ca; + struct bch_dev *ca = NULL; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); @@ -1980,7 +2026,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, zero_fill_bio_iter(&orig->bio, iter); goto out_read_done; } - +retry_pick: pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ @@ -1993,8 +2039,27 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto err; } - if (pick_ret > 0) - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + /* + * Stale dirty pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && + 
unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); if (flags & BCH_READ_NODECODE) { /* @@ -2241,7 +2306,7 @@ retry: bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); + BTREE_ITER_SLOTS); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; @@ -2282,12 +2347,6 @@ retry: */ sectors = min(sectors, k.k->size - offset_into_extent); - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(&trans); - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 1aa422dccef7..fb5114518666 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -50,7 +50,7 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { - return op->alloc_reserve == RESERVE_MOVINGGC + return op->alloc_reserve == RESERVE_movinggc ? op->c->copygc_wq : op->c->btree_update_wq; } @@ -79,7 +79,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->compression_type = bch2_compression_opt_to_type[opts.compression]; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; - op->alloc_reserve = RESERVE_NONE; + op->alloc_reserve = RESERVE_none; op->incompressible = 0; op->open_buckets.nr = 0; op->devs_have.nr = 0; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index e0017dcf3312..505e8367b5f2 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -15,23 +15,26 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "super-io.h" #include <trace/events/bcachefs.h> -static u64 last_unwritten_seq(struct journal *j) -{ - union journal_res_state s = READ_ONCE(j->reservations); +#define x(n) #n, +static const char * const bch2_journal_watermarks[] = { + JOURNAL_WATERMARKS() + NULL +}; - lockdep_assert_held(&j->lock); - - return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); -} +static const char * const bch2_journal_errors[] = { + JOURNAL_ERRORS() + NULL +}; +#undef x static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { - return seq >= last_unwritten_seq(j); + return seq > j->seq_ondisk; } static bool __journal_entry_is_open(union journal_res_state state) @@ -39,6 +42,11 @@ static bool __journal_entry_is_open(union journal_res_state state) return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; } +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + static bool journal_entry_is_open(struct journal *j) { return __journal_entry_is_open(j->reservations); @@ -50,8 +58,6 @@ journal_seq_to_buf(struct journal *j, u64 seq) struct journal_buf *buf = NULL; EBUG_ON(seq > journal_cur_seq(j)); - EBUG_ON(seq == journal_cur_seq(j) && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); if (journal_seq_unwritten(j, seq)) { buf = j->buf + (seq & JOURNAL_BUF_MASK); @@ -69,54 +75,6 @@ static void journal_pin_list_init(struct journal_entry_pin_list 
*p, int count) p->devs.nr = 0; } -static void journal_pin_new_entry(struct journal *j) -{ - /* - * The fifo_push() needs to happen at the same time as j->seq is - * incremented for journal_last_seq() to be calculated correctly - */ - atomic64_inc(&j->seq); - journal_pin_list_init(fifo_push_ref(&j->pin), 1); -} - -static void bch2_journal_buf_init(struct journal *j) -{ - struct journal_buf *buf = journal_cur_buf(j); - - bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - - memset(buf->data, 0, sizeof(*buf->data)); - buf->data->seq = cpu_to_le64(journal_cur_seq(j)); - buf->data->u64s = 0; -} - -void bch2_journal_halt(struct journal *j) -{ - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); - - do { - old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return; - - new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); - - /* - * XXX: we're not using j->lock here because this can be called from - * interrupt context, this can race with journal_write_done() - */ - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); - closure_wake_up(&journal_cur_buf(j)->wait); -} - /* journal entry close/open: */ void __bch2_journal_buf_put(struct journal *j) @@ -132,7 +90,7 @@ void __bch2_journal_buf_put(struct journal *j) * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ -static bool __journal_entry_close(struct journal *j) +static void __journal_entry_close(struct journal *j, unsigned closed_val) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -140,34 +98,24 @@ static bool __journal_entry_close(struct journal *j) u64 v = atomic64_read(&j->reservations.counter); unsigned sectors; + BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && + closed_val != JOURNAL_ENTRY_ERROR_VAL); + lockdep_assert_held(&j->lock); do { old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) - return true; - - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { - /* this entry will never be written: */ - closure_wake_up(&buf->wait); - return true; - } - - if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { - set_bit(JOURNAL_NEED_WRITE, &j->flags); - j->need_write_time = local_clock(); - } + new.cur_entry_offset = closed_val; - new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; - new.idx++; - - if (new.idx == new.unwritten_idx) - return false; - - BUG_ON(journal_state_count(new, new.idx)); + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || + old.cur_entry_offset == new.cur_entry_offset) + return; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + if (!__journal_entry_is_open(old)) + return; + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); @@ -197,36 +145,42 @@ static bool __journal_entry_close(struct journal *j) */ buf->last_seq = journal_last_seq(j); buf->data->last_seq = cpu_to_le64(buf->last_seq); + BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); - /* Initialize new buffer: */ - journal_pin_new_entry(j); - - bch2_journal_buf_init(j); - cancel_delayed_work(&j->write_work); - clear_bit(JOURNAL_NEED_WRITE, &j->flags); bch2_journal_space_available(j); bch2_journal_buf_put(j, old.idx); - return true; +} + +void 
bch2_journal_halt(struct journal *j) +{ + spin_lock(&j->lock); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + spin_unlock(&j->lock); } static bool journal_entry_want_write(struct journal *j) { - union journal_res_state s = READ_ONCE(j->reservations); - bool ret = false; + bool ret = !journal_entry_is_open(j) || + journal_cur_seq(j) == journal_last_unwritten_seq(j); - /* - * Don't close it yet if we already have a write in flight, but do set - * NEED_WRITE: - */ - if (s.idx != s.unwritten_idx) - set_bit(JOURNAL_NEED_WRITE, &j->flags); - else - ret = __journal_entry_close(j); + /* Don't close it yet if we already have a write in flight: */ + if (ret) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + else if (nr_unwritten_journal_entries(j)) { + struct journal_buf *buf = journal_cur_buf(j); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + } return ret; } @@ -255,34 +209,71 @@ static bool journal_entry_close(struct journal *j) static int journal_entry_open(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = journal_cur_buf(j); + struct journal_buf *buf = j->buf + + ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); union journal_res_state old, new; int u64s; u64 v; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return cur_entry_blocked; + return JOURNAL_ERR_blocked; if (j->cur_entry_error) return j->cur_entry_error; + if (bch2_journal_error(j)) + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + + if (!fifo_free(&j->pin)) + return JOURNAL_ERR_journal_pin_full; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1) + return JOURNAL_ERR_max_in_flight; + BUG_ON(!j->cur_entry_sectors); + buf->expires = + (journal_cur_seq(j) == j->flushed_seq_ondisk + ? 
jiffies + : j->last_flush_write) + + msecs_to_jiffies(c->opts.journal_flush_delay); + buf->u64s_reserved = j->entry_u64s_reserved; buf->disk_sectors = j->cur_entry_sectors; buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); u64s = (int) (buf->sectors << 9) / sizeof(u64) - journal_entry_overhead(j); - u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + + if (u64s <= 0) + return JOURNAL_ERR_journal_full; - if (u64s <= le32_to_cpu(buf->data->u64s)) - return cur_entry_journal_full; + if (fifo_empty(&j->pin) && j->reclaim_thread) + wake_up_process(j->reclaim_thread); + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); + journal_pin_list_init(fifo_push_ref(&j->pin), 1); + + BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); + + bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; + + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(journal_cur_seq(j)); + buf->data->u64s = 0; /* * Must be set before marking the journal entry as open: @@ -293,14 +284,14 @@ static int journal_entry_open(struct journal *j) do { old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return cur_entry_insufficient_devices; + BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + new.idx++; + BUG_ON(journal_state_count(new, new.idx)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); - EBUG_ON(journal_state_count(new, new.idx)); journal_state_inc(&new); + new.cur_entry_offset = 0; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -318,8 +309,7 @@ static int journal_entry_open(struct journal *j) static bool journal_quiesced(struct journal *j) { - union journal_res_state s = READ_ONCE(j->reservations); - bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); + bool ret = atomic64_read(&j->seq) == j->seq_ondisk; if (!ret) journal_entry_close(j); @@ -334,8 +324,21 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + long delta; - journal_entry_close(j); + spin_lock(&j->lock); + if (!__journal_entry_is_open(j->reservations)) + goto unlock; + + delta = journal_cur_buf(j)->expires - jiffies; + + if (delta > 0) + mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); +unlock: + spin_unlock(&j->lock); } static int __journal_res_get(struct journal *j, struct journal_res *res, @@ -364,13 +367,12 @@ retry: return 0; } - if (!(flags & JOURNAL_RES_GET_RESERVED) && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { /* * Don't want to close current journal entry, just need to * invoke reclaim: */ - ret = cur_entry_journal_full; + ret = JOURNAL_ERR_journal_full; goto unlock; } @@ -385,20 +387,13 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - if (journal_entry_is_open(j) && - !__journal_entry_close(j)) { - /* - * We failed to get a reservation on the current open journal 
- * entry because it's full, and we can't close it because - * there's still a previous one in flight: - */ + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + ret = journal_entry_open(j); + + if (ret == JOURNAL_ERR_max_in_flight) trace_journal_entry_full(c); - ret = cur_entry_blocked; - } else { - ret = journal_entry_open(j); - } unlock: - if ((ret && ret != cur_entry_insufficient_devices) && + if ((ret && ret != JOURNAL_ERR_insufficient_devices) && !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; trace_journal_full(c); @@ -410,23 +405,24 @@ unlock: if (!ret) goto retry; - if ((ret == cur_entry_journal_full || - ret == cur_entry_journal_pin_full) && + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && !can_discard && - j->reservations.idx == j->reservations.unwritten_idx && - (flags & JOURNAL_RES_GET_RESERVED)) { - char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); - - bch_err(c, "Journal stuck!"); - if (journal_debug_buf) { - bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); - bch_err(c, "%s", journal_debug_buf); - - bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); - bch_err(c, "Journal pins:\n%s", journal_debug_buf); - kfree(journal_debug_buf); - } + !nr_unwritten_journal_entries(j) && + (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { + struct printbuf buf = PRINTBUF; + + bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)", + bch2_journal_errors[ret]); + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); + + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); + + printbuf_exit(&buf); bch2_fatal_error(c); dump_stack(); } @@ -435,8 +431,8 @@ unlock: * Journal is full - can't rely on reclaim from work item due to * freezing: */ - if ((ret == cur_entry_journal_full || - ret == cur_entry_journal_pin_full) && + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); @@ -449,7 +445,7 @@ unlock: } } - return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; + return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN; } /* @@ -528,7 +524,7 @@ void bch2_journal_entry_res_resize(struct journal *j, /* * Not enough room in current journal entry, have to flush it: */ - __journal_entry_close(j); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); } else { journal_cur_buf(j)->u64s_reserved += d; } @@ -573,12 +569,15 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, } /* if seq was written, but not flushed - flush a newer one instead */ - seq = max(seq, last_unwritten_seq(j)); + seq = max(seq, journal_last_unwritten_seq(j)); recheck_need_open: - if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { + if (seq > journal_cur_seq(j)) { struct journal_res res = { 0 }; + if (journal_entry_is_open(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + spin_unlock(&j->lock); ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); @@ -588,7 +587,11 @@ recheck_need_open: seq = res.seq; buf = j->buf + (seq & JOURNAL_BUF_MASK); buf->must_flush = true; - set_bit(JOURNAL_NEED_WRITE, &j->flags); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } if (parent && !closure_wait(&buf->wait, parent)) BUG(); @@ -640,6 +643,58 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) return ret ?: ret2 < 0 ? 
ret2 : 0; } +/* + * bch2_journal_flush_async - if there is an open journal entry, or a journal + * still being written, write it and wait for the write to complete + */ +void bch2_journal_flush_async(struct journal *j, struct closure *parent) +{ + bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); +} + +int bch2_journal_flush(struct journal *j) +{ + return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); +} + +/* + * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * @seq + */ +bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 unwritten_seq; + bool ret = false; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) + return false; + + if (seq <= c->journal.flushed_seq_ondisk) + return false; + + spin_lock(&j->lock); + if (seq <= c->journal.flushed_seq_ondisk) + goto out; + + for (unwritten_seq = journal_last_unwritten_seq(j); + unwritten_seq < seq; + unwritten_seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); + + /* journal write is already in flight, and was a flush write: */ + if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + goto out; + + buf->noflush = true; + } + + ret = true; +out: + spin_unlock(&j->lock); + return ret; +} + int bch2_journal_meta(struct journal *j) { struct journal_buf *buf; @@ -654,55 +709,48 @@ int bch2_journal_meta(struct journal *j) buf = j->buf + (res.seq & JOURNAL_BUF_MASK); buf->must_flush = true; - set_bit(JOURNAL_NEED_WRITE, &j->flags); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } bch2_journal_res_put(j, &res); return bch2_journal_flush_seq(j, res.seq); } -/* - * bch2_journal_flush_async - if there is an open journal entry, or a journal - * still being written, write it and wait for the write to complete - */ -void bch2_journal_flush_async(struct journal *j, struct closure *parent) +int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) 
{ - u64 seq, journal_seq; + struct jset_entry_log *entry; + struct journal_res res = { 0 }; + unsigned msglen, u64s; + va_list args; + int ret; - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); + va_start(args, fmt); + msglen = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return; - } - spin_unlock(&j->lock); + u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); - bch2_journal_flush_seq_async(j, seq, parent); -} + ret = bch2_journal_res_get(j, &res, u64s, 0); + if (ret) + return ret; -int bch2_journal_flush(struct journal *j) -{ - u64 seq, journal_seq; + entry = container_of(journal_res_entry(j, &res), + struct jset_entry_log, entry);; + memset(entry, 0, u64s * sizeof(u64)); + entry->entry.type = BCH_JSET_ENTRY_log; + entry->entry.u64s = u64s - 1; - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); + va_start(args, fmt); + vsnprintf(entry->d, INT_MAX, fmt, args); + va_end(args); - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return 0; - } - spin_unlock(&j->lock); + bch2_journal_res_put(j, &res); - return bch2_journal_flush_seq(j, seq); + return bch2_journal_flush_seq(j, res.seq); } /* block/unlock the journal: */ @@ -732,28 +780,53 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; u64 *new_bucket_seq = NULL, *new_buckets = NULL; + struct open_bucket **ob = NULL; + long *bu = NULL; + unsigned i, nr_got = 0, nr_want = nr - ja->nr; + unsigned old_nr = ja->nr; + unsigned old_discard_idx = ja->discard_idx; + unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; + unsigned old_dirty_idx = ja->dirty_idx; + unsigned old_cur_idx = ja->cur_idx; int ret = 0; - /* don't handle reducing nr of buckets yet: */ - if (nr <= ja->nr) - return 0; + if (c) { + bch2_journal_block(&c->journal); + bch2_journal_flush_all_pins(&c->journal); + } + bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); + ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) { + if (!bu || !ob || !new_buckets || !new_bucket_seq) { ret = -ENOMEM; - goto err; + goto err_unblock; } - journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) { - ret = -ENOSPC; - goto err; + for (nr_got = 0; nr_got < nr_want; nr_got++) { + if (new_fs) { + bu[nr_got] = bch2_bucket_alloc_new_fs(ca); + if (bu[nr_got] < 0) { + ret = -ENOSPC; + break; + } + } else { + ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, + false, cl); + if (IS_ERR(ob[nr_got])) { + ret = cl ? 
-EAGAIN : -ENOSPC; + break; + } + + bu[nr_got] = ob[nr_got]->bucket; + } } + if (!nr_got) + goto err_unblock; + /* * We may be called from the device add path, before the new device has * actually been added to the running filesystem: @@ -766,51 +839,16 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); - if (!new_fs) - spin_unlock(&c->journal.lock); - - while (ja->nr < nr) { - struct open_bucket *ob = NULL; - unsigned pos; - long b; + for (i = 0; i < nr_got; i++) { + unsigned pos = ja->discard_idx ?: ja->nr; + long b = bu[i]; - if (new_fs) { - b = bch2_bucket_alloc_new_fs(ca); - if (b < 0) { - ret = -ENOSPC; - goto err; - } - } else { - rcu_read_lock(); - ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, - false, cl); - rcu_read_unlock(); - if (IS_ERR(ob)) { - ret = cl ? -EAGAIN : -ENOSPC; - goto err; - } - - b = ob->bucket; - } - - if (c) - spin_lock(&c->journal.lock); - - /* - * XXX - * For resize at runtime, we should be writing the new - * superblock before inserting into the journal array - */ - - pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; __array_insert_item(ja->buckets, ja->nr, pos); __array_insert_item(ja->bucket_seq, ja->nr, pos); - __array_insert_item(journal_buckets->buckets, ja->nr, pos); ja->nr++; ja->buckets[pos] = b; ja->bucket_seq[pos] = 0; - journal_buckets->buckets[pos] = cpu_to_le64(b); if (pos <= ja->discard_idx) ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -820,29 +858,56 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + } + + ret = bch2_journal_buckets_to_sb(c, ca); + if (ret) { + /* Revert: */ + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = old_nr; + ja->discard_idx = old_discard_idx; + ja->dirty_idx_ondisk = old_dirty_idx_ondisk; + ja->dirty_idx = old_dirty_idx; + ja->cur_idx = old_cur_idx; + } + + if (!new_fs) + spin_unlock(&c->journal.lock); - if (c) - spin_unlock(&c->journal.lock); + if (c) + bch2_journal_unblock(&c->journal); - if (!new_fs) { + if (ret) + goto err; + + if (!new_fs) { + for (i = 0; i < nr_got; i++) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, - b, BCH_DATA_journal, + bu[i], BCH_DATA_journal, ca->mi.bucket_size)); - - bch2_open_bucket_put(c, ob); - - if (ret) + if (ret) { + bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); goto err; + } } } err: - bch2_sb_resize_journal(&ca->disk_sb, - ja->nr + sizeof(*journal_buckets) / sizeof(u64)); + if (ob && !new_fs) + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); + kfree(new_bucket_seq); kfree(new_buckets); + kfree(ob); + kfree(bu); return ret; +err_unblock: + if (c) + bch2_journal_unblock(&c->journal); + goto err; } /* @@ -855,11 +920,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, struct journal_device *ja = &ca->journal; struct closure cl; unsigned current_nr; - int ret; + int ret = 0; + + /* don't handle reducing nr of buckets yet: */ + if (nr < ja->nr) + return 0; closure_init_stack(&cl); - do { + while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { struct disk_reservation disk_res = { 0, 0 }; closure_sync(&cl); @@ -887,7 +956,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, if (ja->nr != current_nr) bch2_write_super(c); mutex_unlock(&c->sb_lock); - } while (ret == -EAGAIN); + } 
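/*
 * Editor's note (illustrative sketch, not part of the patch): the loop above
 * replaces the old do/while-on-EAGAIN. __bch2_set_nr_journal_buckets() now
 * returns -EAGAIN when bch2_bucket_alloc() failed but a closure was supplied
 * to wait on, so the caller syncs the closure and retries until ja->nr reaches
 * the requested count or a hard error occurs. Stripped of the disk-reservation
 * and superblock-write steps, the retry skeleton is roughly:
 *
 *	closure_init_stack(&cl);
 *	while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) {
 *		closure_sync(&cl);
 *		ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
 *	}
 *
 * (simplified; see the full body above for the reservation and sb handling).
 */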
return ret; } @@ -918,17 +987,16 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { - union journal_res_state state; bool ret = false; - unsigned i; + u64 seq; spin_lock(&j->lock); - state = READ_ONCE(j->reservations); - i = state.idx; + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j) && !ret; + seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, seq); - while (i != state.unwritten_idx) { - i = (i - 1) & JOURNAL_BUF_MASK; - if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) + if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) ret = true; } spin_unlock(&j->lock); @@ -943,6 +1011,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); wait_event(j->wait, journal_entry_close(j)); @@ -957,11 +1026,9 @@ void bch2_fs_journal_stop(struct journal *j) BUG_ON(!bch2_journal_error(j) && test_bit(JOURNAL_REPLAY_DONE, &j->flags) && - (journal_entry_is_open(j) || - j->last_empty_seq + 1 != journal_cur_seq(j))); + j->last_empty_seq != journal_cur_seq(j)); cancel_delayed_work_sync(&j->write_work); - bch2_journal_reclaim_stop(j); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq, @@ -991,6 +1058,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->flushed_seq_ondisk = cur_seq - 1; + j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); @@ -1028,11 +1096,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, set_bit(JOURNAL_STARTED, &j->flags); j->last_flush_write = jiffies; - journal_pin_new_entry(j); - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - - bch2_journal_buf_init(j); + j->reservations.unwritten_idx++; c->last_bucket_seq_cleanup = journal_cur_seq(j); @@ -1060,9 +1125,20 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_get_journal(sb); + struct bch_sb_field_journal_v2 *journal_buckets_v2 = + bch2_sb_get_journal_v2(sb); unsigned i; - ja->nr = bch2_nr_journal_buckets(journal_buckets); + ja->nr = 0; + + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + + for (i = 0; i < nr; i++) + ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); + } else if (journal_buckets) { + ja->nr = bch2_nr_journal_buckets(journal_buckets); + } ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) @@ -1077,8 +1153,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->buckets) return -ENOMEM; - for (i = 0; i < ja->nr; i++) - ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + unsigned j, dst = 0; + + for (i = 0; i < nr; i++) + for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + ja->buckets[dst++] = + le64_to_cpu(journal_buckets_v2->d[i].start) + j; + } else if (journal_buckets) { + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + } return 0; } @@ -1144,17 +1230,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) union journal_res_state s; struct bch_dev *ca; unsigned long now = 
jiffies; + u64 seq; unsigned i; + out->atomic++; + out->tabstops[0] = 24; + rcu_read_lock(); s = READ_ONCE(j->reservations); - pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin)); + pr_buf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); + pr_buf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); @@ -1164,35 +1256,54 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error); + pr_buf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); pr_buf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - pr_buf(out, "error\n"); + pr_buf(out, "error"); break; case JOURNAL_ENTRY_CLOSED_VAL: - pr_buf(out, "closed\n"); + pr_buf(out, "closed"); break; default: - pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); + pr_buf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx)); + pr_newline(out); - i = s.idx; - while (i != s.unwritten_idx) { - i = (i - 1) & JOURNAL_BUF_MASK; + for (seq = journal_cur_seq(j); + seq >= journal_last_unwritten_seq(j); + --seq) { + i = seq & JOURNAL_BUF_MASK; - pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", - i, journal_state_count(s, i), j->buf[i].sectors); + pr_buf(out, "unwritten entry:"); + pr_tab(out); + pr_buf(out, "%llu", seq); + pr_newline(out); + pr_indent_push(out, 2); + + pr_buf(out, "refcount:"); + pr_tab(out); + pr_buf(out, "%u", journal_state_count(s, i)); + pr_newline(out); + + pr_buf(out, "sectors:"); + pr_tab(out); + pr_buf(out, "%u", j->buf[i].sectors); + pr_newline(out); + + pr_buf(out, "expires"); + pr_tab(out); + pr_buf(out, "%li jiffies", j->buf[i].expires - jiffies); + pr_newline(out); + + pr_indent_pop(out, 2); } pr_buf(out, - "need write:\t\t%i\n" "replay done:\t\t%i\n", - test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); pr_buf(out, "space:\n"); @@ -1230,6 +1341,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } rcu_read_unlock(); + + --out->atomic; } void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) @@ -1239,27 +1352,59 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) spin_unlock(&j->lock); } -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - u64 i; spin_lock(&j->lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - pr_buf(out, "%llu: count %u\n", - i, 
atomic_read(&pin_list->count)); + *seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; - list_for_each_entry(pin, &pin_list->list, list) - pr_buf(out, "\t%px %ps\n", - pin, pin->flush); + pin_list = journal_seq_pin(j, *seq); - if (!list_empty(&pin_list->flushed)) - pr_buf(out, "flushed:\n"); + pr_buf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); + pr_newline(out); + pr_indent_push(out, 2); - list_for_each_entry(pin, &pin_list->flushed, list) - pr_buf(out, "\t%px %ps\n", - pin, pin->flush); + list_for_each_entry(pin, &pin_list->list, list) { + pr_buf(out, "\t%px %ps", pin, pin->flush); + pr_newline(out); } + + list_for_each_entry(pin, &pin_list->key_cache_list, list) { + pr_buf(out, "\t%px %ps", pin, pin->flush); + pr_newline(out); + } + + if (!list_empty(&pin_list->flushed)) { + pr_buf(out, "flushed:"); + pr_newline(out); + } + + list_for_each_entry(pin, &pin_list->flushed, list) { + pr_buf(out, "\t%px %ps", pin, pin->flush); + pr_newline(out); + } + + pr_indent_pop(out, 2); + + --out->atomic; spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index c39cbbf1bccd..e7321c327d9d 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -141,6 +141,11 @@ static inline u64 journal_cur_seq(struct journal *j) return j->pin.back - 1; } +static inline u64 journal_last_unwritten_seq(struct journal *j) +{ + return j->seq_ondisk + 1; +} + void bch2_journal_set_has_inum(struct journal *, u64, u64); static inline int journal_state_count(union journal_res_state s, int idx) @@ -261,9 +266,6 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) .buf3_count = idx == 3, }).v, &j->reservations.counter); - EBUG_ON(((s.idx - idx) & 3) > - ((s.idx - s.unwritten_idx) & 3)); - if (!journal_state_count(s, idx) && idx == s.unwritten_idx) __bch2_journal_buf_put(j); } @@ -293,9 +295,9 @@ static inline void bch2_journal_res_put(struct journal *j, int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, unsigned); -#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -#define JOURNAL_RES_GET_CHECK (1 << 1) -#define JOURNAL_RES_GET_RESERVED (1 << 2) +/* First two bits for JOURNAL_WATERMARK: */ +#define JOURNAL_RES_GET_NONBLOCK (1 << 2) +#define JOURNAL_RES_GET_CHECK (1 << 3) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -316,8 +318,7 @@ static inline int journal_res_get_fast(struct journal *j, EBUG_ON(!journal_state_count(new, new.idx)); - if (!(flags & JOURNAL_RES_GET_RESERVED) && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) return 0; new.cur_entry_offset += res->u64s; @@ -370,23 +371,27 @@ out: /* journal_preres: */ -static inline bool journal_check_may_get_unreserved(struct journal *j) +static inline void journal_set_watermark(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - bool ret = s.reserved < s.remaining && - fifo_free(&j->pin) > 8; - - lockdep_assert_held(&j->lock); - - if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { - if (ret) { - set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); - journal_wake(j); - } else { - clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); - } - } - return ret; + unsigned watermark = JOURNAL_WATERMARK_any; + + if 
(fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + if (fifo_free(&j->pin) < j->pin.size / 8) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + + if (s.reserved > s.remaining) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + if (!s.remaining) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static inline void bch2_journal_preres_put(struct journal *j, @@ -406,12 +411,8 @@ static inline void bch2_journal_preres_put(struct journal *j, closure_wake_up(&j->preres_wait); } - if (s.reserved <= s.remaining && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { - spin_lock(&j->lock); - journal_check_may_get_unreserved(j); - spin_unlock(&j->lock); - } + if (s.reserved <= s.remaining && j->watermark) + journal_set_watermark(j); } int __bch2_journal_preres_get(struct journal *, @@ -432,8 +433,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, old.v = new.v = v; ret = 0; - if ((flags & JOURNAL_RES_GET_RESERVED) || - test_bit(JOURNAL_NOCHANGES, &j->flags) || + if ((flags & JOURNAL_WATERMARK_reserved) || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; @@ -477,7 +477,9 @@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); +bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); +int bch2_journal_log_msg(struct journal *, const char *, ...); void bch2_journal_halt(struct journal *); @@ -501,6 +503,7 @@ void bch2_journal_block(struct journal *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e161e86e48c4..e61b88930a7f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" @@ -47,12 +48,12 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct bch_extent_ptr entry_ptr, + struct journal_ptr entry_ptr, struct journal_list *jlist, struct jset *j, bool bad) { struct journal_replay *i, *pos, *dup = NULL; - struct bch_extent_ptr *ptr; + struct journal_ptr *ptr; struct list_head *where; size_t bytes = vstruct_bytes(j); u64 last_seq = 0; @@ -252,14 +253,15 @@ static int journal_validate_key(struct bch_fs *c, const char *where, invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id)); if (invalid) { - char buf[160]; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", type, where, (u64 *) k - entry->_data, le16_to_cpu(entry->u64s), - invalid, buf); + invalid, buf.buf); + printbuf_exit(&buf); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 
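/*
 * Editor's note (hedged sketch, not part of the patch): dropping an invalid
 * key from a jset entry is "shrink the entry, slide the remaining keys down" -
 * exactly what the le16_add_cpu() above and the memmove() below do, in that
 * order, since the memmove overwrites k. A hypothetical helper with the same
 * effect ('next' being the current end of the entry, as in the caller):
 *
 *	static void journal_entry_drop_key(struct jset_entry *entry,
 *					   struct bkey_i *k, void *next)
 *	{
 *		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
 *		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
 *	}
 */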
memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -274,7 +276,7 @@ fsck_err: return ret; } -static int journal_entry_validate_btree_keys(struct bch_fs *c, +static int journal_entry_btree_keys_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -295,7 +297,24 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, return 0; } -static int journal_entry_validate_btree_root(struct bch_fs *c, +static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct bkey_i *k; + bool first = true; + + vstruct_for_each(entry, k) { + if (!first) { + pr_newline(out); + pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + } + pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); + first = false; + } +} + +static int journal_entry_btree_root_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -323,7 +342,13 @@ fsck_err: return ret; } -static int journal_entry_validate_prio_ptrs(struct bch_fs *c, +static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + +static int journal_entry_prio_ptrs_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -332,7 +357,12 @@ static int journal_entry_validate_prio_ptrs(struct bch_fs *c, return 0; } -static int journal_entry_validate_blacklist(struct bch_fs *c, +static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ +} + +static int journal_entry_blacklist_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -347,7 +377,16 @@ fsck_err: return ret; } -static int journal_entry_validate_blacklist_v2(struct bch_fs *c, +static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist *bl = + container_of(entry, struct jset_entry_blacklist, entry); + + pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq)); +} + +static int journal_entry_blacklist_v2_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -373,7 +412,18 @@ fsck_err: return ret; } -static int journal_entry_validate_usage(struct bch_fs *c, +static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist_v2 *bl = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + pr_buf(out, "start=%llu end=%llu", + le64_to_cpu(bl->start), + le64_to_cpu(bl->end)); +} + +static int journal_entry_usage_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -394,7 +444,18 @@ fsck_err: return ret; } -static int journal_entry_validate_data_usage(struct bch_fs *c, +static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + pr_buf(out, "type=%s v=%llu", + bch2_fs_usage_types[u->entry.btree_id], + le64_to_cpu(u->v)); +} + +static int journal_entry_data_usage_validate(struct bch_fs *c, const char *where, struct 
jset_entry *entry, unsigned version, int big_endian, int write) @@ -416,7 +477,17 @@ fsck_err: return ret; } -static int journal_entry_validate_clock(struct bch_fs *c, +static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + + bch2_replicas_entry_to_text(out, &u->r); + pr_buf(out, "=%llu", le64_to_cpu(u->v)); +} + +static int journal_entry_clock_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -442,7 +513,16 @@ fsck_err: return ret; } -static int journal_entry_validate_dev_usage(struct bch_fs *c, +static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); +} + +static int journal_entry_dev_usage_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -479,15 +559,59 @@ fsck_err: return ret; } +static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + pr_buf(out, "dev=%u", le32_to_cpu(u->dev)); + + for (i = 0; i < nr_types; i++) { + if (i < BCH_DATA_NR) + pr_buf(out, " %s", bch2_data_types[i]); + else + pr_buf(out, " (unknown data type %u)", i); + pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu", + le64_to_cpu(u->d[i].buckets), + le64_to_cpu(u->d[i].sectors), + le64_to_cpu(u->d[i].fragmented)); + } + + pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu", + le64_to_cpu(u->buckets_ec), + le64_to_cpu(u->buckets_unavailable)); +} + +static int journal_entry_log_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + return 0; +} + +static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); + + pr_buf(out, "%.*s", bytes, l->d); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, const char *, struct jset_entry *, unsigned, int, int); + void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; static const struct jset_entry_ops bch2_jset_entry_ops[] = { #define x(f, nr) \ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ - .validate = journal_entry_validate_##f, \ + .validate = journal_entry_##f##_validate, \ + .to_text = journal_entry_##f##_to_text, \ }, BCH_JSET_ENTRY_TYPES() #undef x @@ -503,6 +627,17 @@ int bch2_journal_entry_validate(struct bch_fs *c, const char *where, : 0; } +void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + if (entry->type < BCH_JSET_ENTRY_NR) { + pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_jset_entry_ops[entry->type].to_text(out, c, entry); + } else { + pr_buf(out, "(unknown type %u)", entry->type); + } +} + static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { @@ -592,9 +727,11 @@ static int jset_validate(struct bch_fs *c, sector, 
le64_to_cpu(jset->seq))) ret = JOURNAL_ENTRY_BAD; - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); + bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret); csum_done: /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && @@ -737,9 +874,12 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { - .dev = ca->dev_idx, - .offset = offset, + ret = journal_entry_add(c, ca, (struct journal_ptr) { + .dev = ca->dev_idx, + .bucket = bucket, + .bucket_offset = offset - + bucket_to_sector(ca, ja->buckets[bucket]), + .sector = offset, }, jlist, j, ret != 0); mutex_unlock(&jlist->lock); @@ -766,12 +906,14 @@ static void bch2_journal_read_device(struct closure *cl) struct journal_device *ja = container_of(cl, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); + struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); + struct journal_replay *r; struct journal_read_buf buf = { NULL, 0 }; u64 min_seq = U64_MAX; unsigned i; - int ret; + int ret = 0; if (!ja->nr) goto out; @@ -803,11 +945,37 @@ static void bch2_journal_read_device(struct closure *cl) * allocate */ while (ja->bucket_seq[ja->cur_idx] > min_seq && - ja->bucket_seq[ja->cur_idx] > + ja->bucket_seq[ja->cur_idx] == ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = 0; + ja->sectors_free = ca->mi.bucket_size; + + mutex_lock(&jlist->lock); + list_for_each_entry(r, jlist->head, list) { + for (i = 0; i < r->nr_ptrs; i++) { + if (r->ptrs[i].dev == ca->dev_idx && + sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { + unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) + + vstruct_sectors(&r->j, c->block_bits); + + ja->sectors_free = min(ja->sectors_free, + ca->mi.bucket_size - wrote); + } + } + } + mutex_unlock(&jlist->lock); + + if (ja->bucket_seq[ja->cur_idx] && + ja->sectors_free == ca->mi.bucket_size) { + bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); + bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); + for (i = 0; i < 3; i++) { + unsigned idx = ja->cur_idx - 1 + i; + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); + } + ja->sectors_free = 0; + } /* * Set dirty_idx to indicate the entire journal is full and needs to be @@ -817,6 +985,7 @@ static void bch2_journal_read_device(struct closure *cl) ja->discard_idx = ja->dirty_idx_ondisk = ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: + bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); closure_return(cl); @@ -828,8 +997,8 @@ err: goto out; } -static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) { unsigned i; @@ -837,13 +1006,15 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); u64 offset; - div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); + div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); if (i) pr_buf(out, " "); - pr_buf(out, "%u:%llu (offset %llu)", + 
pr_buf(out, "%u:%u:%u (sector %llu)", j->ptrs[i].dev, - (u64) j->ptrs[i].offset, offset); + j->ptrs[i].bucket, + j->ptrs[i].bucket_offset, + j->ptrs[i].sector); } } @@ -854,6 +1025,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, struct journal_replay *i, *t; struct bch_dev *ca; unsigned iter; + struct printbuf buf = PRINTBUF; size_t keys = 0, entries = 0; bool degraded = false; u64 seq, last_seq = 0; @@ -912,7 +1084,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (!last_seq) { fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); - return -1; + ret = -1; + goto err; } /* Drop blacklisted entries and entries older than last_seq: */ @@ -944,7 +1117,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, while (seq < le64_to_cpu(i->j.seq)) { u64 missing_start, missing_end; - char buf1[200], buf2[200]; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; while (seq < le64_to_cpu(i->j.seq) && bch2_journal_seq_is_blacklisted(c, seq, false)) @@ -960,14 +1133,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, seq++; if (i->list.prev != list) { - struct printbuf out = PBUF(buf1); struct journal_replay *p = list_prev_entry(i, list); - bch2_journal_ptrs_to_text(&out, c, p); - pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); + bch2_journal_ptrs_to_text(&buf1, c, p); + pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits)); } else - sprintf(buf1, "(none)"); - bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); + pr_buf(&buf1, "(none)"); + bch2_journal_ptrs_to_text(&buf2, c, i); missing_end = seq - 1; fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" @@ -975,7 +1147,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, " next at %s", missing_start, missing_end, last_seq, *blacklist_seq - 1, - buf1, buf2); + buf1.buf, buf2.buf); + + printbuf_exit(&buf1); + printbuf_exit(&buf2); } seq++; @@ -989,14 +1164,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, .e.nr_required = 1, }; unsigned ptr; - char buf[80]; if (i->ignore) continue; ret = jset_validate_entries(c, &i->j, READ); if (ret) - goto fsck_err; + goto err; for (ptr = 0; ptr < i->nr_ptrs; ptr++) replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; @@ -1008,15 +1182,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, * the devices - this is wrong: */ + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, &replicas.e); + if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, "superblock not marked as containing replicas %s", - (bch2_replicas_entry_to_text(&PBUF(buf), - &replicas.e), buf)))) { + buf.buf))) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) - return ret; + goto err; } for_each_jset_key(k, _n, entry, &i->j) @@ -1030,7 +1206,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (*start_seq != *blacklist_seq) bch_info(c, "dropped unflushed entries %llu-%llu", *blacklist_seq, *start_seq - 1); +err: fsck_err: + printbuf_exit(&buf); return ret; } @@ -1157,49 +1335,6 @@ done: return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; } -static void journal_write_compact(struct jset *jset) -{ - struct jset_entry *i, *next, *prev = NULL; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be. 
- * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each_safe(jset, i, next) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == BCH_JSET_ENTRY_btree_keys && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); - } - - prev = prev ? vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -} - static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { /* we aren't holding j->lock: */ @@ -1225,7 +1360,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) { - return j->buf + j->reservations.unwritten_idx; + return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); } static void journal_write_done(struct closure *cl) @@ -1262,15 +1397,18 @@ static void journal_write_done(struct closure *cl) journal_seq_pin(j, seq)->devs = w->devs_written; if (!err) { - j->seq_ondisk = seq; - if (!JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; + + bch2_do_discards(c); + closure_wake_up(&c->freelist_wait); } } else if (!j->err_seq || seq < j->err_seq) j->err_seq = seq; + j->seq_ondisk = seq; + /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard * more buckets: @@ -1286,7 +1424,7 @@ static void journal_write_done(struct closure *cl) v = atomic64_read(&j->reservations.counter); do { old.v = new.v = v; - BUG_ON(new.idx == new.unwritten_idx); + BUG_ON(journal_state_count(new, new.unwritten_idx)); new.unwritten_idx++; } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -1297,13 +1435,24 @@ static void journal_write_done(struct closure *cl) closure_wake_up(&w->wait); journal_wake(j); - if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) - mod_delayed_work(c->io_complete_wq, &j->write_work, 0); - spin_unlock(&j->lock); - - if (new.unwritten_idx != new.idx && - !journal_state_count(new, new.unwritten_idx)) + if (!journal_state_count(new, new.unwritten_idx) && + journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + struct journal_buf *buf = journal_cur_buf(j); + long delta = buf->expires - jiffies; + + /* + * We don't close a journal entry to write it while there's + * previous entries still in flight - the current journal entry + * might want to be written now: + */ + + mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + } + + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) @@ -1385,7 +1534,7 @@ void bch2_journal_write(struct closure *cl) struct jset_entry *start, *end; struct jset *jset; struct bio *bio; - char *journal_debug_buf = NULL; + struct printbuf journal_debug_buf = PRINTBUF; bool validate_before_checksum = false; unsigned i, sectors, bytes, u64s, 
nr_rw_members = 0; int ret; @@ -1398,10 +1547,11 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); spin_lock(&j->lock); - if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && - !w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + if (bch2_journal_error(j) || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; @@ -1438,10 +1588,8 @@ void bch2_journal_write(struct closure *cl) le32_add_cpu(&jset->u64s, u64s); BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); - journal_write_compact(jset); - jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_new_versioning + jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ? cpu_to_le32(BCH_JSET_VERSION_OLD) : cpu_to_le32(c->sb.version); @@ -1461,9 +1609,12 @@ void bch2_journal_write(struct closure *cl) jset_validate_for_write(c, jset)) goto err; - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret)) + goto err; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); @@ -1488,11 +1639,8 @@ retry_alloc: goto retry_alloc; } - if (ret) { - journal_debug_buf = kmalloc(4096, GFP_ATOMIC); - if (journal_debug_buf) - __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); - } + if (ret) + __bch2_journal_debug_to_text(&journal_debug_buf, j); /* * write is allocated, no longer need to account for it in @@ -1509,8 +1657,8 @@ retry_alloc: if (ret) { bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf); - kfree(journal_debug_buf); + journal_debug_buf.buf); + printbuf_exit(&journal_debug_buf); bch2_fatal_error(c); continue_at(cl, journal_write_done, c->io_complete_wq); return; @@ -1518,7 +1666,7 @@ retry_alloc: w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (test_bit(JOURNAL_NOCHANGES, &j->flags)) + if (c->opts.nochanges) goto no_io; for_each_rw_member(ca, c, i) @@ -1541,16 +1689,12 @@ retry_alloc: } } - bch2_bucket_seq_cleanup(c); - continue_at(cl, do_journal_write, c->io_complete_wq); return; no_io: - bch2_bucket_seq_cleanup(c); - continue_at(cl, journal_write_done, c->io_complete_wq); return; err: - bch2_inconsistent_error(c); + bch2_fatal_error(c); continue_at(cl, journal_write_done, c->io_complete_wq); } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index f34281a28f12..f2001835e43e 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -8,7 +8,12 @@ */ struct journal_replay { struct list_head list; - struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; + struct journal_ptr { + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; + } ptrs[BCH_REPLICAS_MAX]; unsigned nr_ptrs; /* checksum error, but we may want to try using it anyways: */ @@ -40,8 +45,13 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, - unsigned, int, 
int); +int bch2_journal_entry_validate(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); +void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, + struct jset_entry *); + +void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct journal_replay *); int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ab9a6d966d5e..a9f7d5a7feb2 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -34,10 +34,8 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { - unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) - ? ((journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr) - : ja->nr; + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; /* * Don't use the last bucket unless writing the new last_seq @@ -61,25 +59,13 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) old.v, new.v)) != old.v); } -static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) -{ - unsigned sectors = 0; - - while (!sectors && *idx != j->reservations.idx) { - sectors = j->buf[*idx].sectors; - - *idx = (*idx + 1) & JOURNAL_BUF_MASK; - } - - return sectors; -} - static struct journal_space journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { struct journal_device *ja = &ca->journal; - unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; + unsigned sectors, buckets, unwritten; + u64 seq; if (from == journal_space_total) return (struct journal_space) { @@ -94,7 +80,14 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, * We that we don't allocate the space for a journal entry * until we write it out - thus, account for it here: */ - while ((unwritten = get_unwritten_sectors(j, &idx))) { + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; + + if (!unwritten) + continue; + /* entry won't fit on this device, skip: */ if (unwritten > ca->mi.bucket_size) continue; @@ -202,7 +195,7 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < c->opts.metadata_replicas_required) { - ret = cur_entry_insufficient_devices; + ret = JOURNAL_ERR_insufficient_devices; goto out; } @@ -216,23 +209,24 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!clean_ondisk && - j->reservations.idx == - j->reservations.unwritten_idx) { - char *buf = kmalloc(4096, GFP_ATOMIC); - - bch_err(c, "journal stuck"); - if (buf) { - __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); - pr_err("\n%s", buf); - kfree(buf); - } + journal_cur_seq(j) == j->seq_ondisk) { + struct printbuf buf = PRINTBUF; + + __bch2_journal_debug_to_text(&buf, j); + bch_err(c, "journal stuck\n%s", buf.buf); + printbuf_exit(&buf); + /* + * Hack: bch2_fatal_error() calls bch2_journal_halt() which + * takes journal lock: + */ + spin_unlock(&j->lock); bch2_fatal_error(c); - ret = cur_entry_journal_stuck; + spin_lock(&j->lock); + + ret = JOURNAL_ERR_journal_stuck; } else if (!j->space[journal_space_discarded].next_entry) - ret = cur_entry_journal_full; - else if (!fifo_free(&j->pin)) - ret = cur_entry_journal_pin_full; + ret = JOURNAL_ERR_journal_full; if 
((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -251,7 +245,7 @@ out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; journal_set_remaining(j, u64s_remaining); - journal_check_may_get_unreserved(j); + journal_set_watermark(j); if (!ret) journal_wake(j); @@ -286,7 +280,8 @@ void bch2_journal_do_discards(struct journal *j) struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { - if (ca->mi.discard && + if (!c->opts.nochanges && + ca->mi.discard && blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, @@ -373,9 +368,6 @@ static inline void __journal_pin_drop(struct journal *j, if (atomic_dec_and_test(&pin_list->count) && pin_list == &fifo_peek_front(&j->pin)) bch2_journal_reclaim_fast(j); - else if (fifo_used(&j->pin) == 1 && - atomic_read(&pin_list->count) == 1) - journal_wake(j); } void bch2_journal_pin_drop(struct journal *j, @@ -489,9 +481,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, u64 seq; int err; - if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) - return 0; - lockdep_assert_held(&j->reclaim_lock); while (1) { @@ -671,7 +660,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (nr_flushed) wake_up(&j->reclaim_wait); - } while ((min_nr || min_key_cache) && !direct); + } while ((min_nr || min_key_cache) && nr_flushed && !direct); memalloc_noreclaim_restore(flags); @@ -688,12 +677,11 @@ static int bch2_journal_reclaim_thread(void *arg) struct journal *j = arg; struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned long delay, now; + bool journal_empty; int ret = 0; set_freezable(); - kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); - j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { @@ -716,10 +704,17 @@ static int bch2_journal_reclaim_thread(void *arg) break; if (j->reclaim_kicked) break; - if (time_after_eq(jiffies, j->next_reclaim)) - break; - freezable_schedule_timeout(j->next_reclaim - jiffies); + spin_lock(&j->lock); + journal_empty = fifo_empty(&j->pin); + spin_unlock(&j->lock); + + if (journal_empty) + freezable_schedule(); + else if (time_after(j->next_reclaim, jiffies)) + freezable_schedule_timeout(j->next_reclaim - jiffies); + else + break; } __set_current_state(TASK_RUNNING); } @@ -771,7 +766,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; + if (journal_flush_pins(j, seq_to_flush, 0, 0)) + *did_work = true; spin_lock(&j->lock); /* @@ -780,8 +776,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, */ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || journal_last_seq(j) > seq_to_flush || - (fifo_used(&j->pin) == 1 && - atomic_read(&fifo_peek_front(&j->pin).count) == 1); + !fifo_used(&j->pin); spin_unlock(&j->lock); mutex_unlock(&j->reclaim_lock); @@ -829,10 +824,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = 0; spin_lock(&j->lock); - while (!ret && seq < j->pin.back) { + while (!ret) { struct bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); + if (seq >= j->pin.back) + break; bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, journal_seq_pin(j, seq)->devs); seq++; diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 index 000000000000..8efe7b7e3dcb --- 
/dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "journal_sb.h" + +#include <linux/sort.h> + +/* BCH_SB_FIELD_journal: */ + +static int u64_cmp(const void *_l, const void *_r) +{ + const u64 *l = _l; + const u64 *r = _r; + + return cmp_int(*l, *r); +} + +static int bch2_sb_journal_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -EINVAL; + unsigned nr; + unsigned i; + u64 *b; + + nr = bch2_nr_journal_buckets(journal); + if (!nr) + return 0; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) + return -ENOMEM; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + if (!b[0]) { + pr_buf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0] < le16_to_cpu(m->first_bucket)) { + pr_buf(err, "journal bucket %llu before first bucket %u", + b[0], le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1], le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) { + pr_buf(err, "duplicate journal buckets %llu", b[i]); + goto err; + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + unsigned i, nr = bch2_nr_journal_buckets(journal); + + pr_buf(out, "Buckets: "); + for (i = 0; i < nr; i++) + pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); + pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal = { + .validate = bch2_sb_journal_validate, + .to_text = bch2_sb_journal_to_text, +}; + +struct u64_range { + u64 start; + u64 end; +}; + +static int u64_range_cmp(const void *_l, const void *_r) +{ + const struct u64_range *l = _l; + const struct u64_range *r = _r; + + return cmp_int(l->start, r->start); +} + +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -EINVAL; + unsigned nr; + unsigned i; + struct u64_range *b; + + nr = bch2_sb_field_journal_v2_nr_entries(journal); + if (!nr) + return 0; + + b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); + if (!b) + return -ENOMEM; + + for (i = 0; i < nr; i++) { + b[i].start = le64_to_cpu(journal->d[i].start); + b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + } + + sort(b, nr, sizeof(*b), u64_range_cmp, NULL); + + if (!b[0].start) { + pr_buf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0].start < le16_to_cpu(m->first_bucket)) { + pr_buf(err, "journal bucket %llu before first bucket %u", + b[0].start, le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { + pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) { + if (b[i].end == b[i + 1].start) { + pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu", + b[i].start, b[i].end, b[i + 1].start, b[i 
+ 1].end); + goto err; + } + + if (b[i].end > b[i + 1].start) { + pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", + b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); + goto err; + } + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); + + pr_buf(out, "Buckets: "); + for (i = 0; i < nr; i++) + pr_buf(out, " %llu-%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); + pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { + .validate = bch2_sb_journal_v2_validate, + .to_text = bch2_sb_journal_v2_to_text, +}; + +int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal_v2 *j; + unsigned i, dst = 0, nr = 1; + + lockdep_assert_held(&c->sb_lock); + + if (!ja->nr) { + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); + return 0; + } + + for (i = 0; i + 1 < ja->nr; i++) + if (ja->buckets[i] + 1 != ja->buckets[i + 1]) + nr++; + + j = bch2_sb_resize_journal_v2(&ca->disk_sb, + (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); + if (!j) + return -ENOSPC; + + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + + j->d[dst].start = le64_to_cpu(ja->buckets[0]); + j->d[dst].nr = le64_to_cpu(1); + + for (i = 1; i < ja->nr; i++) { + if (ja->buckets[i] == ja->buckets[i - 1] + 1) { + le64_add_cpu(&j->d[dst].nr, 1); + } else { + dst++; + j->d[dst].start = le64_to_cpu(ja->buckets[i]); + j->d[dst].nr = le64_to_cpu(1); + } + } + + return 0; +} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h new file mode 100644 index 000000000000..a39192e9f6f4 --- /dev/null +++ b/fs/bcachefs/journal_sb.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "super-io.h" +#include "vstructs.h" + +static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) +{ + return j + ? 
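As an illustrative aside, not part of the patch: the core of bch2_journal_buckets_to_sb above is run-length encoding a sorted list of journal bucket numbers into (start, nr) ranges for the new journal_v2 superblock field. The standalone sketch below shows that encoding with plain host-order integers and a made-up bucket_range struct in place of the on-disk __le64 entries.

#include <stdint.h>
#include <stdio.h>

struct bucket_range {
	uint64_t start;	/* first bucket in the run */
	uint64_t nr;	/* number of contiguous buckets */
};

/*
 * Collapse a sorted array of bucket numbers into contiguous ranges;
 * returns how many ranges were written to out[] (out must have room
 * for up to nr entries).
 */
static unsigned buckets_to_ranges(const uint64_t *buckets, unsigned nr,
				  struct bucket_range *out)
{
	unsigned i, dst = 0;

	if (!nr)
		return 0;

	out[dst].start = buckets[0];
	out[dst].nr = 1;

	for (i = 1; i < nr; i++) {
		if (buckets[i] == buckets[i - 1] + 1) {
			out[dst].nr++;
		} else {
			dst++;
			out[dst].start = buckets[i];
			out[dst].nr = 1;
		}
	}

	return dst + 1;
}

int main(void)
{
	uint64_t buckets[] = { 10, 11, 12, 20, 21, 40 };
	struct bucket_range ranges[6];
	unsigned i, nr = buckets_to_ranges(buckets, 6, ranges);

	for (i = 0; i < nr; i++)
		printf("%llu+%llu\n",
		       (unsigned long long) ranges[i].start,
		       (unsigned long long) ranges[i].nr);
	return 0;	/* prints 10+3, 20+2, 40+1 */
}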
(__le64 *) vstruct_end(&j->field) - j->buckets + : 0; +} + +static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) +{ + if (!j) + return 0; + + return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal; +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; + +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 10bd23e969d2..3140c8731431 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -66,6 +66,12 @@ blacklist_entry_try_merge(struct bch_fs *c, return bl; } +static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, + u64 start, u64 end) +{ + return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); +} + int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; @@ -76,28 +82,21 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); nr = blacklist_nr_entries(bl); - if (bl) { - for (i = 0; i < nr; i++) { - struct journal_seq_blacklist_entry *e = - bl->start + i; - - if (start == le64_to_cpu(e->start) && - end == le64_to_cpu(e->end)) - goto out; - - if (start <= le64_to_cpu(e->start) && - end >= le64_to_cpu(e->end)) { - e->start = cpu_to_le64(start); - e->end = cpu_to_le64(end); - - if (i + 1 < nr) - bl = blacklist_entry_try_merge(c, - bl, i); - if (i) - bl = blacklist_entry_try_merge(c, - bl, i - 1); - goto out_write_sb; - } + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; + + if (bl_entry_contig_or_overlaps(e, start, end)) { + e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); + e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; } } @@ -189,27 +188,34 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) return 0; } -static const char * -bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); - struct journal_seq_blacklist_entry *i; - unsigned nr = blacklist_nr_entries(bl); + unsigned i, nr = blacklist_nr_entries(bl); - for (i = bl->start; i < bl->start + nr; i++) { - if (le64_to_cpu(i->start) >= - le64_to_cpu(i->end)) - return "entry start >= end"; - - if (i + 1 < bl->start + nr && - le64_to_cpu(i[0].end) > - le64_to_cpu(i[1].start)) - return "entries out of order"; + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = bl->start + i; + + if (le64_to_cpu(e->start) >= + le64_to_cpu(e->end)) { + pr_buf(err, "entry %u start >= end (%llu >= %llu)", + i, le64_to_cpu(e->start), le64_to_cpu(e->end)); + return -EINVAL; + } + + if (i + 1 < nr && + le64_to_cpu(e[0].end) > + le64_to_cpu(e[1].start)) { + pr_buf(err, "entry %u out of order with next entry (%llu > %llu)", + i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); + return -EINVAL; + } } - return NULL; + return 0; } static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, @@ -229,9 +235,88 @@ static void 
bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, le64_to_cpu(i->start), le64_to_cpu(i->end)); } + pr_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .validate = bch2_sb_journal_seq_blacklist_validate, .to_text = bch2_sb_journal_seq_blacklist_to_text }; + +void bch2_blacklist_entries_gc(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + journal_seq_blacklist_gc_work); + struct journal_seq_blacklist_table *t; + struct bch_sb_field_journal_seq_blacklist *bl; + struct journal_seq_blacklist_entry *src, *dst; + struct btree_trans trans; + unsigned i, nr, new_nr; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_iter iter; + struct btree *b; + + bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + 0, 0, BTREE_ITER_PREFETCH); +retry: + bch2_trans_begin(&trans); + + b = bch2_btree_iter_peek_node(&iter); + + while (!(ret = PTR_ERR_OR_ZERO(b)) && + b && + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); + } + + bch2_trans_exit(&trans); + if (ret) + return; + + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + if (!bl) + goto out; + + nr = blacklist_nr_entries(bl); + dst = bl->start; + + t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + + for (src = bl->start, i = eytzinger0_first(t->nr); + src < bl->start + nr; + src++, i = eytzinger0_next(i, nr)) { + BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty) + *dst++ = *src; + } + + new_nr = dst - bl->start; + + bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); + + if (new_nr != nr) { + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + new_nr ? sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + + if (!new_nr) + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + + bch2_write_super(c); + } +out: + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index b4f876a04586..afb886ec8e25 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -17,4 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; +void bch2_blacklist_entries_gc(struct work_struct *); + #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 54cc69bde1bb..a6cdb885ad41 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -25,6 +25,8 @@ struct journal_buf { struct closure_waitlist wait; u64 last_seq; /* copy of data->last_seq */ + long expires; + u64 flush_time; unsigned buf_size; /* size in bytes of @data */ unsigned sectors; /* maximum size for current entry */ @@ -139,20 +141,39 @@ enum journal_space_from { journal_space_nr, }; -/* - * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, - * either because something's waiting on the write to complete or because it's - * been dirty too long and the timer's expired. 
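Illustrative aside, not from the patch: bch2_journal_seq_blacklist_add now merges a new range into any existing entry it touches, using the bl_entry_contig_or_overlaps test and then widening the entry to the minimum start and maximum end. A minimal sketch of that interval logic, assuming half-open [start, end) ranges (as the test suggests) and plain integers instead of the on-disk __le64 fields:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seq_range {
	uint64_t start;
	uint64_t end;	/* exclusive, so adjacent ranges count as contiguous */
};

/* Same shape as bl_entry_contig_or_overlaps(): false only if disjoint. */
static bool contig_or_overlaps(const struct seq_range *e,
			       uint64_t start, uint64_t end)
{
	return !(end < e->start || e->end < start);
}

/* Widen an existing entry so it also covers [start, end). */
static void extend(struct seq_range *e, uint64_t start, uint64_t end)
{
	if (start < e->start)
		e->start = start;
	if (end > e->end)
		e->end = end;
}

int main(void)
{
	struct seq_range e = { .start = 10, .end = 20 };

	if (contig_or_overlaps(&e, 20, 30))	/* adjacent: merge instead of adding */
		extend(&e, 20, 30);

	printf("%llu-%llu\n",
	       (unsigned long long) e.start,
	       (unsigned long long) e.end);	/* prints 10-30 */
	return 0;
}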
- */ - enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_RECLAIM_STARTED, - JOURNAL_NEED_WRITE, - JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, - JOURNAL_NOCHANGES, +}; + +#define JOURNAL_WATERMARKS() \ + x(any) \ + x(copygc) \ + x(reserved) + +enum journal_watermark { +#define x(n) JOURNAL_WATERMARK_##n, + JOURNAL_WATERMARKS() +#undef x +}; + +#define JOURNAL_WATERMARK_MASK 3 + +/* Reasons we may fail to get a journal reservation: */ +#define JOURNAL_ERRORS() \ + x(ok) \ + x(blocked) \ + x(max_in_flight) \ + x(journal_full) \ + x(journal_pin_full) \ + x(journal_stuck) \ + x(insufficient_devices) + +enum journal_errors { +#define x(n) JOURNAL_ERR_##n, + JOURNAL_ERRORS() +#undef x }; /* Embedded in struct bch_fs */ @@ -162,6 +183,7 @@ struct journal { unsigned long flags; union journal_res_state reservations; + enum journal_watermark watermark; /* Max size of current journal entry */ unsigned cur_entry_u64s; @@ -171,14 +193,7 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - enum { - cur_entry_ok, - cur_entry_blocked, - cur_entry_journal_full, - cur_entry_journal_pin_full, - cur_entry_journal_stuck, - cur_entry_insufficient_devices, - } cur_entry_error; + enum journal_errors cur_entry_error; union journal_preres_state prereserved; @@ -246,6 +261,10 @@ struct journal { spinlock_t err_lock; struct mutex reclaim_lock; + /* + * Used for waiting until journal reclaim has freed up space in the + * journal: + */ wait_queue_head_t reclaim_wait; struct task_struct *reclaim_thread; bool reclaim_kicked; @@ -265,7 +284,6 @@ struct journal { unsigned long last_flush_write; u64 res_get_blocked_start; - u64 need_write_time; u64 write_start_time; u64 nr_flush_writes; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c new file mode 100644 index 000000000000..4f0e6960e597 --- /dev/null +++ b/fs/bcachefs/lru.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "error.h" +#include "lru.h" +#include "recovery.h" + +const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + if (bkey_val_bytes(k.k) < sizeof(*lru)) + return "incorrect value size"; + + return NULL; +} + +void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); +} + +static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + u64 existing_idx; + int ret = 0; + + if (!time) + return 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + POS(id, time), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_lru) { + bch2_fs_inconsistent(c, + "pointer to nonexistent lru %llu:%llu", + id, time); + ret = -EIO; + goto err; + } + + existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); + if (existing_idx != idx) { + bch2_fs_inconsistent(c, + "lru %llu:%llu with wrong backpointer: got %llu, should be %llu", + id, time, existing_idx, idx); + ret = -EIO; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) +{ + 
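Illustrative aside: the JOURNAL_WATERMARKS() and JOURNAL_ERRORS() lists above use the x-macro idiom this codebase relies on to generate an enum and a matching string table from a single list. A self-contained sketch of the pattern, with made-up names:

#include <stdio.h>

/*
 * X-macro idiom: the list is written once and expanded twice, so the
 * enum and its name table cannot drift out of sync.
 */
#define EXAMPLE_ERRORS()	\
	x(ok)			\
	x(blocked)		\
	x(journal_full)

enum example_error {
#define x(n)	EXAMPLE_ERR_##n,
	EXAMPLE_ERRORS()
#undef x
	EXAMPLE_ERR_NR,
};

static const char * const example_error_strs[] = {
#define x(n)	#n,
	EXAMPLE_ERRORS()
#undef x
	NULL
};

int main(void)
{
	enum example_error e = EXAMPLE_ERR_blocked;

	printf("error %u: %s\n", (unsigned) e, example_error_strs[e]);
	return 0;
}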
struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_lru *lru; + int ret = 0; + + if (!*time) + return 0; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, + POS(lru_id, *time), + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES, k, ret) + if (bkey_deleted(k.k)) + break; + + if (ret) + goto err; + + BUG_ON(iter.pos.inode != lru_id); + *time = iter.pos.offset; + + lru = bch2_trans_kmalloc(trans, sizeof(*lru)); + ret = PTR_ERR_OR_ZERO(lru); + if (ret) + goto err; + + bkey_lru_init(&lru->k_i); + lru->k.p = iter.pos; + lru->v.idx = cpu_to_le64(idx); + + ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, + u64 old_time, u64 *new_time) +{ + if (old_time == *new_time) + return 0; + + return lru_delete(trans, id, idx, old_time) ?: + lru_set(trans, id, idx, new_time); +} + +static int bch2_check_lru_key(struct btree_trans *trans, + struct btree_iter *lru_iter, bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c lru_k, k; + struct bch_alloc_v4 a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + u64 idx; + int ret; + + lru_k = bch2_btree_iter_peek(lru_iter); + if (!lru_k.k) + return 0; + + ret = bkey_err(lru_k); + if (ret) + return ret; + + idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(lru_k.k->p.inode, idx), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_alloc_to_v4(k, &a); + + if (fsck_err_on(bucket_state(a) != BUCKET_cached || + a.io_time[READ] != lru_k.k->p.offset, c, + "incorrect lru entry %s\n" + " for %s", + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.p = lru_iter->pos; + + ret = bch2_trans_update(trans, lru_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +int bch2_check_lrus(struct bch_fs *c, bool initial) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_check_lru_key(&trans, &iter, initial)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; + +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 index 000000000000..4db6a8399332 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_H +#define _BCACHEFS_LRU_H + +const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_lru (struct bkey_ops) { \ + .key_invalid = bch2_lru_invalid, \ + .val_to_text = bch2_lru_to_text, \ +} + +int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); + +int bch2_check_lrus(struct bch_fs *, bool); + +#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f73be9cb7ac3..1de213506adf 100644 --- 
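Illustrative aside: the new LRU btree maps (lru id, time) to a bucket index, and bch2_lru_change moves a bucket by deleting the entry at its old time and inserting one at the new time, with lru_set sliding forward to the first free time slot and reporting the slot actually used, and lru_delete verifying the backpointer. The toy model below stands a flat array in for the btree to show just that control flow; the array size, return codes, and example values are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/*
 * Toy stand-in for the LRU btree: slot [t] holds the bucket index whose
 * read time is t, or 0 when empty (time 0 is never used, as in the patch).
 */
#define LRU_SLOTS 16
static uint64_t lru[LRU_SLOTS];

static int lru_delete(uint64_t idx, uint64_t time)
{
	if (!time)
		return 0;
	if (time >= LRU_SLOTS || lru[time] != idx)
		return -1;	/* missing entry or wrong backpointer */
	lru[time] = 0;
	return 0;
}

static int lru_set(uint64_t idx, uint64_t *time)
{
	uint64_t t = *time;

	if (!t)
		return 0;

	while (t < LRU_SLOTS && lru[t])	/* first free slot at or after the requested time */
		t++;
	if (t == LRU_SLOTS)
		return -1;

	lru[t] = idx;
	*time = t;			/* report the slot actually used */
	return 0;
}

static int lru_change(uint64_t idx, uint64_t old_time, uint64_t *new_time)
{
	int ret;

	if (old_time == *new_time)
		return 0;

	ret = lru_delete(idx, old_time);
	return ret ? ret : lru_set(idx, new_time);
}

int main(void)
{
	uint64_t new_time = 5;

	lru[3] = 42;			/* bucket 42 was last read at time 3 */
	printf("%d %llu\n",
	       lru_change(42, 3, &new_time),
	       (unsigned long long) new_time);
	return 0;			/* prints "0 5": entry moved to slot 5 */
}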
a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -92,10 +92,10 @@ next: if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { struct bkey_i *update; - size_t i; + u32 *i; - for (i = 0; i < s.nr; i++) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) + darray_for_each(s.ids, i) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) goto next; update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); @@ -125,7 +125,7 @@ next: } } bch2_trans_iter_exit(trans, &iter); - kfree(s.d); + darray_exit(s.ids); return ret; } @@ -351,8 +351,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, } if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { - m->op.alloc_reserve = RESERVE_MOVINGGC; - m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + m->op.alloc_reserve = RESERVE_movinggc; } else { /* XXX: this should probably be passed in */ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; @@ -481,25 +480,26 @@ static void move_read_endio(struct bio *bio) atomic_sub(io->read_sectors, &ctxt->read_sectors); io->read_completed = true; - if (next_pending_write(ctxt)) - wake_up(&ctxt->wait); - + wake_up(&ctxt->wait); closure_put(&ctxt->cl); } -static void do_pending_writes(struct moving_context *ctxt) +static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) { struct moving_io *io; + if (trans) + bch2_trans_unlock(trans); + while ((io = next_pending_write(ctxt))) { list_del(&io->list); closure_call(&io->cl, move_write, NULL, &ctxt->cl); } } -#define move_ctxt_wait_event(_ctxt, _cond) \ +#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ do { \ - do_pending_writes(_ctxt); \ + do_pending_writes(_ctxt, _trans); \ \ if (_cond) \ break; \ @@ -507,11 +507,12 @@ do { \ next_pending_write(_ctxt) || (_cond)); \ } while (1) -static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, + struct btree_trans *trans) { unsigned sectors_pending = atomic_read(&ctxt->write_sectors); - move_ctxt_wait_event(ctxt, + move_ctxt_wait_event(ctxt, trans, !atomic_read(&ctxt->write_sectors) || atomic_read(&ctxt->write_sectors) != sectors_pending); } @@ -533,14 +534,6 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->write_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->read_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - /* write path might have to decompress data: */ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -691,26 +684,36 @@ static int __bch2_move_data(struct bch_fs *c, schedule_timeout(delay); if (unlikely(freezing(current))) { - bch2_trans_unlock(&trans); - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); - bch2_trans_begin(&trans); + move_ctxt_wait_event(ctxt, &trans, + atomic_read(&ctxt->write_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); - k = bch2_btree_iter_peek(&iter); + move_ctxt_wait_event(ctxt, &trans, + atomic_read(&ctxt->read_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); - stats->pos = iter.pos; + bch2_trans_begin(&trans); + k = bch2_btree_iter_peek(&iter); if (!k.k) break; + ret = bkey_err(k); + if (ret == -EINTR) + continue; if (ret) break; + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; + stats->pos = iter.pos; + if 
(!bkey_extent_is_direct_data(k.k)) goto next_nondata; @@ -745,22 +748,22 @@ static int __bch2_move_data(struct bch_fs *c, BUG(); } - /* unlock before doing IO: */ + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: + */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { - if (ret2 == -EINTR) { - bch2_trans_begin(&trans); + if (ret2 == -EINTR) continue; - } if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); + bch2_move_ctxt_wait_for_io(ctxt, &trans); continue; } @@ -845,7 +848,7 @@ int bch2_move_data(struct bch_fs *c, } - move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); EBUG_ON(atomic_read(&ctxt.write_sectors)); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 7b7eee9b1773..cb6b81678ecc 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -6,6 +6,7 @@ */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" @@ -29,21 +30,6 @@ #include <linux/sort.h> #include <linux/wait.h> -/* - * We can't use the entire copygc reserve in one iteration of copygc: we may - * need the buckets we're freeing up to go back into the copygc reserve to make - * forward progress, but if the copygc reserve is full they'll be available for - * any allocation - and it's possible that in a given iteration, we free up most - * of the buckets we're going to free before we allocate most of the buckets - * we're going to allocate. - * - * If we only use half of the reserve per iteration, then in steady state we'll - * always have room in the reserve for the buckets we're going to need in the - * next iteration: - */ -#define COPYGC_BUCKETS_PER_ITER(ca) \ - ((ca)->free[RESERVE_MOVINGGC].size / 2) - static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) { const struct copygc_heap_entry *l = _l; @@ -69,10 +55,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, .dev = p.ptr.dev, .offset = p.ptr.offset, }; + ssize_t i; - ssize_t i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); + if (p.ptr.cached) + continue; + + i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); #if 0 /* eytzinger search verify code: */ ssize_t j = -1, k; @@ -101,7 +91,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, data_opts->target = io_opts->background_target; data_opts->nr_replicas = 1; data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED; + JOURNAL_WATERMARK_copygc; data_opts->rewrite_dev = p.ptr.dev; if (p.has_ec) @@ -114,37 +104,113 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, return DATA_SKIP; } -static bool have_copygc_reserve(struct bch_dev *ca) +static inline int fragmentation_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) { - bool ret; + return cmp_int(l.fragmentation, r.fragmentation); +} + +static int walk_buckets_to_copygc(struct bch_fs *c) +{ + copygc_heap *h = &c->copygc_heap; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + int ret; - spin_lock(&ca->fs->freelist_lock); - ret = 
fifo_full(&ca->free[RESERVE_MOVINGGC]) || - ca->allocator_state != ALLOCATOR_running; - spin_unlock(&ca->fs->freelist_lock); + bch2_trans_init(&trans, c, 0, 0); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); + struct copygc_heap_entry e; + + bch2_alloc_to_v4(k, &a); + + if (a.data_type != BCH_DATA_user || + a.dirty_sectors >= ca->mi.bucket_size || + bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) + continue; + + e = (struct copygc_heap_entry) { + .dev = iter.pos.inode, + .gen = a.gen, + .replicas = 1 + a.stripe_redundancy, + .fragmentation = (u64) a.dirty_sectors * (1ULL << 31) + / ca->mi.bucket_size, + .sectors = a.dirty_sectors, + .offset = bucket_to_sector(ca, iter.pos.offset), + }; + heap_add_or_replace(h, e, -fragmentation_cmp, NULL); + + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); return ret; } -static inline int fragmentation_cmp(copygc_heap *heap, - struct copygc_heap_entry l, - struct copygc_heap_entry r) +static int bucket_inorder_cmp(const void *_l, const void *_r) { - return cmp_int(l.fragmentation, r.fragmentation); + const struct copygc_heap_entry *l = _l; + const struct copygc_heap_entry *r = _r; + + return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); +} + +static int check_copygc_was_done(struct bch_fs *c, + u64 *sectors_not_moved, + u64 *buckets_not_moved) +{ + copygc_heap *h = &c->copygc_heap; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct copygc_heap_entry *i; + int ret = 0; + + sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); + + for (i = h->data; i < h->data + h->used; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); + + bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); + + ret = lockrestart_do(&trans, + bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) + break; + + bch2_alloc_to_v4(k, &a); + + if (a.gen == i->gen && a.dirty_sectors) { + *sectors_not_moved += a.dirty_sectors; + *buckets_not_moved += 1; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; } static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; struct copygc_heap_entry e, *i; - struct bucket_array *buckets; struct bch_move_stats move_stats; u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; u64 sectors_reserved = 0; u64 buckets_to_move, buckets_not_moved = 0; struct bch_dev *ca; unsigned dev_idx; - size_t b, heap_size = 0; + size_t heap_size = 0; int ret; bch_move_stats_init(&move_stats, "copygc"); @@ -169,44 +235,25 @@ static int bch2_copygc(struct bch_fs *c) } for_each_rw_member(ca, c, dev_idx) { - closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); - - spin_lock(&ca->fs->freelist_lock); - sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; - spin_unlock(&ca->fs->freelist_lock); - - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - struct bucket *g = buckets->b + b; - struct bucket_mark m = READ_ONCE(g->mark); - struct copygc_heap_entry e; - - if (m.owned_by_allocator || - m.data_type != BCH_DATA_user || - !bucket_sectors_used(m) || - bucket_sectors_used(m) >= ca->mi.bucket_size) - continue; - - WARN_ON(m.stripe && 
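Illustrative aside: walk_buckets_to_copygc below scores each candidate bucket with a fixed-point fraction, dirty_sectors * 2^31 / bucket_size, so buckets from devices with different bucket sizes compare on one scale. A tiny standalone version of that calculation, with example numbers that are not from the patch:

#include <stdint.h>
#include <stdio.h>

/*
 * Express dirty_sectors / bucket_size as a fixed-point fraction scaled to
 * 2^31, so buckets from devices with different bucket sizes compare on the
 * same scale.
 */
static uint64_t bucket_fragmentation(uint64_t dirty_sectors, uint64_t bucket_size)
{
	return dirty_sectors * (1ULL << 31) / bucket_size;
}

int main(void)
{
	/* a 1024-sector bucket with 256 dirty sectors is 25% full: */
	uint64_t f = bucket_fragmentation(256, 1024);

	printf("%llu (%.0f%% of 2^31)\n",
	       (unsigned long long) f, 100.0 * f / (1ULL << 31));
	return 0;
}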
!g->stripe_redundancy); - - e = (struct copygc_heap_entry) { - .dev = dev_idx, - .gen = m.gen, - .replicas = 1 + g->stripe_redundancy, - .fragmentation = bucket_sectors_used(m) * (1U << 15) - / ca->mi.bucket_size, - .sectors = bucket_sectors_used(m), - .offset = bucket_to_sector(ca, b), - }; - heap_add_or_replace(h, e, -fragmentation_cmp, NULL); - } - up_read(&ca->bucket_lock); + s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc), + ca->mi.nbuckets >> 6); + + sectors_reserved += avail * ca->mi.bucket_size; + } + + ret = walk_buckets_to_copygc(c); + if (ret) { + bch2_fs_fatal_error(c, "error walking buckets to copygc!"); + return ret; + } + + if (!h->used) { + bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!"); + return 0; } /* - * Our btree node allocations also come out of RESERVE_MOVINGGC: + * Our btree node allocations also come out of RESERVE_movingc: */ sectors_reserved = (sectors_reserved * 3) / 4; if (!sectors_reserved) { @@ -226,8 +273,11 @@ static int bch2_copygc(struct bch_fs *c) buckets_to_move = h->used; - if (!buckets_to_move) + if (!buckets_to_move) { + bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", + sectors_reserved); return 0; + } eytzinger0_sort(h->data, h->used, sizeof(h->data[0]), @@ -240,30 +290,18 @@ static int bch2_copygc(struct bch_fs *c) writepoint_ptr(&c->copygc_write_point), copygc_pred, NULL, &move_stats); + if (ret) { + bch_err(c, "error %i from bch2_move_data() in copygc", ret); + return ret; + } - for_each_rw_member(ca, c, dev_idx) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - for (i = h->data; i < h->data + h->used; i++) { - struct bucket_mark m; - size_t b; - - if (i->dev != dev_idx) - continue; - - b = sector_to_bucket(ca, i->offset); - m = READ_ONCE(buckets->b[b].mark); - - if (i->gen == m.gen && - bucket_sectors_used(m)) { - sectors_not_moved += bucket_sectors_used(m); - buckets_not_moved++; - } - } - up_read(&ca->bucket_lock); + ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); + if (ret) { + bch_err(c, "error %i from check_copygc_was_done()", ret); + return ret; } - if (sectors_not_moved && !ret) + if (sectors_not_moved) bch_warn_ratelimited(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", sectors_not_moved, sectors_to_move, @@ -301,8 +339,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * - ca->mi.bucket_size) >> 1); + fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * + ca->mi.bucket_size) >> 1); fragmented = usage.d[BCH_DATA_user].fragmented; wait = min(wait, max(0LL, fragmented_allowed - fragmented)); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index d9ca69f2ecde..77fbb7d2194e 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -9,7 +9,12 @@ #include "super-io.h" #include "util.h" -#define x(t, n) #t, +#define x(t, n) [n] = #t, + +const char * const bch2_metadata_versions[] = { + BCH_METADATA_VERSIONS() + NULL +}; const char * const bch2_error_actions[] = { BCH_ERROR_ACTIONS() @@ -71,6 +76,16 @@ const char * const bch2_member_states[] = { NULL }; +const char * const bch2_jset_entry_types[] = { + BCH_JSET_ENTRY_TYPES() + NULL +}; + +const char * const bch2_fs_usage_types[] = { + BCH_FS_USAGE_TYPES() + NULL +}; + #undef x const char * const bch2_d_types[BCH_DT_MAX] = 
{ @@ -86,6 +101,16 @@ const char * const bch2_d_types[BCH_DT_MAX] = { [DT_SUBVOL] = "subvol", }; +u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) +{ + BUG(); +} + +void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) +{ + BUG(); +} + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) \ @@ -199,42 +224,43 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } -static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v) +int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { - if (msg) - pr_err("invalid %s%s: too small (min %llu)", - msg, opt->attr.name, opt->min); + if (err) + pr_buf(err, "%s: too small (min %llu)", + opt->attr.name, opt->min); return -ERANGE; } if (opt->max && v >= opt->max) { - if (msg) - pr_err("invalid %s%s: too big (max %llu)", - msg, opt->attr.name, opt->max); + if (err) + pr_buf(err, "%s: too big (max %llu)", + opt->attr.name, opt->max); return -ERANGE; } if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { - if (msg) - pr_err("invalid %s %s: not a multiple of 512", - msg, opt->attr.name); + if (err) + pr_buf(err, "%s: not a multiple of 512", + opt->attr.name); return -EINVAL; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { - if (msg) - pr_err("invalid %s%s: must be a power of two", - msg, opt->attr.name); + if (err) + pr_buf(err, "%s: must be a power of two", + opt->attr.name); return -EINVAL; } return 0; } -int bch2_opt_parse(struct bch_fs *c, const char *msg, +int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, - const char *val, u64 *res) + const char *val, u64 *res, + struct printbuf *err) { ssize_t ret; @@ -267,10 +293,11 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg, return ret; } - return bch2_opt_validate(opt, msg, *res); + return bch2_opt_validate(opt, *res, err); } -void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, +void bch2_opt_to_text(struct printbuf *out, + struct bch_fs *c, struct bch_sb *sb, const struct bch_option *opt, u64 v, unsigned flags) { @@ -300,7 +327,7 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, opt->choices[v]); break; case BCH_OPT_FN: - opt->to_text(out, c, v); + opt->to_text(out, c, sb, v); break; default: BUG(); @@ -346,6 +373,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, char *copied_opts, *copied_opts_start; char *opt, *name, *val; int ret, id; + struct printbuf err = PRINTBUF; u64 v; if (!options) @@ -365,8 +393,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, if (id < 0) goto bad_opt; - ret = bch2_opt_parse(c, "mount option ", - &bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); if (ret < 0) goto bad_val; } else { @@ -409,7 +436,7 @@ bad_opt: ret = -1; goto out; bad_val: - pr_err("Invalid value %s for mount option %s", val, name); + pr_err("Invalid mount option %s", err.buf); ret = -1; goto out; no_val: @@ -418,9 +445,26 @@ no_val: goto out; out: kfree(copied_opts_start); + printbuf_exit(&err); return ret; } +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +{ + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + v = opt->get_sb(sb); + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = 1ULL << v; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v <<= 9; + + return v; +} + /* * Initial options from superblock - here we don't want any options undefined, * any options the superblock doesn't specify are set to 0: @@ 
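Illustrative aside: the new bch2_opt_from_sb helper above decodes a raw superblock option value according to its storage flags, expanding log2-encoded values and converting sector counts to bytes. A minimal sketch of that decoding, using made-up flag names and bit values rather than the real OPT_SB_FIELD_* definitions:

#include <stdint.h>
#include <stdio.h>

/* Made-up flag bits standing in for the real OPT_SB_FIELD_* flags: */
#define OPT_FIELD_ILOG2		(1U << 0)	/* value is stored as its log2 */
#define OPT_FIELD_SECTORS	(1U << 1)	/* value is stored in 512-byte sectors */

/* Decode a raw on-disk option value according to its storage flags. */
static uint64_t opt_decode(uint64_t raw, unsigned flags)
{
	uint64_t v = raw;

	if (flags & OPT_FIELD_ILOG2)
		v = 1ULL << v;

	if (flags & OPT_FIELD_SECTORS)
		v <<= 9;			/* sectors -> bytes */

	return v;
}

int main(void)
{
	/* e.g. a size stored as log2(2048) sectors decodes to 1 MiB: */
	printf("%llu\n", (unsigned long long)
	       opt_decode(11, OPT_FIELD_ILOG2|OPT_FIELD_SECTORS));
	return 0;
}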
-428,28 +472,14 @@ out: int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) { unsigned id; - int ret; for (id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - u64 v; - if (opt->get_sb == NO_SB_OPT) + if (opt->get_sb == BCH2_NO_SB_OPT) continue; - v = opt->get_sb(sb); - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = 1ULL << v; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v <<= 9; - - ret = bch2_opt_validate(opt, "superblock option ", v); - if (ret) - return ret; - - bch2_opt_set_by_id(opts, id, v); + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); } return 0; @@ -457,7 +487,7 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_NO_SB_OPT) + if (opt->set_sb == SET_BCH2_NO_SB_OPT) return; if (opt->flags & OPT_SB_FIELD_SECTORS) @@ -471,7 +501,7 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_NO_SB_OPT) + if (opt->set_sb == SET_BCH2_NO_SB_OPT) return; mutex_lock(&c->sb_lock); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 661eb5764f68..8bc67d07afb9 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -8,6 +8,7 @@ #include <linux/sysfs.h> #include "bcachefs_format.h" +extern const char * const bch2_metadata_versions[]; extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; @@ -20,6 +21,8 @@ extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_data_types[]; extern const char * const bch2_member_states[]; +extern const char * const bch2_jset_entry_types[]; +extern const char * const bch2_fs_usage_types[]; extern const char * const bch2_d_types[]; static inline const char *bch2_d_type_str(unsigned d_type) @@ -40,7 +43,8 @@ static inline const char *bch2_d_type_str(unsigned d_type) */ /* dummy option, for options that aren't stored in the superblock */ -LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); +u64 BCH2_NO_SB_OPT(const struct bch_sb *); +void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); /* When can be set: */ enum opt_flags { @@ -200,7 +204,7 @@ enum opt_type { x(btree_node_mem_ptr_optimization, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, true, \ + BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -227,7 +231,7 @@ enum opt_type { x(inline_data, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, true, \ + BCH2_NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ x(acl, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ @@ -252,26 +256,26 @@ enum opt_type { x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ x(very_degraded, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, true, \ NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Extra debugging information during mount/recovery")\ 
x(journal_flush_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U32_MAX), \ + OPT_UINT(1, U32_MAX), \ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ NULL, "Delay in milliseconds before automatic journal commits")\ x(journal_flush_disabled, u8, \ @@ -289,94 +293,109 @@ enum opt_type { x(fsck, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Fix errors during fsck without asking") \ x(ratelimit_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Super read only mode - no writes at all will be issued,\n"\ "even if we have to replay the journal") \ x(norecovery, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ x(rebuild_replicas, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't free journal entries/keys after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ + x(read_journal_only, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Only read the journal, skip the rest of recovery")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ + NULL, "Log transaction function names in journal") \ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't open device in exclusive mode") \ x(sb, u64, \ OPT_MOUNT, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, BCH_SB_SECTOR, \ + BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ OPT_FS, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(nostart, u8, \ 0, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don\'t start filesystem, only open devices") \ x(reconstruct_alloc, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ + x(buckets_nouse, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allocate the buckets_nouse bitmap") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(fs_size, u64, \ OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, 0, \ + BCH2_NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(bucket, u32, \ OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, 0, \ + BCH2_NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(durability, u8, \ OPT_DEVICE, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ - NO_SB_OPT, 1, \ + BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") @@ -443,7 +462,7 @@ struct bch_option { }; 
struct { int (*parse)(struct bch_fs *, const char *, u64 *); - void (*to_text)(struct printbuf *, struct bch_fs *, u64); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); }; }; @@ -458,18 +477,20 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); -int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, - const char *, u64 *); +int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); +int bch2_opt_parse(struct bch_fs *, const struct bch_option *, + const char *, u64 *, struct printbuf *); #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) -void bch2_opt_to_text(struct printbuf *, struct bch_fs *, +void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, const struct bch_option *, u64, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 8f8f4b0accd6..ca029a00e7b8 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -6,19 +6,55 @@ #include "subvolume.h" #include "super-io.h" -static const char *bch2_sb_validate_quota(struct bch_sb *sb, - struct bch_sb_field *f) +static const char * const bch2_quota_types[] = { + "user", + "group", + "project", +}; + +static const char * const bch2_quota_counters[] = { + "space", + "inodes", +}; + +static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); - if (vstruct_bytes(&q->field) != sizeof(*q)) - return "invalid field quota: wrong size"; + if (vstruct_bytes(&q->field) < sizeof(*q)) { + pr_buf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&q->field), sizeof(*q)); + return -EINVAL; + } - return NULL; + return 0; +} + +static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + unsigned qtyp, counter; + + for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { + pr_buf(out, "%s: flags %llx", + bch2_quota_types[qtyp], + le64_to_cpu(q->q[qtyp].flags)); + + for (counter = 0; counter < Q_COUNTERS; counter++) + pr_buf(out, " %s timelimit %u warnlimit %u", + bch2_quota_counters[counter], + le32_to_cpu(q->q[qtyp].c[counter].timelimit), + le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); + + pr_newline(out); + } } const struct bch_sb_field_ops bch_sb_field_ops_quota = { - .validate = bch2_sb_validate_quota, + .validate = bch2_sb_quota_validate, + .to_text = bch2_sb_quota_to_text, }; const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -32,11 +68,6 @@ const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -static const char * const bch2_quota_counters[] = { - "space", - "inodes", -}; - void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -570,7 +601,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_USR, 0), POS(QTYP_USR + 1, 0), - NULL); + 0, NULL); if (ret) 
return ret; } @@ -582,7 +613,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_GRP, 0), POS(QTYP_GRP + 1, 0), - NULL); + 0, NULL); if (ret) return ret; } @@ -594,7 +625,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_PRJ, 0), POS(QTYP_PRJ + 1, 0), - NULL); + 0, NULL); if (ret) return ret; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index a573fede05b1..d914892f5339 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -257,35 +257,47 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) { struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); - char h1[21], h2[21]; - bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); - bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); - pr_buf(out, "fullest_dev (%i):\t%s/%s\n", - w.dev_most_full_idx, h1, h2); + out->tabstops[0] = 20; - bch2_hprint(&PBUF(h1), w.total_work << 9); - bch2_hprint(&PBUF(h2), c->capacity << 9); - pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); + pr_buf(out, "fullest_dev (%i):", w.dev_most_full_idx); + pr_tab(out); - pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); + bch2_hprint(out, w.dev_most_full_work << 9); + pr_buf(out, "/"); + bch2_hprint(out, w.dev_most_full_capacity << 9); + pr_newline(out); + + pr_buf(out, "total work:"); + pr_tab(out); + + bch2_hprint(out, w.total_work << 9); + pr_buf(out, "/"); + bch2_hprint(out, c->capacity << 9); + pr_newline(out); + + pr_buf(out, "rate:"); + pr_tab(out); + pr_buf(out, "%u", r->pd.rate.rate); + pr_newline(out); switch (r->state) { case REBALANCE_WAITING: - pr_buf(out, "waiting\n"); + pr_buf(out, "waiting"); break; case REBALANCE_THROTTLED: - bch2_hprint(&PBUF(h1), + pr_buf(out, "throttled for %lu sec or ", + (r->throttled_until_cputime - jiffies) / HZ); + bch2_hprint(out, (r->throttled_until_iotime - atomic64_read(&c->io_clock[WRITE].now)) << 9); - pr_buf(out, "throttled for %lu sec or %s io\n", - (r->throttled_until_cputime - jiffies) / HZ, - h1); + pr_buf(out, " io"); break; case REBALANCE_RUNNING: - pr_buf(out, "running\n"); + pr_buf(out, "running"); break; } + pr_newline(out); } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 62efcc9504ba..ca92fe84c248 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -16,6 +16,7 @@ #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "lru.h" #include "move.h" #include "quota.h" #include "recovery.h" @@ -59,23 +60,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys) static int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, - struct journal_key *r) + const struct journal_key *r) { return (cmp_int(l_btree_id, r->btree_id) ?: cmp_int(l_level, r->level) ?: bpos_cmp(l_pos, r->k->k.p)); } -static int journal_key_cmp(struct journal_key *l, struct journal_key *r) +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) { - return (cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p)); + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -static size_t journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +size_t bch2_journal_key_search(struct journal_keys *journal_keys, + enum 
btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = journal_keys->nr, m; @@ -96,6 +95,24 @@ static size_t journal_key_search(struct journal_keys *journal_keys, return l; } +struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_key *end = keys->d + keys->nr; + struct journal_key *k = keys->d + + bch2_journal_key_search(keys, btree_id, level, pos); + + while (k < end && k->overwritten) + k++; + + if (k < end && + k->btree_id == btree_id && + k->level == level) + return k->k; + return NULL; +} + static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) { struct bkey_i *n = iter->keys->d[idx].k; @@ -109,17 +126,33 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign iter->idx++; } -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) { struct journal_key n = { .btree_id = id, .level = level, - .allocated = true + .k = k, + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, }; struct journal_keys *keys = &c->journal_keys; struct journal_iter *iter; - unsigned idx = journal_key_search(keys, id, level, k->k.p); + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } if (keys->nr == keys->size) { struct journal_keys new_keys = { @@ -140,25 +173,34 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, *keys = new_keys; } - n.k = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); - if (!n.k) - return -ENOMEM; + array_insert_item(keys->d, keys->nr, idx, n); - bkey_copy(n.k, k); + list_for_each_entry(iter, &c->journal_iters, list) + journal_iter_fix(c, iter, idx); - if (idx < keys->nr && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; - } else { - array_insert_item(keys->d, keys->nr, idx, n); + return 0; +} - list_for_each_entry(iter, &c->journal_iters, list) - journal_iter_fix(c, iter, idx); - } +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct bkey_i *n; + int ret; - return 0; + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) + return -ENOMEM; + + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, n); + if (ret) + kfree(n); + return ret; } int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, @@ -172,17 +214,33 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, return bch2_journal_key_insert(c, id, level, &whiteout); } +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->nr && + keys->d[idx].btree_id == btree && + keys->d[idx].level == level && + !bpos_cmp(keys->d[idx].k->k.p, pos)) + 
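Illustrative aside: bch2_journal_key_search, now exported above, is a lower-bound binary search over journal keys kept sorted by (btree_id, level, pos), returning the index of the first key not less than the search key. A standalone sketch with a simplified key type standing in for struct bpos:

#include <stddef.h>
#include <stdio.h>

struct jkey {
	unsigned	btree_id;
	unsigned	level;
	long long	pos;	/* simplified stand-in for struct bpos */
};

static int jkey_cmp(unsigned btree_id, unsigned level, long long pos,
		    const struct jkey *r)
{
	if (btree_id != r->btree_id)
		return btree_id < r->btree_id ? -1 : 1;
	if (level != r->level)
		return level < r->level ? -1 : 1;
	if (pos != r->pos)
		return pos < r->pos ? -1 : 1;
	return 0;
}

/* Index of the first key >= (btree_id, level, pos) in a sorted array. */
static size_t jkey_search(const struct jkey *keys, size_t nr,
			  unsigned btree_id, unsigned level, long long pos)
{
	size_t l = 0, r = nr, m;

	while (l < r) {
		m = l + (r - l) / 2;
		if (jkey_cmp(btree_id, level, pos, &keys[m]) > 0)
			l = m + 1;
		else
			r = m;
	}

	return l;
}

int main(void)
{
	struct jkey keys[] = {
		{ 0, 0, 1 }, { 0, 0, 5 }, { 1, 0, 2 },
	};

	printf("%zu\n", jkey_search(keys, 3, 0, 0, 3));	/* prints 1 */
	return 0;
}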
keys->d[idx].overwritten = true; +} + static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->idx - iter->keys->nr - ? iter->keys->d + iter->idx : NULL; + struct journal_key *k = iter->keys->d + iter->idx; - if (k && - k->btree_id == iter->btree_id && - k->level == iter->level) - return k->k; + while (k < iter->keys->d + iter->keys->nr && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return k->k; + + iter->idx++; + k = iter->keys->d + iter->idx; + } - iter->idx = iter->keys->nr; return NULL; } @@ -205,8 +263,7 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->btree_id = id; iter->level = level; iter->keys = &c->journal_keys; - iter->idx = journal_key_search(&c->journal_keys, id, level, pos); - list_add(&iter->list, &c->journal_iters); + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -292,106 +349,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b) +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) { memset(iter, 0, sizeof(*iter)); iter->b = b; - bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); - bch2_journal_iter_init(c, &iter->journal, - b->c.btree_id, b->c.level, b->data->min_key); + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); } -/* Walk btree, overlaying keys from the journal: */ - -static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, - struct btree_and_journal_iter iter) -{ - unsigned i = 0, nr = b->c.level > 1 ? 
2 : 16; - struct bkey_s_c k; - struct bkey_buf tmp; - - BUG_ON(!b->c.level); - - bch2_bkey_buf_init(&tmp); - - while (i < nr && - (k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&tmp, c, k); - - bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, - b->c.btree_id, b->c.level - 1); - - bch2_btree_and_journal_iter_advance(&iter); - i++; - } - - bch2_bkey_buf_exit(&tmp, c); -} - -static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, - enum btree_id btree_id, - btree_walk_key_fn key_fn) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf tmp; - struct btree *child; - int ret = 0; - - bch2_bkey_buf_init(&tmp); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - if (b->c.level) { - bch2_bkey_buf_reassemble(&tmp, c, k); - - child = bch2_btree_node_get_noiter(c, tmp.k, - b->c.btree_id, b->c.level - 1, - false); - - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; - - btree_and_journal_iter_prefetch(c, b, iter); - - ret = bch2_btree_and_journal_walk_recurse(trans, child, - btree_id, key_fn); - six_unlock_read(&child->c.lock); - } else { - ret = key_fn(trans, k); - } - - if (ret) - break; - - bch2_btree_and_journal_iter_advance(&iter); - } - - bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, - btree_walk_key_fn key_fn) +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) { - struct bch_fs *c = trans->c; - struct btree *b = c->btree_roots[btree_id].b; - int ret = 0; - - if (btree_node_fake(b)) - return 0; + struct btree_node_iter node_iter; - six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); - six_unlock_read(&b->c.lock); - - return ret; + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); } /* sort and dedup all keys in the journal: */ @@ -416,9 +400,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p) ?: + return journal_key_cmp(l, r) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } @@ -511,8 +493,8 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int __bch2_journal_replay_key(struct btree_trans *trans, - struct journal_key *k) +static int bch2_journal_replay_key(struct btree_trans *trans, + struct journal_key *k) { struct btree_iter iter; unsigned iter_flags = @@ -521,111 +503,75 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, int ret; if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; + iter_flags |= BTREE_ITER_CACHED; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, iter_flags); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k->k, 
BTREE_TRIGGER_NORUN); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) -{ - unsigned commit_flags = - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED; + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; - if (!k->allocated) - commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; + /* Must be checked with btree locked: */ + if (k->overwritten) + goto out; - return bch2_trans_do(c, NULL, NULL, commit_flags, - __bch2_journal_replay_key(&trans, k)); + ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; } static int journal_sort_seq_cmp(const void *_l, const void *_r) { - const struct journal_key *l = _l; - const struct journal_key *r = _r; + const struct journal_key *l = *((const struct journal_key **)_l); + const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(r->level, l->level) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->btree_id, r->btree_id) ?: - bpos_cmp(l->k->k.p, r->k->k.p); + return cmp_int(l->journal_seq, r->journal_seq); } -static int bch2_journal_replay(struct bch_fs *c, - struct journal_keys keys) +static int bch2_journal_replay(struct bch_fs *c) { + struct journal_keys *keys = &c->journal_keys; + struct journal_key **keys_sorted, *k; struct journal *j = &c->journal; - struct journal_key *i; - u64 seq; + size_t i; int ret; - sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); + if (!keys_sorted) + return -ENOMEM; - if (keys.nr) - replay_now_at(j, keys.journal_seq_base); + for (i = 0; i < keys->nr; i++) + keys_sorted[i] = &keys->d[i]; - seq = j->replay_journal_seq; + sort(keys_sorted, keys->nr, + sizeof(keys_sorted[0]), + journal_sort_seq_cmp, NULL); - /* - * First replay updates to the alloc btree - these will only update the - * btree key cache: - */ - for_each_journal_key(keys, i) { - cond_resched(); + if (keys->nr) + replay_now_at(j, keys->journal_seq_base); - if (!i->level && i->btree_id == BTREE_ID_alloc) { - j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i); - if (ret) - goto err; - } - } + for (i = 0; i < keys->nr; i++) { + k = keys_sorted[i]; - /* - * Next replay updates to interior btree nodes: - */ - for_each_journal_key(keys, i) { cond_resched(); - if (i->level) { - j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i); - if (ret) - goto err; - } - } - - /* - * Now that the btree is in a consistent state, we can start journal - * reclaim (which will be flushing entries from the btree key cache back - * to the btree: - */ - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); - set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); - journal_reclaim_kick(j); - - j->replay_journal_seq = seq; + if (!k->allocated) + replay_now_at(j, keys->journal_seq_base + k->journal_seq); - /* - * Now replay leaf node updates: - */ - for_each_journal_key(keys, i) { - cond_resched(); - - if (i->level || i->btree_id == BTREE_ID_alloc) - continue; - - replay_now_at(j, keys.journal_seq_base + i->journal_seq); - - ret = bch2_journal_replay_key(c, i); - if (ret) + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL| + (!k->allocated + ? 
BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved + : 0), + bch2_journal_replay_key(&trans, k)); + if (ret) { + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", + ret, bch2_btree_ids[k->btree_id], k->level); goto err; + } } replay_now_at(j, j->replay_journal_seq_end); @@ -633,10 +579,12 @@ static int bch2_journal_replay(struct bch_fs *c, bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); - return bch2_journal_error(j); + ret = bch2_journal_error(j); + + if (keys->nr && !ret) + bch2_journal_log_msg(&c->journal, "journal replay finished"); err: - bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", - ret, bch2_btree_ids[i->btree_id], i->level); + kvfree(keys_sorted); return ret; } @@ -674,15 +622,15 @@ static int journal_replay_entry_early(struct bch_fs *c, container_of(entry, struct jset_entry_usage, entry); switch (entry->btree_id) { - case FS_USAGE_RESERVED: + case BCH_FS_USAGE_reserved: if (entry->level < BCH_REPLICAS_MAX) c->usage_base->persistent_reserved[entry->level] = le64_to_cpu(u->v); break; - case FS_USAGE_INODES: + case BCH_FS_USAGE_inodes: c->usage_base->nr_inodes = le64_to_cpu(u->v); break; - case FS_USAGE_KEY_VERSION: + case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, le64_to_cpu(u->v)); break; @@ -702,10 +650,7 @@ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / - sizeof(struct jset_entry_dev_usage_type); - unsigned i; + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); @@ -820,6 +765,8 @@ static int verify_superblock_clean(struct bch_fs *c, { unsigned i; struct bch_sb_field_clean *clean = *cleanp; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; int ret = 0; if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, @@ -832,7 +779,6 @@ static int verify_superblock_clean(struct bch_fs *c, } for (i = 0; i < BTREE_ID_NR; i++) { - char buf1[200], buf2[200]; struct bkey_i *k1, *k2; unsigned l1 = 0, l2 = 0; @@ -842,6 +788,19 @@ static int verify_superblock_clean(struct bch_fs *c, if (!k1 && !k2) continue; + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + if (k1) + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); + else + pr_buf(&buf1, "(none)"); + + if (k2) + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); + else + pr_buf(&buf2, "(none)"); + mustfix_fsck_err_on(!k1 || !k2 || IS_ERR(k1) || IS_ERR(k2) || @@ -851,10 +810,12 @@ static int verify_superblock_clean(struct bch_fs *c, "superblock btree root %u doesn't match journal after clean shutdown\n" "sb: l=%u %s\n" "journal: l=%u %s\n", i, - l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), - l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); + l1, buf1.buf, + l2, buf2.buf); } fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); return ret; } @@ -881,7 +842,7 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) return ERR_PTR(-ENOMEM); } - ret = bch2_sb_clean_validate(c, clean, READ); + ret = bch2_sb_clean_validate_late(c, clean, READ); if (ret) { mutex_unlock(&c->sb_lock); return ERR_PTR(ret); @@ -977,7 +938,6 @@ 
static int bch2_fs_initialize_subvolumes(struct bch_fs *c) static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) { - struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked inode; @@ -991,7 +951,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) goto err; if (!bkey_is_inode(k.k)) { - bch_err(c, "root inode not found"); + bch_err(trans->c, "root inode not found"); ret = -ENOENT; goto err; } @@ -1069,8 +1029,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); c->opts.version_upgrade = true; c->opts.fsck = true; - } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { - bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); + } else if (c->sb.version < bcachefs_metadata_version_alloc_v4) { + bch_info(c, "filesystem version is prior to alloc_v4 - upgrading"); c->opts.version_upgrade = true; } } @@ -1084,6 +1044,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { struct journal_replay *i; + bch_verbose(c, "starting journal read"); ret = bch2_journal_read(c, &c->journal_entries, &blacklist_seq, &journal_seq); if (ret) @@ -1132,6 +1093,9 @@ use_clean: blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } + if (c->opts.read_journal_only) + goto out; + if (c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); @@ -1143,6 +1107,16 @@ use_clean: if (ret) goto err; + /* + * After an unclean shutdown, skip the next few journal sequence + * numbers as they may have been referenced by btree writes that + * happened before their corresponding journal writes - those btree + * writes need to be ignored, by skipping and blacklisting the next few + * journal sequence numbers: + */ + if (!c->sb.clean) + journal_seq += 8; + if (blacklist_seq != journal_seq) { ret = bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); @@ -1163,7 +1137,11 @@ use_clean: bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; + + down_read(&c->gc_lock); ret = bch2_alloc_read(c); + up_read(&c->gc_lock); + if (ret) goto err; bch_verbose(c, "alloc read done"); @@ -1175,7 +1153,12 @@ use_clean: goto err; bch_verbose(c, "stripes_read done"); - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + /* + * If we're not running fsck, this ensures bch2_fsck_err() calls are + * instead interpreted as bch2_inconsistent_err() calls: + */ + if (!c->opts.fsck) + set_bit(BCH_FS_FSCK_DONE, &c->flags); if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || @@ -1183,18 +1166,32 @@ use_clean: test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bool metadata_only = c->opts.norecovery; - bch_info(c, "starting mark and sweep"); - err = "error in mark and sweep"; + bch_info(c, "checking allocations"); + err = "error checking allocations"; ret = bch2_gc(c, true, metadata_only); if (ret) goto err; - bch_verbose(c, "mark and sweep done"); + bch_verbose(c, "done checking allocations"); + } + + if (c->opts.fsck) { + bch_info(c, "checking need_discard and freespace btrees"); + err = "error checking need_discard and freespace btrees"; + ret = bch2_check_alloc_info(c, true); + if (ret) + goto err; + + ret = bch2_check_lrus(c, true); + if (ret) + goto err; + bch_verbose(c, "done checking need_discard and freespace btrees"); } bch2_stripes_heap_start(c); clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, 
&c->flags); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); /* * Skip past versions that might have possibly been used (as nonces), @@ -1206,29 +1203,18 @@ use_clean: if (c->opts.norecovery) goto out; - bch_verbose(c, "starting journal replay"); + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); err = "journal replay failed"; - ret = bch2_journal_replay(c, c->journal_keys); + ret = bch2_journal_replay(c); if (ret) goto err; - bch_verbose(c, "journal replay done"); + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); - if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && - !c->opts.nochanges) { - /* - * note that even when filesystem was clean there might be work - * to do here, if we ran gc (because of fsck) which recalculated - * oldest_gen: - */ - bch_verbose(c, "writing allocation info"); - err = "error writing out alloc info"; - ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error writing alloc info"); - goto err; - } - bch_verbose(c, "alloc write done"); - } + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); @@ -1279,34 +1265,7 @@ use_clean: bch_verbose(c, "quotas done"); } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || - le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { - struct bch_move_stats stats; - - bch_move_stats_init(&stats, "recovery"); - - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c); - if (ret) - goto err; - - ret = bch2_scan_old_btree_nodes(c, &stats); - if (ret) - goto err; - bch_info(c, "scanning for old btree nodes done"); - } - mutex_lock(&c->sb_lock); - /* - * With journal replay done, we can clear the journal seq blacklist - * table: - */ - BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - BUG_ON(le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written); - - bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0); - if (c->opts.version_upgrade) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); @@ -1330,6 +1289,28 @@ use_clean: bch2_write_super(c); mutex_unlock(&c->sb_lock); + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || + le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + + bch_move_stats_init(&stats, "recovery"); + + bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); + if (ret) + goto err; + + ret = bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; + bch_info(c, "scanning for old btree nodes done"); + } + + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); + ret = 0; out: set_bit(BCH_FS_FSCK_DONE, &c->flags); @@ -1375,15 +1356,13 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); 
- set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); - err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); @@ -1409,6 +1388,7 @@ int bch2_fs_initialize(struct bch_fs *c) * Write out the superblock and journal buckets, now that we can do * btree updates */ + bch_verbose(c, "marking superblocks"); err = "error marking superblock and journal"; for_each_member_device(ca, c, i) { ret = bch2_trans_mark_dev_sb(c, ca); @@ -1420,6 +1400,12 @@ int bch2_fs_initialize(struct bch_fs *c) ca->new_fs_bucket_idx = 0; } + bch_verbose(c, "initializing freespace"); + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + err = "error creating root snapshot node"; ret = bch2_fs_initialize_subvolumes(c); if (ret) diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index e45c70b3693f..e6927a918df3 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -31,24 +31,32 @@ struct btree_and_journal_iter { } last; }; +size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, + unsigned, struct bpos); +struct bkey_i *bch2_journal_keys_peek(struct bch_fs *, enum btree_id, + unsigned, struct bpos pos); + +int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, + unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); - -int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); - void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct list_head *); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c8d6d73681e0..6824730945d4 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -98,6 +98,24 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); } +int bch2_trans_mark_reflink_v(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { + struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); + + if (!r->v.refcount) { + r->k.type = KEY_TYPE_deleted; + r->k.size = 0; + set_bkey_val_u64s(&r->k, 0); + return 0; + } + } + + return bch2_trans_mark_extent(trans, old, new, flags); +} + /* indirect inline data */ const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, @@ -119,6 +137,24 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } +int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + if (!(flags & 
BTREE_TRIGGER_OVERWRITE)) { + struct bkey_i_indirect_inline_data *r = + bkey_i_to_indirect_inline_data(new); + + if (!r->v.refcount) { + r->k.type = KEY_TYPE_deleted; + r->k.size = 0; + set_bkey_val_u64s(&r->k, 0); + } + } + + return 0; +} + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *orig) diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 3745873fd88d..8eb41c0292eb 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -10,27 +10,37 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ .val_to_text = bch2_reflink_p_to_text, \ - .key_merge = bch2_reflink_p_merge, \ + .key_merge = bch2_reflink_p_merge, \ + .trans_trigger = bch2_trans_mark_reflink_p, \ + .atomic_trigger = bch2_mark_reflink_p, \ } const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, + struct bkey_i *, unsigned); #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_reflink_v, \ + .atomic_trigger = bch2_mark_extent, \ } const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_indirect_inline_data(struct btree_trans *, + struct bkey_s_c, struct bkey_i *, + unsigned); #define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ + .trans_trigger = bch2_trans_mark_indirect_inline_data, \ } static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 6c5ea78d6762..c2771112d573 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -36,23 +36,40 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } +void bch2_replicas_entry_v0_to_text(struct printbuf *out, + struct bch_replicas_entry_v0 *e) +{ + unsigned i; + + if (e->data_type < BCH_DATA_NR) + pr_buf(out, "%s", bch2_data_types[e->data_type]); + else + pr_buf(out, "(invalid data type %u)", e->data_type); + + pr_buf(out, ": %u [", e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + pr_buf(out, i ? " %u" : "%u", e->devs[i]); + pr_buf(out, "]"); +} + void bch2_replicas_entry_to_text(struct printbuf *out, struct bch_replicas_entry *e) { unsigned i; - pr_buf(out, "%s: %u/%u [", - bch2_data_types[e->data_type], - e->nr_required, - e->nr_devs); + if (e->data_type < BCH_DATA_NR) + pr_buf(out, "%s", bch2_data_types[e->data_type]); + else + pr_buf(out, "(invalid data type %u)", e->data_type); + pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs); for (i = 0; i < e->nr_devs; i++) pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); pr_buf(out, "]"); } void bch2_cpu_replicas_to_text(struct printbuf *out, - struct bch_replicas_cpu *r) + struct bch_replicas_cpu *r) { struct bch_replicas_entry *e; bool first = true; @@ -413,18 +430,10 @@ err: goto out; } -static int __bch2_mark_replicas(struct bch_fs *c, - struct bch_replicas_entry *r, - bool check) -{ - return likely(bch2_replicas_marked(c, r)) ? 0 - : check ? -1 - : bch2_mark_replicas_slowpath(c, r); -} - int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) { - return __bch2_mark_replicas(c, r, false); + return likely(bch2_replicas_marked(c, r)) + ? 0 : bch2_mark_replicas_slowpath(c, r); } /* replicas delta list: */ @@ -808,67 +817,78 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, return 0; } -static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) +static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, + struct bch_sb *sb, + struct printbuf *err) { - unsigned i; + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + unsigned i, j; sort_cmp_size(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, memcmp, NULL); - for (i = 0; i + 1 < cpu_r->nr; i++) { - struct bch_replicas_entry *l = + for (i = 0; i < cpu_r->nr; i++) { + struct bch_replicas_entry *e = cpu_replicas_entry(cpu_r, i); - struct bch_replicas_entry *r = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); - if (!memcmp(l, r, cpu_r->entry_size)) - return "duplicate replicas entry"; - } + if (e->data_type >= BCH_DATA_NR) { + pr_buf(err, "invalid data type in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - return NULL; -} + if (!e->nr_devs) { + pr_buf(err, "no devices in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } -static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) -{ - struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu cpu_r = { .entries = NULL }; - struct bch_replicas_entry *e; - const char *err; - unsigned i; + if (e->nr_required > 1 && + e->nr_required >= e->nr_devs) { + pr_buf(err, "bad nr_required in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - for_each_replicas_entry(sb_r, e) { - err = "invalid replicas entry: invalid data type"; - if (e->data_type >= BCH_DATA_NR) - goto err; + for (j = 0; j < e->nr_devs; j++) + if (!bch2_dev_exists(sb, mi, e->devs[j])) { + pr_buf(err, "invalid device %u in entry ", e->devs[j]); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - err = "invalid replicas entry: no devices"; - if (!e->nr_devs) - goto err; + if (i + 1 < cpu_r->nr) { + struct bch_replicas_entry *n = + cpu_replicas_entry(cpu_r, i + 1); - err = "invalid replicas entry: bad nr_required"; - if (e->nr_required > 1 && - e->nr_required >= e->nr_devs) - goto err; + BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); - err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr_devs; i++) - if (!bch2_dev_exists(sb, mi, e->devs[i])) - goto err; + if (!memcmp(e, n, cpu_r->entry_size)) { + pr_buf(err, "duplicate replicas entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } + } } - err = "cannot allocate memory"; + return 0; +} + +static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); + struct bch_replicas_cpu cpu_r; + int ret; 
+ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) - goto err; + return -ENOMEM; - err = check_dup_replicas_entries(&cpu_r); -err: + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); - return err; + return ret; } static void bch2_sb_replicas_to_text(struct printbuf *out, @@ -886,49 +906,50 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, bch2_replicas_entry_to_text(out, e); } + pr_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_replicas = { - .validate = bch2_sb_validate_replicas, + .validate = bch2_sb_replicas_validate, .to_text = bch2_sb_replicas_to_text, }; -static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) +static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu cpu_r = { .entries = NULL }; - struct bch_replicas_entry_v0 *e; - const char *err; - unsigned i; + struct bch_replicas_cpu cpu_r; + int ret; - for_each_replicas_entry_v0(sb_r, e) { - err = "invalid replicas entry: invalid data type"; - if (e->data_type >= BCH_DATA_NR) - goto err; + if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) + return -ENOMEM; - err = "invalid replicas entry: no devices"; - if (!e->nr_devs) - goto err; + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); + return ret; +} - err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr_devs; i++) - if (!bch2_dev_exists(sb, mi, e->devs[i])) - goto err; - } +static void bch2_sb_replicas_v0_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_replicas_entry_v0 *e; + bool first = true; - err = "cannot allocate memory"; - if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) - goto err; + for_each_replicas_entry(sb_r, e) { + if (!first) + pr_buf(out, " "); + first = false; - err = check_dup_replicas_entries(&cpu_r); -err: - kfree(cpu_r.entries); - return err; + bch2_replicas_entry_v0_to_text(out, e); + } + pr_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { - .validate = bch2_sb_validate_replicas_v0, + .validate = bch2_sb_replicas_v0_validate, + .to_text = bch2_sb_replicas_v0_to_text, }; /* Query replicas: */ @@ -969,11 +990,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, if (dflags & ~flags) { if (print) { - char buf[100]; + struct printbuf buf = PRINTBUF; - bch2_replicas_entry_to_text(&PBUF(buf), e); + bch2_replicas_entry_to_text(&buf, e); bch_err(c, "insufficient devices online (%u) for replicas entry %s", - nr_online, buf); + nr_online, buf.buf); + printbuf_exit(&buf); } ret = false; break; @@ -985,19 +1007,42 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, return ret; } -unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { - struct bch_replicas_entry *e; - unsigned i, ret = 0; + struct bch_sb_field_replicas *replicas; + struct bch_sb_field_replicas_v0 *replicas_v0; + unsigned i, data_has = 0; + + replicas = bch2_sb_get_replicas(sb); + replicas_v0 = bch2_sb_get_replicas_v0(sb); + + if (replicas) { + struct bch_replicas_entry *r; + + for_each_replicas_entry(replicas, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 
1 << r->data_type; + } else if (replicas_v0) { + struct bch_replicas_entry_v0 *r; + + for_each_replicas_entry_v0(replicas_v0, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } - percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) - for (i = 0; i < e->nr_devs; i++) - if (e->devs[i] == ca->dev_idx) - ret |= 1 << e->data_type; + return data_has; +} - percpu_up_read(&c->mark_lock); +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned ret; + + mutex_lock(&c->sb_lock); + ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + mutex_unlock(&c->sb_lock); return ret; } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index d237d7c51ccb..87820b2e1ad3 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -64,6 +64,7 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, unsigned, bool); +unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 57d636740d2f..591bbb9f8beb 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -163,12 +163,10 @@ bch2_hash_lookup(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), BTREE_ITER_SLOTS|flags, k, ret) { - if (iter->pos.inode != inum.inum) - break; - if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) return 0; @@ -199,15 +197,12 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inum.inum) - break; - + POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) if (!is_visible_key(desc, inum, k)) return 0; - } bch2_trans_iter_exit(trans, iter); return ret ?: -ENOSPC; @@ -260,14 +255,12 @@ int bch2_hash_set(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_norestart(trans, iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, SPOS(inum.inum, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), + POS(inum.inum, U64_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter.pos.inode != inum.inum) - break; - if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 8aeb2e417a15..cdb89ba216cc 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -139,7 +139,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans) for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { u32 id = k.k->p.offset, child[2]; - unsigned nr_live = 0, live_idx; + unsigned nr_live = 0, live_idx = 0; if (k.k->type != KEY_TYPE_snapshot) continue; @@ -151,7 +151,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans) for (i = 0; i < 2; i++) { ret = snapshot_live(trans, child[i]); if (ret < 0) - break; + goto err; if (ret) live_idx = i; @@ -162,6 +162,7 @@ static int 
bch2_snapshots_set_equiv(struct btree_trans *trans) ? snapshot_t(c, child[live_idx])->equiv : id; } +err: bch2_trans_iter_exit(trans, &iter); if (ret) @@ -456,10 +457,10 @@ err: return ret; } -static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) { struct btree_iter iter; struct bkey_i_snapshot *n; @@ -522,7 +523,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n = bch2_trans_kmalloc(trans, sizeof(*n)); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto err; bkey_reassemble(&n->k_i, k); @@ -544,36 +545,21 @@ err: return ret; } -static int snapshot_id_add(struct snapshot_id_list *s, u32 id) +static int snapshot_id_add(snapshot_id_list *s, u32 id) { BUG_ON(snapshot_list_has_id(s, id)); - if (s->nr == s->size) { - size_t new_size = max(8U, s->size * 2); - void *n = krealloc(s->d, - new_size * sizeof(s->d[0]), - GFP_KERNEL); - if (!n) { - pr_err("error allocating snapshot ID list"); - return -ENOMEM; - } - - s->d = n; - s->size = new_size; - }; - - s->d[s->nr++] = id; - return 0; + return darray_push(*s, id); } static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, - struct snapshot_id_list *deleted, + snapshot_id_list *deleted, enum btree_id btree_id) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct snapshot_id_list equiv_seen = { 0 }; + snapshot_id_list equiv_seen = { 0 }; struct bpos last_pos = POS_MIN; int ret = 0; @@ -620,7 +606,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &iter); - kfree(equiv_seen.d); + darray_exit(equiv_seen); return ret; } @@ -632,7 +618,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_snapshot snap; - struct snapshot_id_list deleted = { 0 }; + snapshot_id_list deleted = { 0 }; u32 i, id, children[2]; int ret = 0; @@ -712,15 +698,15 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) for (i = 0; i < deleted.nr; i++) { ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, deleted.d[i])); + bch2_snapshot_node_delete(&trans, deleted.data[i])); if (ret) { bch_err(c, "error deleting snapshot %u: %i", - deleted.d[i], ret); + deleted.data[i], ret); goto err; } } err: - kfree(deleted.d); + darray_exit(deleted); bch2_trans_exit(&trans); percpu_ref_put(&c->writes); } @@ -875,14 +861,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); - struct snapshot_id_list s; + snapshot_id_list s; u32 *id; int ret = 0; while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); s = c->snapshots_unlinked; - memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); + darray_init(c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); if (!s.nr) @@ -890,7 +876,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) bch2_evict_subvolume_inodes(c, &s); - for (id = s.d; id < s.d + s.nr; id++) { + for (id = s.data; id < s.data + s.nr; id++) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_subvolume_delete(&trans, *id)); if (ret) { @@ -899,7 +885,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct 
*work) } } - kfree(s.d); + darray_exit(s); } percpu_ref_put(&c->writes); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index e4c3fdcdf22f..f609291acafa 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H +#include "darray.h" #include "subvolume_types.h" void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -58,15 +59,13 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances struct snapshots_seen { struct bpos pos; - size_t nr; - size_t size; - u32 *d; + DARRAY(u32) ids; }; static inline void snapshots_seen_exit(struct snapshots_seen *s) { - kfree(s->d); - s->d = NULL; + kfree(s->ids.data); + s->ids.data = NULL; } static inline void snapshots_seen_init(struct snapshots_seen *s) @@ -76,30 +75,19 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) { - if (s->nr == s->size) { - size_t new_size = max(s->size, (size_t) 128) * 2; - u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); - - if (!d) { - bch_err(c, "error reallocating snapshots_seen table (new size %zu)", - new_size); - return -ENOMEM; - } - - s->size = new_size; - s->d = d; - } - - s->d[s->nr++] = id; - return 0; + int ret = darray_push(s->ids, id); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; } -static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - unsigned i; + u32 *i; - for (i = 0; i < s->nr; i++) - if (id == s->d[i]) + darray_for_each(*s, i) + if (*i == id) return true; return false; } @@ -122,6 +110,10 @@ int bch2_snapshot_get_subvol(struct btree_trans *, u32, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); +/* only exported for tests: */ +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + int bch2_subvolume_delete(struct btree_trans *, u32); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 9410b9587591..f7562b5d51df 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -2,10 +2,8 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -struct snapshot_id_list { - u32 nr; - u32 size; - u32 *d; -}; +#include "darray.h" + +typedef DARRAY(u32) snapshot_id_list; #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index b8d2cf66a630..71abf87114df 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -10,6 +10,7 @@ #include "io.h" #include "journal.h" #include "journal_io.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" #include "replicas.h" #include "quota.h" @@ -27,8 +28,8 @@ const char * const bch2_sb_fields[] = { NULL }; -static const char *bch2_sb_field_validate(struct bch_sb *, - struct bch_sb_field *); +static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, + struct printbuf *); struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, enum bch_sb_field_type type) @@ -202,22 +203,31 @@ static inline void __bch2_sb_layout_size_assert(void) BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); } -static const char 
*validate_sb_layout(struct bch_sb_layout *layout) +static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) { u64 offset, prev_offset, max_sectors; unsigned i; - if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) - return "Not a bcachefs superblock layout"; + if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { + pr_buf(out, "Not a bcachefs superblock layout"); + return -EINVAL; + } - if (layout->layout_type != 0) - return "Invalid superblock layout type"; + if (layout->layout_type != 0) { + pr_buf(out, "Invalid superblock layout type %u", + layout->layout_type); + return -EINVAL; + } - if (!layout->nr_superblocks) - return "Invalid superblock layout: no superblocks"; + if (!layout->nr_superblocks) { + pr_buf(out, "Invalid superblock layout: no superblocks"); + return -EINVAL; + } - if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) - return "Invalid superblock layout: too many superblocks"; + if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { + pr_buf(out, "Invalid superblock layout: too many superblocks"); + return -EINVAL; + } max_sectors = 1 << layout->sb_max_size_bits; @@ -226,122 +236,163 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) for (i = 1; i < layout->nr_superblocks; i++) { offset = le64_to_cpu(layout->sb_offset[i]); - if (offset < prev_offset + max_sectors) - return "Invalid superblock layout: superblocks overlap"; + if (offset < prev_offset + max_sectors) { + pr_buf(out, "Invalid superblock layout: superblocks overlap\n" + " (sb %u ends at %llu next starts at %llu", + i - 1, prev_offset + max_sectors, offset); + return -EINVAL; + } prev_offset = offset; } - return NULL; + return 0; } -const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, + int rw) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *mi; - const char *err; + enum bch_opt_id opt_id; u32 version, version_min; u16 block_size; + int ret; version = le16_to_cpu(sb->version); - version_min = version >= bcachefs_metadata_version_new_versioning + version_min = version >= bcachefs_metadata_version_bkey_renumber ? 
le16_to_cpu(sb->version_min) : version; - if (version >= bcachefs_metadata_version_max || - version_min < bcachefs_metadata_version_min) - return "Unsupported superblock version"; + if (version >= bcachefs_metadata_version_max) { + pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } + + if (version_min < bcachefs_metadata_version_min) { + pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } - if (version_min > version) - return "Bad minimum version"; + if (version_min > version) { + pr_buf(out, "Bad minimum version %u, greater than version field %u", + version_min, version); + return -EINVAL; + } if (sb->features[1] || - (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) - return "Filesystem has incompatible features"; + (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { + pr_buf(out, "Filesystem has incompatible features"); + return -EINVAL; + } block_size = le16_to_cpu(sb->block_size); - if (block_size > PAGE_SECTORS) - return "Bad block size"; + if (block_size > PAGE_SECTORS) { + pr_buf(out, "Block size too big (got %u, max %u)", + block_size, PAGE_SECTORS); + return -EINVAL; + } - if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) - return "Bad user UUID"; + if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { + pr_buf(out, "Bad user UUID (got zeroes)"); + return -EINVAL; + } - if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) - return "Bad internal UUID"; + if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { + pr_buf(out, "Bad internal UUID (got zeroes)"); + return -EINVAL; + } if (!sb->nr_devices || - sb->nr_devices <= sb->dev_idx || - sb->nr_devices > BCH_SB_MEMBERS_MAX) - return "Bad number of member devices"; - - if (!BCH_SB_META_REPLICAS_WANT(sb) || - BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_META_REPLICAS_REQ(sb) || - BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_DATA_REPLICAS_WANT(sb) || - BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) - return "Invalid number of data replicas"; + sb->nr_devices > BCH_SB_MEMBERS_MAX) { + pr_buf(out, "Bad number of member devices %u (max %u)", + sb->nr_devices, BCH_SB_MEMBERS_MAX); + return -EINVAL; + } - if (!BCH_SB_DATA_REPLICAS_REQ(sb) || - BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) - return "Invalid number of data replicas"; + if (sb->dev_idx >= sb->nr_devices) { + pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)", + sb->dev_idx, sb->nr_devices); + return -EINVAL; + } - if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) - return "Invalid metadata checksum type"; + if (!sb->time_precision || + le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { + pr_buf(out, "Invalid time precision: %u (min 1, max %lu)", + le32_to_cpu(sb->time_precision), NSEC_PER_SEC); + return -EINVAL; + } - if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) - return "Invalid metadata checksum type"; + if (rw == READ) { + /* + * Been seeing a bug where these are getting inexplicably + * zeroed, so we're now validating them, but we have to be + * careful not to prevent people's filesystems from mounting: + */ + if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); + } - if 
(BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) - return "Invalid compression type"; + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { + const struct bch_option *opt = bch2_opt_table + opt_id; - if (!BCH_SB_BTREE_NODE_SIZE(sb)) - return "Btree node size not set"; + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, opt_id); - if (BCH_SB_GC_RESERVE(sb) < 5) - return "gc reserve percentage too small"; + pr_buf(out, "Invalid option "); + ret = bch2_opt_validate(opt, v, out); + if (ret) + return ret; - if (!sb->time_precision || - le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) - return "invalid time precision"; + printbuf_reset(out); + } + } /* validate layout */ - err = validate_sb_layout(&sb->layout); - if (err) - return err; + ret = validate_sb_layout(&sb->layout, out); + if (ret) + return ret; vstruct_for_each(sb, f) { - if (!f->u64s) - return "Invalid superblock: invalid optional field"; + if (!f->u64s) { + pr_buf(out, "Invalid superblock: optional with size 0 (type %u)", + le32_to_cpu(f->type)); + return -EINVAL; + } - if (vstruct_next(f) > vstruct_last(sb)) - return "Invalid superblock: invalid optional field"; + if (vstruct_next(f) > vstruct_last(sb)) { + pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", + le32_to_cpu(f->type)); + return -EINVAL; + } } /* members must be validated first: */ mi = bch2_sb_get_members(sb); - if (!mi) - return "Invalid superblock: member info area missing"; + if (!mi) { + pr_buf(out, "Invalid superblock: member info area missing"); + return -EINVAL; + } - err = bch2_sb_field_validate(sb, &mi->field); - if (err) - return err; + ret = bch2_sb_field_validate(sb, &mi->field, out); + if (ret) + return ret; vstruct_for_each(sb, f) { if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) continue; - err = bch2_sb_field_validate(sb, f); - if (err) - return err; + ret = bch2_sb_field_validate(sb, f, out); + if (ret) + return ret; } - return NULL; + return 0; } /* device open: */ @@ -403,7 +454,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) memcpy(dst->compat, src->compat, sizeof(dst->compat)); for (i = 0; i < BCH_SB_FIELD_NR; i++) { - if (i == BCH_SB_FIELD_journal) + if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) continue; src_f = bch2_sb_field_get(src, i); @@ -434,9 +485,6 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) __copy_super(&c->disk_sb, src); - if (BCH_SB_INITIALIZED(c->disk_sb.sb)) - set_bit(BCH_FS_INITIALIZED, &c->flags); - ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret) return ret; @@ -470,10 +518,12 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) /* read superblock: */ -static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) +static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) { struct bch_csum csum; + u32 version, version_min; size_t bytes; + int ret; reread: bio_reset(sb->bio); bio_set_dev(sb->bio, sb->bdev); @@ -481,40 +531,65 @@ reread: bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); - if (submit_bio_wait(sb->bio)) - return "IO error"; + ret = submit_bio_wait(sb->bio); + if (ret) { + pr_buf(err, "IO error: %i", ret); + return ret; + } + + if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { + pr_buf(err, "Not a bcachefs superblock"); + return -EINVAL; + } - if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) - return "Not a bcachefs superblock"; + version = le16_to_cpu(sb->sb->version); + version_min = version >= 
bcachefs_metadata_version_bkey_renumber + ? le16_to_cpu(sb->sb->version_min) + : version; - if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || - le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) - return "Unsupported superblock version"; + if (version >= bcachefs_metadata_version_max) { + pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } + + if (version_min < bcachefs_metadata_version_min) { + pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } bytes = vstruct_bytes(sb->sb); - if (bytes > 512 << sb->sb->layout.sb_max_size_bits) - return "Bad superblock: too big"; + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { + pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", + bytes, 512UL << sb->sb->layout.sb_max_size_bits); + return -EINVAL; + } if (bytes > sb->buffer_size) { if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) - return "cannot allocate memory"; + return -ENOMEM; goto reread; } - if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) - return "unknown csum type"; + if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { + pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); + return -EINVAL; + } /* XXX: verify MACs */ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), null_nonce(), sb->sb); - if (bch2_crc_cmp(csum, sb->sb->csum)) - return "bad checksum reading superblock"; + if (bch2_crc_cmp(csum, sb->sb->csum)) { + pr_buf(err, "bad checksum"); + return -EINVAL; + } sb->seq = le64_to_cpu(sb->sb->seq); - return NULL; + return 0; } int bch2_read_super(const char *path, struct bch_opts *opts, @@ -522,7 +597,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts, { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; - const char *err; + struct printbuf err = PRINTBUF; __le64 *i; int ret; @@ -554,25 +629,28 @@ int bch2_read_super(const char *path, struct bch_opts *opts, goto out; } - err = "cannot allocate memory"; ret = bch2_sb_realloc(sb, 0); - if (ret) + if (ret) { + pr_buf(&err, "error allocating memory for superblock"); goto err; + } - ret = -EFAULT; - err = "dynamic fault"; - if (bch2_fs_init_fault("read_super")) + if (bch2_fs_init_fault("read_super")) { + pr_buf(&err, "dynamic fault"); + ret = -EFAULT; goto err; + } - ret = -EINVAL; - err = read_one_super(sb, offset); - if (!err) + ret = read_one_super(sb, offset, &err); + if (!ret) goto got_super; if (opt_defined(*opts, sb)) goto err; - pr_err("error reading default superblock: %s", err); + printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", + path, err.buf); + printbuf_reset(&err); /* * Error reading primary superblock - read location of backup @@ -588,13 +666,15 @@ int bch2_read_super(const char *path, struct bch_opts *opts, */ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); - err = "IO error"; - if (submit_bio_wait(sb->bio)) + ret = submit_bio_wait(sb->bio); + if (ret) { + pr_buf(&err, "IO error: %i", ret); goto err; + } memcpy(&layout, sb->sb, sizeof(layout)); - err = validate_sb_layout(&layout); - if (err) + ret = validate_sb_layout(&layout, &err); + if (ret) goto err; for (i = layout.sb_offset; @@ -604,32 +684,39 @@ int bch2_read_super(const char *path, struct bch_opts *opts, if (offset == opt_get(*opts, sb)) continue; - err = read_one_super(sb, offset); - if (!err) + ret = read_one_super(sb, 
offset, &err); + if (!ret) goto got_super; } - ret = -EINVAL; goto err; got_super: - err = "Superblock block size smaller than device block size"; - ret = -EINVAL; if (le16_to_cpu(sb->sb->block_size) << 9 < bdev_logical_block_size(sb->bdev)) { - pr_err("error reading superblock: Superblock block size (%u) smaller than device block size (%u)", + pr_buf(&err, "block size (%u) smaller than device block size (%u)", le16_to_cpu(sb->sb->block_size) << 9, bdev_logical_block_size(sb->bdev)); - goto err_no_print; + ret = -EINVAL; + goto err; } ret = 0; sb->have_layout = true; + + ret = bch2_sb_validate(sb, &err, READ); + if (ret) { + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", + path, err.buf); + goto err_no_print; + } out: pr_verbose_init(*opts, "ret %i", ret); + printbuf_exit(&err); return ret; err: - pr_err("error reading superblock: %s", err); + printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", + path, err.buf); err_no_print: bch2_free_super(sb); goto out; @@ -703,8 +790,8 @@ int bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; struct bch_dev *ca; + struct printbuf err = PRINTBUF; unsigned i, sb = 0, nr_wrote; - const char *err; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; @@ -731,10 +818,12 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_from_fs(c, ca); for_each_online_member(ca, c, i) { - err = bch2_sb_validate(&ca->disk_sb); - if (err) { - bch2_fs_inconsistent(c, "sb invalid before write: %s", err); - ret = -1; + printbuf_reset(&err); + + ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); + percpu_ref_put(&ca->io_ref); goto out; } } @@ -752,11 +841,24 @@ int bch2_write_super(struct bch_fs *c) closure_sync(cl); for_each_online_member(ca, c, i) { - if (!ca->sb_write_error && - ca->disk_sb.seq != - le64_to_cpu(ca->sb_read_scratch->seq)) { + if (ca->sb_write_error) + continue; + + if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { + bch2_fs_fatal_error(c, + "Superblock write was silently dropped! (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; + } + + if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { bch2_fs_fatal_error(c, - "Superblock modified by another process"); + "Superblock modified by another process (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); ret = -EROFS; goto out; @@ -811,6 +913,7 @@ int bch2_write_super(struct bch_fs *c) out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); + printbuf_exit(&err); return ret; } @@ -825,133 +928,218 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_journal: */ - -static int u64_cmp(const void *_l, const void *_r) -{ - u64 l = *((const u64 *) _l), r = *((const u64 *) _r); - - return l < r ? -1 : l > r ? 
1 : 0; -} +/* BCH_SB_FIELD_members: */ -static const char *bch2_sb_validate_journal(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_members_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { - struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - const char *err; - unsigned nr; + struct bch_sb_field_members *mi = field_to_type(f, members); unsigned i; - u64 *b; - - journal = bch2_sb_get_journal(sb); - if (!journal) - return NULL; - nr = bch2_nr_journal_buckets(journal); - if (!nr) - return NULL; - - b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); - if (!b) - return "cannot allocate memory"; + if ((void *) (mi->members + sb->nr_devices) > + vstruct_end(&mi->field)) { + pr_buf(err, "too many devices for section size"); + return -EINVAL; + } - for (i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; - sort(b, nr, sizeof(u64), u64_cmp, NULL); + if (!bch2_member_exists(m)) + continue; - err = "journal bucket at sector 0"; - if (!b[0]) - goto err; + if (le64_to_cpu(m->nbuckets) > LONG_MAX) { + pr_buf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m->nbuckets), LONG_MAX); + return -EINVAL; + } - err = "journal bucket before first bucket"; - if (m && b[0] < le16_to_cpu(m->first_bucket)) - goto err; + if (le64_to_cpu(m->nbuckets) - + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { + pr_buf(err, "device %u: not enough buckets (got %llu, max %u)", + i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); + return -EINVAL; + } - err = "journal bucket past end of device"; - if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) - goto err; + if (le16_to_cpu(m->bucket_size) < + le16_to_cpu(sb->block_size)) { + pr_buf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); + return -EINVAL; + } - err = "duplicate journal buckets"; - for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) - goto err; + if (le16_to_cpu(m->bucket_size) < + BCH_SB_BTREE_NODE_SIZE(sb)) { + pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu", + i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -EINVAL; + } + } - err = NULL; -err: - kfree(b); - return err; + return 0; } -static const struct bch_sb_field_ops bch_sb_field_ops_journal = { - .validate = bch2_sb_validate_journal, -}; - -/* BCH_SB_FIELD_members: */ - -static const char *bch2_sb_validate_members(struct bch_sb *sb, - struct bch_sb_field *f) +static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) { struct bch_sb_field_members *mi = field_to_type(f, members); - struct bch_member *m; + struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + unsigned i; - if ((void *) (mi->members + sb->nr_devices) > - vstruct_end(&mi->field)) - return "Invalid superblock: bad member info"; + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m->bucket_size); + u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) { if (!bch2_member_exists(m)) continue; - if (le64_to_cpu(m->nbuckets) > LONG_MAX) - return "Too many buckets"; + pr_buf(out, "Device:"); + pr_tab(out); + pr_buf(out, "%u", i); + 
pr_newline(out); + + pr_indent_push(out, 2); + + pr_buf(out, "UUID:"); + pr_tab(out); + pr_uuid(out, m->uuid.b); + pr_newline(out); + + pr_buf(out, "Size:"); + pr_tab(out); + pr_units(out, device_size, device_size << 9); + pr_newline(out); + + pr_buf(out, "Bucket size:"); + pr_tab(out); + pr_units(out, bucket_size, bucket_size << 9); + pr_newline(out); + + pr_buf(out, "First bucket:"); + pr_tab(out); + pr_buf(out, "%u", le16_to_cpu(m->first_bucket)); + pr_newline(out); + + pr_buf(out, "Buckets:"); + pr_tab(out); + pr_buf(out, "%llu", le64_to_cpu(m->nbuckets)); + pr_newline(out); + + pr_buf(out, "Last mount:"); + pr_tab(out); + if (m->last_mount) + pr_time(out, le64_to_cpu(m->last_mount)); + else + pr_buf(out, "(never)"); + pr_newline(out); + + pr_buf(out, "State:"); + pr_tab(out); + pr_buf(out, "%s", + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? bch2_member_states[BCH_MEMBER_STATE(m)] + : "unknown"); + pr_newline(out); + + pr_buf(out, "Group:"); + pr_tab(out); + if (BCH_MEMBER_GROUP(m)) { + unsigned idx = BCH_MEMBER_GROUP(m) - 1; + + if (idx < disk_groups_nr(gi)) + pr_buf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + pr_buf(out, "(bad disk labels section)"); + } else { + pr_buf(out, "(none)"); + } + pr_newline(out); - if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) - return "Not enough buckets"; + pr_buf(out, "Data allowed:"); + pr_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(m)) + bch2_flags_to_text(out, bch2_data_types, + BCH_MEMBER_DATA_ALLOWED(m)); + else + pr_buf(out, "(none)"); + pr_newline(out); - if (le16_to_cpu(m->bucket_size) < - le16_to_cpu(sb->block_size)) - return "bucket size smaller than block size"; + pr_buf(out, "Has data:"); + pr_tab(out); + if (data_have) + bch2_flags_to_text(out, bch2_data_types, data_have); + else + pr_buf(out, "(none)"); + pr_newline(out); - if (le16_to_cpu(m->bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) - return "bucket size smaller than btree node size"; - } + pr_buf(out, "Discard:"); + pr_tab(out); + pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); + pr_newline(out); - return NULL; + pr_buf(out, "Freespace initialized:"); + pr_tab(out); + pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + pr_newline(out); + + pr_indent_pop(out, 2); + } } static const struct bch_sb_field_ops bch_sb_field_ops_members = { - .validate = bch2_sb_validate_members, + .validate = bch2_sb_members_validate, + .to_text = bch2_sb_members_to_text, }; /* BCH_SB_FIELD_crypt: */ -static const char *bch2_sb_validate_crypt(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_crypt_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) - return "invalid field crypt: wrong size"; + if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { + pr_buf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&crypt->field), sizeof(*crypt)); + return -EINVAL; + } + + if (BCH_CRYPT_KDF_TYPE(crypt)) { + pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + return -EINVAL; + } - if (BCH_CRYPT_KDF_TYPE(crypt)) - return "invalid field crypt: bad kdf type"; + return 0; +} - return NULL; +static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + pr_buf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); + pr_newline(out); + pr_buf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); + 
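/*
 * Editor's note -- illustration, not part of the patch: the *_to_text()
 * hooks above emit aligned "label: value" lines by setting a tabstop
 * and padding with pr_tab().  A minimal sketch of that pattern; the
 * label and value are made up, the helpers are the ones used in this
 * commit.
 */
static void example_to_text(struct printbuf *out, u64 nbuckets)
{
	if (!out->tabstops[0])
		out->tabstops[0] = 32;		/* column where values start */

	pr_buf(out, "Buckets:");
	pr_tab(out);				/* pad out to the tabstop */
	pr_buf(out, "%llu", nbuckets);
	pr_newline(out);
}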
pr_newline(out); + pr_buf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); + pr_newline(out); + pr_buf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); + pr_newline(out); } static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_validate_crypt, + .validate = bch2_sb_crypt_validate, + .to_text = bch2_sb_crypt_to_text, }; /* BCH_SB_FIELD_clean: */ -int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) +int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { struct jset_entry *entry; int ret; @@ -1027,7 +1215,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_INODES; + u->entry.btree_id = BCH_FS_USAGE_inodes; u->v = cpu_to_le64(c->usage_base->nr_inodes); } @@ -1037,7 +1225,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_KEY_VERSION; + u->entry.btree_id = BCH_FS_USAGE_key_version; u->v = cpu_to_le64(atomic64_read(&c->key_version)); } @@ -1047,7 +1235,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_RESERVED; + u->entry.btree_id = BCH_FS_USAGE_reserved; u->entry.level = i; u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); } @@ -1123,7 +1311,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) } sb_clean->flags = 0; - sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); + sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); /* Trying to catch outstanding bug: */ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); @@ -1140,7 +1328,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) * this should be in the write path, and we should be validating every * superblock section: */ - ret = bch2_sb_clean_validate(c, sb_clean, WRITE); + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); if (ret) { bch_err(c, "error writing marking filesystem clean: validate error"); goto out; @@ -1151,19 +1339,47 @@ out: mutex_unlock(&c->sb_lock); } -static const char *bch2_sb_validate_clean(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_clean_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_clean *clean = field_to_type(f, clean); - if (vstruct_bytes(&clean->field) < sizeof(*clean)) - return "invalid field crypt: wrong size"; + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { + pr_buf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -EINVAL; + } - return NULL; + return 0; +} + +static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + struct jset_entry *entry; + + pr_buf(out, "flags: %x", le32_to_cpu(clean->flags)); + pr_newline(out); + pr_buf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + pr_newline(out); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + + bch2_journal_entry_to_text(out, NULL, entry); + pr_newline(out); + } } static const struct bch_sb_field_ops bch_sb_field_ops_clean = { - .validate = bch2_sb_validate_clean, + .validate = 
bch2_sb_clean_validate, + .to_text = bch2_sb_clean_to_text, }; static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { @@ -1173,14 +1389,27 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #undef x }; -static const char *bch2_sb_field_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) { unsigned type = le32_to_cpu(f->type); + struct printbuf field_err = PRINTBUF; + int ret; + + if (type >= BCH_SB_FIELD_NR) + return 0; + + ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); + if (ret) { + pr_buf(err, "Invalid superblock section %s: %s", + bch2_sb_fields[type], + field_err.buf); + pr_newline(err); + bch2_sb_field_to_text(err, sb, f); + } - return type < BCH_SB_FIELD_NR - ? bch2_sb_field_ops[type]->validate(sb, f) - : NULL; + printbuf_exit(&field_err); + return ret; } void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, @@ -1190,13 +1419,183 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ? bch2_sb_field_ops[type] : NULL; + if (!out->tabstops[0]) + out->tabstops[0] = 32; + if (ops) pr_buf(out, "%s", bch2_sb_fields[type]); else pr_buf(out, "(unknown field %u)", type); - pr_buf(out, " (size %llu):", vstruct_bytes(f)); + pr_buf(out, " (size %zu):", vstruct_bytes(f)); + pr_newline(out); - if (ops && ops->to_text) + if (ops && ops->to_text) { + pr_indent_push(out, 2); bch2_sb_field_ops[type]->to_text(out, sb, f); + pr_indent_pop(out, 2); + } +} + +void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) +{ + unsigned i; + + pr_buf(out, "Type: %u", l->layout_type); + pr_newline(out); + + pr_buf(out, "Superblock max size: "); + pr_units(out, + 1 << l->sb_max_size_bits, + 512 << l->sb_max_size_bits); + pr_newline(out); + + pr_buf(out, "Nr superblocks: %u", l->nr_superblocks); + pr_newline(out); + + pr_buf(out, "Offsets: "); + for (i = 0; i < l->nr_superblocks; i++) { + if (i) + pr_buf(out, ", "); + pr_buf(out, "%llu", le64_to_cpu(l->sb_offset[i])); + } + pr_newline(out); +} + +void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + bool print_layout, unsigned fields) +{ + struct bch_sb_field_members *mi; + struct bch_sb_field *f; + u64 fields_have = 0; + unsigned nr_devices = 0; + + if (!out->tabstops[0]) + out->tabstops[0] = 32; + + mi = bch2_sb_get_members(sb); + if (mi) { + struct bch_member *m; + + for (m = mi->members; + m < mi->members + sb->nr_devices; + m++) + nr_devices += bch2_member_exists(m); + } + + pr_buf(out, "External UUID:"); + pr_tab(out); + pr_uuid(out, sb->user_uuid.b); + pr_newline(out); + + pr_buf(out, "Internal UUID:"); + pr_tab(out); + pr_uuid(out, sb->uuid.b); + pr_newline(out); + + pr_buf(out, "Device index:"); + pr_tab(out); + pr_buf(out, "%u", sb->dev_idx); + pr_newline(out); + + pr_buf(out, "Label:"); + pr_tab(out); + pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label); + pr_newline(out); + + pr_buf(out, "Version:"); + pr_tab(out); + pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); + pr_newline(out); + + pr_buf(out, "Oldest version on disk:"); + pr_tab(out); + pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); + pr_newline(out); + + pr_buf(out, "Created:"); + pr_tab(out); + if (sb->time_base_lo) + pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + else + pr_buf(out, "(not set)"); + pr_newline(out); + + pr_buf(out, "Sequence number:"); + pr_tab(out); + 
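/*
 * Editor's note -- illustration, not part of the patch: basic lifecycle
 * of the heap-allocated printbufs this commit converts the superblock
 * code to.  The caller below is hypothetical; PRINTBUF, pr_buf(),
 * ->allocation_failure and printbuf_exit() are from this commit, and
 * bch2_sb_to_text() is the new dump helper added above.
 */
static int example_dump_sb(struct bch_fs *c)
{
	struct printbuf buf = PRINTBUF;		/* empty; grows on demand */
	int ret = 0;

	bch2_sb_to_text(&buf, c->disk_sb.sb, false, 0);

	if (buf.allocation_failure)
		ret = -ENOMEM;
	else
		bch_info(c, "%s", buf.buf);

	printbuf_exit(&buf);			/* frees buf.buf */
	return ret;
}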
pr_buf(out, "%llu", le64_to_cpu(sb->seq)); + pr_newline(out); + + pr_buf(out, "Superblock size:"); + pr_tab(out); + pr_buf(out, "%zu", vstruct_bytes(sb)); + pr_newline(out); + + pr_buf(out, "Clean:"); + pr_tab(out); + pr_buf(out, "%llu", BCH_SB_CLEAN(sb)); + pr_newline(out); + + pr_buf(out, "Devices:"); + pr_tab(out); + pr_buf(out, "%u", nr_devices); + pr_newline(out); + + pr_buf(out, "Sections:"); + vstruct_for_each(sb, f) + fields_have |= 1 << le32_to_cpu(f->type); + pr_tab(out); + bch2_flags_to_text(out, bch2_sb_fields, fields_have); + pr_newline(out); + + pr_buf(out, "Features:"); + pr_tab(out); + bch2_flags_to_text(out, bch2_sb_features, + le64_to_cpu(sb->features[0])); + pr_newline(out); + + pr_buf(out, "Compat features:"); + pr_tab(out); + bch2_flags_to_text(out, bch2_sb_compat, + le64_to_cpu(sb->compat[0])); + pr_newline(out); + + pr_newline(out); + pr_buf(out, "Options:"); + pr_newline(out); + pr_indent_push(out, 2); + { + enum bch_opt_id id; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, id); + + pr_buf(out, "%s:", opt->attr.name); + pr_tab(out); + bch2_opt_to_text(out, NULL, sb, opt, v, + OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); + pr_newline(out); + } + } + } + + pr_indent_pop(out, 2); + + if (print_layout) { + pr_newline(out); + pr_buf(out, "layout:"); + pr_newline(out); + pr_indent_push(out, 2); + bch2_sb_layout_to_text(out, &sb->layout); + pr_indent_pop(out, 2); + } + + vstruct_for_each(sb, f) + if (fields & (1 << le32_to_cpu(f->type))) { + pr_newline(out); + bch2_sb_field_to_text(out, sb, f); + } } diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 5c264875acb4..14a25f6fe29a 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -38,9 +38,8 @@ BCH_SB_FIELDS() extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { - const char * (*validate)(struct bch_sb *, struct bch_sb_field *); - void (*to_text)(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); + int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; static inline __le64 bch2_sb_magic(struct bch_fs *c) @@ -66,8 +65,6 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -const char *bch2_sb_validate(struct bch_sb_handle *); - int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); void __bch2_check_set_feature(struct bch_fs *, unsigned); @@ -78,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) __bch2_check_set_feature(c, feat); } -/* BCH_SB_FIELD_journal: */ - -static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -{ - return j - ? (__le64 *) vstruct_end(&j->field) - j->buckets - : 0; -} - /* BCH_SB_FIELD_members: */ static inline bool bch2_member_exists(struct bch_member *m) @@ -115,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .durability = BCH_MEMBER_DURABILITY(mi) ? 
BCH_MEMBER_DURABILITY(mi) - 1 : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), }; } @@ -124,12 +113,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); -int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); +void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); +void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); #endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index df6bffeffe06..4a071711d363 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -16,6 +16,7 @@ #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" +#include "buckets_waiting_for_journal.h" #include "chardev.h" #include "checksum.h" #include "clock.h" @@ -198,17 +199,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) */ bch2_journal_flush_all_pins(&c->journal); - /* - * If the allocator threads didn't all start up, the btree updates to - * write out alloc info aren't going to work: - */ - if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) - goto nowrote_alloc; - bch_verbose(c, "flushing journal and stopping allocators"); bch2_journal_flush_all_pins(&c->journal); - set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); do { clean_passes++; @@ -233,17 +226,11 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete"); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -nowrote_alloc: + closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); flush_work(&c->btree_interior_update_work); - for_each_member_device(ca, c, i) - bch2_dev_allocator_stop(ca); - - clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); - bch2_fs_journal_stop(&c->journal); /* @@ -279,10 +266,6 @@ void bch2_fs_read_only(struct bch_fs *c) /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch2_dev_allocator_stop()). 
*/ percpu_ref_kill(&c->writes); @@ -411,19 +394,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - bch_err(c, "error starting allocator threads"); - percpu_ref_put(&ca->io_ref); - goto err; - } - } - - set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - - for_each_rw_member(ca, c, i) - bch2_wake_allocator(ca); + bch2_do_discards(c); if (!early) { ret = bch2_fs_read_write_late(c); @@ -468,6 +439,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); + bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); @@ -528,6 +500,8 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_STOPPING, &c->flags); + cancel_work_sync(&c->journal_seq_blacklist_gc_work); + down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); @@ -670,6 +644,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); init_rwsem(&c->gc_lock); + mutex_init(&c->gc_gens_lock); for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); @@ -690,6 +665,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); + INIT_LIST_HEAD(&c->journal_entries); INIT_LIST_HEAD(&c->journal_iters); @@ -737,7 +715,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); + uuid_unparse_lower(c->sb.user_uuid.b, c->name); /* Compat: */ if (sb->version <= bcachefs_metadata_version_inode_v2 && @@ -755,6 +733,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_opts_apply(&c->opts, opts); + /* key cache currently disabled for inodes, because of snapshots: */ + c->opts.inodes_use_key_cache = 0; + + c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; + if (c->opts.inodes_use_key_cache) + c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; + c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); @@ -805,6 +790,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_buckets_waiting_for_journal_init(c); bch2_fs_subvolumes_init(c) ?: bch2_fs_io_init(c) ?: bch2_fs_encryption_init(c) ?: @@ -814,9 +800,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - if (c->opts.nochanges) - set_bit(JOURNAL_NOCHANGES, &c->journal.flags); - mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && @@ -852,12 +835,9 @@ noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; - char buf[512]; - struct printbuf p = PBUF(buf); + struct printbuf p = PRINTBUF; bool first = true; - strcpy(buf, "(null)"); - if (c->opts.read_only) { pr_buf(&p, "ro"); first = false; @@ -876,10 +856,14 @@ static void print_mount_opts(struct bch_fs *c) if (!first) pr_buf(&p, ","); first = false; - bch2_opt_to_text(&p, c, opt, v, 
OPT_SHOW_MOUNT_STYLE); + bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } - bch_info(c, "mounted with opts: %s", buf); + if (!p.pos) + pr_buf(&p, "(null)"); + + bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); + printbuf_exit(&p); } int bch2_fs_start(struct bch_fs *c) @@ -927,20 +911,6 @@ int bch2_fs_start(struct bch_fs *c) set_bit(BCH_FS_STARTED, &c->flags); - /* - * Allocator threads don't start filling copygc reserve until after we - * set BCH_FS_STARTED - wake them now: - * - * XXX ugly hack: - * Need to set ca->allocator_state here instead of relying on the - * allocator threads to do it to avoid racing with the copygc threads - * checking it and thinking they have no alloc reserve: - */ - for_each_online_member(ca, c, i) { - ca->allocator_state = ALLOCATOR_running; - bch2_wake_allocator(ca); - } - if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { @@ -1032,8 +1002,6 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - bch2_dev_allocator_stop(ca); - cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1148,8 +1116,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; - if (opt_defined(c->opts, discard)) - ca->mi.discard = opt_get(c->opts, discard); + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / btree_sectors(c)); if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || @@ -1200,12 +1168,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; - if (ca->mi.state == BCH_MEMBER_STATE_rw && - bch2_dev_allocator_start(ca)) { - bch2_dev_free(ca); - goto err; - } - bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1251,6 +1213,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); + ca->dev = ca->disk_sb.bdev->bd_dev; + percpu_ref_reinit(&ca->io_ref); return 0; @@ -1389,14 +1353,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) /* * The allocator thread itself allocates btree nodes, so stop it first: */ - bch2_dev_allocator_stop(ca); bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); bch2_copygc_start(c); } -static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1404,8 +1367,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - - return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1432,7 +1393,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); if (new_state == BCH_MEMBER_STATE_rw) - ret = __bch2_dev_read_write(c, ca); + __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1455,30 +1416,20 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) { - struct btree_trans trans; - size_t i; + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); int ret; - bch2_trans_init(&trans, c, 0, 0); - - for (i = 0; i < ca->mi.nbuckets; i++) { - ret = lockrestart_do(&trans, - bch2_btree_key_cache_flush(&trans, - BTREE_ID_alloc, 
POS(ca->dev_idx, i))); - if (ret) - break; - } - bch2_trans_exit(&trans); - - if (ret) { + ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_NORUN, NULL); + if (ret) bch_err(c, "error %i removing dev alloc info", ret); - return ret; - } - return bch2_btree_delete_range(c, BTREE_ID_alloc, - POS(ca->dev_idx, 0), - POS(ca->dev_idx + 1, 0), - NULL); + return ret; } int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -1543,11 +1494,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) data = bch2_dev_has_data(c, ca); if (data) { - char data_has_str[100]; + struct printbuf data_has = PRINTBUF; - bch2_flags_to_text(&PBUF(data_has_str), - bch2_data_types, data); - bch_err(ca, "Remove failed, still has data (%s)", data_has_str); + bch2_flags_to_text(&data_has, bch2_data_types, data); + bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); + printbuf_exit(&data_has); ret = -EBUSY; goto err; } @@ -1596,52 +1547,58 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_sb_field_members *mi; struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; + struct printbuf errbuf = PRINTBUF; int ret; ret = bch2_read_super(path, &opts, &sb); - if (ret) - return ret; - - err = bch2_sb_validate(&sb); - if (err) - return -EINVAL; + if (ret) { + bch_err(c, "device add error: error reading super: %i", ret); + goto err; + } dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; err = bch2_dev_may_add(sb.sb, c); - if (err) - return -EINVAL; + if (err) { + bch_err(c, "device add error: %s", err); + ret = -EINVAL; + goto err; + } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { bch2_free_super(&sb); - return -ENOMEM; + ret = -ENOMEM; + goto err; } ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) { bch2_dev_free(ca); - return ret; + goto err; } - err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); - if (ret) + if (ret) { + bch_err(c, "device add error: journal alloc failed"); goto err; + } down_write(&c->state_lock); mutex_lock(&c->sb_lock); - err = "insufficient space in new superblock"; ret = bch2_sb_from_fs(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: new device superblock too small"); goto err_unlock; + } mi = bch2_sb_get_members(ca->disk_sb.sb); if (!bch2_sb_resize_members(&ca->disk_sb, le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { + bch_err(c, "device add error: new device superblock too small"); ret = -ENOSPC; goto err_unlock; } @@ -1654,7 +1611,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) goto have_slot; no_slot: - err = "no slots available in superblock"; + bch_err(c, "device add error: already have maximum number of devices"); ret = -ENOSPC; goto err_unlock; @@ -1663,12 +1620,12 @@ have_slot: u64s = (sizeof(struct bch_sb_field_members) + sizeof(struct bch_member) * nr_devices) / sizeof(u64); - err = "no space in superblock for member info"; - ret = -ENOSPC; - mi = bch2_sb_resize_members(&c->disk_sb, u64s); - if (!mi) + if (!mi) { + bch_err(c, "device add error: no room in superblock for member info"); + ret = -ENOSPC; goto err_unlock; + } /* success: */ @@ -1684,18 +1641,22 @@ have_slot: bch2_dev_usage_journal_reserve(c); - err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) + if (ret) { + 
bch_err(c, "device add error: error marking new superblock: %i", ret); goto err_late; + } + + ret = bch2_fs_freespace_init(c); + if (ret) { + bch_err(c, "device add error: error initializing free space: %i", ret); + goto err_late; + } ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) - goto err_late; - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return 0; @@ -1707,12 +1668,12 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - bch_err(c, "Unable to add device: %s", err); + printbuf_exit(&errbuf); return ret; err_late: up_write(&c->state_lock); - bch_err(c, "Error going rw after adding device: %s", err); - return -EINVAL; + ca = NULL; + goto err; } /* Hot add existing device to running filesystem: */ @@ -1755,11 +1716,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - ret = __bch2_dev_read_write(c, ca); - if (ret) - goto err; - } + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); @@ -1846,20 +1804,14 @@ err: } /* return with ref on ca->ref: */ -struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) +struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { struct bch_dev *ca; - dev_t dev; unsigned i; - int ret; - - ret = lookup_bdev(path, &dev); - if (ret) - return ERR_PTR(ret); rcu_read_lock(); for_each_member_device_rcu(ca, c, i, NULL) - if (ca->disk_sb.bdev->bd_dev == dev) + if (!strcmp(name, ca->name)) goto found; ca = ERR_PTR(-ENOENT); found: @@ -1878,18 +1830,17 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; + struct printbuf errbuf = PRINTBUF; int ret = 0; + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + pr_verbose_init(opts, ""); if (!nr_devices) { - c = ERR_PTR(-EINVAL); - goto out2; - } - - if (!try_module_get(THIS_MODULE)) { - c = ERR_PTR(-ENODEV); - goto out2; + ret = -EINVAL; + goto err; } sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); @@ -1903,9 +1854,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, if (ret) goto err; - err = bch2_sb_validate(&sb[i]); - if (err) - goto err_print; } for (i = 1; i < nr_devices; i++) @@ -1960,8 +1908,8 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } out: kfree(sb); + printbuf_exit(&errbuf); module_put(THIS_MODULE); -out2: pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err_print: @@ -1978,81 +1926,6 @@ err: goto out; } -static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, - struct bch_opts opts) -{ - const char *err; - struct bch_fs *c; - bool allocated_fs = false; - int ret; - - err = bch2_sb_validate(sb); - if (err) - return err; - - mutex_lock(&bch_fs_list_lock); - c = __bch2_uuid_to_fs(sb->sb->uuid); - if (c) { - closure_get(&c->cl); - - err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); - if (err) - goto err; - } else { - allocated_fs = true; - c = bch2_fs_alloc(sb->sb, opts); - - err = "bch2_fs_alloc() error"; - if (IS_ERR(c)) - goto err; - } - - err = "bch2_dev_online() error"; - - mutex_lock(&c->sb_lock); - if (bch2_dev_attach_bdev(c, sb)) { - mutex_unlock(&c->sb_lock); - goto err; - } - mutex_unlock(&c->sb_lock); - - if (!c->opts.nostart && bch2_fs_may_start(c)) { - err = "error starting filesystem"; - ret = 
bch2_fs_start(c); - if (ret) - goto err; - } - - closure_put(&c->cl); - mutex_unlock(&bch_fs_list_lock); - - return NULL; -err: - mutex_unlock(&bch_fs_list_lock); - - if (allocated_fs && !IS_ERR(c)) - bch2_fs_stop(c); - else if (c) - closure_put(&c->cl); - - return err; -} - -const char *bch2_fs_open_incremental(const char *path) -{ - struct bch_sb_handle sb; - struct bch_opts opts = bch2_opts_empty(); - const char *err; - - if (bch2_read_super(path, &opts, &sb)) - return "error reading superblock"; - - err = __bch2_fs_open_incremental(&sb, opts); - bch2_free_super(&sb); - - return err; -} - /* Global interfaces/init */ static void bcachefs_exit(void) diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index c3273e9c711d..6d3efda26e63 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -26,6 +26,12 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) return remainder; } +static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, + u32 *offset) +{ + return div_u64_rem(s, ca->mi.bucket_size, offset); +} + static inline bool bch2_dev_is_online(struct bch_dev *ca) { return !percpu_ref_is_zero(&ca->io_ref); @@ -254,6 +260,5 @@ void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); -const char *bch2_fs_open_incremental(const char *path); #endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index d8b159a5b7f7..89419fc7930d 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -32,6 +32,7 @@ struct bch_member_cpu { u8 discard; u8 data_allowed; u8 durability; + u8 freespace_initialized; u8 valid; }; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 07e9b214bcb5..2594fec4b821 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -46,8 +46,28 @@ struct sysfs_ops type ## _sysfs_ops = { \ } #define SHOW(fn) \ +static ssize_t fn ## _to_text(struct printbuf *, \ + struct kobject *, struct attribute *);\ + \ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ char *buf) \ +{ \ + struct printbuf out = PRINTBUF; \ + ssize_t ret = fn ## _to_text(&out, kobj, attr); \ + \ + if (!ret && out.allocation_failure) \ + ret = -ENOMEM; \ + \ + if (!ret) { \ + ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ + memcpy(buf, out.buf, ret); \ + } \ + printbuf_exit(&out); \ + return ret; \ +} \ + \ +static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ + struct attribute *attr) #define STORE(fn) \ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ @@ -64,22 +84,19 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ #define sysfs_printf(file, fmt, ...) 
\ do { \ if (attr == &sysfs_ ## file) \ - return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ + pr_buf(out, fmt "\n", __VA_ARGS__); \ } while (0) #define sysfs_print(file, var) \ do { \ if (attr == &sysfs_ ## file) \ - return snprint(buf, PAGE_SIZE, var); \ + snprint(out, var); \ } while (0) #define sysfs_hprint(file, val) \ do { \ - if (attr == &sysfs_ ## file) { \ - bch2_hprint(&out, val); \ - pr_buf(&out, "\n"); \ - return out.pos - buf; \ - } \ + if (attr == &sysfs_ ## file) \ + bch2_hprint(out, val); \ } while (0) #define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) @@ -153,13 +170,10 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); -read_attribute(journal_pins); read_attribute(btree_updates); -read_attribute(dirty_btree_nodes); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); @@ -170,11 +184,11 @@ read_attribute(internal_uuid); read_attribute(has_data); read_attribute(alloc_debug); -write_attribute(wake_allocator); read_attribute(read_realloc_races); read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); +read_attribute(bucket_alloc_fail); rw_attribute(discard); rw_attribute(label); @@ -192,7 +206,7 @@ read_attribute(new_stripes); read_attribute(io_timers_read); read_attribute(io_timers_write); -read_attribute(data_op_data_progress); +read_attribute(data_jobs); #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); @@ -230,32 +244,20 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c) return nr ? div64_u64(sectors, nr) : 0; } -static long stats_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_move_stats *stats) -{ - pr_buf(out, "%s: data type %s btree_id %s position: ", - stats->name, - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); - pr_buf(out, "%s", "\n"); - - return 0; -} - static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) { long ret = 0; - struct bch_move_stats *iter; + struct bch_move_stats *stats; mutex_lock(&c->data_progress_lock); - - if (list_empty(&c->data_progress_list)) - pr_buf(out, "%s", "no progress to report\n"); - else - list_for_each_entry(iter, &c->data_progress_list, list) { - stats_to_text(out, c, iter); - } + list_for_each_entry(stats, &c->data_progress_list, list) { + pr_buf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + pr_buf(out, "%s", "\n"); + } mutex_unlock(&c->data_progress_lock); return ret; @@ -266,8 +268,12 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, + enum btree_id id; + u64 nr_uncompressed_extents = 0, nr_compressed_extents = 0, + nr_incompressible_extents = 0, + uncompressed_sectors = 0, + incompressible_sectors = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; int ret; @@ -277,47 +283,72 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) - if (k.k->type == KEY_TYPE_extent) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + for (id = 0; id < 
BTREE_ID_NR; id++) { + if (!((1U << id) & BTREE_ID_HAS_PTRS)) + continue; + + for_each_btree_key(&trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) { - if (!crc_is_compressed(p.crc)) { - nr_uncompressed_extents++; - uncompressed_sectors += e.k->size; - } else { - nr_compressed_extents++; + bool compressed = false, uncompressed = false, incompressible = false; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + switch (p.crc.compression_type) { + case BCH_COMPRESSION_TYPE_none: + uncompressed = true; + uncompressed_sectors += k.k->size; + break; + case BCH_COMPRESSION_TYPE_incompressible: + incompressible = true; + incompressible_sectors += k.k->size; + break; + default: compressed_sectors_compressed += p.crc.compressed_size; compressed_sectors_uncompressed += p.crc.uncompressed_size; + compressed = true; + break; } - - /* only looking at the first ptr */ - break; } + + if (incompressible) + nr_incompressible_extents++; + else if (uncompressed) + nr_uncompressed_extents++; + else if (compressed) + nr_compressed_extents++; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(&trans, &iter); + } bch2_trans_exit(&trans); + if (ret) return ret; - pr_buf(out, - "uncompressed data:\n" - " nr extents: %llu\n" - " size (bytes): %llu\n" - "compressed data:\n" - " nr extents: %llu\n" - " compressed size (bytes): %llu\n" - " uncompressed size (bytes): %llu\n", - nr_uncompressed_extents, - uncompressed_sectors << 9, - nr_compressed_extents, - compressed_sectors_compressed << 9, - compressed_sectors_uncompressed << 9); + pr_buf(out, "uncompressed:\n"); + pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents); + pr_buf(out, " size: "); + bch2_hprint(out, uncompressed_sectors << 9); + pr_buf(out, "\n"); + + pr_buf(out, "compressed:\n"); + pr_buf(out, " nr extents: %llu\n", nr_compressed_extents); + pr_buf(out, " compressed size: "); + bch2_hprint(out, compressed_sectors_compressed << 9); + pr_buf(out, "\n"); + pr_buf(out, " uncompressed size: "); + bch2_hprint(out, compressed_sectors_uncompressed << 9); + pr_buf(out, "\n"); + + pr_buf(out, "incompressible:\n"); + pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents); + pr_buf(out, " size: "); + bch2_hprint(out, incompressible_sectors << 9); + pr_buf(out, "\n"); return 0; } @@ -331,7 +362,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); @@ -345,13 +375,13 @@ SHOW(bch2_fs) atomic_long_read(&c->extent_migrate_done)); sysfs_print(extent_migrate_raced, atomic_long_read(&c->extent_migrate_raced)); + sysfs_print(bucket_alloc_fail, + atomic_long_read(&c->bucket_alloc_fail)); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - if (attr == &sysfs_gc_gens_pos) { - bch2_gc_gens_pos_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_gc_gens_pos) + bch2_gc_gens_pos_to_text(out, c); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); @@ -361,83 +391,48 @@ SHOW(bch2_fs) max(0LL, c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)) << 9); - if (attr == &sysfs_rebalance_work) { - bch2_rebalance_work_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_rebalance_work) + 
bch2_rebalance_work_to_text(out, c); sysfs_print(promote_whole_extents, c->promote_whole_extents); /* Debugging: */ - if (attr == &sysfs_journal_debug) { - bch2_journal_debug_to_text(&out, &c->journal); - return out.pos - buf; - } + if (attr == &sysfs_journal_debug) + bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_journal_pins) { - bch2_journal_pins_to_text(&out, &c->journal); - return out.pos - buf; - } + if (attr == &sysfs_btree_updates) + bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_btree_updates) { - bch2_btree_updates_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_btree_cache) + bch2_btree_cache_to_text(out, c); - if (attr == &sysfs_dirty_btree_nodes) { - bch2_dirty_btree_nodes_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_btree_key_cache) + bch2_btree_key_cache_to_text(out, &c->btree_key_cache); - if (attr == &sysfs_btree_cache) { - bch2_btree_cache_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_btree_transactions) + bch2_btree_trans_to_text(out, c); - if (attr == &sysfs_btree_key_cache) { - bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); - return out.pos - buf; - } + if (attr == &sysfs_stripes_heap) + bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_btree_transactions) { - bch2_btree_trans_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c); - if (attr == &sysfs_stripes_heap) { - bch2_stripes_heap_to_text(&out, c); - return out.pos - buf; - } - - if (attr == &sysfs_open_buckets) { - bch2_open_buckets_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_compression_stats) + bch2_compression_stats_to_text(out, c); - if (attr == &sysfs_compression_stats) { - bch2_compression_stats_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_new_stripes) + bch2_new_stripes_to_text(out, c); - if (attr == &sysfs_new_stripes) { - bch2_new_stripes_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_io_timers_read) + bch2_io_timers_to_text(out, &c->io_clock[READ]); - if (attr == &sysfs_io_timers_read) { - bch2_io_timers_to_text(&out, &c->io_clock[READ]); - return out.pos - buf; - } - if (attr == &sysfs_io_timers_write) { - bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); - return out.pos - buf; - } + if (attr == &sysfs_io_timers_write) + bch2_io_timers_to_text(out, &c->io_clock[WRITE]); - if (attr == &sysfs_data_op_data_progress) { - data_progress_to_text(&out, c); - return out.pos - buf; - } + if (attr == &sysfs_data_jobs) + data_progress_to_text(out, c); return 0; } @@ -482,6 +477,17 @@ STORE(bch2_fs) /* Debugging: */ + if (!test_bit(BCH_FS_RW, &c->flags)) + return -EROFS; + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); + } + if (attr == &sysfs_trigger_gc) { /* * Full gc is currently incompatible with btree key cache: @@ -495,14 +501,6 @@ STORE(bch2_fs) #endif } - if (attr == &sysfs_prune_cache) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); - } - #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -547,7 +545,7 @@ struct attribute *bch2_fs_files[] = { SHOW(bch2_fs_internal) { struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - return 
bch2_fs_show(&c->kobj, attr, buf); + return bch2_fs_to_text(out, &c->kobj, attr); } STORE(bch2_fs_internal) @@ -559,9 +557,7 @@ SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_debug, - &sysfs_journal_pins, &sysfs_btree_updates, - &sysfs_dirty_btree_nodes, &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_btree_transactions, @@ -577,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_read_realloc_races, &sysfs_extent_migrate_done, &sysfs_extent_migrate_raced, + &sysfs_bucket_alloc_fail, &sysfs_gc_gens_pos, @@ -587,7 +584,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - &sysfs_data_op_data_progress, + &sysfs_data_jobs, &sysfs_internal_uuid, NULL @@ -597,39 +594,47 @@ struct attribute *bch2_fs_internal_files[] = { SHOW(bch2_fs_opts_dir) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); int id = opt - bch2_opt_table; u64 v = bch2_opt_get_by_id(&c->opts, id); - bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); - pr_buf(&out, "\n"); + bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); + pr_char(out, '\n'); - return out.pos - buf; + return 0; } STORE(bch2_fs_opts_dir) { struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int ret, id = opt - bch2_opt_table; + int ret = size, id = opt - bch2_opt_table; char *tmp; u64 v; + /* + * We don't need to take c->writes for correctness, but it eliminates an + * unsightly error message in the dmesg log when we're RO: + */ + if (unlikely(!percpu_ref_tryget(&c->writes))) + return -EROFS; + tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) - return -ENOMEM; + if (!tmp) { + ret = -ENOMEM; + goto err; + } - ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); kfree(tmp); if (ret < 0) - return ret; + goto err; ret = bch2_opt_check_may_set(c, id, v); if (ret < 0) - return ret; + goto err; bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); @@ -639,8 +644,9 @@ STORE(bch2_fs_opts_dir) bch2_rebalance_add_work(c, S64_MAX); rebalance_wakeup(c); } - - return size; +err: + percpu_ref_put(&c->writes); + return ret; } SYSFS_OPS(bch2_fs_opts_dir); @@ -670,13 +676,10 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) SHOW(bch2_fs_time_stats) { struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - struct printbuf out = _PBUF(buf, PAGE_SIZE); #define x(name) \ - if (attr == &sysfs_time_stat_##name) { \ - bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ - return out.pos - buf; \ - } + if (attr == &sysfs_time_stat_##name) \ + bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); BCH_TIME_STATS() #undef x @@ -697,24 +700,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -{ - enum alloc_reserve i; - - spin_lock(&ca->fs->freelist_lock); - - pr_buf(out, "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - for (i = 0; i < RESERVE_NR; i++) - pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - ca->free[i].size); - - spin_unlock(&ca->fs->freelist_lock); -} - static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; @@ -740,9 +725,6 @@ static void 
dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "ec\t%16llu\n" "available%15llu\n" "\n" - "free_inc\t\t%zu/%zu\n" - "free[RESERVE_MOVINGGC]\t%zu/%zu\n" - "free[RESERVE_NONE]\t%zu/%zu\n" "freelist_wait\t\t%s\n" "open buckets allocated\t%u\n" "open buckets this dev\t%u\n" @@ -750,13 +732,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" - "btree reserve cache\t%u\n" - "thread state:\t\t%s\n", + "btree reserve cache\t%u\n", stats.buckets_ec, - __dev_buckets_available(ca, stats), - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + __dev_buckets_available(ca, stats, RESERVE_none), c->freelist_wait.list.first ? "waiting" : "empty", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ca->nr_open_buckets, @@ -764,8 +742,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) c->open_buckets_wait.list.first ? "waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], - c->btree_reserve_cache_nr, - bch2_allocator_states[ca->allocator_state]); + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { @@ -792,7 +769,6 @@ SHOW(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_printf(uuid, "%pU\n", ca->uuid.b); @@ -805,58 +781,44 @@ SHOW(bch2_dev) if (attr == &sysfs_label) { if (ca->mi.group) { mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(&out, &c->disk_sb, + bch2_disk_path_to_text(out, c->disk_sb.sb, ca->mi.group - 1); mutex_unlock(&c->sb_lock); } - pr_buf(&out, "\n"); - return out.pos - buf; + pr_char(out, '\n'); } if (attr == &sysfs_has_data) { - bch2_flags_to_text(&out, bch2_data_types, + bch2_flags_to_text(out, bch2_data_types, bch2_dev_has_data(c, ca)); - pr_buf(&out, "\n"); - return out.pos - buf; + pr_char(out, '\n'); } if (attr == &sysfs_state_rw) { - bch2_string_opt_to_text(&out, bch2_member_states, + bch2_string_opt_to_text(out, bch2_member_states, ca->mi.state); - pr_buf(&out, "\n"); - return out.pos - buf; + pr_char(out, '\n'); } - if (attr == &sysfs_iodone) { - dev_iodone_to_text(&out, ca); - return out.pos - buf; - } + if (attr == &sysfs_iodone) + dev_iodone_to_text(out, ca); sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - if (attr == &sysfs_io_latency_stats_read) { - bch2_time_stats_to_text(&out, &ca->io_latency[READ]); - return out.pos - buf; - } - if (attr == &sysfs_io_latency_stats_write) { - bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); - return out.pos - buf; - } + if (attr == &sysfs_io_latency_stats_read) + bch2_time_stats_to_text(out, &ca->io_latency[READ]); + + if (attr == &sysfs_io_latency_stats_write) + bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_reserve_stats) { - reserve_stats_to_text(&out, ca); - return out.pos - buf; - } - if (attr == &sysfs_alloc_debug) { - dev_alloc_debug_to_text(&out, ca); - return out.pos - buf; - } + if (attr == &sysfs_alloc_debug) + dev_alloc_debug_to_text(out, ca); return 0; } @@ -894,9 +856,6 @@ STORE(bch2_dev) return ret; } - if (attr == &sysfs_wake_allocator) - bch2_wake_allocator(ca); - return size; } 
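/*
 * Editor's note -- illustration, not part of the patch: with the
 * reworked SHOW() macro earlier in this file, show handlers receive a
 * struct printbuf *out and return 0; the macro copies out.buf into the
 * PAGE_SIZE sysfs buffer and turns allocation failure into -ENOMEM.
 * The "example" kobject type and attribute below are hypothetical.
 */
read_attribute(example_stat);

SHOW(example)
{
	if (attr == &sysfs_example_stat)
		pr_buf(out, "%u\n", 42);

	return 0;
}

STORE(example)
{
	return size;		/* accept and ignore writes */
}
SYSFS_OPS(example);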
SYSFS_OPS(bch2_dev); @@ -922,11 +881,8 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - &sysfs_reserve_stats, - /* debug: */ &sysfs_alloc_debug, - &sysfs_wake_allocator, NULL }; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 60ccb94e5de5..4369bfc55a94 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -4,6 +4,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "journal_reclaim.h" +#include "subvolume.h" #include "tests.h" #include "linux/kthread.h" @@ -14,15 +15,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), - SPOS(0, U64_MAX, U32_MAX), + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - SPOS(0, U64_MAX, U32_MAX), - NULL); + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); BUG_ON(ret); } @@ -146,7 +146,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - POS_MIN, 0, k, ret) { + SPOS(0, 0, U32_MAX), 0, k, ret) { if (k.k->p.inode) break; @@ -202,7 +202,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS_MIN, 0, k, ret) { + SPOS(0, 0, U32_MAX), 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; } @@ -256,8 +256,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, - 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ret) { if (k.k->p.inode) break; @@ -272,7 +272,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS, k, ret) { BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); @@ -321,8 +322,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, - 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; @@ -335,7 +336,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -363,7 +365,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); @@ -383,7 +386,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0); k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); @@ -406,8 +410,6 @@ static int insert_test_extent(struct bch_fs *c, struct bkey_i_cookie k; int ret; - //pr_info("inserting %llu-%llu v %llu", start, end, test_version); - bkey_cookie_init(&k.k_i); k.k_i.k.p.offset = end; k.k_i.k.p.snapshot = U32_MAX; @@ -459,6 +461,70 @@ static int 
test_extent_overwrite_all(struct bch_fs *c, u64 nr) __test_extent_overwrite(c, 32, 64, 32, 128); } +/* snapshot unit tests */ + +/* Test skipping over keys in unrelated snapshots: */ +static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie cookie; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = snapid_hi; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, snapid_lo), 0); + k = bch2_btree_iter_peek(&iter); + + BUG_ON(k.k->p.snapshot != U32_MAX); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int test_snapshots(struct bch_fs *c, u64 nr) +{ + struct bkey_i_cookie cookie; + u32 snapids[2]; + u32 snapid_subvols[2] = { 1, 1 }; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = U32_MAX; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_snapshot_node_create(&trans, U32_MAX, + snapids, + snapid_subvols, + 2)); + if (ret) + return ret; + + if (snapids[0] > snapids[1]) + swap(snapids[0], snapids[1]); + + ret = test_snapshot_filter(c, snapids[0], snapids[1]); + if (ret) { + bch_err(c, "err %i from test_snapshot_filter", ret); + return ret; + } + + return 0; +} + /* perf tests */ static u64 test_rand(void) @@ -747,7 +813,8 @@ static int seq_delete(struct bch_fs *c, u64 nr) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS_MAX, NULL); + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); if (ret) bch_err(c, "error in seq_delete: %i", ret); return ret; @@ -785,8 +852,10 @@ static int btree_perf_test_thread(void *data) } ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); - if (ret) + if (ret) { + bch_err(j->c, "%ps: error %i", j->fn, ret); j->ret = ret; + } if (atomic_dec_and_test(&j->done)) { j->finish = sched_clock(); @@ -800,7 +869,9 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, u64 nr, unsigned nr_threads) { struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; - char name_buf[20], nr_buf[20], per_sec_buf[20]; + char name_buf[20]; + struct printbuf nr_buf = PRINTBUF; + struct printbuf per_sec_buf = PRINTBUF; unsigned i; u64 time; @@ -839,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_extent_overwrite_middle); perf_test(test_extent_overwrite_all); + perf_test(test_snapshots); + if (!j.fn) { pr_err("unknown test %s", testname); return -EINVAL; @@ -859,13 +932,15 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, time = j.finish - j.start; scnprintf(name_buf, sizeof(name_buf), "%s:", testname); - bch2_hprint(&PBUF(nr_buf), nr); - bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); + bch2_hprint(&nr_buf, nr); + bch2_hprint(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", - name_buf, nr_buf, nr_threads, + name_buf, nr_buf.buf, nr_threads, div_u64(time, NSEC_PER_SEC), div_u64(time * nr_threads, nr), - per_sec_buf); + per_sec_buf.buf); + printbuf_exit(&per_sec_buf); + printbuf_exit(&nr_buf); return j.ret; } diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 52de7c49cacb..37fc20413764 100644 --- 
a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -99,6 +99,71 @@ STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) STRTO_H(strtou64, u64) +static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra) +{ + unsigned new_size; + char *buf; + + if (out->pos + extra + 1 < out->size) + return 0; + + new_size = roundup_pow_of_two(out->size + extra); + buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); + + if (!buf) { + out->allocation_failure = true; + return -ENOMEM; + } + + out->buf = buf; + out->size = new_size; + return 0; +} + +void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) +{ + va_list args; + int len; + + do { + va_start(args, fmt); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + va_end(args); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_realloc(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +void bch2_pr_tab_rjust(struct printbuf *buf) +{ + BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); + + if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) { + unsigned move = buf->pos - buf->last_field; + unsigned shift = buf->tabstops[buf->tabstop] - + printbuf_linelen(buf); + + bch2_printbuf_realloc(buf, shift); + + if (buf->last_field + shift + 1 < buf->size) { + move = min(move, buf->size - 1 - buf->last_field - shift); + + memmove(buf->buf + buf->last_field + shift, + buf->buf + buf->last_field, + move); + memset(buf->buf + buf->last_field, ' ', shift); + buf->pos += shift; + buf->buf[buf->pos] = 0; + } + } + + buf->last_field = buf->pos; + buf->tabstop++; +} + void bch2_hprint(struct printbuf *buf, s64 v) { int u, t = 0; @@ -114,10 +179,25 @@ void bch2_hprint(struct printbuf *buf, s64 v) * 103 is magic: t is in the range [-1023, 1023] and we want * to turn it into [-9, 9] */ - if (u && v < 100 && v > -100) + if (u && t && v < 100 && v > -100) pr_buf(buf, ".%i", t / 103); if (u) - pr_buf(buf, "%c", si_units[u]); + pr_char(buf, si_units[u]); +} + +void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes) +{ + switch (out->units) { + case PRINTBUF_UNITS_RAW: + pr_buf(out, "%llu", raw); + break; + case PRINTBUF_UNITS_BYTES: + pr_buf(out, "%llu", bytes); + break; + case PRINTBUF_UNITS_HUMAN_READABLE: + bch2_hprint(out, bytes); + break; + } } void bch2_string_opt_to_text(struct printbuf *out, @@ -136,9 +216,6 @@ void bch2_flags_to_text(struct printbuf *out, unsigned bit, nr = 0; bool first = true; - if (out->pos != out->end) - *out->pos = '\0'; - while (list[nr]) nr++; @@ -467,36 +544,44 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd) pd->backpressure = 1; } -size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) +void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) { - /* 2^64 - 1 is 20 digits, plus null byte */ - char rate[21]; - char actual[21]; - char target[21]; - char proportional[21]; - char derivative[21]; - char change[21]; - s64 next_io; + out->tabstops[0] = 20; + + pr_buf(out, "rate:"); + pr_tab(out); + bch2_hprint(out, pd->rate.rate); + pr_newline(out); + + pr_buf(out, "target:"); + pr_tab(out); + bch2_hprint(out, pd->last_target); + pr_newline(out); + + pr_buf(out, "actual:"); + pr_tab(out); + bch2_hprint(out, pd->last_actual); + pr_newline(out); + + pr_buf(out, "proportional:"); + pr_tab(out); + bch2_hprint(out, pd->last_proportional); + pr_newline(out); - bch2_hprint(&PBUF(rate), pd->rate.rate); - 
bch2_hprint(&PBUF(actual), pd->last_actual); - bch2_hprint(&PBUF(target), pd->last_target); - bch2_hprint(&PBUF(proportional), pd->last_proportional); - bch2_hprint(&PBUF(derivative), pd->last_derivative); - bch2_hprint(&PBUF(change), pd->last_change); + pr_buf(out, "derivative:"); + pr_tab(out); + bch2_hprint(out, pd->last_derivative); + pr_newline(out); - next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); + pr_buf(out, "change:"); + pr_tab(out); + bch2_hprint(out, pd->last_change); + pr_newline(out); - return sprintf(buf, - "rate:\t\t%s/sec\n" - "target:\t\t%s\n" - "actual:\t\t%s\n" - "proportional:\t%s\n" - "derivative:\t%s\n" - "change:\t\t%s/sec\n" - "next io:\t%llims\n", - rate, target, actual, proportional, - derivative, change, next_io); + pr_buf(out, "next io:"); + pr_tab(out); + pr_buf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); + pr_newline(out); } /* misc: */ @@ -579,19 +664,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -void bch_scnmemcpy(struct printbuf *out, - const char *src, size_t len) -{ - size_t n = printbuf_remaining(out); - - if (n) { - n = min(n - 1, len); - memcpy(out->pos, src, n); - out->pos += n; - *out->pos = '\0'; - } -} - #include "eytzinger.h" static int alignment_ok(const void *base, size_t align) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 80402b398442..888693703c75 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -210,9 +210,11 @@ do { \ \ BUG_ON(_i >= (h)->used); \ (h)->used--; \ - heap_swap(h, _i, (h)->used, set_backpointer); \ - heap_sift_up(h, _i, cmp, set_backpointer); \ - heap_sift_down(h, _i, cmp, set_backpointer); \ + if ((_i) < (h)->used) { \ + heap_swap(h, _i, (h)->used, set_backpointer); \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + heap_sift_down(h, _i, cmp, set_backpointer); \ + } \ } while (0) #define heap_pop(h, d, cmp, set_backpointer) \ @@ -235,31 +237,157 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) +enum printbuf_units { + PRINTBUF_UNITS_RAW, + PRINTBUF_UNITS_BYTES, + PRINTBUF_UNITS_HUMAN_READABLE, +}; + struct printbuf { - char *pos; - char *end; + char *buf; + unsigned size; + unsigned pos; + unsigned last_newline; + unsigned last_field; + unsigned indent; + enum printbuf_units units:8; + u8 atomic; + bool allocation_failure:1; + u8 tabstop; + u8 tabstops[4]; }; +#define PRINTBUF ((struct printbuf) { NULL }) + +static inline void printbuf_exit(struct printbuf *buf) +{ + kfree(buf->buf); + buf->buf = ERR_PTR(-EINTR); /* poison value */ +} + +static inline void printbuf_reset(struct printbuf *buf) +{ + buf->pos = 0; + buf->last_newline = 0; + buf->last_field = 0; + buf->indent = 0; + buf->tabstop = 0; +} + static inline size_t printbuf_remaining(struct printbuf *buf) { - return buf->end - buf->pos; + return buf->size - buf->pos; } -#define _PBUF(_buf, _len) \ - ((struct printbuf) { \ - .pos = _buf, \ - .end = _buf + _len, \ - }) +static inline size_t printbuf_linelen(struct printbuf *buf) +{ + return buf->pos - buf->last_newline; +} -#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) +void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); -#define pr_buf(_out, ...) \ -do { \ - (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ - __VA_ARGS__); \ -} while (0) +#define pr_buf(_out, ...) 
bch2_pr_buf(_out, __VA_ARGS__) -void bch_scnmemcpy(struct printbuf *, const char *, size_t); +static inline void pr_char(struct printbuf *out, char c) +{ + bch2_pr_buf(out, "%c", c); +} + +static inline void pr_indent_push(struct printbuf *buf, unsigned spaces) +{ + buf->indent += spaces; + while (spaces--) + pr_char(buf, ' '); +} + +static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces) +{ + if (buf->last_newline + buf->indent == buf->pos) { + buf->pos -= spaces; + buf->buf[buf->pos] = 0; + } + buf->indent -= spaces; +} + +static inline void pr_newline(struct printbuf *buf) +{ + unsigned i; + + pr_char(buf, '\n'); + + buf->last_newline = buf->pos; + + for (i = 0; i < buf->indent; i++) + pr_char(buf, ' '); + + buf->last_field = buf->pos; + buf->tabstop = 0; +} + +static inline void pr_tab(struct printbuf *buf) +{ + BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); + + while (printbuf_remaining(buf) > 1 && + printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) + pr_char(buf, ' '); + + buf->last_field = buf->pos; + buf->tabstop++; +} + +void bch2_pr_tab_rjust(struct printbuf *); + +static inline void pr_tab_rjust(struct printbuf *buf) +{ + bch2_pr_tab_rjust(buf); +} + +void bch2_pr_units(struct printbuf *, s64, s64); +#define pr_units(...) bch2_pr_units(__VA_ARGS__) + +static inline void pr_sectors(struct printbuf *out, u64 v) +{ + bch2_pr_units(out, v, v << 9); +} + +#ifdef __KERNEL__ +static inline void pr_time(struct printbuf *out, u64 time) +{ + pr_buf(out, "%llu", time); +} +#else +#include <time.h> +static inline void pr_time(struct printbuf *out, u64 _time) +{ + char time_str[64]; + time_t time = _time; + struct tm *tm = localtime(&time); + size_t err = strftime(time_str, sizeof(time_str), "%c", tm); + if (!err) + pr_buf(out, "(formatting error)"); + else + pr_buf(out, "%s", time_str); +} +#endif + +#ifdef __KERNEL__ +static inline void uuid_unparse_lower(u8 *uuid, char *out) +{ + sprintf(out, "%pUb", uuid); +} +#else +#include <uuid/uuid.h> +#endif + +static inline void pr_uuid(struct printbuf *out, u8 *uuid) +{ + char uuid_str[40]; + + uuid_unparse_lower(uuid, uuid_str); + pr_buf(out, uuid_str); +} int bch2_strtoint_h(const char *, int *); int bch2_strtouint_h(const char *, unsigned int *); @@ -323,8 +451,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res) _r; \ }) -#define snprint(buf, size, var) \ - snprintf(buf, size, \ +#define snprint(out, var) \ + pr_buf(out, \ type_is(var, int) ? "%i\n" \ : type_is(var, unsigned) ? "%u\n" \ : type_is(var, long) ? 
"%li\n" \ @@ -441,7 +569,7 @@ struct bch_pd_controller { void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); void bch2_pd_controller_init(struct bch_pd_controller *); -size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); +void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); #define sysfs_pd_controller_attribute(name) \ rw_attribute(name##_rate); \ @@ -465,7 +593,7 @@ do { \ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ \ if (attr == &sysfs_##name##_rate_debug) \ - return bch2_pd_controller_print_debug(var, buf); \ + bch2_pd_controller_debug_to_text(out, var); \ } while (0) #define sysfs_pd_controller_store(name, var) \ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h index c099cdc0605f..53a694d71967 100644 --- a/fs/bcachefs/vstructs.h +++ b/fs/bcachefs/vstructs.h @@ -20,7 +20,7 @@ ({ \ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ \ - (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ + (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ }) #define vstruct_bytes(_s) \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 4d7db64e3ef3..8d23b4c2449e 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -111,11 +111,11 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, else pr_buf(out, "(unknown type %u)", xattr.v->x_type); - bch_scnmemcpy(out, xattr.v->x_name, - xattr.v->x_name_len); - pr_buf(out, ":"); - bch_scnmemcpy(out, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + pr_buf(out, "%.*s:%.*s", + xattr.v->x_name_len, + xattr.v->x_name, + le16_to_cpu(xattr.v->x_val_len), + (char *) xattr_val(xattr.v)); } static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, @@ -311,13 +311,9 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, - SPOS(inum, offset, snapshot), 0, k, ret) { - BUG_ON(k.k->p.inode < inum); - - if (k.k->p.inode > inum) - break; - + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, + SPOS(inum, offset, snapshot), + POS(inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_xattr) continue; @@ -426,9 +422,8 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); const struct bch_option *opt; int id, inode_opt_id; - char buf[512]; - struct printbuf out = PBUF(buf); - unsigned val_len; + struct printbuf out = PRINTBUF; + int ret; u64 v; id = bch2_opt_lookup(name); @@ -449,16 +444,21 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, return -ENODATA; v = bch2_opt_get_by_id(&opts, id); - bch2_opt_to_text(&out, c, opt, v, 0); + bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); - val_len = out.pos - buf; + ret = out.pos; - if (buffer && val_len > size) - return -ERANGE; + if (out.allocation_failure) { + ret = -ENOMEM; + } else if (buffer) { + if (out.pos > size) + ret = -ERANGE; + else + memcpy(buffer, out.buf, out.pos); + } - if (buffer) - memcpy(buffer, buf, val_len); - return val_len; + printbuf_exit(&out); + return ret; } static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, @@ -525,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, memcpy(buf, value, size); buf[size] = '\0'; - ret = bch2_opt_parse(c, NULL, opt, buf, &v); + ret = bch2_opt_parse(c, opt, buf, &v, NULL); kfree(buf); if (ret < 0) |