From ceac31bcb6992cb8b7770d2a0e91b055e5020431 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Apr 2021 13:05:38 -0400 Subject: Update bcachefs sources to fe72e70682 bcachefs: Fix for btree_gc repairing interior btree ptrs --- .bcachefs_revision | 2 +- include/linux/wait.h | 1 + include/trace/events/bcachefs.h | 161 +++++------ libbcachefs/alloc_background.c | 462 ++++++++++--------------------- libbcachefs/alloc_background.h | 6 +- libbcachefs/alloc_foreground.c | 62 +---- libbcachefs/alloc_types.h | 12 + libbcachefs/bcachefs.h | 22 +- libbcachefs/bkey_methods.c | 38 +++ libbcachefs/btree_gc.c | 597 +++++++++------------------------------- libbcachefs/btree_gc.h | 12 +- libbcachefs/btree_io.c | 25 +- libbcachefs/btree_io.h | 1 - libbcachefs/btree_iter.c | 139 ++++++---- libbcachefs/btree_iter.h | 2 +- libbcachefs/btree_update_leaf.c | 77 +++--- libbcachefs/buckets.c | 152 ++-------- libbcachefs/buckets.h | 12 +- libbcachefs/buckets_types.h | 5 + libbcachefs/fs-io.c | 103 ++++--- libbcachefs/fsck.c | 59 ++-- libbcachefs/journal.c | 37 ++- libbcachefs/journal.h | 5 +- libbcachefs/journal_reclaim.c | 9 +- libbcachefs/move.c | 2 +- libbcachefs/movinggc.c | 15 +- libbcachefs/recovery.c | 25 +- libbcachefs/replicas.c | 4 +- libbcachefs/super.c | 17 +- libbcachefs/sysfs.c | 50 ++-- libbcachefs/tests.c | 37 +++ 31 files changed, 776 insertions(+), 1375 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 4891beca..82c9b19f 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -8eca47e4d5c4e6817ad4c020be4280bd82104efd +fe72e70682cd2430a099c08c3135253675030d28 diff --git a/include/linux/wait.h b/include/linux/wait.h index c3d98242..d1d33e67 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -90,6 +90,7 @@ do { \ __wait_event(wq, condition); \ } while (0) +#define wait_event_freezable(wq, condition) ({wait_event(wq, condition); 0; }) #define wait_event_killable(wq, condition) ({wait_event(wq, condition); 0; }) #define wait_event_interruptible(wq, condition) ({wait_event(wq, condition); 0; }) diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 5ab05693..e6c3e17a 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -353,28 +353,6 @@ DEFINE_EVENT(btree_node, btree_set_root, /* Garbage collection */ -DEFINE_EVENT(btree_node, btree_gc_coalesce, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -TRACE_EVENT(btree_gc_coalesce_fail, - TP_PROTO(struct bch_fs *c, int reason), - TP_ARGS(c, reason), - - TP_STRUCT__entry( - __field(u8, reason ) - __array(char, uuid, 16 ) - ), - - TP_fast_assign( - __entry->reason = reason; - memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); - ), - - TP_printk("%pU: %u", __entry->uuid, __entry->reason) -); - DEFINE_EVENT(btree_node, btree_gc_rewrite_node, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) @@ -395,16 +373,6 @@ DEFINE_EVENT(bch_fs, gc_end, TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, gc_coalesce_start, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, gc_coalesce_end, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, TP_PROTO(struct bch_fs *c), TP_ARGS(c) @@ -412,24 +380,27 @@ DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, /* Allocator */ -TRACE_EVENT(alloc_batch, - TP_PROTO(struct bch_dev *ca, size_t free, size_t total), - TP_ARGS(ca, free, total), +TRACE_EVENT(alloc_scan, + TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped), + TP_ARGS(ca, 
found, inc_gen, inc_gen_skipped), TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(size_t, free ) - __field(size_t, total ) + __field(dev_t, dev ) + __field(u64, found ) + __field(u64, inc_gen ) + __field(u64, inc_gen_skipped ) ), TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->free = free; - __entry->total = total; + __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->found = found; + __entry->inc_gen = inc_gen; + __entry->inc_gen_skipped = inc_gen_skipped; ), - TP_printk("%pU free %zu total %zu", - __entry->uuid, __entry->free, __entry->total) + TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->found, __entry->inc_gen, __entry->inc_gen_skipped) ); TRACE_EVENT(invalidate, @@ -449,13 +420,10 @@ TRACE_EVENT(invalidate, ), TP_printk("invalidated %u sectors at %d,%d sector=%llu", - __entry->sectors, MAJOR(__entry->dev), - MINOR(__entry->dev), __entry->offset) -); - -DEFINE_EVENT(bch_fs, rescale_prios, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) + __entry->sectors, + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->offset) ); DECLARE_EVENT_CLASS(bucket_alloc, @@ -463,16 +431,18 @@ DECLARE_EVENT_CLASS(bucket_alloc, TP_ARGS(ca, reserve), TP_STRUCT__entry( - __array(char, uuid, 16) - __field(enum alloc_reserve, reserve ) + __field(dev_t, dev ) + __field(enum alloc_reserve, reserve ) ), TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - __entry->reserve = reserve; + __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->reserve = reserve; ), - TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) + TP_printk("%d,%d reserve %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve) ); DEFINE_EVENT(bucket_alloc, bucket_alloc, @@ -598,77 +568,93 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, TRACE_EVENT(trans_restart_would_deadlock, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + bool in_traverse_all, unsigned reason, enum btree_id have_btree_id, unsigned have_iter_type, + struct bpos *have_pos, enum btree_id want_btree_id, - unsigned want_iter_type), - TP_ARGS(trans_ip, caller_ip, reason, - have_btree_id, have_iter_type, - want_btree_id, want_iter_type), + unsigned want_iter_type, + struct bpos *want_pos), + TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason, + have_btree_id, have_iter_type, have_pos, + want_btree_id, want_iter_type, want_pos), TP_STRUCT__entry( __field(unsigned long, trans_ip ) __field(unsigned long, caller_ip ) + __field(u8, in_traverse_all ) __field(u8, reason ) __field(u8, have_btree_id ) __field(u8, have_iter_type ) __field(u8, want_btree_id ) __field(u8, want_iter_type ) + + __field(u64, have_pos_inode ) + __field(u64, have_pos_offset ) + __field(u32, have_pos_snapshot) + __field(u32, want_pos_snapshot) + __field(u64, want_pos_inode ) + __field(u64, want_pos_offset ) ), TP_fast_assign( __entry->trans_ip = trans_ip; __entry->caller_ip = caller_ip; + __entry->in_traverse_all = in_traverse_all; __entry->reason = reason; __entry->have_btree_id = have_btree_id; __entry->have_iter_type = have_iter_type; __entry->want_btree_id = want_btree_id; __entry->want_iter_type = want_iter_type; + + __entry->have_pos_inode = have_pos->inode; + __entry->have_pos_offset = have_pos->offset; + __entry->have_pos_snapshot = have_pos->snapshot; + + __entry->want_pos_inode = want_pos->inode; + __entry->want_pos_offset = want_pos->offset; + __entry->want_pos_snapshot = want_pos->snapshot; ), - TP_printk("%ps %pS because %u have %u:%u want %u:%u", 
+ TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", (void *) __entry->trans_ip, (void *) __entry->caller_ip, + __entry->in_traverse_all, __entry->reason, __entry->have_btree_id, __entry->have_iter_type, + __entry->have_pos_inode, + __entry->have_pos_offset, + __entry->have_pos_snapshot, __entry->want_btree_id, - __entry->want_iter_type) -); - -TRACE_EVENT(trans_restart_iters_realloced, - TP_PROTO(unsigned long ip, unsigned nr), - TP_ARGS(ip, nr), - - TP_STRUCT__entry( - __field(unsigned long, ip ) - __field(unsigned, nr ) - ), - - TP_fast_assign( - __entry->ip = ip; - __entry->nr = nr; - ), - - TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr) + __entry->want_iter_type, + __entry->want_pos_inode, + __entry->want_pos_offset, + __entry->want_pos_snapshot) ); TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(unsigned long ip, unsigned long bytes), - TP_ARGS(ip, bytes), + TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + unsigned long bytes), + TP_ARGS(trans_ip, caller_ip, bytes), TP_STRUCT__entry( - __field(unsigned long, ip ) - __field(unsigned long, bytes ) + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(unsigned long, bytes ) ), TP_fast_assign( - __entry->ip = ip; - __entry->bytes = bytes; + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; ), - TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes) + TP_printk("%ps %pS bytes %lu", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->bytes) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, @@ -726,6 +712,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse, TP_ARGS(ip) ); +DEFINE_EVENT(transaction_restart, trans_traverse_all, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + DECLARE_EVENT_CLASS(node_lock_fail, TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), TP_ARGS(level, iter_seq, node, node_seq), diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 55562763..912020e6 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -25,44 +25,19 @@ #include #include +const char * const bch2_allocator_states[] = { +#define x(n) #n, + ALLOC_THREAD_STATES() +#undef x + NULL +}; + static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, BCH_ALLOC_FIELDS_V1() #undef x }; -/* Ratelimiting/PD controllers */ - -static void pd_controllers_update(struct work_struct *work) -{ - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, - pd_controllers_update); - struct bch_dev *ca; - s64 free = 0, fragmented = 0; - unsigned i; - - for_each_member_device(ca, c, i) { - struct bch_dev_usage stats = bch2_dev_usage_read(ca); - - free += bucket_to_sector(ca, - __dev_buckets_available(ca, stats)) << 9; - /* - * Bytes of internal fragmentation, which can be - * reclaimed by copy GC - */ - fragmented += max_t(s64, 0, (bucket_to_sector(ca, - stats.d[BCH_DATA_user].buckets + - stats.d[BCH_DATA_cached].buckets) - - (stats.d[BCH_DATA_user].sectors + - stats.d[BCH_DATA_cached].sectors)) << 9); - } - - bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); - schedule_delayed_work(&c->pd_controllers_update, - c->pd_controllers_update_seconds * HZ); -} - /* Persistent alloc info: */ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, @@ -234,7 +209,7 @@ void bch2_alloc_pack(struct bch_fs *c, 
bch2_alloc_pack_v2(dst, src); } -static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); @@ -254,7 +229,7 @@ const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) return "invalid device"; /* allow for unknown fields */ - if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) + if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) return "incorrect value size"; return NULL; @@ -279,9 +254,9 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - pr_buf(out, "gen %u oldest_gen %u data_type %u", - u.gen, u.oldest_gen, u.data_type); -#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name); + pr_buf(out, "gen %u oldest_gen %u data_type %s", + u.gen, u.oldest_gen, bch2_data_types[u.data_type]); +#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); BCH_ALLOC_FIELDS_V2() #undef x } @@ -322,7 +297,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, NULL, bch2_alloc_read_fn); up_read(&c->gc_lock); - if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; @@ -467,52 +441,6 @@ out: * commands to the newly free buckets, then puts them on the various freelists. */ -/** - * wait_buckets_available - wait on reclaimable buckets - * - * If there aren't enough available buckets to fill up free_inc, wait until - * there are. - */ -static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned long gc_count = c->gc_count; - s64 available; - int ret = 0; - - ca->allocator_state = ALLOCATOR_BLOCKED; - closure_wake_up(&c->freelist_wait); - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - ret = 1; - break; - } - - if (gc_count != c->gc_count) - ca->inc_gen_really_needs_gc = 0; - - available = dev_buckets_reclaimable(ca); - available -= ca->inc_gen_really_needs_gc; - - available = max(available, 0LL); - - if (available) - break; - - up_read(&c->gc_lock); - schedule(); - try_to_freeze(); - down_read(&c->gc_lock); - } - - __set_current_state(TASK_RUNNING); - ca->allocator_state = ALLOCATOR_RUNNING; - closure_wake_up(&c->freelist_wait); - - return ret; -} - static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, struct bucket_mark m) { @@ -530,11 +458,8 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, gc_gen = bucket_gc_gen(bucket(ca, b)); - if (gc_gen >= BUCKET_GC_GEN_MAX / 2) - ca->inc_gen_needs_gc++; - - if (gc_gen >= BUCKET_GC_GEN_MAX) - ca->inc_gen_really_needs_gc++; + ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; + ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; return gc_gen < BUCKET_GC_GEN_MAX; } @@ -611,6 +536,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) struct bucket_mark m = READ_ONCE(g->mark); unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); + cond_resched(); + if (!bch2_can_invalidate_bucket(ca, b, m)) continue; @@ -627,8 +554,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) .key = key, }; } - - cond_resched(); } if (e.nr) @@ -721,6 +646,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) size_t i, nr = 0; ca->inc_gen_needs_gc = 0; + ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case BCH_CACHE_REPLACEMENT_lru: @@ 
-742,25 +668,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) return nr; } -static inline long next_alloc_bucket(struct bch_dev *ca) -{ - struct alloc_heap_entry e, *top = ca->alloc_heap.data; - - while (ca->alloc_heap.used) { - if (top->nr) { - size_t b = top->bucket; - - top->bucket++; - top->nr--; - return b; - } - - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - } - - return -1; -} - /* * returns sequence number of most recent journal entry that updated this * bucket: @@ -783,17 +690,56 @@ static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) } } -static int bch2_invalidate_one_bucket2(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *iter, - u64 *journal_seq, unsigned flags) +static int bucket_invalidate_btree(struct btree_trans *trans, + struct bch_dev *ca, u64 b) { struct bch_fs *c = trans->c; - struct bkey_alloc_buf a; + struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; - bool invalidating_cached_data; + struct btree_iter *iter = + bch2_trans_get_iter(trans, BTREE_ID_alloc, + POS(ca->dev_idx, b), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + int ret; + + a = bch2_trans_kmalloc(trans, sizeof(*a)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + percpu_down_read(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + u = alloc_mem_to_key(iter, g, m); + percpu_up_read(&c->mark_lock); + + u.gen++; + u.data_type = 0; + u.dirty_sectors = 0; + u.cached_sectors = 0; + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); + + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 *journal_seq, unsigned flags) +{ + struct bucket *g; + struct bucket_mark m; size_t b; int ret = 0; @@ -808,7 +754,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, BUG_ON(m.dirty_sectors); - bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + bch2_mark_alloc_bucket(c, ca, b, true); spin_lock(&c->freelist_lock); verify_not_on_freelist(c, ca, b); @@ -839,48 +785,12 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, goto out; } - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); -retry: - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - percpu_down_read(&c->mark_lock); - g = bucket(ca, iter->pos.offset); - m = READ_ONCE(g->mark); - u = alloc_mem_to_key(iter, g, m); - - percpu_up_read(&c->mark_lock); - - invalidating_cached_data = u.cached_sectors != 0; - - u.gen++; - u.data_type = 0; - u.dirty_sectors = 0; - u.cached_sectors = 0; - u.read_time = atomic64_read(&c->io_clock[READ].now); - u.write_time = atomic64_read(&c->io_clock[WRITE].now); - - bch2_alloc_pack(c, &a, u); - bch2_trans_update(trans, iter, &a.k, - BTREE_TRIGGER_BUCKET_INVALIDATE); - - /* - * XXX: - * when using deferred btree updates, we have journal reclaim doing - * btree updates and thus requiring the allocator to make forward - * progress, and here the allocator is requiring space in the journal - - * so we need a journal pre-reservation: - */ - ret = bch2_trans_commit(trans, NULL, - invalidating_cached_data ? 
journal_seq : NULL, - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - flags); - if (ret == -EINTR) - goto retry; + ret = bch2_trans_do(c, NULL, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED| + flags, + bucket_invalidate_btree(&trans, ca, b)); out: if (!ret) { /* remove from alloc_heap: */ @@ -905,8 +815,7 @@ out: percpu_down_read(&c->mark_lock); spin_lock(&c->freelist_lock); - bch2_mark_alloc_bucket(c, ca, b, false, - gc_pos_alloc(c, NULL), 0); + bch2_mark_alloc_bucket(c, ca, b, false); BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); BUG_ON(b != b2); @@ -923,29 +832,23 @@ out: */ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) { - struct btree_trans trans; - struct btree_iter *iter; u64 journal_seq = 0; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, - POS(ca->dev_idx, 0), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - /* Only use nowait if we've already invalidated at least one bucket: */ while (!ret && !fifo_full(&ca->free_inc) && - ca->alloc_heap.used) - ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, - BTREE_INSERT_GC_LOCK_HELD| + ca->alloc_heap.used) { + ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, (!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0)); - - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); + /* + * We only want to batch up invalidates when they're going to + * require flushing the journal: + */ + if (!journal_seq) + break; + } /* If we used NOWAIT, don't return the error: */ if (!fifo_empty(&ca->free_inc)) @@ -965,83 +868,72 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) return 0; } -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) +static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) +{ + if (ca->allocator_state != new_state) { + ca->allocator_state = new_state; + closure_wake_up(&ca->fs->freelist_wait); + } +} + +static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { unsigned i; int ret = 0; - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - - /* - * Don't strand buckets on the copygc freelist until - * after recovery is finished: - */ - if (!test_bit(BCH_FS_STARTED, &c->flags) && - i == RESERVE_MOVINGGC) - continue; - - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - - closure_wake_up(&c->freelist_wait); - ca->allocator_state = ALLOCATOR_RUNNING; - - spin_unlock(&c->freelist_lock); - goto out; - } - } - - if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { - ca->allocator_state = ALLOCATOR_BLOCKED_FULL; - closure_wake_up(&c->freelist_wait); - } - - spin_unlock(&c->freelist_lock); + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) { + /* + * Don't strand buckets on the copygc freelist until + * after recovery is finished: + */ + if (i == RESERVE_MOVINGGC && + !test_bit(BCH_FS_STARTED, &c->flags)) + continue; - if ((current->flags & PF_KTHREAD) && - kthread_should_stop()) { + if (fifo_push(&ca->free[i], b)) { + fifo_pop(&ca->free_inc, b); ret = 1; break; } - - schedule(); - try_to_freeze(); } -out: - __set_current_state(TASK_RUNNING); + spin_unlock(&c->freelist_lock); + + ca->allocator_state = ret + ? 
ALLOCATOR_running + : ALLOCATOR_blocked_full; + closure_wake_up(&c->freelist_wait); return ret; } -/* - * Pulls buckets off free_inc, discards them (if enabled), then adds them to - * freelists, waiting until there's room if necessary: - */ -static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) +static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - while (!fifo_empty(&ca->free_inc)) { - size_t bucket = fifo_peek(&ca->free_inc); - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); - - if (push_invalidated_bucket(c, ca, bucket)) - return 1; - } + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), + ca->mi.bucket_size, GFP_NOFS, 0); +} - return 0; +static bool allocator_thread_running(struct bch_dev *ca) +{ + unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) + ? ALLOCATOR_running + : ALLOCATOR_stopped; + alloc_thread_set_state(ca, state); + return state == ALLOCATOR_running; } -static inline bool allocator_thread_running(struct bch_dev *ca) +static int buckets_available(struct bch_dev *ca, unsigned long gc_count) { - return ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); + s64 available = dev_buckets_reclaimable(ca) - + (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); + bool ret = available > 0; + + alloc_thread_set_state(ca, ret + ? ALLOCATOR_running + : ALLOCATOR_blocked); + return ret; } /** @@ -1056,62 +948,29 @@ static int bch2_allocator_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; + unsigned long gc_count = c->gc_count; size_t nr; int ret; set_freezable(); while (1) { - if (!allocator_thread_running(ca)) { - ca->allocator_state = ALLOCATOR_STOPPED; - if (kthread_wait_freezable(allocator_thread_running(ca))) - break; - } - - ca->allocator_state = ALLOCATOR_RUNNING; - - cond_resched(); - if (kthread_should_stop()) - break; - - pr_debug("discarding %zu invalidated buckets", - fifo_used(&ca->free_inc)); - - ret = discard_invalidated_buckets(c, ca); + ret = kthread_wait_freezable(allocator_thread_running(ca)); if (ret) goto stop; - down_read(&c->gc_lock); - - ret = bch2_invalidate_buckets(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; - } - - if (!fifo_empty(&ca->free_inc)) { - up_read(&c->gc_lock); - continue; - } - - pr_debug("free_inc now empty"); - - while (1) { + while (!ca->alloc_heap.used) { cond_resched(); - /* - * Find some buckets that we can invalidate, either - * they're completely unused, or only contain clean data - * that's been written back to the backing device or - * another cache tier - */ - pr_debug("scanning for reclaimable buckets"); + ret = kthread_wait_freezable(buckets_available(ca, gc_count)); + if (ret) + goto stop; + gc_count = c->gc_count; nr = find_reclaimable_buckets(c, ca); - pr_debug("found %zu buckets", nr); - - trace_alloc_batch(ca, nr, ca->alloc_heap.size); + trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, + ca->inc_gen_really_needs_gc); if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || ca->inc_gen_really_needs_gc) && @@ -1119,37 +978,24 @@ static int bch2_allocator_thread(void *arg) atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); } - - if (nr) - break; - - /* - * If we found any buckets, we have to invalidate them - 
* before we scan for more - but if we didn't find very - * many we may want to wait on more buckets being - * available so we don't spin: - */ - ret = wait_buckets_available(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; - } } - up_read(&c->gc_lock); + ret = bch2_invalidate_buckets(c, ca); + if (ret) + goto stop; - pr_debug("%zu buckets to invalidate", nr); + while (!fifo_empty(&ca->free_inc)) { + u64 b = fifo_peek(&ca->free_inc); - /* - * alloc_heap is now full of newly-invalidated buckets: next, - * write out the new bucket gens: - */ - } + discard_one_bucket(c, ca, b); + ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); + if (ret) + goto stop; + } + } stop: - pr_debug("alloc thread stopping (ret %i)", ret); - ca->allocator_state = ALLOCATOR_STOPPED; - closure_wake_up(&c->freelist_wait); + alloc_thread_set_state(ca, ALLOCATOR_stopped); return 0; } @@ -1158,7 +1004,7 @@ stop: void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; - u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; + u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; unsigned i, j; @@ -1201,8 +1047,6 @@ void bch2_recalc_capacity(struct bch_fs *c) dev_reserve *= ca->mi.bucket_size; - copygc_threshold += dev_reserve; - capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); @@ -1220,7 +1064,6 @@ void bch2_recalc_capacity(struct bch_fs *c) reserved_sectors = min(reserved_sectors, capacity); - c->copygc_threshold = copygc_threshold; c->capacity = capacity - reserved_sectors; c->bucket_size_max = bucket_size_max; @@ -1331,7 +1174,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) { if (ca->alloc_thread) closure_wait_event(&c->freelist_wait, - ca->allocator_state != ALLOCATOR_RUNNING); + ca->allocator_state != ALLOCATOR_running); } /* stop allocator thread: */ @@ -1385,7 +1228,4 @@ int bch2_dev_allocator_start(struct bch_dev *ca) void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - - c->pd_controllers_update_seconds = 5; - INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); } diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 6fededcd..ad15a806 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -6,6 +6,8 @@ #include "alloc_types.h" #include "debug.h" +extern const char * const bch2_allocator_states[]; + struct bkey_alloc_unpacked { u64 bucket; u8 dev; @@ -98,10 +100,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) rcu_read_lock(); p = rcu_dereference(ca->alloc_thread); - if (p) { + if (p) wake_up_process(p); - ca->allocator_state = ALLOCATOR_RUNNING; - } rcu_read_unlock(); } diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index a29d313d..412fed47 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -1,57 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Primary bucket allocation code - * * Copyright 2012 Google, Inc. * - * Allocation in bcache is done in terms of buckets: - * - * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in - * btree pointers - they must match for the pointer to be considered valid. - * - * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a - * bucket simply by incrementing its gen. 
- * - * The gens (along with the priorities; it's really the gens are important but - * the code is named as if it's the priorities) are written in an arbitrary list - * of buckets on disk, with a pointer to them in the journal header. - * - * When we invalidate a bucket, we have to write its new gen to disk and wait - * for that write to complete before we use it - otherwise after a crash we - * could have pointers that appeared to be good but pointed to data that had - * been overwritten. - * - * Since the gens and priorities are all stored contiguously on disk, we can - * batch this up: We fill up the free_inc list with freshly invalidated buckets, - * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. - * - * free_inc isn't the only freelist - if it was, we'd often have to sleep while - * priorities and gens were being written before we could allocate. c->free is a - * smaller freelist, and buckets on that list are always ready to be used. - * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * - * It's important to ensure that gens don't wrap around - with respect to - * either the oldest gen in the btree or the gen on disk. This is quite - * difficult to do in practice, but we explicitly guard against it anyways - if - * a bucket is in danger of wrapping around we simply skip invalidating it that - * time around, and we garbage collect or rewrite the priorities sooner than we - * would have otherwise. + * Foreground allocator code: allocate buckets from freelist, and allocate in + * sector granularity from writepoints. * * bch2_bucket_alloc() allocates a single bucket from a specific device. * * bch2_bucket_alloc_set() allocates one or more buckets from different devices * in a given filesystem. - * - * invalidate_buckets() drives all the processes described above. It's called - * from bch2_bucket_alloc() and a few other places that need to make sure free - * buckets are ready. - * - * invalidate_buckets_(lru|fifo)() find buckets that are available to be - * invalidated, and then invalidate them and stick them on the free_inc list - - * in either lru or fifo order. 
*/ #include "bcachefs.h" @@ -98,8 +55,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), - false, gc_pos_alloc(c, ob), 0); + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); ob->valid = false; ob->type = 0; @@ -109,7 +65,9 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) spin_lock(&c->freelist_lock); ob->freelist = c->open_buckets_freelist; c->open_buckets_freelist = ob - c->open_buckets; + c->open_buckets_nr_free++; + ca->nr_open_buckets--; spin_unlock(&c->freelist_lock); closure_wake_up(&c->open_buckets_wait); @@ -316,6 +274,7 @@ out: c->blocked_allocate = 0; } + ca->nr_open_buckets++; spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); @@ -680,11 +639,14 @@ static struct write_point *__writepoint_find(struct hlist_head *head, { struct write_point *wp; + rcu_read_lock(); hlist_for_each_entry_rcu(wp, head, node) if (wp->write_point == write_point) - return wp; - - return NULL; + goto out; + wp = NULL; +out: + rcu_read_unlock(); + return wp; } static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index be164d61..4a1cd8b7 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -10,6 +10,18 @@ struct ec_bucket_buf; +#define ALLOC_THREAD_STATES() \ + x(stopped) \ + x(running) \ + x(blocked) \ + x(blocked_full) + +enum allocator_states { +#define x(n) ALLOCATOR_##n, + ALLOC_THREAD_STATES() +#undef x +}; + enum alloc_reserve { RESERVE_BTREE_MOVINGGC = -2, RESERVE_BTREE = -1, diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 549cded6..aade5624 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -379,7 +379,6 @@ enum gc_phase { GC_PHASE_BTREE_reflink, GC_PHASE_PENDING_DELETE, - GC_PHASE_ALLOC, }; struct gc_pos { @@ -447,6 +446,7 @@ struct bch_dev { */ alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; + unsigned nr_open_buckets; open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_partial_nr; @@ -456,16 +456,7 @@ struct bch_dev { size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; - /* - * XXX: this should be an enum for allocator state, so as to include - * error state - */ - enum { - ALLOCATOR_STOPPED, - ALLOCATOR_RUNNING, - ALLOCATOR_BLOCKED, - ALLOCATOR_BLOCKED_FULL, - } allocator_state; + enum allocator_states allocator_state; alloc_heap alloc_heap; @@ -664,9 +655,6 @@ struct bch_fs { struct workqueue_struct *copygc_wq; /* ALLOCATION */ - struct delayed_work pd_controllers_update; - unsigned pd_controllers_update_seconds; - struct bch_devs_mask rw_devs[BCH_DATA_NR]; u64 capacity; /* sectors */ @@ -726,6 +714,9 @@ struct bch_fs { atomic_t kick_gc; unsigned long gc_count; + enum btree_id gc_gens_btree; + struct bpos gc_gens_pos; + /* * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * has been marked by GC. 
@@ -772,9 +763,8 @@ struct bch_fs { /* COPYGC */ struct task_struct *copygc_thread; copygc_heap copygc_heap; - struct bch_pd_controller copygc_pd; struct write_point copygc_write_point; - u64 copygc_threshold; + s64 copygc_wait; /* STRIPES: */ GENRADIX(struct stripe) stripes[2]; diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 6fe95b80..450b613d 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -98,12 +98,50 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ops[k.k->type].key_invalid(c, k); } +static unsigned bch2_key_types_allowed[] = { + [BKEY_TYPE_extents] = + (1U << KEY_TYPE_discard)| + (1U << KEY_TYPE_error)| + (1U << KEY_TYPE_extent)| + (1U << KEY_TYPE_reservation)| + (1U << KEY_TYPE_reflink_p)| + (1U << KEY_TYPE_inline_data), + [BKEY_TYPE_inodes] = + (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_generation), + [BKEY_TYPE_dirents] = + (1U << KEY_TYPE_hash_whiteout)| + (1U << KEY_TYPE_dirent), + [BKEY_TYPE_xattrs] = + (1U << KEY_TYPE_hash_whiteout)| + (1U << KEY_TYPE_xattr), + [BKEY_TYPE_alloc] = + (1U << KEY_TYPE_alloc)| + (1U << KEY_TYPE_alloc_v2), + [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_quota), + [BKEY_TYPE_stripes] = + (1U << KEY_TYPE_stripe), + [BKEY_TYPE_reflink] = + (1U << KEY_TYPE_reflink_v)| + (1U << KEY_TYPE_indirect_inline_data), + [BKEY_TYPE_btree] = + (1U << KEY_TYPE_btree_ptr)| + (1U << KEY_TYPE_btree_ptr_v2), +}; + const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type) { + unsigned key_types_allowed = (1U << KEY_TYPE_deleted)| + bch2_key_types_allowed[type] ; + if (k.k->u64s < BKEY_U64s) return "u64s too small"; + if (!(key_types_allowed & (1U << k.k->type))) + return "invalid key type for this btree"; + if (type == BKEY_TYPE_btree && bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 751920a5..536947cc 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -250,39 +250,54 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, bkey_reassemble(new, *k); - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); - - (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || - (!ptr->cached && - gen_cmp(ptr->gen, g->mark.gen) < 0); - })); + if (level) { + /* + * We don't want to drop btree node pointers - if the + * btree node isn't there anymore, the read path will + * sort it out: + */ + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + + ptr->gen = g->mark.gen; + } + } else { + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0); + })); again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct stripe *m = genradix_ptr(&c->stripes[true], - entry->stripe_ptr.idx); - union bch_extent_entry *next_ptr; - - bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) - if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) - goto found; - next_ptr 
= NULL; + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct stripe *m = genradix_ptr(&c->stripes[true], + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; found: - if (!next_ptr) { - bch_err(c, "aieee, found stripe ptr with no data ptr"); - continue; - } - - if (!m || !m->alive || - !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], - &next_ptr->ptr, - m->sectors)) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; + if (!next_ptr) { + bch_err(c, "aieee, found stripe ptr with no data ptr"); + continue; + } + + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } } } } @@ -301,10 +316,10 @@ fsck_err: static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, unsigned level, bool is_root, - struct bkey_s_c k, + struct bkey_s_c *k, u8 *max_stale, bool initial) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bkey_ptrs_c ptrs; const struct bch_extent_ptr *ptr; unsigned flags = BTREE_TRIGGER_GC| @@ -313,28 +328,29 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, if (initial) { BUG_ON(bch2_journal_seq_verify && - k.k->version.lo > journal_cur_seq(&c->journal)); + k->k->version.lo > journal_cur_seq(&c->journal)); - if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, + if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, "key version number higher than recorded: %llu > %llu", - k.k->version.lo, + k->k->version.lo, atomic64_read(&c->key_version))) - atomic64_set(&c->key_version, k.k->version.lo); + atomic64_set(&c->key_version, k->k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, "superblock not marked as containing replicas (type %u)", - k.k->type)) { - ret = bch2_mark_bkey_replicas(c, k); + k->k->type)) { + ret = bch2_mark_bkey_replicas(c, *k); if (ret) { bch_err(c, "error marking bkey replicas: %i", ret); goto err; } } - ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); } + ptrs = bch2_bkey_ptrs_c(*k); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr, true); @@ -345,7 +361,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); + bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags); fsck_err: err: if (ret) @@ -374,7 +390,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, - k, max_stale, initial); + &k, max_stale, initial); if (ret) break; @@ -396,12 +412,13 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - bool initial) + bool initial, bool metadata_only) { struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned depth = 
bch2_expensive_debug_checks ? 0 + unsigned depth = metadata_only ? 1 + : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -445,10 +462,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; - if (!btree_node_fake(b)) + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, - bkey_i_to_s_c(&b->key), - &max_stale, initial); + &k, &max_stale, initial); + } gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -474,7 +493,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, - k, &max_stale, true); + &k, &max_stale, true); if (ret) { bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); break; @@ -544,11 +563,13 @@ fsck_err: } static int bch2_gc_btree_init(struct bch_fs *c, - enum btree_id btree_id) + enum btree_id btree_id, + bool metadata_only) { struct btree *b; - unsigned target_depth = bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 + unsigned target_depth = metadata_only ? 1 + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; char buf[100]; @@ -575,10 +596,12 @@ static int bch2_gc_btree_init(struct bch_fs *c, if (b->c.level >= target_depth) ret = bch2_gc_btree_init_recurse(c, b, target_depth); - if (!ret) + if (!ret) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, - bkey_i_to_s_c(&b->key), - &max_stale, true); + &k, &max_stale, true); + } fsck_err: six_unlock_read(&b->c.lock); @@ -593,7 +616,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, bool initial) +static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) { enum btree_id ids[BTREE_ID_NR]; unsigned i; @@ -605,8 +628,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial) for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; int ret = initial - ? bch2_gc_btree_init(c, id) - : bch2_gc_btree(c, id, initial); + ? 
bch2_gc_btree_init(c, id, metadata_only) + : bch2_gc_btree(c, id, initial, metadata_only); if (ret) { bch_err(c, "%s: ret %i", __func__, ret); return ret; @@ -707,52 +730,6 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) } #endif -static void bch2_mark_allocator_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct open_bucket *ob; - size_t i, j, iter; - unsigned ci; - - percpu_down_read(&c->mark_lock); - - spin_lock(&c->freelist_lock); - gc_pos_set(c, gc_pos_alloc(c, NULL)); - - for_each_member_device(ca, c, ci) { - fifo_for_each_entry(i, &ca->free_inc, iter) - bch2_mark_alloc_bucket(c, ca, i, true, - gc_pos_alloc(c, NULL), - BTREE_TRIGGER_GC); - - - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - bch2_mark_alloc_bucket(c, ca, i, true, - gc_pos_alloc(c, NULL), - BTREE_TRIGGER_GC); - } - - spin_unlock(&c->freelist_lock); - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid) { - gc_pos_set(c, gc_pos_alloc(c, ob)); - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, - gc_pos_alloc(c, ob), - BTREE_TRIGGER_GC); - } - spin_unlock(&ob->lock); - } - - percpu_up_read(&c->mark_lock); -} - static void bch2_gc_free(struct bch_fs *c) { struct bch_dev *ca; @@ -775,10 +752,10 @@ static void bch2_gc_free(struct bch_fs *c) } static int bch2_gc_done(struct bch_fs *c, - bool initial) + bool initial, bool metadata_only) { struct bch_dev *ca; - bool verify = (!initial || + bool verify = !metadata_only && (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; int ret = 0; @@ -805,7 +782,7 @@ static int bch2_gc_done(struct bch_fs *c, if (dst->b[b].mark._f != src->b[b].mark._f) { \ if (verify) \ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", i, b, \ + ": got %u, should be %u", dev, b, \ dst->b[b].mark.gen, \ bch2_data_types[dst->b[b].mark.data_type],\ dst->b[b].mark._f, src->b[b].mark._f); \ @@ -813,11 +790,11 @@ static int bch2_gc_done(struct bch_fs *c, set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) \ - copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) + copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) 
\ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - { + if (!metadata_only) { struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; @@ -857,7 +834,6 @@ static int bch2_gc_done(struct bch_fs *c, for (b = 0; b < src->nbuckets; b++) { copy_bucket_field(gen); copy_bucket_field(data_type); - copy_bucket_field(owned_by_allocator); copy_bucket_field(stripe); copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); @@ -890,20 +866,28 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes,"nr_inodes"); - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], - "persistent_reserved[%i]", i); + if (!metadata_only) { + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); + } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); char buf[80]; + if (metadata_only && + (e->data_type == BCH_DATA_user || + e->data_type == BCH_DATA_cached)) + continue; + bch2_replicas_entry_to_text(&PBUF(buf), e); copy_fs_field(replicas[i], "%s", buf); @@ -921,7 +905,8 @@ fsck_err: return ret; } -static int bch2_gc_start(struct bch_fs *c) +static int bch2_gc_start(struct bch_fs *c, + bool metadata_only) { struct bch_dev *ca; unsigned i; @@ -985,6 +970,11 @@ static int bch2_gc_start(struct bch_fs *c) d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; d->gen_valid = s->gen_valid; + + if (metadata_only && + (s->mark.data_type == BCH_DATA_user || + s->mark.data_type == BCH_DATA_cached)) + d->_mark = s->mark; } }; @@ -1011,7 +1001,7 @@ static int bch2_gc_start(struct bch_fs *c) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, bool initial) +int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -1027,21 +1017,19 @@ int bch2_gc(struct bch_fs *c, bool initial) closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); again: - ret = bch2_gc_start(c); + ret = bch2_gc_start(c, metadata_only); if (ret) goto out; bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, initial); + ret = bch2_gc_btrees(c, initial, metadata_only); if (ret) goto out; #if 0 bch2_mark_pending_btree_node_frees(c); #endif - bch2_mark_allocator_buckets(c); - c->gc_count++; if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || @@ -1071,7 +1059,7 @@ out: bch2_journal_block(&c->journal); percpu_down_write(&c->mark_lock); - ret = bch2_gc_done(c, initial); + ret = bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); } else { @@ -1142,7 +1130,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) struct btree_iter *iter; struct bkey_s_c k; struct bkey_buf sk; - int ret = 0; + int ret = 0, commit_err = 0; bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); @@ -1154,18 +1142,20 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { - if (gc_btree_gens_key(c, k)) { + c->gc_gens_pos = iter->pos; + + if (gc_btree_gens_key(c, k) && !commit_err) { 
bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); bch2_trans_update(&trans, iter, sk.k, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - if (ret == -EINTR) + commit_err = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_NOFAIL); + if (commit_err == -EINTR) { + commit_err = 0; continue; - if (ret) { - break; } } @@ -1205,6 +1195,8 @@ int bch2_gc_gens(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if ((1 << i) & BTREE_ID_HAS_PTRS) { + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; ret = bch2_gc_btree_gens(c, i); if (ret) { bch_err(c, "error recalculating oldest_gen: %i", ret); @@ -1221,352 +1213,15 @@ int bch2_gc_gens(struct bch_fs *c) up_read(&ca->bucket_lock); } + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; + c->gc_count++; err: up_read(&c->gc_lock); return ret; } -/* Btree coalescing */ - -static void recalc_packed_keys(struct btree *b) -{ - struct bset *i = btree_bset_first(b); - struct bkey_packed *k; - - memset(&b->nr, 0, sizeof(b->nr)); - - BUG_ON(b->nsets != 1); - - vstruct_for_each(i, k) - btree_keys_account_key_add(&b->nr, 0, k); -} - -static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, - struct btree *old_nodes[GC_MERGE_NODES]) -{ - struct btree *parent = btree_node_parent(iter, old_nodes[0]); - unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; - unsigned blocks = btree_blocks(c) * 2 / 3; - struct btree *new_nodes[GC_MERGE_NODES]; - struct btree_update *as; - struct keylist keylist; - struct bkey_format_state format_state; - struct bkey_format new_format; - - memset(new_nodes, 0, sizeof(new_nodes)); - bch2_keylist_init(&keylist, NULL); - - /* Count keys that are not deleted */ - for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) - u64s += old_nodes[i]->nr.live_u64s; - - nr_old_nodes = nr_new_nodes = i; - - /* Check if all keys in @old_nodes could fit in one fewer node */ - if (nr_old_nodes <= 1 || - __vstruct_blocks(struct btree_node, c->block_bits, - DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) - return; - - /* Find a format that all keys in @old_nodes can pack into */ - bch2_bkey_format_init(&format_state); - - /* - * XXX: this won't correctly take it account the new min/max keys: - */ - for (i = 0; i < nr_old_nodes; i++) - __bch2_btree_calc_format(&format_state, old_nodes[i]); - - new_format = bch2_bkey_format_done(&format_state); - - /* Check if repacking would make any nodes too big to fit */ - for (i = 0; i < nr_old_nodes; i++) - if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { - trace_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_FORMAT_FITS); - return; - } - - if (bch2_keylist_realloc(&keylist, NULL, 0, - BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) { - trace_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); - return; - } - - as = bch2_btree_update_start(iter, old_nodes[0]->c.level, - btree_update_reserve_required(c, parent) + nr_old_nodes, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); - if (IS_ERR(as)) { - trace_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_RESERVE_GET); - bch2_keylist_free(&keylist, NULL); - return; - } - - trace_btree_gc_coalesce(c, old_nodes[0]); - - for (i = 0; i < nr_old_nodes; i++) - bch2_btree_interior_update_will_free_node(as, old_nodes[i]); - - /* Repack everything with @new_format and sort down to one bset */ - for (i = 0; i < nr_old_nodes; i++) - new_nodes[i] = - __bch2_btree_node_alloc_replacement(as, old_nodes[i], - new_format); - - /* - * Conceptually we concatenate 
the nodes together and slice them - * up at different boundaries. - */ - for (i = nr_new_nodes - 1; i > 0; --i) { - struct btree *n1 = new_nodes[i]; - struct btree *n2 = new_nodes[i - 1]; - - struct bset *s1 = btree_bset_first(n1); - struct bset *s2 = btree_bset_first(n2); - struct bkey_packed *k, *last = NULL; - - /* Calculate how many keys from @n2 we could fit inside @n1 */ - u64s = 0; - - for (k = s2->start; - k < vstruct_last(s2) && - vstruct_blocks_plus(n1->data, c->block_bits, - u64s + k->u64s) <= blocks; - k = bkey_next(k)) { - last = k; - u64s += k->u64s; - } - - if (u64s == le16_to_cpu(s2->u64s)) { - /* n2 fits entirely in n1 */ - n1->key.k.p = n1->data->max_key = n2->data->max_key; - - memcpy_u64s(vstruct_last(s1), - s2->start, - le16_to_cpu(s2->u64s)); - le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); - - set_btree_bset_end(n1, n1->set); - - six_unlock_write(&n2->c.lock); - bch2_btree_node_free_never_inserted(c, n2); - six_unlock_intent(&n2->c.lock); - - memmove(new_nodes + i - 1, - new_nodes + i, - sizeof(new_nodes[0]) * (nr_new_nodes - i)); - new_nodes[--nr_new_nodes] = NULL; - } else if (u64s) { - /* move part of n2 into n1 */ - n1->key.k.p = n1->data->max_key = - bkey_unpack_pos(n1, last); - - n2->data->min_key = bpos_successor(n1->data->max_key); - - memcpy_u64s(vstruct_last(s1), - s2->start, u64s); - le16_add_cpu(&s1->u64s, u64s); - - memmove(s2->start, - vstruct_idx(s2, u64s), - (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); - s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); - - set_btree_bset_end(n1, n1->set); - set_btree_bset_end(n2, n2->set); - } - } - - for (i = 0; i < nr_new_nodes; i++) { - struct btree *n = new_nodes[i]; - - recalc_packed_keys(n); - btree_node_reset_sib_u64s(n); - - bch2_btree_build_aux_trees(n); - - bch2_btree_update_add_new_node(as, n); - six_unlock_write(&n->c.lock); - - bch2_btree_node_write(c, n, SIX_LOCK_intent); - } - - /* - * The keys for the old nodes get deleted. We don't want to insert keys - * that compare equal to the keys for the new nodes we'll also be - * inserting - we can't because keys on a keylist must be strictly - * greater than the previous keys, and we also don't need to since the - * key for the new node will serve the same purpose (overwriting the key - * for the old node). 
- */ - for (i = 0; i < nr_old_nodes; i++) { - struct bkey_i delete; - unsigned j; - - for (j = 0; j < nr_new_nodes; j++) - if (!bpos_cmp(old_nodes[i]->key.k.p, - new_nodes[j]->key.k.p)) - goto next; - - bkey_init(&delete.k); - delete.k.p = old_nodes[i]->key.k.p; - bch2_keylist_add_in_order(&keylist, &delete); -next: - i = i; - } - - /* - * Keys for the new nodes get inserted: bch2_btree_insert_keys() only - * does the lookup once and thus expects the keys to be in sorted order - * so we have to make sure the new keys are correctly ordered with - * respect to the deleted keys added in the previous loop - */ - for (i = 0; i < nr_new_nodes; i++) - bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); - - /* Insert the newly coalesced nodes */ - bch2_btree_insert_node(as, parent, iter, &keylist, 0); - - BUG_ON(!bch2_keylist_empty(&keylist)); - - BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); - - bch2_btree_iter_node_replace(iter, new_nodes[0]); - - for (i = 0; i < nr_new_nodes; i++) - bch2_btree_update_get_open_buckets(as, new_nodes[i]); - - /* Free the old nodes and update our sliding window */ - for (i = 0; i < nr_old_nodes; i++) { - bch2_btree_node_free_inmem(c, old_nodes[i], iter); - - /* - * the index update might have triggered a split, in which case - * the nodes we coalesced - the new nodes we just created - - * might not be sibling nodes anymore - don't add them to the - * sliding window (except the first): - */ - if (!i) { - old_nodes[i] = new_nodes[i]; - } else { - old_nodes[i] = NULL; - } - } - - for (i = 0; i < nr_new_nodes; i++) - six_unlock_intent(&new_nodes[i]->c.lock); - - bch2_btree_update_done(as); - bch2_keylist_free(&keylist, NULL); -} - -static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct btree *b; - bool kthread = (current->flags & PF_KTHREAD) != 0; - unsigned i; - int ret = 0; - - /* Sliding window of adjacent btree nodes */ - struct btree *merge[GC_MERGE_NODES]; - u32 lock_seq[GC_MERGE_NODES]; - - bch2_trans_init(&trans, c, 0, 0); - - /* - * XXX: We don't have a good way of positively matching on sibling nodes - * that have the same parent - this code works by handling the cases - * where they might not have the same parent, and is thus fragile. Ugh. - * - * Perhaps redo this to use multiple linked iterators? 
- */ - memset(merge, 0, sizeof(merge)); - - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, - BTREE_MAX_DEPTH, 0, - BTREE_ITER_PREFETCH, b) { - memmove(merge + 1, merge, - sizeof(merge) - sizeof(merge[0])); - memmove(lock_seq + 1, lock_seq, - sizeof(lock_seq) - sizeof(lock_seq[0])); - - merge[0] = b; - - for (i = 1; i < GC_MERGE_NODES; i++) { - if (!merge[i] || - !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) - break; - - if (merge[i]->c.level != merge[0]->c.level) { - six_unlock_intent(&merge[i]->c.lock); - break; - } - } - memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); - - bch2_coalesce_nodes(c, iter, merge); - - for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { - lock_seq[i] = merge[i]->c.lock.state.seq; - six_unlock_intent(&merge[i]->c.lock); - } - - lock_seq[0] = merge[0]->c.lock.state.seq; - - if (kthread && kthread_should_stop()) { - ret = -ESHUTDOWN; - break; - } - - bch2_trans_cond_resched(&trans); - - /* - * If the parent node wasn't relocked, it might have been split - * and the nodes in our sliding window might not have the same - * parent anymore - blow away the sliding window: - */ - if (btree_iter_node(iter, iter->level + 1) && - !btree_node_intent_locked(iter, iter->level + 1)) - memset(merge + 1, 0, - (GC_MERGE_NODES - 1) * sizeof(merge[0])); - } - bch2_trans_iter_put(&trans, iter); - - return bch2_trans_exit(&trans) ?: ret; -} - -/** - * bch_coalesce - coalesce adjacent nodes with low occupancy - */ -void bch2_coalesce(struct bch_fs *c) -{ - enum btree_id id; - - down_read(&c->gc_lock); - trace_gc_coalesce_start(c); - - for (id = 0; id < BTREE_ID_NR; id++) { - int ret = c->btree_roots[id].b - ? bch2_coalesce_btree(c, id) - : 0; - - if (ret) { - if (ret != -ESHUTDOWN) - bch_err(c, "btree coalescing failed: %d", ret); - return; - } - } - - trace_gc_coalesce_end(c); - up_read(&c->gc_lock); -} - static int bch2_gc_thread(void *arg) { struct bch_fs *c = arg; diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index b1362a9f..e9a87394 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -4,9 +4,7 @@ #include "btree_types.h" -void bch2_coalesce(struct bch_fs *); - -int bch2_gc(struct bch_fs *, bool); +int bch2_gc(struct bch_fs *, bool, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); @@ -92,14 +90,6 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id) return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); } -static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) -{ - return (struct gc_pos) { - .phase = GC_PHASE_ALLOC, - .pos = POS(ob ? 
ob - c->open_buckets : 0, 0), - }; -} - static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 4a2f5726..c8d8df96 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1057,14 +1057,17 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; + char buf[200]; int ret; + btree_pos_to_text(&PBUF(buf), c, b); trace_btree_read(c, b); ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick); if (bch2_fs_fatal_err_on(ret <= 0, c, - "btree node read error: no device to read from")) { + "btree node read error: no device to read from\n" + " at %s", buf)) { set_btree_node_read_error(b); return; } @@ -1337,13 +1340,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, return ret; } -static void btree_write_submit(struct work_struct *work) -{ - struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); - - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); -} - void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) { struct btree_write_bio *wbio; @@ -1351,6 +1347,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; + struct bkey_buf k; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; @@ -1361,6 +1358,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) bool validate_before_checksum = false; void *data; + bch2_bkey_buf_init(&k); + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1537,7 +1536,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) wbio_init(&wbio->wbio.bio); wbio->data = data; wbio->bytes = bytes; - wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; wbio->wbio.bio.bi_end_io = btree_node_write_endio; @@ -1560,9 +1558,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) * just make all btree node writes FUA to keep things sane. 
*/ - bkey_copy(&wbio->key, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr) ptr->offset += b->written; b->written += sectors_to_write; @@ -1570,8 +1568,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) atomic64_inc(&c->btree_writes_nr); atomic64_add(sectors_to_write, &c->btree_writes_sectors); - INIT_WORK(&wbio->work, btree_write_submit); - schedule_work(&wbio->work); + /* XXX: submitting IO with btree locks held: */ + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); + bch2_bkey_buf_exit(&k, c); return; err: set_btree_node_noevict(b); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index c8a8b05a..95c35161 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -42,7 +42,6 @@ struct btree_read_bio { struct btree_write_bio { struct work_struct work; - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); void *data; unsigned bytes; struct bch_write_bio wbio; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index adcb0ee4..c8f527bc 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -260,13 +260,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - __fls(linked->nodes_locked) + 1); - if (!btree_iter_get_locks(linked, true, false)) { - deadlock_iter = linked; - reason = 1; - } + deadlock_iter = linked; + reason = 1; } if (linked->btree_id != iter->btree_id) { @@ -295,14 +290,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - linked->locks_want = - max(level + 1, max_t(unsigned, - linked->locks_want, - iter->locks_want)); - if (!btree_iter_get_locks(linked, true, false)) { - deadlock_iter = linked; - reason = 5; - } + deadlock_iter = linked; + reason = 5; } /* Must lock btree nodes in key order: */ @@ -311,27 +300,19 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, btree_iter_type(linked))) <= 0) { deadlock_iter = linked; reason = 7; - } - - /* - * Recheck if this is a node we already have locked - since one - * of the get_locks() calls might've successfully - * upgraded/relocked it: - */ - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) >= type) { - six_lock_increment(&b->c.lock, type); - return true; + BUG_ON(trans->in_traverse_all); } } if (unlikely(deadlock_iter)) { trace_trans_restart_would_deadlock(iter->trans->ip, ip, - reason, + trans->in_traverse_all, reason, deadlock_iter->btree_id, btree_iter_type(deadlock_iter), + &deadlock_iter->real_pos, iter->btree_id, - btree_iter_type(iter)); + btree_iter_type(iter), + &pos); return false; } @@ -409,12 +390,27 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, return true; /* - * Ancestor nodes must be locked before child nodes, so set locks_want - * on iterators that might lock ancestors before us to avoid getting - * -EINTR later: + * XXX: this is ugly - we'd prefer to not be mucking with other + * iterators in the btree_trans here. + * + * On failure to upgrade the iterator, setting iter->locks_want and + * calling get_locks() is sufficient to make bch2_btree_iter_traverse() + * get the locks we want on transaction restart. 
+ * + * But if this iterator was a clone, on transaction restart what we did + * to this iterator isn't going to be preserved. + * + * Possibly we could add an iterator field for the parent iterator when + * an iterator is a copy - for now, we'll just upgrade any other + * iterators with the same btree id. + * + * The code below used to be needed to ensure ancestor nodes get locked + * before interior nodes - now that's handled by + * bch2_btree_iter_traverse_all(). */ trans_for_each_iter(iter->trans, linked) if (linked != iter && + btree_iter_type(linked) == btree_iter_type(iter) && linked->btree_id == iter->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; @@ -1184,7 +1180,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) struct bch_fs *c = trans->c; struct btree_iter *iter; u8 sorted[BTREE_ITER_MAX]; - unsigned i, nr_sorted = 0; + int i, nr_sorted = 0; + bool relock_fail; if (trans->in_traverse_all) return -EINTR; @@ -1192,15 +1189,36 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) trans->in_traverse_all = true; retry_all: nr_sorted = 0; + relock_fail = false; - trans_for_each_iter(trans, iter) + trans_for_each_iter(trans, iter) { + if (!bch2_btree_iter_relock(iter, true)) + relock_fail = true; sorted[nr_sorted++] = iter->idx; + } + + if (!relock_fail) { + trans->in_traverse_all = false; + return 0; + } #define btree_iter_cmp_by_idx(_l, _r) \ btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); #undef btree_iter_cmp_by_idx + + for (i = nr_sorted - 2; i >= 0; --i) { + struct btree_iter *iter1 = trans->iters + sorted[i]; + struct btree_iter *iter2 = trans->iters + sorted[i + 1]; + + if (iter1->btree_id == iter2->btree_id && + iter1->locks_want < iter2->locks_want) + __bch2_btree_iter_upgrade(iter1, iter2->locks_want); + else if (!iter1->locks_want && iter2->locks_want) + __bch2_btree_iter_upgrade(iter1, 1); + } + bch2_trans_unlock(trans); cond_resched(); @@ -1250,6 +1268,8 @@ out: bch2_btree_cache_cannibalize_unlock(c); trans->in_traverse_all = false; + + trace_trans_traverse_all(trans->ip); return ret; } @@ -2009,10 +2029,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, if (iter->btree_id != btree_id) continue; - if (best && - bkey_cmp(bpos_diff(best->real_pos, pos), - bpos_diff(iter->real_pos, pos)) > 0) - continue; + if (best) { + int cmp = bkey_cmp(bpos_diff(best->real_pos, pos), + bpos_diff(iter->real_pos, pos)); + + if (cmp < 0 || + ((cmp == 0 && btree_iter_keep(trans, iter)))) + continue; + } best = iter; } @@ -2040,13 +2064,18 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, iter->snapshot = pos.snapshot; - locks_want = min(locks_want, BTREE_MAX_DEPTH); + /* + * If the iterator has locks_want greater than requested, we explicitly + * do not downgrade it here - on transaction restart because btree node + * split needs to upgrade locks, we might be putting/getting the + * iterator again. Downgrading iterators only happens via an explicit + * bch2_trans_downgrade(). 
+ */ + locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > iter->locks_want) { iter->locks_want = locks_want; btree_iter_get_locks(iter, true, false); - } else if (locks_want < iter->locks_want) { - __bch2_btree_iter_downgrade(iter, locks_want); } while (iter->level < depth) { @@ -2108,37 +2137,28 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, return iter; } -static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) +void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - if (size > trans->mem_bytes) { + size_t new_top = trans->mem_top + size; + void *p; + + if (new_top > trans->mem_bytes) { size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(size); + size_t new_bytes = roundup_pow_of_two(new_top); void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); if (!new_mem) - return -ENOMEM; + return ERR_PTR(-ENOMEM); trans->mem = new_mem; trans->mem_bytes = new_bytes; if (old_bytes) { - trace_trans_restart_mem_realloced(trans->ip, new_bytes); - return -EINTR; + trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); + return ERR_PTR(-EINTR); } } - return 0; -} - -void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -{ - void *p; - int ret; - - ret = bch2_trans_preload_mem(trans, trans->mem_top + size); - if (ret) - return ERR_PTR(ret); - p = trans->mem + trans->mem_top; trans->mem_top += size; return p; @@ -2188,7 +2208,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) if (!(flags & TRANS_RESET_NOUNLOCK)) bch2_trans_cond_resched(trans); - if (!(flags & TRANS_RESET_NOTRAVERSE)) + if (!(flags & TRANS_RESET_NOTRAVERSE) && + trans->iters_linked) bch2_btree_iter_traverse_all(trans); } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 07d9b6d3..2f63adb9 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -187,7 +187,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l, { return cmp_int(l->btree_id, r->btree_id) ?: -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: - bkey_cmp(l->pos, r->pos); + bkey_cmp(l->real_pos, r->real_pos); } /* diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index f5160d4f..afdcc98d 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -222,18 +222,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - struct bch_fs *c = trans->c; - - if (bch2_debug_check_bkeys) { - const char *invalid = bch2_bkey_invalid(c, - bkey_i_to_s_c(i->k), i->bkey_type); - if (invalid) { - char buf[200]; - - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - panic("invalid bkey %s on insert: %s\n", buf, invalid); - } - } BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); BUG_ON(i->level != i->iter->level); BUG_ON(i->btree_id != i->iter->btree_id); @@ -319,8 +307,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, } static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) + struct btree_insert_entry *i) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; @@ -329,20 +316,22 @@ static inline void do_btree_insert_one(struct btree_trans *trans, EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - insert->k.needs_whiteout = false; + i->k->k.needs_whiteout = false; - did_work = 
(btree_iter_type(iter) != BTREE_ITER_CACHED) - ? btree_insert_key_leaf(trans, iter, insert) - : bch2_btree_insert_key_cached(trans, iter, insert); + did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED) + ? btree_insert_key_leaf(trans, i->iter, i->k) + : bch2_btree_insert_key_cached(trans, i->iter, i->k); if (!did_work) return; if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { bch2_journal_add_keys(j, &trans->journal_res, - iter->btree_id, insert); + i->btree_id, + i->level, + i->k); bch2_journal_set_has_inode(j, &trans->journal_res, - insert->k.p.inode); + i->k->k.p.inode); if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; @@ -480,7 +469,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, bch2_trans_mark_gc(trans); trans_for_each_update2(trans, i) - do_btree_insert_one(trans, i->iter, i->k); + do_btree_insert_one(trans, i); err: if (marking) { percpu_up_read(&c->mark_lock); @@ -592,9 +581,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } } - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_update2(trans, i) - btree_insert_entry_checks(trans, i); + trans_for_each_update2(trans, i) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid); + bch2_fatal_error(c); + } + btree_insert_entry_checks(trans, i); + } bch2_btree_trans_verify_locks(trans); trans_for_each_update2(trans, i) @@ -629,25 +627,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, static int journal_reclaim_wait_done(struct bch_fs *c) { - int ret; - - ret = bch2_journal_error(&c->journal); - if (ret) - return ret; - - ret = !bch2_btree_key_cache_must_wait(c); - if (ret) - return ret; - - journal_reclaim_kick(&c->journal); - - if (mutex_trylock(&c->journal.reclaim_lock)) { - ret = bch2_journal_reclaim(&c->journal); - mutex_unlock(&c->journal.reclaim_lock); - } + int ret = bch2_journal_error(&c->journal) ?: + !bch2_btree_key_cache_must_wait(c); if (!ret) - ret = !bch2_btree_key_cache_must_wait(c); + journal_reclaim_kick(&c->journal); return ret; } @@ -735,10 +719,12 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - wait_event(c->journal.reclaim_wait, - (ret = journal_reclaim_wait_done(c))); + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) + return ret; - if (!ret && bch2_trans_relock(trans)) + if (bch2_trans_relock(trans)) return 0; trace_trans_restart_journal_reclaim(trans->ip); @@ -1151,8 +1137,7 @@ int __bch2_btree_insert(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, 0); + ret = bch2_trans_update(trans, iter, k, 0); bch2_trans_iter_put(trans, iter); return ret; } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index f0c3bbc7..6b99f127 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -3,64 +3,6 @@ * Code for manipulating bucket marks for garbage collection. * * Copyright 2014 Datera, Inc. 
- * - * Bucket states: - * - free bucket: mark == 0 - * The bucket contains no data and will not be read - * - * - allocator bucket: owned_by_allocator == 1 - * The bucket is on a free list, or it is an open bucket - * - * - cached bucket: owned_by_allocator == 0 && - * dirty_sectors == 0 && - * cached_sectors > 0 - * The bucket contains data but may be safely discarded as there are - * enough replicas of the data on other cache devices, or it has been - * written back to the backing device - * - * - dirty bucket: owned_by_allocator == 0 && - * dirty_sectors > 0 - * The bucket contains data that we must not discard (either only copy, - * or one of the 'main copies' for data requiring multiple replicas) - * - * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 - * This is a btree node, journal or gen/prio bucket - * - * Lifecycle: - * - * bucket invalidated => bucket on freelist => open bucket => - * [dirty bucket =>] cached bucket => bucket invalidated => ... - * - * Note that cache promotion can skip the dirty bucket step, as data - * is copied from a deeper tier to a shallower tier, onto a cached - * bucket. - * Note also that a cached bucket can spontaneously become dirty -- - * see below. - * - * Only a traversal of the key space can determine whether a bucket is - * truly dirty or cached. - * - * Transitions: - * - * - free => allocator: bucket was invalidated - * - cached => allocator: bucket was invalidated - * - * - allocator => dirty: open bucket was filled up - * - allocator => cached: open bucket was filled up - * - allocator => metadata: metadata was allocated - * - * - dirty => cached: dirty sectors were copied to a deeper tier - * - dirty => free: dirty sectors were overwritten or moved (copy gc) - * - cached => free: cached sectors were overwritten - * - * - metadata => free: metadata was freed - * - * Oddities: - * - cached => dirty: a device was removed so formerly replicated data - * is no longer sufficiently replicated - * - free => cached: cannot happen - * - free => dirty: cannot happen - * - free => metadata: cannot happen */ #include "bcachefs.h" @@ -229,7 +171,7 @@ struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) percpu_down_read(&c->mark_lock); ret = kmalloc(sizeof(struct bch_fs_usage_online) + - sizeof(u64) + c->replicas.nr, GFP_NOFS); + sizeof(u64) * c->replicas.nr, GFP_NOFS); if (unlikely(!ret)) { percpu_up_read(&c->mark_lock); return NULL; @@ -538,33 +480,17 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, ret; \ }) -static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator, - bool gc) +void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator) { - struct bucket *g = __bucket(ca, b, gc); + struct bucket *g = bucket(ca, b); struct bucket_mark old, new; old = bucket_cmpxchg(g, new, ({ new.owned_by_allocator = owned_by_allocator; })); - BUG_ON(!gc && - !owned_by_allocator && !old.owned_by_allocator); - - return 0; -} - -void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, bool owned_by_allocator, - struct gc_pos pos, unsigned flags) -{ - preempt_disable(); - - do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, - ca, b, owned_by_allocator); - - preempt_enable(); + BUG_ON(owned_by_allocator == old.owned_by_allocator); } static int bch2_mark_alloc(struct bch_fs *c, @@ -1890,10 +1816,11 @@ int bch2_trans_mark_update(struct btree_trans *trans, return 0; if 
(!btree_node_type_is_extents(iter->btree_id)) { - /* iterators should be uptodate, shouldn't get errors here: */ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { old = bch2_btree_iter_peek_slot(iter); - BUG_ON(bkey_err(old)); + ret = bkey_err(old); + if (ret) + return ret; } else { struct bkey_cached *ck = (void *) iter->l[0].b; @@ -2004,22 +1931,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, goto out; } - if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n" - "while marking %s", - iter->pos.inode, iter->pos.offset, u.gen, - bch2_data_types[u.data_type ?: type], - u.dirty_sectors, sectors, ca->mi.bucket_size, - bch2_data_types[type]); - ret = -EIO; - goto out; - } - - if (u.data_type == type && - u.dirty_sectors == sectors) - goto out; - u.data_type = type; u.dirty_sectors = sectors; @@ -2031,53 +1942,44 @@ out: } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct disk_reservation *res, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors) { - return __bch2_trans_do(trans, res, NULL, 0, - __bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal, - ca->mi.bucket_size)); - + return __bch2_trans_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); } static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct disk_reservation *res, struct bch_dev *ca, u64 start, u64 end, enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors) { - int ret; - do { u64 b = sector_to_bucket(ca, start); unsigned sectors = min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - if (b != *bucket) { - if (*bucket_sectors) { - ret = bch2_trans_mark_metadata_bucket(trans, res, ca, - *bucket, type, *bucket_sectors); - if (ret) - return ret; - } + if (b != *bucket && *bucket_sectors) { + int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, + type, *bucket_sectors); + if (ret) + return ret; - *bucket = b; - *bucket_sectors = 0; + *bucket_sectors = 0; } + *bucket = b; *bucket_sectors += sectors; start += sectors; - } while (!ret && start < end); + } while (start < end); return 0; } static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, - struct disk_reservation *res, - struct bch_dev *ca) + struct bch_dev *ca) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; u64 bucket = 0; @@ -2088,14 +1990,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, u64 offset = le64_to_cpu(layout->sb_offset[i]); if (offset == BCH_SB_SECTOR) { - ret = bch2_trans_mark_metadata_sectors(trans, res, ca, + ret = bch2_trans_mark_metadata_sectors(trans, ca, 0, BCH_SB_SECTOR, BCH_DATA_sb, &bucket, &bucket_sectors); if (ret) return ret; } - ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, + ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, offset + (1 << layout->sb_max_size_bits), BCH_DATA_sb, &bucket, &bucket_sectors); if (ret) @@ -2103,14 +2005,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, } if (bucket_sectors) { - ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + ret = bch2_trans_mark_metadata_bucket(trans, ca, bucket, BCH_DATA_sb, bucket_sectors); if (ret) return ret; } for (i = 0; i < ca->journal.nr; i++) { - ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + ret = bch2_trans_mark_metadata_bucket(trans, ca, ca->journal.buckets[i], BCH_DATA_journal, ca->mi.bucket_size); 
if (ret) @@ -2120,12 +2022,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, return 0; } -int bch2_trans_mark_dev_sb(struct bch_fs *c, - struct disk_reservation *res, - struct bch_dev *ca) +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - return bch2_trans_do(c, res, NULL, 0, - __bch2_trans_mark_dev_sb(&trans, res, ca)); + return bch2_trans_do(c, NULL, NULL, 0, + __bch2_trans_mark_dev_sb(&trans, ca)); } /* Disk reservations: */ diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 297b04b2..7463e642 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -191,6 +191,7 @@ static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, for (i = 0; i < RESERVE_NR; i++) available -= fifo_used(&ca->free[i]); available -= fifo_used(&ca->free_inc); + available -= ca->nr_open_buckets; spin_unlock(&c->freelist_lock); return max(available, 0LL); @@ -234,8 +235,7 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, - size_t, bool, struct gc_pos, unsigned); +void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); @@ -252,11 +252,9 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); -int bch2_trans_mark_metadata_bucket(struct btree_trans *, - struct disk_reservation *, struct bch_dev *, - size_t, enum bch_data_type, unsigned); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, - struct bch_dev *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); /* disk reservations: */ diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 588b1a72..b2de2995 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -59,6 +59,11 @@ struct bch_dev_usage { struct { u64 buckets; u64 sectors; /* _compressed_ sectors: */ + /* + * XXX + * Why do we have this? Isn't it just buckets * bucket_size - + * sectors? 
+ */ u64 fragmented; } d[BCH_DATA_NR]; }; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 1a94e7f7..dda3608d 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -2619,54 +2619,21 @@ err: return ret; } -static long bchfs_fallocate(struct bch_inode_info *inode, int mode, - loff_t offset, loff_t len) +static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + u64 start_sector, u64 end_sector) { - struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; struct btree_iter *iter; - struct bpos end_pos; - loff_t end = offset + len; - loff_t block_start = round_down(offset, block_bytes(c)); - loff_t block_end = round_up(end, block_bytes(c)); - unsigned sectors; + struct bpos end_pos = POS(inode->v.i_ino, end_sector); unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; - int ret; + int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - - if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { - ret = inode_newsize_ok(&inode->v, end); - if (ret) - goto err; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - offset, end); - - if (!ret && - offset >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - offset, end); - - if (unlikely(ret)) - goto err; - - truncate_pagecache_range(&inode->v, offset, end - 1); - } - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(inode->v.i_ino, block_start >> 9), + POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - end_pos = POS(inode->v.i_ino, block_end >> 9); while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { s64 i_sectors_delta = 0; @@ -2674,6 +2641,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; struct bkey_s_c k; + unsigned sectors; bch2_trans_begin(&trans); @@ -2734,7 +2702,48 @@ bkey_err: ret = 0; } bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; +} +static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->v.i_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + loff_t end = offset + len; + loff_t block_start = round_down(offset, block_bytes(c)); + loff_t block_end = round_up(end, block_bytes(c)); + int ret; + + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { + ret = inode_newsize_ok(&inode->v, end); + if (ret) + goto err; + } + + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = __bch2_truncate_page(inode, + offset >> PAGE_SHIFT, + offset, end); + + if (!ret && + offset >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode, + end >> PAGE_SHIFT, + offset, end); + + if (unlikely(ret)) + goto err; + + truncate_pagecache_range(&inode->v, offset, end - 1); + } + + ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); if (ret) goto err; @@ -2748,28 +2757,13 @@ bkey_err: if (end >= inode->v.i_size && (!(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_ZERO_RANGE))) { - struct btree_iter *inode_iter; - struct bch_inode_unpacked inode_u; - - do { - bch2_trans_begin(&trans); - inode_iter = bch2_inode_peek(&trans, &inode_u, - inode->v.i_ino, 0); - ret = 
PTR_ERR_OR_ZERO(inode_iter); - } while (ret == -EINTR); - - bch2_trans_iter_put(&trans, inode_iter); - bch2_trans_unlock(&trans); - - if (ret) - goto err; /* * Sync existing appends before extending i_size, * as in bch2_extend(): */ ret = filemap_write_and_wait_range(mapping, - inode_u.bi_size, S64_MAX); + inode->ei_inode.bi_size, S64_MAX); if (ret) goto err; @@ -2783,7 +2777,6 @@ bkey_err: mutex_unlock(&inode->ei_update_lock); } err: - bch2_trans_exit(&trans); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 6280b357..eb8ac164 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -81,51 +81,37 @@ static int write_inode(struct btree_trans *trans, return ret; } -static int __remove_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent dirent) +static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; - struct qstr name; + struct btree_iter *iter; struct bch_inode_unpacked dir_inode; struct bch_hash_info dir_hash_info; - u64 dir_inum = dirent.k->p.inode; int ret; - char *buf; - - name.len = bch2_dirent_name_bytes(dirent); - buf = bch2_trans_kmalloc(trans, name.len + 1); - if (IS_ERR(buf)) - return PTR_ERR(buf); - memcpy(buf, dirent.v->d_name, name.len); - buf[name.len] = '\0'; - name.name = buf; - - ret = lookup_inode(trans, dir_inum, &dir_inode, NULL); - if (ret && ret != -EINTR) - bch_err(c, "remove_dirent: err %i looking up directory inode", ret); + ret = lookup_inode(trans, pos.inode, &dir_inode, NULL); if (ret) return ret; dir_hash_info = bch2_hash_info_init(c, &dir_inode); - ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, - &dir_hash_info, dir_inum, &name); - if (ret && ret != -EINTR) - bch_err(c, "remove_dirent: err %i deleting dirent", ret); - if (ret) - return ret; + iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); - return 0; + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, iter); + bch2_trans_iter_put(trans, iter); + return ret; } -static int remove_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent dirent) +static int remove_dirent(struct btree_trans *trans, struct bpos pos) { - return __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __remove_dirent(trans, dirent)); + int ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __remove_dirent(trans, pos)); + if (ret) + bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret); + return ret; } static int __reattach_inode(struct btree_trans *trans, @@ -173,13 +159,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *lostfound, u64 inum) { - struct bch_fs *c = trans->c; - int ret; - - ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, __reattach_inode(trans, lostfound, inum)); if (ret) - bch_err(c, "error %i reattaching inode %llu", ret, inum); + bch_err(trans->c, "error %i reattaching inode %llu", ret, inum); return ret; } @@ -202,7 +185,7 @@ static int remove_backpointer(struct btree_trans *trans, goto out; } - ret = remove_dirent(trans, bkey_s_c_to_dirent(k)); + ret = remove_dirent(trans, k.k->p); out: bch2_trans_iter_put(trans, iter); return ret; @@ -752,7 +735,7 @@ retry: "dirent points to missing inode:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = remove_dirent(&trans, d); + ret = remove_dirent(&trans, 
d.k->p); if (ret) goto err; goto next; @@ -783,7 +766,7 @@ retry: backpointer_exists, c, "directory %llu with multiple links", target.bi_inum)) { - ret = remove_dirent(&trans, d); + ret = remove_dirent(&trans, d.k->p); if (ret) goto err; continue; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index b901be5b..1b49a1c3 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -787,7 +787,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, * We may be called from the device add path, before the new device has * actually been added to the running filesystem: */ - if (c) + if (!new_fs) spin_lock(&c->journal.lock); memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); @@ -795,17 +795,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); - if (c) + if (!new_fs) spin_unlock(&c->journal.lock); while (ja->nr < nr) { struct open_bucket *ob = NULL; unsigned pos; - long bucket; + long b; if (new_fs) { - bucket = bch2_bucket_alloc_new_fs(ca); - if (bucket < 0) { + b = bch2_bucket_alloc_new_fs(ca); + if (b < 0) { ret = -ENOSPC; goto err; } @@ -819,10 +819,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } - bucket = sector_to_bucket(ca, ob->ptr.offset); - } + b = sector_to_bucket(ca, ob->ptr.offset); - if (c) { percpu_down_read(&c->mark_lock); spin_lock(&c->journal.lock); } @@ -839,9 +837,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, __array_insert_item(journal_buckets->buckets, ja->nr, pos); ja->nr++; - ja->buckets[pos] = bucket; + ja->buckets[pos] = b; ja->bucket_seq[pos] = 0; - journal_buckets->buckets[pos] = cpu_to_le64(bucket); + journal_buckets->buckets[pos] = cpu_to_le64(b); if (pos <= ja->discard_idx) ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -852,28 +850,25 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - if (!c || new_fs) - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, + if (new_fs) { + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), 0); - - if (c) { + } else { spin_unlock(&c->journal.lock); percpu_up_read(&c->mark_lock); - } - if (c && !new_fs) ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_trans_mark_metadata_bucket(&trans, NULL, ca, - bucket, BCH_DATA_journal, + bch2_trans_mark_metadata_bucket(&trans, ca, + b, BCH_DATA_journal, ca->mi.bucket_size)); - if (!new_fs) bch2_open_bucket_put(c, ob); - if (ret) - goto err; + if (ret) + goto err; + } } err: bch2_sb_resize_journal(&ca->disk_sb, diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index cc497125..1d556790 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -241,10 +241,11 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res } static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, - enum btree_id id, const struct bkey_i *k) + enum btree_id id, unsigned level, + const struct bkey_i *k) { bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, - id, 0, k, k->k.u64s); + id, level, k, k->k.u64s); } static inline bool journal_entry_empty(struct jset *j) diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 7be6c65c..f117d361 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -599,7 +599,7 @@ static int 
__bch2_journal_reclaim(struct journal *j, bool direct) struct bch_fs *c = container_of(j, struct bch_fs, journal); bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush; - size_t min_nr, nr_flushed; + size_t min_nr, min_key_cache, nr_flushed; unsigned flags; int ret = 0; @@ -649,9 +649,10 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL); + nr_flushed = journal_flush_pins(j, seq_to_flush, - min_nr, - min(bch2_nr_btree_keys_need_flush(c), 128UL)); + min_nr, min_key_cache); if (direct) j->nr_direct_reclaim += nr_flushed; @@ -661,7 +662,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (nr_flushed) wake_up(&j->reclaim_wait); - } while (min_nr && nr_flushed && !direct); + } while ((min_nr || min_key_cache) && !direct); memalloc_noreclaim_restore(flags); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 5b108490..aa8e8c25 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -68,7 +68,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_bkey_buf_init(&_insert); bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 71db7ae6..80772cff 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -108,7 +108,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) spin_lock(&ca->fs->freelist_lock); ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || - ca->allocator_state != ALLOCATOR_RUNNING; + ca->allocator_state != ALLOCATOR_running; spin_unlock(&ca->fs->freelist_lock); return ret; @@ -222,7 +222,7 @@ static int bch2_copygc(struct bch_fs *c) ret = bch2_move_data(c, 0, POS_MIN, BTREE_ID_NR, POS_MAX, - &c->copygc_pd.rate, + NULL, writepoint_ptr(&c->copygc_write_point), copygc_pred, NULL, &move_stats); @@ -282,8 +282,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { struct bch_dev *ca; unsigned dev_idx; - u64 fragmented_allowed = c->copygc_threshold; - u64 fragmented = 0; + u64 fragmented_allowed = 0, fragmented = 0; for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); @@ -312,11 +311,14 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { + c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); continue; } + c->copygc_wait = 0; + if (bch2_copygc(c)) break; } @@ -326,9 +328,6 @@ static int bch2_copygc_thread(void *arg) void bch2_copygc_stop(struct bch_fs *c) { - c->copygc_pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&c->copygc_pd.rate); - if (c->copygc_thread) { kthread_stop(c->copygc_thread); put_task_struct(c->copygc_thread); @@ -365,6 +364,4 @@ int bch2_copygc_start(struct bch_fs *c) void bch2_fs_copygc_init(struct bch_fs *c) { - bch2_pd_controller_init(&c->copygc_pd); - c->copygc_pd.d_term = 0; } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index b5cc0e64..2dc3dee4 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1005,13 +1005,6 @@ int bch2_fs_recovery(struct bch_fs *c) } - if (!c->sb.clean && - !(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { - bch_info(c, "BCH_FEATURE_atomic_nlink not set and filesystem 
dirty, fsck required"); - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } - if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { bch_info(c, "alloc_v2 feature bit not set, fsck required"); c->opts.fsck = true; @@ -1145,9 +1138,11 @@ use_clean: !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bool metadata_only = c->opts.norecovery; + bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, true); + ret = bch2_gc(c, true, metadata_only); if (ret) goto err; bch_verbose(c, "mark and sweep done"); @@ -1245,8 +1240,8 @@ use_clean: } if (c->opts.fsck && - !test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + !test_bit(BCH_FS_ERROR, &c->flags) && + BCH_SB_HAS_ERRORS(c->disk_sb.sb)) { SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); write_sb = true; } @@ -1338,10 +1333,12 @@ int bch2_fs_initialize(struct bch_fs *c) * Write out the superblock and journal buckets, now that we can do * btree updates */ - err = "error writing alloc info"; - ret = bch2_alloc_write(c, 0); - if (ret) - goto err; + err = "error marking superblock and journal"; + for_each_member_device(ca, c, i) { + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) + goto err; + } bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 1e297171..4128a1b3 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -313,8 +313,8 @@ static int replicas_table_update(struct bch_fs *c, out: free_percpu(new_gc); kfree(new_scratch); - free_percpu(new_usage[1]); - free_percpu(new_usage[0]); + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + free_percpu(new_usage[i]); kfree(new_base); return ret; err: diff --git a/libbcachefs/super.c b/libbcachefs/super.c index b2a2614b..61fd1144 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -286,7 +286,6 @@ void bch2_fs_read_only(struct bch_fs *c) percpu_ref_kill(&c->writes); cancel_work_sync(&c->ec_stripe_delete_work); - cancel_delayed_work(&c->pd_controllers_update); /* * If we're not doing an emergency shutdown, we want to wait on @@ -371,8 +370,6 @@ static int bch2_fs_read_write_late(struct bch_fs *c) return ret; } - schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); - schedule_work(&c->ec_stripe_delete_work); return 0; @@ -566,7 +563,6 @@ void __bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->btree_write_error_work); - cancel_delayed_work_sync(&c->pd_controllers_update); cancel_work_sync(&c->read_only_work); for (i = 0; i < c->sb.nr_devices; i++) @@ -908,9 +904,16 @@ int bch2_fs_start(struct bch_fs *c) /* * Allocator threads don't start filling copygc reserve until after we * set BCH_FS_STARTED - wake them now: + * + * XXX ugly hack: + * Need to set ca->allocator_state here instead of relying on the + * allocator threads to do it to avoid racing with the copygc threads + * checking it and thinking they have no alloc reserve: */ - for_each_online_member(ca, c, i) + for_each_online_member(ca, c, i) { + ca->allocator_state = ALLOCATOR_running; bch2_wake_allocator(ca); + } if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); @@ -1679,7 +1682,7 @@ have_slot: bch2_dev_usage_journal_reserve(c); err = "error marking superblock"; - ret = bch2_trans_mark_dev_sb(c, NULL, ca); + ret = bch2_trans_mark_dev_sb(c, ca); if (ret) goto err_late; @@ 
-1739,7 +1742,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ca = bch_dev_locked(c, dev_idx); - if (bch2_trans_mark_dev_sb(c, NULL, ca)) { + if (bch2_trans_mark_dev_sb(c, ca)) { err = "bch2_trans_mark_dev_sb() error"; goto err; } diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 2d008979..21ef7719 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -132,10 +132,10 @@ do { \ } while (0) write_attribute(trigger_journal_flush); -write_attribute(trigger_btree_coalesce); write_attribute(trigger_gc); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); +rw_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); @@ -190,7 +190,7 @@ rw_attribute(cache_replacement_policy); rw_attribute(label); rw_attribute(copy_gc_enabled); -sysfs_pd_controller_attribute(copy_gc); +read_attribute(copy_gc_wait); rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); @@ -199,8 +199,6 @@ rw_attribute(promote_whole_extents); read_attribute(new_stripes); -rw_attribute(pd_controllers_update_seconds); - read_attribute(io_timers_read); read_attribute(io_timers_write); @@ -314,6 +312,13 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c return 0; } +void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) +{ + pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); + bch2_bpos_to_text(out, c->gc_gens_pos); + pr_buf(out, "\n"); +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -339,14 +344,18 @@ SHOW(bch2_fs) sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + if (attr == &sysfs_gc_gens_pos) { + bch2_gc_gens_pos_to_text(&out, c); + return out.pos - buf; + } - sysfs_print(pd_controllers_update_seconds, - c->pd_controllers_update_seconds); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ - sysfs_pd_controller_show(copy_gc, &c->copygc_pd); + sysfs_hprint(copy_gc_wait, + max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); if (attr == &sysfs_rebalance_work) { bch2_rebalance_work_to_text(&out, c); @@ -454,10 +463,7 @@ STORE(bch2_fs) return ret; } - sysfs_strtoul(pd_controllers_update_seconds, - c->pd_controllers_update_seconds); sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - sysfs_pd_controller_store(copy_gc, &c->copygc_pd); sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); @@ -471,9 +477,6 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_journal_flush) bch2_journal_meta(&c->journal); - if (attr == &sysfs_trigger_btree_coalesce) - bch2_coalesce(c); - if (attr == &sysfs_trigger_gc) { /* * Full gc is currently incompatible with btree key cache: @@ -570,16 +573,16 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_extent_migrate_raced, &sysfs_trigger_journal_flush, - &sysfs_trigger_btree_coalesce, &sysfs_trigger_gc, + &sysfs_gc_gens_pos, &sysfs_prune_cache, &sysfs_copy_gc_enabled, + &sysfs_copy_gc_wait, &sysfs_rebalance_enabled, &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - sysfs_pd_controller_files(copy_gc), &sysfs_new_stripes, @@ -817,23 +820,28 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "free[RESERVE_MOVINGGC]\t%zu/%zu\n" "free[RESERVE_NONE]\t%zu/%zu\n" "freelist_wait\t\t%s\n" - "open buckets\t\t%u/%u (reserved %u)\n" + "open buckets allocated\t%u\n" + "open buckets 
this dev\t%u\n" + "open buckets total\t%u\n" "open_buckets_wait\t%s\n" "open_buckets_btree\t%u\n" "open_buckets_user\t%u\n" - "btree reserve cache\t%u\n", + "btree reserve cache\t%u\n" + "thread state:\t\t%s\n", stats.buckets_ec, __dev_buckets_available(ca, stats), fifo_used(&ca->free_inc), ca->free_inc.size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, - BTREE_NODE_OPEN_BUCKET_RESERVE, + OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, + ca->nr_open_buckets, + OPEN_BUCKETS_COUNT, c->open_buckets_wait.list.first ? "waiting" : "empty", nr[BCH_DATA_btree], nr[BCH_DATA_user], - c->btree_reserve_cache_nr); + c->btree_reserve_cache_nr, + bch2_allocator_states[ca->allocator_state]); } static const char * const bch2_rw[] = { diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index 7507b6bc..254e3b31 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -497,6 +497,42 @@ static int rand_insert(struct bch_fs *c, u64 nr) return ret; } +static int rand_insert_multi(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct bkey_i_cookie k[8]; + int ret = 0; + unsigned j; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i += ARRAY_SIZE(k)) { + for (j = 0; j < ARRAY_SIZE(k); j++) { + bkey_cookie_init(&k[j].k_i); + k[j].k.p.offset = test_rand(); + k[j].k.p.snapshot = U32_MAX; + } + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); + if (ret) { + bch_err(c, "error in rand_insert_multi: %i", ret); + break; + } + } + + bch2_trans_exit(&trans); + return ret; +} + static int rand_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; @@ -765,6 +801,7 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, if (!strcmp(testname, #_test)) j.fn = _test perf_test(rand_insert); + perf_test(rand_insert_multi); perf_test(rand_lookup); perf_test(rand_mixed); perf_test(rand_delete); -- cgit v1.2.3