Diffstat (limited to 'fs'): 28 files changed, 443 insertions, 375 deletions
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 23a9fbb36f49..1a8fa5482653 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -206,8 +206,7 @@ static inline bool may_alloc_bucket(struct bch_fs *c,
 
 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
 					      struct alloc_request *req,
-					      u64 bucket, u8 gen,
-					      struct closure *cl)
+					      u64 bucket, u8 gen)
 {
 	struct bch_dev *ca = req->ca;
 
@@ -222,12 +221,18 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
 	spin_lock(&c->freelist_lock);
 
 	if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
-		if (cl)
-			closure_wait(&c->open_buckets_wait, cl);
-
 		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
+
+		int ret;
+		if (req->cl && !(req->flags & BCH_WRITE_alloc_nowait)) {
+			closure_wait(&c->open_buckets_wait, req->cl);
+			ret = bch_err_throw(c, open_bucket_alloc_blocked);
+		} else {
+			ret = bch_err_throw(c, open_buckets_empty);
+		}
+
 		spin_unlock(&c->freelist_lock);
-		return ERR_PTR(bch_err_throw(c, open_buckets_empty));
+		return ERR_PTR(ret);
 	}
 
 	/* Recheck under lock: */
@@ -259,8 +264,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
 
 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
 					    struct alloc_request *req,
-					    struct btree_iter *freespace_iter,
-					    struct closure *cl)
+					    struct btree_iter *freespace_iter)
 {
 	struct bch_fs *c = trans->c;
 	u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
@@ -275,7 +279,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
 	if (ret)
 		return NULL;
 
-	return __try_alloc_bucket(c, req, b, gen, cl);
+	return __try_alloc_bucket(c, req, b, gen);
 }
 
 /*
@@ -283,8 +287,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
  */
 static noinline struct open_bucket *
 bch2_bucket_alloc_early(struct btree_trans *trans,
-			struct alloc_request *req,
-			struct closure *cl)
+			struct alloc_request *req)
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = req->ca;
@@ -348,7 +351,7 @@ again:
 		req->counters.buckets_seen++;
 
 		ob = may_alloc_bucket(c, req, k.k->p)
-			? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
+			? __try_alloc_bucket(c, req, k.k->p.offset, a->gen)
 			: NULL;
 next:
 		bch2_set_btree_iter_dontneed(trans, &citer);
@@ -374,8 +377,7 @@ next:
 }
 
 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
-						      struct alloc_request *req,
-						      struct closure *cl)
+						      struct alloc_request *req)
 {
 	struct bch_dev *ca = req->ca;
 	struct btree_iter iter;
@@ -417,7 +419,7 @@ again:
 			goto next;
 		}
 
-		ob = try_alloc_bucket(trans, req, &iter, cl);
+		ob = try_alloc_bucket(trans, req, &iter);
 		if (ob) {
 			if (!IS_ERR(ob))
 				*dev_alloc_cursor = iter.pos.offset;
@@ -450,7 +452,6 @@ fail:
 
 static noinline void trace_bucket_alloc2(struct bch_fs *c,
 					 struct alloc_request *req,
-					 struct closure *cl,
 					 struct open_bucket *ob)
 {
 	struct printbuf buf = PRINTBUF;
@@ -460,7 +461,8 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c,
 	prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
 	prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
 	prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
-	prt_printf(&buf, "blocking\t%u\n", cl != NULL);
+	prt_printf(&buf, "blocking\t%u\n", !req->will_retry_target_devices &&
+					   !req->will_retry_all_devices);
 	prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
 	prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
 	prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
@@ -488,28 +490,23 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c,
  * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
  * @trans:	transaction object
  * @req:	state for the entire allocation
- * @cl:		if not NULL, closure to be used to wait if buckets not available
- * @nowait:	if true, do not wait for buckets to become available
 *
 * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
 */
 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
-						   struct alloc_request *req,
-						   struct closure *cl,
-						   bool nowait)
+						   struct alloc_request *req)
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = req->ca;
 	struct open_bucket *ob = NULL;
 	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
-	u64 avail;
-	bool waiting = nowait;
+	bool waiting = false;
 
 	req->btree_bitmap = req->data_type == BCH_DATA_btree;
 	memset(&req->counters, 0, sizeof(req->counters));
 again:
 	bch2_dev_usage_read_fast(ca, &req->usage);
-	avail = dev_buckets_free(ca, req->usage, req->watermark);
+	u64 avail = dev_buckets_free(ca, req->usage, req->watermark);
 
 	if (req->usage.buckets[BCH_DATA_need_discard] > avail)
 		bch2_dev_do_discards(ca);
@@ -525,8 +522,11 @@ again:
 	    c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
 		goto alloc;
 
-	if (cl && !waiting) {
-		closure_wait(&c->freelist_wait, cl);
+	if (req->cl &&
+	    !req->will_retry_target_devices &&
+	    !req->will_retry_all_devices &&
+	    !(req->flags & BCH_WRITE_alloc_nowait)) {
+		closure_wait(&c->freelist_wait, req->cl);
 		waiting = true;
 		goto again;
 	}
@@ -541,8 +541,8 @@ again:
 		closure_wake_up(&c->freelist_wait);
 alloc:
 	ob = likely(freespace)
-		? bch2_bucket_alloc_freelist(trans, req, cl)
-		: bch2_bucket_alloc_early(trans, req, cl);
+		? bch2_bucket_alloc_freelist(trans, req)
+		: bch2_bucket_alloc_early(trans, req);
 
 	if (req->counters.need_journal_commit * 2 > avail)
 		bch2_journal_flush_async(&c->journal, NULL);
@@ -571,7 +571,7 @@ err:
 	if (!IS_ERR(ob)
 	    ? trace_bucket_alloc_enabled()
 	    : trace_bucket_alloc_fail_enabled())
-		trace_bucket_alloc2(c, req, cl, ob);
+		trace_bucket_alloc2(c, req, ob);
 
 	return ob;
 }
@@ -583,13 +583,14 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 {
 	struct open_bucket *ob;
 	struct alloc_request req = {
+		.cl		= cl,
 		.watermark	= watermark,
 		.data_type	= data_type,
 		.ca		= ca,
 	};
 
 	bch2_trans_do(c,
-		PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
+		PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req)));
 	return ob;
 }
@@ -703,18 +704,26 @@ static int add_new_bucket(struct bch_fs *c,
 	return 0;
 }
 
-inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
-				       struct alloc_request *req,
-				       struct dev_stripe_state *stripe,
-				       struct closure *cl)
+int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+				struct alloc_request *req,
+				struct dev_stripe_state *stripe)
 {
 	struct bch_fs *c = trans->c;
+	struct closure *cl = NULL;
 	int ret = 0;
 
 	BUG_ON(req->nr_effective >= req->nr_replicas);
 
+	/*
+	 * Try nonblocking first, so that if one device is full we'll try from
+	 * other devices:
+	 */
+retry_blocking:
 	bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
 
+	if (req->devs_sorted.nr == 1)
+		req->will_retry_target_devices = false;
+
 	darray_for_each(req->devs_sorted, i) {
 		req->ca = bch2_dev_tryget_noerror(c, *i);
 		if (!req->ca)
@@ -725,8 +734,7 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 			continue;
 		}
 
-		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
-						req->flags & BCH_WRITE_alloc_nowait);
+		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req);
 		if (!IS_ERR(ob))
 			bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
 		bch2_dev_put(req->ca);
@@ -745,6 +753,14 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 
 	if (ret == 1)
 		return 0;
+
+	if (ret &&
+	    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+	    req->will_retry_target_devices) {
+		req->will_retry_target_devices = false;
+		goto retry_blocking;
+	}
+
 	if (ret)
 		return ret;
 	return bch_err_throw(c, insufficient_devices);
@@ -759,20 +775,13 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
  */
 static int bucket_alloc_from_stripe(struct btree_trans *trans,
-				    struct alloc_request *req,
-				    struct closure *cl)
+				    struct alloc_request *req)
 {
 	struct bch_fs *c = trans->c;
 	int ret = 0;
 
-	if (req->nr_replicas < 2)
-		return 0;
-
-	if (ec_open_bucket(c, &req->ptrs))
-		return 0;
-
 	struct ec_stripe_head *h =
-		bch2_ec_stripe_head_get(trans, req, 0, cl);
+		bch2_ec_stripe_head_get(trans, req, 0);
 	if (IS_ERR(h))
 		return PTR_ERR(h);
 	if (!h)
@@ -887,79 +896,6 @@ unlock:
 	return ret;
 }
 
-static int __open_bucket_add_buckets(struct btree_trans *trans,
-				     struct alloc_request *req,
-				     struct closure *_cl)
-{
-	struct bch_fs *c = trans->c;
-	struct open_bucket *ob;
-	struct closure *cl = NULL;
-	unsigned i;
-	int ret;
-
-	req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
-
-	/* Don't allocate from devices we already have pointers to: */
-	darray_for_each(*req->devs_have, i)
-		__clear_bit(*i, req->devs_may_alloc.d);
-
-	open_bucket_for_each(c, &req->ptrs, ob, i)
-		__clear_bit(ob->dev, req->devs_may_alloc.d);
-
-	ret = bucket_alloc_set_writepoint(c, req);
-	if (ret)
-		return ret;
-
-	ret = bucket_alloc_set_partial(c, req);
-	if (ret)
-		return ret;
-
-	if (req->ec) {
-		ret = bucket_alloc_from_stripe(trans, req, _cl);
-	} else {
-retry_blocking:
-		/*
-		 * Try nonblocking first, so that if one device is full we'll try from
-		 * other devices:
-		 */
-		ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
-		if (ret &&
-		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
-		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
-		    !cl && _cl) {
-			cl = _cl;
-			goto retry_blocking;
-		}
-	}
-
-	return ret;
-}
-
-static int open_bucket_add_buckets(struct btree_trans *trans,
-				   struct alloc_request *req,
-				   struct closure *cl)
-{
-	int ret;
-
-	if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
-		ret = __open_bucket_add_buckets(trans, req, cl);
-		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
-		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
-		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
-			return ret;
-		if (req->nr_effective >= req->nr_replicas)
-			return 0;
-	}
-
-	bool ec = false;
-	swap(ec, req->ec);
-	ret = __open_bucket_add_buckets(trans, req, cl);
-	swap(ec, req->ec);
-
-	return ret < 0 ? ret : 0;
-}
-
 /**
  * should_drop_bucket - check if this is open_bucket should go away
  * @ob: open_bucket to predicate on
@@ -1255,72 +1191,94 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
 		erasure_code = false;
 
+	if (nr_replicas < 2)
+		erasure_code = false;
+
+	req->cl			= cl;
 	req->nr_replicas	= nr_replicas;
 	req->target		= target;
-	req->ec			= erasure_code;
 	req->watermark		= watermark;
 	req->flags		= flags;
 	req->devs_have		= devs_have;
 
 	BUG_ON(!nr_replicas || !nr_replicas_required);
 retry:
-	req->ptrs.nr		= 0;
-	req->nr_effective	= 0;
-	req->have_cache		= false;
-	write_points_nr		= c->write_points_nr;
+	req->ec				= erasure_code;
+	req->will_retry_target_devices	= true;
+	req->will_retry_all_devices	= true;
+	req->ptrs.nr			= 0;
+	req->nr_effective		= 0;
+	req->have_cache			= false;
+	write_points_nr			= c->write_points_nr;
 
 	*wp_ret = req->wp = writepoint_find(trans, write_point.v);
 
 	req->data_type = req->wp->data_type;
 
+	/* metadata may not allocate on cache devices: */
+	if (req->data_type != BCH_DATA_user)
+		req->have_cache = true;
+
 	ret = bch2_trans_relock(trans);
 	if (ret)
 		goto err;
 
-	/* metadata may not allocate on cache devices: */
-	if (req->data_type != BCH_DATA_user)
-		req->have_cache = true;
+	while (1) {
+		req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
 
-	if (target && !(flags & BCH_WRITE_only_specified_devs)) {
-		ret = open_bucket_add_buckets(trans, req, NULL);
-		if (!ret ||
-		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			goto alloc_done;
-
-		/* Don't retry from all devices if we're out of open buckets: */
-		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
-			int ret2 = open_bucket_add_buckets(trans, req, cl);
-			if (!ret2 ||
-			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
-			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
-				ret = ret2;
-				goto alloc_done;
-			}
-		}
+		/* Don't allocate from devices we already have pointers to: */
+		darray_for_each(*req->devs_have, i)
+			__clear_bit(*i, req->devs_may_alloc.d);
 
-		/*
-		 * Only try to allocate cache (durability = 0 devices) from the
-		 * specified target:
-		 */
-		req->have_cache	= true;
-		req->target	= 0;
+		open_bucket_for_each(c, &req->ptrs, ob, i)
+			__clear_bit(ob->dev, req->devs_may_alloc.d);
 
-		ret = open_bucket_add_buckets(trans, req, cl);
-	} else {
-		ret = open_bucket_add_buckets(trans, req, cl);
-	}
-alloc_done:
-	BUG_ON(!ret && req->nr_effective < req->nr_replicas);
+		ret = bucket_alloc_set_writepoint(c, req) ?:
+			bucket_alloc_set_partial(c, req) ?:
+			(req->ec
+			 ? bucket_alloc_from_stripe(trans, req)
+			 : bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe));
 
-	if (erasure_code && !ec_open_bucket(c, &req->ptrs))
-		pr_debug("failed to get ec bucket: ret %u", ret);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+		    bch2_err_matches(ret, BCH_ERR_operation_blocked))
+			goto err;
+
+		if (ret == -BCH_ERR_freelist_empty ||
+		    ret == -BCH_ERR_insufficient_devices) {
+			if (req->will_retry_all_devices) {
+				BUG_ON(!req->will_retry_all_devices);
+				req->will_retry_all_devices = false;
+				/*
+				 * Only try to allocate cache (durability = 0 devices) from the
+				 * specified target:
+				 */
+				if (req->target &&
+				    (!(flags & BCH_WRITE_only_specified_devs) ||
+				     (ret == -BCH_ERR_insufficient_devices))) {
+					req->have_cache	= true;
+					req->target	= 0;
+				}
+				continue;
+			}
 
-	if (ret == -BCH_ERR_insufficient_devices &&
-	    req->nr_effective >= nr_replicas_required)
-		ret = 0;
+			if (ret == -BCH_ERR_insufficient_devices &&
+			    req->nr_effective >= nr_replicas_required)
+				ret = 0;
+			else
+				goto err;
+		}
 
-	if (ret)
-		goto err;
+		if (req->nr_effective < req->nr_replicas && req->ec) {
+			req->ec = false;
+			req->will_retry_target_devices = true;
+			req->will_retry_all_devices = true;
+			continue;
+		}
+
+		BUG_ON(req->nr_effective < nr_replicas_required);
+		BUG_ON(ret < 0);
+		break;
+	}
 
 	if (req->nr_effective > req->nr_replicas)
 		deallocate_extra_replicas(c, req);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 1b3fc8460096..90eb8604a0a2 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -26,9 +26,12 @@ struct dev_alloc_list {
 };
 
 struct alloc_request {
+	struct closure		*cl;
 	unsigned		nr_replicas;
 	unsigned		target;
-	bool			ec;
+	bool			ec:1;
+	bool			will_retry_target_devices:1;
+	bool			will_retry_all_devices:1;
 	enum bch_watermark	watermark;
 	enum bch_write_flags	flags;
 	enum bch_data_type	data_type;
@@ -224,7 +227,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
 enum bch_write_flags;
 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
-				struct dev_stripe_state *, struct closure *);
+				struct dev_stripe_state *);
 
 int bch2_alloc_sectors_start_trans(struct btree_trans *,
 				   unsigned, unsigned,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index a3631a903ecf..49505653fe12 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -86,7 +86,7 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
 	six_unlock_intent(&b->c.lock);
 }
 
-static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+void __btree_node_data_free(struct btree *b)
 {
 	BUG_ON(!list_empty(&b->list));
 	BUG_ON(btree_node_hashed(b));
@@ -113,16 +113,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
 	munmap(b->aux_data, btree_aux_data_bytes(b));
 #endif
 	b->aux_data = NULL;
-
-	btree_node_to_freedlist(bc, b);
 }
 
 static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
 {
 	BUG_ON(list_empty(&b->list));
 	list_del_init(&b->list);
+
+	__btree_node_data_free(b);
+
 	--bc->nr_freeable;
-	__btree_node_data_free(bc, b);
+	btree_node_to_freedlist(bc, b);
 }
 
 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -186,10 +187,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
 
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 {
-	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-
-	b = __btree_node_mem_alloc(c, GFP_KERNEL);
+	struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
 	if (!b)
 		return NULL;
 
@@ -199,8 +197,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 	}
 
 	bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
-
-	__bch2_btree_node_to_freelist(bc, b);
 	return b;
 }
@@ -526,7 +522,8 @@ restart:
 			--touched;;
 		} else if (!btree_node_reclaim(c, b)) {
 			__bch2_btree_node_hash_remove(bc, b);
-			__btree_node_data_free(bc, b);
+			__btree_node_data_free(b);
+			btree_node_to_freedlist(bc, b);
 
 			freed++;
 			bc->nr_freed++;
@@ -667,9 +664,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 
 	bch2_recalc_btree_reserve(c);
 
-	for (i = 0; i < bc->nr_reserve; i++)
-		if (!__bch2_btree_node_mem_alloc(c))
+	for (i = 0; i < bc->nr_reserve; i++) {
+		struct btree *b = __bch2_btree_node_mem_alloc(c);
+		if (!b)
 			goto err;
+		__bch2_btree_node_to_freelist(bc, b);
+	}
 
 	list_splice_init(&bc->live[0].list, &bc->freeable);
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 3264801cbcbe..649e9dfd178a 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig
 void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
 int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
 
+void __btree_node_data_free(struct btree *);
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 19fd951495ac..b30799e494eb 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -26,6 +26,12 @@
 
 #include <linux/sched/mm.h>
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_btree_read_corrupt_ratio;
+module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(btree_read_corrupt_ratio, "");
+#endif
+
 static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
 {
 	bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
@@ -568,9 +574,9 @@ static int __btree_err(int ret,
 		bch2_mark_btree_validate_failure(failed, ca->dev_idx);
 
 		struct extent_ptr_decoded pick;
-		have_retry = !bch2_bkey_pick_read_device(c,
+		have_retry = bch2_bkey_pick_read_device(c,
 					bkey_i_to_s_c(&b->key),
-					failed, &pick, -1);
+					failed, &pick, -1) == 1;
 	}
 
 	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
@@ -615,7 +621,6 @@ static int __btree_err(int ret,
 			goto out;
 		case -BCH_ERR_btree_node_read_err_bad_node:
 			prt_str(&out, ", ");
-			ret = __bch2_topology_error(c, &out);
 			break;
 		}
@@ -644,7 +649,6 @@ static int __btree_err(int ret,
 		goto out;
 	case -BCH_ERR_btree_node_read_err_bad_node:
 		prt_str(&out, ", ");
-		ret = __bch2_topology_error(c, &out);
 		break;
 	}
 print:
@@ -1337,15 +1341,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
 	btree_node_reset_sib_u64s(b);
 
-	scoped_guard(rcu)
-		bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-			struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-			if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
-				set_btree_node_need_rewrite(b);
-				set_btree_node_need_rewrite_degraded(b);
+	/*
+	 * XXX:
+	 *
+	 * We deadlock if too many btree updates require node rewrites while
+	 * we're still in journal replay.
+	 *
+	 * This is because btree node rewrites generate more updates for the
+	 * interior updates (alloc, backpointers), and if those updates touch
+	 * new nodes and generate more rewrites - well, you see the problem.
+	 *
+	 * The biggest cause is that we don't use the btree write buffer (for
+	 * the backpointer updates) - this needs some real thought on locking
+	 * in order to fix.
+	 *
+	 * The problem with this workaround (not doing the rewrite for degraded
+	 * nodes in journal replay) is that those degraded nodes persist, and we
+	 * don't want that (this is a real bug when a btree node write completes
+	 * with fewer replicas than we wanted and leaves a degraded node due to
+	 * device _removal_, i.e. the device went away mid write).
+	 *
+	 * It's less of a bug here, but still a problem because we don't yet
+	 * have a way of tracking degraded data - we need another index (all
+	 * extents/btree nodes, by replicas entry) in order to fix properly
+	 * (re-replicate degraded data at the earliest possible time).
+	 */
+	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+		scoped_guard(rcu)
+			bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+				struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+				if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+					set_btree_node_need_rewrite(b);
+					set_btree_node_need_rewrite_degraded(b);
+				}
 			}
-		}
+	}
 
 	if (!ptr_written) {
 		set_btree_node_need_rewrite(b);
@@ -1381,7 +1412,7 @@ static void btree_node_read_work(struct work_struct *work)
 		ret = bch2_bkey_pick_read_device(c,
 					bkey_i_to_s_c(&b->key),
 					&failed, &rb->pick, -1);
-		if (ret) {
+		if (ret <= 0) {
 			set_btree_node_read_error(b);
 			break;
 		}
@@ -1412,6 +1443,11 @@ start:
 			continue;
 		}
 
+		memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
+		bio->bi_iter.bi_size = btree_buf_bytes(b);
+
+		bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
+
 		ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
 		if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
 		    ret == -BCH_ERR_btree_node_read_err_must_retry)
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 341d31b3a1f1..ea839560a136 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -717,18 +717,6 @@ static void __journal_keys_sort(struct journal_keys *keys)
 	keys->nr = dst - keys->data;
 }
 
-static bool should_rewind_entry(struct bch_fs *c, struct jset_entry *entry)
-{
-	if (entry->level)
-		return false;
-	if (btree_id_is_alloc(entry->btree_id))
-		return false;
-	if (c->opts.journal_rewind_no_extents &&
-	    entry->btree_id == BTREE_ID_extents)
-		return false;
-	return true;
-}
-
 int bch2_journal_keys_sort(struct bch_fs *c)
 {
 	struct genradix_iter iter;
@@ -747,8 +735,9 @@ int bch2_journal_keys_sort(struct bch_fs *c)
 		cond_resched();
 
 		vstruct_for_each(&i->j, entry) {
-			bool rewind = le64_to_cpu(i->j.seq) >= rewind_seq &&
-				should_rewind_entry(c, entry);
+			bool rewind = !entry->level &&
+				!btree_id_is_alloc(entry->btree_id) &&
+				le64_to_cpu(i->j.seq) >= rewind_seq;
 
 			if (entry->type != (rewind
 					    ? BCH_JSET_ENTRY_overwrite
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 23d8c62ea4b6..365808b4b7c0 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -75,39 +75,6 @@ static inline u64 bkey_journal_seq(struct bkey_s_c k)
 	}
 }
 
-static bool found_btree_node_is_readable(struct btree_trans *trans,
-					 struct found_btree_node *f)
-{
-	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
-	found_btree_node_to_key(&tmp.k, f);
-
-	struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
-	bool ret = !IS_ERR_OR_NULL(b);
-	if (!ret)
-		return ret;
-
-	f->sectors_written = b->written;
-	f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
-
-	struct bkey_s_c k;
-	struct bkey unpacked;
-	struct btree_node_iter iter;
-	for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
-		f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
-
-	six_unlock_read(&b->c.lock);
-
-	/*
-	 * We might update this node's range; if that happens, we need the node
-	 * to be re-read so the read path can trim keys that are no longer in
-	 * this node
-	 */
-	if (b != btree_node_root(trans->c, b))
-		bch2_btree_node_evict(trans, &tmp.k);
-	return ret;
-}
-
 static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
 {
 	const struct found_btree_node *l = _l;
@@ -159,17 +126,17 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = {
 };
 
 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
-				struct bio *bio, struct btree_node *bn, u64 offset)
+				struct btree *b, struct bio *bio, u64 offset)
 {
 	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+	struct btree_node *bn = b->data;
 
 	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
 	bio->bi_iter.bi_sector	= offset;
-	bch2_bio_map(bio, bn, PAGE_SIZE);
+	bch2_bio_map(bio, b->data, c->opts.block_size);
 
 	u64 submit_time = local_clock();
 	submit_bio_wait(bio);
-
 	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
 
 	if (bio->bi_status) {
@@ -201,6 +168,14 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
 		return;
 
+	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+	bio->bi_iter.bi_sector	= offset;
+	bch2_bio_map(bio, b->data, c->opts.btree_node_size);
+
+	submit_time = local_clock();
+	submit_bio_wait(bio);
+	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
 	rcu_read_lock();
 	struct found_btree_node n = {
 		.btree_id	= BTREE_NODE_ID(bn),
@@ -217,7 +192,20 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	};
 	rcu_read_unlock();
 
-	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+	found_btree_node_to_key(&b->key, &n);
+
+	CLASS(printbuf, buf)();
+	if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
+		/* read_done will swap out b->data for another buffer */
+		bn = b->data;
+		/*
+		 * Grab journal_seq here because we want the max journal_seq of
+		 * any bset; read_done sorts down to a single set and picks the
+		 * max journal_seq
+		 */
+		n.journal_seq = le64_to_cpu(bn->keys.journal_seq),
+		n.sectors_written = b->written;
+
 		mutex_lock(&f->lock);
 		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
 			bch_err(c, "try_read_btree_node() can't handle endian conversion");
@@ -237,12 +225,18 @@ static int read_btree_nodes_worker(void *p)
 	struct find_btree_nodes_worker *w = p;
 	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
 	struct bch_dev *ca = w->ca;
-	void *buf = (void *) __get_free_page(GFP_KERNEL);
-	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
 	unsigned long last_print = jiffies;
 
-	if (!buf || !bio) {
-		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+	struct btree *b = __bch2_btree_node_mem_alloc(c);
+	if (!b) {
+		bch_err(c, "read_btree_nodes_worker: error allocating buf");
+		w->f->ret = -ENOMEM;
+		goto err;
+	}
+
+	struct bio *bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
+	if (!bio) {
+		bch_err(c, "read_btree_nodes_worker: error allocating bio");
 		w->f->ret = -ENOMEM;
 		goto err;
 	}
@@ -266,11 +260,13 @@ static int read_btree_nodes_worker(void *p)
 			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
 				continue;
 
-			try_read_btree_node(w->f, ca, bio, buf, sector);
+			try_read_btree_node(w->f, ca, b, bio, sector);
 		}
err:
+	if (b)
+		__btree_node_data_free(b);
+	kfree(b);
 	bio_put(bio);
-	free_page((unsigned long) buf);
 	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
 	closure_put(w->cl);
 	kfree(w);
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 3968f3be7f3b..e848e210a9bf 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -783,9 +783,6 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 	darray_for_each(m->op.devs_have, i)
 		__clear_bit(*i, devs.d);
 
-	CLASS(printbuf, buf)();
-	buf.atomic++;
-
 	guard(rcu)();
 
 	unsigned nr_replicas = 0, i;
@@ -797,11 +794,7 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 		struct bch_dev_usage usage;
 		bch2_dev_usage_read_fast(ca, &usage);
 
-		u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark);
-
-		prt_printf(&buf, "%s=%llu ", ca->name, nr_free);
-
-		if (!nr_free)
+		if (!dev_buckets_free(ca, usage, m->op.watermark))
 			continue;
 
 		nr_replicas += ca->mi.durability;
@@ -809,10 +802,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 			break;
 	}
 
-	if (!nr_replicas) {
-		trace_data_update_done_no_rw_devs(c, buf.buf);
+	if (!nr_replicas)
 		return bch_err_throw(c, data_update_done_no_rw_devs);
-	}
 	if (nr_replicas < m->op.nr_replicas)
 		return bch_err_throw(c, insufficient_devices);
 	return 0;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 901f643ead83..07c2a0f73cc2 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -153,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 		c->verify_data = __bch2_btree_node_mem_alloc(c);
 		if (!c->verify_data)
 			goto out;
-
-		list_del_init(&c->verify_data->list);
 	}
 
 	BUG_ON(b->nsets != 1);
@@ -586,6 +584,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 	i->ubuf = buf;
 	i->size	= size;
 	i->ret	= 0;
+
+	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
restart:
 	seqmutex_lock(&c->btree_trans_lock);
 	list_sort(&c->btree_trans_list, list_ptr_order_cmp);
@@ -599,6 +599,11 @@ restart:
 		if (!closure_get_not_zero(&trans->ref))
 			continue;
 
+		if (!trans->srcu_held) {
+			closure_put(&trans->ref);
+			continue;
+		}
+
 		u32 seq = seqmutex_unlock(&c->btree_trans_lock);
 
 		bch2_btree_trans_to_text(&i->buf, trans);
@@ -620,6 +625,8 @@ restart:
 	}
 	seqmutex_unlock(&c->btree_trans_lock);
unlocked:
+	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
 	if (i->buf.allocation_failure)
 		ret = -ENOMEM;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 687c3ba98095..71956ee86a9c 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1720,8 +1720,7 @@ err:
 
 static int new_stripe_alloc_buckets(struct btree_trans *trans,
				    struct alloc_request *req,
-				    struct ec_stripe_head *h, struct ec_stripe_new *s,
-				    struct closure *cl)
+				    struct ec_stripe_head *h, struct ec_stripe_new *s)
 {
 	struct bch_fs *c = trans->c;
 	struct open_bucket *ob;
@@ -1771,7 +1770,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
 	req->nr_effective	= nr_have_parity;
 	req->data_type		= BCH_DATA_parity;
 
-	ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
+	ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe);
 
 	open_bucket_for_each(c, &req->ptrs, ob, i) {
 		j = find_next_zero_bit(s->blocks_gotten,
@@ -1794,7 +1793,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
 	req->nr_effective	= nr_have_data;
 	req->data_type		= BCH_DATA_user;
 
-	ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
+	ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe);
 
 	open_bucket_for_each(c, &req->ptrs, ob, i) {
 		j = find_next_zero_bit(s->blocks_gotten,
@@ -1926,7 +1925,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
 	}
 	bch2_trans_iter_exit(trans, &lru_iter);
 	if (!ret)
-		ret = bch_err_throw(c, stripe_alloc_blocked);
+		return bch_err_throw(c, stripe_alloc_blocked);
 	if (ret == 1)
 		ret = 0;
 	if (ret)
@@ -1998,8 +1997,7 @@ err:
 
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
 					       struct alloc_request *req,
-					       unsigned algo,
-					       struct closure *cl)
+					       unsigned algo)
 {
 	struct bch_fs *c = trans->c;
 	unsigned redundancy = req->nr_replicas - 1;
@@ -2041,12 +2039,18 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
 	if (s->have_existing_stripe)
 		goto alloc_existing;
 
+	/* First, try to allocate a full stripe: */
 	enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
-	swap(req->watermark, saved_watermark);
-	ret =   new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
+	unsigned saved_flags = req->flags | BCH_WRITE_alloc_nowait;
+	swap(req->watermark, saved_watermark);
+	swap(req->flags, saved_flags);
+
+	ret =   new_stripe_alloc_buckets(trans, req, h, s) ?:
 		__bch2_ec_stripe_head_reserve(trans, h, s);
-	swap(req->watermark, saved_watermark);
+
+	swap(req->watermark, saved_watermark);
+	swap(req->flags, saved_flags);
 
 	if (!ret)
 		goto allocate_buf;
@@ -2062,19 +2066,25 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
 		ret = __bch2_ec_stripe_head_reuse(trans, h, s);
 		if (!ret)
 			break;
-		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
+		if (waiting ||
+		    (req->flags & BCH_WRITE_alloc_nowait) ||
+		    ret != -BCH_ERR_stripe_alloc_blocked)
 			goto err;
 
 		if (req->watermark == BCH_WATERMARK_copygc) {
-			ret =   new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
+			/* Don't self-deadlock copygc */
+			swap(req->flags, saved_flags);
+			ret =   new_stripe_alloc_buckets(trans, req, h, s) ?:
 				__bch2_ec_stripe_head_reserve(trans, h, s);
+			swap(req->flags, saved_flags);
+
 			if (ret)
 				goto err;
 			goto allocate_buf;
 		}
 
 		/* XXX freelist_wait? */
-		closure_wait(&c->freelist_wait, cl);
+		closure_wait(&c->freelist_wait, req->cl);
 		waiting = true;
 	}
@@ -2085,7 +2095,7 @@ alloc_existing:
 	 * Retry allocating buckets, with the watermark for this
 	 * particular write:
 	 */
-	ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
+	ret = new_stripe_alloc_buckets(trans, req, h, s);
 	if (ret)
 		goto err;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 548048adf0d5..756f14bd7bb7 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -258,7 +258,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
 
 struct alloc_request;
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
-			struct alloc_request *, unsigned, struct closure *);
+			struct alloc_request *, unsigned);
 
 void bch2_do_stripe_deletes(struct bch_fs *);
 void bch2_ec_do_stripe_creates(struct bch_fs *);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index d27b94a6610a..a66c96bd9556 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -236,6 +236,9 @@
 	x(0,				operation_blocked)			\
 	x(BCH_ERR_operation_blocked,	btree_cache_cannibalize_lock_blocked)	\
 	x(BCH_ERR_operation_blocked,	journal_res_blocked)			\
+	x(BCH_ERR_operation_blocked,	bucket_alloc_blocked)			\
+	x(BCH_ERR_operation_blocked,	open_bucket_alloc_blocked)		\
+	x(BCH_ERR_operation_blocked,	stripe_alloc_blocked)			\
 	x(BCH_ERR_journal_res_blocked,	journal_blocked)			\
 	x(BCH_ERR_journal_res_blocked,	journal_max_in_flight)			\
 	x(BCH_ERR_journal_res_blocked,	journal_max_open)			\
@@ -244,8 +247,6 @@
 	x(BCH_ERR_journal_res_blocked,	journal_buf_enomem)			\
 	x(BCH_ERR_journal_res_blocked,	journal_stuck)				\
 	x(BCH_ERR_journal_res_blocked,	journal_retry_open)			\
-	x(BCH_ERR_journal_res_blocked,	bucket_alloc_blocked)			\
-	x(BCH_ERR_journal_res_blocked,	stripe_alloc_blocked)			\
 	x(BCH_ERR_invalid,		invalid_sb)				\
 	x(BCH_ERR_invalid_sb,		invalid_sb_magic)			\
 	x(BCH_ERR_invalid_sb,		invalid_sb_version)			\
@@ -289,7 +290,6 @@
 	x(EIO,				sb_not_downgraded)			\
 	x(EIO,				btree_node_write_all_failed)		\
 	x(EIO,				btree_node_read_error)			\
-	x(EIO,				btree_node_read_validate_error)		\
 	x(EIO,				btree_need_topology_repair)		\
 	x(EIO,				bucket_ref_update)			\
 	x(EIO,				trigger_alloc)				\
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index a9a9fe193966..71649b4164b8 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -103,7 +103,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
 		return bch_err_throw(c, btree_need_topology_repair);
 	} else {
 		return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
-			bch_err_throw(c, btree_node_read_validate_error);
+			bch_err_throw(c, btree_need_topology_repair);
 	}
 }
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 036e4ad95987..83cbd77dcb9c 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -50,19 +50,17 @@ void bch2_io_failures_to_text(struct printbuf *out,
 			      struct bch_io_failures *failed)
 {
 	static const char * const error_types[] = {
-		"io", "checksum", "ec reconstruct", NULL
+		"btree validate", "io", "checksum", "ec reconstruct", NULL
 	};
 
 	for (struct bch_dev_io_failures *f = failed->devs;
 	     f < failed->devs + failed->nr;
 	     f++) {
 		unsigned errflags =
-			((!!f->failed_io)	<< 0) |
-			((!!f->failed_csum_nr)	<< 1) |
-			((!!f->failed_ec)	<< 2);
-
-		if (!errflags)
-			continue;
+			((!!f->failed_btree_validate)	<< 0) |
+			((!!f->failed_io)		<< 1) |
+			((!!f->failed_csum_nr)		<< 2) |
+			((!!f->failed_ec)		<< 3);
 
 		bch2_printbuf_make_room(out, 1024);
 		out->atomic++;
@@ -77,7 +75,9 @@ void bch2_io_failures_to_text(struct printbuf *out,
 		prt_char(out, ' ');
 
-		if (is_power_of_2(errflags)) {
+		if (!errflags) {
+			prt_str(out, "no error - confused");
+		} else if (is_power_of_2(errflags)) {
 			prt_bitflags(out, error_types, errflags);
 			prt_str(out, " error");
 		} else {
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index dad48d44f47b..4e82dfa6c03f 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -257,7 +257,7 @@ err:
 		struct printbuf buf = PRINTBUF;
 		lockrestart_do(trans,
 			bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
-		prt_printf(&buf, "read error %i from btree lookup", ret);
+		prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret));
 		bch_err_ratelimited(c, "%s", buf.buf);
 		printbuf_exit(&buf);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f9bc99eb2d02..3b0783f117ae 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1692,11 +1692,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 		s.mask = map_defined(bch_flags_to_xflags);
 		s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
-		if (fa->fsx_xflags)
-			return bch_err_throw(c, unsupported_fsx_flag);
+		if (fa->fsx_xflags) {
+			ret = bch_err_throw(c, unsupported_fsx_flag);
+			goto err;
+		}
 
-		if (fa->fsx_projid >= U32_MAX)
-			return bch_err_throw(c, projid_too_big);
+		if (fa->fsx_projid >= U32_MAX) {
+			ret = bch_err_throw(c, projid_too_big);
+			goto err;
+		}
 
 		/*
 		 * inode fields accessible via the xattr interface are stored with a +1
@@ -1718,8 +1722,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 			fa->flags &= ~FS_CASEFOLD_FL;
 
 		s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
-		if (fa->flags)
-			return bch_err_throw(c, unsupported_fa_flag);
+		if (fa->flags) {
+			ret = bch_err_throw(c, unsupported_fa_flag);
+			goto err;
+		}
 	}
 
 	mutex_lock(&inode->ei_update_lock);
@@ -1730,7 +1736,7 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 		bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
 				 ATTR_CTIME);
 	mutex_unlock(&inode->ei_update_lock);
-
+err:
 	return bch2_err_class(ret);
 }
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 00afe0a3593f..471e93a3f00c 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,6 +12,7 @@
 #include "fs.h"
 #include "fsck.h"
 #include "inode.h"
+#include "io_misc.h"
 #include "keylist.h"
 #include "namei.h"
 #include "recovery_passes.h"
@@ -1637,7 +1638,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
 			i->count = count2;
 		}
 
-		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty) &&
+				i->inode.bi_sectors != i->count,
 				trans, inode_i_sectors_wrong,
 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
 				w->last_pos.inode, i->inode.bi_snapshot,
@@ -1963,33 +1965,11 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 				"extent type past end of inode %llu:%u, i_size %llu\n%s",
 				i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-			struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout));
-			ret = PTR_ERR_OR_ZERO(whiteout);
-			if (ret)
-				goto err;
-
-			bkey_init(&whiteout->k);
-			whiteout->k.p = SPOS(k.k->p.inode,
-					     last_block,
-					     i->inode.bi_snapshot);
-			bch2_key_resize(&whiteout->k,
-					min(KEY_SIZE_MAX & (~0 << c->block_bits),
-					    U64_MAX - whiteout->k.p.offset));
-
-			/*
-			 * Need a normal (not BTREE_ITER_all_snapshots)
-			 * iterator, if we're deleting in a different
-			 * snapshot and need to emit a whiteout
-			 */
-			struct btree_iter iter2;
-			bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents,
-					     bkey_start_pos(&whiteout->k),
-					     BTREE_ITER_intent);
-			ret =   bch2_btree_iter_traverse(trans, &iter2) ?:
-				bch2_trans_update(trans, &iter2, whiteout,
-						  BTREE_UPDATE_internal_snapshot_node);
-			bch2_trans_iter_exit(trans, &iter2);
+			ret = bch2_fpunch_snapshot(trans,
+						   SPOS(i->inode.bi_inum,
+							last_block,
+							i->inode.bi_snapshot),
+						   POS(i->inode.bi_inum, U64_MAX));
 			if (ret)
 				goto err;
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index bf72b1d2e2cb..07023667a475 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -135,6 +135,33 @@ err_noprint:
 	return ret;
 }
 
+/* For fsck */
+int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
+{
+	u32 restart_count = trans->restart_count;
+	struct bch_fs *c = trans->c;
+	struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
+	unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+	struct bkey_i delete;
+
+	int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+			start, end, 0, k,
+			&disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+		bkey_init(&delete.k);
+		delete.k.p = iter.pos;
+
+		/* create the biggest key we can */
+		bch2_key_resize(&delete.k, max_sectors);
+		bch2_cut_back(end, &delete);
+
+		bch2_extent_trim_atomic(trans, &iter, &delete) ?:
+		bch2_trans_update(trans, &iter, &delete, 0);
+	}));
+
+	bch2_disk_reservation_put(c, &disk_res);
+	return ret ?: trans_was_restarted(trans, restart_count);
+}
+
 /*
  * Returns -BCH_ERR_transacton_restart if we had to drop locks:
  */
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index 9cb44a7c43c1..b93e4d4b3c0c 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -5,6 +5,8 @@
 int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
 			  u64, struct bch_io_opts, s64 *,
 			  struct write_point_specifier);
+
+int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
 int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
 		   subvol_inum, u64, s64 *);
 int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index 9c5ddbf861b3..cfc8ef35b14d 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -147,7 +147,7 @@ static inline void bch2_read_extent(struct btree_trans *trans,
 	int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
 				     data_btree, k, offset_into_extent, NULL, flags, -1);
 	/* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
-	WARN(ret, "unhandled error from __bch2_read_extent()");
+	WARN(ret, "unhandled error from __bch2_read_extent(): %s", bch2_err_str(ret));
 }
 
 int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index ce5340611de6..f22b05e02c1e 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1376,6 +1376,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
 		return bch_err_throw(c, erofs_filesystem_full);
 	}
 
+	unsigned nr;
 	int ret;
 
 	if (dynamic_fault("bcachefs:add:journal_alloc")) {
@@ -1384,19 +1385,16 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
 	}
 
 	/* 1/128th of the device by default: */
-	unsigned nr = ca->mi.nbuckets >> 7;
+	nr = ca->mi.nbuckets >> 7;
 
 	/*
-	 * clamp journal size to 8GB, or 32GB with large_journal option:
+	 * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
+	 * is smaller:
 	 */
-	unsigned max_sectors = 1 << 24;
-
-	if (c->opts.large_journal)
-		max_sectors *= 4;
-
 	nr = clamp_t(unsigned, nr,
 		     BCH_JOURNAL_BUCKETS_MIN,
-		     max_sectors / ca->mi.bucket_size);
+		     min(1 << 13,
+			 (1 << 24) / ca->mi.bucket_size));
 
 	ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
err:
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index dd3f3434c1b0..2d6ce4348a22 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1245,6 +1245,8 @@ noinline_for_stack
 static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
 {
 	struct printbuf buf = PRINTBUF;
+	bch2_log_msg_start(c, &buf);
+
 	enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
 	bool have_good = false;
@@ -1272,6 +1274,28 @@ static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_r
 	printbuf_exit(&buf);
 }
 
+struct u64_range bch2_journal_entry_missing_range(struct bch_fs *c, u64 start, u64 end)
+{
+	BUG_ON(start > end);
+
+	if (start == end)
+		return (struct u64_range) {};
+
+	start = bch2_journal_seq_next_nonblacklisted(c, start);
+	if (start >= end)
+		return (struct u64_range) {};
+
+	struct u64_range missing = {
+		.start	= start,
+		.end	= min(end, bch2_journal_seq_next_blacklisted(c, start)),
+	};
+
+	if (missing.start == missing.end)
+		return (struct u64_range) {};
+
+	return missing;
+}
+
 noinline_for_stack
 static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
 {
@@ -1280,6 +1304,7 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e
 	struct genradix_iter radix_iter;
 	struct journal_replay *i, **_i, *prev = NULL;
+	/* Sequence number we expect to find next, to check for missing entries */
 	u64 seq = start_seq;
 
 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
@@ -1290,43 +1315,31 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e
 
 		BUG_ON(seq > le64_to_cpu(i->j.seq));
 
-		while (seq < le64_to_cpu(i->j.seq)) {
-			while (seq < le64_to_cpu(i->j.seq) &&
-			       bch2_journal_seq_is_blacklisted(c, seq, false))
-				seq++;
-
-			if (seq == le64_to_cpu(i->j.seq))
-				break;
-
-			u64 missing_start = seq;
-
-			while (seq < le64_to_cpu(i->j.seq) &&
-			       !bch2_journal_seq_is_blacklisted(c, seq, false))
-				seq++;
-
-			u64 missing_end = seq - 1;
+		struct u64_range missing;
 
+		while ((missing = bch2_journal_entry_missing_range(c, seq, le64_to_cpu(i->j.seq))).start) {
 			printbuf_reset(&buf);
-
 			prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-				   missing_start, missing_end,
+				   missing.start, missing.end - 1,
 				   start_seq, end_seq);
 
-			prt_printf(&buf, "\nprev at ");
 			if (prev) {
+				prt_printf(&buf, "\n%llu at ", le64_to_cpu(prev->j.seq));
 				bch2_journal_ptrs_to_text(&buf, c, prev);
 				prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
-			} else
-				prt_printf(&buf, "(none)");
+			}
 
-			prt_printf(&buf, "\nnext at ");
+			prt_printf(&buf, "\n%llu at ", le64_to_cpu(i->j.seq));
 			bch2_journal_ptrs_to_text(&buf, c, i);
 			prt_printf(&buf, ", continue?");
 
 			fsck_err(c, journal_entries_missing, "%s", buf.buf);
+
+			seq = missing.end;
 		}
 
 		prev = i;
-		seq++;
+		seq = le64_to_cpu(i->j.seq) + 1;
 	}
fsck_err:
 	printbuf_exit(&buf);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6fa82c4050fe..f53c5c81d137 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -71,6 +71,13 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
 void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
 			       struct journal_replay *);
 
+struct u64_range {
+	u64	start;
+	u64	end;
+};
+
+struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64);
+
 int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
 
 CLOSURE_CALLBACK(bch2_journal_write);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index af4fe416d9ec..6361809b5e2e 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -103,6 +103,52 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
 	return cmp_int(l->start, r->start);
 }
 
+static int journal_seq_blacklist_table_end_cmp(const void *_l, const void *_r)
+{
+	const struct journal_seq_blacklist_table_entry *l = _l;
+	const struct journal_seq_blacklist_table_entry *r = _r;
+
+	return cmp_int(l->end, r->end);
+}
+
+u64 bch2_journal_seq_next_blacklisted(struct bch_fs *c, u64 seq)
+{
+	struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+
+	if (!t)
+		return U64_MAX;
+
+	struct journal_seq_blacklist_table_entry search = { .end = seq };
+	int idx = eytzinger0_find_gt(t->entries, t->nr,
+				     sizeof(t->entries[0]),
+				     journal_seq_blacklist_table_end_cmp,
+				     &search);
+	if (idx < 0)
+		return U64_MAX;
+
+	return max(seq, t->entries[idx].start);
+}
+
+u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *c, u64 seq)
+{
+	struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+
+	if (!t)
+		return seq;
+
+	while (true) {
+		struct journal_seq_blacklist_table_entry search = { .start = seq };
+		int idx = eytzinger0_find_le(t->entries, t->nr,
+					     sizeof(t->entries[0]),
+					     journal_seq_blacklist_table_cmp,
+					     &search);
+		if (idx < 0 || t->entries[idx].end <= seq)
+			return seq;
+
+		seq = t->entries[idx].end;
+	}
+}
+
 bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
 				     bool dirty)
 {
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index f06942ccfcdd..389b789b26f4 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -11,6 +11,9 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
 		: 0;
 }
 
+u64 bch2_journal_seq_next_blacklisted(struct bch_fs *, u64);
+u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *, u64);
+
 bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
 u64 bch2_journal_last_blacklisted_seq(struct bch_fs *);
 int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 4a7a60588c10..63f8e254495c 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -343,12 +343,6 @@ enum fsck_err_opts {
 	  OPT_UINT(0, U32_MAX),						\
 	  BCH_SB_JOURNAL_RECLAIM_DELAY,	100,				\
 	  NULL,		"Delay in milliseconds before automatic journal reclaim")\
-	x(large_journal,		bool,				\
-	  OPT_FS|OPT_MOUNT|OPT_FORMAT,					\
-	  OPT_BOOL(),							\
-	  BCH2_NO_SB_OPT,		false,				\
-	  NULL,		"Allocate a bigger than normal journal: recovery from unclean "\
-			"shutdown will be slower, but more info will be available for debugging")\
 	x(move_bytes_in_flight,		u32,				\
 	  OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME,		\
 	  OPT_UINT(1024, U32_MAX),					\
@@ -395,11 +389,6 @@ enum fsck_err_opts {
 	  OPT_UINT(0, U64_MAX),						\
 	  BCH2_NO_SB_OPT,		0,				\
 	  NULL,		"Rewind journal")				\
-	x(journal_rewind_no_extents,	bool,				\
-	  OPT_FS|OPT_MOUNT,						\
-	  OPT_BOOL(),							\
-	  BCH2_NO_SB_OPT,		0,				\
-	  NULL,		"Don't rewind extents when rewinding journal")	\
 	x(recovery_passes,		u64,				\
 	  OPT_FS|OPT_MOUNT,						\
 	  OPT_BITFIELD(bch2_recovery_passes),				\
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 974f8bf9a574..0def4ecb7f88 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -273,24 +273,35 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
 		goto out;
 
 	struct btree_path *path = btree_iter_path(trans, &iter);
-	if (unlikely(!btree_path_node(path, k->level) &&
-		     !k->allocated)) {
+	if (unlikely(!btree_path_node(path, k->level))) {
 		struct bch_fs *c = trans->c;
 
+		CLASS(printbuf, buf)();
+		prt_str(&buf, "btree=");
+		bch2_btree_id_to_text(&buf, k->btree_id);
+		prt_printf(&buf, " level=%u ", k->level);
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+
 		if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
 						     BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
-			bch_err(c, "have key in journal replay for btree depth that does not exist, confused");
+			bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
+				buf.buf);
 			ret = -EINVAL;
 		}
-#if 0
+
+		if (!k->allocated) {
+			bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
+				   buf.buf);
+			k->overwritten = true;
+			goto out;
+		}
+
 		bch2_trans_iter_exit(trans, &iter);
 		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
 					  BTREE_MAX_DEPTH, 0, iter_flags);
 		ret =   bch2_btree_iter_traverse(trans, &iter) ?:
 			bch2_btree_increase_depth(trans, iter.path, 0) ?:
 			-BCH_ERR_transaction_restart_nested;
-#endif
-		k->overwritten = true;
 		goto out;
 	}
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 9324ef32903d..b5dae1145afa 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1330,11 +1330,6 @@ DEFINE_EVENT(fs_str, data_update,
 	TP_ARGS(c, str)
 );
 
-DEFINE_EVENT(fs_str, data_update_done_no_rw_devs,
-	TP_PROTO(struct bch_fs *c, const char *str),
-	TP_ARGS(c, str)
-);
-
 DEFINE_EVENT(fs_str, io_move_pred,
 	TP_PROTO(struct bch_fs *c, const char *str),
 	TP_ARGS(c, str)
 );