Diffstat (limited to 'fs')
-rw-r--r--  fs/bcachefs/alloc_foreground.c      | 282
-rw-r--r--  fs/bcachefs/alloc_foreground.h      |   7
-rw-r--r--  fs/bcachefs/btree_cache.c           |  26
-rw-r--r--  fs/bcachefs/btree_cache.h           |   1
-rw-r--r--  fs/bcachefs/btree_io.c              |  62
-rw-r--r--  fs/bcachefs/btree_journal_iter.c    |  17
-rw-r--r--  fs/bcachefs/btree_node_scan.c       |  82
-rw-r--r--  fs/bcachefs/data_update.c           |  13
-rw-r--r--  fs/bcachefs/debug.c                 |  11
-rw-r--r--  fs/bcachefs/ec.c                    |  38
-rw-r--r--  fs/bcachefs/ec.h                    |   2
-rw-r--r--  fs/bcachefs/errcode.h               |   6
-rw-r--r--  fs/bcachefs/error.c                 |   2
-rw-r--r--  fs/bcachefs/extents.c               |  16
-rw-r--r--  fs/bcachefs/fs-io-buffered.c        |   2
-rw-r--r--  fs/bcachefs/fs.c                    |  20
-rw-r--r--  fs/bcachefs/fsck.c                  |  36
-rw-r--r--  fs/bcachefs/io_misc.c               |  27
-rw-r--r--  fs/bcachefs/io_misc.h               |   2
-rw-r--r--  fs/bcachefs/io_read.h               |   2
-rw-r--r--  fs/bcachefs/journal.c               |  14
-rw-r--r--  fs/bcachefs/journal_io.c            |  55
-rw-r--r--  fs/bcachefs/journal_io.h            |   7
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c |  46
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.h |   3
-rw-r--r--  fs/bcachefs/opts.h                  |  11
-rw-r--r--  fs/bcachefs/recovery.c              |  23
-rw-r--r--  fs/bcachefs/trace.h                 |   5
28 files changed, 443 insertions, 375 deletions
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 23a9fbb36f49..1a8fa5482653 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -206,8 +206,7 @@ static inline bool may_alloc_bucket(struct bch_fs *c,
static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
struct alloc_request *req,
- u64 bucket, u8 gen,
- struct closure *cl)
+ u64 bucket, u8 gen)
{
struct bch_dev *ca = req->ca;
@@ -222,12 +221,18 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
spin_lock(&c->freelist_lock);
if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
- if (cl)
- closure_wait(&c->open_buckets_wait, cl);
-
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
+
+ int ret;
+ if (req->cl && !(req->flags & BCH_WRITE_alloc_nowait)) {
+ closure_wait(&c->open_buckets_wait, req->cl);
+ ret = bch_err_throw(c, open_bucket_alloc_blocked);
+ } else {
+ ret = bch_err_throw(c, open_buckets_empty);
+ }
+
spin_unlock(&c->freelist_lock);
- return ERR_PTR(bch_err_throw(c, open_buckets_empty));
+ return ERR_PTR(ret);
}
/* Recheck under lock: */
@@ -259,8 +264,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
struct alloc_request *req,
- struct btree_iter *freespace_iter,
- struct closure *cl)
+ struct btree_iter *freespace_iter)
{
struct bch_fs *c = trans->c;
u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
@@ -275,7 +279,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
if (ret)
return NULL;
- return __try_alloc_bucket(c, req, b, gen, cl);
+ return __try_alloc_bucket(c, req, b, gen);
}
/*
@@ -283,8 +287,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
*/
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
+ struct alloc_request *req)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = req->ca;
@@ -348,7 +351,7 @@ again:
req->counters.buckets_seen++;
ob = may_alloc_bucket(c, req, k.k->p)
- ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
+ ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen)
: NULL;
next:
bch2_set_btree_iter_dontneed(trans, &citer);
@@ -374,8 +377,7 @@ next:
}
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
+ struct alloc_request *req)
{
struct bch_dev *ca = req->ca;
struct btree_iter iter;
@@ -417,7 +419,7 @@ again:
goto next;
}
- ob = try_alloc_bucket(trans, req, &iter, cl);
+ ob = try_alloc_bucket(trans, req, &iter);
if (ob) {
if (!IS_ERR(ob))
*dev_alloc_cursor = iter.pos.offset;
@@ -450,7 +452,6 @@ fail:
static noinline void trace_bucket_alloc2(struct bch_fs *c,
struct alloc_request *req,
- struct closure *cl,
struct open_bucket *ob)
{
struct printbuf buf = PRINTBUF;
@@ -460,7 +461,8 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c,
prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
- prt_printf(&buf, "blocking\t%u\n", cl != NULL);
+ prt_printf(&buf, "blocking\t%u\n", !req->will_retry_target_devices &&
+ !req->will_retry_all_devices);
prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
@@ -488,28 +490,23 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c,
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
* @trans: transaction object
* @req: state for the entire allocation
- * @cl: if not NULL, closure to be used to wait if buckets not available
- * @nowait: if true, do not wait for buckets to become available
*
* Returns: an open_bucket on success, or an ERR_PTR() on failure.
*/
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl,
- bool nowait)
+ struct alloc_request *req)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = req->ca;
struct open_bucket *ob = NULL;
bool freespace = READ_ONCE(ca->mi.freespace_initialized);
- u64 avail;
- bool waiting = nowait;
+ bool waiting = false;
req->btree_bitmap = req->data_type == BCH_DATA_btree;
memset(&req->counters, 0, sizeof(req->counters));
again:
bch2_dev_usage_read_fast(ca, &req->usage);
- avail = dev_buckets_free(ca, req->usage, req->watermark);
+ u64 avail = dev_buckets_free(ca, req->usage, req->watermark);
if (req->usage.buckets[BCH_DATA_need_discard] > avail)
bch2_dev_do_discards(ca);
@@ -525,8 +522,11 @@ again:
c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
goto alloc;
- if (cl && !waiting) {
- closure_wait(&c->freelist_wait, cl);
+ if (req->cl &&
+ !req->will_retry_target_devices &&
+ !req->will_retry_all_devices &&
+ !(req->flags & BCH_WRITE_alloc_nowait)) {
+ closure_wait(&c->freelist_wait, req->cl);
waiting = true;
goto again;
}
@@ -541,8 +541,8 @@ again:
closure_wake_up(&c->freelist_wait);
alloc:
ob = likely(freespace)
- ? bch2_bucket_alloc_freelist(trans, req, cl)
- : bch2_bucket_alloc_early(trans, req, cl);
+ ? bch2_bucket_alloc_freelist(trans, req)
+ : bch2_bucket_alloc_early(trans, req);
if (req->counters.need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
@@ -571,7 +571,7 @@ err:
if (!IS_ERR(ob)
? trace_bucket_alloc_enabled()
: trace_bucket_alloc_fail_enabled())
- trace_bucket_alloc2(c, req, cl, ob);
+ trace_bucket_alloc2(c, req, ob);
return ob;
}
@@ -583,13 +583,14 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
{
struct open_bucket *ob;
struct alloc_request req = {
+ .cl = cl,
.watermark = watermark,
.data_type = data_type,
.ca = ca,
};
bch2_trans_do(c,
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req)));
return ob;
}
@@ -703,18 +704,26 @@ static int add_new_bucket(struct bch_fs *c,
return 0;
}
-inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
- struct alloc_request *req,
- struct dev_stripe_state *stripe,
- struct closure *cl)
+int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+ struct alloc_request *req,
+ struct dev_stripe_state *stripe)
{
struct bch_fs *c = trans->c;
+ struct closure *cl = NULL;
int ret = 0;
BUG_ON(req->nr_effective >= req->nr_replicas);
+ /*
+ * Try nonblocking first, so that if one device is full we'll try from
+ * other devices:
+ */
+retry_blocking:
bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
+ if (req->devs_sorted.nr == 1)
+ req->will_retry_target_devices = false;
+
darray_for_each(req->devs_sorted, i) {
req->ca = bch2_dev_tryget_noerror(c, *i);
if (!req->ca)
@@ -725,8 +734,7 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
- req->flags & BCH_WRITE_alloc_nowait);
+ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
bch2_dev_put(req->ca);
@@ -745,6 +753,14 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
if (ret == 1)
return 0;
+
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ req->will_retry_target_devices) {
+ req->will_retry_target_devices = false;
+ goto retry_blocking;
+ }
+
if (ret)
return ret;
return bch_err_throw(c, insufficient_devices);
@@ -759,20 +775,13 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
*/
static int bucket_alloc_from_stripe(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
+ struct alloc_request *req)
{
struct bch_fs *c = trans->c;
int ret = 0;
- if (req->nr_replicas < 2)
- return 0;
-
- if (ec_open_bucket(c, &req->ptrs))
- return 0;
-
struct ec_stripe_head *h =
- bch2_ec_stripe_head_get(trans, req, 0, cl);
+ bch2_ec_stripe_head_get(trans, req, 0);
if (IS_ERR(h))
return PTR_ERR(h);
if (!h)
@@ -887,79 +896,6 @@ unlock:
return ret;
}
-static int __open_bucket_add_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *_cl)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob;
- struct closure *cl = NULL;
- unsigned i;
- int ret;
-
- req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
-
- /* Don't allocate from devices we already have pointers to: */
- darray_for_each(*req->devs_have, i)
- __clear_bit(*i, req->devs_may_alloc.d);
-
- open_bucket_for_each(c, &req->ptrs, ob, i)
- __clear_bit(ob->dev, req->devs_may_alloc.d);
-
- ret = bucket_alloc_set_writepoint(c, req);
- if (ret)
- return ret;
-
- ret = bucket_alloc_set_partial(c, req);
- if (ret)
- return ret;
-
- if (req->ec) {
- ret = bucket_alloc_from_stripe(trans, req, _cl);
- } else {
-retry_blocking:
- /*
- * Try nonblocking first, so that if one device is full we'll try from
- * other devices:
- */
- ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
- !cl && _cl) {
- cl = _cl;
- goto retry_blocking;
- }
- }
-
- return ret;
-}
-
-static int open_bucket_add_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- int ret;
-
- if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
- ret = __open_bucket_add_buckets(trans, req, cl);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
- bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
- bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- return ret;
- if (req->nr_effective >= req->nr_replicas)
- return 0;
- }
-
- bool ec = false;
- swap(ec, req->ec);
- ret = __open_bucket_add_buckets(trans, req, cl);
- swap(ec, req->ec);
-
- return ret < 0 ? ret : 0;
-}
-
/**
* should_drop_bucket - check if this is open_bucket should go away
* @ob: open_bucket to predicate on
@@ -1255,72 +1191,94 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
+ if (nr_replicas < 2)
+ erasure_code = false;
+
+ req->cl = cl;
req->nr_replicas = nr_replicas;
req->target = target;
- req->ec = erasure_code;
req->watermark = watermark;
req->flags = flags;
req->devs_have = devs_have;
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
- req->ptrs.nr = 0;
- req->nr_effective = 0;
- req->have_cache = false;
- write_points_nr = c->write_points_nr;
+ req->ec = erasure_code;
+ req->will_retry_target_devices = true;
+ req->will_retry_all_devices = true;
+ req->ptrs.nr = 0;
+ req->nr_effective = 0;
+ req->have_cache = false;
+ write_points_nr = c->write_points_nr;
*wp_ret = req->wp = writepoint_find(trans, write_point.v);
req->data_type = req->wp->data_type;
+ /* metadata may not allocate on cache devices: */
+ if (req->data_type != BCH_DATA_user)
+ req->have_cache = true;
+
ret = bch2_trans_relock(trans);
if (ret)
goto err;
- /* metadata may not allocate on cache devices: */
- if (req->data_type != BCH_DATA_user)
- req->have_cache = true;
+ while (1) {
+ req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
- if (target && !(flags & BCH_WRITE_only_specified_devs)) {
- ret = open_bucket_add_buckets(trans, req, NULL);
- if (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto alloc_done;
-
- /* Don't retry from all devices if we're out of open buckets: */
- if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret2 = open_bucket_add_buckets(trans, req, cl);
- if (!ret2 ||
- bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
- ret = ret2;
- goto alloc_done;
- }
- }
+ /* Don't allocate from devices we already have pointers to: */
+ darray_for_each(*req->devs_have, i)
+ __clear_bit(*i, req->devs_may_alloc.d);
- /*
- * Only try to allocate cache (durability = 0 devices) from the
- * specified target:
- */
- req->have_cache = true;
- req->target = 0;
+ open_bucket_for_each(c, &req->ptrs, ob, i)
+ __clear_bit(ob->dev, req->devs_may_alloc.d);
- ret = open_bucket_add_buckets(trans, req, cl);
- } else {
- ret = open_bucket_add_buckets(trans, req, cl);
- }
-alloc_done:
- BUG_ON(!ret && req->nr_effective < req->nr_replicas);
+ ret = bucket_alloc_set_writepoint(c, req) ?:
+ bucket_alloc_set_partial(c, req) ?:
+ (req->ec
+ ? bucket_alloc_from_stripe(trans, req)
+ : bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe));
- if (erasure_code && !ec_open_bucket(c, &req->ptrs))
- pr_debug("failed to get ec bucket: ret %u", ret);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ goto err;
+
+ if (ret == -BCH_ERR_freelist_empty ||
+ ret == -BCH_ERR_insufficient_devices) {
+ if (req->will_retry_all_devices) {
+ BUG_ON(!req->will_retry_all_devices);
+ req->will_retry_all_devices = false;
+ /*
+ * Only try to allocate cache (durability = 0 devices) from the
+ * specified target:
+ */
+ if (req->target &&
+ (!(flags & BCH_WRITE_only_specified_devs) ||
+ (ret == -BCH_ERR_insufficient_devices))) {
+ req->have_cache = true;
+ req->target = 0;
+ }
+ continue;
+ }
- if (ret == -BCH_ERR_insufficient_devices &&
- req->nr_effective >= nr_replicas_required)
- ret = 0;
+ if (ret == -BCH_ERR_insufficient_devices &&
+ req->nr_effective >= nr_replicas_required)
+ ret = 0;
+ else
+ goto err;
+ }
- if (ret)
- goto err;
+ if (req->nr_effective < req->nr_replicas && req->ec) {
+ req->ec = false;
+ req->will_retry_target_devices = true;
+ req->will_retry_all_devices = true;
+ continue;
+ }
+
+ BUG_ON(req->nr_effective < nr_replicas_required);
+ BUG_ON(ret < 0);
+ break;
+ }
if (req->nr_effective > req->nr_replicas)
deallocate_extra_replicas(c, req);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 1b3fc8460096..90eb8604a0a2 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -26,9 +26,12 @@ struct dev_alloc_list {
};
struct alloc_request {
+ struct closure *cl;
unsigned nr_replicas;
unsigned target;
- bool ec;
+ bool ec:1;
+ bool will_retry_target_devices:1;
+ bool will_retry_all_devices:1;
enum bch_watermark watermark;
enum bch_write_flags flags;
enum bch_data_type data_type;
@@ -224,7 +227,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
enum bch_write_flags;
int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
- struct dev_stripe_state *, struct closure *);
+ struct dev_stripe_state *);
int bch2_alloc_sectors_start_trans(struct btree_trans *,
unsigned, unsigned,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index a3631a903ecf..49505653fe12 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -86,7 +86,7 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
-static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+void __btree_node_data_free(struct btree *b)
{
BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
@@ -113,16 +113,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
b->aux_data = NULL;
-
- btree_node_to_freedlist(bc, b);
}
static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
BUG_ON(list_empty(&b->list));
list_del_init(&b->list);
+
+ __btree_node_data_free(b);
+
--bc->nr_freeable;
- __btree_node_data_free(bc, b);
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -186,10 +187,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
return NULL;
@@ -199,8 +197,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
}
bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
-
- __bch2_btree_node_to_freelist(bc, b);
return b;
}
@@ -526,7 +522,8 @@ restart:
--touched;;
} else if (!btree_node_reclaim(c, b)) {
__bch2_btree_node_hash_remove(bc, b);
- __btree_node_data_free(bc, b);
+ __btree_node_data_free(b);
+ btree_node_to_freedlist(bc, b);
freed++;
bc->nr_freed++;
@@ -667,9 +664,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->nr_reserve; i++)
- if (!__bch2_btree_node_mem_alloc(c))
+ for (i = 0; i < bc->nr_reserve; i++) {
+ struct btree *b = __bch2_btree_node_mem_alloc(c);
+ if (!b)
goto err;
+ __bch2_btree_node_to_freelist(bc, b);
+ }
list_splice_init(&bc->live[0].list, &bc->freeable);
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 3264801cbcbe..649e9dfd178a 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
+void __btree_node_data_free(struct btree *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 19fd951495ac..b30799e494eb 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -26,6 +26,12 @@
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_btree_read_corrupt_ratio;
+module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(btree_read_corrupt_ratio, "");
+#endif
+
static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
{
bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
@@ -568,9 +574,9 @@ static int __btree_err(int ret,
bch2_mark_btree_validate_failure(failed, ca->dev_idx);
struct extent_ptr_decoded pick;
- have_retry = !bch2_bkey_pick_read_device(c,
+ have_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
- failed, &pick, -1);
+ failed, &pick, -1) == 1;
}
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
@@ -615,7 +621,6 @@ static int __btree_err(int ret,
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
- ret = __bch2_topology_error(c, &out);
break;
}
@@ -644,7 +649,6 @@ static int __btree_err(int ret,
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
- ret = __bch2_topology_error(c, &out);
break;
}
print:
@@ -1337,15 +1341,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+ * This is because btree node rewrites generate more updates for the
+ * interior updates (alloc, backpointers), and if those updates touch
+ * new nodes and generate more rewrites - well, you see the problem.
+ *
+ * The biggest cause is that we don't use the btree write buffer (for
+ * the backpointer updates) - this needs some real thought on locking in
+ * order to fix.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+ * have a way of tracking degraded data - we need another index (all
+ * extents/btree nodes, by replicas entry) in order to fix properly
+ * (re-replicate degraded data at the earliest possible time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
}
- }
+ }
if (!ptr_written) {
set_btree_node_need_rewrite(b);
@@ -1381,7 +1412,7 @@ static void btree_node_read_work(struct work_struct *work)
ret = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
&failed, &rb->pick, -1);
- if (ret) {
+ if (ret <= 0) {
set_btree_node_read_error(b);
break;
}
@@ -1412,6 +1443,11 @@ start:
continue;
}
+ memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
+ bio->bi_iter.bi_size = btree_buf_bytes(b);
+
+ bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
+
ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
ret == -BCH_ERR_btree_node_read_err_must_retry)
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 341d31b3a1f1..ea839560a136 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -717,18 +717,6 @@ static void __journal_keys_sort(struct journal_keys *keys)
keys->nr = dst - keys->data;
}
-static bool should_rewind_entry(struct bch_fs *c, struct jset_entry *entry)
-{
- if (entry->level)
- return false;
- if (btree_id_is_alloc(entry->btree_id))
- return false;
- if (c->opts.journal_rewind_no_extents &&
- entry->btree_id == BTREE_ID_extents)
- return false;
- return true;
-}
-
int bch2_journal_keys_sort(struct bch_fs *c)
{
struct genradix_iter iter;
@@ -747,8 +735,9 @@ int bch2_journal_keys_sort(struct bch_fs *c)
cond_resched();
vstruct_for_each(&i->j, entry) {
- bool rewind = le64_to_cpu(i->j.seq) >= rewind_seq &&
- should_rewind_entry(c, entry);
+ bool rewind = !entry->level &&
+ !btree_id_is_alloc(entry->btree_id) &&
+ le64_to_cpu(i->j.seq) >= rewind_seq;
if (entry->type != (rewind
? BCH_JSET_ENTRY_overwrite
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 23d8c62ea4b6..365808b4b7c0 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -75,39 +75,6 @@ static inline u64 bkey_journal_seq(struct bkey_s_c k)
}
}
-static bool found_btree_node_is_readable(struct btree_trans *trans,
- struct found_btree_node *f)
-{
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
- found_btree_node_to_key(&tmp.k, f);
-
- struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
- bool ret = !IS_ERR_OR_NULL(b);
- if (!ret)
- return ret;
-
- f->sectors_written = b->written;
- f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
-
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
- for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
- f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
-
- six_unlock_read(&b->c.lock);
-
- /*
- * We might update this node's range; if that happens, we need the node
- * to be re-read so the read path can trim keys that are no longer in
- * this node
- */
- if (b != btree_node_root(trans->c, b))
- bch2_btree_node_evict(trans, &tmp.k);
- return ret;
-}
-
static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
const struct found_btree_node *l = _l;
@@ -159,17 +126,17 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = {
};
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
- struct bio *bio, struct btree_node *bn, u64 offset)
+ struct btree *b, struct bio *bio, u64 offset)
{
struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct btree_node *bn = b->data;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, bn, PAGE_SIZE);
+ bch2_bio_map(bio, b->data, c->opts.block_size);
u64 submit_time = local_clock();
submit_bio_wait(bio);
-
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
if (bio->bi_status) {
@@ -201,6 +168,14 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
return;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, b->data, c->opts.btree_node_size);
+
+ submit_time = local_clock();
+ submit_bio_wait(bio);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
@@ -217,7 +192,20 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
};
rcu_read_unlock();
- if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ found_btree_node_to_key(&b->key, &n);
+
+ CLASS(printbuf, buf)();
+ if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
+ /* read_done will swap out b->data for another buffer */
+ bn = b->data;
+ /*
+ * Grab journal_seq here because we want the max journal_seq of
+ * any bset; read_done sorts down to a single set and picks the
+ * max journal_seq
+ */
+ n.journal_seq = le64_to_cpu(bn->keys.journal_seq);
+ n.sectors_written = b->written;
+
mutex_lock(&f->lock);
if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
bch_err(c, "try_read_btree_node() can't handle endian conversion");
@@ -237,12 +225,18 @@ static int read_btree_nodes_worker(void *p)
struct find_btree_nodes_worker *w = p;
struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
struct bch_dev *ca = w->ca;
- void *buf = (void *) __get_free_page(GFP_KERNEL);
- struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
unsigned long last_print = jiffies;
- if (!buf || !bio) {
- bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ struct btree *b = __bch2_btree_node_mem_alloc(c);
+ if (!b) {
+ bch_err(c, "read_btree_nodes_worker: error allocating buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
+
+ struct bio *bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
+ if (!bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio");
w->f->ret = -ENOMEM;
goto err;
}
@@ -266,11 +260,13 @@ static int read_btree_nodes_worker(void *p)
!bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
continue;
- try_read_btree_node(w->f, ca, bio, buf, sector);
+ try_read_btree_node(w->f, ca, b, bio, sector);
}
err:
+ if (b)
+ __btree_node_data_free(b);
+ kfree(b);
bio_put(bio);
- free_page((unsigned long) buf);
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
closure_put(w->cl);
kfree(w);
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 3968f3be7f3b..e848e210a9bf 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -783,9 +783,6 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
darray_for_each(m->op.devs_have, i)
__clear_bit(*i, devs.d);
- CLASS(printbuf, buf)();
- buf.atomic++;
-
guard(rcu)();
unsigned nr_replicas = 0, i;
@@ -797,11 +794,7 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
struct bch_dev_usage usage;
bch2_dev_usage_read_fast(ca, &usage);
- u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark);
-
- prt_printf(&buf, "%s=%llu ", ca->name, nr_free);
-
- if (!nr_free)
+ if (!dev_buckets_free(ca, usage, m->op.watermark))
continue;
nr_replicas += ca->mi.durability;
@@ -809,10 +802,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
break;
}
- if (!nr_replicas) {
- trace_data_update_done_no_rw_devs(c, buf.buf);
+ if (!nr_replicas)
return bch_err_throw(c, data_update_done_no_rw_devs);
- }
if (nr_replicas < m->op.nr_replicas)
return bch_err_throw(c, insufficient_devices);
return 0;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 901f643ead83..07c2a0f73cc2 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -153,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
c->verify_data = __bch2_btree_node_mem_alloc(c);
if (!c->verify_data)
goto out;
-
- list_del_init(&c->verify_data->list);
}
BUG_ON(b->nsets != 1);
@@ -586,6 +584,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
i->ubuf = buf;
i->size = size;
i->ret = 0;
+
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
restart:
seqmutex_lock(&c->btree_trans_lock);
list_sort(&c->btree_trans_list, list_ptr_order_cmp);
@@ -599,6 +599,11 @@ restart:
if (!closure_get_not_zero(&trans->ref))
continue;
+ if (!trans->srcu_held) {
+ closure_put(&trans->ref);
+ continue;
+ }
+
u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
@@ -620,6 +625,8 @@ restart:
}
seqmutex_unlock(&c->btree_trans_lock);
unlocked:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 687c3ba98095..71956ee86a9c 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1720,8 +1720,7 @@ err:
static int new_stripe_alloc_buckets(struct btree_trans *trans,
struct alloc_request *req,
- struct ec_stripe_head *h, struct ec_stripe_new *s,
- struct closure *cl)
+ struct ec_stripe_head *h, struct ec_stripe_new *s)
{
struct bch_fs *c = trans->c;
struct open_bucket *ob;
@@ -1771,7 +1770,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
req->nr_effective = nr_have_parity;
req->data_type = BCH_DATA_parity;
- ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe);
open_bucket_for_each(c, &req->ptrs, ob, i) {
j = find_next_zero_bit(s->blocks_gotten,
@@ -1794,7 +1793,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
req->nr_effective = nr_have_data;
req->data_type = BCH_DATA_user;
- ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe);
open_bucket_for_each(c, &req->ptrs, ob, i) {
j = find_next_zero_bit(s->blocks_gotten,
@@ -1926,7 +1925,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
}
bch2_trans_iter_exit(trans, &lru_iter);
if (!ret)
- ret = bch_err_throw(c, stripe_alloc_blocked);
+ return bch_err_throw(c, stripe_alloc_blocked);
if (ret == 1)
ret = 0;
if (ret)
@@ -1998,8 +1997,7 @@ err:
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
struct alloc_request *req,
- unsigned algo,
- struct closure *cl)
+ unsigned algo)
{
struct bch_fs *c = trans->c;
unsigned redundancy = req->nr_replicas - 1;
@@ -2041,12 +2039,18 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
if (s->have_existing_stripe)
goto alloc_existing;
+
/* First, try to allocate a full stripe: */
enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
- swap(req->watermark, saved_watermark);
- ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
+ unsigned saved_flags = req->flags | BCH_WRITE_alloc_nowait;
+ swap(req->watermark, saved_watermark);
+ swap(req->flags, saved_flags);
+
+ ret = new_stripe_alloc_buckets(trans, req, h, s) ?:
__bch2_ec_stripe_head_reserve(trans, h, s);
- swap(req->watermark, saved_watermark);
+
+ swap(req->watermark, saved_watermark);
+ swap(req->flags, saved_flags);
if (!ret)
goto allocate_buf;
@@ -2062,19 +2066,25 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
ret = __bch2_ec_stripe_head_reuse(trans, h, s);
if (!ret)
break;
- if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
+ if (waiting ||
+ (req->flags & BCH_WRITE_alloc_nowait) ||
+ ret != -BCH_ERR_stripe_alloc_blocked)
goto err;
if (req->watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
+ /* Don't self-deadlock copygc */
+ swap(req->flags, saved_flags);
+ ret = new_stripe_alloc_buckets(trans, req, h, s) ?:
__bch2_ec_stripe_head_reserve(trans, h, s);
+ swap(req->flags, saved_flags);
+
if (ret)
goto err;
goto allocate_buf;
}
/* XXX freelist_wait? */
- closure_wait(&c->freelist_wait, cl);
+ closure_wait(&c->freelist_wait, req->cl);
waiting = true;
}
@@ -2085,7 +2095,7 @@ alloc_existing:
* Retry allocating buckets, with the watermark for this
* particular write:
*/
- ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
+ ret = new_stripe_alloc_buckets(trans, req, h, s);
if (ret)
goto err;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 548048adf0d5..756f14bd7bb7 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -258,7 +258,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
struct alloc_request;
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
- struct alloc_request *, unsigned, struct closure *);
+ struct alloc_request *, unsigned);
void bch2_do_stripe_deletes(struct bch_fs *);
void bch2_ec_do_stripe_creates(struct bch_fs *);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index d27b94a6610a..a66c96bd9556 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -236,6 +236,9 @@
x(0, operation_blocked) \
x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
x(BCH_ERR_operation_blocked, journal_res_blocked) \
+ x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \
+ x(BCH_ERR_operation_blocked, open_bucket_alloc_blocked) \
+ x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \
x(BCH_ERR_journal_res_blocked, journal_blocked) \
x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \
x(BCH_ERR_journal_res_blocked, journal_max_open) \
@@ -244,8 +247,6 @@
x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \
x(BCH_ERR_journal_res_blocked, journal_stuck) \
x(BCH_ERR_journal_res_blocked, journal_retry_open) \
- x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \
- x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \
x(BCH_ERR_invalid, invalid_sb) \
x(BCH_ERR_invalid_sb, invalid_sb_magic) \
x(BCH_ERR_invalid_sb, invalid_sb_version) \
@@ -289,7 +290,6 @@
x(EIO, sb_not_downgraded) \
x(EIO, btree_node_write_all_failed) \
x(EIO, btree_node_read_error) \
- x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
x(EIO, bucket_ref_update) \
x(EIO, trigger_alloc) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index a9a9fe193966..71649b4164b8 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -103,7 +103,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
return bch_err_throw(c, btree_need_topology_repair);
} else {
return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
- bch_err_throw(c, btree_node_read_validate_error);
+ bch_err_throw(c, btree_need_topology_repair);
}
}
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 036e4ad95987..83cbd77dcb9c 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -50,19 +50,17 @@ void bch2_io_failures_to_text(struct printbuf *out,
struct bch_io_failures *failed)
{
static const char * const error_types[] = {
- "io", "checksum", "ec reconstruct", NULL
+ "btree validate", "io", "checksum", "ec reconstruct", NULL
};
for (struct bch_dev_io_failures *f = failed->devs;
f < failed->devs + failed->nr;
f++) {
unsigned errflags =
- ((!!f->failed_io) << 0) |
- ((!!f->failed_csum_nr) << 1) |
- ((!!f->failed_ec) << 2);
-
- if (!errflags)
- continue;
+ ((!!f->failed_btree_validate) << 0) |
+ ((!!f->failed_io) << 1) |
+ ((!!f->failed_csum_nr) << 2) |
+ ((!!f->failed_ec) << 3);
bch2_printbuf_make_room(out, 1024);
out->atomic++;
@@ -77,7 +75,9 @@ void bch2_io_failures_to_text(struct printbuf *out,
prt_char(out, ' ');
- if (is_power_of_2(errflags)) {
+ if (!errflags) {
+ prt_str(out, "no error - confused");
+ } else if (is_power_of_2(errflags)) {
prt_bitflags(out, error_types, errflags);
prt_str(out, " error");
} else {
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index dad48d44f47b..4e82dfa6c03f 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -257,7 +257,7 @@ err:
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
- prt_printf(&buf, "read error %i from btree lookup", ret);
+ prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f9bc99eb2d02..3b0783f117ae 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1692,11 +1692,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
s.mask = map_defined(bch_flags_to_xflags);
s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
- if (fa->fsx_xflags)
- return bch_err_throw(c, unsupported_fsx_flag);
+ if (fa->fsx_xflags) {
+ ret = bch_err_throw(c, unsupported_fsx_flag);
+ goto err;
+ }
- if (fa->fsx_projid >= U32_MAX)
- return bch_err_throw(c, projid_too_big);
+ if (fa->fsx_projid >= U32_MAX) {
+ ret = bch_err_throw(c, projid_too_big);
+ goto err;
+ }
/*
* inode fields accessible via the xattr interface are stored with a +1
@@ -1718,8 +1722,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
fa->flags &= ~FS_CASEFOLD_FL;
s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
- if (fa->flags)
- return bch_err_throw(c, unsupported_fa_flag);
+ if (fa->flags) {
+ ret = bch_err_throw(c, unsupported_fa_flag);
+ goto err;
+ }
}
mutex_lock(&inode->ei_update_lock);
@@ -1730,7 +1736,7 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
-
+err:
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 00afe0a3593f..471e93a3f00c 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,6 +12,7 @@
#include "fs.h"
#include "fsck.h"
#include "inode.h"
+#include "io_misc.h"
#include "keylist.h"
#include "namei.h"
#include "recovery_passes.h"
@@ -1637,7 +1638,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
i->count = count2;
}
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty) &&
+ i->inode.bi_sectors != i->count,
trans, inode_i_sectors_wrong,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
w->last_pos.inode, i->inode.bi_snapshot,
@@ -1963,33 +1965,11 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
"extent type past end of inode %llu:%u, i_size %llu\n%s",
i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout));
- ret = PTR_ERR_OR_ZERO(whiteout);
- if (ret)
- goto err;
-
- bkey_init(&whiteout->k);
- whiteout->k.p = SPOS(k.k->p.inode,
- last_block,
- i->inode.bi_snapshot);
- bch2_key_resize(&whiteout->k,
- min(KEY_SIZE_MAX & (~0 << c->block_bits),
- U64_MAX - whiteout->k.p.offset));
-
-
- /*
- * Need a normal (not BTREE_ITER_all_snapshots)
- * iterator, if we're deleting in a different
- * snapshot and need to emit a whiteout
- */
- struct btree_iter iter2;
- bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents,
- bkey_start_pos(&whiteout->k),
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(trans, &iter2) ?:
- bch2_trans_update(trans, &iter2, whiteout,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter2);
+ ret = bch2_fpunch_snapshot(trans,
+ SPOS(i->inode.bi_inum,
+ last_block,
+ i->inode.bi_snapshot),
+ POS(i->inode.bi_inum, U64_MAX));
if (ret)
goto err;
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index bf72b1d2e2cb..07023667a475 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -135,6 +135,33 @@ err_noprint:
return ret;
}
+/* For fsck */
+int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
+{
+ u32 restart_count = trans->restart_count;
+ struct bch_fs *c = trans->c;
+ struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bkey_i delete;
+
+ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ start, end, 0, k,
+ &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+ bch2_cut_back(end, &delete);
+
+ bch2_extent_trim_atomic(trans, &iter, &delete) ?:
+ bch2_trans_update(trans, &iter, &delete, 0);
+ }));
+
+ bch2_disk_reservation_put(c, &disk_res);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
/*
* Returns -BCH_ERR_transacton_restart if we had to drop locks:
*/
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index 9cb44a7c43c1..b93e4d4b3c0c 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -5,6 +5,8 @@
int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
u64, struct bch_io_opts, s64 *,
struct write_point_specifier);
+
+int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);
int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index 9c5ddbf861b3..cfc8ef35b14d 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -147,7 +147,7 @@ static inline void bch2_read_extent(struct btree_trans *trans,
int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
data_btree, k, offset_into_extent, NULL, flags, -1);
/* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
- WARN(ret, "unhandled error from __bch2_read_extent()");
+ WARN(ret, "unhandled error from __bch2_read_extent(): %s", bch2_err_str(ret));
}
int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index ce5340611de6..f22b05e02c1e 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1376,6 +1376,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
return bch_err_throw(c, erofs_filesystem_full);
}
+ unsigned nr;
int ret;
if (dynamic_fault("bcachefs:add:journal_alloc")) {
@@ -1384,19 +1385,16 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
}
/* 1/128th of the device by default: */
- unsigned nr = ca->mi.nbuckets >> 7;
+ nr = ca->mi.nbuckets >> 7;
/*
- * clamp journal size to 8GB, or 32GB with large_journal option:
+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
+ * is smaller:
*/
- unsigned max_sectors = 1 << 24;
-
- if (c->opts.large_journal)
- max_sectors *= 4;
-
nr = clamp_t(unsigned, nr,
BCH_JOURNAL_BUCKETS_MIN,
- max_sectors / ca->mi.bucket_size);
+ min(1 << 13,
+ (1 << 24) / ca->mi.bucket_size));
ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
err:
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index dd3f3434c1b0..2d6ce4348a22 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1245,6 +1245,8 @@ noinline_for_stack
static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
{
struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
bool have_good = false;
@@ -1272,6 +1274,28 @@ static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_r
printbuf_exit(&buf);
}
+struct u64_range bch2_journal_entry_missing_range(struct bch_fs *c, u64 start, u64 end)
+{
+ BUG_ON(start > end);
+
+ if (start == end)
+ return (struct u64_range) {};
+
+ start = bch2_journal_seq_next_nonblacklisted(c, start);
+ if (start >= end)
+ return (struct u64_range) {};
+
+ struct u64_range missing = {
+ .start = start,
+ .end = min(end, bch2_journal_seq_next_blacklisted(c, start)),
+ };
+
+ if (missing.start == missing.end)
+ return (struct u64_range) {};
+
+ return missing;
+}
+
noinline_for_stack
static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
{
@@ -1280,6 +1304,7 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e
struct genradix_iter radix_iter;
struct journal_replay *i, **_i, *prev = NULL;
+ /* Sequence number we expect to find next, to check for missing entries */
u64 seq = start_seq;
genradix_for_each(&c->journal_entries, radix_iter, _i) {
@@ -1290,43 +1315,31 @@ static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 e
BUG_ON(seq > le64_to_cpu(i->j.seq));
- while (seq < le64_to_cpu(i->j.seq)) {
- while (seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (seq == le64_to_cpu(i->j.seq))
- break;
-
- u64 missing_start = seq;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- !bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- u64 missing_end = seq - 1;
+ struct u64_range missing;
+ while ((missing = bch2_journal_entry_missing_range(c, seq, le64_to_cpu(i->j.seq))).start) {
printbuf_reset(&buf);
prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- missing_start, missing_end,
+ missing.start, missing.end - 1,
start_seq, end_seq);
- prt_printf(&buf, "\nprev at ");
if (prev) {
+ prt_printf(&buf, "\n%llu at ", le64_to_cpu(prev->j.seq));
bch2_journal_ptrs_to_text(&buf, c, prev);
prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
- } else
- prt_printf(&buf, "(none)");
+ }
- prt_printf(&buf, "\nnext at ");
+ prt_printf(&buf, "\n%llu at ", le64_to_cpu(i->j.seq));
bch2_journal_ptrs_to_text(&buf, c, i);
prt_printf(&buf, ", continue?");
fsck_err(c, journal_entries_missing, "%s", buf.buf);
+
+ seq = missing.end;
}
prev = i;
- seq++;
+ seq = le64_to_cpu(i->j.seq) + 1;
}
fsck_err:
printbuf_exit(&buf);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6fa82c4050fe..f53c5c81d137 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -71,6 +71,13 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct journal_replay *);
+struct u64_range {
+ u64 start;
+ u64 end;
+};
+
+struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64);
+
int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
CLOSURE_CALLBACK(bch2_journal_write);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index af4fe416d9ec..6361809b5e2e 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -103,6 +103,52 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
return cmp_int(l->start, r->start);
}
+static int journal_seq_blacklist_table_end_cmp(const void *_l, const void *_r)
+{
+ const struct journal_seq_blacklist_table_entry *l = _l;
+ const struct journal_seq_blacklist_table_entry *r = _r;
+
+ return cmp_int(l->end, r->end);
+}
+
+u64 bch2_journal_seq_next_blacklisted(struct bch_fs *c, u64 seq)
+{
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+
+ if (!t)
+ return U64_MAX;
+
+ struct journal_seq_blacklist_table_entry search = { .end = seq };
+ int idx = eytzinger0_find_gt(t->entries, t->nr,
+ sizeof(t->entries[0]),
+ journal_seq_blacklist_table_end_cmp,
+ &search);
+ if (idx < 0)
+ return U64_MAX;
+
+ return max(seq, t->entries[idx].start);
+}
+
+u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *c, u64 seq)
+{
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+
+ if (!t)
+ return seq;
+
+ while (true) {
+ struct journal_seq_blacklist_table_entry search = { .start = seq };
+ int idx = eytzinger0_find_le(t->entries, t->nr,
+ sizeof(t->entries[0]),
+ journal_seq_blacklist_table_cmp,
+ &search);
+ if (idx < 0 || t->entries[idx].end <= seq)
+ return seq;
+
+ seq = t->entries[idx].end;
+ }
+}
+
bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
bool dirty)
{
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index f06942ccfcdd..389b789b26f4 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -11,6 +11,9 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
: 0;
}
+u64 bch2_journal_seq_next_blacklisted(struct bch_fs *, u64);
+u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *, u64);
+
bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
u64 bch2_journal_last_blacklisted_seq(struct bch_fs *);
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 4a7a60588c10..63f8e254495c 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -343,12 +343,6 @@ enum fsck_err_opts {
OPT_UINT(0, U32_MAX), \
BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
NULL, "Delay in milliseconds before automatic journal reclaim")\
- x(large_journal, bool, \
- OPT_FS|OPT_MOUNT|OPT_FORMAT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Allocate a bigger than normal journal: recovery from unclean "\
- "shutdown will be slower, but more info will be available for debugging")\
x(move_bytes_in_flight, u32, \
OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1024, U32_MAX), \
@@ -395,11 +389,6 @@ enum fsck_err_opts {
OPT_UINT(0, U64_MAX), \
BCH2_NO_SB_OPT, 0, \
NULL, "Rewind journal") \
- x(journal_rewind_no_extents, bool, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Don't rewind extents when rewinding journal") \
x(recovery_passes, u64, \
OPT_FS|OPT_MOUNT, \
OPT_BITFIELD(bch2_recovery_passes), \
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 974f8bf9a574..0def4ecb7f88 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -273,24 +273,35 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
goto out;
struct btree_path *path = btree_iter_path(trans, &iter);
- if (unlikely(!btree_path_node(path, k->level) &&
- !k->allocated)) {
+ if (unlikely(!btree_path_node(path, k->level))) {
struct bch_fs *c = trans->c;
+ CLASS(printbuf, buf)();
+ prt_str(&buf, "btree=");
+ bch2_btree_id_to_text(&buf, k->btree_id);
+ prt_printf(&buf, " level=%u ", k->level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+
if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
- bch_err(c, "have key in journal replay for btree depth that does not exist, confused");
+ bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
+ buf.buf);
ret = -EINVAL;
}
-#if 0
+
+ if (!k->allocated) {
+ bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
+ buf.buf);
+ k->overwritten = true;
+ goto out;
+ }
+
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
-BCH_ERR_transaction_restart_nested;
-#endif
- k->overwritten = true;
goto out;
}
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 9324ef32903d..b5dae1145afa 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1330,11 +1330,6 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, data_update_done_no_rw_devs,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
DEFINE_EVENT(fs_str, io_move_pred,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)