summaryrefslogtreecommitdiff
path: root/fs/bcachefs
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2022-03-17 20:51:27 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-22 17:09:50 -0400
commita8c752bb1d93a24a0de753e209d4f4d58d65c878 (patch)
treec0acc070a99c2c869e535b34be3fdff39ee9bd06 /fs/bcachefs
parent920e69bc3db88d3825c69190cafd43f0a1918d3b (diff)
bcachefs: New on disk format: Backpointers
This patch adds backpointers: we now have a reverse index from device and offset on that device (specifically, offset within a bucket) back to btree nodes and (non cached) data extents. The first 40 backpointers within a bucket are stored in the alloc key; after that backpointers spill over to the next backpointers btree. This is to help avoid performance regressions from additional btree updates on large streaming workloads. This patch adds all the code for creating, checking and repairing backpointers. The next patch in the series is going to use backpointers for copygc - finally getting rid of the need to scan all extents to do copygc. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/Makefile1
-rw-r--r--fs/bcachefs/alloc_background.c104
-rw-r--r--fs/bcachefs/alloc_background.h16
-rw-r--r--fs/bcachefs/alloc_foreground.c23
-rw-r--r--fs/bcachefs/backpointers.c799
-rw-r--r--fs/bcachefs/backpointers.h131
-rw-r--r--fs/bcachefs/bcachefs.h7
-rw-r--r--fs/bcachefs/bcachefs_format.h62
-rw-r--r--fs/bcachefs/bkey_methods.c4
-rw-r--r--fs/bcachefs/buckets.c48
-rw-r--r--fs/bcachefs/buckets.h19
-rw-r--r--fs/bcachefs/errcode.h1
-rw-r--r--fs/bcachefs/recovery.c35
-rw-r--r--fs/bcachefs/super.c2
14 files changed, 1186 insertions, 66 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index c0e715760c8b..456d540441ce 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
bcachefs-y := \
alloc_background.o \
alloc_foreground.o \
+ backpointers.o \
bkey.o \
bkey_methods.o \
bkey_sort.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index f75d05beaf31..58ec650a512c 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
@@ -266,12 +267,34 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
{
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
- if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) {
- prt_printf(err, "bad val size (%zu != %zu)",
- bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4));
+ if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
+ prt_printf(err, "bad val size (%lu != %u)",
+ bkey_val_u64s(k.k), alloc_v4_u64s(a.v));
return -BCH_ERR_invalid_bkey;
}
+ if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
+ prt_printf(err, "invalid backpointers_start");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ /*
+ * XXX this is wrong, we'll be checking updates that happened from
+ * before BCH_FS_CHECK_BACKPOINTERS_DONE
+ */
+ if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ unsigned i, bp_len = 0;
+
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
+ bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len;
+
+ if (bp_len > a.v->dirty_sectors) {
+ prt_printf(err, "too many backpointers");
+ return -BCH_ERR_invalid_bkey;
+ }
+ }
+
if (rw == WRITE) {
if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
prt_printf(err, "invalid data type (got %u should be %u)",
@@ -328,9 +351,19 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
return 0;
}
+static inline u64 swab40(u64 x)
+{
+ return (((x & 0x00000000ffULL) << 32)|
+ ((x & 0x000000ff00ULL) << 16)|
+ ((x & 0x0000ff0000ULL) >> 0)|
+ ((x & 0x00ff000000ULL) >> 16)|
+ ((x & 0xff00000000ULL) >> 32));
+}
+
void bch2_alloc_v4_swab(struct bkey_s k)
{
struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+ struct bch_backpointer *bp, *bps;
a->journal_seq = swab64(a->journal_seq);
a->flags = swab32(a->flags);
@@ -340,12 +373,20 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->io_time[1] = swab64(a->io_time[1]);
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+
+ bps = alloc_v4_backpointers(a);
+ for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
+ bp->bucket_offset = swab40(bp->bucket_offset);
+ bp->bucket_len = swab32(bp->bucket_len);
+ bch2_bpos_swab(&bp->pos);
+ }
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+ unsigned i;
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -374,14 +415,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_newline(out);
prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
prt_newline(out);
- prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a));
+ prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ prt_newline(out);
- printbuf_indent_sub(out, 2);
-}
+ if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
+ struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
+ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
-static inline void *alloc_v4_backpointers(struct bch_alloc_v4 *a)
-{
- return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
+ printbuf_indent_add(out, 2);
+
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
+ prt_newline(out);
+ bch2_backpointer_to_text(out, &bps[i]);
+ }
+
+ printbuf_indent_sub(out, 2);
+ }
+
+ printbuf_indent_sub(out, 2);
}
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
@@ -422,12 +474,18 @@ static noinline struct bkey_i_alloc_v4 *
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
struct bkey_i_alloc_v4 *ret;
-
if (k.k->type == KEY_TYPE_alloc_v4) {
- unsigned bytes = min(sizeof(struct bkey_i_alloc_v4), bkey_bytes(k.k));
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+ unsigned bytes = sizeof(struct bkey_i_alloc_v4) +
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) *
+ sizeof(struct bch_backpointer);
void *src, *dst;
- ret = bch2_trans_kmalloc(trans, bytes);
+ /*
+ * Reserve space for one more backpointer here:
+ * Not sketchy at doing it this way, nope...
+ */
+ ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer));
if (IS_ERR(ret))
return ret;
@@ -437,16 +495,20 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
dst = alloc_v4_backpointers(&ret->v);
+ memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) *
+ sizeof(struct bch_backpointer));
if (src < dst)
memset(src, 0, dst - src);
set_alloc_v4_u64s(ret);
} else {
- ret = bch2_trans_kmalloc(trans, sizeof(*ret));
- if (!IS_ERR(ret)) {
- bkey_alloc_v4_init(&ret->k_i);
- ret->k.p = k.k->p;
- bch2_alloc_to_v4(k, &ret->v);
- }
+ ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4) +
+ sizeof(struct bch_backpointer));
+ if (IS_ERR(ret))
+ return ret;
+
+ bkey_alloc_v4_init(&ret->k_i);
+ ret->k.p = k.k->p;
+ bch2_alloc_to_v4(k, &ret->v);
}
return ret;
}
@@ -455,8 +517,12 @@ static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_
{
if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) {
+ /*
+ * Reserve space for one more backpointer here:
+ * Not sketchy at doing it this way, nope...
+ */
struct bkey_i_alloc_v4 *ret =
- bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k));
+ bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer));
if (!IS_ERR(ret))
bkey_reassemble(&ret->k_i, k);
return ret;
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index c562aff3ac33..b843316d3846 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -73,7 +73,9 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_
static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
{
unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
- BCH_ALLOC_V4_U64s_V0);
+ BCH_ALLOC_V4_U64s_V0) +
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+ (sizeof(struct bch_backpointer) / sizeof(u64));
BUG_ON(ret > U8_MAX - BKEY_U64s);
return ret;
@@ -175,6 +177,18 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
void bch2_do_invalidates(struct bch_fs *);
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v +
+ (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0));
+}
+
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+}
+
int bch2_fs_freespace_init(struct bch_fs *);
void bch2_recalc_capacity(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index ba14cfe06515..5988aa288c98 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -14,6 +14,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_gc.h"
@@ -346,6 +347,28 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
}
+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ struct bch_backpointer bp;
+ u64 bp_offset = 0;
+
+ ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
+ &bp_offset, &bp);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ if (bp_offset != U64_MAX) {
+ /*
+ * Bucket may have data in it - we don't call
+ * bc2h_trans_inconnsistent() because fsck hasn't
+ * finished yet
+ */
+ ob = NULL;
+ goto err;
+ }
+ }
+
ob = __try_alloc_bucket(c, ca, b, reserve, a, s, cl);
if (!ob)
iter.path->preserve = false;
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
new file mode 100644
index 000000000000..6efc286cd6ba
--- /dev/null
+++ b/fs/bcachefs/backpointers.c
@@ -0,0 +1,799 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "error.h"
+
+static bool extent_matches_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct bpos bucket,
+ struct bch_backpointer bp)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket2;
+ struct bch_backpointer bp2;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
+ &bucket2, &bp2);
+ if (bpos_eq(bucket, bucket2) &&
+ !memcmp(&bp, &bp2, sizeof(bp)))
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ int rw, struct printbuf *err)
+{
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+ struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+
+ if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) {
+ prt_str(err, "incorrect value size");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
+ prt_str(err, "backpointer at wrong pos");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return 0;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+{
+ prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
+ bch2_btree_ids[bp->btree_id],
+ bp->level,
+ (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ bp->bucket_len);
+ bch2_bpos_to_text(out, bp->pos);
+}
+
+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)
+{
+ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+
+ bp.v->bucket_offset = swab32(bp.v->bucket_offset);
+ bp.v->bucket_len = swab32(bp.v->bucket_len);
+ bch2_bpos_swab(&bp.v->pos);
+}
+
+#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1)
+
+static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r)
+{
+ return cmp_int(l.bucket_offset, r.bucket_offset);
+}
+
+static int bch2_backpointer_del_by_offset(struct btree_trans *trans,
+ struct bpos bucket,
+ u64 bp_offset,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ if (bp_offset < BACKPOINTER_OFFSET_MAX) {
+ struct bch_backpointer *bps;
+ struct bkey_i_alloc_v4 *a;
+ unsigned i, nr;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ bucket,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_alloc_v4) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto err;
+ bps = alloc_v4_backpointers(&a->v);
+ nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+
+ for (i = 0; i < nr; i++) {
+ if (bps[i].bucket_offset == bp_offset)
+ goto found;
+ if (bps[i].bucket_offset > bp_offset)
+ break;
+ }
+
+ ret = -ENOENT;
+ goto err;
+found:
+ if (memcmp(&bps[i], &bp, sizeof(bp))) {
+ ret = -ENOENT;
+ goto err;
+ }
+ array_remove_item(bps, nr, i);
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+ set_alloc_v4_u64s(a);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ } else {
+ bp_offset -= BACKPOINTER_OFFSET_MAX;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, bucket, bp_offset),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+bool bch2_bucket_backpointer_del(struct btree_trans *trans,
+ struct bkey_i_alloc_v4 *a,
+ struct bch_backpointer bp)
+{
+ struct bch_backpointer *bps = alloc_v4_backpointers(&a->v);
+ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+
+ for (i = 0; i < nr; i++) {
+ int cmp = backpointer_cmp(bps[i], bp) ?:
+ memcmp(&bps[i], &bp, sizeof(bp));
+ if (!cmp) {
+ array_remove_item(bps, nr, i);
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+ set_alloc_v4_u64s(a);
+ return true;
+ }
+ if (cmp >= 0)
+ break;
+ }
+
+ return false;
+}
+
+static noinline int backpointer_mod_err(struct btree_trans *trans,
+ struct bch_backpointer bp,
+ struct bkey_s_c bp_k,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ if (insert) {
+ prt_printf(&buf, "existing backpointer found when inserting ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "found ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ } else if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ prt_printf(&buf, "backpointer not found when deleting");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "searching for ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "got ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+
+ if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ bch2_inconsistent_error(c);
+ return -EIO;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+ struct bkey_i_alloc_v4 *a,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_backpointer *bp_k;
+ struct btree_iter bp_iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, a->k.p, bp.bucket_offset),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&bp_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (insert
+ ? k.k->type
+ : (k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
+ ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ return ret;
+}
+
+/*
+ * Find the next backpointer >= *bp_offset:
+ */
+int bch2_get_next_backpointer(struct btree_trans *trans,
+ struct bpos bucket, int gen,
+ u64 *bp_offset,
+ struct bch_backpointer *dst)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bp_pos, bp_end_pos;
+ struct btree_iter alloc_iter, bp_iter = { NULL };
+ struct bkey_s_c k;
+ struct bkey_s_c_alloc_v4 a;
+ size_t i;
+ int ret;
+
+ if (*bp_offset == U64_MAX)
+ return 0;
+
+ bp_pos = bucket_pos_to_bp(c, bucket,
+ max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX);
+ bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ if (k.k->type != KEY_TYPE_alloc_v4)
+ goto done;
+
+ a = bkey_s_c_to_alloc_v4(k);
+ if (gen >= 0 && a.v->gen != gen)
+ goto done;
+
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) {
+ if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset)
+ continue;
+
+ *dst = alloc_v4_backpointers_c(a.v)[i];
+ *bp_offset = dst->bucket_offset;
+ goto out;
+ }
+
+ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
+ bp_pos, 0, k, ret) {
+ if (bpos_ge(k.k->p, bp_end_pos))
+ break;
+
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ *dst = *bkey_s_c_to_backpointer(k).v;
+ *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX;
+ goto out;
+ }
+done:
+ *bp_offset = U64_MAX;
+out:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+static void backpointer_not_found(struct btree_trans *trans,
+ struct bpos bucket,
+ u64 bp_offset,
+ struct bch_backpointer bp,
+ struct bkey_s_c k,
+ const char *thing_it_points_to)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ if (likely(!bch2_backpointers_no_use_write_buffer))
+ return;
+
+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
+ thing_it_points_to);
+ prt_printf(&buf, "bucket: ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_printf(&buf, "\n ");
+
+ if (bp_offset >= BACKPOINTER_OFFSET_MAX) {
+ struct bpos bp_pos =
+ bucket_pos_to_bp(c, bucket,
+ bp_offset - BACKPOINTER_OFFSET_MAX);
+ prt_printf(&buf, "backpointer pos: ");
+ bch2_bpos_to_text(&buf, bp_pos);
+ prt_printf(&buf, "\n ");
+ }
+
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+ bch_err_ratelimited(c, "%s", buf.buf);
+ else
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+
+ printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bucket,
+ u64 bp_offset,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0,
+ min(bp.level, c->btree_roots[bp.btree_id].level),
+ 0);
+ k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ if (bp.level == c->btree_roots[bp.btree_id].level + 1)
+ k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key);
+
+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+
+ if (unlikely(bch2_backpointers_no_use_write_buffer)) {
+ if (bp.level) {
+ struct btree *b;
+
+ /*
+ * If a backpointer for a btree node wasn't found, it may be
+ * because it was overwritten by a new btree node that hasn't
+ * been written out yet - backpointer_get_node() checks for
+ * this:
+ */
+ b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp);
+ if (!IS_ERR_OR_NULL(b))
+ return bkey_i_to_s_c(&b->key);
+
+ bch2_trans_iter_exit(trans, iter);
+
+ if (IS_ERR(b))
+ return bkey_s_c_err(PTR_ERR(b));
+ return bkey_s_c_null;
+ }
+
+ backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent");
+ }
+
+ return bkey_s_c_null;
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bucket,
+ u64 bp_offset,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+
+ BUG_ON(!bp.level);
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0,
+ bp.level - 1,
+ 0);
+ b = bch2_btree_iter_peek_node(iter);
+ if (IS_ERR(b))
+ goto err;
+
+ if (b && extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
+ return b;
+
+ if (b && btree_node_will_make_reachable(b)) {
+ b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ } else {
+ backpointer_not_found(trans, bucket, bp_offset, bp,
+ bkey_i_to_s_c(&b->key), "btree node");
+ b = NULL;
+ }
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return b;
+}
+
+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bch_dev *ca;
+ struct bkey_s_c alloc_k;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+ "backpointer for mising device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ bp_pos_to_bucket(c, k.k->p), 0);
+
+ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto out;
+
+ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+ alloc_iter.pos.inode, alloc_iter.pos.offset,
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_backpointers, POS_MIN, 0, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ bch2_check_btree_backpointer(&trans, &iter, k)));
+}
+
+static int check_bp_exists(struct btree_trans *trans,
+ struct bpos bucket_pos,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter, bp_iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c alloc_k, bp_k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0);
+ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto err;
+
+ if (alloc_k.k->type == KEY_TYPE_alloc_v4) {
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k);
+ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v);
+ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v);
+
+ for (i = 0; i < nr; i++) {
+ int cmp = backpointer_cmp(bps[i], bp) ?:
+ memcmp(&bps[i], &bp, sizeof(bp));
+ if (!cmp)
+ goto out;
+ if (cmp >= 0)
+ break;
+ }
+ } else {
+ goto missing;
+ }
+
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset),
+ 0);
+ bp_k = bch2_btree_iter_peek_slot(&bp_iter);
+ ret = bkey_err(bp_k);
+ if (ret)
+ goto err;
+
+ if (bp_k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp)))
+ goto missing;
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+missing:
+ prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
+ bch2_btree_ids[bp.btree_id], bp.level);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_printf(&buf, "\nbp pos ");
+ bch2_bpos_to_text(&buf, bp_iter.pos);
+
+ if (c->sb.version < bcachefs_metadata_version_backpointers ||
+ c->opts.reconstruct_alloc ||
+ fsck_err(c, "%s", buf.buf)) {
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k);
+
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ bch2_bucket_backpointer_mod(trans, a, bp, orig_k, true);
+ }
+
+ goto out;
+}
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek_all_levels(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ enum btree_id btree_id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct btree *b;
+ struct bkey_s_c k;
+ struct bkey_ptrs_c ptrs;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ c->btree_roots[btree_id].level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ BUG_ON(b != btree_node_root(c, b));
+
+ k = bkey_i_to_s_c(&b->key);
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k);
+ if (ret)
+ goto err;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ enum btree_id btree_id;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+ bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0,
+ depth,
+ BTREE_ITER_ALL_LEVELS|
+ BTREE_ITER_PREFETCH);
+
+ do {
+ ret = commit_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_extent_to_backpointers(&trans, &iter));
+ if (ret)
+ break;
+ } while (!bch2_btree_iter_advance(&iter));
+
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret)
+ break;
+
+ ret = commit_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_btree_root_to_backpointers(&trans, btree_id));
+ if (ret)
+ break;
+ }
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+ struct bpos bucket,
+ u64 *bp_offset)
+{
+ struct btree_iter iter;
+ struct bch_backpointer bp;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ ret = bch2_get_next_backpointer(trans, bucket, -1,
+ bp_offset, &bp);
+ if (ret || *bp_offset == U64_MAX)
+ return ret;
+
+ k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp);
+ ret = bkey_err(k);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ return 0;
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(!k.k, trans->c,
+ "%s backpointer points to missing extent\n%s",
+ *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree",
+ (bch2_backpointer_to_text(&buf, &bp), buf.buf))) {
+ ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp);
+ if (ret == -ENOENT)
+ bch_err(trans->c, "backpointer at %llu not found", *bp_offset);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ u64 bp_offset = 0;
+
+ while (!(ret = commit_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_one_backpointer(&trans, iter.pos, &bp_offset))) &&
+ bp_offset < U64_MAX)
+ bp_offset++;
+
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
+}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
new file mode 100644
index 000000000000..e1506492f022
--- /dev/null
+++ b/fs/bcachefs/backpointers.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "super.h"
+
+int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
+ int, struct printbuf *);
+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
+ .key_invalid = bch2_backpointer_invalid, \
+ .val_to_text = bch2_backpointer_k_to_text, \
+ .swab = bch2_backpointer_swab, \
+})
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
+ struct bpos bp_pos)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+ struct bpos ret;
+
+ ret = POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+
+ BUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
+
+ return ret;
+}
+
+bool bch2_bucket_backpointer_del(struct btree_trans *,
+ struct bkey_i_alloc_v4 *,
+ struct bch_backpointer);
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
+ struct bkey_i_alloc_v4 *,
+ struct bch_backpointer, struct bkey_s_c, bool);
+
+static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+ struct bkey_i_alloc_v4 *a,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_backpointer *bp_k;
+ int ret;
+
+ if (!insert &&
+ unlikely(BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v)) &&
+ bch2_bucket_backpointer_del(trans, a, bp))
+ return 0;
+
+ if (unlikely(bch2_backpointers_no_use_write_buffer))
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, a, bp, orig_k, insert);
+
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+}
+
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+ enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user;
+ s64 sectors = level ? btree_sectors(c) : k.k->size;
+ u32 bucket_offset;
+
+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+ *bp = (struct bch_backpointer) {
+ .btree_id = btree_id,
+ .level = level,
+ .data_type = data_type,
+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+ p.crc.offset,
+ .bucket_len = ptr_disk_sectors(sectors, p),
+ .pos = k.k->p,
+ };
+}
+
+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+ u64 *, struct bch_backpointer *);
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
+ struct bpos, u64, struct bch_backpointer);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
+ struct bpos, u64, struct bch_backpointer);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 91f635faccb0..6d048e5d8843 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -315,7 +315,10 @@ do { \
"done in memory") \
BCH_DEBUG_PARAM(verify_all_btree_replicas, \
"When reading btree nodes, read all replicas and " \
- "compare them")
+ "compare them") \
+ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
+ "Don't use the write buffer for backpointers, enabling "\
+ "extra runtime checks")
/* Parameters that should only be compiled in debug mode: */
#define BCH_DEBUG_PARAMS_DEBUG() \
@@ -435,6 +438,7 @@ enum gc_phase {
GC_PHASE_BTREE_lru,
GC_PHASE_BTREE_freespace,
GC_PHASE_BTREE_need_discard,
+ GC_PHASE_BTREE_backpointers,
GC_PHASE_PENDING_DELETE,
};
@@ -552,6 +556,7 @@ enum {
BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */
BCH_FS_CHECK_ALLOC_DONE,
BCH_FS_CHECK_LRUS_DONE,
+ BCH_FS_CHECK_BACKPOINTERS_DONE,
BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 8e070402e73f..66c885186160 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -369,7 +369,8 @@ static inline void bkey_init(struct bkey *k)
x(alloc_v3, 24) \
x(set, 25) \
x(lru, 26) \
- x(alloc_v4, 27)
+ x(alloc_v4, 27) \
+ x(backpointer, 28)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -890,6 +891,12 @@ struct bch_alloc {
x(stripe, 32) \
x(stripe_redundancy, 8)
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
struct bch_alloc_v2 {
struct bch_val v;
__u8 nr_fields;
@@ -918,6 +925,9 @@ struct bch_alloc_v3 {
__u8 data[];
} __packed __aligned(8);
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
struct bch_alloc_v4 {
struct bch_val v;
__u64 journal_seq;
@@ -931,25 +941,27 @@ struct bch_alloc_v4 {
__u64 io_time[2];
__u32 stripe;
__u32 nr_external_backpointers;
- struct bpos backpointers[0];
} __packed __aligned(8);
#define BCH_ALLOC_V4_U64s_V0 6
#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64))
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
+
+struct bch_backpointer {
+ struct bch_val v;
+ __u8 btree_id;
+ __u8 level;
+ __u8 data_type;
+ __u64 bucket_offset:40;
+ __u32 bucket_len;
+ struct bpos pos;
+} __packed __aligned(8);
/* Quotas: */
@@ -1486,7 +1498,8 @@ struct bch_sb_field_journal_seq_blacklist {
x(inode_v2, 18) \
x(freespace, 19) \
x(alloc_v4, 20) \
- x(new_data_types, 21)
+ x(new_data_types, 21) \
+ x(backpointers, 22)
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -2007,19 +2020,20 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
/* Btree: */
#define BCH_BTREE_IDS() \
- x(extents, 0) \
- x(inodes, 1) \
- x(dirents, 2) \
- x(xattrs, 3) \
- x(alloc, 4) \
- x(quotas, 5) \
- x(stripes, 6) \
- x(reflink, 7) \
- x(subvolumes, 8) \
- x(snapshots, 9) \
- x(lru, 10) \
- x(freespace, 11) \
- x(need_discard, 12)
+ x(extents, 0) \
+ x(inodes, 1) \
+ x(dirents, 2) \
+ x(xattrs, 3) \
+ x(alloc, 4) \
+ x(quotas, 5) \
+ x(stripes, 6) \
+ x(reflink, 7) \
+ x(subvolumes, 8) \
+ x(snapshots, 9) \
+ x(lru, 10) \
+ x(freespace, 11) \
+ x(need_discard, 12) \
+ x(backpointers, 13)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 29809da5e9cf..45c8b2c61c5b 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_methods.h"
#include "btree_types.h"
#include "alloc_background.h"
@@ -191,6 +192,9 @@ static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_need_discard] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_set),
+ [BKEY_TYPE_backpointers] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_backpointer),
[BKEY_TYPE_btree] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 86f48f5762dd..b657f8545a3b 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -7,6 +7,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
+#include "backpointers.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
@@ -662,16 +663,6 @@ err:
return ret;
}
-static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
- EBUG_ON(sectors < 0);
-
- return crc_is_compressed(p.crc)
- ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
- p.crc.uncompressed_size)
- : sectors;
-}
-
static int check_bucket_ref(struct bch_fs *c,
struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
@@ -1399,22 +1390,42 @@ need_mark:
/* trans_mark: */
-static int bch2_trans_mark_pointer(struct btree_trans *trans,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type data_type)
+static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ unsigned flags)
{
+ bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
struct btree_iter iter;
struct bkey_i_alloc_v4 *a;
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+ s64 sectors;
int ret;
- a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr));
+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp);
+ sectors = bp.bucket_len;
+ if (!insert)
+ sectors = -sectors;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos);
if (IS_ERR(a))
return PTR_ERR(a);
- ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
+ ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
a->v.gen, &a->v.data_type,
- &a->v.dirty_sectors, &a->v.cached_sectors) ?:
- bch2_trans_update(trans, &iter, &a->k_i, 0);
+ &a->v.dirty_sectors, &a->v.cached_sectors);
+ if (ret)
+ goto err;
+
+ if (!p.ptr.cached) {
+ ret = bch2_bucket_backpointer_mod(trans, a, bp, k, insert);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1497,8 +1508,7 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_OVERWRITE)
disk_sectors = -disk_sectors;
- ret = bch2_trans_mark_pointer(trans, k, p,
- disk_sectors, data_type);
+ ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
if (ret < 0)
return ret;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index e8e3a3b09714..3398c9c3a81b 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -75,6 +75,15 @@ static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
}
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+ const struct bch_extent_ptr *ptr,
+ u32 *bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
+}
+
static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
@@ -90,6 +99,16 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k,
return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
}
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{
+ EBUG_ON(sectors < 0);
+
+ return crc_is_compressed(p.crc)
+ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+ p.crc.uncompressed_size)
+ : sectors;
+}
+
static inline int gen_cmp(u8 a, u8 b)
{
return (s8) (a - b);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 7a6448f48fca..804bc15dce31 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -60,6 +60,7 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
+ x(0, backpointer_to_overwritten_btree_node) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
x(0, fsck) \
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 61890755d335..55356c117737 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "alloc_background.h"
#include "btree_gc.h"
@@ -925,6 +926,7 @@ static bool btree_id_is_alloc(enum btree_id id)
{
switch (id) {
case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
case BTREE_ID_need_discard:
case BTREE_ID_freespace:
return true;
@@ -1091,8 +1093,8 @@ int bch2_fs_recovery(struct bch_fs *c)
}
if (!c->opts.nochanges) {
- if (c->sb.version < bcachefs_metadata_version_new_data_types) {
- bch_info(c, "version prior to new_data_types, upgrade and fsck required");
+ if (c->sb.version < bcachefs_metadata_version_backpointers) {
+ bch_info(c, "version prior to backpointers, upgrade and fsck required");
c->opts.version_upgrade = true;
c->opts.fsck = true;
c->opts.fix_errors = FSCK_OPT_YES;
@@ -1301,6 +1303,28 @@ use_clean:
bch_verbose(c, "done checking lrus");
set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+ bch_info(c, "checking backpointers to alloc keys");
+ err = "error checking backpointers to alloc keys";
+ ret = bch2_check_btree_backpointers(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking backpointers to alloc keys");
+
+ bch_info(c, "checking backpointers to extents");
+ err = "error checking backpointers to extents";
+ ret = bch2_check_backpointers_to_extents(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking backpointers to extents");
+
+ bch_info(c, "checking extents to backpointers");
+ err = "error checking extents to backpointers";
+ ret = bch2_check_extents_to_backpointers(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking extents to backpointers");
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+
bch_info(c, "checking alloc to lru refs");
err = "error checking alloc to lru refs";
ret = bch2_check_alloc_to_lru_refs(c);
@@ -1312,6 +1336,7 @@ use_clean:
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags);
set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
set_bit(BCH_FS_FSCK_DONE, &c->flags);
@@ -1471,6 +1496,9 @@ int bch2_fs_initialize(struct bch_fs *c)
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+ if (c->sb.version < bcachefs_metadata_version_backpointers)
+ c->opts.version_upgrade = true;
+
if (c->opts.version_upgrade) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
@@ -1479,6 +1507,9 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
set_bit(BCH_FS_FSCK_DONE, &c->flags);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index ade8d074e887..c5efaa7d38a8 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1431,6 +1431,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
BTREE_TRIGGER_NORUN, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
BTREE_TRIGGER_NORUN, NULL);
if (ret)