author    Kent Overstreet <kent.overstreet@gmail.com>    2022-04-02 15:40:30 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>    2022-04-02 15:40:30 -0400
commit    3ac04b499779a0ee8873a7014211b40c95eeec49 (patch)
tree      2d19b8f3b9cd01931c8aa3af877c576d3fb5105c
parent    b034dfb24fece43a7677b9a29781495aeb62767f (diff)
Merge with 40a2993bf6 bcachefs: Discard path fixes/improvements
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--  fs/bcachefs/Makefile  3
-rw-r--r--  fs/bcachefs/alloc_background.c  1326
-rw-r--r--  fs/bcachefs/alloc_background.h  154
-rw-r--r--  fs/bcachefs/alloc_foreground.c  439
-rw-r--r--  fs/bcachefs/alloc_foreground.h  16
-rw-r--r--  fs/bcachefs/alloc_types.h  34
-rw-r--r--  fs/bcachefs/bcachefs.h  73
-rw-r--r--  fs/bcachefs/bcachefs_format.h  167
-rw-r--r--  fs/bcachefs/bkey.c  9
-rw-r--r--  fs/bcachefs/bkey_methods.c  47
-rw-r--r--  fs/bcachefs/bkey_methods.h  29
-rw-r--r--  fs/bcachefs/bset.c  196
-rw-r--r--  fs/bcachefs/bset.h  1
-rw-r--r--  fs/bcachefs/btree_cache.c  230
-rw-r--r--  fs/bcachefs/btree_cache.h  4
-rw-r--r--  fs/bcachefs/btree_gc.c  1014
-rw-r--r--  fs/bcachefs/btree_io.c  234
-rw-r--r--  fs/bcachefs/btree_io.h  58
-rw-r--r--  fs/bcachefs/btree_iter.c  906
-rw-r--r--  fs/bcachefs/btree_iter.h  73
-rw-r--r--  fs/bcachefs/btree_key_cache.c  90
-rw-r--r--  fs/bcachefs/btree_key_cache.h  11
-rw-r--r--  fs/bcachefs/btree_locking.h  50
-rw-r--r--  fs/bcachefs/btree_types.h  133
-rw-r--r--  fs/bcachefs/btree_update.h  32
-rw-r--r--  fs/bcachefs/btree_update_interior.c  351
-rw-r--r--  fs/bcachefs/btree_update_interior.h  10
-rw-r--r--  fs/bcachefs/btree_update_leaf.c  743
-rw-r--r--  fs/bcachefs/buckets.c  1026
-rw-r--r--  fs/bcachefs/buckets.h  187
-rw-r--r--  fs/bcachefs/buckets_types.h  45
-rw-r--r--  fs/bcachefs/buckets_waiting_for_journal.c  167
-rw-r--r--  fs/bcachefs/buckets_waiting_for_journal.h  15
-rw-r--r--  fs/bcachefs/buckets_waiting_for_journal_types.h  23
-rw-r--r--  fs/bcachefs/chardev.c  5
-rw-r--r--  fs/bcachefs/checksum.c  72
-rw-r--r--  fs/bcachefs/checksum.h  6
-rw-r--r--  fs/bcachefs/compress.c  22
-rw-r--r--  fs/bcachefs/darray.h  76
-rw-r--r--  fs/bcachefs/debug.c  218
-rw-r--r--  fs/bcachefs/dirent.c  23
-rw-r--r--  fs/bcachefs/disk_groups.c  142
-rw-r--r--  fs/bcachefs/disk_groups.h  5
-rw-r--r--  fs/bcachefs/ec.c  90
-rw-r--r--  fs/bcachefs/ec.h  2
-rw-r--r--  fs/bcachefs/error.c  4
-rw-r--r--  fs/bcachefs/error.h  28
-rw-r--r--  fs/bcachefs/extent_update.c  13
-rw-r--r--  fs/bcachefs/extents.c  22
-rw-r--r--  fs/bcachefs/extents.h  8
-rw-r--r--  fs/bcachefs/eytzinger.h  48
-rw-r--r--  fs/bcachefs/fs-io.c  39
-rw-r--r--  fs/bcachefs/fs.c  27
-rw-r--r--  fs/bcachefs/fs.h  4
-rw-r--r--  fs/bcachefs/fsck.c  341
-rw-r--r--  fs/bcachefs/inode.c  72
-rw-r--r--  fs/bcachefs/inode.h  6
-rw-r--r--  fs/bcachefs/io.c  103
-rw-r--r--  fs/bcachefs/io.h  4
-rw-r--r--  fs/bcachefs/journal.c  723
-rw-r--r--  fs/bcachefs/journal.h  65
-rw-r--r--  fs/bcachefs/journal_io.c  374
-rw-r--r--  fs/bcachefs/journal_io.h  16
-rw-r--r--  fs/bcachefs/journal_reclaim.c  101
-rw-r--r--  fs/bcachefs/journal_sb.c  222
-rw-r--r--  fs/bcachefs/journal_sb.h  24
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c  159
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.h  2
-rw-r--r--  fs/bcachefs/journal_types.h  56
-rw-r--r--  fs/bcachefs/lru.c  203
-rw-r--r--  fs/bcachefs/lru.h  17
-rw-r--r--  fs/bcachefs/move.c  71
-rw-r--r--  fs/bcachefs/movinggc.c  218
-rw-r--r--  fs/bcachefs/opts.c  110
-rw-r--r--  fs/bcachefs/opts.h  81
-rw-r--r--  fs/bcachefs/quota.c  59
-rw-r--r--  fs/bcachefs/rebalance.c  42
-rw-r--r--  fs/bcachefs/recovery.c  554
-rw-r--r--  fs/bcachefs/recovery.h  16
-rw-r--r--  fs/bcachefs/reflink.c  36
-rw-r--r--  fs/bcachefs/reflink.h  12
-rw-r--r--  fs/bcachefs/replicas.c  235
-rw-r--r--  fs/bcachefs/replicas.h  1
-rw-r--r--  fs/bcachefs/str_hash.h  21
-rw-r--r--  fs/bcachefs/subvolume.c  56
-rw-r--r--  fs/bcachefs/subvolume.h  42
-rw-r--r--  fs/bcachefs/subvolume_types.h  8
-rw-r--r--  fs/bcachefs/super-io.c  859
-rw-r--r--  fs/bcachefs/super-io.h  21
-rw-r--r--  fs/bcachefs/super.c  319
-rw-r--r--  fs/bcachefs/super.h  7
-rw-r--r--  fs/bcachefs/super_types.h  1
-rw-r--r--  fs/bcachefs/sysfs.c  398
-rw-r--r--  fs/bcachefs/tests.c  123
-rw-r--r--  fs/bcachefs/util.c  160
-rw-r--r--  fs/bcachefs/util.h  172
-rw-r--r--  fs/bcachefs/vstructs.h  2
-rw-r--r--  fs/bcachefs/xattr.c  46
98 files changed, 9042 insertions, 5866 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 71cda24e6d08..7ddae26116a0 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -16,6 +16,7 @@ bcachefs-y := \
btree_update_interior.o \
btree_update_leaf.o \
buckets.o \
+ buckets_waiting_for_journal.o \
chardev.o \
checksum.o \
clock.o \
@@ -37,8 +38,10 @@ bcachefs-y := \
journal.o \
journal_io.o \
journal_reclaim.o \
+ journal_sb.o \
journal_seq_blacklist.o \
keylist.o \
+ lru.o \
migrate.o \
move.o \
movinggc.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 2a36af5e0220..e8a34eccac25 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -9,10 +9,12 @@
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
+#include "lru.h"
#include "recovery.h"
#include "varint.h"
@@ -25,12 +27,7 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-const char * const bch2_allocator_states[] = {
-#define x(n) #n,
- ALLOC_THREAD_STATES()
-#undef x
- NULL
-};
+/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
@@ -38,16 +35,28 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#undef x
};
-struct bkey_alloc_buf {
- struct bkey_i k;
- struct bch_alloc_v3 v;
+const char * const bch2_bucket_states[] = {
+ "free",
+ "need gc gens",
+ "need discard",
+ "cached",
+ "dirty",
+ NULL
+};
-#define x(_name, _bits) + _bits / 8
- u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
+struct bkey_alloc_unpacked {
+ u64 journal_seq;
+ u64 bucket;
+ u8 dev;
+ u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
+ bool need_discard:1;
+ bool need_inc_gen:1;
+#define x(_name, _bits) u##_bits _name;
+ BCH_ALLOC_FIELDS_V2()
#undef x
-} __attribute__((packed, aligned(8)));
-
-/* Persistent alloc info: */
+};
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
const void **p, unsigned field)
@@ -169,6 +178,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
out->gen = a.v->gen;
out->oldest_gen = a.v->oldest_gen;
out->data_type = a.v->data_type;
+ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
out->journal_seq = le64_to_cpu(a.v->journal_seq);
#define x(_name, _bits) \
@@ -190,47 +201,7 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
return 0;
}
-static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
- const struct bkey_alloc_unpacked src)
-{
- struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
- u8 *out = a->v.data;
- u8 *end = (void *) &dst[1];
- u8 *last_nonzero_field = out;
- unsigned bytes;
-
- a->k.p = POS(src.dev, src.bucket);
- a->v.gen = src.gen;
- a->v.oldest_gen = src.oldest_gen;
- a->v.data_type = src.data_type;
- a->v.journal_seq = cpu_to_le64(src.journal_seq);
-
-#define x(_name, _bits) \
- nr_fields++; \
- \
- if (src._name) { \
- out += bch2_varint_encode_fast(out, src._name); \
- \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- } else { \
- *out++ = 0; \
- }
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- BUG_ON(out > end);
-
- out = last_nonzero_field;
- a->v.nr_fields = last_nonzero_fieldnr;
-
- bytes = (u8 *) out - (u8 *) &a->v;
- set_bkey_val_bytes(&a->k, bytes);
- memset_u64s_tail(&a->v, 0, bytes);
-}
-
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
struct bkey_alloc_unpacked ret = {
.dev = k.k->p.inode,
@@ -253,24 +224,71 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
return ret;
}
-static void bch2_alloc_pack(struct bch_fs *c,
- struct bkey_alloc_buf *dst,
- const struct bkey_alloc_unpacked src)
+void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
- bch2_alloc_pack_v3(dst, src);
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ *out = *bkey_s_c_to_alloc_v4(k).v;
+ } else {
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ *out = (struct bch_alloc_v4) {
+ .journal_seq = u.journal_seq,
+ .flags = u.need_discard,
+ .gen = u.gen,
+ .oldest_gen = u.oldest_gen,
+ .data_type = u.data_type,
+ .stripe_redundancy = u.stripe_redundancy,
+ .dirty_sectors = u.dirty_sectors,
+ .cached_sectors = u.cached_sectors,
+ .io_time[READ] = u.read_time,
+ .io_time[WRITE] = u.write_time,
+ .stripe = u.stripe,
+ };
+ }
}
-int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_alloc_unpacked *u, unsigned trigger_flags)
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
- struct bkey_alloc_buf *a;
+ struct bkey_i_alloc_v4 *ret;
- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
- if (IS_ERR(a))
- return PTR_ERR(a);
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if (!IS_ERR(ret))
+ bkey_reassemble(&ret->k_i, k);
+ } else {
+ ret = bch2_trans_kmalloc(trans, sizeof(*ret));
+ if (!IS_ERR(ret)) {
+ bkey_alloc_v4_init(&ret->k_i);
+ ret->k.p = k.k->p;
+ bch2_alloc_to_v4(k, &ret->v);
+ }
+ }
+ return ret;
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
+{
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ int ret;
+
+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+ }
- bch2_alloc_pack(trans->c, a, *u);
- return bch2_trans_update(trans, iter, &a->k, trigger_flags);
+ a = bch2_alloc_to_v4_mut(trans, k);
+ if (IS_ERR(a))
+ bch2_trans_iter_exit(trans, iter);
+ return a;
}
static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
@@ -316,629 +334,835 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_alloc_unpacked u;
+ struct bch_dev *ca;
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
return "invalid device";
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ if (k.k->p.offset < ca->mi.first_bucket ||
+ k.k->p.offset >= ca->mi.nbuckets)
+ return "invalid bucket";
+
if (bch2_alloc_unpack_v3(&u, k))
return "unpack error";
return NULL;
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
-
- pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
- u.gen, u.oldest_gen, bch2_data_types[u.data_type],
- u.journal_seq);
-#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name);
- BCH_ALLOC_FIELDS_V2()
-#undef x
-}
-
-static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
+const char *bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bch_fs *c = trans->c;
struct bch_dev *ca;
- struct bucket *g;
- struct bkey_alloc_unpacked u;
- if (!bkey_is_alloc(k.k))
- return 0;
+ if (k.k->p.inode >= c->sb.nr_devices ||
+ !c->devs[k.k->p.inode])
+ return "invalid device";
ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = bucket(ca, k.k->p.offset);
- u = bch2_alloc_unpack(k);
-
- *bucket_gen(ca, k.k->p.offset) = u.gen;
- g->_mark.gen = u.gen;
- g->_mark.data_type = u.data_type;
- g->_mark.dirty_sectors = u.dirty_sectors;
- g->_mark.cached_sectors = u.cached_sectors;
- g->_mark.stripe = u.stripe != 0;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
- g->gen_valid = 1;
- return 0;
+ if (k.k->p.offset < ca->mi.first_bucket ||
+ k.k->p.offset >= ca->mi.nbuckets)
+ return "invalid bucket";
+
+ return NULL;
+}
+
+void bch2_alloc_v4_swab(struct bkey_s k)
+{
+ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+
+ a->journal_seq = swab64(a->journal_seq);
+ a->flags = swab32(a->flags);
+ a->dirty_sectors = swab32(a->dirty_sectors);
+ a->cached_sectors = swab32(a->cached_sectors);
+ a->io_time[0] = swab64(a->io_time[0]);
+ a->io_time[1] = swab64(a->io_time[1]);
+ a->stripe = swab32(a->stripe);
+ a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 a;
+
+ bch2_alloc_to_v4(k, &a);
+
+ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu",
+ a.gen, a.oldest_gen, bch2_data_types[a.data_type],
+ a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a));
+ pr_buf(out, " dirty_sectors %u", a.dirty_sectors);
+ pr_buf(out, " cached_sectors %u", a.cached_sectors);
+ pr_buf(out, " stripe %u", a.stripe);
+ pr_buf(out, " stripe_redundancy %u", a.stripe_redundancy);
+ pr_buf(out, " read_time %llu", a.io_time[READ]);
+ pr_buf(out, " write_time %llu", a.io_time[WRITE]);
}
int bch2_alloc_read(struct bch_fs *c)
{
struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a;
+ struct bch_dev *ca;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- down_read(&c->gc_lock);
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
- up_read(&c->gc_lock);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ bch2_alloc_to_v4(k, &a);
+
+ *bucket_gen(ca, k.k->p.offset) = a.gen;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
bch2_trans_exit(&trans);
- if (ret) {
+
+ if (ret)
bch_err(c, "error reading alloc info: %i", ret);
- return ret;
- }
- return 0;
+ return ret;
}
-static int bch2_alloc_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
+/* Free space/discard btree: */
+
+static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ struct bch_alloc_v4 a,
+ bool set)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- struct bkey_alloc_unpacked old_u, new_u;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+ struct btree_iter iter;
+ struct bkey_s_c old;
+ struct bkey_i *k;
+ enum bucket_state state = bucket_state(a);
+ enum btree_id btree;
+ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ struct printbuf buf = PRINTBUF;
int ret;
-retry:
- bch2_trans_begin(trans);
- ret = bch2_btree_key_cache_flush(trans,
- BTREE_ID_alloc, iter->pos);
- if (ret)
- goto err;
+ if (state != BUCKET_free &&
+ state != BUCKET_need_discard)
+ return 0;
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
+ k = bch2_trans_kmalloc(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
- old_u = bch2_alloc_unpack(k);
- new_u = alloc_mem_to_key(c, iter);
+ bkey_init(&k->k);
+ k->k.type = new_type;
- if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+ switch (state) {
+ case BUCKET_free:
+ btree = BTREE_ID_freespace;
+ k->k.p = alloc_freespace_pos(alloc_k.k->p, a);
+ bch2_key_resize(&k->k, 1);
+ break;
+ case BUCKET_need_discard:
+ btree = BTREE_ID_need_discard;
+ k->k.p = alloc_k.k->p;
+ break;
+ default:
return 0;
+ }
- ret = bch2_alloc_write(trans, iter, &new_u,
- BTREE_TRIGGER_NORUN) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags);
+ bch2_trans_iter_init(trans, &iter, btree,
+ bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ old = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(old);
+ if (ret)
+ goto err;
+
+ if (ca->mi.freespace_initialized &&
+ bch2_fs_inconsistent_on(old.k->type != old_type, c,
+ "incorrect key when %s %s btree (got %s should be %s)\n"
+ " for %s",
+ set ? "setting" : "clearing",
+ bch2_btree_ids[btree],
+ bch2_bkey_types[old.k->type],
+ bch2_bkey_types[old_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, k, 0);
err:
- if (ret == -EINTR)
- goto retry;
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
-int bch2_alloc_write_all(struct bch_fs *c, unsigned flags)
+int bch2_trans_mark_alloc(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bch_dev *ca;
- unsigned i;
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 old_a, *new_a;
+ u64 old_lru, new_lru;
int ret = 0;
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ /*
+ * Deletion only happens in the device removal path, with
+ * BTREE_TRIGGER_NORUN:
+ */
+ BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
- for_each_member_device(ca, c, i) {
- bch2_btree_iter_set_pos(&iter,
- POS(ca->dev_idx, ca->mi.first_bucket));
+ bch2_alloc_to_v4(old, &old_a);
+ new_a = &bkey_i_to_alloc_v4(new)->v;
- while (iter.pos.offset < ca->mi.nbuckets) {
- ret = bch2_alloc_write_key(&trans, &iter, flags);
- if (ret) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
- bch2_btree_iter_advance(&iter);
- }
+ if (new_a->dirty_sectors > old_a.dirty_sectors ||
+ new_a->cached_sectors > old_a.cached_sectors) {
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
-err:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
- return ret;
-}
-/* Bucket IO clocks: */
-
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_alloc_unpacked u;
- u64 *time, now;
- int ret = 0;
+ if (old_a.data_type && !new_a->data_type &&
+ old_a.gen == new_a->gen &&
+ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ }
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter);
- if (ret)
- goto out;
+ if (bucket_state(old_a) != bucket_state(*new_a) ||
+ (bucket_state(*new_a) == BUCKET_free &&
+ alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true);
+ if (ret)
+ return ret;
+ }
- u = alloc_mem_to_key(c, &iter);
+ old_lru = alloc_lru_idx(old_a);
+ new_lru = alloc_lru_idx(*new_a);
- time = rw == READ ? &u.read_time : &u.write_time;
- now = atomic64_read(&c->io_clock[rw].now);
- if (*time == now)
- goto out;
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
+ old_lru, &new_lru);
+ if (ret)
+ return ret;
- *time = now;
+ if (new_lru && new_a->io_time[READ] != new_lru)
+ new_a->io_time[READ] = new_lru;
+ }
- ret = bch2_alloc_write(trans, &iter, &u, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
+ return 0;
}
-/* Background allocator thread: */
+static int bch2_check_alloc_key(struct btree_trans *trans,
+ struct btree_iter *alloc_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter discard_iter, freespace_iter, lru_iter;
+ struct bch_alloc_v4 a;
+ unsigned discard_key_type, freespace_key_type;
+ struct bkey_s_c alloc_k, k;
+ struct printbuf buf = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ int ret;
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
+ alloc_k = bch2_btree_iter_peek(alloc_iter);
+ if (!alloc_k.k)
+ return 0;
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
- struct bucket_mark m)
-{
- u8 gc_gen;
+ ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
- if (!is_available_bucket(m))
- return false;
+ bch2_alloc_to_v4(alloc_k, &a);
+ discard_key_type = bucket_state(a) == BUCKET_need_discard
+ ? KEY_TYPE_set : 0;
+ freespace_key_type = bucket_state(a) == BUCKET_free
+ ? KEY_TYPE_set : 0;
- if (m.owned_by_allocator)
- return false;
+ bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard,
+ alloc_k.k->p, 0);
+ bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace,
+ alloc_freespace_pos(alloc_k.k->p, a), 0);
+ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
+ POS(alloc_k.k->p.inode, a.io_time[READ]), 0);
- if (ca->buckets_nouse &&
- test_bit(b, ca->buckets_nouse))
- return false;
+ k = bch2_btree_iter_peek_slot(&discard_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- if (ca->new_fs_bucket_idx) {
- /*
- * Device or filesystem is still being initialized, and we
- * haven't fully marked superblocks & journal:
- */
- if (is_superblock_bucket(ca, b))
- return false;
+ if (fsck_err_on(k.k->type != discard_key_type, c,
+ "incorrect key in need_discard btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[discard_key_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = discard_key_type;
+ update->k.p = discard_iter.pos;
- if (b < ca->new_fs_bucket_idx)
- return false;
+ ret = bch2_trans_update(trans, &discard_iter, update, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+ if (ret)
+ goto err;
}
- gc_gen = bucket_gc_gen(bucket(ca, b));
+ k = bch2_btree_iter_peek_slot(&freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
- ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
+ if (fsck_err_on(k.k->type != freespace_key_type, c,
+ "incorrect key in freespace btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[freespace_key_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
- return gc_gen < BUCKET_GC_GEN_MAX;
-}
+ bkey_init(&update->k);
+ update->k.type = freespace_key_type;
+ update->k.p = freespace_iter.pos;
+ bch2_key_resize(&update->k, 1);
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
+ ret = bch2_trans_update(trans, &freespace_iter, update, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+ if (ret)
+ goto err;
+ }
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
- u64 now, u64 last_seq_ondisk)
-{
- unsigned used = bucket_sectors_used(m);
+ if (bucket_state(a) == BUCKET_cached) {
+ k = bch2_btree_iter_peek_slot(&lru_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(!a.io_time[READ], c,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
+ fsck_err_on(k.k->type != KEY_TYPE_lru ||
+ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c,
+ "incorrect/missing lru entry\n"
+ " %s\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+ u64 read_time = a.io_time[READ];
+
+ if (!a.io_time[READ])
+ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+
+ ret = bch2_lru_change(trans,
+ alloc_k.k->p.inode,
+ alloc_k.k->p.offset,
+ 0, &a.io_time[READ]);
+ if (ret)
+ goto err;
- if (used) {
- /*
- * Prefer to keep buckets that have been read more recently, and
- * buckets that have more data in them:
- */
- u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
- u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
+ if (a.io_time[READ] != read_time) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
+
+ a_mut->v.io_time[READ] = a.io_time[READ];
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+ }
- return -last_read_scaled;
- } else {
- /*
- * Prefer to use buckets with smaller gc_gen so that we don't
- * have to walk the btree and recalculate oldest_gen - but shift
- * off the low bits so that buckets will still have equal sort
- * keys when there's only a small difference, so that we can
- * keep sequential buckets together:
- */
- return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
- (bucket_gc_gen(g) >> 4);
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
+ if (ret)
+ goto err;
+ }
}
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ bch2_trans_iter_exit(trans, &freespace_iter);
+ bch2_trans_iter_exit(trans, &discard_iter);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf);
+ return ret;
}
-static inline int bucket_alloc_cmp(alloc_heap *h,
- struct alloc_heap_entry l,
- struct alloc_heap_entry r)
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
- return cmp_int(l.key, r.key) ?:
- cmp_int(r.nr, l.nr) ?:
- cmp_int(l.bucket, r.bucket);
-}
+ struct bch_dev *ca;
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
- const struct alloc_heap_entry *l = _l, *r = _r;
+ if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+ return false;
- return cmp_int(l->bucket, r->bucket);
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ return pos.offset >= ca->mi.first_bucket &&
+ pos.offset < ca->mi.nbuckets;
}
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
+static int bch2_check_freespace_key(struct btree_trans *trans,
+ struct btree_iter *freespace_iter,
+ bool initial)
{
- struct bucket_array *buckets;
- struct alloc_heap_entry e = { 0 };
- u64 now, last_seq_ondisk;
- size_t b, i, nr = 0;
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter;
+ struct bkey_s_c k, freespace_k;
+ struct bch_alloc_v4 a;
+ u64 genbits;
+ struct bpos pos;
+ struct bkey_i *update;
+ struct printbuf buf = PRINTBUF;
+ int ret;
- down_read(&ca->bucket_lock);
+ freespace_k = bch2_btree_iter_peek(freespace_iter);
+ if (!freespace_k.k)
+ return 1;
- buckets = bucket_array(ca);
- ca->alloc_heap.used = 0;
- now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.flushed_seq_ondisk;
+ ret = bkey_err(freespace_k);
+ if (ret)
+ return ret;
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket *g = &buckets->b[b];
- struct bucket_mark m = READ_ONCE(g->mark);
- unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
+ pos = freespace_iter->pos;
+ pos.offset &= ~(~0ULL << 56);
+ genbits = freespace_iter->pos.offset & (~0ULL << 56);
- cond_resched();
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
- if (!bch2_can_invalidate_bucket(ca, b, m))
- continue;
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+ "%llu:%llu set in freespace btree but device or bucket does not exist",
+ pos.inode, pos.offset))
+ goto delete;
- if (e.nr && e.bucket + e.nr == b && e.key == key) {
- e.nr++;
- } else {
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .nr = 1,
- .key = key,
- };
- }
- }
+ k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
+ bch2_alloc_to_v4(k, &a);
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
+ if (fsck_err_on(bucket_state(a) != BUCKET_free ||
+ genbits != alloc_freespace_genbits(a), c,
+ "%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ bucket_state(a) == BUCKET_free,
+ genbits >> 56, alloc_freespace_genbits(a) >> 56))
+ goto delete;
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+delete:
+ update = bch2_trans_kmalloc(trans, sizeof(*update));
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
- while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
- nr -= ca->alloc_heap.data[0].nr;
- heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
- }
+ bkey_init(&update->k);
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k, 1);
- up_read(&ca->bucket_lock);
+ ret = bch2_trans_update(trans, freespace_iter, update, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+ goto out;
}
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
+int bch2_check_alloc_info(struct bch_fs *c, bool initial)
{
- size_t i, nr = 0;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0, last_dev = -1;
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
+ bch2_trans_init(&trans, c, 0, 0);
- find_reclaimable_buckets_lru(c, ca);
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->p.inode != last_dev) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ if (!ca->mi.freespace_initialized) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+
+ last_dev = k.k->p.inode;
+ }
+
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_check_alloc_key(&trans, &iter));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
+ if (ret)
+ goto err;
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_check_freespace_key(&trans, &iter, initial));
+ if (ret)
+ break;
- return nr;
+ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
}
-static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- struct bkey_alloc_unpacked *u)
+static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos,
+ struct bch_dev *ca, bool *discard_done)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ struct printbuf buf = PRINTBUF;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
- ret = bch2_btree_iter_traverse(&iter);
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
if (ret)
- goto err;
+ goto out;
- *u = alloc_mem_to_key(c, &iter);
+ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
+ }
- u->gen++;
- u->data_type = 0;
- u->dirty_sectors = 0;
- u->cached_sectors = 0;
- u->read_time = atomic64_read(&c->io_clock[READ].now);
- u->write_time = atomic64_read(&c->io_clock[WRITE].now);
+ BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk);
- ret = bch2_alloc_write(trans, &iter, u,
- BTREE_TRIGGER_BUCKET_INVALIDATE);
-err:
+ if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c,
+ "%s\n incorrectly set in need_discard btree",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!*discard_done && ca->mi.discard && !c->opts.nochanges) {
+ /*
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
+ */
+ bch2_trans_unlock(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL, 0);
+ *discard_done = true;
+
+ ret = bch2_trans_relock(trans) ? 0 : -EINTR;
+ if (ret)
+ goto out;
+ }
+
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+write:
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+out:
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, unsigned flags)
+static void bch2_do_discards_work(struct work_struct *work)
{
- struct bkey_alloc_unpacked u;
- size_t b;
- int ret = 0;
-
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
- return 1;
+ struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct bch_dev *ca = NULL;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ int ret;
- BUG_ON(!ca->alloc_heap.used ||
- !ca->alloc_heap.data[0].nr);
- b = ca->alloc_heap.data[0].bucket;
+ bch2_trans_init(&trans, c, 0, 0);
- /* first, put on free_inc and mark as owned by allocator: */
- percpu_down_read(&c->mark_lock);
+ for_each_btree_key(&trans, iter, BTREE_ID_need_discard,
+ POS_MIN, 0, k, ret) {
+ bool discard_done = false;
- bch2_mark_alloc_bucket(c, ca, b, true);
+ if (ca && k.k->p.inode != ca->dev_idx) {
+ percpu_ref_put(&ca->io_ref);
+ ca = NULL;
+ }
- spin_lock(&c->freelist_lock);
- verify_not_on_freelist(c, ca, b);
- BUG_ON(!fifo_push(&ca->free_inc, b));
- spin_unlock(&c->freelist_lock);
+ if (!ca) {
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ ca = NULL;
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+ }
- percpu_up_read(&c->mark_lock);
+ seen++;
- ret = bch2_trans_do(c, NULL, journal_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- flags,
- bucket_invalidate_btree(&trans, ca, b, &u));
+ if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) {
+ open++;
+ continue;
+ }
- if (!ret) {
- /* remove from alloc_heap: */
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ k.k->p.inode, k.k->p.offset)) {
+ need_journal_commit++;
+ continue;
+ }
- top->bucket++;
- top->nr--;
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_NOFAIL,
+ bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done));
+ if (ret)
+ break;
- if (!top->nr)
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+ discarded++;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- /*
- * Make sure we flush the last journal entry that updated this
- * bucket (i.e. deleting the last reference) before writing to
- * this bucket again:
- */
- *journal_seq = max(*journal_seq, u.journal_seq);
- } else {
- size_t b2;
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
- /* remove from free_inc: */
- percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
+ bch2_trans_exit(&trans);
- bch2_mark_alloc_bucket(c, ca, b, false);
+ if (need_journal_commit * 2 > seen)
+ bch2_journal_flush_async(&c->journal, NULL);
- BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
- BUG_ON(b != b2);
+ percpu_ref_put(&c->writes);
- spin_unlock(&c->freelist_lock);
- percpu_up_read(&c->mark_lock);
- }
+ trace_do_discards(c, seen, open, need_journal_commit, discarded, ret);
+}
- return ret < 0 ? ret : 0;
+void bch2_do_discards(struct bch_fs *c)
+{
+ if (percpu_ref_tryget(&c->writes) &&
+ !queue_work(system_long_wq, &c->discard_work))
+ percpu_ref_put(&c->writes);
}
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
+static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
{
- u64 journal_seq = 0;
- int ret = 0;
+ struct bch_fs *c = trans->c;
+ struct btree_iter lru_iter, alloc_iter = { NULL };
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ u64 bucket, idx;
+ int ret;
- /* Only use nowait if we've already invalidated at least one bucket: */
- while (!ret &&
- !fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used) {
- if (kthread_should_stop()) {
- ret = 1;
- break;
- }
+ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
+ POS(ca->dev_idx, 0), 0);
+ k = bch2_btree_iter_peek(&lru_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
- ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
- (!fifo_empty(&ca->free_inc)
- ? BTREE_INSERT_NOWAIT : 0));
- /*
- * We only want to batch up invalidates when they're going to
- * require flushing the journal:
- */
- if (!journal_seq)
- break;
- }
+ if (!k.k || k.k->p.inode != ca->dev_idx)
+ goto out;
- /* If we used NOWAIT, don't return the error: */
- if (!fifo_empty(&ca->free_inc))
- ret = 0;
- if (ret < 0)
- bch_err(ca, "error invalidating buckets: %i", ret);
+ if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c,
+ "non lru key in lru btree"))
+ goto out;
+
+ idx = k.k->p.offset;
+ bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
+
+ a = bch2_trans_start_alloc_update(trans, &alloc_iter,
+ POS(ca->dev_idx, bucket));
+ ret = PTR_ERR_OR_ZERO(a);
if (ret)
- return ret;
+ goto out;
- if (journal_seq)
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret) {
- bch_err(ca, "journal error: %i", ret);
- return ret;
- }
+ if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c,
+ "invalidating bucket with wrong lru idx (got %llu should be %llu",
+ idx, alloc_lru_idx(a->v)))
+ goto out;
- return 0;
-}
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ a->v.gen++;
+ a->v.data_type = 0;
+ a->v.dirty_sectors = 0;
+ a->v.cached_sectors = 0;
+ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
- if (ca->allocator_state != new_state) {
- ca->allocator_state = new_state;
- closure_wake_up(&ca->fs->freelist_wait);
- }
+ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
+ BTREE_TRIGGER_BUCKET_INVALIDATE);
+out:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ bch2_trans_iter_exit(trans, &lru_iter);
+ return ret;
}
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+static void bch2_do_invalidates_work(struct work_struct *work)
{
+ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca;
+ struct btree_trans trans;
unsigned i;
int ret = 0;
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (i == RESERVE_MOVINGGC &&
- !test_bit(BCH_FS_STARTED, &c->flags))
- continue;
+ bch2_trans_init(&trans, c, 0, 0);
- if (fifo_push(&ca->free[i], b)) {
- fifo_pop(&ca->free_inc, b);
- ret = 1;
- break;
- }
- }
- spin_unlock(&c->freelist_lock);
+ for_each_member_device(ca, c, i)
+ while (!ret && should_invalidate_buckets(ca))
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_NOFAIL,
+ invalidate_one_bucket(&trans, ca));
- ca->allocator_state = ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked_full;
- closure_wake_up(&c->freelist_wait);
- return ret;
+ bch2_trans_exit(&trans);
+ percpu_ref_put(&c->writes);
}
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+void bch2_do_invalidates(struct bch_fs *c)
{
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
- ca->mi.bucket_size, GFP_NOFS, 0);
+ if (percpu_ref_tryget(&c->writes))
+ queue_work(system_long_wq, &c->invalidate_work);
}
-static bool allocator_thread_running(struct bch_dev *ca)
+static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
- ? ALLOCATOR_running
- : ALLOCATOR_stopped;
- alloc_thread_set_state(ca, state);
- return state == ALLOCATOR_running;
-}
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a;
+ struct bch_member *m;
+ int ret;
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
- s64 available = dev_buckets_reclaimable(ca) -
- (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
- bool ret = available > 0;
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (iter.pos.offset >= ca->mi.nbuckets)
+ break;
+
+ bch2_alloc_to_v4(k, &a);
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW,
+ bch2_bucket_do_index(&trans, k, a, true));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+
+ if (ret) {
+ bch_err(ca, "error initializing free space: %i", ret);
+ return ret;
+ }
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+ mutex_unlock(&c->sb_lock);
- alloc_thread_set_state(ca, ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked);
return ret;
}
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
+int bch2_fs_freespace_init(struct bch_fs *c)
{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- unsigned long gc_count = c->gc_count;
- size_t nr;
- int ret;
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+ bool doing_init = false;
- set_freezable();
+ /*
+ * We can crash during the device add path, so we need to check this on
+ * every mount:
+ */
- while (1) {
- ret = kthread_wait_freezable(allocator_thread_running(ca));
- if (ret)
- goto stop;
+ for_each_member_device(ca, c, i) {
+ if (ca->mi.freespace_initialized)
+ continue;
- while (!ca->alloc_heap.used) {
- cond_resched();
+ if (!doing_init) {
+ bch_info(c, "initializing freespace");
+ doing_init = true;
+ }
- ret = kthread_wait_freezable(buckets_available(ca, gc_count));
- if (ret)
- goto stop;
+ ret = bch2_dev_freespace_init(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
+ }
+ }
- gc_count = c->gc_count;
- nr = find_reclaimable_buckets(c, ca);
+ if (doing_init) {
+ mutex_lock(&c->sb_lock);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
+ bch_verbose(c, "done initializing freespace");
+ }
- if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
- ca->inc_gen_really_needs_gc) &&
- c->gc_thread) {
- atomic_inc(&c->kick_gc);
- wake_up_process(c->gc_thread);
- }
- }
+ return ret;
+}
- ret = bch2_invalidate_buckets(c, ca);
- if (ret)
- goto stop;
+/* Bucket IO clocks: */
+
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ u64 now;
+ int ret = 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
- while (!fifo_empty(&ca->free_inc)) {
- u64 b = fifo_peek(&ca->free_inc);
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (a->v.io_time[rw] == now)
+ goto out;
- discard_one_bucket(c, ca, b);
+ a->v.io_time[rw] = now;
- ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
- if (ret)
- goto stop;
- }
- }
-stop:
- alloc_thread_set_state(ca, ALLOCATOR_stopped);
- return 0;
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
/* Startup/shutdown (ro/rw): */
@@ -949,7 +1173,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i, j;
+ unsigned i;
lockdep_assert_held(&c->state_lock);
@@ -980,8 +1204,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_NONE; j++)
- dev_reserve += ca->free[j].size;
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
@@ -1037,8 +1262,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
- BUG_ON(ca->alloc_thread);
-
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1112,62 +1335,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
- if (ca->alloc_thread)
- closure_wait_event(&c->freelist_wait,
- ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- p = rcu_dereference_protected(ca->alloc_thread, 1);
- ca->alloc_thread = NULL;
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid bch2_wake_allocator() racing:
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- synchronize_rcu();
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- p = kthread_create(bch2_allocator_thread, ca,
- "bch-alloc/%s", ca->name);
- if (IS_ERR(p)) {
- bch_err(ca->fs, "error creating allocator thread: %li",
- PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- rcu_assign_pointer(ca->alloc_thread, p);
- wake_up_process(p);
- return 0;
-}
-
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
+ INIT_WORK(&c->discard_work, bch2_do_discards_work);
+ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 86b64177b3d0..da1b650e8017 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,90 +8,98 @@
#include "debug.h"
#include "super.h"
-extern const char * const bch2_allocator_states[];
-
-struct bkey_alloc_unpacked {
- u64 journal_seq;
- u64 bucket;
- u8 dev;
- u8 gen;
- u8 oldest_gen;
- u8 data_type;
-#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS_V2()
-#undef x
-};
-
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
-/* returns true if not equal */
-static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
- struct bkey_alloc_unpacked r)
+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
{
- return l.gen != r.gen ||
- l.oldest_gen != r.oldest_gen ||
- l.data_type != r.data_type
-#define x(_name, ...) || l._name != r._name
- BCH_ALLOC_FIELDS_V2()
-#undef x
- ;
+ return a.gen - a.oldest_gen;
}
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
- struct bkey_alloc_unpacked *, unsigned);
+enum bucket_state {
+ BUCKET_free,
+ BUCKET_need_gc_gens,
+ BUCKET_need_discard,
+ BUCKET_cached,
+ BUCKET_dirty,
+};
-int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+extern const char * const bch2_bucket_states[];
+
+static inline enum bucket_state bucket_state(struct bch_alloc_v4 a)
+{
+ if (a.dirty_sectors || a.stripe)
+ return BUCKET_dirty;
+ if (a.cached_sectors)
+ return BUCKET_cached;
+ BUG_ON(a.data_type);
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BUCKET_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BUCKET_need_gc_gens;
+ return BUCKET_free;
+}
+
+static inline u64 alloc_lru_idx(struct bch_alloc_v4 a)
+{
+ return bucket_state(a) == BUCKET_cached ? a.io_time[READ] : 0;
+}
+
+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
+{
+ return ((u64) alloc_gc_gen(a) >> 4) << 56;
+}
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter)
+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
{
- struct bch_dev *ca;
- struct bucket *g;
- struct bkey_alloc_unpacked ret;
-
- percpu_down_read(&c->mark_lock);
- ca = bch_dev_bkey_exists(c, iter->pos.inode);
- g = bucket(ca, iter->pos.offset);
- ret = (struct bkey_alloc_unpacked) {
- .dev = iter->pos.inode,
- .bucket = iter->pos.offset,
- .gen = g->mark.gen,
- .oldest_gen = g->oldest_gen,
- .data_type = g->mark.data_type,
- .dirty_sectors = g->mark.dirty_sectors,
- .cached_sectors = g->mark.cached_sectors,
- .read_time = g->io_time[READ],
- .write_time = g->io_time[WRITE],
- .stripe = g->stripe,
- .stripe_redundancy = g->stripe_redundancy,
- };
- percpu_up_read(&c->mark_lock);
-
- return ret;
+ pos.offset |= alloc_freespace_genbits(a);
+ return pos;
}
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+
+void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
+
+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k);
+void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
}
#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
}
#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v3_invalid, \
.val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+}
+
+#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v4_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .swab = bch2_alloc_v4_swab, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
}
static inline bool bkey_is_alloc(const struct bkey *k)
@@ -103,43 +111,29 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
-static inline void bch2_wake_allocator(struct bch_dev *ca)
+int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
+ struct bkey_i *, unsigned);
+int bch2_check_alloc_info(struct bch_fs *, bool);
+void bch2_do_discards(struct bch_fs *);
+
+static inline bool should_invalidate_buckets(struct bch_dev *ca)
{
- struct task_struct *p;
+ struct bch_dev_usage u = bch2_dev_usage_read(ca);
- rcu_read_lock();
- p = rcu_dereference(ca->alloc_thread);
- if (p)
- wake_up_process(p);
- rcu_read_unlock();
+ return u.d[BCH_DATA_cached].buckets &&
+ u.buckets_unavailable + u.d[BCH_DATA_cached].buckets <
+ ca->mi.nbuckets >> 7;
}
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
- size_t bucket)
-{
- if (bch2_expensive_debug_checks) {
- size_t iter;
- long i;
- unsigned j;
-
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- BUG_ON(i == bucket);
- fifo_for_each_entry(i, &ca->free_inc, iter)
- BUG_ON(i == bucket);
- }
-}
+void bch2_do_invalidates(struct bch_fs *);
+
+int bch2_fs_freespace_init(struct bch_fs *);
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
-int bch2_alloc_write_all(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 0a634125dc90..4dbab45be5ed 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -14,19 +14,31 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
+#include "error.h"
#include "io.h"
+#include "journal.h"
#include <linux/math64.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <trace/events/bcachefs.h>
+const char * const bch2_alloc_reserves[] = {
+#define x(t) #t,
+ BCH_ALLOC_RESERVES()
+#undef x
+ NULL
+};
+
/*
* Open buckets represent a bucket that's currently being allocated from. They
* serve two purposes:
@@ -78,7 +90,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
- bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
ob->valid = false;
ob->data_type = 0;
@@ -151,22 +162,6 @@ static void open_bucket_free_unused(struct bch_fs *c,
}
}
-static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct open_bucket *ob;
- unsigned i;
-
- rcu_read_lock();
- open_bucket_for_each(c, obs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
-
- BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen);
- }
- rcu_read_unlock();
-#endif
-}
-
/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
@@ -184,49 +179,45 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
{
switch (reserve) {
- case RESERVE_BTREE:
- case RESERVE_BTREE_MOVINGGC:
+ case RESERVE_btree:
+ case RESERVE_btree_movinggc:
return 0;
- case RESERVE_MOVINGGC:
+ case RESERVE_movinggc:
return OPEN_BUCKETS_COUNT / 4;
default:
return OPEN_BUCKETS_COUNT / 2;
}
}
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum alloc_reserve reserve,
- bool may_alloc_partial,
- struct closure *cl)
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket,
+ enum alloc_reserve reserve,
+ struct bch_alloc_v4 *a,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
{
struct open_bucket *ob;
- long b = 0;
- spin_lock(&c->freelist_lock);
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ (*skipped_nouse)++;
+ return NULL;
+ }
- if (may_alloc_partial) {
- int i;
-
- for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
- ob = c->open_buckets + ca->open_buckets_partial[i];
-
- if (reserve <= ob->alloc_reserve) {
- array_remove_item(ca->open_buckets_partial,
- ca->open_buckets_partial_nr,
- i);
- ob->on_partial_list = false;
- ob->alloc_reserve = reserve;
- spin_unlock(&c->freelist_lock);
- return ob;
- }
- }
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ (*skipped_open)++;
+ return NULL;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ (*skipped_need_journal_commit)++;
+ return NULL;
}
+ spin_lock(&c->freelist_lock);
+
if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
@@ -235,36 +226,18 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
c->blocked_allocate_open_bucket = local_clock();
spin_unlock(&c->freelist_lock);
- trace_open_bucket_alloc_fail(ca, reserve);
+
+ trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
return ERR_PTR(-OPEN_BUCKETS_EMPTY);
}
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
- goto out;
-
- switch (reserve) {
- case RESERVE_BTREE_MOVINGGC:
- case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
- goto out;
- break;
- default:
- break;
+ /* Recheck under lock: */
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ spin_unlock(&c->freelist_lock);
+ (*skipped_open)++;
+ return NULL;
}
- if (cl)
- closure_wait(&c->freelist_wait, cl);
-
- if (!c->blocked_allocate)
- c->blocked_allocate = local_clock();
-
- spin_unlock(&c->freelist_lock);
-
- trace_bucket_alloc_fail(ca, reserve);
- return ERR_PTR(-FREELIST_EMPTY);
-out:
- verify_not_on_freelist(c, ca, b);
-
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
@@ -273,8 +246,8 @@ out:
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->dev = ca->dev_idx;
- ob->gen = *bucket_gen(ca, b);
- ob->bucket = b;
+ ob->gen = a->gen;
+ ob->bucket = bucket;
spin_unlock(&ob->lock);
ca->nr_open_buckets++;
@@ -296,9 +269,283 @@ out:
spin_unlock(&c->freelist_lock);
- bch2_wake_allocator(ca);
+ trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ enum alloc_reserve reserve, u64 free_entry,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob;
+ struct bch_alloc_v4 a;
+ u64 b = free_entry & ~(~0ULL << 56);
+ unsigned genbits = free_entry >> 56;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c,
+ "non free bucket in freespace btree (state %s)\n"
+ " %s\n"
+ " at %llu (genbits %u)",
+ bch2_bucket_states[bucket_state(a)],
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ free_entry, genbits)) {
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c,
+ "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+ " %s",
+ genbits, alloc_freespace_genbits(a) >> 56,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c,
+ "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)",
+ b, ca->mi.first_bucket, ca->mi.nbuckets)) {
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ ob = __try_alloc_bucket(c, ca, b, reserve, &a,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ob;
+}
+
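/*
 * Editorial aside, not part of the patch: try_alloc_bucket() above decodes a
 * freespace-btree entry as the bucket number in the low 56 bits and the
 * generation bits in the top 8, which it then cross-checks against
 * alloc_freespace_genbits(a) >> 56. A hedged standalone sketch of that layout,
 * using hypothetical helper names:
 */
#include <assert.h>
#include <stdint.h>

static inline uint64_t pack_free_entry(uint64_t bucket, unsigned genbits)
{
	return ((uint64_t) genbits << 56) | (bucket & ~(~0ULL << 56));
}

static inline uint64_t free_entry_bucket(uint64_t e)
{
	return e & ~(~0ULL << 56);	/* low 56 bits */
}

static inline unsigned free_entry_genbits(uint64_t e)
{
	return e >> 56;			/* top 8 bits */
}

int main(void)
{
	uint64_t e = pack_free_entry(123456, 7);

	assert(free_entry_bucket(e) == 123456);
	assert(free_entry_genbits(e) == 7);
	return 0;
}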
+static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve)
+{
+ struct open_bucket *ob;
+ int i;
+
+ spin_lock(&c->freelist_lock);
+
+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+ ob = c->open_buckets + ca->open_buckets_partial[i];
+
+ if (reserve <= ob->alloc_reserve) {
+ array_remove_item(ca->open_buckets_partial,
+ ca->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+ ob->alloc_reserve = reserve;
+ spin_unlock(&c->freelist_lock);
+ return ob;
+ }
+ }
+
+ spin_unlock(&c->freelist_lock);
+ return NULL;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_trans_early(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ u64 *cur_bucket,
+ u64 *buckets_seen,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ int ret;
+
+ *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket);
+ *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx);
+
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket),
+ BTREE_ITER_SLOTS, k, ret) {
+ struct bch_alloc_v4 a;
+
+ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
+
+ if (ca->new_fs_bucket_idx &&
+ is_superblock_bucket(ca, k.k->p.offset))
+ continue;
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (bucket_state(a) != BUCKET_free)
+ continue;
+
+ (*buckets_seen)++;
+
+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl);
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ *cur_bucket = iter.pos.offset;
+
+ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
+}
+
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ u64 *cur_bucket,
+ u64 *buckets_seen,
+ u64 *skipped_open,
+ u64 *skipped_need_journal_commit,
+ u64 *skipped_nouse,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ int ret;
+
+ if (unlikely(!ca->mi.freespace_initialized))
+ return bch2_bucket_alloc_trans_early(trans, ca, reserve,
+ cur_bucket,
+ buckets_seen,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl);
+
+ BUG_ON(ca->new_fs_bucket_idx);
+
+ for_each_btree_key(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, *cur_bucket), 0, k, ret) {
+ if (k.k->p.inode != ca->dev_idx)
+ break;
+
+ for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k));
+ *cur_bucket != k.k->p.offset && !ob;
+ (*cur_bucket)++) {
+ if (btree_trans_too_many_iters(trans)) {
+ ob = ERR_PTR(-EINTR);
+ break;
+ }
+
+ (*buckets_seen)++;
+
+ ob = try_alloc_bucket(trans, ca, reserve,
+ *cur_bucket,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl);
+ }
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ob ?: ERR_PTR(ret);
+}
+
+/**
+ * bch2_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns a struct open_bucket on success, or an ERR_PTR() on failure.
+ */
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ bool may_alloc_partial,
+ struct closure *cl)
+{
+ struct open_bucket *ob = NULL;
+ u64 avail = dev_buckets_available(ca, reserve);
+ u64 cur_bucket = 0;
+ u64 buckets_seen = 0;
+ u64 skipped_open = 0;
+ u64 skipped_need_journal_commit = 0;
+ u64 skipped_nouse = 0;
+ int ret;
+
+ if (may_alloc_partial) {
+ ob = try_alloc_partial_bucket(c, ca, reserve);
+ if (ob)
+ return ob;
+ }
+again:
+ if (!avail) {
+ if (cl) {
+ closure_wait(&c->freelist_wait, cl);
+ /* recheck after putting ourselves on the waitlist */
+ avail = dev_buckets_available(ca, reserve);
+ if (avail) {
+ closure_wake_up(&c->freelist_wait);
+ goto again;
+ }
+ }
+
+ if (!c->blocked_allocate)
+ c->blocked_allocate = local_clock();
+
+ ob = ERR_PTR(-FREELIST_EMPTY);
+ goto err;
+ }
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
+ &cur_bucket,
+ &buckets_seen,
+ &skipped_open,
+ &skipped_need_journal_commit,
+ &skipped_nouse,
+ cl)));
+
+ if (skipped_need_journal_commit * 2 > avail)
+ bch2_journal_flush_async(&c->journal, NULL);
+err:
+ if (!ob)
+ ob = ERR_PTR(ret ?: -FREELIST_EMPTY);
+
+ if (IS_ERR(ob)) {
+ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail,
+ buckets_seen,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl == NULL, PTR_ERR(ob));
+ atomic_long_inc(&c->bucket_alloc_fail);
+ }
- trace_bucket_alloc(ca, reserve);
return ob;
}
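/*
 * Editorial aside, not part of the patch: the "recheck after putting ourselves
 * on the waitlist" step in bch2_bucket_alloc() above closes the race where
 * buckets become available between the first availability check and
 * closure_wait(); a wakeup issued in that window would otherwise be lost.
 * The same idiom with a plain kernel waitqueue, illustrative only, all names
 * hypothetical:
 */
#include <linux/compiler.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
static bool demo_resource_available;

static void demo_wait_for_resource(void)
{
	DEFINE_WAIT(w);

	for (;;) {
		prepare_to_wait(&demo_wait, &w, TASK_UNINTERRUPTIBLE);
		/* recheck only after we are queued, so a concurrent wakeup
		 * cannot slip past us: */
		if (READ_ONCE(demo_resource_available))
			break;
		schedule();
	}
	finish_wait(&demo_wait, &w);
}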
@@ -329,7 +576,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
struct dev_stripe_state *stripe)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_available(ca);
+ u64 free_space = dev_buckets_available(ca, RESERVE_none);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
@@ -380,6 +627,7 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
{
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ unsigned dev;
struct bch_dev *ca;
int ret = -INSUFFICIENT_DEVICES;
unsigned i;
@@ -389,30 +637,43 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
for (i = 0; i < devs_sorted.nr; i++) {
struct open_bucket *ob;
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+ dev = devs_sorted.devs[i];
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
if (!ca)
continue;
- if (!ca->mi.durability && *have_cache)
+ if (!ca->mi.durability && *have_cache) {
+ percpu_ref_put(&ca->ref);
continue;
+ }
ob = bch2_bucket_alloc(c, ca, reserve,
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
+ if (!IS_ERR(ob))
+ bch2_dev_stripe_increment(ca, stripe);
+ percpu_ref_put(&ca->ref);
+
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
if (cl)
- return ret;
+ break;
continue;
}
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache, flags, ob);
- bch2_dev_stripe_increment(ca, stripe);
-
- if (*nr_effective >= nr_replicas)
- return 0;
+ if (*nr_effective >= nr_replicas) {
+ ret = 0;
+ break;
+ }
}
return ret;
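/*
 * Editorial aside, not part of the patch: the loop above replaces a bare
 * rcu_dereference() of the device with "look up under RCU, take a percpu ref,
 * drop RCU", so the device can be used (and the allocation can block) outside
 * the RCU read section. A minimal sketch of that idiom with hypothetical
 * types and names:
 */
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>

struct demo_dev {
	struct percpu_ref	ref;
};

/* published/retired elsewhere with rcu_assign_pointer() + a grace period */
static struct demo_dev __rcu *demo_devs[16];

static struct demo_dev *demo_dev_get(unsigned idx)
{
	struct demo_dev *d;

	rcu_read_lock();
	d = rcu_dereference(demo_devs[idx]);
	if (d)
		percpu_ref_get(&d->ref);	/* pin before leaving RCU */
	rcu_read_unlock();

	return d;	/* caller pairs this with percpu_ref_put(&d->ref) */
}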
@@ -580,9 +841,6 @@ static int open_bucket_add_buckets(struct bch_fs *c,
if (*nr_effective >= nr_replicas)
return 0;
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
@@ -596,9 +854,6 @@ retry_blocking:
goto retry_blocking;
}
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
-
return ret;
}
@@ -857,8 +1112,6 @@ alloc_done:
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
- verify_not_stale(c, &wp->ptrs);
-
return wp;
err:
open_bucket_for_each(c, &wp->ptrs, ob, i)
@@ -881,7 +1134,7 @@ err:
case -INSUFFICIENT_DEVICES:
return ERR_PTR(-EROFS);
default:
- BUG();
+ return ERR_PTR(ret);
}
}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index d466bda9afc8..8bc78877f0fc 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -12,6 +12,8 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_List;
+extern const char * const bch2_alloc_reserves[];
+
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@@ -115,6 +117,20 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke
return false;
}
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ bool ret;
+
+ if (bch2_bucket_is_open(c, dev, bucket))
+ return true;
+
+ spin_lock(&c->freelist_lock);
+ ret = bch2_bucket_is_open(c, dev, bucket);
+ spin_unlock(&c->freelist_lock);
+
+ return ret;
+}
+
int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
unsigned, unsigned *, bool *, enum alloc_reserve,
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 409232e3d998..21b56451bc18 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -10,28 +10,18 @@
struct ec_bucket_buf;
-#define ALLOC_THREAD_STATES() \
- x(stopped) \
- x(running) \
- x(blocked) \
- x(blocked_full)
-
-enum allocator_states {
-#define x(n) ALLOCATOR_##n,
- ALLOC_THREAD_STATES()
-#undef x
-};
+#define BCH_ALLOC_RESERVES() \
+ x(btree_movinggc) \
+ x(btree) \
+ x(movinggc) \
+ x(none)
enum alloc_reserve {
- RESERVE_BTREE_MOVINGGC = -2,
- RESERVE_BTREE = -1,
- RESERVE_MOVINGGC = 0,
- RESERVE_NONE = 1,
- RESERVE_NR = 2,
+#define x(name) RESERVE_##name,
+ BCH_ALLOC_RESERVES()
+#undef x
};
-typedef FIFO(long) alloc_fifo;
-
#define OPEN_BUCKETS_COUNT 1024
#define WRITE_POINT_HASH_NR 32
@@ -94,12 +84,4 @@ struct write_point_specifier {
unsigned long v;
};
-struct alloc_heap_entry {
- size_t bucket;
- size_t nr;
- unsigned long key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 3ada85ac09c6..a13845a23387 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -177,7 +177,11 @@
*/
#undef pr_fmt
+#ifdef __KERNEL__
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+#else
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+#endif
#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
@@ -219,8 +223,8 @@
#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
#else
-#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name)
-#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum)
+#define bch2_fmt(_c, fmt) fmt "\n"
+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
#endif
#define bch_info(c, fmt, ...) \
@@ -277,9 +281,6 @@ do { \
"significantly affect performance") \
BCH_DEBUG_PARAM(debug_check_iterators, \
"Enables extra verification for btree iterators") \
- BCH_DEBUG_PARAM(debug_check_bkeys, \
- "Run bkey_debugcheck (primarily checking GC/allocation "\
- "information) when iterating over keys") \
BCH_DEBUG_PARAM(debug_check_btree_accounting, \
"Verify btree accounting for keys within a node") \
BCH_DEBUG_PARAM(journal_seq_verify, \
@@ -351,6 +352,7 @@ enum bch_time_stats {
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
#include "ec_types.h"
#include "journal_types.h"
@@ -389,6 +391,10 @@ enum gc_phase {
GC_PHASE_BTREE_reflink,
GC_PHASE_BTREE_subvolumes,
GC_PHASE_BTREE_snapshots,
+ GC_PHASE_BTREE_lru,
+ GC_PHASE_BTREE_freespace,
+ GC_PHASE_BTREE_need_discard,
+ GC_PHASE_BTREE_backpointers,
GC_PHASE_PENDING_DELETE,
};
@@ -432,6 +438,7 @@ struct bch_dev {
struct bch_sb_handle disk_sb;
struct bch_sb *sb_read_scratch;
int sb_write_error;
+ dev_t dev;
struct bch_devs_mask self;
@@ -444,8 +451,9 @@ struct bch_dev {
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
- struct bucket_array __rcu *buckets[2];
- struct bucket_gens *bucket_gens;
+ struct bucket_array __rcu *buckets_gc;
+ struct bucket_gens __rcu *bucket_gens;
+ u8 *oldest_gen;
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
@@ -455,32 +463,16 @@ struct bch_dev {
/* Allocator: */
u64 new_fs_bucket_idx;
- struct task_struct __rcu *alloc_thread;
- /*
- * free: Buckets that are ready to be used
- *
- * free_inc: Incoming buckets - these are buckets that currently have
- * cached data in them, and we can't reuse them until after we write
- * their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
- */
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
unsigned nr_open_buckets;
+ unsigned nr_btree_reserve;
open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_partial_nr;
- size_t fifo_last_bucket;
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
-
- enum allocator_states allocator_state;
-
- alloc_heap alloc_heap;
+ size_t buckets_waiting_on_journal;
atomic64_t rebalance_work;
@@ -502,17 +494,13 @@ struct bch_dev {
enum {
/* startup: */
- BCH_FS_INITIALIZED,
- BCH_FS_ALLOC_READ_DONE,
BCH_FS_ALLOC_CLEAN,
- BCH_FS_ALLOCATOR_RUNNING,
- BCH_FS_ALLOCATOR_STOPPING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_INITIAL_GC_UNFIXED,
BCH_FS_TOPOLOGY_REPAIR_DONE,
- BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
+ BCH_FS_MAY_GO_RW,
BCH_FS_RW,
BCH_FS_WAS_RW,
@@ -530,16 +518,11 @@ enum {
/* misc: */
BCH_FS_NEED_ANOTHER_GC,
BCH_FS_DELETED_NODES,
- BCH_FS_NEED_ALLOC_WRITE,
BCH_FS_REBUILD_REPLICAS,
- BCH_FS_HOLD_BTREE_WRITES,
};
struct btree_debug {
unsigned id;
- struct dentry *btree;
- struct dentry *btree_format;
- struct dentry *failed;
};
struct bch_fs_pcpu {
@@ -560,6 +543,7 @@ struct journal_keys {
enum btree_id btree_id:8;
unsigned level:8;
bool allocated;
+ bool overwritten;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;
@@ -666,7 +650,7 @@ struct bch_fs {
struct mutex snapshot_table_lock;
struct work_struct snapshot_delete_work;
struct work_struct snapshot_wait_for_pagecache_and_delete_work;
- struct snapshot_id_list snapshots_unlinked;
+ snapshot_id_list snapshots_unlinked;
struct mutex snapshots_unlinked_lock;
/* BTREE CACHE */
@@ -709,6 +693,7 @@ struct bch_fs {
bool btree_trans_barrier_initialized;
struct btree_key_cache btree_key_cache;
+ unsigned btree_key_cache_btrees;
struct workqueue_struct *btree_update_wq;
struct workqueue_struct *btree_io_complete_wq;
@@ -750,6 +735,7 @@ struct bch_fs {
/* JOURNAL SEQ BLACKLIST */
struct journal_seq_blacklist_table *
journal_seq_blacklist_table;
+ struct work_struct journal_seq_blacklist_gc_work;
/* ALLOCATOR */
spinlock_t freelist_lock;
@@ -771,6 +757,10 @@ struct bch_fs {
struct mutex write_points_hash_lock;
unsigned write_points_nr;
+ struct buckets_waiting_for_journal buckets_waiting_for_journal;
+ struct work_struct discard_work;
+ struct work_struct invalidate_work;
+
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
@@ -796,6 +786,7 @@ struct bch_fs {
* it's not while a gc is in progress.
*/
struct rw_semaphore gc_lock;
+ struct mutex gc_gens_lock;
/* IO PATH */
struct semaphore io_in_flight;
@@ -858,7 +849,6 @@ struct bch_fs {
u64 reflink_hint;
reflink_gc_table reflink_gc_table;
size_t reflink_gc_nr;
- size_t reflink_gc_idx;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
@@ -879,7 +869,8 @@ struct bch_fs {
struct bch_memquota_type quotas[QTYP_NR];
/* DEBUG JUNK */
- struct dentry *debug;
+ struct dentry *fs_debug_dir;
+ struct dentry *btree_debug_dir;
struct btree_debug btree_debug[BTREE_ID_NR];
struct btree *verify_data;
struct btree_node *verify_ondisk;
@@ -907,6 +898,7 @@ struct bch_fs {
atomic_long_t read_realloc_races;
atomic_long_t extent_migrate_done;
atomic_long_t extent_migrate_raced;
+ atomic_long_t bucket_alloc_fail;
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
@@ -943,6 +935,11 @@ static inline size_t btree_sectors(const struct bch_fs *c)
return c->opts.btree_node_size >> 9;
}
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+ return c->btree_key_cache_btrees & (1U << btree);
+}
+
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index a053fca7886d..8312018e1ed5 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -76,6 +76,22 @@
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
+#include "vstructs.h"
+
+#define BITMASK(name, type, field, offset, end) \
+static const unsigned name##_OFFSET = offset; \
+static const unsigned name##_BITS = (end - offset); \
+ \
+static inline __u64 name(const type *k) \
+{ \
+ return (k->field >> offset) & ~(~0ULL << (end - offset)); \
+} \
+ \
+static inline void SET_##name(type *k, __u64 v) \
+{ \
+ k->field &= ~(~(~0ULL << (end - offset)) << offset); \
+ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
+}
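/*
 * Editorial aside, not part of the patch: BITMASK() above generates native
 * (not little-endian) getter/setter pairs for a bit range within a struct
 * member; it is used further down for the bch_alloc_v4 flags. A trimmed
 * standalone sketch of what one invocation expands to and how it behaves;
 * the demo struct and names are hypothetical, and the real macro also emits
 * name##_OFFSET / name##_BITS constants omitted here:
 */
#include <assert.h>

typedef unsigned long long __u64;
typedef unsigned int __u32;

struct demo_alloc {
	__u32 flags;
};

#define BITMASK(name, type, field, offset, end)				\
static inline __u64 name(const type *k)					\
{									\
	return (k->field >> offset) & ~(~0ULL << (end - offset));	\
}									\
									\
static inline void SET_##name(type *k, __u64 v)				\
{									\
	k->field &= ~(~(~0ULL << (end - offset)) << offset);		\
	k->field |= (v & ~(~0ULL << (end - offset))) << offset;	\
}

BITMASK(DEMO_NEED_DISCARD,		struct demo_alloc, flags, 0, 1)
BITMASK(DEMO_BACKPOINTERS_START,	struct demo_alloc, flags, 2, 8)

int main(void)
{
	struct demo_alloc a = { 0 };

	SET_DEMO_NEED_DISCARD(&a, 1);
	SET_DEMO_BACKPOINTERS_START(&a, 5);

	assert(DEMO_NEED_DISCARD(&a) == 1);
	assert(DEMO_BACKPOINTERS_START(&a) == 5);
	return 0;
}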
#define LE_BITMASK(_bits, name, type, field, offset, end) \
static const unsigned name##_OFFSET = offset; \
@@ -346,7 +362,10 @@ static inline void bkey_init(struct bkey *k)
x(subvolume, 21) \
x(snapshot, 22) \
x(inode_v2, 23) \
- x(alloc_v3, 24)
+ x(alloc_v3, 24) \
+ x(set, 25) \
+ x(lru, 26) \
+ x(alloc_v4, 27)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -376,6 +395,10 @@ struct bch_hash_whiteout {
struct bch_val v;
};
+struct bch_set {
+ struct bch_val v;
+};
+
/* Extents */
/*
@@ -876,8 +899,8 @@ struct bch_alloc_v2 {
#define BCH_ALLOC_FIELDS_V2() \
x(read_time, 64) \
x(write_time, 64) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
x(stripe, 32) \
x(stripe_redundancy, 8)
@@ -892,11 +915,34 @@ struct bch_alloc_v3 {
__u8 data[];
} __attribute__((packed, aligned(8)));
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ struct bpos backpointers[0];
+} __attribute__((packed, aligned(8)));
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
enum {
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
#undef x
- BCH_ALLOC_FIELD_NR
};
/* Quotas: */
@@ -1014,6 +1060,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+/* LRU btree: */
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __attribute__((packed, aligned(8)));
+
+#define LRU_ID_STRIPES (1U << 16)
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1022,16 +1077,17 @@ struct bch_sb_field {
__le32 type;
};
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members, 1) \
- x(crypt, 2) \
- x(replicas_v0, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6) \
- x(replicas, 7) \
- x(journal_seq_blacklist, 8)
+#define BCH_SB_FIELDS() \
+ x(journal, 0) \
+ x(members, 1) \
+ x(crypt, 2) \
+ x(replicas_v0, 3) \
+ x(quota, 4) \
+ x(disk_groups, 5) \
+ x(clean, 6) \
+ x(replicas, 7) \
+ x(journal_seq_blacklist, 8) \
+ x(journal_v2, 9)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -1040,6 +1096,14 @@ enum bch_sb_field_type {
BCH_SB_FIELD_NR
};
+/*
+ * Most superblock fields are replicated in all devices' superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS \
+ ((1U << BCH_SB_FIELD_journal)| \
+ (1U << BCH_SB_FIELD_journal_v2))
+
/* BCH_SB_FIELD_journal: */
struct bch_sb_field_journal {
@@ -1047,6 +1111,15 @@ struct bch_sb_field_journal {
__le64 buckets[0];
};
+struct bch_sb_field_journal_v2 {
+ struct bch_sb_field field;
+
+ struct bch_sb_field_journal_v2_entry {
+ __le64 start;
+ __le64 nr;
+ } d[0];
+};
+
/* BCH_SB_FIELD_members: */
#define BCH_MIN_NR_NBUCKETS (1 << 6)
@@ -1068,6 +1141,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags[0], 30, 31)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
@@ -1274,19 +1349,25 @@ struct bch_sb_field_journal_seq_blacklist {
#define BCH_JSET_VERSION_OLD 2
#define BCH_BSET_VERSION_OLD 3
+#define BCH_METADATA_VERSIONS() \
+ x(bkey_renumber, 10) \
+ x(inode_btree_change, 11) \
+ x(snapshot, 12) \
+ x(inode_backpointers, 13) \
+ x(btree_ptr_sectors_written, 14) \
+ x(snapshot_2, 15) \
+ x(reflink_p_fix, 16) \
+ x(subvol_dirent, 17) \
+ x(inode_v2, 18) \
+ x(freespace, 19) \
+ x(alloc_v4, 20)
+
enum bcachefs_metadata_version {
- bcachefs_metadata_version_min = 9,
- bcachefs_metadata_version_new_versioning = 10,
- bcachefs_metadata_version_bkey_renumber = 10,
- bcachefs_metadata_version_inode_btree_change = 11,
- bcachefs_metadata_version_snapshot = 12,
- bcachefs_metadata_version_inode_backpointers = 13,
- bcachefs_metadata_version_btree_ptr_sectors_written = 14,
- bcachefs_metadata_version_snapshot_2 = 15,
- bcachefs_metadata_version_reflink_p_fix = 16,
- bcachefs_metadata_version_subvol_dirent = 17,
- bcachefs_metadata_version_inode_v2 = 18,
- bcachefs_metadata_version_max = 19,
+ bcachefs_metadata_version_min = 9,
+#define x(t, n) bcachefs_metadata_version_##t = n,
+ BCH_METADATA_VERSIONS()
+#undef x
+ bcachefs_metadata_version_max
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -1426,6 +1507,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
/*
* Features:
@@ -1660,7 +1742,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(usage, 5) \
x(data_usage, 6) \
x(clock, 7) \
- x(dev_usage, 8)
+ x(dev_usage, 8) \
+ x(log, 9)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -1690,11 +1773,16 @@ struct jset_entry_blacklist_v2 {
__le64 end;
};
+#define BCH_FS_USAGE_TYPES() \
+ x(reserved, 0) \
+ x(inodes, 1) \
+ x(key_version, 2)
+
enum {
- FS_USAGE_RESERVED = 0,
- FS_USAGE_INODES = 1,
- FS_USAGE_KEY_VERSION = 2,
- FS_USAGE_NR = 3
+#define x(f, nr) BCH_FS_USAGE_##f = nr,
+ BCH_FS_USAGE_TYPES()
+#undef x
+ BCH_FS_USAGE_NR
};
struct jset_entry_usage {
@@ -1732,6 +1820,17 @@ struct jset_entry_dev_usage {
struct jset_entry_dev_usage_type d[];
} __attribute__((packed));
+static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
+{
+ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+}
+
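/*
 * Editorial aside, not part of the patch: jset_entry_dev_usage_nr_types()
 * above derives the number of trailing d[] entries from the total byte size of
 * the vstruct. The same flexible-array arithmetic in standalone form, with
 * hypothetical types:
 */
#include <assert.h>
#include <stddef.h>

struct demo_entry_type {
	unsigned long long	buckets;
	unsigned long long	sectors;
};

struct demo_entry {
	unsigned		hdr;
	struct demo_entry_type	d[];
};

static unsigned demo_nr_types(size_t total_bytes)
{
	return (total_bytes - sizeof(struct demo_entry)) /
		sizeof(struct demo_entry_type);
}

int main(void)
{
	/* a buffer holding the header plus three trailing entries: */
	size_t bytes = sizeof(struct demo_entry) +
		3 * sizeof(struct demo_entry_type);

	assert(demo_nr_types(bytes) == 3);
	return 0;
}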
+struct jset_entry_log {
+ struct jset_entry entry;
+ u8 d[];
+} __attribute__((packed));
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1785,7 +1884,11 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
x(stripes, 6) \
x(reflink, 7) \
x(subvolumes, 8) \
- x(snapshots, 9)
+ x(snapshots, 9) \
+ x(lru, 10) \
+ x(freespace, 11) \
+ x(need_discard, 12) \
+ x(backpointers, 13)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 946dd27f09fc..4b01ab3029a2 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -57,11 +57,12 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
tmp = __bch2_bkey_unpack_key(format, packed);
if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
- char buf1[160], buf2[160];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
char buf3[160], buf4[160];
- bch2_bkey_to_text(&PBUF(buf1), unpacked);
- bch2_bkey_to_text(&PBUF(buf2), &tmp);
+ bch2_bkey_to_text(&buf1, unpacked);
+ bch2_bkey_to_text(&buf2, &tmp);
bch2_to_binary(buf3, (void *) unpacked, 80);
bch2_to_binary(buf4, high_word(format, packed), 80);
@@ -72,7 +73,7 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
format->bits_per_field[2],
format->bits_per_field[3],
format->bits_per_field[4],
- buf1, buf2, buf3, buf4);
+ buf1.buf, buf2.buf, buf3, buf4);
}
}
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5c900cf8a8a2..0eac86e5e776 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -9,6 +9,7 @@
#include "error.h"
#include "extents.h"
#include "inode.h"
+#include "lru.h"
#include "quota.h"
#include "reflink.h"
#include "subvolume.h"
@@ -85,6 +86,24 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
.val_to_text = key_type_inline_data_to_text, \
}
+static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ if (bkey_val_bytes(k.k))
+ return "nonempty value";
+ return NULL;
+}
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+#define bch2_bkey_ops_set (struct bkey_ops) { \
+ .key_invalid = key_type_set_invalid, \
+ .key_merge = key_type_set_merge, \
+}
+
const struct bkey_ops bch2_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
@@ -130,7 +149,8 @@ static unsigned bch2_key_types_allowed[] = {
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_alloc)|
(1U << KEY_TYPE_alloc_v2)|
- (1U << KEY_TYPE_alloc_v3),
+ (1U << KEY_TYPE_alloc_v3)|
+ (1U << KEY_TYPE_alloc_v4),
[BKEY_TYPE_quotas] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_quota),
@@ -147,6 +167,15 @@ static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_snapshots] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_snapshot),
+ [BKEY_TYPE_lru] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_lru),
+ [BKEY_TYPE_freespace] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_set),
+ [BKEY_TYPE_need_discard] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_set),
[BKEY_TYPE_btree] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
@@ -212,22 +241,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
return NULL;
}
-void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
-{
- const char *invalid;
-
- BUG_ON(!k.k->u64s);
-
- invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
- bch2_bkey_in_btree_node(b, k);
- if (invalid) {
- char buf[160];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
- }
-}
-
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
{
if (!bpos_cmp(pos, POS_MIN))
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 3012035db1a3..2289a09d98fc 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -6,6 +6,7 @@
struct bch_fs;
struct btree;
+struct btree_trans;
struct bkey;
enum btree_node_type;
@@ -20,6 +21,10 @@ struct bkey_ops {
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+ int (*trans_trigger)(struct btree_trans *, struct bkey_s_c,
+ struct bkey_i *, unsigned);
+ int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c,
+ struct bkey_s_c, unsigned);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@@ -34,8 +39,6 @@ const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
enum btree_node_type);
const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
-void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
void bch2_val_to_text(struct printbuf *, struct bch_fs *,
@@ -59,6 +62,28 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+static inline int bch2_mark_key(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type];
+
+ return ops->atomic_trigger
+ ? ops->atomic_trigger(trans, old, new, flags)
+ : 0;
+}
+
+static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
+ struct bkey_i *new, unsigned flags)
+{
+ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type];
+
+ return ops->trans_trigger
+ ? ops->trans_trigger(trans, old, new, flags)
+ : 0;
+}
+
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 59e4c1d1a2a5..c7a41d0dc781 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -70,7 +70,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
struct bkey_packed *_k, *_n;
struct bkey uk, n;
struct bkey_s_c k;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
if (!i->u64s)
return;
@@ -81,12 +81,14 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
_n = bkey_next(_k);
k = bkey_disassemble(b, _k, &uk);
+
+ printbuf_reset(&buf);
if (c)
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ bch2_bkey_val_to_text(&buf, c, k);
else
- bch2_bkey_to_text(&PBUF(buf), k.k);
+ bch2_bkey_to_text(&buf, k.k);
printk(KERN_ERR "block %u key %5zu: %s\n", set,
- _k->_data - i->_data, buf);
+ _k->_data - i->_data, buf.buf);
if (_n == vstruct_last(i))
continue;
@@ -102,6 +104,8 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
!bpos_cmp(n.p, k.k->p))
printk(KERN_ERR "Duplicate keys\n");
}
+
+ printbuf_exit(&buf);
}
void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
@@ -118,6 +122,7 @@ void bch2_dump_btree_node_iter(struct btree *b,
struct btree_node_iter *iter)
{
struct btree_node_iter_set *set;
+ struct printbuf buf = PRINTBUF;
printk(KERN_ERR "btree node iter with %u/%u sets:\n",
__btree_node_iter_used(iter), b->nsets);
@@ -126,12 +131,14 @@ void bch2_dump_btree_node_iter(struct btree *b,
struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
struct bset_tree *t = bch2_bkey_to_bset(b, k);
struct bkey uk = bkey_unpack_key(b, k);
- char buf[100];
- bch2_bkey_to_text(&PBUF(buf), &uk);
+ printbuf_reset(&buf);
+ bch2_bkey_to_text(&buf, &uk);
printk(KERN_ERR "set %zu key %u: %s\n",
- t - b->set, set->k, buf);
+ t - b->set, set->k, buf.buf);
}
+
+ printbuf_exit(&buf);
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -167,13 +174,14 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
struct btree_node_iter_set *set;
struct bkey ku = bkey_unpack_key(b, k);
struct bkey nu = bkey_unpack_key(b, n);
- char buf1[80], buf2[80];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &ku);
- bch2_bkey_to_text(&PBUF(buf2), &nu);
+ bch2_bkey_to_text(&buf1, &ku);
+ bch2_bkey_to_text(&buf2, &nu);
printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
printk(KERN_ERR "iter was:");
btree_node_iter_for_each(_iter, set) {
@@ -238,6 +246,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
struct bset_tree *t = bch2_bkey_to_bset(b, where);
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
#if 0
BUG_ON(prev &&
bkey_iter_cmp(b, prev, insert) > 0);
@@ -246,17 +256,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
bkey_iter_cmp(b, prev, insert) > 0) {
struct bkey k1 = bkey_unpack_key(b, prev);
struct bkey k2 = bkey_unpack_key(b, insert);
- char buf1[100];
- char buf2[100];
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &k1);
- bch2_bkey_to_text(&PBUF(buf2), &k2);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
panic("prev > insert:\n"
"prev key %s\n"
"insert key %s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
#endif
#if 0
@@ -267,17 +275,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
bkey_iter_cmp(b, insert, next) > 0) {
struct bkey k1 = bkey_unpack_key(b, insert);
struct bkey k2 = bkey_unpack_key(b, next);
- char buf1[100];
- char buf2[100];
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &k1);
- bch2_bkey_to_text(&PBUF(buf2), &k2);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
panic("insert > next:\n"
"insert key %s\n"
"next key %s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
#endif
}
@@ -473,7 +479,7 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
unsigned j)
{
return cacheline_to_bkey(b, t,
- __eytzinger1_to_inorder(j, t->size, t->extra),
+ __eytzinger1_to_inorder(j, t->size - 1, t->extra),
bkey_float(b, t, j)->key_offset);
}
@@ -607,10 +613,10 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
}
__always_inline
-static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
+static inline void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
{
struct bkey_float *f = bkey_float(b, t, j);
struct bkey_packed *m = tree_to_bkey(b, t, j);
@@ -679,34 +685,6 @@ static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
f->mantissa = mantissa;
}
-static void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
-{
- struct bkey_i *k;
-
- if (is_power_of_2(j) &&
- !min_key->u64s) {
- if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
- k = (void *) min_key;
- bkey_init(&k->k);
- k->k.p = b->data->min_key;
- }
- }
-
- if (is_power_of_2(j + 1) &&
- !max_key->u64s) {
- if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
- k = (void *) max_key;
- bkey_init(&k->k);
- k->k.p = b->data->max_key;
- }
- }
-
- __make_bfloat(b, t, j, min_key, max_key);
-}
-
/* bytes remaining - only valid for last bset: */
static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
@@ -763,7 +741,7 @@ retry:
t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
/* First we figure out where the first key in each cacheline is */
- eytzinger1_for_each(j, t->size) {
+ eytzinger1_for_each(j, t->size - 1) {
while (bkey_to_cacheline(b, t, k) < cacheline)
prev = k, k = bkey_next(k);
@@ -795,10 +773,10 @@ retry:
}
/* Then we build the tree */
- eytzinger1_for_each(j, t->size)
- __make_bfloat(b, t, j,
- bkey_to_packed(&min_key),
- bkey_to_packed(&max_key));
+ eytzinger1_for_each(j, t->size - 1)
+ make_bfloat(b, t, j,
+ bkey_to_packed(&min_key),
+ bkey_to_packed(&max_key));
}
static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
@@ -897,7 +875,7 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
do {
p = j ? tree_to_bkey(b, t,
__inorder_to_eytzinger1(j--,
- t->size, t->extra))
+ t->size - 1, t->extra))
: btree_bkey_first(b, t);
} while (p >= k);
break;
@@ -943,91 +921,6 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
/* Insert */
-static void rw_aux_tree_fix_invalidated_key(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- unsigned offset = __btree_node_key_to_offset(b, k);
- unsigned j = rw_aux_tree_bsearch(b, t, offset);
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset == offset)
- rw_aux_tree_set(b, t, j, k);
-
- bch2_bset_verify_rw_aux_tree(b, t);
-}
-
-static void ro_aux_tree_fix_invalidated_key(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct bkey_packed min_key, max_key;
- unsigned inorder, j;
-
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
-
- if (bkey_next(k) == btree_bkey_last(b, t)) {
- for (j = 1; j < t->size; j = j * 2 + 1)
- make_bfloat(b, t, j, &min_key, &max_key);
- }
-
- inorder = bkey_to_cacheline(b, t, k);
-
- if (inorder &&
- inorder < t->size) {
- j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
-
- if (k == tree_to_bkey(b, t, j)) {
- /* Fix the node this key corresponds to */
- make_bfloat(b, t, j, &min_key, &max_key);
-
- /* Children for which this key is the right boundary */
- for (j = eytzinger1_left_child(j);
- j < t->size;
- j = eytzinger1_right_child(j))
- make_bfloat(b, t, j, &min_key, &max_key);
- }
- }
-
- if (inorder + 1 < t->size) {
- j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
-
- if (k == tree_to_prev_bkey(b, t, j)) {
- make_bfloat(b, t, j, &min_key, &max_key);
-
- /* Children for which this key is the left boundary */
- for (j = eytzinger1_right_child(j);
- j < t->size;
- j = eytzinger1_left_child(j))
- make_bfloat(b, t, j, &min_key, &max_key);
- }
- }
-}
-
-/**
- * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
- * modified, fix any auxiliary search tree by remaking all the nodes in the
- * auxiliary search tree that @k corresponds to
- */
-void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- break;
- case BSET_RO_AUX_TREE:
- ro_aux_tree_fix_invalidated_key(b, t, k);
- break;
- case BSET_RW_AUX_TREE:
- rw_aux_tree_fix_invalidated_key(b, t, k);
- break;
- }
-}
-
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
@@ -1262,7 +1155,7 @@ slowpath:
n = n * 2 + (cmp < 0);
} while (n < t->size);
- inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
+ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
/*
* n would have been the node we recursed to - the low bit tells us if
@@ -1273,7 +1166,7 @@ slowpath:
if (unlikely(!inorder))
return btree_bkey_first(b, t);
- f = &base->f[eytzinger1_prev(n >> 1, t->size)];
+ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
}
return cacheline_to_bkey(b, t, inorder, f->key_offset);
@@ -1547,10 +1440,6 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
EBUG_ON(iter->data->k > iter->data->end);
- while (!__btree_node_iter_set_end(iter, 0) &&
- !__bch2_btree_node_iter_peek_all(iter, b)->u64s)
- iter->data->k++;
-
if (unlikely(__btree_node_iter_set_end(iter, 0))) {
bch2_btree_node_iter_set_drop(iter, iter->data);
return;
@@ -1684,9 +1573,6 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
struct bkey uk;
unsigned j, inorder;
- if (out->pos != out->end)
- *out->pos = '\0';
-
if (!bset_has_ro_aux_tree(t))
return;
@@ -1694,7 +1580,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
if (!inorder || inorder >= t->size)
return;
- j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
+ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
if (k != tree_to_bkey(b, t, j))
return;
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index e42f866cf2ec..0d46534c3dcd 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -361,7 +361,6 @@ void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct bch_fs *, struct btree *,
struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
struct bkey_packed *, struct bkey_i *, unsigned);
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 3411d5a02203..0dcdc30c6888 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -15,6 +15,13 @@
struct lock_class_key bch2_btree_node_lock_key;
+const char * const bch2_btree_node_flags[] = {
+#define x(f) #f,
+ BTREE_FLAGS()
+#undef x
+ NULL
+};
+
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
unsigned i, reserve = 16;
@@ -35,6 +42,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
return max_t(int, 0, bc->used - bc->reserve);
}
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+ if (b->c.lock.readers)
+ list_move(&b->list, &bc->freed_pcpu);
+ else
+ list_move(&b->list, &bc->freed_nonpcpu);
+}
+
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
@@ -51,7 +66,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
b->aux_data = NULL;
bc->used--;
- list_move(&b->list, &bc->freed);
+
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -83,6 +99,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+ if (b->aux_data == MAP_FAILED)
+ b->aux_data = NULL;
#endif
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
@@ -154,11 +172,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
b->c.level = level;
b->c.btree_id = id;
- if (level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free_rcu(&b->c.lock);
-
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
@@ -215,15 +228,13 @@ wait_on_io:
goto wait_on_io;
}
- if (btree_node_noevict(b))
- goto out_unlock;
-
- if (!btree_node_may_write(b))
+ if (btree_node_noevict(b) ||
+ btree_node_write_blocked(b) ||
+ btree_node_will_make_reachable(b))
goto out_unlock;
if (btree_node_dirty(b)) {
- if (!flush ||
- test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+ if (!flush)
goto out_unlock;
/*
* Using the underscore version because we don't want to compact
@@ -232,9 +243,9 @@ wait_on_io:
* the post write cleanup:
*/
if (bch2_verify_btree_ondisk)
- bch2_btree_node_write(c, b, SIX_LOCK_intent);
+ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
else
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, 0);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -274,6 +285,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned long freed = 0;
unsigned i, flags;
+ unsigned long ret = SHRINK_STOP;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@@ -282,7 +294,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (sc->gfp_mask & __GFP_FS)
mutex_lock(&bc->lock);
else if (!mutex_trylock(&bc->lock))
- return -1;
+ goto out_norestore;
flags = memalloc_nofs_save();
@@ -299,13 +311,19 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
i = 0;
list_for_each_entry_safe(b, t, &bc->freeable, list) {
+ /*
+ * Leave a few nodes on the freeable list, so that a btree split
+ * won't have to hit the system allocator:
+ */
+ if (++i <= 3)
+ continue;
+
touched++;
if (touched >= nr)
break;
- if (++i > 3 &&
- !btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -314,17 +332,13 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
}
restart:
list_for_each_entry_safe(b, t, &bc->live, list) {
- touched++;
-
- if (touched >= nr) {
- /* Save position */
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
- break;
+ /* tweak this */
+ if (btree_node_accessed(b)) {
+ clear_btree_node_accessed(b);
+ goto touched;
}
- if (!btree_node_accessed(b) &&
- !btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b)) {
/* can't call bch2_btree_node_hash_remove under lock */
freed++;
if (&t->list != &bc->live)
@@ -345,14 +359,30 @@ restart:
else if (!mutex_trylock(&bc->lock))
goto out;
goto restart;
- } else
- clear_btree_node_accessed(b);
+ } else {
+ continue;
+ }
+touched:
+ touched++;
+
+ if (touched >= nr) {
+ /* Save position */
+ if (&t->list != &bc->live)
+ list_move_tail(&bc->live, &t->list);
+ break;
+ }
}
mutex_unlock(&bc->lock);
out:
+ ret = (unsigned long) freed * btree_pages(c);
memalloc_nofs_restore(flags);
- return (unsigned long) freed * btree_pages(c);
+out_norestore:
+ trace_btree_cache_scan(sc->nr_to_scan,
+ sc->nr_to_scan / btree_pages(c),
+ btree_cache_can_free(bc),
+ ret);
+ return ret;
}
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
@@ -400,15 +430,17 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(c, b);
+ clear_btree_node_dirty_acct(c, b);
btree_node_data_free(c, b);
}
BUG_ON(atomic_read(&c->btree_cache.dirty));
- while (!list_empty(&bc->freed)) {
- b = list_first_entry(&bc->freed, struct btree, list);
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ while (!list_empty(&bc->freed_nonpcpu)) {
+ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
list_del(&b->list);
six_lock_pcpu_free(&b->c.lock);
kfree(b);
@@ -462,7 +494,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed);
+ INIT_LIST_HEAD(&bc->freed_pcpu);
+ INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/*
@@ -537,10 +570,13 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
}
}
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
+ struct list_head *freed = pcpu_read_locks
+ ? &bc->freed_pcpu
+ : &bc->freed_nonpcpu;
+ struct btree *b, *b2;
u64 start_time = local_clock();
unsigned flags;
@@ -548,44 +584,49 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
mutex_lock(&bc->lock);
/*
- * btree_free() doesn't free memory; it sticks the node on the end of
- * the list. Check if there's any freed nodes there:
- */
- list_for_each_entry(b, &bc->freeable, list)
- if (!btree_node_reclaim(c, b))
- goto got_node;
-
- /*
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
- list_for_each_entry(b, &bc->freed, list)
- if (!btree_node_reclaim(c, b))
+ list_for_each_entry(b, freed, list)
+ if (!btree_node_reclaim(c, b)) {
+ list_del_init(&b->list);
goto got_node;
+ }
+
+ b = __btree_node_mem_alloc(c);
+ if (!b)
+ goto err_locked;
+
+ if (pcpu_read_locks)
+ six_lock_pcpu_alloc(&b->c.lock);
- b = NULL;
+ BUG_ON(!six_trylock_intent(&b->c.lock));
+ BUG_ON(!six_trylock_write(&b->c.lock));
got_node:
- if (b)
- list_del_init(&b->list);
- mutex_unlock(&bc->lock);
- if (!b) {
- b = __btree_node_mem_alloc(c);
- if (!b)
- goto err;
+ /*
+ * btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there's any freed nodes there:
+ */
+ list_for_each_entry(b2, &bc->freeable, list)
+ if (!btree_node_reclaim(c, b2)) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ goto got_mem;
+ }
- BUG_ON(!six_trylock_intent(&b->c.lock));
- BUG_ON(!six_trylock_write(&b->c.lock));
- }
+ mutex_unlock(&bc->lock);
- if (!b->data) {
- if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
- goto err;
+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
+ goto err;
- mutex_lock(&bc->lock);
- bc->used++;
- mutex_unlock(&bc->lock);
- }
+ mutex_lock(&bc->lock);
+ bc->used++;
+got_mem:
+ mutex_unlock(&bc->lock);
BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_dirty(b));
@@ -607,20 +648,24 @@ out:
return b;
err:
mutex_lock(&bc->lock);
-
- if (b) {
- list_add(&b->list, &bc->freed);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- }
-
+err_locked:
/* Try to cannibalize another cached btree node: */
if (bc->alloc_lock == current) {
- b = btree_node_cannibalize(c);
- list_del_init(&b->list);
- mutex_unlock(&bc->lock);
+ b2 = btree_node_cannibalize(c);
+ bch2_btree_node_hash_remove(bc, b2);
+
+ if (b) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ } else {
+ b = b2;
+ list_del_init(&b->list);
+ }
- bch2_btree_node_hash_remove(bc, b);
+ mutex_unlock(&bc->lock);
trace_btree_node_cannibalize(c);
goto out;
@@ -651,11 +696,22 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
* been freed:
*/
if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
+ trace_trans_restart_relock_parent_for_fill(trans->fn,
+ _THIS_IP_, btree_id, &path->pos);
+ btree_trans_restart(trans);
+ return ERR_PTR(-EINTR);
+ }
+
+ b = bch2_btree_node_mem_alloc(c, level != 0);
+
+ if (trans && b == ERR_PTR(-ENOMEM)) {
+ trans->memory_allocation_failure = true;
+ trace_trans_restart_memory_allocation_failure(trans->fn,
+ _THIS_IP_, btree_id, &path->pos);
btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
- b = bch2_btree_node_mem_alloc(c);
if (IS_ERR(b))
return b;
@@ -698,6 +754,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
}
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
+ trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
+ btree_id, &path->pos);
btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
@@ -715,14 +773,16 @@ static int lock_node_check_fn(struct six_lock *lock, void *p)
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
- char buf1[200], buf2[100], buf3[100];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct printbuf buf3 = PRINTBUF;
if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
return;
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key));
- bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);
- bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&b->key));
+ bch2_bpos_to_text(&buf2, b->data->min_key);
+ bch2_bpos_to_text(&buf3, b->data->max_key);
bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
"btree %s level %u\n"
@@ -730,10 +790,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
"header: btree %s level %llu\n"
"min %s max %s\n",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1,
+ buf1.buf,
bch2_btree_ids[BTREE_NODE_ID(b->data)],
BTREE_NODE_LEVEL(b->data),
- buf2, buf3);
+ buf2.buf, buf3.buf);
+
+ printbuf_exit(&buf3);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
}
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
@@ -843,7 +907,7 @@ lock_node:
if (bch2_btree_node_relock(trans, path, level + 1))
goto retry;
- trace_trans_restart_btree_node_reused(trans->ip,
+ trace_trans_restart_btree_node_reused(trans->fn,
trace_ip,
path->btree_id,
&path->pos);
@@ -1025,7 +1089,7 @@ wait_on_io:
six_lock_write(&b->c.lock, NULL, NULL);
if (btree_node_dirty(b)) {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, 0);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index f7e10986f317..25906127c023 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -7,6 +7,8 @@
extern struct lock_class_key bch2_btree_node_lock_key;
+extern const char * const bch2_btree_node_flags[];
+
struct btree_iter;
void bch2_recalc_btree_reserve(struct bch_fs *);
@@ -20,7 +22,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool);
struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index d1883701afc3..e19991796c82 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -9,6 +9,7 @@
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
@@ -69,23 +70,23 @@ static int bch2_gc_check_topology(struct bch_fs *c,
struct bpos expected_start = bkey_deleted(&prev->k->k)
? node_start
: bpos_successor(prev->k->k.p);
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
- if (bkey_deleted(&prev->k->k)) {
- struct printbuf out = PBUF(buf1);
- pr_buf(&out, "start of node: ");
- bch2_bpos_to_text(&out, node_start);
- } else {
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
- }
-
if (bpos_cmp(expected_start, bp->v.min_key)) {
bch2_topology_error(c);
+ if (bkey_deleted(&prev->k->k)) {
+ pr_buf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, node_start);
+ } else {
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
+ }
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
+
if (__fsck_err(c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
@@ -94,11 +95,11 @@ static int bch2_gc_check_topology(struct bch_fs *c,
" prev %s\n"
" cur %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1,
- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) &&
+ buf1.buf, buf2.buf) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
- return FSCK_ERR_START_TOPOLOGY_REPAIR;
+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
+ goto err;
} else {
set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
}
@@ -108,6 +109,12 @@ static int bch2_gc_check_topology(struct bch_fs *c,
if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
bch2_topology_error(c);
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
+ bch2_bpos_to_text(&buf2, node_end);
+
if (__fsck_err(c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
@@ -116,18 +123,21 @@ static int bch2_gc_check_topology(struct bch_fs *c,
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) &&
+ buf1.buf, buf2.buf) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
- return FSCK_ERR_START_TOPOLOGY_REPAIR;
+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
+ goto err;
} else {
set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
}
}
bch2_bkey_buf_copy(prev, c, cur.k);
+err:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
@@ -155,6 +165,34 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
}
}
+static void bch2_btree_node_update_key_early(struct bch_fs *c,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new)
+{
+ struct btree *b;
+ struct bkey_buf tmp;
+ int ret;
+
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_reassemble(&tmp, c, old);
+
+ b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true);
+ if (!IS_ERR_OR_NULL(b)) {
+ mutex_lock(&c->btree_cache.lock);
+
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&b->c.lock);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
struct bkey_i_btree_ptr_v2 *new;
@@ -169,11 +207,11 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
new->v.min_key = new_min;
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
- ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- kfree(new);
-
- if (ret)
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
return ret;
+ }
bch2_btree_node_drop_keys_outside_node(b);
@@ -198,11 +236,11 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
new->k.p = new_max;
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
- ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- kfree(new);
-
- if (ret)
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
return ret;
+ }
bch2_btree_node_drop_keys_outside_node(b);
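
/*
 * set_node_min()/set_node_max() above switch from bch2_journal_key_insert(),
 * after which the old code unconditionally kfree'd the key, to
 * bch2_journal_key_insert_take(), which (as the new error handling suggests)
 * takes ownership of the key on success, so the caller frees only on error.
 * A standalone sketch of that ownership convention; the helper below is
 * hypothetical and only mirrors the calling pattern:
 */
#include <stdlib.h>

struct key { int v; };

/* takes ownership of k on success (returns 0); here we simply consume it */
static int insert_take(struct key *k)
{
	free(k);
	return 0;
}

static int update_key(int v)
{
	struct key *new = malloc(sizeof(*new));
	int ret;

	if (!new)
		return -1;
	new->v = v;

	ret = insert_take(new);
	if (ret) {
		free(new);	/* still ours only if the insert failed */
		return ret;
	}
	/* on success the callee owns 'new'; do not free it here */
	return 0;
}

int main(void) { return update_key(42); }
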
@@ -222,18 +260,17 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
if (!prev) {
- struct printbuf out = PBUF(buf1);
- pr_buf(&out, "start of node: ");
- bch2_bpos_to_text(&out, b->data->min_key);
+ pr_buf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, b->data->min_key);
} else {
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
}
- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key));
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
if (prev &&
bpos_cmp(expected_start, cur->data->min_key) > 0 &&
@@ -246,8 +283,10 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
" node %s\n"
" next %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
- return DROP_PREV_NODE;
+ buf1.buf, buf2.buf)) {
+ ret = DROP_PREV_NODE;
+ goto out;
+ }
if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p,
bpos_predecessor(cur->data->min_key)), c,
@@ -255,7 +294,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
" node %s\n"
" next %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
+ buf1.buf, buf2.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
} else {
@@ -267,39 +306,49 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
" prev %s\n"
" node %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
- return DROP_THIS_NODE;
+ buf1.buf, buf2.buf)) {
+ ret = DROP_THIS_NODE;
+ goto out;
+ }
if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
+ buf1.buf, buf2.buf))
ret = set_node_min(c, cur, expected_start);
}
+out:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
struct btree *child)
{
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
+ bch2_bpos_to_text(&buf2, b->key.k.p);
+
if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) {
+ buf1.buf, buf2.buf)) {
ret = set_node_max(c, child, b->key.k.p);
if (ret)
- return ret;
+ goto err;
}
+err:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
@@ -310,7 +359,7 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
bool have_child, dropped_children = false;
- char buf[200];
+ struct printbuf buf;
int ret = 0;
if (!b->c.level)
@@ -334,12 +383,15 @@ again:
false);
ret = PTR_ERR_OR_ZERO(cur);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
+
if (mustfix_fsck_err_on(ret == -EIO, c,
"Unreadable btree node at btree %s level %u:\n"
" %s",
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) {
+ buf.buf)) {
bch2_btree_node_evict(c, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
@@ -439,12 +491,14 @@ again:
have_child = true;
}
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
if (mustfix_fsck_err_on(!have_child, c,
"empty interior btree node at btree %s level %u\n"
" %s",
bch2_btree_ids[b->c.btree_id],
- b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf)))
+ b->c.level, buf.buf))
ret = DROP_THIS_NODE;
err:
fsck_err:
@@ -460,6 +514,7 @@ fsck_err:
if (!ret && dropped_children)
goto again;
+ printbuf_exit(&buf);
return ret;
}
@@ -495,7 +550,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
const union bch_extent_entry *entry;
struct extent_ptr_decoded p = { 0 };
bool do_update = false;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
/*
@@ -505,7 +560,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- struct bucket *g2 = PTR_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
if (fsck_err_on(!g->gen_valid, c,
@@ -514,83 +568,72 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
p.ptr.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
} else {
do_update = true;
}
}
- if (fsck_err_on(data_type == BCH_DATA_btree &&
- g->mark.gen != p.ptr.gen, c,
- "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen, g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
- g2->_mark.data_type = g->_mark.data_type = data_type;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
- }
-
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen, g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- g2->_mark.data_type = 0;
- g2->_mark.dirty_sectors = 0;
- g2->_mark.cached_sectors = 0;
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
} else {
do_update = true;
}
}
- if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
p.ptr.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (fsck_err_on(!p.ptr.cached &&
- gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
+ gen_cmp(p.ptr.gen, g->gen) < 0, c,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen, g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
- if (p.ptr.gen != g->mark.gen)
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
continue;
- if (fsck_err_on(g->mark.data_type &&
- g->mark.data_type != data_type, c,
+ if (fsck_err_on(g->data_type &&
+ g->data_type != data_type, c,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->mark.data_type],
+ bch2_data_types[g->data_type],
bch2_data_types[data_type],
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
- g2->_mark.data_type = g->_mark.data_type = data_type;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->data_type = data_type;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
} else {
do_update = true;
}
@@ -603,14 +646,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
"pointer to nonexistent stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c,
"pointer does not match stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
}
}
@@ -623,13 +668,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
if (is_root) {
bch_err(c, "cannot update btree roots yet");
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
bch_err(c, "%s: error allocating new key", __func__);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err;
}
bkey_reassemble(new, *k);
@@ -645,7 +692,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- ptr->gen = g->mark.gen;
+ ptr->gen = g->gen;
}
} else {
bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
@@ -654,12 +701,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
(ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
(!ptr->cached &&
- gen_cmp(ptr->gen, g->mark.gen) < 0) ||
- gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
- (g->mark.data_type &&
- g->mark.data_type != data_type);
+ gen_cmp(ptr->gen, g->gen) < 0) ||
+ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type);
}));
again:
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
@@ -690,13 +737,28 @@ found:
}
}
- ret = bch2_journal_key_insert(c, btree_id, level, new);
- kfree(new);
+ ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+ if (ret) {
+ kfree(new);
+ goto err;
+ }
+
+ if (level)
+ bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
- if (!ret)
- *k = bkey_i_to_s_c(new);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, *k);
+ bch_info(c, "updated %s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+
+ *k = bkey_i_to_s_c(new);
}
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
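
/*
 * The checks above compare 8-bit bucket/pointer generation numbers with
 * gen_cmp()/gen_after(), which are wrapping comparisons (the real definitions
 * live elsewhere in the bcachefs tree; the ones below are an approximation
 * written for this standalone demo, not copied from it).
 */
#include <assert.h>
#include <stdint.h>

static int gen_cmp(uint8_t a, uint8_t b)
{
	return (int8_t) (a - b);	/* signed 8-bit difference handles wraparound */
}

static int gen_after(uint8_t a, uint8_t b)
{
	int r = gen_cmp(a, b);

	return r > 0 ? r : 0;
}

int main(void)
{
	assert(gen_cmp(1, 255) > 0);	/* gen 1 is "after" 255 once the counter wraps */
	assert(gen_after(1, 255) == 2);
	assert(gen_cmp(200, 10) < 0);	/* a ptr gen far behind the bucket gen is stale */
	return 0;
}
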
@@ -705,11 +767,9 @@ fsck_err:
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
unsigned level, bool is_root,
struct bkey_s_c *k,
- u8 *max_stale, bool initial)
+ bool initial)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
unsigned flags =
@@ -721,7 +781,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
- k->k->version.lo > journal_cur_seq(&c->journal));
+ k->k->version.lo > atomic64_read(&c->journal.seq));
ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
if (ret)
@@ -734,18 +794,8 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
atomic64_set(&c->key_version, k->k->version.lo);
}
- ptrs = bch2_bkey_ptrs_c(*k);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- if (gen_after(g->oldest_gen, ptr->gen))
- g->oldest_gen = ptr->gen;
-
- *max_stale = max(*max_stale, ptr_stale(ca, ptr));
- }
-
- ret = bch2_mark_key(trans, old, *k, flags);
+ ret = __bch2_trans_do(trans, NULL, NULL, 0,
+ bch2_mark_key(trans, old, *k, flags));
fsck_err:
err:
if (ret)
@@ -753,8 +803,7 @@ err:
return ret;
}
-static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale,
- bool initial)
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
struct bch_fs *c = trans->c;
struct btree_node_iter iter;
@@ -763,8 +812,6 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma
struct bkey_buf prev, cur;
int ret = 0;
- *max_stale = 0;
-
if (!btree_node_type_needs_gc(btree_node_type(b)))
return 0;
@@ -775,7 +822,7 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
- &k, max_stale, initial);
+ &k, initial);
if (ret)
break;
@@ -806,7 +853,6 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
: bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
- u8 max_stale = 0;
int ret = 0;
gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
@@ -817,21 +863,9 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
gc_pos_set(c, gc_pos_btree_node(b));
- ret = btree_gc_mark_node(trans, b, &max_stale, initial);
+ ret = btree_gc_mark_node(trans, b, initial);
if (ret)
break;
-
- if (!initial) {
- if (max_stale > 64)
- bch2_btree_node_rewrite(trans, &iter, b,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_GC_LOCK_HELD);
- else if (!bch2_btree_gc_rewrite_disabled &&
- (bch2_btree_gc_always_rewrite || max_stale > 16))
- bch2_btree_node_rewrite(trans, &iter,
- b, BTREE_INSERT_NOWAIT|
- BTREE_INSERT_GC_LOCK_HELD);
- }
}
bch2_trans_iter_exit(trans, &iter);
@@ -843,8 +877,8 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
if (!btree_node_fake(b)) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
- &k, &max_stale, initial);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+ true, &k, initial);
}
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock);
@@ -859,8 +893,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- u8 max_stale = 0;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
@@ -872,8 +905,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
- &k, &max_stale, true);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+ false, &k, true);
if (ret) {
bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
goto fsck_err;
@@ -921,7 +954,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
" %s",
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) &&
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
bch_info(c, "Halting mark and sweep to start topology repair pass");
@@ -951,6 +985,7 @@ fsck_err:
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
bch2_btree_and_journal_iter_exit(&iter);
+ printbuf_exit(&buf);
return ret;
}
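
/*
 * bch2_gc_mark_key() above now goes through __bch2_trans_do() instead of
 * calling bch2_mark_key() directly. Roughly, that wrapper re-runs the
 * operation (plus commit) whenever the transaction is restarted with -EINTR,
 * the same retry that bch2_gc_btree_gens() further down does by hand on
 * commit_err. A standalone approximation of that retry shape (not the real
 * macro):
 */
#include <errno.h>
#include <stdio.h>

static int attempts;

static int mark_one_key(void)
{
	/* pretend the first two attempts race with a transaction restart */
	return ++attempts <= 2 ? -EINTR : 0;
}

static int trans_do(int (*op)(void))
{
	int ret;

	do {
		/* a real transaction would be reset (bch2_trans_begin) here */
		ret = op();
		/* ...and committed (bch2_trans_commit) when op succeeds */
	} while (ret == -EINTR);

	return ret;
}

int main(void)
{
	printf("ret %d after %d attempts\n", trans_do(mark_one_key), attempts);
	return 0;
}
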
@@ -964,8 +999,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
: bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
- u8 max_stale = 0;
- char buf[100];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
b = c->btree_roots[btree_id].b;
@@ -974,17 +1008,19 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
return 0;
six_lock_read(&b->c.lock, NULL, NULL);
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
- "btree root with incorrect min_key: %s",
- (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
+ "btree root with incorrect min_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
ret = FSCK_ERR_EXIT;
goto fsck_err;
}
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c,
- "btree root with incorrect max_key: %s",
- (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
+ "btree root with incorrect max_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
ret = FSCK_ERR_EXIT;
goto fsck_err;
@@ -997,13 +1033,14 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
- &k, &max_stale, true);
+ &k, true);
}
fsck_err:
six_unlock_read(&b->c.lock);
if (ret < 0)
bch_err(c, "%s: ret %i", __func__, ret);
+ printbuf_exit(&buf);
return ret;
}
@@ -1022,6 +1059,9 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
bch2_trans_init(&trans, c, 0, 0);
+ if (initial)
+ trans.is_initial_gc = true;
+
for (i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
@@ -1123,10 +1163,10 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->gc_stripes);
for_each_member_device(ca, c, i) {
- kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
- ca->buckets[1] = NULL;
+ ca->buckets_gc = NULL;
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
@@ -1140,18 +1180,20 @@ static int bch2_gc_done(struct bch_fs *c,
bool initial, bool metadata_only)
{
struct bch_dev *ca = NULL;
+ struct printbuf buf = PRINTBUF;
bool verify = !metadata_only && (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
unsigned i, dev;
int ret = 0;
+ percpu_down_write(&c->mark_lock);
+
#define copy_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
if (verify) \
fsck_err(c, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
@@ -1161,18 +1203,6 @@ static int bch2_gc_done(struct bch_fs *c,
iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
- }
-#define copy_bucket_field(_f) \
- if (dst->b[b]._f != src->b[b]._f) { \
- if (verify) \
- fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", dev, b, \
- dst->b[b].mark.gen, \
- bch2_data_types[dst->b[b].mark.data_type],\
- dst->b[b]._f, src->b[b]._f); \
- dst->b[b]._f = src->b[b]._f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
@@ -1183,36 +1213,18 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
- struct bucket_array *dst = __bucket_array(ca, 0);
- struct bucket_array *src = __bucket_array(ca, 1);
- size_t b;
-
- for (b = 0; b < src->nbuckets; b++) {
- copy_bucket_field(_mark.gen);
- copy_bucket_field(_mark.data_type);
- copy_bucket_field(_mark.stripe);
- copy_bucket_field(_mark.dirty_sectors);
- copy_bucket_field(_mark.cached_sectors);
- copy_bucket_field(stripe_redundancy);
- copy_bucket_field(stripe);
-
- dst->b[b].oldest_gen = src->b[b].oldest_gen;
- }
-
- {
- struct bch_dev_usage *dst = ca->usage_base;
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage_gc,
- dev_usage_u64s());
-
- copy_dev_field(buckets_ec, "buckets_ec");
- copy_dev_field(buckets_unavailable, "buckets_unavailable");
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
- copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
- copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
- }
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
+
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
};
@@ -1239,22 +1251,21 @@ static int bch2_gc_done(struct bch_fs *c,
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- char buf[80];
if (metadata_only &&
(e->data_type == BCH_DATA_user ||
e->data_type == BCH_DATA_cached))
continue;
- bch2_replicas_entry_to_text(&PBUF(buf), e);
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, e);
- copy_fs_field(replicas[i], "%s", buf);
+ copy_fs_field(replicas[i], "%s", buf.buf);
}
}
#undef copy_fs_field
#undef copy_dev_field
-#undef copy_bucket_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
@@ -1262,6 +1273,9 @@ fsck_err:
percpu_ref_put(&ca->ref);
if (ret)
bch_err(c, "%s: ret %i", __func__, ret);
+
+ percpu_up_write(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
@@ -1281,18 +1295,9 @@ static int bch2_gc_start(struct bch_fs *c,
}
for_each_member_device(ca, c, i) {
- BUG_ON(ca->buckets[1]);
+ BUG_ON(ca->buckets_gc);
BUG_ON(ca->usage_gc);
- ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!ca->buckets[1]) {
- percpu_ref_put(&ca->ref);
- bch_err(c, "error allocating ca->buckets[gc]");
- return -ENOMEM;
- }
-
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
@@ -1301,103 +1306,215 @@ static int bch2_gc_start(struct bch_fs *c,
}
}
- percpu_down_write(&c->mark_lock);
+ return 0;
+}
- /*
- * indicate to stripe code that we need to allocate for the gc stripes
- * radix tree, too
- */
- gc_pos_set(c, gc_phase(GC_PHASE_START));
+/* returns true if not equal */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+ struct bch_alloc_v4 r)
+{
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type ||
+ l.dirty_sectors != r.dirty_sectors ||
+ l.cached_sectors != r.cached_sectors ||
+ l.stripe_redundancy != r.stripe_redundancy ||
+ l.stripe != r.stripe;
+}
- for_each_member_device(ca, c, i) {
- struct bucket_array *dst = __bucket_array(ca, 1);
- struct bucket_array *src = __bucket_array(ca, 0);
- size_t b;
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+ struct bucket gc;
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ struct bch_alloc_v4 old, new;
+ int ret;
- dst->first_bucket = src->first_bucket;
- dst->nbuckets = src->nbuckets;
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
- for (b = 0; b < src->nbuckets; b++) {
- struct bucket *d = &dst->b[b];
- struct bucket *s = &src->b[b];
+ bch2_alloc_to_v4(k, &old);
+ new = old;
- d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
- d->gen_valid = s->gen_valid;
+ percpu_down_read(&c->mark_lock);
+ gc = *gc_bucket(ca, iter->pos.offset);
+ percpu_up_read(&c->mark_lock);
- if (metadata_only &&
- (s->mark.data_type == BCH_DATA_user ||
- s->mark.data_type == BCH_DATA_cached))
- d->_mark = s->mark;
- }
- };
+ if (metadata_only &&
+ gc.data_type != BCH_DATA_sb &&
+ gc.data_type != BCH_DATA_journal &&
+ gc.data_type != BCH_DATA_btree)
+ return 0;
- percpu_up_write(&c->mark_lock);
+ if (gen_after(old.gen, gc.gen))
+ return 0;
- return 0;
+#define copy_bucket_field(_f) \
+ if (fsck_err_on(new._f != gc._f, c, \
+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \
+ ": got %u, should be %u", \
+ iter->pos.inode, iter->pos.offset, \
+ gc.gen, \
+ bch2_data_types[gc.data_type], \
+ new._f, gc._f)) \
+ new._f = gc._f; \
+
+ copy_bucket_field(gen);
+ copy_bucket_field(data_type);
+ copy_bucket_field(dirty_sectors);
+ copy_bucket_field(cached_sectors);
+ copy_bucket_field(stripe_redundancy);
+ copy_bucket_field(stripe);
+#undef copy_bucket_field
+
+ if (!bch2_alloc_v4_cmp(old, new))
+ return 0;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ a->v = new;
+
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
+fsck_err:
+ return ret;
}
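
/*
 * bch2_alloc_write_key() above compares the in-memory gc bucket against the
 * on-disk alloc state one field at a time via a local copy_bucket_field()
 * macro. A simplified, standalone illustration of that compare-and-repair
 * macro pattern (struct and field names here are invented for the demo):
 */
#include <stdio.h>

struct bucket_demo {
	unsigned	gen;
	unsigned	dirty_sectors;
	unsigned	cached_sectors;
};

int main(void)
{
	struct bucket_demo disk = { .gen = 3, .dirty_sectors = 8, .cached_sectors = 0 };
	struct bucket_demo gc   = { .gen = 4, .dirty_sectors = 8, .cached_sectors = 2 };
	int fixed = 0;

#define copy_bucket_field(_f)						\
	if (disk._f != gc._f) {						\
		printf("wrong " #_f ": got %u, should be %u\n",		\
		       disk._f, gc._f);					\
		disk._f = gc._f;					\
		fixed++;						\
	}

	copy_bucket_field(gen);
	copy_bucket_field(dirty_sectors);
	copy_bucket_field(cached_sectors);
#undef copy_bucket_field

	printf("%d fields repaired\n", fixed);
	return 0;
}
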
-static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
+static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- const __le64 *refcount = bkey_refcount_c(k);
- char buf[200];
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ unsigned i;
int ret = 0;
- if (!refcount)
- return 0;
+ bch2_trans_init(&trans, c, 0, 0);
- r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
- if (!r)
- return -ENOMEM;
+ for_each_member_device(ca, c, i) {
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
+
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW,
+ bch2_alloc_write_key(&trans, &iter,
+ metadata_only));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
+ if (ret) {
+ bch_err(c, "error writing alloc info: %i", ret);
+ percpu_ref_put(&ca->ref);
+ break;
+ }
}
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- r->refcount)) {
- struct bkey_i *new;
+ bch2_trans_exit(&trans);
+ return ret;
+}
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- goto fsck_err;
+static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
+{
+ struct bch_dev *ca;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bucket *g;
+ struct bch_alloc_v4 a;
+ unsigned i;
+ int ret;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!buckets) {
+ percpu_ref_put(&ca->ref);
+ bch_err(c, "error allocating ca->buckets[gc]");
+ return -ENOMEM;
}
- bkey_reassemble(new, k);
+ buckets->first_bucket = ca->mi.first_bucket;
+ buckets->nbuckets = ca->mi.nbuckets;
+ rcu_assign_pointer(ca->buckets_gc, buckets);
+ };
- if (!r->refcount) {
- new->k.type = KEY_TYPE_deleted;
- new->k.size = 0;
- } else {
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
- }
+ bch2_trans_init(&trans, c, 0, 0);
- ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
- kfree(new);
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = gc_bucket(ca, k.k->p.offset);
+
+ bch2_alloc_to_v4(k, &a);
+
+ g->gen_valid = 1;
+ g->gen = a.gen;
+
+ if (metadata_only &&
+ (a.data_type == BCH_DATA_user ||
+ a.data_type == BCH_DATA_cached ||
+ a.data_type == BCH_DATA_parity)) {
+ g->data_type = a.data_type;
+ g->dirty_sectors = a.dirty_sectors;
+ g->cached_sectors = a.cached_sectors;
+ g->stripe = a.stripe;
+ g->stripe_redundancy = a.stripe_redundancy;
+ }
}
-fsck_err:
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+
+ if (ret)
+ bch_err(c, "error reading alloc info at gc start: %i", ret);
+
return ret;
}
-static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
- bool metadata_only)
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = gc_bucket_array(ca);
+ struct bucket *g;
+
+ for_each_bucket(g, buckets) {
+ if (metadata_only &&
+ (g->data_type == BCH_DATA_user ||
+ g->data_type == BCH_DATA_cached ||
+ g->data_type == BCH_DATA_parity))
+ continue;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ }
+ };
+}
+
+static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct reflink_gc *r;
size_t idx = 0;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
if (metadata_only)
@@ -1405,14 +1522,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bch2_trans_init(&trans, c, 0, 0);
- if (initial) {
- c->reflink_gc_idx = 0;
-
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
- bch2_gc_reflink_done_initial_fn);
- goto out;
- }
-
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
const __le64 *refcount = bkey_refcount_c(k);
@@ -1420,7 +1529,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
if (!refcount)
continue;
- r = genradix_ptr(&c->reflink_gc_table, idx);
+ r = genradix_ptr(&c->reflink_gc_table, idx++);
if (!r ||
r->offset != k.k->p.offset ||
r->size != k.k->size) {
@@ -1433,7 +1542,8 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
"reflink key has wrong refcount:\n"
" %s\n"
" should be %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf),
r->refcount)) {
struct bkey_i *new;
@@ -1451,7 +1561,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
*bkey_refcount(new) = cpu_to_le64(r->refcount);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
+ __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
kfree(new);
if (ret)
@@ -1460,149 +1570,128 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
}
fsck_err:
bch2_trans_iter_exit(&trans, &iter);
-out:
c->reflink_gc_nr = 0;
bch2_trans_exit(&trans);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
+static int bch2_gc_reflink_start(struct bch_fs *c,
+ bool metadata_only)
{
- struct bch_fs *c = trans->c;
- struct gc_stripe *m;
- const struct bch_stripe *s;
- char buf[200];
- unsigned i;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct reflink_gc *r;
int ret = 0;
- if (k.k->type != KEY_TYPE_stripe)
+ if (metadata_only)
return 0;
- s = bkey_s_c_to_stripe(k).v;
+ bch2_trans_init(&trans, c, 0, 0);
+ c->reflink_gc_nr = 0;
- m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ const __le64 *refcount = bkey_refcount_c(k);
- for (i = 0; i < s->nr_blocks; i++)
- if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
- goto inconsistent;
- return 0;
-inconsistent:
- if (fsck_err_on(true, c,
- "stripe has wrong block sector count %u:\n"
- " %s\n"
- " should be %u", i,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- m ? m->block_sectors[i] : 0)) {
- struct bkey_i_stripe *new;
+ if (!refcount)
+ continue;
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
+ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+ GFP_KERNEL);
+ if (!r) {
ret = -ENOMEM;
- goto fsck_err;
+ break;
}
- bkey_reassemble(&new->k_i, k);
-
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
-
- ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i);
- kfree(new);
- }
-fsck_err:
- return ret;
-}
-
-static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
- bool metadata_only)
-{
- struct btree_trans trans;
- int ret = 0;
-
- if (metadata_only)
- return 0;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- if (initial) {
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
- bch2_gc_stripes_done_initial_fn);
- } else {
- BUG();
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
}
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
-static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
{
-
- struct bch_fs *c = trans->c;
+ struct genradix_iter iter;
struct reflink_gc *r;
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- return 0;
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- return 0;
+ genradix_for_each(&c->reflink_gc_table, iter, r)
+ r->refcount = 0;
}
-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
- bool metadata_only)
+static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct reflink_gc *r;
+ struct gc_stripe *m;
+ const struct bch_stripe *s;
+ struct printbuf buf = PRINTBUF;
+ unsigned i;
int ret = 0;
if (metadata_only)
return 0;
bch2_trans_init(&trans, c, 0, 0);
- c->reflink_gc_nr = 0;
- if (initial) {
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
- bch2_gc_reflink_start_initial_fn);
- goto out;
- }
-
- for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
+ if (k.k->type != KEY_TYPE_stripe)
continue;
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r) {
- ret = -ENOMEM;
- break;
- }
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
+ for (i = 0; i < s->nr_blocks; i++)
+ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+ goto inconsistent;
+ continue;
+inconsistent:
+ if (fsck_err_on(true, c,
+ "stripe has wrong block sector count %u:\n"
+ " %s\n"
+ " should be %u", i,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ m ? m->block_sectors[i] : 0)) {
+ struct bkey_i_stripe *new;
+
+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ bkey_reassemble(&new->k_i, k);
+
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+					__bch2_btree_insert(&trans, BTREE_ID_stripes, &new->k_i));
+ kfree(new);
+ }
}
+fsck_err:
bch2_trans_iter_exit(&trans, &iter);
-out:
+
bch2_trans_exit(&trans);
+
+ printbuf_exit(&buf);
return ret;
}
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
+{
+ genradix_free(&c->gc_stripes);
+}
+
/**
* bch2_gc - walk _all_ references to buckets, and recompute them:
*
@@ -1623,9 +1712,8 @@ out:
*/
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
- struct bch_dev *ca;
u64 start_time = local_clock();
- unsigned i, iter = 0;
+ unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
@@ -1636,11 +1724,14 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
/* flush interior btree updates: */
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
-again:
+
ret = bch2_gc_start(c, metadata_only) ?:
- bch2_gc_reflink_start(c, initial, metadata_only);
+ bch2_gc_alloc_start(c, metadata_only) ?:
+ bch2_gc_reflink_start(c, metadata_only);
if (ret)
goto out;
+again:
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
bch2_mark_superblocks(c);
@@ -1678,40 +1769,40 @@ again:
if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
(!iter && bch2_test_restart_gc)) {
+ if (iter++ > 2) {
+ bch_info(c, "Unable to fix bucket gens, looping");
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* XXX: make sure gens we fixed got saved
*/
- if (iter++ <= 2) {
- bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
-
- percpu_down_write(&c->mark_lock);
- bch2_gc_free(c);
- percpu_up_write(&c->mark_lock);
- /* flush fsck errors, reset counters */
- bch2_flush_fsck_errs(c);
+ bch_info(c, "Second GC pass needed, restarting:");
+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
- goto again;
- }
+ bch2_gc_stripes_reset(c, metadata_only);
+ bch2_gc_alloc_reset(c, metadata_only);
+ bch2_gc_reflink_reset(c, metadata_only);
- bch_info(c, "Unable to fix bucket gens, looping");
- ret = -EINVAL;
+ /* flush fsck errors, reset counters */
+ bch2_flush_fsck_errs(c);
+ goto again;
}
out:
if (!ret) {
bch2_journal_block(&c->journal);
- percpu_down_write(&c->mark_lock);
- ret = bch2_gc_reflink_done(c, initial, metadata_only) ?:
- bch2_gc_stripes_done(c, initial, metadata_only) ?:
+ ret = bch2_gc_stripes_done(c, metadata_only) ?:
+ bch2_gc_reflink_done(c, metadata_only) ?:
+ bch2_gc_alloc_done(c, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal);
- } else {
- percpu_down_write(&c->mark_lock);
}
+ percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
@@ -1724,13 +1815,6 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
/*
- * Wake up allocator in case it was waiting for buckets
- * because of not being able to inc gens
- */
- for_each_member_device(ca, c, i)
- bch2_wake_allocator(ca);
-
- /*
* At startup, allocations can happen directly instead of via the
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
@@ -1746,9 +1830,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
percpu_down_read(&c->mark_lock);
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr);
- if (gen_after(g->mark.gen, ptr->gen) > 16) {
+ if (ptr_stale(ca, ptr) > 16) {
percpu_up_read(&c->mark_lock);
return true;
}
@@ -1756,10 +1839,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr);
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(g->gc_gen, ptr->gen))
- g->gc_gen = ptr->gen;
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
}
percpu_up_read(&c->mark_lock);
@@ -1770,23 +1853,22 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
* For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
* node pointers currently never have cached pointers that can become stale:
*/
-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
+static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id)
{
- struct btree_trans trans;
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf sk;
int ret = 0, commit_err = 0;
bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
BTREE_ITER_PREFETCH|
BTREE_ITER_NOT_EXTENTS|
BTREE_ITER_ALL_SNAPSHOTS);
- while ((bch2_trans_begin(&trans),
+ while ((bch2_trans_begin(trans),
k = bch2_btree_iter_peek(&iter)).k) {
ret = bkey_err(k);
@@ -1802,10 +1884,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
commit_err =
- bch2_trans_update(&trans, &iter, sk.k, 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_NOFAIL);
+ bch2_trans_update(trans, &iter, sk.k, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOWAIT|
+ BTREE_INSERT_NOFAIL);
if (commit_err == -EINTR) {
commit_err = 0;
continue;
@@ -1814,20 +1896,48 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_btree_iter_advance(&iter);
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
- bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a;
+ struct bkey_i_alloc_v4 *a_mut;
+ int ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (a.oldest_gen == ca->oldest_gen[iter->pos.offset])
+ return 0;
+
+ a_mut = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ return ret;
+
+ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
+
+ return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
+}
+
int bch2_gc_gens(struct bch_fs *c)
{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket *g;
- u64 start_time = local_clock();
+ u64 b, start_time = local_clock();
unsigned i;
int ret;
@@ -1836,36 +1946,53 @@ int bch2_gc_gens(struct bch_fs *c)
* introduces a deadlock in the RO path - we currently take the state
* lock at the start of going RO, thus the gc thread may get stuck:
*/
+ if (!mutex_trylock(&c->gc_gens_lock))
+ return 0;
+
down_read(&c->gc_lock);
+ bch2_trans_init(&trans, c, 0, 0);
for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
+ struct bucket_gens *gens;
+
+ BUG_ON(ca->oldest_gen);
+
+ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+ if (!ca->oldest_gen) {
+ percpu_ref_put(&ca->ref);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ gens = bucket_gens(ca);
- for_each_bucket(g, buckets)
- g->gc_gen = g->mark.gen;
- up_read(&ca->bucket_lock);
+ for (b = gens->first_bucket;
+ b < gens->nbuckets; b++)
+ ca->oldest_gen[b] = gens->b[b];
}
for (i = 0; i < BTREE_ID_NR; i++)
if ((1 << i) & BTREE_ID_HAS_PTRS) {
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = bch2_gc_btree_gens(c, i);
+ ret = bch2_gc_btree_gens(&trans, i);
if (ret) {
bch_err(c, "error recalculating oldest_gen: %i", ret);
goto err;
}
}
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->oldest_gen = g->gc_gen;
- up_read(&ca->bucket_lock);
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_alloc_write_oldest_gen(&trans, &iter));
+ if (ret) {
+ bch_err(c, "error writing oldest_gen: %i", ret);
+ break;
+ }
}
+ bch2_trans_iter_exit(&trans, &iter);
c->gc_gens_btree = 0;
c->gc_gens_pos = POS_MIN;
@@ -1874,7 +2001,14 @@ int bch2_gc_gens(struct bch_fs *c)
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
err:
+ for_each_member_device(ca, c, i) {
+ kvfree(ca->oldest_gen);
+ ca->oldest_gen = NULL;
+ }
+
+ bch2_trans_exit(&trans);
up_read(&c->gc_lock);
+ mutex_unlock(&c->gc_gens_lock);
return ret;
}
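
/*
 * bch2_gc_gens() above now computes per-bucket oldest generations in a plain
 * u8 array (ca->oldest_gen), seeded from the current bucket gens, lowered to
 * the oldest pointer gen seen while walking keys, then written back through
 * the alloc btree. A standalone sketch of that "take the minimum gen" step
 * (bucket numbers and gens are invented for the demo; gen wraparound, which
 * the real code handles via gen_after(), is ignored here):
 */
#include <stdint.h>
#include <stdio.h>

#define NBUCKETS 4

int main(void)
{
	uint8_t bucket_gen[NBUCKETS] = { 10, 7, 3, 200 };
	uint8_t oldest_gen[NBUCKETS];
	struct { unsigned bucket; uint8_t gen; } ptrs[] = {
		{ 0, 9 }, { 0, 10 }, { 2, 3 }, { 3, 198 },
	};
	unsigned i;

	/* seed with the current bucket gens, like the loop over bucket_gens(ca) */
	for (i = 0; i < NBUCKETS; i++)
		oldest_gen[i] = bucket_gen[i];

	/* lower towards the oldest pointer gen seen, like gc_btree_gens_key() */
	for (i = 0; i < sizeof(ptrs) / sizeof(ptrs[0]); i++)
		if (oldest_gen[ptrs[i].bucket] > ptrs[i].gen)
			oldest_gen[ptrs[i].bucket] = ptrs[i].gen;

	for (i = 0; i < NBUCKETS; i++)
		printf("bucket %u: gen %u oldest_gen %u\n",
		       i, bucket_gen[i], oldest_gen[i]);
	return 0;
}
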
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 1455dc787190..4b880ea59cad 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -477,7 +477,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
};
if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
- bch2_btree_node_write(c, b, SIX_LOCK_write);
+ bch2_btree_node_write(c, b, SIX_LOCK_write, 0);
reinit_iter = true;
}
}
@@ -540,13 +540,7 @@ enum btree_validate_ret {
#define btree_err(type, c, ca, b, i, msg, ...) \
({ \
__label__ out; \
- char _buf[300]; \
- char *_buf2 = _buf; \
- struct printbuf out = PBUF(_buf); \
- \
- _buf2 = kmalloc(4096, GFP_ATOMIC); \
- if (_buf2) \
- out = _PBUF(_buf2, 4986); \
+ struct printbuf out = PRINTBUF; \
\
btree_err_msg(&out, c, ca, b, i, b->written, write); \
pr_buf(&out, ": " msg, ##__VA_ARGS__); \
@@ -554,14 +548,13 @@ enum btree_validate_ret {
if (type == BTREE_ERR_FIXABLE && \
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
- mustfix_fsck_err(c, "%s", _buf2); \
+ mustfix_fsck_err(c, "%s", out.buf); \
goto out; \
} \
\
switch (write) { \
case READ: \
- if (_buf2) \
- bch_err(c, "%s", _buf2); \
+ bch_err(c, "%s", out.buf); \
\
switch (type) { \
case BTREE_ERR_FIXABLE: \
@@ -582,7 +575,7 @@ enum btree_validate_ret {
} \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write: %s", _buf2); \
+ bch_err(c, "corrupt metadata before write: %s", out.buf);\
\
if (bch2_fs_inconsistent(c)) { \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
@@ -591,8 +584,7 @@ enum btree_validate_ret {
break; \
} \
out: \
- if (_buf2 != _buf) \
- kfree(_buf2); \
+ printbuf_exit(&out); \
true; \
})
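
/*
 * btree_err() above is a GNU statement-expression macro: it declares a local
 * __label__, runs the whole report/repair block, and finally evaluates to
 * true, which is what lets btree_err_on() use it directly inside an if ()
 * condition. A minimal standalone illustration of that shape (the names here
 * are invented; only the structure mirrors the macro above):
 */
#include <stdbool.h>
#include <stdio.h>

#define report_err(msg)						\
({								\
	__label__ out;						\
	int fixable = 1;					\
								\
	fprintf(stderr, "error: %s\n", msg);			\
	if (fixable)						\
		goto out;					\
	fprintf(stderr, "giving up\n");				\
out:								\
	true;							\
})

#define report_err_on(cond, msg)				\
	((cond) ? report_err(msg) : false)

int main(void)
{
	if (report_err_on(1 + 1 != 2, "arithmetic broken"))
		return 1;
	if (report_err_on(true, "demo failure"))
		printf("reported, continuing\n");
	return 0;
}
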
@@ -653,8 +645,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
{
unsigned version = le16_to_cpu(i->version);
const char *err;
- char buf1[100];
- char buf2[100];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
int ret = 0;
btree_err_on((version != BCH_BSET_VERSION_OLD &&
@@ -691,7 +683,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BTREE_ERR_FIXABLE, c, ca, b, i,
"bset past end of btree node")) {
i->u64s = 0;
- return 0;
+ ret = 0;
+ goto out;
}
btree_err_on(offset && !i->u64s,
@@ -742,14 +735,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"incorrect min_key: got %s should be %s",
- (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
+ (printbuf_reset(&buf2),
+ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
}
btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect max key %s",
- (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
if (write)
compat_btree_node(b->c.level, b->c.btree_id, version,
@@ -764,7 +760,10 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BSET_BIG_ENDIAN(i), write,
&bn->format);
}
+out:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
@@ -774,6 +773,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
int ret = 0;
@@ -812,11 +813,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
(!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?:
(write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
if (invalid) {
- char buf[160];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
+ printbuf_reset(&buf1);
+ bch2_bkey_val_to_text(&buf1, c, u.s_c);
btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
- "invalid bkey: %s\n%s", invalid, buf);
+ "invalid bkey: %s\n%s", invalid, buf1.buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
@@ -830,18 +830,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
&b->format, k);
if (prev && bkey_iter_cmp(b, prev, k) > 0) {
- char buf1[80];
- char buf2[80];
struct bkey up = bkey_unpack_key(b, prev);
- bch2_bkey_to_text(&PBUF(buf1), &up);
- bch2_bkey_to_text(&PBUF(buf2), u.k);
+ printbuf_reset(&buf1);
+ bch2_bkey_to_text(&buf1, &up);
+ printbuf_reset(&buf2);
+ bch2_bkey_to_text(&buf2, u.k);
bch2_dump_bset(c, b, i, 0);
if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
"keys out of order: %s > %s",
- buf1, buf2)) {
+ buf1.buf, buf2.buf)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
@@ -853,6 +853,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k = bkey_next(k);
}
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
@@ -885,11 +887,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
- "bad magic");
+ "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(b->data->magic));
btree_err_on(!b->data->keys.seq,
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
- "bad btree header");
+ "bad btree header: seq 0");
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
@@ -922,9 +925,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i", ret))
+ goto fsck_err;
- btree_err_on(btree_node_is_extents(b) &&
+ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
BTREE_ERR_FATAL, c, NULL, b, NULL,
"btree node does not have NEW_EXTENT_OVERWRITE set");
@@ -949,7 +955,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i\n", ret))
+ goto fsck_err;
sectors = vstruct_sectors(bne, c->block_bits);
}
@@ -972,19 +981,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
- b->written += sectors;
-
blacklisted = bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(i->journal_seq),
true);
btree_err_on(blacklisted && first,
BTREE_ERR_FIXABLE, c, ca, b, i,
- "first btree node bset has blacklisted journal seq");
+ "first btree node bset has blacklisted journal seq (%llu)",
+ le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
BTREE_ERR_FIXABLE, c, ca, b, i,
- "found blacklisted bset in btree node with sectors_written");
+ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
+ le64_to_cpu(i->journal_seq),
+ b->written, b->written + sectors, ptr_written);
+
+ b->written += sectors;
+
if (blacklisted && !first)
continue;
@@ -1057,11 +1070,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (invalid ||
(bch2_inject_invalid_keys &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
- char buf[160];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
- "invalid bkey %s: %s", buf, invalid);
+ "invalid bkey %s: %s", buf.buf, invalid);
+ printbuf_exit(&buf);
btree_keys_account_key_drop(&b->nr, 0, k);
@@ -1118,8 +1132,7 @@ static void btree_node_read_work(struct work_struct *work)
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
- char buf[200];
- struct printbuf out;
+ struct printbuf buf = PRINTBUF;
bool saw_error = false;
bool can_retry;
@@ -1140,10 +1153,10 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_status = BLK_STS_REMOVED;
}
start:
- out = PBUF(buf);
- btree_pos_to_text(&out, c, b);
+ printbuf_reset(&buf);
+ btree_pos_to_text(&buf, c, b);
bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf);
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
@@ -1169,6 +1182,7 @@ start:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);
+ printbuf_exit(&buf);
if (saw_error && !btree_node_read_error(b))
bch2_btree_node_rewrite_async(c, b);
@@ -1249,6 +1263,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
container_of(cl, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
+ struct printbuf buf = PRINTBUF;
bool dump_bset_maps = false;
bool have_retry = false;
int ret = 0, best = -1, write = READ;
@@ -1292,8 +1307,6 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
fsck_err:
if (dump_bset_maps) {
for (i = 0; i < ra->nr; i++) {
- char buf[200];
- struct printbuf out = PBUF(buf);
struct btree_node *bn = ra->buf[i];
struct btree_node_entry *bne = NULL;
unsigned offset = 0, sectors;
@@ -1302,6 +1315,8 @@ fsck_err:
if (ra->err[i])
continue;
+ printbuf_reset(&buf);
+
while (offset < btree_sectors(c)) {
if (!offset) {
sectors = vstruct_sectors(bn, c->block_bits);
@@ -1312,10 +1327,10 @@ fsck_err:
sectors = vstruct_sectors(bne, c->block_bits);
}
- pr_buf(&out, " %u-%u", offset, offset + sectors);
+ pr_buf(&buf, " %u-%u", offset, offset + sectors);
if (bne && bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq), false))
- pr_buf(&out, "*");
+ pr_buf(&buf, "*");
offset += sectors;
}
@@ -1323,19 +1338,19 @@ fsck_err:
bne = ra->buf[i] + (offset << 9);
if (bne->keys.seq == bn->keys.seq) {
if (!gap)
- pr_buf(&out, " GAP");
+ pr_buf(&buf, " GAP");
gap = true;
sectors = vstruct_sectors(bne, c->block_bits);
- pr_buf(&out, " %u-%u", offset, offset + sectors);
+ pr_buf(&buf, " %u-%u", offset, offset + sectors);
if (bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq), false))
- pr_buf(&out, "*");
+ pr_buf(&buf, "*");
}
offset++;
}
- bch_err(c, "replica %u:%s", i, buf);
+ bch_err(c, "replica %u:%s", i, buf.buf);
}
}
@@ -1356,6 +1371,7 @@ fsck_err:
closure_debug_destroy(&ra->cl);
kfree(ra);
+ printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@@ -1455,23 +1471,23 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret;
- btree_pos_to_text(&PBUF(buf), c, b);
+ btree_pos_to_text(&buf, c, b);
trace_btree_read(c, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
- return;
+ goto out;
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick);
if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from\n"
- " at %s", buf)) {
+ " at %s", buf.buf)) {
set_btree_node_read_error(b);
- return;
+ goto out;
}
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
@@ -1512,6 +1528,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
else
queue_work(c->io_complete_wq, &rb->work);
}
+out:
+ printbuf_exit(&buf);
}
int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
@@ -1528,7 +1546,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, level != 0);
bch2_btree_cache_cannibalize_unlock(c);
BUG_ON(IS_ERR(b));
@@ -1578,7 +1596,7 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
bch2_journal_pin_drop(&c->journal, &w->journal);
}
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new, v;
@@ -1589,26 +1607,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
do {
old = new = v;
- if (old & (1U << BTREE_NODE_need_write))
- goto do_write;
-
- new &= ~(1U << BTREE_NODE_write_in_flight);
- new &= ~(1U << BTREE_NODE_write_in_flight_inner);
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
-
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
- return;
-
-do_write:
- six_lock_read(&b->c.lock, NULL, NULL);
- v = READ_ONCE(b->flags);
- do {
- old = new = v;
-
if ((old & (1U << BTREE_NODE_dirty)) &&
(old & (1U << BTREE_NODE_need_write)) &&
!(old & (1U << BTREE_NODE_never_write)) &&
- btree_node_may_write(b)) {
+ !(old & (1U << BTREE_NODE_write_blocked)) &&
+ !(old & (1U << BTREE_NODE_will_make_reachable))) {
new &= ~(1U << BTREE_NODE_dirty);
new &= ~(1U << BTREE_NODE_need_write);
new |= (1U << BTREE_NODE_write_in_flight);
@@ -1622,8 +1625,15 @@ do_write:
} while ((v = cmpxchg(&b->flags, old, new)) != old);
if (new & (1U << BTREE_NODE_write_in_flight))
- __bch2_btree_node_write(c, b, true);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
+ else
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+ six_lock_read(&b->c.lock, NULL, NULL);
+ __btree_node_write_done(c, b);
six_unlock_read(&b->c.lock);
}
@@ -1738,7 +1748,7 @@ static void btree_write_submit(struct work_struct *work)
bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
}
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
struct btree_write_bio *wbio;
struct bset_tree *t;
@@ -1753,13 +1763,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
unsigned long old, new;
bool validate_before_checksum = false;
void *data;
+ int ret;
- if (already_started)
+ if (flags & BTREE_WRITE_ALREADY_STARTED)
goto do_write;
- if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
- return;
-
/*
* We may only have a read lock on the btree node - the dirty bit is our
* "lock" against racing with other threads that may be trying to start
@@ -1773,13 +1781,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
if (!(old & (1 << BTREE_NODE_dirty)))
return;
- if (!btree_node_may_write(b))
+ if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+ !(old & (1 << BTREE_NODE_need_write)))
return;
- if (old & (1 << BTREE_NODE_never_write))
+ if (old &
+ ((1 << BTREE_NODE_never_write)|
+ (1 << BTREE_NODE_write_blocked)))
return;
- BUG_ON(old & (1 << BTREE_NODE_write_in_flight));
+ if (b->written &&
+ (old & (1 << BTREE_NODE_will_make_reachable)))
+ return;
+
+ if (old & (1 << BTREE_NODE_write_in_flight))
+ return;
new &= ~(1 << BTREE_NODE_dirty);
new &= ~(1 << BTREE_NODE_need_write);
@@ -1875,7 +1891,7 @@ do_write:
BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
BUG_ON(i->seq != b->data->keys.seq);
- i->version = c->sb.version < bcachefs_metadata_version_new_versioning
+ i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber
? cpu_to_le16(BCH_BSET_VERSION_OLD)
: cpu_to_le16(c->sb.version);
SET_BSET_OFFSET(i, b->written);
@@ -1893,7 +1909,10 @@ do_write:
validate_bset_for_write(c, b, i, sectors_to_write))
goto err;
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error encrypting btree node: %i\n", ret))
+ goto err;
nonce = btree_nonce(i, b->written << 9);
@@ -1976,7 +1995,7 @@ err:
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
- btree_node_write_done(c, b);
+ __btree_node_write_done(c, b);
}
/*
@@ -2039,12 +2058,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
* Use this one if the node is intent locked:
*/
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- enum six_lock_type lock_type_held)
+ enum six_lock_type lock_type_held,
+ unsigned flags)
{
if (lock_type_held == SIX_LOCK_intent ||
(lock_type_held == SIX_LOCK_read &&
six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, flags);
/* don't cycle lock unnecessarily: */
if (btree_node_just_written(b) &&
@@ -2056,7 +2076,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->c.lock);
} else {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, flags);
if (lock_type_held == SIX_LOCK_write &&
btree_node_just_written(b))
bch2_btree_post_write_cleanup(c, b);
@@ -2076,7 +2096,6 @@ restart:
rcu_read_unlock();
wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
goto restart;
-
}
rcu_read_unlock();
}
@@ -2090,30 +2109,3 @@ void bch2_btree_flush_all_writes(struct bch_fs *c)
{
__bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
-
-void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos) {
- unsigned long flags = READ_ONCE(b->flags);
-
- if (!(flags & (1 << BTREE_NODE_dirty)))
- continue;
-
- pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
- b,
- (flags & (1 << BTREE_NODE_dirty)) != 0,
- (flags & (1 << BTREE_NODE_need_write)) != 0,
- b->c.level,
- b->written,
- !list_empty_careful(&b->write_blocked),
- b->will_make_reachable != 0,
- b->will_make_reachable & 1);
- }
- rcu_read_unlock();
-}
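
A recurring conversion in this file (and in btree_iter.c further down) is from fixed-size stack buffers built with PBUF() to dynamically sized printbufs. Below is a minimal sketch of the new lifecycle, using only helpers that appear in this diff (PRINTBUF, pr_buf(), printbuf_reset(), printbuf_exit(), bch_err()); the surrounding function and message are hypothetical:

/* Hypothetical reporting helper showing the printbuf lifecycle. */
static void example_report_replicas(struct bch_fs *c, unsigned nr)
{
	struct printbuf buf = PRINTBUF;		/* empty; storage allocated on demand */
	unsigned i;

	for (i = 0; i < nr; i++) {
		printbuf_reset(&buf);		/* reuse the same allocation each pass */
		pr_buf(&buf, "replica %u: %u-%u", i, 0, 8);
		bch_err(c, "%s", buf.buf);	/* buf.buf is the accumulated string */
	}

	printbuf_exit(&buf);			/* pairs with PRINTBUF; frees the buffer */
}

Unlike the old char buf[200] + PBUF(buf) pattern this cannot silently truncate output, but every PRINTBUF now needs a matching printbuf_exit() - which is what most of the hunks above add on the error and exit paths.
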
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 0f20224e2a77..d818d87661e8 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -15,18 +15,13 @@ struct btree;
struct btree_iter;
struct btree_node_read_all;
-static inline bool btree_node_dirty(struct btree *b)
-{
- return test_bit(BTREE_NODE_dirty, &b->flags);
-}
-
-static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
atomic_inc(&c->btree_cache.dirty);
}
-static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
atomic_dec(&c->btree_cache.dirty);
@@ -67,12 +62,6 @@ void __bch2_btree_node_wait_on_write(struct btree *);
void bch2_btree_node_wait_on_read(struct btree *);
void bch2_btree_node_wait_on_write(struct btree *);
-static inline bool btree_node_may_write(struct btree *b)
-{
- return list_empty_careful(&b->write_blocked) &&
- (!b->written || !b->will_make_reachable);
-}
-
enum compact_mode {
COMPACT_LAZY,
COMPACT_ALL,
@@ -111,22 +100,25 @@ static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
}};
}
-static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
{
struct nonce nonce = btree_nonce(i, offset);
+ int ret;
if (!offset) {
struct btree_node *bn = container_of(i, struct btree_node, keys);
unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
- bytes);
+ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+ &bn->flags, bytes);
+ if (ret)
+ return ret;
nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
+ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
}
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
@@ -145,41 +137,23 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
struct btree_write *);
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+#define BTREE_WRITE_ONLY_IF_NEED (1U << 0)
+#define BTREE_WRITE_ALREADY_STARTED (1U << 1)
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
- enum six_lock_type);
+ enum six_lock_type, unsigned);
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
enum six_lock_type lock_held)
{
- if (b->written &&
- btree_node_need_write(b) &&
- btree_node_may_write(b) &&
- !btree_node_write_in_flight(b))
- bch2_btree_node_write(c, b, lock_held);
+ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
}
-#define bch2_btree_node_write_cond(_c, _b, cond) \
-do { \
- unsigned long old, new, v = READ_ONCE((_b)->flags); \
- \
- do { \
- old = new = v; \
- \
- if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \
- break; \
- \
- new |= (1 << BTREE_NODE_need_write); \
- } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
- \
- btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
-} while (0)
-
void bch2_btree_flush_all_reads(struct bch_fs *);
void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
unsigned version, unsigned big_endian,
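
The header hunk above replaces the bool already_started argument and the open-coded btree_node_may_write()/bch2_btree_node_write_cond() checks with a write-flags word. A minimal sketch of a call site follows, assuming the caller already holds a read lock on the node; only names introduced or kept by this header are used, and the calling context is hypothetical:

/*
 * Hypothetical call site: with the flags interface, the "should we write?"
 * policy lives inside __bch2_btree_node_write(); callers just state intent.
 */
static void example_flush_node(struct bch_fs *c, struct btree *b)
{
	bch2_btree_node_write(c, b, SIX_LOCK_read, BTREE_WRITE_ONLY_IF_NEED);
}

btree_node_write_if_need() is now exactly this call with BTREE_WRITE_ONLY_IF_NEED filled in, which is why the separate btree_node_may_write() and bch2_btree_node_write_cond() helpers could be dropped from the header.
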
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 65ab2cd64dde..25d254ee9eac 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -12,6 +12,7 @@
#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "recovery.h"
#include "replicas.h"
#include "subvolume.h"
@@ -19,7 +20,7 @@
#include <trace/events/bcachefs.h>
static void btree_trans_verify_sorted(struct btree_trans *);
-static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
+inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
@@ -57,6 +58,9 @@ static inline int __btree_path_cmp(const struct btree_path *l,
struct bpos r_pos,
unsigned r_level)
{
+ /*
+ * Must match lock ordering as defined by __bch2_btree_node_lock:
+ */
return cmp_int(l->btree_id, r_btree_id) ?:
cmp_int((int) l->cached, (int) r_cached) ?:
bpos_cmp(l->pos, r_pos) ?:
@@ -161,7 +165,7 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
else
this_cpu_sub(*b->c.lock.readers, readers);
- btree_node_lock_type(trans->c, b, SIX_LOCK_write);
+ six_lock_write(&b->c.lock, NULL, NULL);
if (!b->c.lock.readers)
atomic64_add(__SIX_VAL(read_lock, readers),
@@ -177,19 +181,25 @@ bool __bch2_btree_node_relock(struct btree_trans *trans,
int want = __btree_lock_want(path, level);
if (!is_btree_node(path, level))
- return false;
+ goto fail;
if (race_fault())
- return false;
+ goto fail;
if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
(btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, b, level, want))) {
- mark_btree_node_locked(path, level, want);
+ mark_btree_node_locked(trans, path, level, want);
return true;
- } else {
- return false;
}
+fail:
+ trace_btree_node_relock_fail(trans->fn, _RET_IP_,
+ path->btree_id,
+ &path->pos,
+ (unsigned long) b,
+ path->l[level].lock_seq,
+ is_btree_node(path, level) ? b->c.lock.state.seq : 0);
+ return false;
}
bool bch2_btree_node_upgrade(struct btree_trans *trans,
@@ -230,13 +240,13 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans,
return false;
success:
- mark_btree_node_intent_locked(path, level);
+ mark_btree_node_intent_locked(trans, path, level);
return true;
}
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
- bool upgrade, unsigned long trace_ip)
+ bool upgrade)
{
unsigned l = path->level;
int fail_idx = -1;
@@ -293,10 +303,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
- struct btree_path *linked, *deadlock_path = NULL;
- u64 start_time = local_clock();
- unsigned reason = 9;
- bool ret;
+ struct btree_path *linked;
+ unsigned reason;
/* Check if it's safe to block: */
trans_for_each_path(trans, linked) {
@@ -317,28 +325,28 @@ bool __bch2_btree_node_lock(struct btree_trans *trans,
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
- deadlock_path = linked;
reason = 1;
+ goto deadlock;
}
if (linked->btree_id != path->btree_id) {
- if (linked->btree_id > path->btree_id) {
- deadlock_path = linked;
- reason = 3;
- }
- continue;
+ if (linked->btree_id < path->btree_id)
+ continue;
+
+ reason = 3;
+ goto deadlock;
}
/*
- * Within the same btree, cached paths come before non
- * cached paths:
+ * Within the same btree, non-cached paths come before cached
+ * paths:
*/
if (linked->cached != path->cached) {
- if (path->cached) {
- deadlock_path = linked;
- reason = 4;
- }
- continue;
+ if (!linked->cached)
+ continue;
+
+ reason = 4;
+ goto deadlock;
}
/*
@@ -347,50 +355,33 @@ bool __bch2_btree_node_lock(struct btree_trans *trans,
* we're about to lock, it must have the ancestors locked too:
*/
if (level > __fls(linked->nodes_locked)) {
- deadlock_path = linked;
reason = 5;
+ goto deadlock;
}
/* Must lock btree nodes in key order: */
if (btree_node_locked(linked, level) &&
bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
linked->cached)) <= 0) {
- deadlock_path = linked;
- reason = 7;
BUG_ON(trans->in_traverse_all);
+ reason = 7;
+ goto deadlock;
}
}
- if (unlikely(deadlock_path)) {
- trace_trans_restart_would_deadlock(trans->ip, ip,
- trans->in_traverse_all, reason,
- deadlock_path->btree_id,
- deadlock_path->cached,
- &deadlock_path->pos,
- path->btree_id,
- path->cached,
- &pos);
- btree_trans_restart(trans);
- return false;
- }
-
- if (six_trylock_type(&b->c.lock, type))
- return true;
-
- trans->locking_path_idx = path->idx;
- trans->locking_pos = pos;
- trans->locking_btree_id = path->btree_id;
- trans->locking_level = level;
- trans->locking = b;
-
- ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
-
- trans->locking = NULL;
-
- if (ret)
- bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
- start_time);
- return ret;
+ return btree_node_lock_type(trans, path, b, pos, level,
+ type, should_sleep_fn, p);
+deadlock:
+ trace_trans_restart_would_deadlock(trans->fn, ip,
+ trans->in_traverse_all, reason,
+ linked->btree_id,
+ linked->cached,
+ &linked->pos,
+ path->btree_id,
+ path->cached,
+ &pos);
+ btree_trans_restart(trans);
+ return false;
}
/* Btree iterator locking: */
@@ -439,6 +430,8 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans,
if (!bch2_btree_node_relock(trans, path, l)) {
__bch2_btree_path_unlock(path);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
+ path->btree_id, &path->pos);
btree_trans_restart(trans);
return false;
}
@@ -451,10 +444,13 @@ __flatten
static bool bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- bool ret = btree_path_get_locks(trans, path, false, trace_ip);
+ bool ret = btree_path_get_locks(trans, path, false);
- if (!ret)
+ if (!ret) {
+ trace_trans_restart_relock_path(trans->fn, trace_ip,
+ path->btree_id, &path->pos);
btree_trans_restart(trans);
+ }
return ret;
}
@@ -468,7 +464,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
path->locks_want = new_locks_want;
- if (btree_path_get_locks(trans, path, true, _THIS_IP_))
+ if (btree_path_get_locks(trans, path, true))
return true;
/*
@@ -490,14 +486,15 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
* before interior nodes - now that's handled by
* bch2_btree_path_traverse_all().
*/
- trans_for_each_path(trans, linked)
- if (linked != path &&
- linked->cached == path->cached &&
- linked->btree_id == path->btree_id &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true, _THIS_IP_);
- }
+ if (!path->cached && !trans->in_traverse_all)
+ trans_for_each_path(trans, linked)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+ btree_path_get_locks(trans, linked, true);
+ }
return false;
}
@@ -547,7 +544,7 @@ bool bch2_trans_relock(struct btree_trans *trans)
trans_for_each_path(trans, path)
if (path->should_be_locked &&
!bch2_btree_path_relock(trans, path, _RET_IP_)) {
- trace_trans_restart_relock(trans->ip, _RET_IP_,
+ trace_trans_restart_relock(trans->fn, _RET_IP_,
path->btree_id, &path->pos);
BUG_ON(!trans->restarted);
return false;
@@ -562,7 +559,12 @@ void bch2_trans_unlock(struct btree_trans *trans)
trans_for_each_path(trans, path)
__bch2_btree_path_unlock(path);
- BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+ /*
+ * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
+ * btree nodes, it implements its own walking:
+ */
+ BUG_ON(!trans->is_initial_gc &&
+ lock_class_is_held(&bch2_btree_node_lock_key));
}
/* Btree iterator: */
@@ -593,7 +595,9 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
struct btree_node_iter tmp;
bool locked;
struct bkey_packed *p, *k;
- char buf1[100], buf2[100], buf3[100];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct printbuf buf3 = PRINTBUF;
const char *msg;
if (!bch2_debug_check_iterators)
@@ -641,26 +645,27 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
btree_node_unlock(path, level);
return;
err:
- strcpy(buf2, "(none)");
- strcpy(buf3, "(none)");
-
- bch2_bpos_to_text(&PBUF(buf1), path->pos);
+ bch2_bpos_to_text(&buf1, path->pos);
if (p) {
struct bkey uk = bkey_unpack_key(l->b, p);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
+ bch2_bkey_to_text(&buf2, &uk);
+ } else {
+ pr_buf(&buf2, "(none)");
}
if (k) {
struct bkey uk = bkey_unpack_key(l->b, k);
- bch2_bkey_to_text(&PBUF(buf3), &uk);
+ bch2_bkey_to_text(&buf3, &uk);
+ } else {
+ pr_buf(&buf3, "(none)");
}
panic("path should be %s key at level %u:\n"
"path pos %s\n"
"prev key %s\n"
"cur key %s\n",
- msg, level, buf1, buf2, buf3);
+ msg, level, buf1.buf, buf2.buf, buf3.buf);
}
static void bch2_btree_path_verify(struct btree_trans *trans,
@@ -700,9 +705,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
- iter->pos.snapshot != iter->snapshot);
-
BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
@@ -710,6 +712,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(iter->btree_id));
+ if (iter->update_path)
+ bch2_btree_path_verify(trans, iter->update_path);
bch2_btree_path_verify(trans, iter->path);
}
@@ -759,16 +763,16 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
if (!bkey_cmp(prev.k->p, k.k->p) &&
bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
prev.k->p.snapshot) > 0) {
- char buf1[100], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- bch2_bkey_to_text(&PBUF(buf1), k.k);
- bch2_bkey_to_text(&PBUF(buf2), prev.k);
+ bch2_bkey_to_text(&buf1, k.k);
+ bch2_bkey_to_text(&buf2, prev.k);
panic("iter snap %u\n"
"k %s\n"
"prev %s\n",
iter->snapshot,
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
out:
bch2_trans_iter_exit(trans, &copy);
@@ -780,7 +784,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
{
struct btree_path *path;
unsigned idx;
- char buf[100];
+ struct printbuf buf = PRINTBUF;
trans_for_each_path_inorder(trans, path, idx) {
int cmp = cmp_int(path->btree_id, id) ?:
@@ -806,9 +810,10 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
}
bch2_dump_trans_paths_updates(trans);
+ bch2_bpos_to_text(&buf, pos);
+
panic("not locked: %s %s%s\n",
- bch2_btree_ids[id],
- (bch2_bpos_to_text(&PBUF(buf), pos), buf),
+ bch2_btree_ids[id], buf.buf,
key_cache ? " cached" : "");
}
@@ -994,8 +999,6 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
struct bkey *u,
struct bkey_packed *k)
{
- struct bkey_s_c ret;
-
if (unlikely(!k)) {
/*
* signal to bch2_btree_iter_peek_slot() that we're currently at
@@ -1005,19 +1008,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
return bkey_s_c_null;
}
- ret = bkey_disassemble(l->b, k, u);
-
- /*
- * XXX: bch2_btree_bset_insert_key() generates invalid keys when we
- * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key
- * being overwritten but doesn't change k->size. But this is ok, because
- * those keys are never written out, we just have to avoid a spurious
- * assertion here:
- */
- if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
- bch2_bkey_debugcheck(c, l->b, ret);
-
- return ret;
+ return bkey_disassemble(l->b, k, u);
}
static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
@@ -1077,6 +1068,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
static void btree_path_verify_new_node(struct btree_trans *trans,
struct btree_path *path, struct btree *b)
{
+ struct bch_fs *c = trans->c;
struct btree_path_level *l;
unsigned plevel;
bool parent_locked;
@@ -1085,6 +1077,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
return;
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ return;
+
plevel = b->c.level + 1;
if (!btree_path_node(path, plevel))
return;
@@ -1099,23 +1094,23 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
if (!k ||
bkey_deleted(k) ||
bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
- char buf1[100];
- char buf2[100];
- char buf3[100];
- char buf4[100];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct printbuf buf3 = PRINTBUF;
+ struct printbuf buf4 = PRINTBUF;
struct bkey uk = bkey_unpack_key(b, k);
- bch2_dump_btree_node(trans->c, l->b);
- bch2_bpos_to_text(&PBUF(buf1), path->pos);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
- bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
- bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
+ bch2_dump_btree_node(c, l->b);
+ bch2_bpos_to_text(&buf1, path->pos);
+ bch2_bkey_to_text(&buf2, &uk);
+ bch2_bpos_to_text(&buf3, b->data->min_key);
+ bch2_bpos_to_text(&buf3, b->data->max_key);
panic("parent iter doesn't point to new node:\n"
"iter pos %s %s\n"
"iter key %s\n"
"new node %s-%s\n",
- bch2_btree_ids[path->btree_id], buf1,
- buf2, buf3, buf4);
+ bch2_btree_ids[path->btree_id],
+ buf1.buf, buf2.buf, buf3.buf, buf4.buf);
}
if (!parent_locked)
@@ -1173,7 +1168,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
t != BTREE_NODE_UNLOCKED) {
btree_node_unlock(path, b->c.level);
six_lock_increment(&b->c.lock, t);
- mark_btree_node_locked(path, b->c.level, t);
+ mark_btree_node_locked(trans, path, b->c.level, t);
}
btree_path_level_init(trans, path, b);
@@ -1250,7 +1245,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
path->l[i].b = NULL;
- mark_btree_node_locked(path, path->level, lock_type);
+ mark_btree_node_locked(trans, path, path->level, lock_type);
btree_path_level_init(trans, path, b);
return 0;
}
@@ -1296,6 +1291,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
return ret;
}
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+ struct btree_and_journal_iter *jiter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
+
+ bch2_btree_and_journal_iter_advance(jiter);
+ k = bch2_btree_and_journal_iter_peek(jiter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
+
+ if (!was_locked)
+ btree_node_unlock(path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
struct btree_path *path,
unsigned plevel, struct btree *b)
@@ -1318,6 +1348,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
btree_node_unlock(path, plevel);
}
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ struct bkey_buf *out)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_and_journal_iter jiter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+ k = bch2_btree_and_journal_iter_peek(&jiter);
+
+ bch2_bkey_buf_reassemble(out, c, k);
+
+ if (flags & BTREE_ITER_PREFETCH)
+ ret = btree_path_prefetch_j(trans, path, &jiter);
+
+ bch2_btree_and_journal_iter_exit(&jiter);
+ return ret;
+}
+
static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree_path *path,
unsigned flags,
@@ -1328,30 +1382,41 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree *b;
unsigned level = path->level - 1;
enum six_lock_type lock_type = __btree_lock_want(path, level);
+ bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
struct bkey_buf tmp;
int ret;
EBUG_ON(!btree_node_locked(path, path->level));
bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (unlikely(!replay_done)) {
+ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ if (ret)
+ goto err;
+ } else {
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (flags & BTREE_ITER_PREFETCH) {
+ ret = btree_path_prefetch(trans, path);
+ if (ret)
+ goto err;
+ }
+ }
b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
if (unlikely(ret))
goto err;
- mark_btree_node_locked(path, level, lock_type);
+ mark_btree_node_locked(trans, path, level, lock_type);
btree_path_level_init(trans, path, b);
- if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+ if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
unlikely(b != btree_node_mem_ptr(tmp.k)))
btree_node_mem_ptr_set(trans, path, level + 1, b);
- if (flags & BTREE_ITER_PREFETCH)
- ret = btree_path_prefetch(trans, path);
-
if (btree_node_read_locked(path, level + 1))
btree_node_unlock(path, level + 1);
path->level = level;
@@ -1365,12 +1430,12 @@ err:
static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
unsigned, unsigned long);
-static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
- unsigned long trace_ip)
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
- int i;
+ unsigned long trace_ip = _RET_IP_;
+ int i, ret = 0;
if (trans->in_traverse_all)
return -EINTR;
@@ -1378,6 +1443,7 @@ static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
trans->in_traverse_all = true;
retry_all:
trans->restarted = false;
+ trans->traverse_all_idx = U8_MAX;
trans_for_each_path(trans, path)
path->should_be_locked = false;
@@ -1398,7 +1464,7 @@ retry_all:
bch2_trans_unlock(trans);
cond_resched();
- if (unlikely(ret == -ENOMEM)) {
+ if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
closure_init_stack(&cl);
@@ -1409,27 +1475,25 @@ retry_all:
} while (ret);
}
- if (unlikely(ret == -EIO))
- goto out;
-
- BUG_ON(ret && ret != -EINTR);
-
/* Now, redo traversals in correct order: */
- i = 0;
- while (i < trans->nr_sorted) {
- path = trans->paths + trans->sorted[i];
+ trans->traverse_all_idx = 0;
+ while (trans->traverse_all_idx < trans->nr_sorted) {
+ path = trans->paths + trans->sorted[trans->traverse_all_idx];
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
-
- ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
- if (ret)
- goto retry_all;
-
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
-
- if (path->nodes_locked ||
- !btree_path_node(path, path->level))
- i++;
+ /*
+ * Traversing a path can cause another path to be added at about
+ * the same position:
+ */
+ if (path->uptodate) {
+ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+ if (ret == -EINTR || ret == -ENOMEM)
+ goto retry_all;
+ if (ret)
+ goto err;
+ BUG_ON(path->uptodate);
+ } else {
+ trans->traverse_all_idx++;
+ }
}
/*
@@ -1439,20 +1503,15 @@ retry_all:
*/
trans_for_each_path(trans, path)
BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
-out:
+err:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
- trace_trans_traverse_all(trans->ip, trace_ip);
+ trace_trans_traverse_all(trans->fn, trace_ip);
return ret;
}
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
-{
- return __btree_path_traverse_all(trans, 0, _RET_IP_);
-}
-
static inline bool btree_path_good_node(struct btree_trans *trans,
struct btree_path *path,
unsigned l, int check_pos)
@@ -1576,8 +1635,6 @@ out:
return ret;
}
-static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long);
-
int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
struct btree_path *path, unsigned flags)
{
@@ -1601,7 +1658,7 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
six_lock_increment(&dst->l[i].b->c.lock,
__btree_lock_want(dst, i));
- btree_path_check_sort(trans, dst, 0);
+ bch2_btree_path_check_sort(trans, dst, 0);
}
static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
@@ -1629,11 +1686,12 @@ bch2_btree_path_make_mut(struct btree_trans *trans,
btree_trans_verify_sorted(trans);
}
+ path->should_be_locked = false;
return path;
}
-static struct btree_path * __must_check
-btree_path_set_pos(struct btree_trans *trans,
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
struct btree_path *path, struct bpos new_pos,
bool intent, unsigned long ip)
{
@@ -1648,10 +1706,9 @@ btree_path_set_pos(struct btree_trans *trans,
path = bch2_btree_path_make_mut(trans, path, intent, ip);
- path->pos = new_pos;
- path->should_be_locked = false;
+ path->pos = new_pos;
- btree_path_check_sort(trans, path, cmp);
+ bch2_btree_path_check_sort(trans, path, cmp);
if (unlikely(path->cached)) {
btree_node_unlock(path, 0);
@@ -1663,6 +1720,7 @@ btree_path_set_pos(struct btree_trans *trans,
l = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, l)) {
+ BUG_ON(!btree_node_locked(path, l));
/*
* We might have to skip over many keys, or just a few: try
* advancing the node iterator, and if we have to skip over too
@@ -1755,23 +1813,64 @@ free:
__bch2_path_free(trans, path);
}
+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ pr_buf(buf, "transaction updates for %s journal seq %llu",
+ trans->fn, trans->journal_res.seq);
+ pr_newline(buf);
+ pr_indent_push(buf, 2);
+
+ trans_for_each_update(trans, i) {
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+
+ pr_buf(buf, "update: btree %s %pS",
+ bch2_btree_ids[i->btree_id],
+ (void *) i->ip_allocated);
+ pr_newline(buf);
+
+ pr_buf(buf, " old ");
+ bch2_bkey_val_to_text(buf, trans->c, old);
+ pr_newline(buf);
+
+ pr_buf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
+ pr_newline(buf);
+ }
+
+ pr_indent_pop(buf, 2);
+}
+
+noinline __cold
+void bch2_dump_trans_updates(struct btree_trans *trans)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_updates_to_text(&buf, trans);
+ bch_err(trans->c, "%s", buf.buf);
+ printbuf_exit(&buf);
+}
+
noinline __cold
void bch2_dump_trans_paths_updates(struct btree_trans *trans)
{
struct btree_path *path;
- struct btree_insert_entry *i;
+ struct printbuf buf = PRINTBUF;
unsigned idx;
- char buf1[300], buf2[300];
- btree_trans_verify_sorted(trans);
+ trans_for_each_path_inorder(trans, path, idx) {
+ printbuf_reset(&buf);
- trans_for_each_path_inorder(trans, path, idx)
- printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n",
+ bch2_bpos_to_text(&buf, path->pos);
+
+ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n",
path->idx, path->ref, path->intent_ref,
path->should_be_locked ? " S" : "",
path->preserve ? " P" : "",
bch2_btree_ids[path->btree_id],
- (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1),
+ path->level,
+ buf.buf,
path->nodes_locked,
#ifdef CONFIG_BCACHEFS_DEBUG
(void *) path->ip_allocated
@@ -1779,17 +1878,11 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans)
NULL
#endif
);
+ }
- trans_for_each_update(trans, i) {
- struct bkey u;
- struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u);
+ printbuf_exit(&buf);
- printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s",
- bch2_btree_ids[i->btree_id],
- (void *) i->ip_allocated,
- (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1),
- (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2));
- }
+ bch2_dump_trans_updates(trans);
}
static struct btree_path *btree_path_alloc(struct btree_trans *trans,
@@ -1830,6 +1923,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
int i;
BUG_ON(trans->restarted);
+ btree_trans_verify_sorted(trans);
+ bch2_trans_verify_locks(trans);
trans_for_each_path_inorder(trans, path, i) {
if (__btree_path_cmp(path,
@@ -1847,7 +1942,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path_pos->btree_id == btree_id &&
path_pos->level == level) {
__btree_path_get(path_pos, intent);
- path = btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
} else {
path = btree_path_alloc(trans, path_pos);
path_pos = NULL;
@@ -1887,7 +1982,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > path->locks_want) {
path->locks_want = locks_want;
- btree_path_get_locks(trans, path, true, _THIS_IP_);
+ btree_path_get_locks(trans, path, true);
}
return path;
@@ -1898,13 +1993,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct
struct bkey_s_c k;
- BUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
-
if (!path->cached) {
struct btree_path_level *l = path_l(path);
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ struct bkey_packed *_k;
+
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
@@ -1914,13 +2009,17 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct
} else {
struct bkey_cached *ck = (void *) path->l[0].b;
- EBUG_ON(path->btree_id != ck->key.btree_id ||
- bkey_cmp(path->pos, ck->key.pos));
+ EBUG_ON(ck &&
+ (path->btree_id != ck->key.btree_id ||
+ bkey_cmp(path->pos, ck->key.pos)));
- /* BTREE_ITER_CACHED_NOFILL? */
- if (unlikely(!ck->valid))
- goto hole;
+ /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */
+ if (unlikely(!ck || !ck->valid))
+ return bkey_s_c_null;
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+
+ *u = ck->k->k;
k = bkey_i_to_s_c(ck->k);
}
@@ -1944,7 +2043,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
- iter->path = btree_path_set_pos(iter->trans, iter->path,
+ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
btree_iter_search_key(iter),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -1981,7 +2080,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
bkey_init(&iter->k);
iter->k.p = iter->pos = b->key.k.p;
- iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
iter->path->should_be_locked = true;
@@ -2017,6 +2116,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
btree_node_unlock(path, path->level);
path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
path->level++;
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
return NULL;
}
@@ -2024,6 +2124,9 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
__bch2_btree_path_unlock(path);
path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_,
+ path->btree_id, &path->pos);
btree_trans_restart(trans);
ret = -EINTR;
goto err;
@@ -2041,7 +2144,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
* the next child node
*/
path = iter->path =
- btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2064,7 +2167,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
bkey_init(&iter->k);
iter->k.p = iter->pos = b->key.k.p;
- iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
iter->path->should_be_locked = true;
@@ -2107,24 +2210,88 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
return ret;
}
-/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos pos)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if ((cmp_int(btree_id, i->btree_id) ?:
+ bpos_cmp(pos, i->k->k.p)) <= 0) {
+ if (btree_id == i->btree_id)
+ return i->k;
+ break;
+ }
+
+ return NULL;
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *next_journal =
+ bch2_journal_keys_peek(trans->c, iter->btree_id, 0,
+ iter->path->pos);
+
+ if (next_journal &&
+ bpos_cmp(next_journal->k.p,
+ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ iter->k = next_journal->k;
+ k = bkey_i_to_s_c(next_journal);
+ }
+
+ return k;
+}
+
+/*
+ * Checks btree key cache for key at iter->pos and returns it if present, or
+ * bkey_s_c_null:
*/
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ int ret;
+
+ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+ return bkey_s_c_null;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+ iter->flags & BTREE_ITER_INTENT, 0,
+ iter->flags|BTREE_ITER_CACHED,
+ _THIS_IP_);
+
+ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ iter->key_cache_path->should_be_locked = true;
+
+ return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+}
+
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
- struct bpos search_key = btree_iter_search_key(iter);
struct bkey_i *next_update;
- struct bkey_s_c k;
- int ret, cmp;
+ struct bkey_s_c k, k2;
+ int ret;
EBUG_ON(iter->path->cached || iter->path->level);
bch2_btree_iter_verify(iter);
- bch2_btree_iter_verify_entry_exit(iter);
while (1) {
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2136,19 +2303,30 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
goto out;
}
- next_update = iter->flags & BTREE_ITER_WITH_UPDATES
- ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
- : NULL;
+ iter->path->should_be_locked = true;
+
k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
- /* * In the btree, deleted keys sort before non deleted: */
- if (k.k && bkey_deleted(k.k) &&
- (!next_update ||
- bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
- search_key = k.k->p;
- continue;
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ ret = bkey_err(k2);
+ if (ret) {
+ k = k2;
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ goto out;
+ }
+
+ k = k2;
+ iter->k = *k.k;
}
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+ k = btree_trans_peek_journal(trans, iter, k);
+
+ next_update = iter->flags & BTREE_ITER_WITH_UPDATES
+ ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
+ : NULL;
if (next_update &&
bpos_cmp(next_update->k.p,
k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
@@ -2156,25 +2334,21 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
k = bkey_i_to_s_c(next_update);
}
- if (likely(k.k)) {
+ if (k.k && bkey_deleted(k.k)) {
/*
- * We can never have a key in a leaf node at POS_MAX, so
- * we don't have to check these successor() calls:
+ * If we've got a whiteout, and it's after the search
+ * key, advance the search key to the whiteout instead
+ * of just after the whiteout - it might be a btree
+ * whiteout, with a real key at the same position, since
+ * in the btree deleted keys sort before non deleted.
*/
- if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
- !bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
-
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
- }
+ search_key = bpos_cmp(search_key, k.k->p)
+ ? k.k->p
+ : bpos_successor(k.k->p);
+ continue;
+ }
+ if (likely(k.k)) {
break;
} else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
/* Advance to next leaf node: */
@@ -2186,35 +2360,137 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
goto out;
}
}
+out:
+ bch2_btree_iter_verify(iter);
- /*
- * iter->pos should be mononotically increasing, and always be equal to
- * the key we just returned - except extents can straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
- iter->pos = k.k->p;
- else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
- iter->pos = bkey_start_pos(k.k);
+ return k;
+}
- if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
- iter->pos.snapshot = iter->snapshot;
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ */
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = btree_iter_search_key(iter);
+ struct bkey_s_c k;
+ struct bpos iter_pos;
+ int ret;
- cmp = bpos_cmp(k.k->p, iter->path->pos);
- if (cmp) {
- iter->path = bch2_btree_path_make_mut(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
- iter->path->pos = k.k->p;
- btree_path_check_sort(trans, iter->path, cmp);
+ if (iter->update_path) {
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ while (1) {
+ k = __bch2_btree_iter_peek(iter, search_key);
+ if (!k.k || bkey_err(k))
+ goto out;
+
+ /*
+	 * iter->pos should be monotonically increasing, and always be
+ * equal to the key we just returned - except extents can
+ * straddle iter->pos:
+ */
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+ iter_pos = k.k->p;
+ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ iter_pos = bkey_start_pos(k.k);
+ else
+ iter_pos = iter->pos;
+
+ if (bkey_cmp(iter_pos, end) > 0) {
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out;
+ }
+
+ if (iter->update_path &&
+ bkey_cmp(iter->update_path->pos, k.k->p)) {
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ (iter->flags & BTREE_ITER_INTENT) &&
+ !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
+
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ pos.snapshot = iter->snapshot;
+
+ /*
+ * advance, same as on exit for iter->path, but only up
+ * to snapshot
+ */
+ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = iter->path;
+
+ iter->update_path = bch2_btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+ }
+
+ /*
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
+ */
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
+
+ break;
}
+
+ iter->pos = iter_pos;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ BUG_ON(!iter->path->nodes_locked);
out:
+ if (iter->update_path) {
+ if (iter->update_path->uptodate &&
+ !bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) {
+ k = bkey_s_c_err(-EINTR);
+ } else {
+ BUG_ON(!(iter->update_path->nodes_locked & 1));
+ iter->update_path->should_be_locked = true;
+ }
+ }
iter->path->should_be_locked = true;
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ iter->pos.snapshot = iter->snapshot;
+
ret = bch2_btree_iter_verify_ret(iter, k);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ if (unlikely(ret)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ }
+
+ bch2_btree_iter_verify_entry_exit(iter);
return k;
}
@@ -2247,6 +2523,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
EBUG_ON(iter->path->cached || iter->path->level);
EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+ if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+ return bkey_s_c_err(-EIO);
+
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
@@ -2254,7 +2534,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
search_key.snapshot = U32_MAX;
while (1) {
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2275,7 +2555,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
k = btree_path_level_prev(trans->c, iter->path,
&iter->path->l[0], &iter->k);
- btree_path_check_sort(trans, iter->path, 0);
+ bch2_btree_path_check_sort(trans, iter->path, 0);
if (likely(k.k)) {
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
@@ -2385,7 +2665,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
search_key = btree_iter_search_key(iter);
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2397,25 +2677,44 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
struct bkey_i *next_update;
- next_update = iter->flags & BTREE_ITER_WITH_UPDATES
- ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
- : NULL;
+ if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ (next_update = btree_trans_peek_updates(trans,
+ iter->btree_id, search_key)) &&
+ !bpos_cmp(next_update->k.p, iter->pos)) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ goto out;
+ }
- if (next_update &&
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ (next_update = bch2_journal_keys_peek(trans->c, iter->btree_id,
+ 0, iter->pos)) &&
!bpos_cmp(next_update->k.p, iter->pos)) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
- } else {
- k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ goto out;
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ if (!bkey_err(k))
+ iter->k = *k.k;
+ goto out;
}
+
+ k = bch2_btree_path_peek_slot(iter->path, &iter->k);
} else {
struct bpos next;
if (iter->flags & BTREE_ITER_INTENT) {
struct btree_iter iter2;
+ struct bpos end = iter->pos;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ end.offset = U64_MAX;
bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek(&iter2);
+ k = bch2_btree_iter_peek_upto(&iter2, end);
if (k.k && !bkey_err(k)) {
iter->k = iter2.k;
@@ -2437,18 +2736,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (bkey_cmp(iter->pos, next) < 0) {
bkey_init(&iter->k);
iter->k.p = iter->pos;
- bch2_key_resize(&iter->k,
- min_t(u64, KEY_SIZE_MAX,
- (next.inode == iter->pos.inode
- ? next.offset
- : KEY_OFFSET_MAX) -
- iter->pos.offset));
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ bch2_key_resize(&iter->k,
+ min_t(u64, KEY_SIZE_MAX,
+ (next.inode == iter->pos.inode
+ ? next.offset
+ : KEY_OFFSET_MAX) -
+ iter->pos.offset));
+ EBUG_ON(!iter->k.size);
+ }
k = (struct bkey_s_c) { &iter->k, NULL };
- EBUG_ON(!k.k->size);
}
}
-
+out:
iter->path->should_be_locked = true;
bch2_btree_iter_verify_entry_exit(iter);
@@ -2503,7 +2805,10 @@ static void btree_trans_verify_sorted(struct btree_trans *trans)
unsigned i;
trans_for_each_path_inorder(trans, path, i) {
- BUG_ON(prev && btree_path_cmp(prev, path) > 0);
+ if (prev && btree_path_cmp(prev, path) > 0) {
+ bch2_dump_trans_paths_updates(trans);
+ panic("trans paths out of order!\n");
+ }
prev = path;
}
#endif
@@ -2520,8 +2825,8 @@ static inline void btree_path_swap(struct btree_trans *trans,
btree_path_verify_sorted_ref(trans, r);
}
-static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
- int cmp)
+inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
+ int cmp)
{
struct btree_path *n;
@@ -2577,6 +2882,11 @@ static inline void btree_path_list_add(struct btree_trans *trans,
path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
+ if (trans->in_traverse_all &&
+ trans->traverse_all_idx != U8_MAX &&
+ trans->traverse_all_idx >= path->sorted_idx)
+ trans->traverse_all_idx++;
+
array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
for (i = path->sorted_idx; i < trans->nr_sorted; i++)
@@ -2590,7 +2900,15 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
if (iter->path)
bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
+ if (iter->update_path)
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ if (iter->key_cache_path)
+ bch2_path_put(trans, iter->key_cache_path,
+ iter->flags & BTREE_ITER_INTENT);
iter->path = NULL;
+ iter->update_path = NULL;
+ iter->key_cache_path = NULL;
}
static void __bch2_trans_iter_init(struct btree_trans *trans,
@@ -2615,8 +2933,19 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
btree_type_has_snapshots(btree_id))
flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
+ flags |= BTREE_ITER_WITH_JOURNAL;
+
+ if (!btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_CACHED;
+ flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ } else if (!(flags & BTREE_ITER_CACHED))
+ flags |= BTREE_ITER_WITH_KEY_CACHE;
+
iter->trans = trans;
iter->path = NULL;
+ iter->update_path = NULL;
+ iter->key_cache_path = NULL;
iter->btree_id = btree_id;
iter->min_depth = depth;
iter->flags = flags;
@@ -2665,6 +2994,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
*dst = *src;
if (src->path)
__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+ if (src->update_path)
+ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = NULL;
}
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
@@ -2693,7 +3025,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trans->mem_bytes = new_bytes;
if (old_bytes) {
- trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
+ trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes);
btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
@@ -2727,8 +3059,7 @@ void bch2_trans_begin(struct btree_trans *trans)
trans->mem_top = 0;
trans->hooks = NULL;
- trans->extra_journal_entries = NULL;
- trans->extra_journal_entry_u64s = 0;
+ trans->extra_journal_entries.nr = 0;
if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
@@ -2741,13 +3072,21 @@ void bch2_trans_begin(struct btree_trans *trans)
path->should_be_locked = false;
/*
+ * If the transaction wasn't restarted, we're presuming to be
+	 * doing something new: don't keep iterators except the ones that
+ * are in use - except for the subvolumes btree:
+ */
+ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+ path->preserve = false;
+
+ /*
* XXX: we probably shouldn't be doing this if the transaction
* was restarted, but currently we still overflow transaction
* iterators if we do that
*/
if (!path->ref && !path->preserve)
__bch2_path_free(trans, path);
- else if (!path->ref)
+ else
path->preserve = false;
}
@@ -2777,14 +3116,17 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
trans->updates = p; p += updates_bytes;
}
-void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
- unsigned expected_nr_iters,
- size_t expected_mem_bytes)
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
+ unsigned expected_nr_iters,
+ size_t expected_mem_bytes,
+ const char *fn)
__acquires(&c->btree_trans_barrier)
{
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
memset(trans, 0, sizeof(*trans));
trans->c = c;
- trans->ip = _RET_IP_;
+ trans->fn = fn;
bch2_trans_alloc_paths(trans, c);
@@ -2817,7 +3159,7 @@ static void check_btree_paths_leaked(struct btree_trans *trans)
goto leaked;
return;
leaked:
- bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip);
+ bch_err(c, "btree paths leaked from %s!", trans->fn);
trans_for_each_path(trans, path)
if (path->ref)
printk(KERN_ERR " btree %s %pS\n",
@@ -2850,6 +3192,8 @@ void bch2_trans_exit(struct btree_trans *trans)
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+ kfree(trans->extra_journal_entries.data);
+
if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
@@ -2903,6 +3247,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
struct btree_trans *trans;
struct btree_path *path;
struct btree *b;
+ static char lock_types[] = { 'r', 'i', 'w' };
unsigned l;
mutex_lock(&c->btree_trans_lock);
@@ -2910,7 +3255,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
if (!trans_has_locks(trans))
continue;
- pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
+ pr_buf(out, "%i %s\n", trans->pid, trans->fn);
trans_for_each_path(trans, path) {
if (!path->nodes_locked)
@@ -2939,10 +3284,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
b = READ_ONCE(trans->locking);
if (b) {
path = &trans->paths[trans->locking_path_idx];
- pr_buf(out, " locking path %u %c l=%u %s:",
+ pr_buf(out, " locking path %u %c l=%u %c %s:",
trans->locking_path_idx,
path->cached ? 'c' : 'b',
trans->locking_level,
+ lock_types[trans->locking_lock_type],
bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 4c903b9dd716..f6700295e1a7 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -50,11 +50,6 @@ static inline struct btree *btree_node_parent(struct btree_path *path,
return btree_path_node(path, b->c.level + 1);
}
-static inline int btree_iter_err(const struct btree_iter *iter)
-{
- return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
-}
-
/* Iterate over paths within a transaction: */
static inline struct btree_path *
@@ -75,6 +70,8 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
return &trans->paths[idx];
}
+void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
+
#define trans_for_each_path(_trans, _path) \
for (_path = __trans_next_path((_trans), 0); \
(_path); \
@@ -132,6 +129,9 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
struct btree_path * __must_check
bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
bool, unsigned long);
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+ struct bpos, bool, unsigned long);
int __must_check bch2_btree_path_traverse(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
@@ -209,9 +209,14 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+}
+
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
@@ -222,11 +227,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
bool bch2_btree_iter_advance(struct btree_iter *);
bool bch2_btree_iter_rewind(struct btree_iter *);
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
- new_pos.snapshot = iter->snapshot;
-
iter->k.type = KEY_TYPE_deleted;
iter->k.p.inode = iter->pos.inode = new_pos.inode;
iter->k.p.offset = iter->pos.offset = new_pos.offset;
@@ -234,6 +236,19 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
iter->k.size = 0;
}
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ if (unlikely(iter->update_path))
+ bch2_path_put(iter->trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ new_pos.snapshot = iter->snapshot;
+
+ __bch2_btree_iter_set_pos(iter, new_pos);
+}
+
static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
{
BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
@@ -295,14 +310,27 @@ static inline int bkey_err(struct bkey_s_c k)
return PTR_ERR_OR_ZERO(k.k);
}
-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
- unsigned flags)
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
+ unsigned flags)
{
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_peek_slot(iter)
: bch2_btree_iter_peek(iter);
}
+static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
+{
+ if (!(flags & BTREE_ITER_SLOTS))
+ return bch2_btree_iter_peek_upto(iter, end);
+
+ if (bkey_cmp(iter->pos, end) > 0)
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
+}
+
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
@@ -316,7 +344,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
struct bkey_s_c k;
while (btree_trans_too_many_iters(trans) ||
- (k = __bch2_btree_iter_peek(iter, flags),
+ (k = bch2_btree_iter_peek_type(iter, flags),
bkey_err(k) == -EINTR))
bch2_trans_begin(trans);
@@ -335,7 +363,15 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_start, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
@@ -347,16 +383,21 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
for (; \
- (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
/* new multiple iterator interface: */
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
-void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
+void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
+ unsigned, size_t, const char *);
void bch2_trans_exit(struct btree_trans *);
+#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__)
+
void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_btree_iter_exit(struct bch_fs *);
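A standalone sketch of the technique used by the new bch2_trans_init() macro above: a variadic macro appends __func__ so the callee records the caller's function name instead of an instruction pointer. The demo_* names below are hypothetical, not bcachefs API.

#include <stdio.h>

static void __demo_trans_init(int flags, const char *fn)
{
	printf("transaction started by %s (flags=%d)\n", fn, flags);
}

/* append the calling function's name automatically */
#define demo_trans_init(...) __demo_trans_init(__VA_ARGS__, __func__)

int main(void)
{
	demo_trans_init(0);	/* expands to __demo_trans_init(0, "main") */
	return 0;
}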
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 230a920ae32a..f5a942b6bbf7 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -146,28 +146,32 @@ bkey_cached_reuse(struct btree_key_cache *c)
}
static struct bkey_cached *
-btree_key_cache_create(struct btree_key_cache *c,
+btree_key_cache_create(struct bch_fs *c,
enum btree_id btree_id,
struct bpos pos)
{
+ struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck;
bool was_new = true;
- ck = bkey_cached_alloc(c);
+ ck = bkey_cached_alloc(bc);
if (unlikely(!ck)) {
- ck = bkey_cached_reuse(c);
- if (unlikely(!ck))
+ ck = bkey_cached_reuse(bc);
+ if (unlikely(!ck)) {
+ bch_err(c, "error allocating memory for key cache item, btree %s",
+ bch2_btree_ids[btree_id]);
return ERR_PTR(-ENOMEM);
+ }
was_new = false;
+ } else {
+ if (btree_id == BTREE_ID_subvolumes)
+ six_lock_pcpu_alloc(&ck->c.lock);
+ else
+ six_lock_pcpu_free(&ck->c.lock);
}
- if (btree_id == BTREE_ID_subvolumes)
- six_lock_pcpu_alloc(&ck->c.lock);
- else
- six_lock_pcpu_free(&ck->c.lock);
-
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
@@ -175,7 +179,7 @@ btree_key_cache_create(struct btree_key_cache *c,
ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
- if (unlikely(rhashtable_lookup_insert_fast(&c->table,
+ if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
&ck->hash,
bch2_btree_key_cache_params))) {
/* We raced with another fill: */
@@ -185,15 +189,15 @@ btree_key_cache_create(struct btree_key_cache *c,
six_unlock_intent(&ck->c.lock);
kfree(ck);
} else {
- mutex_lock(&c->lock);
- bkey_cached_free(c, ck);
- mutex_unlock(&c->lock);
+ mutex_lock(&bc->lock);
+ bkey_cached_free(bc, ck);
+ mutex_unlock(&bc->lock);
}
return NULL;
}
- atomic_long_inc(&c->nr_keys);
+ atomic_long_inc(&bc->nr_keys);
six_unlock_write(&ck->c.lock);
@@ -204,21 +208,24 @@ static int btree_key_cache_fill(struct btree_trans *trans,
struct btree_path *ck_path,
struct bkey_cached *ck)
{
- struct btree_iter iter;
+ struct btree_path *path;
struct bkey_s_c k;
unsigned new_u64s = 0;
struct bkey_i *new_k = NULL;
+ struct bkey u;
int ret;
- bch2_trans_iter_init(trans, &iter, ck->key.btree_id,
- ck->key.pos, BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
+ path = bch2_path_get(trans, ck->key.btree_id,
+ ck->key.pos, 0, 0, 0, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, path, 0);
if (ret)
goto err;
+ k = bch2_btree_path_peek_slot(path, &u);
+
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
+ trace_trans_restart_relock_key_cache_fill(trans->fn,
+ _THIS_IP_, ck_path->btree_id, &ck_path->pos);
ret = btree_trans_restart(trans);
goto err;
}
@@ -233,6 +240,8 @@ static int btree_key_cache_fill(struct btree_trans *trans,
new_u64s = roundup_pow_of_two(new_u64s);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_ids[ck->key.btree_id], new_u64s);
ret = -ENOMEM;
goto err;
}
@@ -254,9 +263,9 @@ static int btree_key_cache_fill(struct btree_trans *trans,
bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
/* We're not likely to need this iterator again: */
- set_btree_iter_dontneed(&iter);
+ path->preserve = false;
err:
- bch2_trans_iter_exit(trans, &iter);
+ bch2_path_put(trans, path, 0);
return ret;
}
@@ -293,15 +302,14 @@ retry:
return 0;
}
- ck = btree_key_cache_create(&c->btree_key_cache,
- path->btree_id, path->pos);
+ ck = btree_key_cache_create(c, path->btree_id, path->pos);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
if (!ck)
goto retry;
- mark_btree_node_locked(path, 0, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
path->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(path, 0);
@@ -312,7 +320,6 @@ retry:
if (!trans->restarted)
goto retry;
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
ret = -EINTR;
goto err;
}
@@ -323,7 +330,7 @@ retry:
goto retry;
}
- mark_btree_node_locked(path, 0, lock_want);
+ mark_btree_node_locked(trans, path, 0, lock_want);
}
path->l[0].lock_seq = ck->c.lock.state.seq;
@@ -332,7 +339,7 @@ fill:
if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
if (!path->locks_want &&
!__bch2_btree_path_upgrade(trans, path, 1)) {
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
+ trace_transaction_restart_ip(trans->fn, _THIS_IP_);
ret = btree_trans_restart(trans);
goto err;
}
@@ -378,21 +385,27 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_CACHED_NOCREATE|
BTREE_ITER_INTENT);
+ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
ret = bch2_btree_iter_traverse(&c_iter);
if (ret)
goto out;
ck = (void *) c_iter.path->l[0].b;
- if (!ck ||
- (journal_seq && ck->journal.seq != journal_seq))
+ if (!ck)
goto out;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- if (!evict)
- goto out;
- goto evict;
+ if (evict)
+ goto evict;
+ goto out;
}
+ BUG_ON(!ck->valid);
+
+ if (journal_seq && ck->journal.seq != journal_seq)
+ goto out;
+
/*
* Since journal reclaim depends on us making progress here, and the
* allocator/copygc depend on journal reclaim making progress, we need
@@ -400,6 +413,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
*/
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_KEY_CACHE_RECLAIM|
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
@@ -407,7 +421,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
(ck->journal.seq == journal_last_seq(j)
- ? BTREE_INSERT_JOURNAL_RESERVED
+ ? JOURNAL_WATERMARK_reserved
: 0)|
commit_flags);
if (ret) {
@@ -541,14 +555,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
return true;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
- enum btree_id id, struct bpos pos)
-{
- BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
-}
-#endif
-
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index 0768ef3ca776..fd29c14c5626 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 4096 + (nr_keys * 3) / 4;
- return nr_dirty > max_dirty &&
- test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+ return nr_dirty > max_dirty;
}
int bch2_btree_key_cache_journal_flush(struct journal *,
@@ -33,14 +32,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *,
struct btree_path *, struct bkey_i *);
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *,
- enum btree_id, struct bpos);
-#else
-static inline void
-bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
- enum btree_id id, struct bpos pos) {}
-#endif
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index d599008c5fc1..67c970d727ac 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -58,7 +58,8 @@ static inline void mark_btree_node_unlocked(struct btree_path *path,
path->nodes_intent_locked &= ~(1 << level);
}
-static inline void mark_btree_node_locked(struct btree_path *path,
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+ struct btree_path *path,
unsigned level,
enum six_lock_type type)
{
@@ -66,14 +67,17 @@ static inline void mark_btree_node_locked(struct btree_path *path,
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
+ BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx);
+
path->nodes_locked |= 1 << level;
path->nodes_intent_locked |= type << level;
}
-static inline void mark_btree_node_intent_locked(struct btree_path *path,
+static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
+ struct btree_path *path,
unsigned level)
{
- mark_btree_node_locked(path, level, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, level, SIX_LOCK_intent);
}
static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
@@ -128,23 +132,35 @@ static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
}
}
-/*
- * wrapper around six locks that just traces lock contended time
- */
-static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
- enum six_lock_type type)
+static inline bool btree_node_lock_type(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct bpos pos, unsigned level,
+ enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- u64 start_time = local_clock();
+ struct bch_fs *c = trans->c;
+ u64 start_time;
+ bool ret;
- six_lock_type(&b->c.lock, type, NULL, NULL);
- bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
-}
+ if (six_trylock_type(&b->c.lock, type))
+ return true;
-static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
- enum six_lock_type type)
-{
- if (!six_trylock_type(&b->c.lock, type))
- __btree_node_lock_type(c, b, type);
+ start_time = local_clock();
+
+ trans->locking_path_idx = path->idx;
+ trans->locking_pos = pos;
+ trans->locking_btree_id = path->btree_id;
+ trans->locking_level = level;
+ trans->locking_lock_type = type;
+ trans->locking = b;
+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
+ trans->locking = NULL;
+
+ if (ret)
+ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+
+ return ret;
}
/*
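A minimal standalone sketch of the locking pattern btree_node_lock_type() now follows: try a non-blocking acquire first, and publish what we are about to block on only on the slow path, so a state dump (cf. bch2_btree_trans_to_text()) can report it. A pthread mutex stands in for a six lock here; everything in this sketch is illustrative.

#include <pthread.h>
#include <stdio.h>

struct waiter {
	const void	*blocked_on;
};

static void lock_recording(struct waiter *w, pthread_mutex_t *lock)
{
	if (pthread_mutex_trylock(lock) == 0)
		return;				/* fast path: no bookkeeping */

	w->blocked_on = lock;			/* visible to whoever dumps state */
	pthread_mutex_lock(lock);
	w->blocked_on = NULL;
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	struct waiter w = { 0 };

	lock_recording(&w, &m);
	printf("held; blocked_on=%p\n", (void *) w.blocked_on);
	pthread_mutex_unlock(&m);
	return 0;
}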
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index c84bba7bcda5..3438e089dba0 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -8,6 +8,7 @@
#include "bkey_methods.h"
#include "buckets_types.h"
+#include "darray.h"
#include "journal_types.h"
struct open_bucket;
@@ -152,7 +153,8 @@ struct btree_cache {
struct mutex lock;
struct list_head live;
struct list_head freeable;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
/* Number of elements in live + freeable lists */
unsigned used;
@@ -202,15 +204,16 @@ struct btree_node_iter {
*/
#define BTREE_ITER_IS_EXTENTS (1 << 4)
#define BTREE_ITER_NOT_EXTENTS (1 << 5)
-#define BTREE_ITER_ERROR (1 << 6)
-#define BTREE_ITER_CACHED (1 << 7)
-#define BTREE_ITER_CACHED_NOFILL (1 << 8)
-#define BTREE_ITER_CACHED_NOCREATE (1 << 9)
+#define BTREE_ITER_CACHED (1 << 6)
+#define BTREE_ITER_CACHED_NOFILL (1 << 7)
+#define BTREE_ITER_CACHED_NOCREATE (1 << 8)
+#define BTREE_ITER_WITH_KEY_CACHE (1 << 9)
#define BTREE_ITER_WITH_UPDATES (1 << 10)
-#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11)
-#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
-#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13)
-#define BTREE_ITER_NOPRESERVE (1 << 14)
+#define BTREE_ITER_WITH_JOURNAL (1 << 11)
+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13)
+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14)
+#define BTREE_ITER_NOPRESERVE (1 << 15)
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -275,6 +278,8 @@ static inline struct btree_path_level *path_l(struct btree_path *path)
struct btree_iter {
struct btree_trans *trans;
struct btree_path *path;
+ struct btree_path *update_path;
+ struct btree_path *key_cache_path;
enum btree_id btree_id:4;
unsigned min_depth:4;
@@ -322,7 +327,7 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
- u8 u64s;
+ u16 u64s;
bool valid;
u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
@@ -340,12 +345,20 @@ struct btree_insert_entry {
unsigned flags;
u8 bkey_type;
enum btree_id btree_id:8;
- u8 level;
+ u8 level:4;
bool cached:1;
bool insert_trigger_run:1;
bool overwrite_trigger_run:1;
+ /*
+ * @old_k may be a key from the journal; @old_btree_u64s always refers
+ * to the size of the key being overwritten in the btree:
+ */
+ u8 old_btree_u64s;
struct bkey_i *k;
struct btree_path *path;
+ /* key being overwritten: */
+ struct bkey old_k;
+ const struct bch_val *old_v;
unsigned long ip_allocated;
};
@@ -367,21 +380,26 @@ struct btree_trans_commit_hook {
struct btree_trans {
struct bch_fs *c;
+ const char *fn;
struct list_head list;
struct btree *locking;
unsigned locking_path_idx;
struct bpos locking_pos;
u8 locking_btree_id;
u8 locking_level;
+ u8 locking_lock_type;
pid_t pid;
- unsigned long ip;
int srcu_idx;
u8 nr_sorted;
u8 nr_updates;
+ u8 traverse_all_idx;
bool used_mempool:1;
bool in_traverse_all:1;
bool restarted:1;
+ bool memory_allocation_failure:1;
+ bool journal_transaction_names:1;
+ bool is_initial_gc:1;
/*
* For when bch2_trans_update notices we'll be splitting a compressed
* extent:
@@ -400,8 +418,7 @@ struct btree_trans {
/* update path: */
struct btree_trans_commit_hook *hooks;
- struct jset_entry *extra_journal_entries;
- unsigned extra_journal_entry_u64s;
+ DARRAY(u64) extra_journal_entries;
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
@@ -414,7 +431,31 @@ struct btree_trans {
struct replicas_delta_list *fs_usage_deltas;
};
-#define BTREE_FLAG(flag) \
+#define BTREE_FLAGS() \
+ x(read_in_flight) \
+ x(read_error) \
+ x(dirty) \
+ x(need_write) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(noevict) \
+ x(write_idx) \
+ x(accessed) \
+ x(write_in_flight) \
+ x(write_in_flight_inner) \
+ x(just_written) \
+ x(dying) \
+ x(fake) \
+ x(need_rewrite) \
+ x(never_write)
+
+enum btree_flags {
+#define x(flag) BTREE_NODE_##flag,
+ BTREE_FLAGS()
+#undef x
+};
+
+#define x(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
@@ -424,36 +465,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
static inline void clear_btree_node_ ## flag(struct btree *b) \
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-enum btree_flags {
- BTREE_NODE_read_in_flight,
- BTREE_NODE_read_error,
- BTREE_NODE_dirty,
- BTREE_NODE_need_write,
- BTREE_NODE_noevict,
- BTREE_NODE_write_idx,
- BTREE_NODE_accessed,
- BTREE_NODE_write_in_flight,
- BTREE_NODE_write_in_flight_inner,
- BTREE_NODE_just_written,
- BTREE_NODE_dying,
- BTREE_NODE_fake,
- BTREE_NODE_need_rewrite,
- BTREE_NODE_never_write,
-};
-
-BTREE_FLAG(read_in_flight);
-BTREE_FLAG(read_error);
-BTREE_FLAG(need_write);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(write_in_flight_inner);
-BTREE_FLAG(just_written);
-BTREE_FLAG(dying);
-BTREE_FLAG(fake);
-BTREE_FLAG(need_rewrite);
-BTREE_FLAG(never_write);
+BTREE_FLAGS()
+#undef x
static inline struct btree_write *btree_current_write(struct btree *b)
{
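A self-contained sketch of the x-macro idiom the BTREE_FLAGS() change above adopts: one flag list generates both the enum and the per-flag accessors, so new flags such as write_blocked and will_make_reachable only need to be added in one place. The flag names below are hypothetical.

#include <stdio.h>

#define NODE_FLAGS()		\
	x(dirty)		\
	x(need_write)		\
	x(write_blocked)

enum demo_node_flags {
#define x(f)	DEMO_NODE_##f,
	NODE_FLAGS()
#undef x
};

#define x(f)							\
static inline int demo_node_##f(unsigned long flags)		\
{ return !!(flags & (1UL << DEMO_NODE_##f)); }
NODE_FLAGS()
#undef x

int main(void)
{
	unsigned long flags = 1UL << DEMO_NODE_need_write;

	printf("need_write=%d dirty=%d\n",
	       demo_node_need_write(flags), demo_node_dirty(flags));
	return 0;
}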
@@ -583,24 +596,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
return __btree_node_type(b->c.level, b->c.btree_id);
}
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
- switch (type) {
- case BKEY_TYPE_extents:
- case BKEY_TYPE_reflink:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool btree_node_is_extents(struct btree *b)
-{
- return btree_node_type_is_extents(btree_node_type(b));
-}
-
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
((1U << BKEY_TYPE_extents)| \
+ (1U << BKEY_TYPE_alloc)| \
(1U << BKEY_TYPE_inodes)| \
(1U << BKEY_TYPE_stripes)| \
(1U << BKEY_TYPE_reflink)| \
@@ -616,6 +614,16 @@ static inline bool btree_node_is_extents(struct btree *b)
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+#define BTREE_ID_IS_EXTENTS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_reflink)| \
+ (1U << BTREE_ID_freespace))
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+ return (1U << type) & BTREE_ID_IS_EXTENTS;
+}
+
#define BTREE_ID_HAS_SNAPSHOTS \
((1U << BTREE_ID_extents)| \
(1U << BTREE_ID_inodes)| \
@@ -633,6 +641,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ __BTREE_UPDATE_KEY_CACHE_RECLAIM,
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
@@ -645,6 +654,7 @@ enum btree_update_flags {
};
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
@@ -659,6 +669,7 @@ enum btree_update_flags {
((1U << KEY_TYPE_alloc)| \
(1U << KEY_TYPE_alloc_v2)| \
(1U << KEY_TYPE_alloc_v3)| \
+ (1U << KEY_TYPE_alloc_v4)| \
(1U << KEY_TYPE_stripe)| \
(1U << KEY_TYPE_inode)| \
(1U << KEY_TYPE_inode_v2)| \
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 89f07e58f61b..ad13b0739a68 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -16,12 +16,12 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
- __BTREE_INSERT_NOFAIL,
+ /* First two bits for journal watermark: */
+ __BTREE_INSERT_NOFAIL = 2,
__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_LAZY_RW,
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_JOURNAL_RECLAIM,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
@@ -41,9 +41,6 @@ enum btree_insert_flags {
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
-/* Indicates that we have pre-reserved space in the journal: */
-#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
-
/* Insert is being called from journal reclaim path: */
#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM)
@@ -63,7 +60,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
- struct bpos, struct bpos, u64 *);
+ struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
struct btree *, unsigned);
@@ -73,12 +70,18 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
int bch2_btree_node_update_key_get_iter(struct btree_trans *,
struct btree *, struct bkey_i *, bool);
+int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_update_flags);
+
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
+
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *);
+int bch2_trans_log_msg(struct btree_trans *, const char *);
+
/**
* bch2_trans_commit - insert keys at given iterator positions
*
@@ -135,21 +138,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i)
- if ((cmp_int(btree_id, i->btree_id) ?:
- bpos_cmp(pos, i->k->k.p)) <= 0) {
- if (btree_id == i->btree_id)
- return i->k;
- break;
- }
-
- return NULL;
-}
-
#endif /* _BCACHEFS_BTREE_UPDATE_H */
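A small standalone sketch of the flag layout implied above: the low two bits of the insert flags word now carry a journal watermark, so __BTREE_INSERT_NOFAIL starts at bit 2 and a watermark value such as JOURNAL_WATERMARK_reserved can be OR'd straight into commit flags. The DEMO_* values are assumptions for illustration; the real JOURNAL_WATERMARK_* constants are defined elsewhere.

#include <stdio.h>

#define DEMO_WATERMARK_MASK	3U		/* stand-in for JOURNAL_WATERMARK_MASK */
#define DEMO_WATERMARK_reserved	2U		/* assumed value, for illustration only */
#define DEMO_INSERT_NOFAIL	(1U << 2)	/* first "real" flag starts above the mask */

int main(void)
{
	unsigned flags = DEMO_INSERT_NOFAIL | DEMO_WATERMARK_reserved;

	printf("watermark=%u nofail=%u\n",
	       flags & DEMO_WATERMARK_MASK,
	       !!(flags & DEMO_INSERT_NOFAIL));
	return 0;
}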
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 6872e56b5c41..42ae3b0c5839 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -16,6 +16,7 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
@@ -40,11 +41,11 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
struct bkey_s_c k;
struct bkey_s_c_btree_ptr_v2 bp;
struct bkey unpacked;
- char buf1[100], buf2[100];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
BUG_ON(!b->c.level);
- if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
return;
bch2_btree_node_iter_init_from_start(&iter, b);
@@ -57,9 +58,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
if (bpos_cmp(next_node, bp.v->min_key)) {
bch2_dump_btree_node(c, b);
- panic("expected next min_key %s got %s\n",
- (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
+ bch2_bpos_to_text(&buf1, next_node);
+ bch2_bpos_to_text(&buf2, bp.v->min_key);
+ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
}
bch2_btree_node_iter_advance(&iter, b);
@@ -67,9 +68,9 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
if (bch2_btree_node_iter_end(&iter)) {
if (bpos_cmp(k.k->p, b->key.k.p)) {
bch2_dump_btree_node(c, b);
- panic("expected end %s got %s\n",
- (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
+ bch2_bpos_to_text(&buf1, b->key.k.p);
+ bch2_bpos_to_text(&buf2, k.k->p);
+ panic("expected end %s got %s\n", buf1.buf, buf2.buf);
}
break;
}
@@ -180,6 +181,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
struct disk_reservation *res,
struct closure *cl,
+ bool interior_node,
unsigned flags)
{
struct write_point *wp;
@@ -192,10 +194,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
if (flags & BTREE_INSERT_USE_RESERVE) {
nr_reserve = 0;
- alloc_reserve = RESERVE_BTREE_MOVINGGC;
+ alloc_reserve = RESERVE_btree_movinggc;
} else {
nr_reserve = BTREE_NODE_RESERVE;
- alloc_reserve = RESERVE_BTREE;
+ alloc_reserve = RESERVE_btree;
}
mutex_lock(&c->btree_reserve_cache_lock);
@@ -241,7 +243,9 @@ retry:
bch2_open_bucket_get(c, wp, &ob);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, interior_node);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
@@ -257,15 +261,19 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
{
struct bch_fs *c = as->c;
struct btree *b;
+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!as->nr_prealloc_nodes);
+ BUG_ON(!p->nr);
+
+ b = p->b[--p->nr];
- b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
set_btree_node_accessed(b);
- set_btree_node_dirty(c, b);
+ set_btree_node_dirty_acct(c, b);
set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
@@ -371,70 +379,94 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
static void bch2_btree_reserve_put(struct btree_update *as)
{
struct bch_fs *c = as->c;
+ struct prealloc_nodes *p;
mutex_lock(&c->btree_reserve_cache_lock);
- while (as->nr_prealloc_nodes) {
- struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ for (p = as->prealloc_nodes;
+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+ p++) {
+ while (p->nr) {
+ struct btree *b = p->b[--p->nr];
- six_unlock_write(&b->c.lock);
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_open_buckets_put(c, &b->ob);
- }
-
- btree_node_lock_type(c, b, SIX_LOCK_write);
- __btree_node_free(c, b);
- six_unlock_write(&b->c.lock);
+ a->ob = b->ob;
+ b->ob.nr = 0;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch2_open_buckets_put(c, &b->ob);
+ }
- six_unlock_intent(&b->c.lock);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ }
}
mutex_unlock(&c->btree_reserve_cache_lock);
}
-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
- unsigned flags, struct closure *cl)
+static int bch2_btree_reserve_get(struct btree_update *as,
+ unsigned nr_nodes[2],
+ unsigned flags)
{
struct bch_fs *c = as->c;
+ struct closure cl;
struct btree *b;
+ unsigned interior;
int ret;
- BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+ closure_init_stack(&cl);
+retry:
+
+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
+ *
+ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+ * blocking on this lock:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret)
- return ret;
+ goto err;
- while (as->nr_prealloc_nodes < nr_nodes) {
- b = __bch2_btree_node_alloc(c, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT
- ? NULL : cl, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err_free;
- }
+ for (interior = 0; interior < 2; interior++) {
+ struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+ while (p->nr < nr_nodes[interior]) {
+ b = __bch2_btree_node_alloc(c, &as->disk_res,
+ flags & BTREE_INSERT_NOWAIT
+ ? NULL : &cl,
+ interior, flags);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
- as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
+ p->b[p->nr++] = b;
+ }
}
bch2_btree_cache_cannibalize_unlock(c);
+ closure_sync(&cl);
return 0;
-err_free:
+err:
bch2_btree_cache_cannibalize_unlock(c);
- trace_btree_reserve_get_fail(c, nr_nodes, cl);
+ closure_sync(&cl);
+
+ if (ret == -EAGAIN)
+ goto retry;
+
+ trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], &cl);
return ret;
}
@@ -500,24 +532,25 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
struct bkey_i *k;
int ret;
- trans->extra_journal_entries = (void *) &as->journal_entries[0];
- trans->extra_journal_entry_u64s = as->journal_u64s;
+ ret = darray_make_room(trans->extra_journal_entries, as->journal_u64s);
+ if (ret)
+ return ret;
+
+ memcpy(&darray_top(trans->extra_journal_entries),
+ as->journal_entries,
+ as->journal_u64s * sizeof(u64));
+ trans->extra_journal_entries.nr += as->journal_u64s;
+
trans->journal_pin = &as->journal;
for_each_keylist_key(&as->new_keys, k) {
- ret = bch2_trans_mark_key(trans,
- bkey_s_c_null,
- bkey_i_to_s_c(k),
- BTREE_TRIGGER_INSERT);
+ ret = bch2_trans_mark_new(trans, k, 0);
if (ret)
return ret;
}
for_each_keylist_key(&as->old_keys, k) {
- ret = bch2_trans_mark_key(trans,
- bkey_i_to_s_c(k),
- bkey_s_c_null,
- BTREE_TRIGGER_OVERWRITE);
+ ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0);
if (ret)
return ret;
}
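extra_journal_entries is now a DARRAY(u64), appended to with darray_make_room() plus a copy at darray_top() and a bump of .nr, as in btree_update_nodes_written_trans() above. darray.h is not shown in this diff, so the following is only an assumed-shape sketch of that reserve/append pattern, not the real implementation.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct u64_darray {
	unsigned long long	*data;
	size_t			nr, size;
};

/* grow the backing buffer so at least @more elements fit past .nr */
static int u64_darray_make_room(struct u64_darray *d, size_t more)
{
	if (d->nr + more > d->size) {
		size_t new_size = d->size ? d->size * 2 : 8;
		unsigned long long *p;

		while (new_size < d->nr + more)
			new_size *= 2;

		p = realloc(d->data, new_size * sizeof(*p));
		if (!p)
			return -1;
		d->data = p;
		d->size = new_size;
	}
	return 0;
}

int main(void)
{
	struct u64_darray d = { 0 };
	unsigned long long entries[3] = { 1, 2, 3 };

	if (u64_darray_make_room(&d, 3))
		return 1;

	memcpy(d.data + d.nr, entries, sizeof(entries));	/* analogue of the darray_top() copy */
	d.nr += 3;

	printf("nr=%zu last=%llu\n", d.nr, d.data[d.nr - 1]);
	free(d.data);
	return 0;
}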
@@ -545,8 +578,6 @@ static void btree_update_nodes_written(struct btree_update *as)
if (ret)
goto err;
- BUG_ON(!journal_pin_active(&as->journal));
-
/*
* Wait for any in flight writes to finish before we free the old nodes
* on disk:
@@ -582,7 +613,7 @@ static void btree_update_nodes_written(struct btree_update *as)
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED,
+ JOURNAL_WATERMARK_reserved,
btree_update_nodes_written_trans(&trans, as));
bch2_trans_exit(&trans);
@@ -602,11 +633,13 @@ err:
* we're in journal error state:
*/
- btree_node_lock_type(c, b, SIX_LOCK_intent);
- btree_node_lock_type(c, b, SIX_LOCK_write);
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->write_blocked_list);
+ if (list_empty(&b->write_blocked))
+ clear_btree_node_write_blocked(b);
/*
* Node might have been freed, recheck under
@@ -651,13 +684,14 @@ err:
BUG_ON(b->will_make_reachable != (unsigned long) as);
b->will_make_reachable = 0;
+ clear_btree_node_will_make_reachable(b);
}
mutex_unlock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
- btree_node_lock_type(c, b, SIX_LOCK_read);
+ six_lock_read(&b->c.lock, NULL, NULL);
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
@@ -717,6 +751,8 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
as->mode = BTREE_INTERIOR_UPDATING_NODE;
as->b = b;
+
+ set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
@@ -782,6 +818,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree
as->new_nodes[as->nr_new_nodes++] = b;
b->will_make_reachable = 1UL|(unsigned long) as;
+ set_btree_node_will_make_reachable(b);
mutex_unlock(&c->btree_interior_update_lock);
@@ -804,6 +841,7 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
* xchg() is for synchronization with bch2_btree_complete_write:
*/
v = xchg(&b->will_make_reachable, 0);
+ clear_btree_node_will_make_reachable(b);
as = (struct btree_update *) (v & ~1UL);
if (!as) {
@@ -869,7 +907,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
closure_wake_up(&c->btree_interior_update_wait);
}
- clear_btree_node_dirty(c, b);
+ clear_btree_node_dirty_acct(c, b);
clear_btree_node_need_write(b);
/*
@@ -930,31 +968,43 @@ static void bch2_btree_update_done(struct btree_update *as)
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, unsigned nr_nodes, unsigned flags)
+ unsigned level, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
- struct closure cl;
u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
- int journal_flags = 0;
+ unsigned nr_nodes[2] = { 0, 0 };
+ unsigned update_level = level;
+ int journal_flags = flags & JOURNAL_WATERMARK_MASK;
int ret = 0;
BUG_ON(!path->should_be_locked);
- if (flags & BTREE_INSERT_JOURNAL_RESERVED)
- journal_flags |= JOURNAL_RES_GET_RESERVED;
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+ journal_flags |= JOURNAL_RES_GET_NONBLOCK;
- closure_init_stack(&cl);
-retry:
+ while (1) {
+ nr_nodes[!!update_level] += 1 + split;
+ update_level++;
+
+ if (!btree_path_node(path, update_level))
+ break;
+
+ /*
+ * XXX: figure out how far we might need to split,
+ * instead of locking/reserving all the way to the root:
+ */
+ split = update_level + 1 < BTREE_MAX_DEPTH;
+ }
+
+ /* Might have to allocate a new root: */
+ if (update_level < BTREE_MAX_DEPTH)
+ nr_nodes[1] += 1;
- /*
- * XXX: figure out how far we might need to split,
- * instead of locking/reserving all the way to the root:
- */
if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
- trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_,
+ trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
path->btree_id, &path->pos);
ret = btree_trans_restart(trans);
return ERR_PTR(ret);
@@ -1002,60 +1052,37 @@ retry:
if (ret)
goto err;
+ bch2_trans_unlock(trans);
+
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
- journal_flags|JOURNAL_RES_GET_NONBLOCK);
- if (ret == -EAGAIN) {
- bch2_trans_unlock(trans);
-
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
- bch2_btree_update_free(as);
- btree_trans_restart(trans);
- return ERR_PTR(ret);
- }
-
- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags);
- if (ret) {
- trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_);
- goto err;
- }
-
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
- goto err;
- }
+ journal_flags);
+ if (ret) {
+ bch2_btree_update_free(as);
+ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
+ btree_trans_restart(trans);
+ return ERR_PTR(ret);
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
- nr_nodes * btree_sectors(c),
+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
disk_res_flags);
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
+ ret = bch2_btree_reserve_get(as, nr_nodes, flags);
if (ret)
goto err;
- bch2_journal_pin_add(&c->journal,
- atomic64_read(&c->journal.seq),
- &as->journal, NULL);
+ if (!bch2_trans_relock(trans)) {
+ ret = -EINTR;
+ goto err;
+ }
return as;
err:
bch2_btree_update_free(as);
-
- if (ret == -EAGAIN) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- ret = -EINTR;
- }
-
- if (ret == -EINTR && bch2_trans_relock(trans))
- goto retry;
-
return ERR_PTR(ret);
}
@@ -1105,8 +1132,7 @@ static void bch2_btree_set_root(struct btree_update *as,
struct btree *old;
trace_btree_set_root(c, b);
- BUG_ON(!b->written &&
- !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
+ BUG_ON(!b->written);
old = btree_node_root(c, b);
@@ -1146,13 +1172,17 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
if (invalid) {
- char buf[160];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
- bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf.buf, invalid);
+ printbuf_exit(&buf);
dump_stack();
}
@@ -1170,7 +1200,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
bch2_btree_node_iter_advance(node_iter, b);
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
- set_btree_node_dirty(c, b);
+ set_btree_node_dirty_acct(c, b);
set_btree_node_need_write(b);
}
@@ -1391,8 +1421,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent);
- bch2_btree_node_write(c, n2, SIX_LOCK_intent);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
/*
* Note that on recursive parent_keys == keys, so we
@@ -1411,7 +1441,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
- bch2_btree_node_write(c, n3, SIX_LOCK_intent);
+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
}
} else {
trace_btree_compact(c, b);
@@ -1419,7 +1449,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->c.lock);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
@@ -1556,14 +1586,13 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
struct btree_path *path,
unsigned flags)
{
- struct bch_fs *c = trans->c;
struct btree *b = path_l(path)->b;
struct btree_update *as;
unsigned l;
int ret = 0;
as = bch2_btree_update_start(trans, path, path->level,
- btree_update_reserve_required(c, b), flags);
+ true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -1634,15 +1663,17 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
}
if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) {
- char buf1[100], buf2[100];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
- bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
+ bch2_bpos_to_text(&buf1, prev->data->max_key);
+ bch2_bpos_to_text(&buf2, next->data->min_key);
bch_err(c,
"btree topology error in btree merge:\n"
" prev ends at %s\n"
" next starts at %s",
- buf1, buf2);
+ buf1.buf, buf2.buf);
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
bch2_topology_error(c);
ret = -EIO;
goto err;
@@ -1672,11 +1703,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
goto out;
parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, level,
- btree_update_reserve_required(c, parent) + 1,
- flags|
+ as = bch2_btree_update_start(trans, path, level, false,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
+ BTREE_INSERT_USE_RESERVE|
+ flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
@@ -1689,6 +1719,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
n = bch2_btree_node_alloc(as, b->c.level);
bch2_btree_update_add_new_node(as, n);
+ SET_BTREE_NODE_SEQ(n->data,
+ max(BTREE_NODE_SEQ(b->data),
+ BTREE_NODE_SEQ(m->data)) + 1);
+
btree_set_min(n, prev->data->min_key);
btree_set_max(n, next->data->max_key);
n->data->format = new_f;
@@ -1701,7 +1735,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->c.lock);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
@@ -1755,10 +1789,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
parent = btree_node_parent(iter->path, b);
as = bch2_btree_update_start(trans, iter->path, b->c.level,
- (parent
- ? btree_update_reserve_required(c, parent)
- : 0) + 1,
- flags);
+ false, flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret) {
trace_btree_gc_rewrite_node_fail(c, b);
@@ -1775,7 +1806,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
trace_btree_gc_rewrite_node(c, b);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
@@ -1847,9 +1878,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
struct async_btree_rewrite *a;
- if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
- return;
-
if (!percpu_ref_tryget(&c->writes))
return;
@@ -1878,21 +1906,14 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter2 = { NULL };
struct btree *parent;
- u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
int ret;
if (!skip_triggers) {
- ret = bch2_trans_mark_key(trans,
- bkey_s_c_null,
- bkey_i_to_s_c(new_key),
- BTREE_TRIGGER_INSERT);
+ ret = bch2_trans_mark_new(trans, new_key, 0);
if (ret)
return ret;
- ret = bch2_trans_mark_key(trans,
- bkey_i_to_s_c(&b->key),
- bkey_s_c_null,
- BTREE_TRIGGER_OVERWRITE);
+ ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0);
if (ret)
return ret;
}
@@ -1918,6 +1939,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
btree_node_unlock(iter2.path, iter2.path->level);
path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
iter2.path->level++;
+ btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE);
+
+ bch2_btree_path_check_sort(trans, iter2.path, 0);
ret = bch2_btree_iter_traverse(&iter2) ?:
bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
@@ -1926,19 +1950,24 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
} else {
BUG_ON(btree_node_root(c, b) != b);
- trans->extra_journal_entries = (void *) &journal_entries[0];
- trans->extra_journal_entry_u64s =
- journal_entry_set((void *) &journal_entries[0],
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- new_key, new_key->k.u64s);
+ ret = darray_make_room(trans->extra_journal_entries,
+ jset_u64s(new_key->k.u64s));
+ if (ret)
+ return ret;
+
+ journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ new_key, new_key->k.u64s);
+ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
}
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED);
+ JOURNAL_WATERMARK_reserved);
if (ret)
goto err;
@@ -2001,7 +2030,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
return -EINTR;
}
- new_hash = bch2_btree_node_mem_alloc(c);
+ new_hash = bch2_btree_node_mem_alloc(c, false);
}
path->intent_ref++;
@@ -2077,7 +2106,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, false);
bch2_btree_cache_cannibalize_unlock(c);
set_btree_node_fake(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 8cf59cee6e4e..e72eb8795616 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -76,18 +76,20 @@ struct btree_update {
struct journal_entry_pin journal;
/* Preallocated nodes we reserve when we start the update: */
- struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
- unsigned nr_prealloc_nodes;
+ struct prealloc_nodes {
+ struct btree *b[BTREE_UPDATE_NODES_MAX];
+ unsigned nr;
+ } prealloc_nodes[2];
/* Nodes being freed: */
struct keylist old_keys;
u64 _old_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_VAL_U64s_MAX];
+ BKEY_BTREE_PTR_U64s_MAX];
/* Nodes being added: */
struct keylist new_keys;
u64 _new_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_VAL_U64s_MAX];
+ BKEY_BTREE_PTR_U64s_MAX];
/* New nodes, that will be made reachable by this update: */
struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 1966441b1a62..a0480c63dd81 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -15,6 +15,7 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery.h"
#include "subvolume.h"
#include "replicas.h"
@@ -22,6 +23,10 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, enum btree_update_flags);
+
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
@@ -162,10 +167,24 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
+ unsigned long old, new, v;
+ unsigned idx = w - b->writes;
+
+ six_lock_read(&b->c.lock, NULL, NULL);
+ v = READ_ONCE(b->flags);
+
+ do {
+ old = new = v;
+
+ if (!(old & (1 << BTREE_NODE_dirty)) ||
+ !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+ w->journal.seq != seq)
+ break;
+
+ new |= 1 << BTREE_NODE_need_write;
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
- btree_node_lock_type(c, b, SIX_LOCK_read);
- bch2_btree_node_write_cond(c, b,
- (btree_current_write(b) == w && w->journal.seq == seq));
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
return 0;
}
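A standalone C11 sketch of the compare-and-swap loop __btree_node_flush() now uses: BTREE_NODE_need_write is only set while the node is still dirty and this is still the write index being flushed. Bit numbers and names below are illustrative.

#include <stdatomic.h>
#include <stdio.h>

#define DEMO_dirty	0
#define DEMO_write_idx	1
#define DEMO_need_write	2

static void maybe_set_need_write(_Atomic unsigned long *flags, unsigned idx)
{
	unsigned long old = atomic_load(flags), new_flags;

	do {
		/* bail out if the node was cleaned or the write index moved on */
		if (!(old & (1UL << DEMO_dirty)) ||
		    !!(old & (1UL << DEMO_write_idx)) != idx)
			return;
		new_flags = old | (1UL << DEMO_need_write);
	} while (!atomic_compare_exchange_weak(flags, &old, new_flags));
}

int main(void)
{
	_Atomic unsigned long flags = 1UL << DEMO_dirty;

	maybe_set_need_write(&flags, 0);
	printf("flags=%#lx\n", (unsigned long) atomic_load(&flags));
	return 0;
}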
@@ -194,7 +213,7 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
/**
* btree_insert_key - insert one key into a leaf node
*/
-static bool btree_insert_key_leaf(struct btree_trans *trans,
+static void btree_insert_key_leaf(struct btree_trans *trans,
struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
@@ -205,12 +224,9 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- EBUG_ON(!insert->level &&
- !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
-
if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
&insert_l(insert)->iter, insert->k)))
- return false;
+ return;
i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
le64_to_cpu(i->journal_seq)));
@@ -218,7 +234,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
if (unlikely(!btree_node_dirty(b)))
- set_btree_node_dirty(c, b);
+ set_btree_node_dirty_acct(c, b);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -231,8 +247,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
-
- return true;
}
/* Cached btree updates: */
@@ -268,7 +282,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
return ret;
if (!bch2_trans_relock(trans)) {
- trace_trans_restart_journal_preres_get(trans->ip, trace_ip);
+ trace_trans_restart_journal_preres_get(trans->fn, trace_ip);
return -EINTR;
}
@@ -281,15 +295,40 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
struct bch_fs *c = trans->c;
int ret;
- if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
- flags |= JOURNAL_RES_GET_RESERVED;
-
ret = bch2_journal_res_get(&c->journal, &trans->journal_res,
- trans->journal_u64s, flags);
+ trans->journal_u64s,
+ flags|
+ (trans->flags & JOURNAL_WATERMARK_MASK));
return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
}
+#define JSET_ENTRY_LOG_U64s 4
+
+static noinline void journal_transaction_name(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res);
+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+ unsigned u64s = JSET_ENTRY_LOG_U64s - 1;
+ unsigned b, buflen = u64s * sizeof(u64);
+
+ l->entry.u64s = cpu_to_le16(u64s);
+ l->entry.btree_id = 0;
+ l->entry.level = 0;
+ l->entry.type = BCH_JSET_ENTRY_log;
+ l->entry.pad[0] = 0;
+ l->entry.pad[1] = 0;
+ l->entry.pad[2] = 0;
+ b = min_t(unsigned, strlen(trans->fn), buflen);
+ memcpy(l->d, trans->fn, b);
+ while (b < buflen)
+ l->d[b++] = '\0';
+
+ trans->journal_res.offset += JSET_ENTRY_LOG_U64s;
+ trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s;
+}
+
static inline enum btree_insert_ret
btree_key_can_insert(struct btree_trans *trans,
struct btree *b,
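A tiny standalone sketch of the truncate-and-NUL-pad step in journal_transaction_name() above: the transaction's function name is copied into JSET_ENTRY_LOG_U64s - 1 whole u64 words and the remainder is zeroed. The buffer size and sample name are illustrative.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[3 * sizeof(unsigned long long)];	/* JSET_ENTRY_LOG_U64s - 1 words */
	const char *fn = "bch2_example_transaction";	/* hypothetical transaction name */
	size_t b = strlen(fn);

	if (b > sizeof(buf))
		b = sizeof(buf);
	memcpy(buf, fn, b);
	memset(buf + b, 0, sizeof(buf) - b);		/* same effect as the while loop above */

	printf("%.*s\n", (int) sizeof(buf), buf);
	return 0;
}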
@@ -308,14 +347,15 @@ btree_key_can_insert_cached(struct btree_trans *trans,
struct btree_path *path,
unsigned u64s)
{
+ struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) path->l[0].b;
- unsigned new_u64s;
+ unsigned old_u64s = ck->u64s, new_u64s;
struct bkey_i *new_k;
EBUG_ON(path->level);
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(trans->c) &&
+ bch2_btree_key_cache_must_wait(c) &&
!(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
@@ -330,12 +370,27 @@ btree_key_can_insert_cached(struct btree_trans *trans,
new_u64s = roundup_pow_of_two(u64s);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
- if (!new_k)
+ if (!new_k) {
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_ids[path->btree_id], new_u64s);
return -ENOMEM;
+ }
ck->u64s = new_u64s;
ck->k = new_k;
- return BTREE_INSERT_OK;
+ /*
+ * Keys returned by peek() are no longer valid pointers, so we need a
+ * transaction restart:
+ */
+ trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_,
+ path->btree_id, &path->pos,
+ old_u64s, new_u64s);
+ /*
+ * Not using btree_trans_restart() because we can't unlock here; we have
+ * write locks held:
+ */
+ trans->restarted = true;
+ return -EINTR;
}
static inline void do_btree_insert_one(struct btree_trans *trans,
@@ -343,18 +398,16 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
- bool did_work;
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
i->k->k.needs_whiteout = false;
- did_work = !i->cached
- ? btree_insert_key_leaf(trans, i)
- : bch2_btree_insert_key_cached(trans, i->path, i->k);
- if (!did_work)
- return;
+ if (!i->cached)
+ btree_insert_key_leaf(trans, i);
+ else
+ bch2_btree_insert_key_cached(trans, i->path, i->k);
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
bch2_journal_add_keys(j, &trans->journal_res,
@@ -367,10 +420,163 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
}
}
-static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+ struct btree_insert_entry *i,
+ unsigned flags)
+{
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+ struct bkey_i *new = i->k;
+ int ret;
+
+ if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ return 0;
+
+ if (!btree_node_type_needs_gc(i->btree_id))
+ return 0;
+
+ if (bch2_bkey_ops[old.k->type].atomic_trigger ==
+ bch2_bkey_ops[i->k->k.type].atomic_trigger &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ } else {
+ struct bkey _deleted = KEY(0, 0, 0);
+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
+
+ _deleted.p = i->path->pos;
+
+ ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|flags) ?:
+ bch2_mark_key(trans, old, deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+ }
+
+ return ret;
+}
+
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+ bool overwrite)
+{
+ /*
+ * Transactional triggers create new btree_insert_entries, so we can't
+ * pass them a pointer to a btree_insert_entry; that memory is going to
+ * move:
+ */
+ struct bkey old_k = i->old_k;
+ struct bkey_s_c old = { &old_k, i->old_v };
+
+ if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ return 0;
+
+ if (!i->insert_trigger_run &&
+ !i->overwrite_trigger_run &&
+ bch2_bkey_ops[old.k->type].trans_trigger ==
+ bch2_bkey_ops[i->k->k.type].trans_trigger &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ i->overwrite_trigger_run = true;
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_key(trans, old, i->k,
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_OVERWRITE|
+ i->flags) ?: 1;
+ } else if (overwrite && !i->overwrite_trigger_run) {
+ i->overwrite_trigger_run = true;
+ return bch2_trans_mark_old(trans, old, i->flags) ?: 1;
+ } else if (!i->insert_trigger_run) {
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_new(trans, i->k, i->flags) ?: 1;
+ } else {
+ return 0;
+ }
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+ struct btree_insert_entry *btree_id_start)
+{
+ struct btree_insert_entry *i;
+ bool trans_trigger_run;
+ int ret, overwrite;
+
+ for (overwrite = 1; overwrite >= 0; --overwrite) {
+
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = btree_id_start;
+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ i++) {
+ if (i->btree_id != btree_id)
+ continue;
+
+ ret = run_one_trans_trigger(trans, i, overwrite);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
+ }
+
+ return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ unsigned btree_id = 0;
+ int ret = 0;
+
+ /*
+ *
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ if (btree_id == BTREE_ID_alloc)
+ continue;
+
+ while (btree_id_start < trans->updates + trans->nr_updates &&
+ btree_id_start->btree_id < btree_id)
+ btree_id_start++;
+
+ ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ if (ret)
+ return ret;
+ }
+
+ trans_for_each_update(trans, i) {
+ if (i->btree_id > BTREE_ID_alloc)
+ break;
+ if (i->btree_id == BTREE_ID_alloc) {
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ if (ret)
+ return ret;
+ break;
+ }
+ }
+
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
+
+ return 0;
+}
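
Both run_btree_triggers() and the loop above rely on the same fixpoint pattern: running a trigger may append new entries to the update array being walked, so passes are repeated until one full pass runs nothing new. A toy standalone version of that pattern; the types and names are illustrative.

#include <stdbool.h>
#include <stdio.h>

struct update { bool trigger_run; };

static struct update updates[16] = { {0}, {0}, {0} };
static unsigned nr_updates = 3;

/* pretend each update's trigger fires once, and the first one spawns a new update */
static int run_trigger(unsigned i)
{
	if (updates[i].trigger_run)
		return 0;
	updates[i].trigger_run = true;
	if (i == 0 && nr_updates < 4)
		updates[nr_updates++] = (struct update) { 0 };
	return 1;				/* "I did work" */
}

int main(void)
{
	bool ran;
	unsigned passes = 0;

	do {
		ran = false;
		for (unsigned i = 0; i < nr_updates; i++)
			if (run_trigger(i))
				ran = true;
		passes++;
	} while (ran);

	printf("%u updates settled after %u passes\n", nr_updates, passes);
	return 0;
}
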
+
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
+ int ret = 0;
trans_for_each_update(trans, i) {
/*
@@ -379,10 +585,14 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
*/
BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b)))
- bch2_mark_update(trans, i->path, i->k,
- i->flags|BTREE_TRIGGER_GC);
+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (ret)
+ break;
+ }
}
+
+ return ret;
}
static inline int
@@ -398,7 +608,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
int ret;
if (race_fault()) {
- trace_trans_restart_fault_inject(trans->ip, trace_ip);
+ trace_trans_restart_fault_inject(trans->fn, trace_ip);
trans->restarted = true;
return -EINTR;
}
@@ -435,6 +645,32 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (btree_node_type_needs_gc(i->bkey_type))
marking = true;
+
+ /*
+ * Revalidate before calling mem triggers - XXX, ugly:
+ *
+ * - successful btree node splits don't cause transaction
+ * restarts and will have invalidated the pointer to the bkey
+ * value
+ * - btree_node_lock_for_insert() -> btree_node_prep_for_write()
+ * when it has to resort
+ * - btree_key_can_insert_cached() when it has to reallocate
+ *
+ * Ugly because we currently have no way to tell if the
+ * pointer's been invalidated, which means it's debatable
+ * whether we should be stashing the old key at all.
+ */
+ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p);
+
+ if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
}
/*
@@ -446,17 +682,20 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
JOURNAL_RES_GET_NONBLOCK);
if (ret)
return ret;
+
+ if (unlikely(trans->journal_transaction_names))
+ journal_transaction_name(trans);
} else {
trans->journal_res.seq = c->journal.replay_journal_seq;
}
- if (unlikely(trans->extra_journal_entry_u64s)) {
+ if (unlikely(trans->extra_journal_entries.nr)) {
memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->extra_journal_entries,
- trans->extra_journal_entry_u64s);
+ trans->extra_journal_entries.data,
+ trans->extra_journal_entries.nr);
- trans->journal_res.offset += trans->extra_journal_entry_u64s;
- trans->journal_res.u64s -= trans->extra_journal_entry_u64s;
+ trans->journal_res.offset += trans->extra_journal_entries.nr;
+ trans->journal_res.u64s -= trans->extra_journal_entries.nr;
}
/*
@@ -478,11 +717,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
return BTREE_INSERT_NEED_MARK_REPLICAS;
trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
- bch2_mark_update(trans, i->path, i->k, i->flags);
+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, i->flags);
+ if (ret)
+ return ret;
+ }
- if (unlikely(c->gc_pos.phase))
- bch2_trans_mark_gc(trans);
+ if (unlikely(c->gc_pos.phase)) {
+ ret = bch2_trans_commit_run_gc_triggers(trans);
+ if (ret)
+ return ret;
+ }
trans_for_each_update(trans, i)
do_btree_insert_one(trans, i);
@@ -572,8 +817,10 @@ static inline int trans_lock_write(struct btree_trans *trans)
if (have_conflicting_read_lock(trans, i->path))
goto fail;
- __btree_node_lock_type(trans->c, insert_l(i)->b,
- SIX_LOCK_write);
+ btree_node_lock_type(trans, i->path,
+ insert_l(i)->b,
+ i->path->pos, i->level,
+ SIX_LOCK_write, NULL, NULL);
}
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
@@ -588,10 +835,18 @@ fail:
bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
}
- trace_trans_restart_would_deadlock_write(trans->ip);
+ trace_trans_restart_would_deadlock_write(trans->fn);
return btree_trans_restart(trans);
}
+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@@ -601,42 +856,29 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- struct bkey_s_c old;
int ret, u64s_delta = 0;
trans_for_each_update(trans, i) {
const char *invalid = bch2_bkey_invalid(c,
bkey_i_to_s_c(i->k), i->bkey_type);
if (invalid) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n",
- buf, (void *) trans->ip,
- (void *) i->ip_allocated, invalid);
- bch2_fatal_error(c);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
+ buf.buf, trans->fn, (void *) i->ip_allocated, invalid);
+ printbuf_exit(&buf);
return -EINVAL;
}
btree_insert_entry_checks(trans, i);
}
trans_for_each_update(trans, i) {
- struct bkey u;
-
- /*
- * peek_slot() doesn't yet work on iterators that point to
- * interior nodes:
- */
- if (i->cached || i->level)
+ if (i->cached)
continue;
- old = bch2_btree_path_peek_slot(i->path, &u);
- ret = bkey_err(old);
- if (unlikely(ret))
- return ret;
-
u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+ u64s_delta -= i->old_btree_u64s;
if (!same_leaf_as_next(trans, i)) {
if (u64s_delta <= 0) {
@@ -653,8 +895,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK|
- ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
- ? JOURNAL_RES_GET_RESERVED : 0));
+ (trans->flags & JOURNAL_WATERMARK_MASK));
if (unlikely(ret == -EAGAIN))
ret = bch2_trans_journal_preres_get_cold(trans,
trans->journal_preres_u64s, trace_ip);
@@ -669,6 +910,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
+ if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ bch2_drop_overwrites_from_journal(trans);
+
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(trans, i->path,
@@ -716,7 +960,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
return 0;
if (ret == -EINTR)
- trace_trans_restart_btree_node_split(trans->ip, trace_ip,
+ trace_trans_restart_btree_node_split(trans->fn, trace_ip,
i->btree_id, &i->path->pos);
break;
case BTREE_INSERT_NEED_MARK_REPLICAS:
@@ -729,14 +973,14 @@ int bch2_trans_commit_error(struct btree_trans *trans,
if (bch2_trans_relock(trans))
return 0;
- trace_trans_restart_mark_replicas(trans->ip, trace_ip);
+ trace_trans_restart_mark_replicas(trans->fn, trace_ip);
ret = -EINTR;
break;
case BTREE_INSERT_NEED_JOURNAL_RES:
bch2_trans_unlock(trans);
if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
- !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) {
+ !(trans->flags & JOURNAL_WATERMARK_reserved)) {
trans->restarted = true;
ret = -EAGAIN;
break;
@@ -749,13 +993,13 @@ int bch2_trans_commit_error(struct btree_trans *trans,
if (bch2_trans_relock(trans))
return 0;
- trace_trans_restart_journal_res_get(trans->ip, trace_ip);
+ trace_trans_restart_journal_res_get(trans->fn, trace_ip);
ret = -EINTR;
break;
case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans);
- trace_trans_blocked_journal_reclaim(trans->ip, trace_ip);
+ trace_trans_blocked_journal_reclaim(trans->fn, trace_ip);
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
@@ -765,7 +1009,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
if (bch2_trans_relock(trans))
return 0;
- trace_trans_restart_journal_reclaim(trans->ip, trace_ip);
+ trace_trans_restart_journal_reclaim(trans->fn, trace_ip);
ret = -EINTR;
break;
default:
@@ -774,7 +1018,9 @@ int bch2_trans_commit_error(struct btree_trans *trans,
}
BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted);
- BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL));
+ BUG_ON(ret == -ENOSPC &&
+ !(trans->flags & BTREE_INSERT_NOWAIT) &&
+ (trans->flags & BTREE_INSERT_NOFAIL));
return ret;
}
@@ -785,7 +1031,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
struct bch_fs *c = trans->c;
int ret;
- if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) ||
+ test_bit(BCH_FS_STARTED, &c->flags))
return -EROFS;
bch2_trans_unlock(trans);
@@ -801,155 +1048,72 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- struct bkey_s_c old;
- struct bkey unpacked;
- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
- bool trans_trigger_run;
- unsigned btree_id = 0;
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
int ret = 0;
- /*
- *
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being moved
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
- * they are re-added.
- */
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
- btree_id_start++;
-
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
- i++) {
- if (i->insert_trigger_run ||
- (i->flags & BTREE_TRIGGER_NORUN) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- continue;
-
- BUG_ON(i->overwrite_trigger_run);
-
- i->insert_trigger_run = true;
- trans_trigger_run = true;
-
- old = bch2_btree_path_peek_slot(i->path, &unpacked);
- _deleted.p = i->path->pos;
-
- if (old.k->type == i->k->k.type &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- i->overwrite_trigger_run = true;
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
- } else {
- ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
- BTREE_TRIGGER_INSERT|i->flags);
- }
-
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->ip, _RET_IP_,
- i->btree_id, &i->path->pos);
- if (ret)
- return ret;
- }
- } while (trans_trigger_run);
-
- do {
- trans_trigger_run = false;
-
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
- i++) {
- if (i->overwrite_trigger_run ||
- (i->flags & BTREE_TRIGGER_NORUN) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- continue;
-
- BUG_ON(!i->insert_trigger_run);
-
- i->overwrite_trigger_run = true;
- trans_trigger_run = true;
-
- old = bch2_btree_path_peek_slot(i->path, &unpacked);
- _deleted.p = i->path->pos;
-
- ret = bch2_trans_mark_key(trans, old, deleted,
- BTREE_TRIGGER_OVERWRITE|i->flags);
-
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->ip, _RET_IP_,
- i->btree_id, &i->path->pos);
- if (ret)
- return ret;
- }
- } while (trans_trigger_run);
+ trans_for_each_update(trans, i) {
+ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+ if (ret)
+ break;
}
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-
- return 0;
+ return ret;
}
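
Conceptually, commits made before BCH_FS_MAY_GO_RW just park their keys for journal replay to apply later instead of touching the btree. A toy standalone sketch of that queue-then-apply shape; the names are illustrative only.

#include <stdio.h>

struct key { int btree_id; int pos; int val; };

static struct key replay_keys[8];
static unsigned nr_replay_keys;
static int may_go_rw;

static int commit(struct key k)
{
	if (!may_go_rw) {
		/* too early: just remember the key for journal replay */
		replay_keys[nr_replay_keys++] = k;
		return 0;
	}
	printf("applied %d:%d = %d directly\n", k.btree_id, k.pos, k.val);
	return 0;
}

int main(void)
{
	commit((struct key) { 0, 1, 42 });	/* queued */

	may_go_rw = 1;
	for (unsigned i = 0; i < nr_replay_keys; i++)
		printf("replaying %d:%d = %d\n", replay_keys[i].btree_id,
		       replay_keys[i].pos, replay_keys[i].val);

	commit((struct key) { 0, 2, 43 });	/* applied directly */
	return 0;
}
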
int __bch2_trans_commit(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
- !trans->extra_journal_entry_u64s)
+ !trans->extra_journal_entries.nr)
goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&trans->c->gc_lock);
+ lockdep_assert_held(&c->gc_lock);
- memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
+ ret = bch2_trans_commit_run_triggers(trans);
+ if (ret)
+ goto out_reset;
- trans->journal_u64s = trans->extra_journal_entry_u64s;
- trans->journal_preres_u64s = 0;
+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ ret = do_bch2_trans_commit_to_journal_replay(trans);
+ goto out_reset;
+ }
if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
- unlikely(!percpu_ref_tryget(&trans->c->writes))) {
+ unlikely(!percpu_ref_tryget(&c->writes))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
goto out_reset;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
- /*
- * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
- * from the key cache flush code:
- */
- trans_for_each_update(trans, i)
- if (!i->cached &&
- !(i->flags & BTREE_TRIGGER_NORUN))
- bch2_btree_key_cache_verify_clean(trans,
- i->btree_id, i->k->k.p);
-#endif
+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
- ret = bch2_trans_commit_run_triggers(trans);
- if (ret)
- goto out;
+ trans->journal_u64s = trans->extra_journal_entries.nr;
+ trans->journal_preres_u64s = 0;
+
+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += JSET_ENTRY_LOG_U64s;
trans_for_each_update(trans, i) {
BUG_ON(!i->path->should_be_locked);
if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
- trace_trans_restart_upgrade(trans->ip, _RET_IP_,
+ trace_trans_restart_upgrade(trans->fn, _RET_IP_,
i->btree_id, &i->path->pos);
ret = btree_trans_restart(trans);
goto out;
@@ -965,7 +1129,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
}
if (trans->extra_journal_res) {
- ret = bch2_disk_reservation_add(trans->c, trans->disk_res,
+ ret = bch2_disk_reservation_add(c, trans->disk_res,
trans->extra_journal_res,
(trans->flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0);
@@ -984,10 +1148,10 @@ retry:
if (ret)
goto err;
out:
- bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
+ bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
- percpu_ref_put(&trans->c->writes);
+ percpu_ref_put(&c->writes);
out_reset:
trans_for_each_update(trans, i)
bch2_path_put(trans, i->path, true);
@@ -995,8 +1159,7 @@ out_reset:
trans->extra_journal_res = 0;
trans->nr_updates = 0;
trans->hooks = NULL;
- trans->extra_journal_entries = NULL;
- trans->extra_journal_entry_u64s = 0;
+ trans->extra_journal_entries.nr = 0;
if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
@@ -1023,6 +1186,9 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
+ if (!btree_type_has_snapshots(id))
+ return 0;
+
if (!snapshot_t(c, pos.snapshot)->children[0])
return 0;
@@ -1051,10 +1217,10 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
return ret;
}
-static int bch2_trans_update_extent(struct btree_trans *trans,
- struct btree_iter *orig_iter,
- struct bkey_i *insert,
- enum btree_update_flags flags)
+int bch2_trans_update_extent(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert,
+ enum btree_update_flags flags)
{
struct bch_fs *c = trans->c;
struct btree_iter iter, update_iter;
@@ -1068,7 +1234,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
BTREE_ITER_INTENT|
BTREE_ITER_WITH_UPDATES|
BTREE_ITER_NOT_EXTENTS);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -1212,19 +1378,16 @@ nomerge1:
bkey_reassemble(update, k);
bch2_cut_front(insert->k.p, update);
- bch2_trans_copy_iter(&update_iter, &iter);
- update_iter.pos = update->k.p;
- ret = bch2_trans_update(trans, &update_iter, update,
+ ret = bch2_trans_update_by_path(trans, iter.path, update,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
flags);
- bch2_trans_iter_exit(trans, &update_iter);
-
if (ret)
goto err;
goto out;
}
next:
- k = bch2_btree_iter_next(&iter);
+ bch2_btree_iter_advance(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -1301,26 +1464,25 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
return ret;
}
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_update_flags flags)
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
- BUG_ON(!iter->path->should_be_locked);
-
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- return bch2_trans_update_extent(trans, iter, k, flags);
+ BUG_ON(!path->should_be_locked);
BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
- BUG_ON(bpos_cmp(k->k.p, iter->path->pos));
+ BUG_ON(bpos_cmp(k->k.p, path->pos));
n = (struct btree_insert_entry) {
.flags = flags,
- .bkey_type = __btree_node_type(iter->path->level, iter->btree_id),
- .btree_id = iter->btree_id,
- .level = iter->path->level,
- .cached = iter->flags & BTREE_ITER_CACHED,
- .path = iter->path,
+ .bkey_type = __btree_node_type(path->level, path->btree_id),
+ .btree_id = path->btree_id,
+ .level = path->level,
+ .cached = path->cached,
+ .path = path,
.k = k,
.ip_allocated = _RET_IP_,
};
@@ -1331,16 +1493,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
btree_insert_entry_cmp(i - 1, i) >= 0);
#endif
- if (bkey_deleted(&n.k->k) &&
- (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
- int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
- if (unlikely(ret < 0))
- return ret;
-
- if (ret)
- n.k->k.type = KEY_TYPE_whiteout;
- }
-
/*
* Pending updates are kept sorted: first, find position of new update,
* then delete/trim any updates the new update overwrites:
@@ -1353,28 +1505,95 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
!btree_insert_entry_cmp(&n, i)) {
BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
- /*
- * This is a hack to ensure that inode creates update the btree,
- * not the key cache, which helps with cache coherency issues in
- * other areas:
- */
- if (n.cached && !i->cached) {
- i->k = n.k;
- i->flags = n.flags;
- return 0;
- }
-
bch2_path_put(trans, i->path, true);
- *i = n;
- } else
+ i->flags = n.flags;
+ i->cached = n.cached;
+ i->k = n.k;
+ i->path = n.path;
+ i->ip_allocated = n.ip_allocated;
+ } else {
array_insert_item(trans->updates, trans->nr_updates,
i - trans->updates, n);
- __btree_path_get(n.path, true);
+ i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v;
+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek(c, n.btree_id, n.level, k->k.p);
+ if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+
+ __btree_path_get(n.path, true);
return 0;
}
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ struct btree_path *path = iter->update_path ?: iter->path;
+ struct bkey_cached *ck;
+ int ret;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ return bch2_trans_update_extent(trans, iter, k, flags);
+
+ if (bkey_deleted(&k->k) &&
+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (ret)
+ k->k.type = KEY_TYPE_whiteout;
+ }
+
+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ !path->cached &&
+ !path->level &&
+ btree_id_cached(trans->c, path->btree_id)) {
+ if (!iter->key_cache_path ||
+ !iter->key_cache_path->should_be_locked ||
+ bpos_cmp(iter->key_cache_path->pos, k->k.p)) {
+ if (!iter->key_cache_path)
+ iter->key_cache_path =
+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_CACHED, _THIS_IP_);
+
+ iter->key_cache_path =
+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ BTREE_ITER_CACHED);
+ if (unlikely(ret))
+ return ret;
+
+ ck = (void *) iter->key_cache_path->l[0].b;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
+ btree_trans_restart(trans);
+ return -EINTR;
+ }
+
+ iter->key_cache_path->should_be_locked = true;
+ }
+
+ path = iter->key_cache_path;
+ }
+
+ return bch2_trans_update_by_path(trans, path, k, flags);
+}
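
The redirection added here sends ordinary leaf updates on key-cacheable btrees through iter->key_cache_path instead of the btree path, unless the update is itself key cache reclaim. A standalone sketch of just that routing predicate; the flags and helpers below are stand-ins, not the real API.

#include <stdbool.h>
#include <stdio.h>

struct path { bool cached; unsigned level; };

static bool btree_id_is_cached(int btree_id)
{
	return btree_id == 1;			/* pretend btree 1 uses the key cache */
}

static bool use_key_cache(const struct path *p, int btree_id, bool reclaim)
{
	return !reclaim &&			/* reclaim writes back to the btree itself */
	       !p->cached &&			/* already a key cache path: nothing to do */
	       !p->level &&			/* only leaf updates are cached */
	       btree_id_is_cached(btree_id);
}

int main(void)
{
	struct path leaf = { .cached = false, .level = 0 };

	printf("btree 1 leaf update -> key cache? %d\n", use_key_cache(&leaf, 1, false));
	printf("btree 1 reclaim     -> key cache? %d\n", use_key_cache(&leaf, 1, true));
	printf("btree 2 leaf update -> key cache? %d\n", use_key_cache(&leaf, 2, false));
	return 0;
}
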
+
void bch2_trans_commit_hook(struct btree_trans *trans,
struct btree_trans_commit_hook *h)
{
@@ -1428,14 +1647,14 @@ int bch2_btree_delete_at(struct btree_trans *trans,
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
- unsigned iter_flags,
+ unsigned update_flags,
u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
retry:
while ((bch2_trans_begin(trans),
(k = bch2_btree_iter_peek(&iter)).k) &&
@@ -1463,7 +1682,7 @@ retry:
*/
delete.k.p = iter.pos;
- if (btree_node_type_is_extents(id)) {
+ if (iter.flags & BTREE_ITER_IS_EXTENTS) {
unsigned max_sectors =
KEY_SIZE_MAX & (~0 << trans->c->block_bits);
@@ -1478,7 +1697,8 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
- BTREE_INSERT_NOFAIL);
+ BTREE_INSERT_NOFAIL|
+ update_flags);
bch2_disk_reservation_put(trans->c, &disk_res);
if (ret)
break;
@@ -1500,8 +1720,37 @@ retry:
*/
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start, struct bpos end,
+ unsigned update_flags,
u64 *journal_seq)
{
return bch2_trans_do(c, NULL, journal_seq, 0,
- bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq));
+ bch2_btree_delete_range_trans(&trans, id, start, end,
+ update_flags, journal_seq));
+}
+
+int bch2_trans_log_msg(struct btree_trans *trans, const char *msg)
+{
+ unsigned len = strlen(msg);
+ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64));
+ struct jset_entry_log *l;
+ int ret;
+
+ ret = darray_make_room(trans->extra_journal_entries, jset_u64s(u64s));
+ if (ret)
+ return ret;
+
+ l = (void *) &darray_top(trans->extra_journal_entries);
+ l->entry.u64s = cpu_to_le16(u64s);
+ l->entry.btree_id = 0;
+ l->entry.level = 1;
+ l->entry.type = BCH_JSET_ENTRY_log;
+ l->entry.pad[0] = 0;
+ l->entry.pad[1] = 0;
+ l->entry.pad[2] = 0;
+ memcpy(l->d, msg, len);
+ while (len & 7)
+ l->d[len++] = '\0';
+
+ trans->extra_journal_entries.nr += jset_u64s(u64s);
+ return 0;
}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index f7d4a0678e39..7654ab24a909 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -11,6 +11,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "ec.h"
#include "error.h"
#include "inode.h"
@@ -43,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
}
}
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch2_bucket_seq_cleanup(struct bch_fs *c)
-{
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u16 last_seq_ondisk = c->journal.flushed_seq_ondisk;
- struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket *g;
- struct bucket_mark m;
- unsigned i;
-
- if (journal_seq - c->last_bucket_seq_cleanup <
- (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
- return;
-
- c->last_bucket_seq_cleanup = journal_seq;
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets) {
- bucket_cmpxchg(g, m, ({
- if (!m.journal_seq_valid ||
- bucket_needs_journal_commit(m, last_seq_ondisk))
- break;
-
- m.journal_seq_valid = 0;
- }));
- }
- up_read(&ca->bucket_lock);
- }
-}
-
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
@@ -315,29 +279,24 @@ bch2_fs_usage_read_short(struct bch_fs *c)
return ret;
}
-static inline int is_unavailable_bucket(struct bucket_mark m)
+static inline int is_unavailable_bucket(struct bch_alloc_v4 a)
{
- return !is_available_bucket(m);
+ return a.dirty_sectors || a.stripe;
}
static inline int bucket_sectors_fragmented(struct bch_dev *ca,
- struct bucket_mark m)
+ struct bch_alloc_v4 a)
{
- return bucket_sectors_used(m)
- ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+ return a.dirty_sectors
+ ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
: 0;
}
-static inline int is_stripe_data_bucket(struct bucket_mark m)
+static inline enum bch_data_type bucket_type(struct bch_alloc_v4 a)
{
- return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
-static inline enum bch_data_type bucket_type(struct bucket_mark m)
-{
- return m.cached_sectors && !m.dirty_sectors
+ return a.cached_sectors && !a.dirty_sectors
? BCH_DATA_cached
- : m.data_type;
+ : a.data_type;
}
static inline void account_bucket(struct bch_fs_usage *fs_usage,
@@ -352,19 +311,13 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bucket_mark old, struct bucket_mark new,
+ struct bch_alloc_v4 old,
+ struct bch_alloc_v4 new,
u64 journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
struct bch_dev_usage *u;
- /*
- * Hack for bch2_fs_initialize path, where we're first marking sb and
- * journal non-transactionally:
- */
- if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags))
- journal_seq = 1;
-
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
u = dev_usage_ptr(ca, journal_seq, gc);
@@ -390,9 +343,28 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
preempt_enable();
+}
+
+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket old, struct bucket new,
+ u64 journal_seq, bool gc)
+{
+ struct bch_alloc_v4 old_a = {
+ .gen = old.gen,
+ .data_type = old.data_type,
+ .dirty_sectors = old.dirty_sectors,
+ .cached_sectors = old.cached_sectors,
+ .stripe = old.stripe,
+ };
+ struct bch_alloc_v4 new_a = {
+ .gen = new.gen,
+ .data_type = new.data_type,
+ .dirty_sectors = new.dirty_sectors,
+ .cached_sectors = new.cached_sectors,
+ .stripe = new.stripe,
+ };
- if (!is_available_bucket(old) && is_available_bucket(new))
- bch2_wake_allocator(ca);
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
}
static inline int __update_replicas(struct bch_fs *c,
@@ -416,22 +388,23 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
{
struct bch_fs_usage __percpu *fs_usage;
int idx, ret = 0;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
percpu_down_read(&c->mark_lock);
+ buf.atomic++;
idx = bch2_replicas_entry_idx(c, r);
if (idx < 0 &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err(c, "no replicas entry\n"
" while marking %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) {
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
percpu_up_read(&c->mark_lock);
ret = bch2_mark_replicas(c, r);
- if (ret)
- return ret;
-
percpu_down_read(&c->mark_lock);
+
+ if (ret)
+ goto err;
idx = bch2_replicas_entry_idx(c, r);
}
if (idx < 0) {
@@ -447,6 +420,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
err:
fsck_err:
percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
@@ -525,49 +499,21 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
update_replicas_list(trans, &r.e, sectors);
}
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator)
-{
- struct bucket *g = bucket(ca, b);
- struct bucket_mark old, new;
-
- old = bucket_cmpxchg(g, new, ({
- new.owned_by_allocator = owned_by_allocator;
- }));
-
- BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
-static inline u8 bkey_alloc_gen(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_alloc:
- return bkey_s_c_to_alloc(k).v->gen;
- case KEY_TYPE_alloc_v2:
- return bkey_s_c_to_alloc_v2(k).v->gen;
- case KEY_TYPE_alloc_v3:
- return bkey_s_c_to_alloc_v3(k).v->gen;
- default:
- return 0;
- }
-}
-
-static int bch2_mark_alloc(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_alloc(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bkey_alloc_unpacked u;
- struct bch_dev *ca;
- struct bucket *g;
- struct bucket_mark old_m, m;
+ struct bch_alloc_v4 old_a, new_a;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode);
int ret = 0;
- /* We don't do anything for deletions - do we?: */
- if (!bkey_is_alloc(new.k))
- return 0;
+ if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket ||
+ new.k->p.offset >= ca->mi.nbuckets, trans,
+ "alloc key outside range of device's buckets"))
+ return -EIO;
/*
* alloc btree is read in by bch2_alloc_read, not gc:
@@ -576,49 +522,80 @@ static int bch2_mark_alloc(struct btree_trans *trans,
!(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
return 0;
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
+ bch2_alloc_to_v4(old, &old_a);
+ bch2_alloc_to_v4(new, &new_a);
+
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ !old_a.data_type != !new_a.data_type &&
+ new.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
BUG_ON(!journal_seq);
- BUG_ON(new.k->type != KEY_TYPE_alloc_v3);
- v->journal_seq = cpu_to_le64(journal_seq);
+ /*
+ * If the btree updates referring to a bucket weren't flushed
+ * before the bucket became empty again, then we don't have
+ * to wait on a journal flush before we can reuse the bucket:
+ */
+ new_a.journal_seq = !new_a.data_type &&
+ (journal_seq == v->journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, v->journal_seq))
+ ? 0 : journal_seq;
+ v->journal_seq = new_a.journal_seq;
}
- ca = bch_dev_bkey_exists(c, new.k->p.inode);
+ if (old_a.data_type && !new_a.data_type && new_a.journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ new_a.journal_seq);
+ if (ret) {
+ bch2_fs_fatal_error(c,
+ "error setting bucket_needs_journal_commit: %i", ret);
+ return ret;
+ }
+ }
- if (new.k->p.offset >= ca->mi.nbuckets)
- return 0;
+ if (!new_a.data_type &&
+ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
- u = bch2_alloc_unpack(new);
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ BCH_ALLOC_V4_NEED_DISCARD(&new_a) &&
+ !new_a.journal_seq)
+ bch2_do_discards(c);
+
+ if (!old_a.data_type &&
+ new_a.data_type &&
+ should_invalidate_buckets(ca))
+ bch2_do_invalidates(c);
+
+ if (bucket_state(new_a) == BUCKET_need_gc_gens) {
+ atomic_inc(&c->kick_gc);
+ wake_up_process(c->gc_thread);
+ }
percpu_down_read(&c->mark_lock);
- if (!gc && u.gen != bkey_alloc_gen(old))
- *bucket_gen(ca, new.k->p.offset) = u.gen;
+ if (!gc && new_a.gen != old_a.gen)
+ *bucket_gen(ca, new.k->p.offset) = new_a.gen;
- g = __bucket(ca, new.k->p.offset, gc);
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
- old_m = bucket_cmpxchg(g, m, ({
- m.gen = u.gen;
- m.data_type = u.data_type;
- m.dirty_sectors = u.dirty_sectors;
- m.cached_sectors = u.cached_sectors;
- m.stripe = u.stripe != 0;
+ if (gc) {
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
- if (journal_seq) {
- m.journal_seq_valid = 1;
- m.journal_seq = journal_seq;
- }
- }));
+ bucket_lock(g);
- bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
+ g->gen_valid = 1;
+ g->gen = new_a.gen;
+ g->data_type = new_a.data_type;
+ g->stripe = new_a.stripe;
+ g->stripe_redundancy = new_a.stripe_redundancy;
+ g->dirty_sectors = new_a.dirty_sectors;
+ g->cached_sectors = new_a.cached_sectors;
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
- g->gen_valid = 1;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
+ bucket_unlock(g);
+ }
percpu_up_read(&c->mark_lock);
/*
@@ -627,9 +604,9 @@ static int bch2_mark_alloc(struct btree_trans *trans,
*/
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
- old_m.cached_sectors) {
+ old_a.cached_sectors) {
ret = update_cached_sectors(c, new, ca->dev_idx,
- -old_m.cached_sectors,
+ -old_a.cached_sectors,
journal_seq, gc);
if (ret) {
bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
@@ -637,29 +614,18 @@ static int bch2_mark_alloc(struct btree_trans *trans,
}
trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
- old_m.cached_sectors);
+ old_a.cached_sectors);
}
return 0;
}
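
The journal_seq handling above boils down to one rule: an emptied bucket may be handed back to the allocator immediately only if the journal entry that emptied it no longer needs flushing; otherwise it is parked in buckets_waiting_for_journal. A standalone sketch of that decision, simplified and with illustrative names.

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

static bool bucket_can_be_reused(uint64_t bucket_journal_seq,
				 uint64_t flushed_seq_ondisk)
{
	/* a journal_seq of 0 means there is nothing to wait for */
	return !bucket_journal_seq ||
	       bucket_journal_seq < flushed_seq_ondisk;
}

int main(void)
{
	uint64_t flushed = 100;			/* journal flushed on disk up to here */

	printf("emptied at seq  90: reuse now? %d\n", bucket_can_be_reused(90, flushed));
	printf("emptied at seq 120: reuse now? %d\n", bucket_can_be_reused(120, flushed));
	printf("no pending writes : reuse now? %d\n", bucket_can_be_reused(0, flushed));
	return 0;
}
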
-#define checked_add(a, b) \
-({ \
- unsigned _res = (unsigned) (a) + (b); \
- bool overflow = _res > U16_MAX; \
- if (overflow) \
- _res = U16_MAX; \
- (a) = _res; \
- overflow; \
-})
-
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type data_type,
unsigned sectors, struct gc_pos pos,
unsigned flags)
{
- struct bucket *g;
- struct bucket_mark old, new;
+ struct bucket old, new, *g;
bool overflow;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
@@ -674,10 +640,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
percpu_down_read(&c->mark_lock);
g = gc_bucket(ca, b);
- old = bucket_cmpxchg(g, new, ({
- new.data_type = data_type;
- overflow = checked_add(new.dirty_sectors, sectors);
- }));
+
+ bucket_lock(g);
+ old = *g;
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ overflow = g->dirty_sectors < sectors;
+
+ new = *g;
+ bucket_unlock(g);
bch2_fs_inconsistent_on(old.data_type &&
old.data_type != data_type, c,
@@ -691,7 +663,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
bch2_data_types[old.data_type ?: data_type],
old.dirty_sectors, sectors);
- bch2_dev_usage_update(c, ca, old, new, 0, true);
+ bch2_dev_usage_update_m(c, ca, old, new, 0, true);
percpu_up_read(&c->mark_lock);
}
@@ -710,83 +682,99 @@ static int check_bucket_ref(struct bch_fs *c,
struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 bucket_data_type,
- u16 dirty_sectors, u16 cached_sectors)
+ u8 b_gen, u8 bucket_data_type,
+ u32 dirty_sectors, u32 cached_sectors)
{
- size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
u16 bucket_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (gen_after(ptr->gen, bucket_gen)) {
+ if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if (bucket_gen != ptr->gen && !ptr->cached) {
+ if (b_gen != ptr->gen && !ptr->cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
+ *bucket_gen(ca, bucket_nr),
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if (bucket_gen != ptr->gen)
- return 1;
+ if (b_gen != ptr->gen) {
+ ret = 1;
+ goto err;
+ }
if (bucket_data_type && ptr_data_type &&
bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type],
bch2_data_types[ptr_data_type],
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
+ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
bucket_sectors, sectors,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
-
- return 0;
+err:
+ printbuf_exit(&buf);
+ return ret;
}
static int mark_stripe_bucket(struct btree_trans *trans,
struct bkey_s_c k,
unsigned ptr_idx,
- u64 journal_seq, unsigned flags)
+ unsigned flags)
{
struct bch_fs *c = trans->c;
+ u64 journal_seq = trans->journal_res.seq;
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
bool parity = ptr_idx >= nr_data;
@@ -794,9 +782,8 @@ static int mark_stripe_bucket(struct btree_trans *trans,
s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g;
- struct bucket_mark new, old;
- char buf[200];
+ struct bucket old, new, *g;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
@@ -804,45 +791,45 @@ static int mark_stripe_bucket(struct btree_trans *trans,
/* * XXX doesn't handle deletion */
percpu_down_read(&c->mark_lock);
+ buf.atomic++;
g = PTR_GC_BUCKET(ca, ptr);
- if (g->mark.dirty_sectors ||
+ if (g->dirty_sectors ||
(g->stripe && g->stripe != k.k->p.offset)) {
bch2_fs_inconsistent(c,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EINVAL;
goto err;
}
- old = bucket_cmpxchg(g, new, ({
- ret = check_bucket_ref(c, k, ptr, sectors, data_type,
- new.gen, new.data_type,
- new.dirty_sectors, new.cached_sectors);
- if (ret)
- goto err;
-
- new.dirty_sectors += sectors;
- if (data_type)
- new.data_type = data_type;
+ bucket_lock(g);
+ old = *g;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
+ ret = check_bucket_ref(c, k, ptr, sectors, data_type,
+ new.gen, new.data_type,
+ new.dirty_sectors, new.cached_sectors);
+ if (ret) {
+ bucket_unlock(g);
+ goto err;
+ }
- new.stripe = true;
- }));
+ new.dirty_sectors += sectors;
+ if (data_type)
+ new.data_type = data_type;
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
- bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+ new = *g;
+ bucket_unlock(g);
+
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
err:
percpu_up_read(&c->mark_lock);
-
- return 0;
+ printbuf_exit(&buf);
+ return ret;
}
static int __mark_pointer(struct btree_trans *trans,
@@ -850,9 +837,9 @@ static int __mark_pointer(struct btree_trans *trans,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 bucket_gen, u8 *bucket_data_type,
- u16 *dirty_sectors, u16 *cached_sectors)
+ u32 *dirty_sectors, u32 *cached_sectors)
{
- u16 *dst_sectors = !ptr->cached
+ u32 *dst_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type,
@@ -876,11 +863,9 @@ static int bch2_mark_pointer(struct btree_trans *trans,
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g;
+ struct bucket old, new, *g;
u8 bucket_data_type;
- u64 v;
int ret = 0;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
@@ -888,35 +873,27 @@ static int bch2_mark_pointer(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
g = PTR_GC_BUCKET(ca, &p.ptr);
- v = atomic64_read(&g->_mark.v);
- do {
- new.v.counter = old.v.counter = v;
- bucket_data_type = new.data_type;
-
- ret = __mark_pointer(trans, k, &p.ptr, sectors,
- data_type, new.gen,
- &bucket_data_type,
- &new.dirty_sectors,
- &new.cached_sectors);
- if (ret)
- goto err;
+ bucket_lock(g);
+ old = *g;
- new.data_type = bucket_data_type;
+ bucket_data_type = g->data_type;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
+ ret = __mark_pointer(trans, k, &p.ptr, sectors,
+ data_type, g->gen,
+ &bucket_data_type,
+ &g->dirty_sectors,
+ &g->cached_sectors);
+ if (ret) {
+ bucket_unlock(g);
+ goto err;
+ }
- if (flags & BTREE_TRIGGER_NOATOMIC) {
- g->_mark = new;
- break;
- }
- } while ((v = atomic64_cmpxchg(&g->_mark.v,
- old.v.counter,
- new.v.counter)) != old.v.counter);
+ g->data_type = bucket_data_type;
+
+ new = *g;
+ bucket_unlock(g);
- bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
err:
percpu_up_read(&c->mark_lock);
@@ -937,9 +914,11 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
BUG_ON(!(flags & BTREE_TRIGGER_GC));
m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
-
- if (!m)
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ (u64) p.idx);
return -ENOMEM;
+ }
spin_lock(&c->ec_stripes_heap_lock);
@@ -962,9 +941,9 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
return 0;
}
-static int bch2_mark_extent(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_extent(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
@@ -1032,10 +1011,11 @@ static int bch2_mark_extent(struct btree_trans *trans,
if (r.e.nr_devs) {
ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
if (ret) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
return ret;
}
}
@@ -1043,14 +1023,14 @@ static int bch2_mark_extent(struct btree_trans *trans,
return 0;
}
-static int bch2_mark_stripe(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_stripe(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- size_t idx = new.k->p.offset;
+ u64 idx = new.k->p.offset;
const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(old).v : NULL;
const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
@@ -1064,13 +1044,16 @@ static int bch2_mark_stripe(struct btree_trans *trans,
struct stripe *m = genradix_ptr(&c->stripes, idx);
if (!m || (old_s && !m->alive)) {
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf1), c, old);
- bch2_bkey_val_to_text(&PBUF(buf2), c, new);
- bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n"
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
"old %s\n"
- "new %s", idx, buf1, buf2);
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
bch2_inconsistent_error(c);
return -1;
}
@@ -1100,9 +1083,11 @@ static int bch2_mark_stripe(struct btree_trans *trans,
struct gc_stripe *m =
genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
- if (!m)
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ idx);
return -ENOMEM;
-
+ }
/*
* This will be wrong when we bring back runtime gc: we should
* be unmarking the old key and then marking the new key
@@ -1124,7 +1109,7 @@ static int bch2_mark_stripe(struct btree_trans *trans,
memset(m->block_sectors, 0, sizeof(m->block_sectors));
for (i = 0; i < new_s->nr_blocks; i++) {
- ret = mark_stripe_bucket(trans, new, i, journal_seq, flags);
+ ret = mark_stripe_bucket(trans, new, i, flags);
if (ret)
return ret;
}
@@ -1133,10 +1118,11 @@ static int bch2_mark_stripe(struct btree_trans *trans,
((s64) m->sectors * m->nr_redundant),
journal_seq, gc);
if (ret) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+ bch2_bkey_val_to_text(&buf, c, new);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
return ret;
}
}
@@ -1144,9 +1130,9 @@ static int bch2_mark_stripe(struct btree_trans *trans,
return 0;
}
-static int bch2_mark_inode(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_inode(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bch_fs_usage __percpu *fs_usage;
@@ -1175,9 +1161,9 @@ static int bch2_mark_inode(struct btree_trans *trans,
return 0;
}
-static int bch2_mark_reservation(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_reservation(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
@@ -1207,18 +1193,24 @@ static int bch2_mark_reservation(struct btree_trans *trans,
return 0;
}
-static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
+static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 start, u64 end,
u64 *idx, unsigned flags, size_t r_idx)
{
+ struct bch_fs *c = trans->c;
struct reflink_gc *r;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 next_idx = end;
s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
if (r_idx >= c->reflink_gc_nr)
goto not_found;
r = genradix_ptr(&c->reflink_gc_table, r_idx);
- if (*idx < r->offset - r->size)
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
goto not_found;
BUG_ON((s64) r->refcount + add < 0);
@@ -1227,37 +1219,37 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
*idx = r->offset;
return 0;
not_found:
- *idx = U64_MAX;
- ret = -EIO;
-
- /*
- * XXX: we're replacing the entire reflink pointer with an error
- * key, we should just be replacing the part that was missing:
- */
- if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
- p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
+ if (fsck_err(c, "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
struct bkey_i_error new;
bkey_init(&new.k);
new.k.type = KEY_TYPE_error;
- new.k.p = p.k->p;
- new.k.size = p.k->size;
- ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i);
+ new.k.p = bkey_start_pos(p.k);
+ new.k.p.offset += *idx - start;
+ bch2_key_resize(&new.k, next_idx - *idx);
+ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i);
}
+
+ *idx = next_idx;
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
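
Instead of replacing the whole reflink pointer with an error key, the reworked __bch2_mark_reflink_p() covers only the missing slice [idx, next_idx) of the indirect range, at the matching offset inside the pointer. A standalone sketch of that offset and size arithmetic; the numbers are made up.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ptr_start_offset = 1000;	/* bkey_start_pos(p.k).offset */
	uint64_t start = 5000;			/* p.v->idx: start of the indirect range */
	uint64_t idx = 5010;			/* first missing index */
	uint64_t next_idx = 5030;		/* next index that does exist */

	uint64_t err_offset = ptr_start_offset + (idx - start);
	uint64_t err_size = next_idx - idx;

	printf("error key at offset %llu, size %llu sectors\n",
	       (unsigned long long) err_offset, (unsigned long long) err_size);
	return 0;
}
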
-static int bch2_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
- u64 idx = le64_to_cpu(p.v->idx);
+ u64 idx = le64_to_cpu(p.v->idx), start = idx;
u64 end = le64_to_cpu(p.v->idx) + p.k->size;
int ret = 0;
@@ -1281,73 +1273,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans,
}
while (idx < end && !ret)
- ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++);
-
- return ret;
-}
-
-int bch2_mark_key(struct btree_trans *trans,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned flags)
-{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
-
- switch (k.k->type) {
- case KEY_TYPE_alloc:
- case KEY_TYPE_alloc_v2:
- case KEY_TYPE_alloc_v3:
- return bch2_mark_alloc(trans, old, new, flags);
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return bch2_mark_extent(trans, old, new, flags);
- case KEY_TYPE_stripe:
- return bch2_mark_stripe(trans, old, new, flags);
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- return bch2_mark_inode(trans, old, new, flags);
- case KEY_TYPE_reservation:
- return bch2_mark_reservation(trans, old, new, flags);
- case KEY_TYPE_reflink_p:
- return bch2_mark_reflink_p(trans, old, new, flags);
- case KEY_TYPE_snapshot:
- return bch2_mark_snapshot(trans, old, new, flags);
- default:
- return 0;
- }
-}
-
-int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
- struct bkey_i *new, unsigned flags)
-{
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- struct bkey_s_c old;
- struct bkey unpacked;
- int ret;
-
- _deleted.p = path->pos;
-
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
- return 0;
-
- if (!btree_node_type_needs_gc(path->btree_id))
- return 0;
-
- old = bch2_btree_path_peek_slot(path, &unpacked);
-
- if (old.k->type == new->k.type &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
- } else {
- ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key(trans, old, deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
- }
+ ret = __bch2_mark_reflink_p(trans, p, start, end,
+ &idx, flags, l++);
return ret;
}
@@ -1359,33 +1286,26 @@ void fs_usage_apply_warn(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
bch_err(c, "disk usage increased %lli more than %u sectors reserved",
should_not_have_added, disk_res_sectors);
trans_for_each_update(trans, i) {
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+
pr_err("while inserting");
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- pr_err("%s", buf);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ pr_err(" %s", buf.buf);
pr_err("overlapping with");
-
- if (!i->cached) {
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
-
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- pr_err("%s", buf);
- } else {
- struct bkey_cached *ck = (void *) i->path->l[0].b;
-
- if (ck->valid) {
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
- pr_err("%s", buf);
- }
- }
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, old);
+ pr_err(" %s", buf.buf);
}
+
__WARN();
+ printbuf_exit(&buf);
}
int bch2_trans_fs_usage_apply(struct btree_trans *trans,
@@ -1466,52 +1386,25 @@ need_mark:
/* trans_mark: */
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
- const struct bch_extent_ptr *ptr,
- struct bkey_alloc_unpacked *u)
-{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
- struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
- int ret;
-
- bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
-
- *u = update && !bpos_cmp(update->k.p, pos)
- ? bch2_alloc_unpack(bkey_i_to_s_c(update))
- : alloc_mem_to_key(c, iter);
-
- return 0;
-}
-
static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bkey_s_c k, struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
struct btree_iter iter;
- struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc_v4 *a;
int ret;
- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
- u.gen, &u.data_type,
- &u.dirty_sectors, &u.cached_sectors);
+ a->v.gen, &a->v.data_type,
+ &a->v.dirty_sectors, &a->v.cached_sectors);
if (ret)
goto out;
- ret = bch2_alloc_write(trans, &iter, &u, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto out;
out:
@@ -1523,7 +1416,6 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_stripe *s;
@@ -1539,16 +1431,15 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
goto err;
if (k.k->type != KEY_TYPE_stripe) {
- bch2_fs_inconsistent(c,
+ bch2_trans_inconsistent(trans,
"pointer to nonexistent stripe %llu",
(u64) p.ec.idx);
- bch2_inconsistent_error(c);
ret = -EIO;
goto err;
}
if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
- bch2_fs_inconsistent(c,
+ bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
ret = -EIO;
@@ -1577,10 +1468,14 @@ err:
return ret;
}
-static int bch2_trans_mark_extent(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_extent(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -1642,7 +1537,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
struct btree_iter iter;
- struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc_v4 *a;
enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
? BCH_DATA_parity : 0;
s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
@@ -1651,59 +1546,59 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
if (deleting)
sectors = -sectors;
- ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type,
- u.gen, u.data_type,
- u.dirty_sectors, u.cached_sectors);
+ a->v.gen, a->v.data_type,
+ a->v.dirty_sectors, a->v.cached_sectors);
if (ret)
goto err;
if (!deleting) {
- if (bch2_fs_inconsistent_on(u.stripe ||
- u.stripe_redundancy, c,
+ if (bch2_trans_inconsistent_on(a->v.stripe ||
+ a->v.stripe_redundancy, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
- u.dirty_sectors,
- u.stripe, s.k->p.offset)) {
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
+ a->v.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
}
- if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c,
+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
- u.dirty_sectors,
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
s.k->p.offset)) {
ret = -EIO;
goto err;
}
- u.stripe = s.k->p.offset;
- u.stripe_redundancy = s.v->nr_redundant;
+ a->v.stripe = s.k->p.offset;
+ a->v.stripe_redundancy = s.v->nr_redundant;
} else {
- if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset ||
- u.stripe_redundancy != s.v->nr_redundant, c,
+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+ a->v.stripe_redundancy != s.v->nr_redundant, trans,
"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, u.gen,
- s.k->p.offset, u.stripe)) {
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ s.k->p.offset, a->v.stripe)) {
ret = -EIO;
goto err;
}
- u.stripe = 0;
- u.stripe_redundancy = 0;
+ a->v.stripe = 0;
+ a->v.stripe_redundancy = 0;
}
- u.dirty_sectors += sectors;
+ a->v.dirty_sectors += sectors;
if (data_type)
- u.data_type = !deleting ? data_type : 0;
+ a->v.data_type = !deleting ? data_type : 0;
- ret = bch2_alloc_write(trans, &iter, &u, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto err;
err:
@@ -1711,66 +1606,68 @@ err:
return ret;
}
-static int bch2_trans_mark_stripe(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_trans_mark_stripe(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
{
- struct bkey_s_c_stripe old_s = { .k = NULL };
- struct bkey_s_c_stripe new_s = { .k = NULL };
+ const struct bch_stripe *old_s = NULL;
+ struct bch_stripe *new_s = NULL;
struct bch_replicas_padded r;
unsigned i, nr_blocks;
int ret = 0;
if (old.k->type == KEY_TYPE_stripe)
- old_s = bkey_s_c_to_stripe(old);
- if (new.k->type == KEY_TYPE_stripe)
- new_s = bkey_s_c_to_stripe(new);
+ old_s = bkey_s_c_to_stripe(old).v;
+ if (new->k.type == KEY_TYPE_stripe)
+ new_s = &bkey_i_to_stripe(new)->v;
/*
* If the pointers aren't changing, we don't need to do anything:
*/
- if (new_s.k && old_s.k &&
- new_s.v->nr_blocks == old_s.v->nr_blocks &&
- new_s.v->nr_redundant == old_s.v->nr_redundant &&
- !memcmp(old_s.v->ptrs, new_s.v->ptrs,
- new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
- BUG_ON(new_s.k && old_s.k &&
- (new_s.v->nr_blocks != old_s.v->nr_blocks ||
- new_s.v->nr_redundant != old_s.v->nr_redundant));
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
- nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks;
+ nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
- if (new_s.k) {
- s64 sectors = le16_to_cpu(new_s.v->sectors);
+ if (new_s) {
+ s64 sectors = le16_to_cpu(new_s->sectors);
- bch2_bkey_to_replicas(&r.e, new);
- update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
+ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
}
- if (old_s.k) {
- s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
bch2_bkey_to_replicas(&r.e, old);
- update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
}
for (i = 0; i < nr_blocks; i++) {
- if (new_s.k && old_s.k &&
- !memcmp(&new_s.v->ptrs[i],
- &old_s.v->ptrs[i],
- sizeof(new_s.v->ptrs[i])))
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
continue;
- if (new_s.k) {
- ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false);
+ if (new_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_i_to_s_c_stripe(new), i, false);
if (ret)
break;
}
- if (old_s.k) {
- ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true);
+ if (old_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true);
if (ret)
break;
}
@@ -1779,12 +1676,12 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
-static int bch2_trans_mark_inode(struct btree_trans *trans,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned flags)
+int bch2_trans_mark_inode(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
- int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
if (nr) {
struct replicas_delta_list *d =
@@ -1795,9 +1692,14 @@ static int bch2_trans_mark_inode(struct btree_trans *trans,
return 0;
}
-static int bch2_trans_mark_reservation(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
@@ -1825,7 +1727,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_i *n;
__le64 *refcount;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx),
@@ -1845,19 +1747,19 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
refcount = bkey_refcount(n);
if (!refcount) {
- bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
- bch2_fs_inconsistent(c,
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
"nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf);
+ *idx, buf.buf);
ret = -EIO;
goto err;
}
if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
- bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
- bch2_fs_inconsistent(c,
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
"indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf);
+ *idx, buf.buf);
ret = -EIO;
goto err;
}
@@ -1879,11 +1781,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
le64_add_cpu(refcount, add);
- if (!*refcount) {
- n->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&n->k, 0);
- }
-
bch2_btree_iter_set_pos_to_extent_start(&iter);
ret = bch2_trans_update(trans, &iter, n, 0);
if (ret)
@@ -1892,12 +1789,18 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
*idx = k.k->p.offset;
err:
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
@@ -1918,31 +1821,6 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
return ret;
}
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
- struct bkey_s_c new, unsigned flags)
-{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return bch2_trans_mark_extent(trans, k, flags);
- case KEY_TYPE_stripe:
- return bch2_trans_mark_stripe(trans, old, new, flags);
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- return bch2_trans_mark_inode(trans, old, new, flags);
- case KEY_TYPE_reservation:
- return bch2_trans_mark_reservation(trans, k, flags);
- case KEY_TYPE_reflink_p:
- return bch2_trans_mark_reflink_p(trans, k, flags);
- default:
- return 0;
- }
-}
-
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
@@ -1950,11 +1828,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_alloc_unpacked u;
- struct bch_extent_ptr ptr = {
- .dev = ca->dev_idx,
- .offset = bucket_to_sector(ca, b),
- };
+ struct bkey_i_alloc_v4 *a;
int ret = 0;
/*
@@ -1963,26 +1837,26 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
if (b >= ca->mi.nbuckets)
return 0;
- ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
- if (u.data_type && u.data_type != type) {
+ if (a->v.data_type && a->v.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
goto out;
}
- u.data_type = type;
- u.dirty_sectors = sectors;
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
- ret = bch2_alloc_write(trans, &iter, &u, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto out;
out:
@@ -2145,65 +2019,29 @@ recalculate:
/* Startup/shutdown: */
-static void buckets_free_rcu(struct rcu_head *rcu)
-{
- struct bucket_array *buckets =
- container_of(rcu, struct bucket_array, rcu);
-
- kvpfree(buckets,
- sizeof(struct bucket_array) +
- buckets->nbuckets * sizeof(struct bucket));
-}
-
static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
- kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets);
+ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
- struct bucket_array *buckets = NULL, *old_buckets = NULL;
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
unsigned long *buckets_nouse = NULL;
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
- alloc_heap alloc_heap;
-
- size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / btree_sectors(c));
- /* XXX: these should be tunable */
- size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
- size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
- btree_reserve * 2);
- bool resize = ca->buckets[0] != NULL;
+ bool resize = ca->bucket_gens != NULL;
int ret = -ENOMEM;
- unsigned i;
- memset(&free, 0, sizeof(free));
- memset(&free_inc, 0, sizeof(free_inc));
- memset(&alloc_heap, 0, sizeof(alloc_heap));
-
- if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
- nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO)) ||
- !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO)) ||
- !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+ (c->opts.buckets_nouse &&
+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)) ||
- !init_fifo(&free[RESERVE_MOVINGGC],
- copygc_reserve, GFP_KERNEL) ||
- !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
- !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
- !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+ GFP_KERNEL|__GFP_ZERO))))
goto err;
- buckets->first_bucket = ca->mi.first_bucket;
- buckets->nbuckets = nbuckets;
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
@@ -2215,64 +2053,39 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
percpu_down_write(&c->mark_lock);
}
- old_buckets = bucket_array(ca);
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
- size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
- memcpy(buckets->b,
- old_buckets->b,
- n * sizeof(struct bucket));
memcpy(bucket_gens->b,
old_bucket_gens->b,
n);
- memcpy(buckets_nouse,
- ca->buckets_nouse,
- BITS_TO_LONGS(n) * sizeof(unsigned long));
+ if (buckets_nouse)
+ memcpy(buckets_nouse,
+ ca->buckets_nouse,
+ BITS_TO_LONGS(n) * sizeof(unsigned long));
}
- rcu_assign_pointer(ca->buckets[0], buckets);
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
- buckets = old_buckets;
bucket_gens = old_bucket_gens;
swap(ca->buckets_nouse, buckets_nouse);
+ nbuckets = ca->mi.nbuckets;
+
if (resize) {
percpu_up_write(&c->mark_lock);
+ up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
}
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- fifo_move(&free[i], &ca->free[i]);
- swap(ca->free[i], free[i]);
- }
- fifo_move(&free_inc, &ca->free_inc);
- swap(ca->free_inc, free_inc);
- spin_unlock(&c->freelist_lock);
-
- /* with gc lock held, alloc_heap can't be in use: */
- swap(ca->alloc_heap, alloc_heap);
-
- nbuckets = ca->mi.nbuckets;
-
- if (resize)
- up_write(&ca->bucket_lock);
-
ret = 0;
err:
- free_heap(&alloc_heap);
- free_fifo(&free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&free[i]);
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (bucket_gens)
- call_rcu(&old_buckets->rcu, bucket_gens_free_rcu);
- if (buckets)
- call_rcu(&old_buckets->rcu, buckets_free_rcu);
+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
return ret;
}
@@ -2281,15 +2094,10 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
{
unsigned i;
- free_heap(&ca->alloc_heap);
- free_fifo(&ca->free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&ca->free[i]);
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
- sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket));
+ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+ sizeof(struct bucket_gens) + ca->mi.nbuckets);
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 45c6d230f242..853bc9dd1294 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -15,52 +15,32 @@
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-#define bucket_cmpxchg(g, new, expr) \
-({ \
- struct bucket *_g = g; \
- u64 _v = atomic64_read(&(g)->_mark.v); \
- struct bucket_mark _old; \
- \
- do { \
- (new).v.counter = _old.v.counter = _v; \
- expr; \
- } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \
- _old.v.counter, \
- (new).v.counter)) != _old.v.counter);\
- _old; \
-})
-
-static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
- bool gc)
+static inline void bucket_unlock(struct bucket *b)
{
- return rcu_dereference_check(ca->buckets[gc],
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
- lockdep_is_held(&ca->bucket_lock));
+ smp_store_release(&b->lock, 0);
}
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline void bucket_lock(struct bucket *b)
{
- return __bucket_array(ca, false);
+ while (xchg(&b->lock, 1))
+ cpu_relax();
}
-static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
{
- struct bucket_array *buckets = __bucket_array(ca, gc);
-
- BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
- return buckets->b + b;
+ return rcu_dereference_check(ca->buckets_gc,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- return __bucket(ca, b, true);
-}
+ struct bucket_array *buckets = gc_bucket_array(ca);
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
- return __bucket(ca, b, false);
+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+ return buckets->b + b;
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
@@ -70,7 +50,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
-
}
static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
@@ -81,26 +60,18 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
return gens->b + b;
}
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
-{
- return g->mark.gen - g->oldest_gen;
-}
-
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
return sector_to_bucket(ca, ptr->offset);
}
-static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
+ const struct bch_extent_ptr *ptr)
{
- return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
}
static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
@@ -147,74 +118,55 @@ static inline u8 ptr_stale(struct bch_dev *ca,
return ret;
}
-/* bucket gc marks */
-
-static inline unsigned bucket_sectors_used(struct bucket_mark mark)
-{
- return mark.dirty_sectors + mark.cached_sectors;
-}
-
-static inline bool is_available_bucket(struct bucket_mark mark)
-{
- return !mark.dirty_sectors && !mark.stripe;
-}
-
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
- u16 last_seq_ondisk)
-{
- return m.journal_seq_valid &&
- ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
-}
-
/* Device usage: */
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage stats)
+ struct bch_dev_usage stats,
+ enum alloc_reserve reserve)
{
- u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+ s64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+ s64 reserved = 0;
+
+ switch (reserve) {
+ case RESERVE_none:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case RESERVE_movinggc:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_btree:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_btree_movinggc:
+ break;
+ default:
+ BUG();
+ }
if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow (%llu > %llu)\n",
stats.buckets_unavailable, total))
return 0;
- return total - stats.buckets_unavailable;
-}
-
-static inline u64 dev_buckets_available(struct bch_dev *ca)
-{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
+ return max_t(s64, 0,
+ total -
+ stats.buckets_unavailable -
+ ca->nr_open_buckets -
+ reserved);
}
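
Note: because of the fallthroughs above, each reserve level keeps everything reserved by the stricter levels below it. Spelled out with the fields the code references (ca->nr_btree_reserve, ca->mi.nbuckets), the reserved count works out to:

	reserved(RESERVE_btree_movinggc) = 0
	reserved(RESERVE_btree)          = nr_btree_reserve
	reserved(RESERVE_movinggc)       = 2 * nr_btree_reserve
	reserved(RESERVE_none)           = 2 * nr_btree_reserve + nbuckets/64

and the return value is then max(0, total - buckets_unavailable - nr_open_buckets - reserved).
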
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
- struct bch_dev_usage stats)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+ enum alloc_reserve reserve)
{
- struct bch_fs *c = ca->fs;
- s64 available = __dev_buckets_available(ca, stats);
- unsigned i;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++)
- available -= fifo_used(&ca->free[i]);
- available -= fifo_used(&ca->free_inc);
- available -= ca->nr_open_buckets;
- spin_unlock(&c->freelist_lock);
-
- return max(available, 0LL);
-}
-
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
-{
- return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
}
/* Filesystem usage: */
static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
-
return sizeof(struct bch_fs_usage) / sizeof(u64) +
READ_ONCE(c->replicas.nr);
}
@@ -240,21 +192,54 @@ bch2_fs_usage_read_short(struct bch_fs *);
/* key/bucket marking: */
-void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_update(struct btree_trans *, struct btree_path *,
- struct bkey_i *, unsigned);
+int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
+
+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
- struct bkey_s_c, unsigned);
+ struct bkey_i *, unsigned);
+
+static inline int bch2_trans_mark_old(struct btree_trans *trans,
+ struct bkey_s_c old, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = old.k->p;
+
+ return bch2_trans_mark_key(trans, old, &deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+}
+
+static inline int bch2_trans_mark_new(struct btree_trans *trans,
+ struct bkey_i *new, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = new->k.p;
+
+ return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_INSERT|flags);
+}
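
Note: a hedged sketch of how the two helpers above compose for a plain overwrite; the wrapper and the zero flags are illustrative, not a caller taken from this patch.

	/* Illustrative composition of bch2_trans_mark_new()/_old(): */
	static int mark_overwrite(struct btree_trans *trans,
				  struct bkey_s_c old, struct bkey_i *new)
	{
		/* insert trigger for the new key, then overwrite trigger
		 * for the key it replaces: */
		return  bch2_trans_mark_new(trans, new, 0) ?:
			bch2_trans_mark_old(trans, old, 0);
	}
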
+
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 18bca269b750..e79a33795bf9 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -7,42 +7,15 @@
#define BUCKET_JOURNAL_SEQ_BITS 16
-struct bucket_mark {
- union {
- atomic64_t v;
-
- struct {
- u8 gen;
- u8 data_type:3,
- owned_by_allocator:1,
- journal_seq_valid:1,
- stripe:1;
- u16 dirty_sectors;
- u16 cached_sectors;
-
- /*
- * low bits of journal sequence number when this bucket was most
- * recently modified: if journal_seq_valid is set, this bucket can't be
- * reused until the journal sequence number written to disk is >= the
- * bucket's journal sequence number:
- */
- u16 journal_seq;
- };
- };
-};
-
struct bucket {
- union {
- struct bucket_mark _mark;
- const struct bucket_mark mark;
- };
-
- u64 io_time[2];
- u8 oldest_gen;
- u8 gc_gen;
- unsigned gen_valid:1;
- u8 stripe_redundancy;
- u32 stripe;
+ u8 lock;
+ u8 gen_valid:1;
+ u8 data_type:7;
+ u8 gen;
+ u8 stripe_redundancy;
+ u32 stripe;
+ u32 dirty_sectors;
+ u32 cached_sectors;
};
struct bucket_array {
@@ -121,7 +94,7 @@ struct copygc_heap_entry {
u8 dev;
u8 gen;
u8 replicas;
- u16 fragmentation;
+ u32 fragmentation;
u32 sectors;
u64 offset;
};
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
new file mode 100644
index 000000000000..2e5b955080de
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/random.h>
+
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+ unsigned hash_seed_idx, u64 dev_bucket)
+{
+ unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]);
+
+ BUG_ON(!is_power_of_2(t->size));
+
+ return t->d + (h & (t->size - 1));
+}
+
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size)
+{
+ unsigned i;
+
+ t->size = size;
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
+ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+ memset(t->d, 0, sizeof(t->d[0]) * size);
+}
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket)
+{
+ struct buckets_waiting_for_journal_table *t;
+ u64 dev_bucket = (u64) dev << 56 | bucket;
+ bool ret = false;
+ unsigned i;
+
+ mutex_lock(&b->lock);
+ t = b->t;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
+
+ if (h->dev_bucket == dev_bucket) {
+ ret = h->journal_seq > flushed_seq;
+ break;
+ }
+ }
+
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+ struct bucket_hashed *new,
+ u64 flushed_seq)
+{
+ struct bucket_hashed *last_evicted = NULL;
+ unsigned tries, i;
+
+ for (tries = 0; tries < 10; tries++) {
+ struct bucket_hashed *old, *victim = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ old = bucket_hash(t, i, new->dev_bucket);
+
+ if (old->dev_bucket == new->dev_bucket ||
+ old->journal_seq <= flushed_seq) {
+ *old = *new;
+ return true;
+ }
+
+ if (last_evicted != old)
+ victim = old;
+ }
+
+ /* hashed to same slot 3 times: */
+ if (!victim)
+ break;
+
+ /* Failed to find an empty slot: */
+ swap(*new, *victim);
+ last_evicted = victim;
+ }
+
+ return false;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket,
+ u64 journal_seq)
+{
+ struct buckets_waiting_for_journal_table *t, *n;
+ struct bucket_hashed tmp, new = {
+ .dev_bucket = (u64) dev << 56 | bucket,
+ .journal_seq = journal_seq,
+ };
+ size_t i, new_size, nr_elements = 1, nr_rehashes = 0;
+ int ret = 0;
+
+ mutex_lock(&b->lock);
+
+ if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+ goto out;
+
+ t = b->t;
+ for (i = 0; i < t->size; i++)
+ nr_elements += t->d[i].journal_seq > flushed_seq;
+
+ new_size = nr_elements < t->size / 3 ? t->size : t->size * 2;
+
+ n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+retry_rehash:
+ nr_rehashes++;
+ bucket_table_init(n, new_size);
+
+ tmp = new;
+ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+ for (i = 0; i < t->size; i++) {
+ if (t->d[i].journal_seq <= flushed_seq)
+ continue;
+
+ tmp = t->d[i];
+ if (!bucket_table_insert(n, &tmp, flushed_seq))
+ goto retry_rehash;
+ }
+
+ b->t = n;
+ kvfree(t);
+
+ pr_debug("took %zu rehashes, table at %zu/%zu elements",
+ nr_rehashes, nr_elements, b->t->size);
+out:
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ kvfree(b->t);
+}
+
+#define INITIAL_TABLE_SIZE 8
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ mutex_init(&b->lock);
+
+ b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL);
+ if (!b->t)
+ return -ENOMEM;
+
+ bucket_table_init(b->t, INITIAL_TABLE_SIZE);
+ return 0;
+}
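
Note: the table above is a small cuckoo-style hash keyed by (dev, bucket), with entries whose journal_seq is already flushed treated as free and evicted on insert. A hedged sketch of how a caller might use the two entry points; the wrapper function and the numeric values are invented, the field c->buckets_waiting_for_journal and the two function signatures are from this patch.

	/* Hypothetical caller - only uses the entry points added above: */
	static void example(struct bch_fs *c, u64 flushed_seq)
	{
		struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
		int ret;

		/* bucket 1234 on device 0 was dirtied by journal seq 5678: */
		ret = bch2_set_bucket_needs_journal_commit(b, flushed_seq, 0, 1234, 5678);
		if (ret)
			return;	/* -ENOMEM while growing the hash table */

		/* later, before reusing the bucket: */
		if (bch2_bucket_needs_journal_commit(b, flushed_seq, 0, 1234))
			return;	/* journal entry 5678 not yet flushed to disk */
	}
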
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
new file mode 100644
index 000000000000..d2ae19cbe18c
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
new file mode 100644
index 000000000000..fea7f944d0ed
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+#include <linux/siphash.h>
+
+struct bucket_hashed {
+ u64 dev_bucket;
+ u64 journal_seq;
+};
+
+struct buckets_waiting_for_journal_table {
+ size_t size;
+ siphash_key_t hash_seeds[3];
+ struct bucket_hashed d[];
+};
+
+struct buckets_waiting_for_journal {
+ struct mutex lock;
+ struct buckets_waiting_for_journal_table *t;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index db68a78276cf..aa26588ed5ed 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -568,8 +568,11 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!dev)
+ return -EINVAL;
+
for_each_online_member(ca, c, i)
- if (ca->disk_sb.bdev->bd_dev == dev) {
+ if (ca->dev == dev) {
percpu_ref_put(&ca->io_ref);
return i;
}
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index fbe8603cfb30..425582f60d7a 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -93,9 +93,9 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *
}
}
-static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+ struct nonce nonce,
+ struct scatterlist *sg, size_t len)
{
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
@@ -104,17 +104,20 @@ static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
- BUG_ON(ret);
+ if (ret)
+ pr_err("got error %i from crypto_skcipher_encrypt()", ret);
+
+ return ret;
}
-static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
+static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
struct scatterlist sg;
sg_init_one(&sg, buf, len);
- do_encrypt_sg(tfm, nonce, &sg, len);
+ return do_encrypt_sg(tfm, nonce, &sg, len);
}
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
@@ -136,25 +139,29 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
goto err;
}
- do_encrypt(chacha20, nonce, buf, len);
+ ret = do_encrypt(chacha20, nonce, buf, len);
err:
crypto_free_sync_skcipher(chacha20);
return ret;
}
-static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+ struct nonce nonce)
{
u8 key[POLY1305_KEY_SIZE];
+ int ret;
nonce.d[3] ^= BCH_NONCE_POLY;
memset(key, 0, sizeof(key));
- do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ if (ret)
+ return ret;
desc->tfm = c->poly1305;
crypto_shash_init(desc);
crypto_shash_update(desc, key, sizeof(key));
+ return 0;
}
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
@@ -196,13 +203,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
}
}
-void bch2_encrypt(struct bch_fs *c, unsigned type,
+int bch2_encrypt(struct bch_fs *c, unsigned type,
struct nonce nonce, void *data, size_t len)
{
if (!bch2_csum_type_is_encryption(type))
- return;
+ return 0;
- do_encrypt(c->chacha20, nonce, data, len);
+ return do_encrypt(c->chacha20, nonce, data, len);
}
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
@@ -277,23 +284,27 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
-void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
+int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
{
struct bio_vec bv;
struct bvec_iter iter;
struct scatterlist sgl[16], *sg = sgl;
size_t bytes = 0;
+ int ret = 0;
if (!bch2_csum_type_is_encryption(type))
- return;
+ return 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
bio_for_each_segment(bv, bio, iter) {
if (sg == sgl + ARRAY_SIZE(sgl)) {
sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ if (ret)
+ return ret;
nonce = nonce_add(nonce, bytes);
bytes = 0;
@@ -307,7 +318,7 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
}
sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
}
struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
@@ -407,16 +418,12 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
}
#ifdef __KERNEL__
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+static int __bch2_request_key(char *key_description, struct bch_key *key)
{
- char key_description[60];
struct key *keyring_key;
const struct user_key_payload *ukp;
int ret;
- snprintf(key_description, sizeof(key_description),
- "bcachefs:%pUb", &sb->user_uuid);
-
keyring_key = request_key(&key_type_logon, key_description, NULL);
if (IS_ERR(keyring_key))
return PTR_ERR(keyring_key);
@@ -436,16 +443,10 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
}
#else
#include <keyutils.h>
-#include <uuid/uuid.h>
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+static int __bch2_request_key(char *key_description, struct bch_key *key)
{
key_serial_t key_id;
- char key_description[60];
- char uuid[40];
-
- uuid_unparse_lower(sb->user_uuid.b, uuid);
- sprintf(key_description, "bcachefs:%s", uuid);
key_id = request_key("user", key_description, NULL,
KEY_SPEC_USER_KEYRING);
@@ -459,6 +460,17 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
}
#endif
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+ char key_description[60];
+ char uuid[40];
+
+ uuid_unparse_lower(sb->user_uuid.b, uuid);
+ sprintf(key_description, "bcachefs:%s", uuid);
+
+ return __bch2_request_key(key_description, key);
+}
+
int bch2_decrypt_sb_key(struct bch_fs *c,
struct bch_sb_field_crypt *crypt,
struct bch_key *key)
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index f5c1a609c5c4..c86c3c05d620 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -49,7 +49,7 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
-void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
void *data, size_t);
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
@@ -61,8 +61,8 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
-void bch2_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
+int bch2_encrypt_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 8e4179d8dc27..7d9ebcc9a445 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -197,9 +197,11 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
goto err;
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
- ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
- ret = ZSTD_decompressDCtx(ctx,
+ src_len = le32_to_cpup(src_data.b);
+
+ ret = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
@@ -333,8 +335,8 @@ static int attempt_compress(struct bch_fs *c,
return strm.total_out;
}
case BCH_COMPRESSION_TYPE_zstd: {
- ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
- ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+ ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
+ zstd_cctx_workspace_bound(&c->zstd_params.cParams));
/*
* ZSTD requires that when we decompress we pass in the exact
@@ -347,11 +349,11 @@ static int attempt_compress(struct bch_fs *c,
* factor (7 bytes) from the dst buffer size to account for
* that.
*/
- size_t len = ZSTD_compressCCtx(ctx,
+ size_t len = zstd_compress_cctx(ctx,
dst + 4, dst_len - 4 - 7,
src, src_len,
- c->zstd_params);
- if (ZSTD_isError(len))
+ &c->zstd_params);
+ if (zstd_is_error(len))
return 0;
*((__le32 *) dst) = cpu_to_le32(len);
@@ -546,7 +548,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t decompress_workspace_size = 0;
bool decompress_workspace_needed;
- ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0);
+ ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max);
struct {
unsigned feature;
unsigned type;
@@ -558,8 +560,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), },
{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
- ZSTD_CCtxWorkspaceBound(params.cParams),
- ZSTD_DCtxWorkspaceBound() },
+ zstd_cctx_workspace_bound(&params.cParams),
+ zstd_dctx_workspace_bound() },
}, *i;
int ret = 0;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
new file mode 100644
index 000000000000..745b1cdb0d17
--- /dev/null
+++ b/fs/bcachefs/darray.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
+
+/*
+ * Dynamic arrays:
+ *
+ * Inspired by CCAN's darray
+ */
+
+#include "util.h"
+#include <linux/slab.h>
+
+#define DARRAY(type) \
+struct { \
+ size_t nr, size; \
+ type *data; \
+}
+
+typedef DARRAY(void) darray_void;
+
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more)
+{
+ if (d->nr + more > d->size) {
+ size_t new_size = roundup_pow_of_two(d->nr + more);
+ void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL);
+
+ if (!data)
+ return -ENOMEM;
+
+ d->data = data;
+ d->size = new_size;
+ }
+
+ return 0;
+}
+
+#define darray_make_room(_d, _more) \
+ __darray_make_room((darray_void *) &(_d), sizeof((_d).data[0]), (_more))
+
+#define darray_top(_d) ((_d).data[(_d).nr])
+
+#define darray_push(_d, _item) \
+({ \
+ int _ret = darray_make_room((_d), 1); \
+ \
+ if (!_ret) \
+ (_d).data[(_d).nr++] = (_item); \
+ _ret; \
+})
+
+#define darray_insert_item(_d, _pos, _item) \
+({ \
+ int _ret = darray_make_room((_d), 1); \
+ \
+ if (!_ret) \
+ array_insert_item((_d).data, (_d).nr, (_pos), (_item)); \
+ _ret; \
+})
+
+#define darray_for_each(_d, _i) \
+ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_init(_d) \
+do { \
+ (_d).data = NULL; \
+ (_d).nr = (_d).size = 0; \
+} while (0)
+
+#define darray_exit(_d) \
+do { \
+ kfree((_d).data); \
+ darray_init(_d); \
+} while (0)
+
+#endif /* _BCACHEFS_DARRAY_H */
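
Note: a minimal usage sketch of the darray helpers added above; the typedef, the collect_ids() function and the values pushed are hypothetical, only the macros themselves come from darray.h.

	/* Hypothetical example of the darray API: */
	typedef DARRAY(u64) darray_u64;

	static int collect_ids(void)
	{
		darray_u64 ids;
		u64 *i;
		int ret;

		darray_init(ids);

		ret = darray_push(ids, 42);	/* grows ids.data as needed */
		if (ret)
			goto out;		/* -ENOMEM from krealloc_array() */

		darray_for_each(ids, i)
			pr_info("id %llu", *i);
	out:
		darray_exit(ids);		/* kfree()s and reinitializes */
		return ret;
	}
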
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index ee5b7f696796..2d65ae370931 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -169,10 +169,11 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
failed |= bch2_btree_verify_replica(c, b, p);
if (failed) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ printbuf_exit(&buf);
}
out:
mutex_unlock(&c->verify_lock);
@@ -184,12 +185,12 @@ out:
/* XXX: bch_fs refcounting */
struct dump_iter {
- struct bpos from;
- struct bch_fs *c;
+ struct bch_fs *c;
enum btree_id id;
+ struct bpos from;
+ u64 iter;
- char buf[1 << 12];
- size_t bytes; /* what's currently in buf */
+ struct printbuf buf;
char __user *ubuf; /* destination user buffer */
size_t size; /* size of requested read */
@@ -198,9 +199,9 @@ struct dump_iter {
static int flush_buf(struct dump_iter *i)
{
- if (i->bytes) {
- size_t bytes = min(i->bytes, i->size);
- int err = copy_to_user(i->ubuf, i->buf, bytes);
+ if (i->buf.pos) {
+ size_t bytes = min_t(size_t, i->buf.pos, i->size);
+ int err = copy_to_user(i->ubuf, i->buf.buf, bytes);
if (err)
return err;
@@ -208,8 +209,8 @@ static int flush_buf(struct dump_iter *i)
i->ret += bytes;
i->ubuf += bytes;
i->size -= bytes;
- i->bytes -= bytes;
- memmove(i->buf, i->buf + bytes, i->bytes);
+ i->buf.pos -= bytes;
+ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
}
return 0;
@@ -226,15 +227,20 @@ static int bch2_dump_open(struct inode *inode, struct file *file)
file->private_data = i;
i->from = POS_MIN;
+ i->iter = 0;
i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
i->id = bd->id;
+ i->buf = PRINTBUF;
return 0;
}
static int bch2_dump_release(struct inode *inode, struct file *file)
{
- kfree(file->private_data);
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
return 0;
}
@@ -266,11 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
k = bch2_btree_iter_peek(&iter);
while (k.k && !(err = bkey_err(k))) {
- bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
- i->bytes = strlen(i->buf);
- BUG_ON(i->bytes >= sizeof(i->buf));
- i->buf[i->bytes] = '\n';
- i->bytes++;
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ pr_char(&i->buf, '\n');
k = bch2_btree_iter_next(&iter);
i->from = iter.pos;
@@ -319,8 +322,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
bch2_trans_init(&trans, i->c, 0, 0);
for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) {
- bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
- i->bytes = strlen(i->buf);
+ bch2_btree_node_to_text(&i->buf, i->c, b);
err = flush_buf(i);
if (err)
break;
@@ -384,16 +386,14 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
bch2_btree_node_iter_peek(&l->iter, l->b);
if (l->b != prev_node) {
- bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b);
- i->bytes = strlen(i->buf);
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
err = flush_buf(i);
if (err)
break;
}
prev_node = l->b;
- bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k);
- i->bytes = strlen(i->buf);
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
err = flush_buf(i);
if (err)
break;
@@ -422,10 +422,148 @@ static const struct file_operations bfloat_failed_debug_ops = {
.read = bch2_read_bfloat_failed,
};
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+ struct btree *b)
+{
+ out->tabstops[0] = 32;
+
+ pr_buf(out, "%px btree=%s l=%u ",
+ b,
+ bch2_btree_ids[b->c.btree_id],
+ b->c.level);
+ pr_newline(out);
+
+ pr_indent_push(out, 2);
+
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ pr_newline(out);
+
+ pr_buf(out, "flags: ");
+ pr_tab(out);
+ bch2_flags_to_text(out, bch2_btree_node_flags, b->flags);
+ pr_newline(out);
+
+ pr_buf(out, "written:");
+ pr_tab(out);
+ pr_buf(out, "%u", b->written);
+ pr_newline(out);
+
+ pr_buf(out, "writes blocked:");
+ pr_tab(out);
+ pr_buf(out, "%u", !list_empty_careful(&b->write_blocked));
+ pr_newline(out);
+
+ pr_buf(out, "will make reachable:");
+ pr_tab(out);
+ pr_buf(out, "%lx", b->will_make_reachable);
+ pr_newline(out);
+
+ pr_buf(out, "journal pin %px:", &b->writes[0].journal);
+ pr_tab(out);
+ pr_buf(out, "%llu", b->writes[0].journal.seq);
+ pr_newline(out);
+
+ pr_buf(out, "journal pin %px:", &b->writes[1].journal);
+ pr_tab(out);
+ pr_buf(out, "%llu", b->writes[1].journal.seq);
+ pr_newline(out);
+
+ pr_indent_pop(out, 2);
+}
+
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ break;
+
+ rcu_read_lock();
+ i->buf.atomic++;
+ tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+			i->iter++;
+ } else {
+ done = true;
+ }
+ --i->buf.atomic;
+ rcu_read_unlock();
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations cached_btree_nodes_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_cached_btree_nodes_read,
+};
+
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ break;
+
+ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
+ i->iter++;
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations journal_pins_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_journal_pins_read,
+};
+
void bch2_fs_debug_exit(struct bch_fs *c)
{
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove_recursive(c->debug);
+ if (!IS_ERR_OR_NULL(c->fs_debug_dir))
+ debugfs_remove_recursive(c->fs_debug_dir);
}
void bch2_fs_debug_init(struct bch_fs *c)
@@ -437,29 +575,39 @@ void bch2_fs_debug_init(struct bch_fs *c)
return;
snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
- c->debug = debugfs_create_dir(name, bch_debug);
- if (IS_ERR_OR_NULL(c->debug))
+ c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+ if (IS_ERR_OR_NULL(c->fs_debug_dir))
+ return;
+
+ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+ c->btree_debug, &cached_btree_nodes_ops);
+
+ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+ c->btree_debug, &journal_pins_ops);
+
+ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+ if (IS_ERR_OR_NULL(c->btree_debug_dir))
return;
for (bd = c->btree_debug;
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
- 0400, c->debug, bd,
- &btree_debug_ops);
+ debugfs_create_file(bch2_btree_ids[bd->id],
+ 0400, c->btree_debug_dir, bd,
+ &btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
bch2_btree_ids[bd->id]);
- bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
- &btree_format_debug_ops);
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
bch2_btree_ids[bd->id]);
- bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
- &bfloat_failed_debug_ops);
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &bfloat_failed_debug_ops);
}
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 6f699b736b34..760e4f74715f 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -122,9 +122,9 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- bch_scnmemcpy(out, d.v->d_name,
- bch2_dirent_name_bytes(d));
- pr_buf(out, " -> %llu type %s",
+ pr_buf(out, "%.*s -> %llu type %s",
+ bch2_dirent_name_bytes(d),
+ d.v->d_name,
d.v->d_type != DT_SUBVOL
? le64_to_cpu(d.v->d_inum)
: le32_to_cpu(d.v->d_child_subvol),
@@ -470,16 +470,13 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
if (ret)
return ret;
- for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(dir.inum, 0, snapshot), 0, k, ret) {
- if (k.k->p.inode > dir.inum)
- break;
-
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(dir.inum, 0, snapshot),
+ POS(dir.inum, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
ret = -ENOTEMPTY;
break;
}
- }
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -503,11 +500,9 @@ retry:
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
- SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
- if (k.k->p.inode > inum.inum)
- break;
-
+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
+ SPOS(inum.inum, ctx->pos, snapshot),
+ POS(inum.inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
continue;
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index c52b6faac9b4..81b41b07c24b 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -17,24 +17,20 @@ static int group_cmp(const void *_l, const void *_r)
strncmp(l->label, r->label, sizeof(l->label));
}
-static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g, *sorted = NULL;
- struct bch_sb_field_members *mi;
- struct bch_member *m;
- unsigned i, nr_groups, len;
- const char *err = NULL;
-
- mi = bch2_sb_get_members(sb);
- groups = bch2_sb_get_disk_groups(sb);
- nr_groups = disk_groups_nr(groups);
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ unsigned nr_groups = disk_groups_nr(groups);
+ unsigned i, len;
+ int ret = -EINVAL;
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++) {
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member *m = mi->members + i;
unsigned g;
if (!BCH_MEMBER_GROUP(m))
@@ -42,45 +38,54 @@ static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
g = BCH_MEMBER_GROUP(m) - 1;
- if (g >= nr_groups ||
- BCH_GROUP_DELETED(&groups->entries[g]))
- return "disk has invalid group";
+ if (g >= nr_groups) {
+ pr_buf(err, "disk %u has invalid label %u (have %u)",
+ i, g, nr_groups);
+ return -EINVAL;
+ }
+
+ if (BCH_GROUP_DELETED(&groups->entries[g])) {
+ pr_buf(err, "disk %u has deleted label %u", i, g);
+ return -EINVAL;
+ }
}
if (!nr_groups)
- return NULL;
+ return 0;
+
+ for (i = 0; i < nr_groups; i++) {
+ g = groups->entries + i;
- for (g = groups->entries;
- g < groups->entries + nr_groups;
- g++) {
if (BCH_GROUP_DELETED(g))
continue;
len = strnlen(g->label, sizeof(g->label));
if (!len) {
- err = "group with empty label";
- goto err;
+ pr_buf(err, "label %u empty", i);
+ return -EINVAL;
}
}
sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
if (!sorted)
- return "cannot allocate memory";
+ return -ENOMEM;
memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
- for (i = 0; i + 1 < nr_groups; i++)
- if (!BCH_GROUP_DELETED(sorted + i) &&
- !group_cmp(sorted + i, sorted + i + 1)) {
- err = "duplicate groups";
+ for (g = sorted; g + 1 < sorted + nr_groups; g++)
+ if (!BCH_GROUP_DELETED(g) &&
+ !group_cmp(&g[0], &g[1])) {
+ pr_buf(err, "duplicate label %llu.%.*s",
+ BCH_GROUP_PARENT(g),
+ (int) sizeof(g->label), g->label);
goto err;
}
- err = NULL;
+ ret = 0;
err:
kfree(sorted);
- return err;
+ return ret;
}
static void bch2_sb_disk_groups_to_text(struct printbuf *out,
@@ -338,12 +343,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
-void bch2_disk_path_to_text(struct printbuf *out,
- struct bch_sb_handle *sb,
- unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb->sb);
+ bch2_sb_get_disk_groups(sb);
struct bch_disk_group *g;
unsigned nr = 0;
u16 path[32];
@@ -372,15 +375,13 @@ void bch2_disk_path_to_text(struct printbuf *out,
v = path[--nr];
g = groups->entries + v;
- bch_scnmemcpy(out, g->label,
- strnlen(g->label, sizeof(g->label)));
-
+ pr_buf(out, "%.*s", (int) sizeof(g->label), g->label);
if (nr)
pr_buf(out, ".");
}
return;
inval:
- pr_buf(out, "invalid group %u", v);
+ pr_buf(out, "invalid label %u", v);
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
@@ -444,7 +445,10 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
return -EINVAL;
}
-void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v)
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
{
struct target t = target_decode(v);
@@ -452,33 +456,49 @@ void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v)
case TARGET_NULL:
pr_buf(out, "none");
break;
- case TARGET_DEV: {
- struct bch_dev *ca;
-
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
- char b[BDEVNAME_SIZE];
-
- pr_buf(out, "/dev/%s",
- bdevname(ca->disk_sb.bdev, b));
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
- pr_buf(out, "offline device %u", t.dev);
+ case TARGET_DEV:
+ if (c) {
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ char b[BDEVNAME_SIZE];
+
+ pr_buf(out, "/dev/%s",
+ bdevname(ca->disk_sb.bdev, b));
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ pr_buf(out, "offline device %u", t.dev);
+ } else {
+ pr_buf(out, "invalid device %u", t.dev);
+ }
+
+ rcu_read_unlock();
} else {
- pr_buf(out, "invalid device %u", t.dev);
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ struct bch_member *m = mi->members + t.dev;
+
+ if (bch2_dev_exists(sb, mi, t.dev)) {
+ pr_buf(out, "Device ");
+ pr_uuid(out, m->uuid.b);
+ pr_buf(out, " (%u)", t.dev);
+ } else {
+ pr_buf(out, "Bad device %u", t.dev);
+ }
}
-
- rcu_read_unlock();
break;
- }
case TARGET_GROUP:
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, &c->disk_sb, t.group);
- mutex_unlock(&c->sb_lock);
+ if (c) {
+ mutex_lock(&c->sb_lock);
+ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
+ mutex_unlock(&c->sb_lock);
+ } else {
+ bch2_disk_path_to_text(out, sb, t.group);
+ }
break;
default:
BUG();
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
index 3d84f23c34ed..de915480514b 100644
--- a/fs/bcachefs/disk_groups.h
+++ b/fs/bcachefs/disk_groups.h
@@ -75,11 +75,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *,
- unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
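With the signature change above, superblock-field validate hooks report failures by returning an errno-style int and describing the problem in a caller-supplied buffer rather than returning a static string. A standalone sketch of that calling convention (userspace C, snprintf standing in for the kernel printbuf; names are hypothetical):

#include <errno.h>
#include <stdio.h>

struct err_buf {
	char	msg[128];
};

/* Validate one member's label; on failure return -EINVAL and say why. */
static int validate_label(unsigned dev, unsigned label, unsigned nr_labels,
			  struct err_buf *err)
{
	if (label >= nr_labels) {
		snprintf(err->msg, sizeof(err->msg),
			 "disk %u has invalid label %u (have %u)",
			 dev, label, nr_labels);
		return -EINVAL;
	}
	return 0;
}

int main(void)
{
	struct err_buf err = { "" };
	int ret = validate_label(0, 7, 4, &err);

	if (ret)
		fprintf(stderr, "validation failed: %s\n", err.msg);
	return 0;
}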
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 3cccd1faade5..616a551265e0 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -286,14 +286,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- char buf2[200];
+ struct printbuf buf2 = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i));
bch_err_ratelimited(c,
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
- want.lo, got.lo, buf2);
+ want.lo, got.lo, buf2.buf);
+ printbuf_exit(&buf2);
clear_bit(i, buf->valid);
break;
}
@@ -677,7 +678,7 @@ static int ec_stripe_delete(struct bch_fs *c, size_t idx)
return bch2_btree_delete_range(c, BTREE_ID_stripes,
POS(0, idx),
POS(0, idx + 1),
- NULL);
+ 0, NULL);
}
static void ec_stripe_delete_work(struct work_struct *work)
@@ -1294,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
BUG_ON(nr_have_data > h->s->nr_data);
BUG_ON(nr_have_parity > h->s->nr_parity);
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
buckets.nr = 0;
if (nr_have_parity < h->s->nr_parity) {
ret = bch2_bucket_alloc_set(c, &buckets,
@@ -1306,8 +1304,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
&nr_have_parity,
&have_cache,
h->copygc
- ? RESERVE_MOVINGGC
- : RESERVE_NONE,
+ ? RESERVE_movinggc
+ : RESERVE_none,
0,
cl);
@@ -1323,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
}
if (ret)
- goto err;
+ return ret;
}
buckets.nr = 0;
@@ -1335,8 +1333,8 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
&nr_have_data,
&have_cache,
h->copygc
- ? RESERVE_MOVINGGC
- : RESERVE_NONE,
+ ? RESERVE_movinggc
+ : RESERVE_none,
0,
cl);
@@ -1351,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
}
if (ret)
- goto err;
+ return ret;
}
-err:
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
- return ret;
+
+ return 0;
}
/* XXX: doesn't obey target: */
@@ -1558,50 +1554,48 @@ void bch2_stripes_heap_start(struct bch_fs *c)
bch2_stripes_heap_insert(c, m, iter.pos);
}
-static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k)
+int bch2_stripes_read(struct bch_fs *c)
{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
const struct bch_stripe *s;
- struct bch_fs *c = trans->c;
struct stripe *m;
unsigned i;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
+ int ret;
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- return ret;
+ bch2_trans_init(&trans, c, 0, 0);
- s = bkey_s_c_to_stripe(k).v;
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
- m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->alive = true;
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
- for (i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ s = bkey_s_c_to_stripe(k).v;
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_update(c, m, k.k->p.offset);
- spin_unlock(&c->ec_stripes_heap_lock);
+ m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->alive = true;
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
- return ret;
-}
+ for (i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-int bch2_stripes_read(struct bch_fs *c)
-{
- struct btree_trans trans;
- int ret;
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_update(c, m, k.k->p.offset);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_init(&trans, c, 0, 0);
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
- bch2_stripes_read_fn);
bch2_trans_exit(&trans);
+
if (ret)
bch_err(c, "error reading stripes: %i", ret);
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 78d468c7680a..9d508a2f3bbc 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -14,6 +14,8 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_stripe, \
+ .atomic_trigger = bch2_mark_stripe, \
}
static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 2cea694575e9..8279a9ba76a5 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only");
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
@@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c)
void bch2_fatal_error(struct bch_fs *c)
{
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "fatal error - emergency read only");
}
void bch2_io_error_work(struct work_struct *work)
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 986938298adc..6e63c38186f3 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -39,7 +39,7 @@ void bch2_topology_error(struct bch_fs *);
#define bch2_fs_inconsistent_on(cond, c, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_fs_inconsistent(c, __VA_ARGS__); \
@@ -59,7 +59,7 @@ do { \
#define bch2_dev_inconsistent_on(cond, ca, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_dev_inconsistent(ca, __VA_ARGS__); \
@@ -67,6 +67,26 @@ do { \
})
/*
+ * When a transaction update discovers or is causing a fs inconsistency, it's
+ * helpful to also dump the pending updates:
+ */
+#define bch2_trans_inconsistent(trans, ...) \
+({ \
+ bch_err(trans->c, __VA_ARGS__); \
+ bch2_inconsistent_error(trans->c); \
+ bch2_dump_trans_updates(trans); \
+})
+
+#define bch2_trans_inconsistent_on(cond, trans, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_trans_inconsistent(trans, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
* Fsck errors: inconsistency errors we detect at mount time, and should ideally
* be able to repair:
*/
@@ -129,7 +149,7 @@ void bch2_flush_fsck_errs(struct bch_fs *);
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
#define __fsck_err_on(cond, c, _flags, ...) \
- ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
#define need_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
@@ -164,7 +184,7 @@ do { \
#define bch2_fs_fatal_err_on(cond, c, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_fs_fatal_error(c, __VA_ARGS__); \
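The *_on() macros above evaluate their condition exactly once inside a GNU statement expression, wrap it in unlikely(), and yield the boolean back to the caller so the whole thing can sit in an if (). A standalone sketch of that shape (userspace C using the gcc/clang extensions; the report function is hypothetical):

#include <stdbool.h>
#include <stdio.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)

static void report_inconsistency(const char *msg)
{
	fprintf(stderr, "inconsistency: %s\n", msg);
}

/*
 * Evaluate @cond once; if true, report it; the value of the whole
 * expression is the boolean result of @cond.
 */
#define inconsistent_on(cond, msg)					\
({									\
	bool _ret = unlikely(!!(cond));					\
									\
	if (_ret)							\
		report_inconsistency(msg);				\
	_ret;								\
})

int main(void)
{
	int free_sectors = -5;

	if (inconsistent_on(free_sectors < 0, "negative free space"))
		return 1;
	return 0;
}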
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 58b2c96f450c..2fd5d9672a44 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- unsigned ret = 0;
+ unsigned ret = 0, lru = 0;
bkey_extent_entry_for_each(ptrs, entry) {
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
+ /* Might also be updating LRU btree */
+ if (entry->ptr.cached)
+ lru++;
+
+ fallthrough;
case BCH_EXTENT_ENTRY_stripe_ptr:
ret++;
}
}
- return ret;
+ /*
+ * Updating keys in the alloc btree may also update keys in the
+ * freespace or discard btrees:
+ */
+ return lru + ret * 2;
}
static int count_iters_for_insert(struct btree_trans *trans,
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 44c584e9adaa..77a0d49a2372 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -954,15 +954,25 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
- ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
- pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "",
- ca && ptr_stale(ca, ptr)
- ? " stale" : "");
+ if (!ca) {
+ pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ pr_buf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev,
+ b, offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+
+ if (ca && ptr_stale(ca, ptr))
+ pr_buf(out, " stale");
+ }
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 9c2567274a2b..ae650849d98a 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -381,6 +381,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.key_invalid = bch2_btree_ptr_invalid, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
}
#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \
@@ -388,6 +390,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
}
/* KEY_TYPE_extent: */
@@ -402,6 +406,8 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
.key_merge = bch2_extent_merge, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
}
/* KEY_TYPE_reservation: */
@@ -414,6 +420,8 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.key_invalid = bch2_reservation_invalid, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
+ .trans_trigger = bch2_trans_mark_reservation, \
+ .atomic_trigger = bch2_mark_reservation, \
}
/* Extent checksum entries: */
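The .trans_trigger/.atomic_trigger entries added above extend the per-key-type ops table, letting the generic update path dispatch triggers through the table rather than switching on key type at each call site. A small sketch of that ops-table dispatch shape (hypothetical types and handlers, not the bcachefs bkey_ops layout):

#include <stdio.h>

struct key;				/* opaque in this sketch */

struct key_ops {
	const char	*name;
	int		(*trigger)(const struct key *);
};

static int mark_extent(const struct key *k)	{ (void) k; return 0; }
static int mark_inode(const struct key *k)	{ (void) k; return 0; }

enum key_type { KEY_extent, KEY_inode, KEY_NR };

/* One ops entry per key type; a type with no trigger leaves the slot NULL. */
static const struct key_ops key_ops[KEY_NR] = {
	[KEY_extent]	= { .name = "extent", .trigger = mark_extent },
	[KEY_inode]	= { .name = "inode",  .trigger = mark_inode },
};

static int run_trigger(enum key_type type, const struct key *k)
{
	const struct key_ops *ops = &key_ops[type];

	printf("running trigger for %s key\n", ops->name);
	return ops->trigger ? ops->trigger(k) : 0;
}

int main(void)
{
	run_trigger(KEY_extent, NULL);
	run_trigger(KEY_inode, NULL);
	return 0;
}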
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 26d5cad7e6a5..05429c9631cd 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -17,10 +17,6 @@
*
* With one based indexing each level of the tree starts at a power of two -
* good for cacheline alignment:
- *
- * Size parameter is treated as if we were using 0 based indexing, however:
- * valid nodes, and inorder indices, are in the range [1..size) - that is, there
- * are actually size - 1 elements
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
@@ -42,12 +38,12 @@ static inline unsigned eytzinger1_right_child(unsigned i)
static inline unsigned eytzinger1_first(unsigned size)
{
- return rounddown_pow_of_two(size - 1);
+ return rounddown_pow_of_two(size);
}
static inline unsigned eytzinger1_last(unsigned size)
{
- return rounddown_pow_of_two(size) - 1;
+ return rounddown_pow_of_two(size + 1) - 1;
}
/*
@@ -62,13 +58,13 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i >= size);
+ EBUG_ON(i > size);
- if (eytzinger1_right_child(i) < size) {
+ if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
- i <<= __fls(size) - __fls(i);
- i >>= i >= size;
+ i <<= __fls(size + 1) - __fls(i);
+ i >>= i > size;
} else {
i >>= ffz(i) + 1;
}
@@ -78,14 +74,14 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i >= size);
+ EBUG_ON(i > size);
- if (eytzinger1_left_child(i) < size) {
+ if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
- i <<= __fls(size) - __fls(i);
+ i <<= __fls(size + 1) - __fls(i);
i -= 1;
- i >>= i >= size;
+ i >>= i > size;
} else {
i >>= __ffs(i) + 1;
}
@@ -95,17 +91,17 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
static inline unsigned eytzinger1_extra(unsigned size)
{
- return (size - rounddown_pow_of_two(size - 1)) << 1;
+ return (size + 1 - rounddown_pow_of_two(size)) << 1;
}
static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
unsigned b = __fls(i);
- unsigned shift = __fls(size - 1) - b;
+ unsigned shift = __fls(size) - b;
int s;
- EBUG_ON(!i || i >= size);
+ EBUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@@ -130,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
- EBUG_ON(!i || i >= size);
+ EBUG_ON(!i || i > size);
/*
* sign bit trick:
@@ -144,7 +140,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
shift = __ffs(i);
i >>= shift + 1;
- i |= 1U << (__fls(size - 1) - shift);
+ i |= 1U << (__fls(size) - shift);
return i;
}
@@ -185,39 +181,39 @@ static inline unsigned eytzinger0_right_child(unsigned i)
static inline unsigned eytzinger0_first(unsigned size)
{
- return eytzinger1_first(size + 1) - 1;
+ return eytzinger1_first(size) - 1;
}
static inline unsigned eytzinger0_last(unsigned size)
{
- return eytzinger1_last(size + 1) - 1;
+ return eytzinger1_last(size) - 1;
}
static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
- return eytzinger1_next(i + 1, size + 1) - 1;
+ return eytzinger1_next(i + 1, size) - 1;
}
static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
- return eytzinger1_prev(i + 1, size + 1) - 1;
+ return eytzinger1_prev(i + 1, size) - 1;
}
static inline unsigned eytzinger0_extra(unsigned size)
{
- return eytzinger1_extra(size + 1);
+ return eytzinger1_extra(size);
}
static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
- return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
+ return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
}
static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
unsigned extra)
{
- return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
+ return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
}
static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
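Under the revised convention in eytzinger.h, size is the number of elements and valid 1-based node indices run from 1 to size, with node i's children at 2i and 2i+1. A small standalone sketch of that layout (array walk only, not the header's iteration helpers):

#include <stdio.h>

/* 1-based eytzinger layout: children of node i live at 2i and 2i + 1. */
static unsigned eyt1_left(unsigned i)	{ return 2 * i; }
static unsigned eyt1_right(unsigned i)	{ return 2 * i + 1; }

/* In-order walk of a 1-based eytzinger array holding @size elements. */
static void eyt1_inorder(const int *a, unsigned i, unsigned size)
{
	if (i > size)
		return;
	eyt1_inorder(a, eyt1_left(i), size);
	printf("%d ", a[i]);
	eyt1_inorder(a, eyt1_right(i), size);
}

int main(void)
{
	/* index 0 unused; nodes 1..7 hold a sorted set in eytzinger order */
	int a[8] = { 0, 4, 2, 6, 1, 3, 5, 7 };

	eyt1_inorder(a, 1, 7);		/* prints 1 2 3 4 5 6 7 */
	printf("\n");
	return 0;
}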
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 9cdd03f3eeb0..051372b88347 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -35,6 +35,15 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return true;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return true;
+ return false;
+}
+
static inline struct address_space *faults_disabled_mapping(void)
{
return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
@@ -1024,7 +1033,7 @@ retry:
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
+ BTREE_ITER_SLOTS);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
@@ -1062,8 +1071,6 @@ retry:
sectors = min(sectors, k.k->size - offset_into_extent);
- bch2_trans_unlock(trans);
-
if (readpages_iter)
readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
@@ -1280,7 +1287,7 @@ static void bch2_writepage_io_done(struct closure *cl)
* racing with fallocate can cause us to add fewer sectors than
* expected - but we shouldn't add more sectors than expected:
*/
- WARN_ON(io->op.i_sectors_delta > 0);
+ WARN_ON_ONCE(io->op.i_sectors_delta > 0);
/*
* (error (due to going RO) halfway through a page can screw that up
@@ -1466,8 +1473,8 @@ do_io:
sectors << 9, offset << 9));
/* Check for writing past i_size: */
- WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)));
+ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+ round_up(i_size, block_bytes(c)));
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
@@ -1810,11 +1817,11 @@ again:
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
- if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
bytes = min_t(unsigned long, iov_iter_count(iter),
PAGE_SIZE - offset);
- if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
ret = -EFAULT;
break;
}
@@ -1872,7 +1879,7 @@ static void bch2_dio_read_complete(struct closure *cl)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
- dio->req->ki_complete(dio->req, dio->ret, 0);
+ dio->req->ki_complete(dio->req, dio->ret);
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}
@@ -1921,7 +1928,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
iter->count -= shorten;
bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_VECS),
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
&c->dio_read_bioset);
bio->bi_end_io = bch2_direct_IO_read_endio;
@@ -1956,7 +1963,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
goto start;
while (iter->count) {
bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_VECS),
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
&c->bio_read);
bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
@@ -2103,7 +2110,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
while (1) {
iter_count = dio->iter.count;
- if (kthread)
+ if (kthread && dio->mm)
kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
@@ -2113,7 +2120,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
dropped_locks = fdm_dropped_locks();
current->faults_disabled_mapping = NULL;
- if (kthread)
+ if (kthread && dio->mm)
kthread_unuse_mm(dio->mm);
/*
@@ -2246,7 +2253,7 @@ err:
inode_dio_end(&inode->v);
if (!sync) {
- req->ki_complete(req, ret, 0);
+ req->ki_complete(req, ret);
ret = -EIOCBQUEUED;
}
return ret;
@@ -2306,9 +2313,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
}
bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_is_bvec(iter)
- ? 0
- : iov_iter_npages(iter, BIO_MAX_VECS),
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
init_completion(&dio->done);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 2d2ad7f768c0..d462c06899d6 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -30,6 +30,7 @@
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
+#include <linux/seq_file.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>
@@ -104,7 +105,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
bch2_assert_pos_locked(trans, BTREE_ID_inodes,
POS(0, bi->bi_inum),
- 0 && c->opts.inodes_use_key_cache);
+ c->opts.inodes_use_key_cache);
set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
@@ -134,7 +135,6 @@ int __must_check bch2_write_inode(struct bch_fs *c,
int ret;
bch2_trans_init(&trans, c, 0, 512);
- trans.ip = _RET_IP_;
retry:
bch2_trans_begin(&trans);
@@ -934,9 +934,9 @@ retry:
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
SPOS(ei->v.i_ino, start, snapshot), 0);
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = bkey_err(k)) &&
- bkey_cmp(iter.pos, end) < 0) {
+ while (!(ret = btree_trans_too_many_iters(&trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
+ !(ret = bkey_err(k))) {
enum btree_id data_btree = BTREE_ID_extents;
if (!bkey_extent_is_data(k.k) &&
@@ -1472,12 +1472,12 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode_inum(inode), true);
+ bch2_inode_rm(c, inode_inum(inode));
}
}
void bch2_evict_subvolume_inodes(struct bch_fs *c,
- struct snapshot_id_list *s)
+ snapshot_id_list *s)
{
struct super_block *sb = c->vfs_sb;
struct inode *inode;
@@ -1675,7 +1675,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
enum bch_opt_id i;
- char buf[512];
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
@@ -1687,13 +1688,17 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
continue;
- bch2_opt_to_text(&PBUF(buf), c, opt, v,
+ printbuf_reset(&buf);
+ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
OPT_SHOW_MOUNT_STYLE);
seq_putc(seq, ',');
- seq_puts(seq, buf);
+ seq_puts(seq, buf.buf);
}
- return 0;
+ if (buf.allocation_failure)
+ ret = -ENOMEM;
+ printbuf_exit(&buf);
+ return ret;
}
static void bch2_put_super(struct super_block *sb)
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index b2211ec7f302..9f4b57e30e2a 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -191,7 +191,7 @@ int bch2_setattr_nonsize(struct user_namespace *,
struct iattr *);
int __bch2_unlink(struct inode *, struct dentry *, bool);
-void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *);
+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
@@ -199,7 +199,7 @@ int bch2_vfs_init(void);
#else
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
- struct snapshot_id_list *s) {}
+ snapshot_id_list *s) {}
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
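snapshot_id_list becoming a typedef here reflects the new DARRAY() helper: a typed, growable array declared inline instead of a bespoke struct with hand-rolled realloc bookkeeping. A minimal userspace sketch of the idea (anonymous-struct macro plus a type-specific push; the real darray.h push is generic):

#include <stdio.h>
#include <stdlib.h>

/* Declare a growable array of TYPE with inline data/nr/size fields. */
#define DARRAY(type)	struct { type *data; size_t nr, size; }

typedef DARRAY(unsigned) snapshot_list;

/* Grow by doubling; returns 0 on success, -1 on allocation failure. */
static int snapshot_list_push(snapshot_list *l, unsigned id)
{
	if (l->nr == l->size) {
		size_t new_size = l->size ? l->size * 2 : 8;
		unsigned *d = realloc(l->data, new_size * sizeof(*d));

		if (!d)
			return -1;
		l->data = d;
		l->size = new_size;
	}

	l->data[l->nr++] = id;
	return 0;
}

int main(void)
{
	snapshot_list l = { NULL, 0, 0 };
	unsigned id;

	for (id = 1; id <= 10; id++)
		if (snapshot_list_push(&l, id))
			return 1;

	printf("have %zu snapshot ids, last %u\n", l.nr, l.data[l.nr - 1]);
	free(l.data);
	return 0;
}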
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 361dbf338023..2582ddf14803 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_update.h"
+#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs-common.h"
@@ -471,11 +472,11 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, str
pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
if (bkey_cmp(s->pos, pos))
- s->nr = 0;
+ s->ids.nr = 0;
s->pos = pos;
/* Might get called multiple times due to lock restarts */
- if (s->nr && s->d[s->nr - 1] == pos.snapshot)
+ if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot)
return 0;
return snapshots_seen_add(c, s, pos.snapshot);
@@ -498,7 +499,7 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
ancestor = snapshot_t(c, ancestor)->equiv;
/* @ancestor should be the snapshot most recently added to @seen */
- BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
+ BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor);
BUG_ON(seen->pos.snapshot != ancestor);
if (id == ancestor)
@@ -507,11 +508,11 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
if (!bch2_snapshot_is_ancestor(c, id, ancestor))
return false;
- for (i = seen->nr - 2;
- i >= 0 && seen->d[i] >= id;
+ for (i = seen->ids.nr - 2;
+ i >= 0 && seen->ids.data[i] >= id;
--i)
- if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
- bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) &&
+ bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor))
return false;
return true;
@@ -537,26 +538,25 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
}
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
- for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\
if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ u64 count;
+};
+
struct inode_walker {
bool first_this_inode;
u64 cur_inum;
- size_t nr;
- size_t size;
- struct inode_walker_entry {
- struct bch_inode_unpacked inode;
- u32 snapshot;
- u64 count;
- } *d;
+ DARRAY(struct inode_walker_entry) inodes;
};
static void inode_walker_exit(struct inode_walker *w)
{
- kfree(w->d);
- w->d = NULL;
+ darray_exit(w->inodes);
}
static struct inode_walker inode_walker_init(void)
@@ -564,40 +564,17 @@ static struct inode_walker inode_walker_init(void)
return (struct inode_walker) { 0, };
}
-static int inode_walker_realloc(struct inode_walker *w)
-{
- if (w->nr == w->size) {
- size_t new_size = max_t(size_t, 8UL, w->size * 2);
- void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
- GFP_KERNEL);
- if (!d)
- return -ENOMEM;
-
- w->d = d;
- w->size = new_size;
- }
-
- return 0;
-}
-
static int add_inode(struct bch_fs *c, struct inode_walker *w,
struct bkey_s_c inode)
{
struct bch_inode_unpacked u;
- int ret;
-
- ret = inode_walker_realloc(w);
- if (ret)
- return ret;
BUG_ON(bch2_inode_unpack(inode, &u));
- w->d[w->nr++] = (struct inode_walker_entry) {
+ return darray_push(w->inodes, ((struct inode_walker_entry) {
.inode = u,
.snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv,
- };
-
- return 0;
+ }));
}
static int __walk_inode(struct btree_trans *trans,
@@ -616,7 +593,7 @@ static int __walk_inode(struct btree_trans *trans,
goto lookup_snapshot;
}
- w->nr = 0;
+ w->inodes.nr = 0;
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -634,26 +611,25 @@ static int __walk_inode(struct btree_trans *trans,
w->cur_inum = pos.inode;
w->first_this_inode = true;
lookup_snapshot:
- for (i = 0; i < w->nr; i++)
- if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+ for (i = 0; i < w->inodes.nr; i++)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot))
goto found;
return INT_MAX;
found:
- BUG_ON(pos.snapshot > w->d[i].snapshot);
+ BUG_ON(pos.snapshot > w->inodes.data[i].snapshot);
- if (pos.snapshot != w->d[i].snapshot) {
+ if (pos.snapshot != w->inodes.data[i].snapshot) {
ancestor_pos = i;
- while (i && w->d[i - 1].snapshot > pos.snapshot)
+ while (i && w->inodes.data[i - 1].snapshot > pos.snapshot)
--i;
- ret = inode_walker_realloc(w);
+ ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]);
if (ret)
return ret;
- array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
- w->d[i].snapshot = pos.snapshot;
- w->d[i].count = 0;
+ w->inodes.data[i].snapshot = pos.snapshot;
+ w->inodes.data[i].count = 0;
}
return i;
@@ -669,7 +645,7 @@ static int __get_visible_inodes(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- w->nr = 0;
+ w->inodes.nr = 0;
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -695,15 +671,16 @@ static int check_key_has_snapshot(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
"key in missing snapshot: %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -743,7 +720,7 @@ static int hash_check_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
- char buf[200];
+ struct printbuf buf = PRINTBUF;
struct bkey_s_c k;
u64 hash;
int ret = 0;
@@ -767,8 +744,9 @@ static int hash_check_key(struct btree_trans *trans,
if (fsck_err_on(k.k->type == desc.key_type &&
!desc.cmp_bkey(k, hash_k), c,
"duplicate hash table keys:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- hash_k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ buf.buf))) {
ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
break;
}
@@ -779,13 +757,16 @@ static int hash_check_key(struct btree_trans *trans,
}
}
+out:
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
bad_hash:
if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
"hashed to %llu\n%s",
desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE)
return 0;
ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
@@ -793,9 +774,9 @@ bad_hash:
bch_err(c, "hash_redo_key err %i", ret);
return ret;
}
- return -EINTR;
+ ret = -EINTR;
fsck_err:
- return ret;
+ goto out;
}
static int check_inode(struct btree_trans *trans,
@@ -1125,7 +1106,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
int ret = 0, ret2 = 0;
s64 count2;
- for (i = w->d; i < w->d + w->nr; i++) {
+ darray_for_each(w->inodes, i) {
if (i->inode.bi_sectors == i->count)
continue;
@@ -1163,32 +1144,34 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct inode_walker_entry *i;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
k = bch2_btree_iter_peek(iter);
if (!k.k)
- return 0;
+ goto out;
ret = bkey_err(k);
if (ret)
- return ret;
+ goto err;
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
- return ret < 0 ? ret : 0;
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
ret = snapshots_seen_update(c, s, k.k->p);
if (ret)
- return ret;
+ goto err;
if (k.k->type == KEY_TYPE_whiteout)
- return 0;
+ goto out;
if (inode->cur_inum != k.k->p.inode) {
ret = check_i_sectors(trans, inode);
if (ret)
- return ret;
+ goto err;
}
#if 0
if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
@@ -1198,33 +1181,43 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
bch2_bkey_val_to_text(&PBUF(buf2), c, k);
- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
- return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
+ ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+ goto out;
+ }
}
#endif
ret = __walk_inode(trans, inode, k.k->p);
if (ret < 0)
- return ret;
+ goto err;
if (fsck_err_on(ret == INT_MAX, c,
"extent in missing inode:\n %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
- if (ret == INT_MAX)
- return 0;
+ if (ret == INT_MAX) {
+ ret = 0;
+ goto out;
+ }
- i = inode->d + ret;
+ i = inode->inodes.data + ret;
ret = 0;
if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
!S_ISLNK(i->inode.bi_mode), c,
"extent in non regular inode mode %o:\n %s",
i->inode.bi_mode,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
@@ -1234,11 +1227,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
"extent type %u offset %llu past end of inode %llu, i_size %llu",
k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
k.k->p.snapshot),
POS(k.k->p.inode, U64_MAX),
0, NULL) ?: -EINTR;
+ goto out;
}
}
}
@@ -1250,7 +1244,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_buf_reassemble(&prev, c, k);
#endif
+out:
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -1309,12 +1306,13 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
int ret = 0, ret2 = 0;
s64 count2;
- for (i = w->d; i < w->d + w->nr; i++) {
+ darray_for_each(w->inodes, i) {
if (i->inode.bi_nlink == i->count)
continue;
- count2 = lockrestart_do(trans,
- bch2_count_subdirs(trans, w->cur_inum, i->snapshot));
+ count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
+ if (count2 < 0)
+ return count2;
if (i->count != count2) {
bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
@@ -1347,7 +1345,7 @@ static int check_dirent_target(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
bool backpointer_exists = true;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
if (!target->bi_dir &&
@@ -1373,9 +1371,7 @@ static int check_dirent_target(struct btree_trans *trans,
"directory %llu with multiple links",
target->bi_inum)) {
ret = __remove_dirent(trans, d.k->p);
- if (ret)
- goto err;
- return 0;
+ goto out;
}
if (fsck_err_on(backpointer_exists &&
@@ -1412,18 +1408,19 @@ static int check_dirent_target(struct btree_trans *trans,
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
- (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = inode_d_type(target);
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
- return ret;
+ goto err;
d = dirent_i_to_s_c(n);
}
@@ -1437,19 +1434,21 @@ static int check_dirent_target(struct btree_trans *trans,
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
- return ret;
+ goto err;
d = dirent_i_to_s_c(n);
}
+out:
err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -1463,68 +1462,81 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k;
struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
- char buf[200];
- int ret;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
k = bch2_btree_iter_peek(iter);
if (!k.k)
- return 0;
+ goto out;
ret = bkey_err(k);
if (ret)
- return ret;
+ goto err;
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
- return ret < 0 ? ret : 0;
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
ret = snapshots_seen_update(c, s, k.k->p);
if (ret)
- return ret;
+ goto err;
if (k.k->type == KEY_TYPE_whiteout)
- return 0;
+ goto out;
if (dir->cur_inum != k.k->p.inode) {
ret = check_subdir_count(trans, dir);
if (ret)
- return ret;
+ goto err;
}
ret = __walk_inode(trans, dir, k.k->p);
if (ret < 0)
- return ret;
+ goto err;
if (fsck_err_on(ret == INT_MAX, c,
"dirent in nonexisting directory:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
- if (ret == INT_MAX)
- return 0;
+ if (ret == INT_MAX) {
+ ret = 0;
+ goto out;
+ }
- i = dir->d + ret;
+ i = dir->inodes.data + ret;
ret = 0;
if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
"dirent in non directory inode type %s:\n%s",
bch2_d_type_str(inode_d_type(&i->inode)),
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter, 0);
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
if (dir->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
+ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
ret = hash_check_key(trans, bch2_dirent_hash_desc,
hash_info, iter, k);
if (ret < 0)
- return ret;
- if (ret) /* dirent has been deleted */
- return 0;
+ goto err;
+ if (ret) {
+ /* dirent has been deleted */
+ ret = 0;
+ goto out;
+ }
if (k.k->type != KEY_TYPE_dirent)
- return 0;
+ goto out;
d = bkey_s_c_to_dirent(k);
@@ -1537,24 +1549,27 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
ret = __subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && ret != -ENOENT)
- return ret;
+ goto err;
if (fsck_err_on(ret, c,
"dirent points to missing subvolume %llu",
- le64_to_cpu(d.v->d_child_subvol)))
- return __remove_dirent(trans, d.k->p);
+ le64_to_cpu(d.v->d_child_subvol))) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto err;
+ }
ret = __lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && ret != -ENOENT)
- return ret;
+ goto err;
if (fsck_err_on(ret, c,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
bch_err(c, "repair not implemented yet");
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
@@ -1564,32 +1579,33 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
subvol_root.bi_subvol = target_subvol;
ret = __write_inode(trans, &subvol_root, target_snapshot);
if (ret)
- return ret;
+ goto err;
}
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
if (ret)
- return ret;
+ goto err;
} else {
ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
if (ret)
- return ret;
+ goto err;
- if (fsck_err_on(!target->nr, c,
+ if (fsck_err_on(!target->inodes.nr, c,
"dirent points to missing inode:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
ret = __remove_dirent(trans, d.k->p);
if (ret)
- return ret;
+ goto err;
}
- for (i = target->d; i < target->d + target->nr; i++) {
+ darray_for_each(target->inodes, i) {
ret = check_dirent_target(trans, iter, d,
&i->inode, i->snapshot);
if (ret)
- return ret;
+ goto err;
}
}
@@ -1597,7 +1613,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
i->count++;
+out:
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -1680,7 +1699,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
ret = 0;
if (inode->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
+ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
fsck_err:
@@ -1790,21 +1809,18 @@ static int check_root(struct bch_fs *c)
check_root_trans(&trans));
}
-struct pathbuf {
- size_t nr;
- size_t size;
-
- struct pathbuf_entry {
- u64 inum;
- u32 snapshot;
- } *entries;
+struct pathbuf_entry {
+ u64 inum;
+ u32 snapshot;
};
-static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
+typedef DARRAY(struct pathbuf_entry) pathbuf;
+
+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
struct pathbuf_entry *i;
- for (i = p->entries; i < p->entries + p->nr; i++)
+ darray_for_each(*p, i)
if (i->inum == inum &&
i->snapshot == snapshot)
return true;
@@ -1812,26 +1828,18 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-static int path_down(struct pathbuf *p, u64 inum, u32 snapshot)
+static int path_down(struct bch_fs *c, pathbuf *p,
+ u64 inum, u32 snapshot)
{
- if (p->nr == p->size) {
- size_t new_size = max_t(size_t, 256UL, p->size * 2);
- void *n = krealloc(p->entries,
- new_size * sizeof(p->entries[0]),
- GFP_KERNEL);
- if (!n) {
- return -ENOMEM;
- }
-
- p->entries = n;
- p->size = new_size;
- };
-
- p->entries[p->nr++] = (struct pathbuf_entry) {
+ int ret = darray_push(*p, ((struct pathbuf_entry) {
.inum = inum,
.snapshot = snapshot,
- };
- return 0;
+ }));
+
+ if (ret)
+ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+ p->size);
+ return ret;
}
/*
@@ -1840,7 +1848,7 @@ static int path_down(struct pathbuf *p, u64 inum, u32 snapshot)
* XXX: we should also be verifying that inodes are in the right subvolumes
*/
static int check_path(struct btree_trans *trans,
- struct pathbuf *p,
+ pathbuf *p,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
@@ -1893,7 +1901,7 @@ static int check_path(struct btree_trans *trans,
if (!S_ISDIR(inode->bi_mode))
break;
- ret = path_down(p, inode->bi_inum, snapshot);
+ ret = path_down(c, p, inode->bi_inum, snapshot);
if (ret) {
bch_err(c, "memory allocation failure");
return ret;
@@ -1914,7 +1922,7 @@ static int check_path(struct btree_trans *trans,
/* XXX print path */
bch_err(c, "directory structure loop");
- for (i = p->entries; i < p->entries + p->nr; i++)
+ darray_for_each(*p, i)
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
@@ -1951,7 +1959,7 @@ static int check_directory_structure(struct bch_fs *c)
struct btree_iter iter;
struct bkey_s_c k;
struct bch_inode_unpacked u;
- struct pathbuf path = { 0, 0, NULL };
+ pathbuf path = { 0, };
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
@@ -1981,7 +1989,7 @@ static int check_directory_structure(struct bch_fs *c)
BUG_ON(ret == -EINTR);
- kfree(path.entries);
+ darray_exit(path);
bch2_trans_exit(&trans);
return ret;
@@ -1998,12 +2006,15 @@ struct nlink_table {
} *d;
};
-static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot)
+static int add_nlink(struct bch_fs *c, struct nlink_table *t,
+ u64 inum, u32 snapshot)
{
if (t->nr == t->size) {
size_t new_size = max_t(size_t, 128UL, t->size * 2);
void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
if (!d) {
+ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
+ new_size);
return -ENOMEM;
}
@@ -2093,7 +2104,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
if (!u.bi_nlink)
continue;
- ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot);
+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
if (ret) {
*end = k.k->p.offset;
ret = 0;
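The fsck rewrites above also converge many early return statements onto shared out/err/fsck_err labels so the printbuf is always freed on every exit path. A standalone sketch of that single-exit cleanup shape (hypothetical check function, plain C):

#include <stdio.h>
#include <stdlib.h>

/* Pretend key check: allocate scratch space, leave through one exit path. */
static int check_key(const char *key)
{
	char *buf = NULL;
	int ret = 0;

	buf = malloc(256);
	if (!buf) {
		ret = -1;
		goto err;
	}

	if (!key || !*key) {
		snprintf(buf, 256, "empty key");
		fprintf(stderr, "fsck: %s\n", buf);
		ret = 1;		/* "repaired"-style positive return */
		goto out;
	}

	snprintf(buf, 256, "key ok: %s", key);
	printf("%s\n", buf);
out:
err:
	free(buf);			/* always runs, like printbuf_exit() */
	return ret;
}

int main(void)
{
	check_key("inode 42");
	check_key("");
	return 0;
}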
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index ef6da53567b8..14b0b595202d 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -252,15 +252,13 @@ int bch2_inode_peek(struct btree_trans *trans,
u32 snapshot;
int ret;
- if (0 && trans->c->opts.inodes_use_key_cache)
- flags |= BTREE_ITER_CACHED;
-
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot), flags);
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
@@ -585,79 +583,62 @@ found_slot:
static int bch2_inode_delete_keys(struct btree_trans *trans,
subvol_inum inum, enum btree_id id)
{
- u64 offset = 0;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ u32 snapshot;
int ret = 0;
- while (!ret || ret == -EINTR) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(trans->c, 0);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i delete;
- u32 snapshot;
+ /*
+ * We're never going to be deleting extents, no need to use an extent
+ * iterator:
+ */
+ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ while (1) {
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- continue;
+ goto err;
- bch2_trans_iter_init(trans, &iter, id,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
-
- if (!k.k || iter.pos.inode != inum.inum) {
- bch2_trans_iter_exit(trans, &iter);
- break;
- }
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
+ if (!k.k)
+ break;
+
bkey_init(&delete.k);
delete.k.p = iter.pos;
- if (btree_node_type_is_extents(iter.btree_id)) {
- unsigned max_sectors =
- min_t(u64, U64_MAX - iter.pos.offset,
- KEY_SIZE_MAX & (~0 << trans->c->block_bits));
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
-
- ret = bch2_extent_trim_atomic(trans, &iter, &delete);
- if (ret)
- goto err;
- }
-
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
- bch2_trans_commit(trans, &disk_res, NULL,
+ bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
- bch2_disk_reservation_put(trans->c, &disk_res);
err:
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
+ if (ret && ret != -EINTR)
+ break;
}
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans trans;
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
- unsigned iter_flags = BTREE_ITER_INTENT;
u32 snapshot;
int ret;
- if (0 && cached && c->opts.inodes_use_key_cache)
- iter_flags |= BTREE_ITER_CACHED;
-
bch2_trans_init(&trans, c, 0, 1024);
/*
@@ -681,7 +662,8 @@ retry:
goto err;
bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot), iter_flags);
+ SPOS(0, inum.inum, snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 723186d8afb6..2337ecfc600e 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -13,11 +13,15 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode (struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
}
#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \
.key_invalid = bch2_inode_v2_invalid, \
.val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
}
static inline bool bkey_is_inode(const struct bkey *k)
@@ -87,7 +91,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
int bch2_inode_create(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, u32, u64);
-int bch2_inode_rm(struct bch_fs *, subvol_inum, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *);
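The rewritten bch2_inode_delete_keys keeps one iterator and loops: begin a transaction attempt, re-resolve the snapshot, peek the next key up to (inum, U64_MAX), delete and commit it, and retry the same position when the commit returns -EINTR. A schematic standalone model of that retry structure (stubbed commit, userspace C; none of the real transaction machinery is shown):

#include <errno.h>
#include <stdio.h>

static int keys_left = 5;
static int commits;

/* Stub: pretend the first commit races and asks for a restart. */
static int commit_delete_one(void)
{
	if (++commits == 1)
		return -EINTR;
	keys_left--;
	return 0;
}

/* Delete every key belonging to an inode, restarting on -EINTR. */
static int delete_inode_keys(void)
{
	int ret = 0;

	while (1) {
		/* bch2_trans_begin() would start a fresh attempt here */
		if (!keys_left)			/* peek found nothing: done */
			break;

		ret = commit_delete_one();	/* update + commit one key */
		if (ret && ret != -EINTR)
			break;			/* hard error, give up */
		/* on -EINTR, loop and retry the same position */
	}

	return ret;
}

int main(void)
{
	int ret = delete_inode_keys();

	printf("done: ret %d, commits %d, keys left %d\n",
	       ret, commits, keys_left);
	return ret ? 1 : 0;
}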
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 50b90b728a6d..36929451af2c 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -764,6 +764,7 @@ static int bch2_write_decrypt(struct bch_write_op *op)
struct bch_fs *c = op->c;
struct nonce nonce = extent_nonce(op->version, op->crc);
struct bch_csum csum;
+ int ret;
if (!bch2_csum_type_is_encryption(op->crc.csum_type))
return 0;
@@ -778,10 +779,10 @@ static int bch2_write_decrypt(struct bch_write_op *op)
if (bch2_crc_cmp(op->crc.csum, csum))
return -EIO;
- bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
op->crc.csum_type = 0;
op->crc.csum = (struct bch_csum) { 0, 0 };
- return 0;
+ return ret;
}
static enum prep_encoded_ret {
@@ -996,8 +997,11 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
crc.live_size = src_len >> 9;
swap(dst->bi_iter.bi_size, dst_len);
- bch2_encrypt_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
+ ret = bch2_encrypt_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ if (ret)
+ goto err;
+
crc.csum = bch2_checksum_bio(c, op->csum_type,
extent_nonce(version, crc), dst);
crc.csum_type = op->csum_type;
@@ -1055,7 +1059,7 @@ static void __bch2_write(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct write_point *wp;
- struct bio *bio;
+ struct bio *bio = NULL;
bool skip_put = true;
unsigned nofs_flags;
int ret;
@@ -1772,6 +1776,7 @@ static void __bch2_read_endio(struct work_struct *work)
struct nonce nonce = extent_nonce(rbio->version, crc);
unsigned nofs_flags;
struct bch_csum csum;
+ int ret;
nofs_flags = memalloc_nofs_save();
@@ -1806,7 +1811,10 @@ static void __bch2_read_endio(struct work_struct *work)
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
if (crc_is_compressed(crc)) {
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
goto decompression_err;
} else {
@@ -1817,7 +1825,9 @@ static void __bch2_read_endio(struct work_struct *work)
BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
src->bi_iter.bi_size = dst_iter.bi_size;
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
if (rbio->bounce) {
struct bvec_iter src_iter = src->bi_iter;
@@ -1830,7 +1840,10 @@ static void __bch2_read_endio(struct work_struct *work)
* Re encrypt data we decrypted, so it's consistent with
* rbio->crc:
*/
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
promote_start(rbio->promote, rbio);
rbio->promote = NULL;
}
@@ -1865,6 +1878,11 @@ decompression_err:
"decompression error");
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
goto out;
+decrypt_err:
+ bch_err_inum_ratelimited(c, rbio->read_pos.inode,
+ "decrypt error");
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ goto out;
}
static void bch2_read_endio(struct bio *bio)
@@ -1893,9 +1911,8 @@ static void bch2_read_endio(struct bio *bio)
return;
}
- if (rbio->pick.ptr.cached &&
- (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr))) {
+ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ ptr_stale(ca, &rbio->pick.ptr)) {
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -1954,6 +1971,35 @@ err:
return ret;
}
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bch_extent_ptr ptr)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+ struct btree_iter iter;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf.buf);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)),
+ BTREE_ITER_CACHED);
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
+ goto out;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "%s", buf.buf);
+ bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+ bch2_trans_iter_exit(trans, &iter);
+out:
+ printbuf_exit(&buf);
+}
+
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
@@ -1963,7 +2009,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca;
+ struct bch_dev *ca = NULL;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
@@ -1980,7 +2026,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
zero_fill_bio_iter(&orig->bio, iter);
goto out_read_done;
}
-
+retry_pick:
pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
/* hole or reservation - just zero fill: */
@@ -1993,8 +2039,27 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
goto err;
}
- if (pick_ret > 0)
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+ /*
+ * Stale dirty pointers are treated as IO errors, but @failed isn't
+ * allocated unless we're in the retry path - so if we're not in the
+ * retry path, don't check here, it'll be caught in bch2_read_endio()
+ * and we'll end up in the retry path:
+ */
+ if ((flags & BCH_READ_IN_RETRY) &&
+ !pick.ptr.cached &&
+ unlikely(ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ bch2_mark_io_failure(failed, &pick);
+ goto retry_pick;
+ }
+
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ bch2_trans_unlock(trans);
if (flags & BCH_READ_NODECODE) {
/*
@@ -2241,7 +2306,7 @@ retry:
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
+ BTREE_ITER_SLOTS);
while (1) {
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
@@ -2282,12 +2347,6 @@ retry:
*/
sectors = min(sectors, k.k->size - offset_into_extent);
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bch2_trans_unlock(&trans);
-
bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
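The io.c hunks above turn bch2_encrypt_bio() into a call whose return value is checked, adding a decrypt_err label alongside decompression_err. A minimal standalone sketch of the same goto-based error-propagation shape, with hypothetical decrypt_chunk()/decompress_chunk() stubs standing in for the real helpers:

/*
 * Sketch only - not bcachefs code.  decrypt_chunk() and decompress_chunk()
 * are stand-in stubs; the point is the checked decrypt call and the two
 * error labels, as in __bch2_read_endio() above.
 */
#include <errno.h>
#include <stddef.h>
#include <stdio.h>

static int decrypt_chunk(void *buf, size_t len)
{
	(void) buf; (void) len;
	return 0;			/* stub: pretend decryption succeeded */
}

static int decompress_chunk(void *buf, size_t len)
{
	(void) buf; (void) len;
	return 0;			/* stub: pretend decompression succeeded */
}

static int read_endio_sketch(void *buf, size_t len, int compressed)
{
	int ret;

	ret = decrypt_chunk(buf, len);	/* now checked instead of fire-and-forget */
	if (ret)
		goto decrypt_err;

	if (compressed && decompress_chunk(buf, len))
		goto decompression_err;

	return 0;
decompression_err:
	fprintf(stderr, "decompression error\n");
	return -EIO;
decrypt_err:
	fprintf(stderr, "decrypt error\n");
	return -EIO;
}

int main(void)
{
	char data[16] = "payload";

	return read_endio_sketch(data, sizeof(data), 1);
}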
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index 1aa422dccef7..fb5114518666 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -50,7 +50,7 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
- return op->alloc_reserve == RESERVE_MOVINGGC
+ return op->alloc_reserve == RESERVE_movinggc
? op->c->copygc_wq
: op->c->btree_update_wq;
}
@@ -79,7 +79,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->compression_type = bch2_compression_opt_to_type[opts.compression];
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
- op->alloc_reserve = RESERVE_NONE;
+ op->alloc_reserve = RESERVE_none;
op->incompressible = 0;
op->open_buckets.nr = 0;
op->devs_have.nr = 0;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index e0017dcf3312..505e8367b5f2 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -15,23 +15,26 @@
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
+#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "super-io.h"
#include <trace/events/bcachefs.h>
-static u64 last_unwritten_seq(struct journal *j)
-{
- union journal_res_state s = READ_ONCE(j->reservations);
+#define x(n) #n,
+static const char * const bch2_journal_watermarks[] = {
+ JOURNAL_WATERMARKS()
+ NULL
+};
- lockdep_assert_held(&j->lock);
-
- return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
-}
+static const char * const bch2_journal_errors[] = {
+ JOURNAL_ERRORS()
+ NULL
+};
+#undef x
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
- return seq >= last_unwritten_seq(j);
+ return seq > j->seq_ondisk;
}
static bool __journal_entry_is_open(union journal_res_state state)
@@ -39,6 +42,11 @@ static bool __journal_entry_is_open(union journal_res_state state)
return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
static bool journal_entry_is_open(struct journal *j)
{
return __journal_entry_is_open(j->reservations);
@@ -50,8 +58,6 @@ journal_seq_to_buf(struct journal *j, u64 seq)
struct journal_buf *buf = NULL;
EBUG_ON(seq > journal_cur_seq(j));
- EBUG_ON(seq == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
if (journal_seq_unwritten(j, seq)) {
buf = j->buf + (seq & JOURNAL_BUF_MASK);
@@ -69,54 +75,6 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
p->devs.nr = 0;
}
-static void journal_pin_new_entry(struct journal *j)
-{
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- journal_pin_list_init(fifo_push_ref(&j->pin), 1);
-}
-
-static void bch2_journal_buf_init(struct journal *j)
-{
- struct journal_buf *buf = journal_cur_buf(j);
-
- bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
-
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(journal_cur_seq(j));
- buf->data->u64s = 0;
-}
-
-void bch2_journal_halt(struct journal *j)
-{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
-
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- /*
- * XXX: we're not using j->lock here because this can be called from
- * interrupt context, this can race with journal_write_done()
- */
- if (!j->err_seq)
- j->err_seq = journal_cur_seq(j);
- journal_wake(j);
- closure_wake_up(&journal_cur_buf(j)->wait);
-}
-
/* journal entry close/open: */
void __bch2_journal_buf_put(struct journal *j)
@@ -132,7 +90,7 @@ void __bch2_journal_buf_put(struct journal *j)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
-static bool __journal_entry_close(struct journal *j)
+static void __journal_entry_close(struct journal *j, unsigned closed_val)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
@@ -140,34 +98,24 @@ static bool __journal_entry_close(struct journal *j)
u64 v = atomic64_read(&j->reservations.counter);
unsigned sectors;
+ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
+ closed_val != JOURNAL_ENTRY_ERROR_VAL);
+
lockdep_assert_held(&j->lock);
do {
old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return true;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
- /* this entry will never be written: */
- closure_wake_up(&buf->wait);
- return true;
- }
-
- if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
- j->need_write_time = local_clock();
- }
+ new.cur_entry_offset = closed_val;
- new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
- new.idx++;
-
- if (new.idx == new.unwritten_idx)
- return false;
-
- BUG_ON(journal_state_count(new, new.idx));
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
+ old.cur_entry_offset == new.cur_entry_offset)
+ return;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ if (!__journal_entry_is_open(old))
+ return;
+
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
@@ -197,36 +145,42 @@ static bool __journal_entry_close(struct journal *j)
*/
buf->last_seq = journal_last_seq(j);
buf->data->last_seq = cpu_to_le64(buf->last_seq);
+ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
- /* Initialize new buffer: */
- journal_pin_new_entry(j);
-
- bch2_journal_buf_init(j);
-
cancel_delayed_work(&j->write_work);
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
bch2_journal_space_available(j);
bch2_journal_buf_put(j, old.idx);
- return true;
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+ spin_lock(&j->lock);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
}
static bool journal_entry_want_write(struct journal *j)
{
- union journal_res_state s = READ_ONCE(j->reservations);
- bool ret = false;
+ bool ret = !journal_entry_is_open(j) ||
+ journal_cur_seq(j) == journal_last_unwritten_seq(j);
- /*
- * Don't close it yet if we already have a write in flight, but do set
- * NEED_WRITE:
- */
- if (s.idx != s.unwritten_idx)
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
- else
- ret = __journal_entry_close(j);
+ /* Don't close it yet if we already have a write in flight: */
+ if (ret)
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ else if (nr_unwritten_journal_entries(j)) {
+ struct journal_buf *buf = journal_cur_buf(j);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+ }
return ret;
}
@@ -255,34 +209,71 @@ static bool journal_entry_close(struct journal *j)
static int journal_entry_open(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = journal_cur_buf(j);
+ struct journal_buf *buf = j->buf +
+ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
union journal_res_state old, new;
int u64s;
u64 v;
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
if (j->blocked)
- return cur_entry_blocked;
+ return JOURNAL_ERR_blocked;
if (j->cur_entry_error)
return j->cur_entry_error;
+ if (bch2_journal_error(j))
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+
+ if (!fifo_free(&j->pin))
+ return JOURNAL_ERR_journal_pin_full;
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1)
+ return JOURNAL_ERR_max_in_flight;
+
BUG_ON(!j->cur_entry_sectors);
+ buf->expires =
+ (journal_cur_seq(j) == j->flushed_seq_ondisk
+ ? jiffies
+ : j->last_flush_write) +
+ msecs_to_jiffies(c->opts.journal_flush_delay);
+
buf->u64s_reserved = j->entry_u64s_reserved;
buf->disk_sectors = j->cur_entry_sectors;
buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
u64s = (int) (buf->sectors << 9) / sizeof(u64) -
journal_entry_overhead(j);
- u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+
+ if (u64s <= 0)
+ return JOURNAL_ERR_journal_full;
- if (u64s <= le32_to_cpu(buf->data->u64s))
- return cur_entry_journal_full;
+ if (fifo_empty(&j->pin) && j->reclaim_thread)
+ wake_up_process(j->reclaim_thread);
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for journal_last_seq() to be calculated correctly
+ */
+ atomic64_inc(&j->seq);
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+
+ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
+
+ bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
+
+ memset(buf->data, 0, sizeof(*buf->data));
+ buf->data->seq = cpu_to_le64(journal_cur_seq(j));
+ buf->data->u64s = 0;
/*
* Must be set before marking the journal entry as open:
@@ -293,14 +284,14 @@ static int journal_entry_open(struct journal *j)
do {
old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return cur_entry_insufficient_devices;
+ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ new.idx++;
+ BUG_ON(journal_state_count(new, new.idx));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
- EBUG_ON(journal_state_count(new, new.idx));
journal_state_inc(&new);
+ new.cur_entry_offset = 0;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
@@ -318,8 +309,7 @@ static int journal_entry_open(struct journal *j)
static bool journal_quiesced(struct journal *j)
{
- union journal_res_state s = READ_ONCE(j->reservations);
- bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
+ bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
if (!ret)
journal_entry_close(j);
@@ -334,8 +324,21 @@ static void journal_quiesce(struct journal *j)
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ long delta;
- journal_entry_close(j);
+ spin_lock(&j->lock);
+ if (!__journal_entry_is_open(j->reservations))
+ goto unlock;
+
+ delta = journal_cur_buf(j)->expires - jiffies;
+
+ if (delta > 0)
+ mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+unlock:
+ spin_unlock(&j->lock);
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
@@ -364,13 +367,12 @@ retry:
return 0;
}
- if (!(flags & JOURNAL_RES_GET_RESERVED) &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) {
/*
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
- ret = cur_entry_journal_full;
+ ret = JOURNAL_ERR_journal_full;
goto unlock;
}
@@ -385,20 +387,13 @@ retry:
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- if (journal_entry_is_open(j) &&
- !__journal_entry_close(j)) {
- /*
- * We failed to get a reservation on the current open journal
- * entry because it's full, and we can't close it because
- * there's still a previous one in flight:
- */
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ ret = journal_entry_open(j);
+
+ if (ret == JOURNAL_ERR_max_in_flight)
trace_journal_entry_full(c);
- ret = cur_entry_blocked;
- } else {
- ret = journal_entry_open(j);
- }
unlock:
- if ((ret && ret != cur_entry_insufficient_devices) &&
+ if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
!j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
trace_journal_full(c);
@@ -410,23 +405,24 @@ unlock:
if (!ret)
goto retry;
- if ((ret == cur_entry_journal_full ||
- ret == cur_entry_journal_pin_full) &&
+ if ((ret == JOURNAL_ERR_journal_full ||
+ ret == JOURNAL_ERR_journal_pin_full) &&
!can_discard &&
- j->reservations.idx == j->reservations.unwritten_idx &&
- (flags & JOURNAL_RES_GET_RESERVED)) {
- char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
-
- bch_err(c, "Journal stuck!");
- if (journal_debug_buf) {
- bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
- bch_err(c, "%s", journal_debug_buf);
-
- bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
- bch_err(c, "Journal pins:\n%s", journal_debug_buf);
- kfree(journal_debug_buf);
- }
+ !nr_unwritten_journal_entries(j) &&
+ (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) {
+ struct printbuf buf = PRINTBUF;
+
+ bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)",
+ bch2_journal_errors[ret]);
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "%s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_journal_pins_to_text(&buf, j);
+ bch_err(c, "Journal pins:\n%s", buf.buf);
+
+ printbuf_exit(&buf);
bch2_fatal_error(c);
dump_stack();
}
@@ -435,8 +431,8 @@ unlock:
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
- if ((ret == cur_entry_journal_full ||
- ret == cur_entry_journal_pin_full) &&
+ if ((ret == JOURNAL_ERR_journal_full ||
+ ret == JOURNAL_ERR_journal_pin_full) &&
!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
@@ -449,7 +445,7 @@ unlock:
}
}
- return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
+ return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN;
}
/*
@@ -528,7 +524,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
/*
* Not enough room in current journal entry, have to flush it:
*/
- __journal_entry_close(j);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
@@ -573,12 +569,15 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
}
/* if seq was written, but not flushed - flush a newer one instead */
- seq = max(seq, last_unwritten_seq(j));
+ seq = max(seq, journal_last_unwritten_seq(j));
recheck_need_open:
- if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+ if (seq > journal_cur_seq(j)) {
struct journal_res res = { 0 };
+ if (journal_entry_is_open(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
spin_unlock(&j->lock);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
@@ -588,7 +587,11 @@ recheck_need_open:
seq = res.seq;
buf = j->buf + (seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
if (parent && !closure_wait(&buf->wait, parent))
BUG();
@@ -640,6 +643,58 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
return ret ?: ret2 < 0 ? ret2 : 0;
}
+/*
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
+ * still being written, write it and wait for the write to complete
+ */
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+{
+ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
+}
+
+int bch2_journal_flush(struct journal *j)
+{
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
+}
+
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ u64 unwritten_seq;
+ bool ret = false;
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+ return false;
+
+ if (seq <= c->journal.flushed_seq_ondisk)
+ return false;
+
+ spin_lock(&j->lock);
+ if (seq <= c->journal.flushed_seq_ondisk)
+ goto out;
+
+ for (unwritten_seq = journal_last_unwritten_seq(j);
+ unwritten_seq < seq;
+ unwritten_seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+ /* journal write is already in flight, and was a flush write: */
+ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+ goto out;
+
+ buf->noflush = true;
+ }
+
+ ret = true;
+out:
+ spin_unlock(&j->lock);
+ return ret;
+}
+
int bch2_journal_meta(struct journal *j)
{
struct journal_buf *buf;
@@ -654,55 +709,48 @@ int bch2_journal_meta(struct journal *j)
buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
bch2_journal_res_put(j, &res);
return bch2_journal_flush_seq(j, res.seq);
}
-/*
- * bch2_journal_flush_async - if there is an open journal entry, or a journal
- * still being written, write it and wait for the write to complete
- */
-void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+int bch2_journal_log_msg(struct journal *j, const char *fmt, ...)
{
- u64 seq, journal_seq;
+ struct jset_entry_log *entry;
+ struct journal_res res = { 0 };
+ unsigned msglen, u64s;
+ va_list args;
+ int ret;
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
+ va_start(args, fmt);
+ msglen = vsnprintf(NULL, 0, fmt, args) + 1;
+ va_end(args);
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return;
- }
- spin_unlock(&j->lock);
+ u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64)));
- bch2_journal_flush_seq_async(j, seq, parent);
-}
+ ret = bch2_journal_res_get(j, &res, u64s, 0);
+ if (ret)
+ return ret;
-int bch2_journal_flush(struct journal *j)
-{
- u64 seq, journal_seq;
+ entry = container_of(journal_res_entry(j, &res),
+ struct jset_entry_log, entry);
+ memset(entry, 0, u64s * sizeof(u64));
+ entry->entry.type = BCH_JSET_ENTRY_log;
+ entry->entry.u64s = u64s - 1;
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
+ va_start(args, fmt);
+ vsnprintf(entry->d, INT_MAX, fmt, args);
+ va_end(args);
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return 0;
- }
- spin_unlock(&j->lock);
+ bch2_journal_res_put(j, &res);
- return bch2_journal_flush_seq(j, seq);
+ return bch2_journal_flush_seq(j, res.seq);
}
/* block/unlock the journal: */
@@ -732,28 +780,53 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ struct open_bucket **ob = NULL;
+ long *bu = NULL;
+ unsigned i, nr_got = 0, nr_want = nr - ja->nr;
+ unsigned old_nr = ja->nr;
+ unsigned old_discard_idx = ja->discard_idx;
+ unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk;
+ unsigned old_dirty_idx = ja->dirty_idx;
+ unsigned old_cur_idx = ja->cur_idx;
int ret = 0;
- /* don't handle reducing nr of buckets yet: */
- if (nr <= ja->nr)
- return 0;
+ if (c) {
+ bch2_journal_block(&c->journal);
+ bch2_journal_flush_all_pins(&c->journal);
+ }
+ bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL);
+ ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL);
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq) {
+ if (!bu || !ob || !new_buckets || !new_bucket_seq) {
ret = -ENOMEM;
- goto err;
+ goto err_unblock;
}
- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets) {
- ret = -ENOSPC;
- goto err;
+ for (nr_got = 0; nr_got < nr_want; nr_got++) {
+ if (new_fs) {
+ bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
+ if (bu[nr_got] < 0) {
+ ret = -ENOSPC;
+ break;
+ }
+ } else {
+ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
+ false, cl);
+ if (IS_ERR(ob[nr_got])) {
+ ret = cl ? -EAGAIN : -ENOSPC;
+ break;
+ }
+
+ bu[nr_got] = ob[nr_got]->bucket;
+ }
}
+ if (!nr_got)
+ goto err_unblock;
+
/*
* We may be called from the device add path, before the new device has
* actually been added to the running filesystem:
@@ -766,51 +839,16 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
- if (!new_fs)
- spin_unlock(&c->journal.lock);
-
- while (ja->nr < nr) {
- struct open_bucket *ob = NULL;
- unsigned pos;
- long b;
+ for (i = 0; i < nr_got; i++) {
+ unsigned pos = ja->discard_idx ?: ja->nr;
+ long b = bu[i];
- if (new_fs) {
- b = bch2_bucket_alloc_new_fs(ca);
- if (b < 0) {
- ret = -ENOSPC;
- goto err;
- }
- } else {
- rcu_read_lock();
- ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
- false, cl);
- rcu_read_unlock();
- if (IS_ERR(ob)) {
- ret = cl ? -EAGAIN : -ENOSPC;
- goto err;
- }
-
- b = ob->bucket;
- }
-
- if (c)
- spin_lock(&c->journal.lock);
-
- /*
- * XXX
- * For resize at runtime, we should be writing the new
- * superblock before inserting into the journal array
- */
-
- pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
__array_insert_item(ja->buckets, ja->nr, pos);
__array_insert_item(ja->bucket_seq, ja->nr, pos);
- __array_insert_item(journal_buckets->buckets, ja->nr, pos);
ja->nr++;
ja->buckets[pos] = b;
ja->bucket_seq[pos] = 0;
- journal_buckets->buckets[pos] = cpu_to_le64(b);
if (pos <= ja->discard_idx)
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
@@ -820,29 +858,56 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ }
+
+ ret = bch2_journal_buckets_to_sb(c, ca);
+ if (ret) {
+ /* Revert: */
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+ ja->nr = old_nr;
+ ja->discard_idx = old_discard_idx;
+ ja->dirty_idx_ondisk = old_dirty_idx_ondisk;
+ ja->dirty_idx = old_dirty_idx;
+ ja->cur_idx = old_cur_idx;
+ }
+
+ if (!new_fs)
+ spin_unlock(&c->journal.lock);
- if (c)
- spin_unlock(&c->journal.lock);
+ if (c)
+ bch2_journal_unblock(&c->journal);
- if (!new_fs) {
+ if (ret)
+ goto err;
+
+ if (!new_fs) {
+ for (i = 0; i < nr_got; i++) {
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_trans_mark_metadata_bucket(&trans, ca,
- b, BCH_DATA_journal,
+ bu[i], BCH_DATA_journal,
ca->mi.bucket_size));
-
- bch2_open_bucket_put(c, ob);
-
- if (ret)
+ if (ret) {
+ bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret);
goto err;
+ }
}
}
err:
- bch2_sb_resize_journal(&ca->disk_sb,
- ja->nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (ob && !new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_open_bucket_put(c, ob[i]);
+
kfree(new_bucket_seq);
kfree(new_buckets);
+ kfree(ob);
+ kfree(bu);
return ret;
+err_unblock:
+ if (c)
+ bch2_journal_unblock(&c->journal);
+ goto err;
}
/*
@@ -855,11 +920,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
struct journal_device *ja = &ca->journal;
struct closure cl;
unsigned current_nr;
- int ret;
+ int ret = 0;
+
+ /* don't handle reducing nr of buckets yet: */
+ if (nr < ja->nr)
+ return 0;
closure_init_stack(&cl);
- do {
+ while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) {
struct disk_reservation disk_res = { 0, 0 };
closure_sync(&cl);
@@ -887,7 +956,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
if (ja->nr != current_nr)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- } while (ret == -EAGAIN);
+ }
return ret;
}
@@ -918,17 +987,16 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
- union journal_res_state state;
bool ret = false;
- unsigned i;
+ u64 seq;
spin_lock(&j->lock);
- state = READ_ONCE(j->reservations);
- i = state.idx;
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j) && !ret;
+ seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, seq);
- while (i != state.unwritten_idx) {
- i = (i - 1) & JOURNAL_BUF_MASK;
- if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+ if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx))
ret = true;
}
spin_unlock(&j->lock);
@@ -943,6 +1011,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
+ bch2_journal_reclaim_stop(j);
bch2_journal_flush_all_pins(j);
wait_event(j->wait, journal_entry_close(j));
@@ -957,11 +1026,9 @@ void bch2_fs_journal_stop(struct journal *j)
BUG_ON(!bch2_journal_error(j) &&
test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
- (journal_entry_is_open(j) ||
- j->last_empty_seq + 1 != journal_cur_seq(j)));
+ j->last_empty_seq != journal_cur_seq(j));
cancel_delayed_work_sync(&j->write_work);
- bch2_journal_reclaim_stop(j);
}
int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
@@ -991,6 +1058,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
@@ -1028,11 +1096,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
set_bit(JOURNAL_STARTED, &j->flags);
j->last_flush_write = jiffies;
- journal_pin_new_entry(j);
-
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
-
- bch2_journal_buf_init(j);
+ j->reservations.unwritten_idx++;
c->last_bucket_seq_cleanup = journal_cur_seq(j);
@@ -1060,9 +1125,20 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_get_journal(sb);
+ struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+ bch2_sb_get_journal_v2(sb);
unsigned i;
- ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ ja->nr = 0;
+
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+ for (i = 0; i < nr; i++)
+ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+ } else if (journal_buckets) {
+ ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ }
ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->bucket_seq)
@@ -1077,8 +1153,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (!ja->buckets)
return -ENOMEM;
- for (i = 0; i < ja->nr; i++)
- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+ unsigned j, dst = 0;
+
+ for (i = 0; i < nr; i++)
+ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ ja->buckets[dst++] =
+ le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+ } else if (journal_buckets) {
+ for (i = 0; i < ja->nr; i++)
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ }
return 0;
}
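The bch2_dev_journal_init() hunk above reads the new v2 superblock field, where journal buckets are stored as (start, nr) runs rather than one u64 per bucket. A small standalone sketch of that unpacking, with simplified host-endian types in place of the on-disk __le64 layout:

/*
 * Sketch only - not bcachefs code.  Host-endian types stand in for the
 * on-disk __le64 fields; expand_ranges() mirrors the nested loop above
 * that flattens (start, nr) runs into ja->buckets[].
 */
#include <stdint.h>
#include <stdio.h>

struct bucket_range {
	uint64_t start;			/* first bucket in the run */
	uint64_t nr;			/* number of consecutive buckets */
};

static size_t expand_ranges(const struct bucket_range *r, size_t nr_ranges,
			    uint64_t *out)
{
	size_t dst = 0;

	for (size_t i = 0; i < nr_ranges; i++)
		for (uint64_t j = 0; j < r[i].nr; j++)
			out[dst++] = r[i].start + j;
	return dst;
}

int main(void)
{
	const struct bucket_range ranges[] = { { 100, 3 }, { 200, 2 } };
	uint64_t buckets[5];
	size_t n = expand_ranges(ranges, 2, buckets);

	for (size_t i = 0; i < n; i++)
		printf("%llu\n", (unsigned long long) buckets[i]);
	return 0;
}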
@@ -1144,17 +1230,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
union journal_res_state s;
struct bch_dev *ca;
unsigned long now = jiffies;
+ u64 seq;
unsigned i;
+ out->atomic++;
+ out->tabstops[0] = 24;
+
rcu_read_lock();
s = READ_ONCE(j->reservations);
- pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin));
+ pr_buf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size);
pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
+ pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
+ pr_buf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]);
pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
@@ -1164,35 +1256,54 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error);
+ pr_buf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
pr_buf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
- pr_buf(out, "error\n");
+ pr_buf(out, "error");
break;
case JOURNAL_ENTRY_CLOSED_VAL:
- pr_buf(out, "closed\n");
+ pr_buf(out, "closed");
break;
default:
- pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
+ pr_buf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
break;
}
- pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx));
+ pr_newline(out);
- i = s.idx;
- while (i != s.unwritten_idx) {
- i = (i - 1) & JOURNAL_BUF_MASK;
+ for (seq = journal_cur_seq(j);
+ seq >= journal_last_unwritten_seq(j);
+ --seq) {
+ i = seq & JOURNAL_BUF_MASK;
- pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
- i, journal_state_count(s, i), j->buf[i].sectors);
+ pr_buf(out, "unwritten entry:");
+ pr_tab(out);
+ pr_buf(out, "%llu", seq);
+ pr_newline(out);
+ pr_indent_push(out, 2);
+
+ pr_buf(out, "refcount:");
+ pr_tab(out);
+ pr_buf(out, "%u", journal_state_count(s, i));
+ pr_newline(out);
+
+ pr_buf(out, "sectors:");
+ pr_tab(out);
+ pr_buf(out, "%u", j->buf[i].sectors);
+ pr_newline(out);
+
+ pr_buf(out, "expires");
+ pr_tab(out);
+ pr_buf(out, "%li jiffies", j->buf[i].expires - jiffies);
+ pr_newline(out);
+
+ pr_indent_pop(out, 2);
}
pr_buf(out,
- "need write:\t\t%i\n"
"replay done:\t\t%i\n",
- test_bit(JOURNAL_NEED_WRITE, &j->flags),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
pr_buf(out, "space:\n");
@@ -1230,6 +1341,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
rcu_read_unlock();
+
+ --out->atomic;
}
void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
@@ -1239,27 +1352,59 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
spin_unlock(&j->lock);
}
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
- u64 i;
spin_lock(&j->lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
- pr_buf(out, "%llu: count %u\n",
- i, atomic_read(&pin_list->count));
+ *seq = max(*seq, j->pin.front);
+
+ if (*seq >= j->pin.back) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ out->atomic++;
- list_for_each_entry(pin, &pin_list->list, list)
- pr_buf(out, "\t%px %ps\n",
- pin, pin->flush);
+ pin_list = journal_seq_pin(j, *seq);
- if (!list_empty(&pin_list->flushed))
- pr_buf(out, "flushed:\n");
+ pr_buf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
+ pr_newline(out);
+ pr_indent_push(out, 2);
- list_for_each_entry(pin, &pin_list->flushed, list)
- pr_buf(out, "\t%px %ps\n",
- pin, pin->flush);
+ list_for_each_entry(pin, &pin_list->list, list) {
+ pr_buf(out, "\t%px %ps", pin, pin->flush);
+ pr_newline(out);
}
+
+ list_for_each_entry(pin, &pin_list->key_cache_list, list) {
+ pr_buf(out, "\t%px %ps", pin, pin->flush);
+ pr_newline(out);
+ }
+
+ if (!list_empty(&pin_list->flushed)) {
+ pr_buf(out, "flushed:");
+ pr_newline(out);
+ }
+
+ list_for_each_entry(pin, &pin_list->flushed, list) {
+ pr_buf(out, "\t%px %ps", pin, pin->flush);
+ pr_newline(out);
+ }
+
+ pr_indent_pop(out, 2);
+
+ --out->atomic;
spin_unlock(&j->lock);
+
+ return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+ u64 seq = 0;
+
+ while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+ seq++;
}
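Much of the journal.c rewrite above replaces the old idx/unwritten_idx bookkeeping with arithmetic on two sequence numbers: j->seq (newest entry) and j->seq_ondisk (newest entry fully written out). A minimal sketch of those relations, with simplified non-atomic fields:

/*
 * Sketch only - not bcachefs code.  Plain fields stand in for atomic64_t;
 * the three helpers correspond to journal_last_unwritten_seq(),
 * nr_unwritten_journal_entries() and journal_seq_unwritten() above.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct journal_sketch {
	uint64_t seq;			/* newest journal entry */
	uint64_t seq_ondisk;		/* newest entry fully written out */
};

static uint64_t last_unwritten_seq(const struct journal_sketch *j)
{
	return j->seq_ondisk + 1;
}

static uint64_t nr_unwritten(const struct journal_sketch *j)
{
	return j->seq - j->seq_ondisk;
}

static bool seq_unwritten(const struct journal_sketch *j, uint64_t seq)
{
	return seq > j->seq_ondisk;
}

int main(void)
{
	struct journal_sketch j = { .seq = 12, .seq_ondisk = 10 };

	assert(last_unwritten_seq(&j) == 11);
	assert(nr_unwritten(&j) == 2);
	assert(seq_unwritten(&j, 11) && !seq_unwritten(&j, 10));
	return 0;
}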
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index c39cbbf1bccd..e7321c327d9d 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -141,6 +141,11 @@ static inline u64 journal_cur_seq(struct journal *j)
return j->pin.back - 1;
}
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
+ return j->seq_ondisk + 1;
+}
+
void bch2_journal_set_has_inum(struct journal *, u64, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
@@ -261,9 +266,6 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
.buf3_count = idx == 3,
}).v, &j->reservations.counter);
- EBUG_ON(((s.idx - idx) & 3) >
- ((s.idx - s.unwritten_idx) & 3));
-
if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
__bch2_journal_buf_put(j);
}
@@ -293,9 +295,9 @@ static inline void bch2_journal_res_put(struct journal *j,
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned);
-#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
-#define JOURNAL_RES_GET_CHECK (1 << 1)
-#define JOURNAL_RES_GET_RESERVED (1 << 2)
+/* First two bits for JOURNAL_WATERMARK: */
+#define JOURNAL_RES_GET_NONBLOCK (1 << 2)
+#define JOURNAL_RES_GET_CHECK (1 << 3)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
@@ -316,8 +318,7 @@ static inline int journal_res_get_fast(struct journal *j,
EBUG_ON(!journal_state_count(new, new.idx));
- if (!(flags & JOURNAL_RES_GET_RESERVED) &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
+ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark)
return 0;
new.cur_entry_offset += res->u64s;
@@ -370,23 +371,27 @@ out:
/* journal_preres: */
-static inline bool journal_check_may_get_unreserved(struct journal *j)
+static inline void journal_set_watermark(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
- bool ret = s.reserved < s.remaining &&
- fifo_free(&j->pin) > 8;
-
- lockdep_assert_held(&j->lock);
-
- if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
- if (ret) {
- set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
- journal_wake(j);
- } else {
- clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
- }
- }
- return ret;
+ unsigned watermark = JOURNAL_WATERMARK_any;
+
+ if (fifo_free(&j->pin) < j->pin.size / 4)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc);
+ if (fifo_free(&j->pin) < j->pin.size / 8)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved);
+
+ if (s.reserved > s.remaining)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc);
+ if (!s.remaining)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved);
+
+ if (watermark == j->watermark)
+ return;
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
}
static inline void bch2_journal_preres_put(struct journal *j,
@@ -406,12 +411,8 @@ static inline void bch2_journal_preres_put(struct journal *j,
closure_wake_up(&j->preres_wait);
}
- if (s.reserved <= s.remaining &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
- spin_lock(&j->lock);
- journal_check_may_get_unreserved(j);
- spin_unlock(&j->lock);
- }
+ if (s.reserved <= s.remaining && j->watermark)
+ journal_set_watermark(j);
}
int __bch2_journal_preres_get(struct journal *,
@@ -432,8 +433,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j,
old.v = new.v = v;
ret = 0;
- if ((flags & JOURNAL_RES_GET_RESERVED) ||
- test_bit(JOURNAL_NOCHANGES, &j->flags) ||
+ if ((flags & JOURNAL_WATERMARK_reserved) ||
new.reserved + d < new.remaining) {
new.reserved += d;
ret = 1;
@@ -477,7 +477,9 @@ void bch2_journal_flush_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
+int bch2_journal_log_msg(struct journal *, const char *, ...);
void bch2_journal_halt(struct journal *);
@@ -501,6 +503,7 @@ void bch2_journal_block(struct journal *);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
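The journal.h hunks above replace JOURNAL_RES_GET_RESERVED/JOURNAL_MAY_GET_UNRESERVED with a watermark scheme: the low bits of the reservation flags carry the caller's watermark, and requests below the journal's current watermark, which rises as the pin FIFO fills, are refused. A simplified standalone sketch of that gating, with invented names and the same /4 and /8 thresholds:

/*
 * Sketch only - not bcachefs code.  Enum values and thresholds mirror the
 * journal_set_watermark()/journal_res_get_fast() logic above, but every
 * name here is invented for the example.
 */
#include <assert.h>
#include <stdbool.h>

enum watermark {
	WATERMARK_any,			/* ordinary writes */
	WATERMARK_copygc,		/* copygc may still reserve */
	WATERMARK_reserved,		/* only pre-reserved/critical callers */
};

#define WATERMARK_MASK	3u		/* low two bits of the flags word */

static bool res_get_allowed(unsigned flags, enum watermark journal_watermark)
{
	/* refuse callers whose watermark is below the journal's current one */
	return (flags & WATERMARK_MASK) >= (unsigned) journal_watermark;
}

static enum watermark set_watermark(unsigned pin_free, unsigned pin_size)
{
	enum watermark w = WATERMARK_any;

	if (pin_free < pin_size / 4)
		w = WATERMARK_copygc;
	if (pin_free < pin_size / 8)
		w = WATERMARK_reserved;
	return w;
}

int main(void)
{
	/* pin FIFO nearly full: only reserved callers get a reservation */
	enum watermark w = set_watermark(10, 128);

	assert(w == WATERMARK_reserved);
	assert(res_get_allowed(WATERMARK_reserved, w));
	assert(!res_get_allowed(WATERMARK_any, w));
	return 0;
}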
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index e161e86e48c4..e61b88930a7f 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
@@ -47,12 +48,12 @@ struct journal_list {
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct bch_extent_ptr entry_ptr,
+ struct journal_ptr entry_ptr,
struct journal_list *jlist, struct jset *j,
bool bad)
{
struct journal_replay *i, *pos, *dup = NULL;
- struct bch_extent_ptr *ptr;
+ struct journal_ptr *ptr;
struct list_head *where;
size_t bytes = vstruct_bytes(j);
u64 last_seq = 0;
@@ -252,14 +253,15 @@ static int journal_validate_key(struct bch_fs *c, const char *where,
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id));
if (invalid) {
- char buf[160];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
type, where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s),
- invalid, buf);
+ invalid, buf.buf);
+ printbuf_exit(&buf);
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -274,7 +276,7 @@ fsck_err:
return ret;
}
-static int journal_entry_validate_btree_keys(struct bch_fs *c,
+static int journal_entry_btree_keys_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -295,7 +297,24 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
return 0;
}
-static int journal_entry_validate_btree_root(struct bch_fs *c,
+static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct bkey_i *k;
+ bool first = true;
+
+ vstruct_for_each(entry, k) {
+ if (!first) {
+ pr_newline(out);
+ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ }
+ pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+ first = false;
+ }
+}
+
+static int journal_entry_btree_root_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -323,7 +342,13 @@ fsck_err:
return ret;
}
-static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
+static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -332,7 +357,12 @@ static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
return 0;
}
-static int journal_entry_validate_blacklist(struct bch_fs *c,
+static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+}
+
+static int journal_entry_blacklist_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -347,7 +377,16 @@ fsck_err:
return ret;
}
-static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
+static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist *bl =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
+}
+
+static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -373,7 +412,18 @@ fsck_err:
return ret;
}
-static int journal_entry_validate_usage(struct bch_fs *c,
+static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist_v2 *bl =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ pr_buf(out, "start=%llu end=%llu",
+ le64_to_cpu(bl->start),
+ le64_to_cpu(bl->end));
+}
+
+static int journal_entry_usage_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -394,7 +444,18 @@ fsck_err:
return ret;
}
-static int journal_entry_validate_data_usage(struct bch_fs *c,
+static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+
+ pr_buf(out, "type=%s v=%llu",
+ bch2_fs_usage_types[u->entry.btree_id],
+ le64_to_cpu(u->v));
+}
+
+static int journal_entry_data_usage_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -416,7 +477,17 @@ fsck_err:
return ret;
}
-static int journal_entry_validate_clock(struct bch_fs *c,
+static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+
+ bch2_replicas_entry_to_text(out, &u->r);
+ pr_buf(out, "=%llu", le64_to_cpu(u->v));
+}
+
+static int journal_entry_clock_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -442,7 +513,16 @@ fsck_err:
return ret;
}
-static int journal_entry_validate_dev_usage(struct bch_fs *c,
+static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+}
+
+static int journal_entry_dev_usage_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
@@ -479,15 +559,59 @@ fsck_err:
return ret;
}
+static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+ pr_buf(out, "dev=%u", le32_to_cpu(u->dev));
+
+ for (i = 0; i < nr_types; i++) {
+ if (i < BCH_DATA_NR)
+ pr_buf(out, " %s", bch2_data_types[i]);
+ else
+ pr_buf(out, " (unknown data type %u)", i);
+ pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+ le64_to_cpu(u->d[i].buckets),
+ le64_to_cpu(u->d[i].sectors),
+ le64_to_cpu(u->d[i].fragmented));
+ }
+
+ pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu",
+ le64_to_cpu(u->buckets_ec),
+ le64_to_cpu(u->buckets_unavailable));
+}
+
+static int journal_entry_log_validate(struct bch_fs *c,
+ const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ return 0;
+}
+
+static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
+
+ pr_buf(out, "%.*s", bytes, l->d);
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, const char *,
struct jset_entry *, unsigned, int, int);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr) \
[BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
- .validate = journal_entry_validate_##f, \
+ .validate = journal_entry_##f##_validate, \
+ .to_text = journal_entry_##f##_to_text, \
},
BCH_JSET_ENTRY_TYPES()
#undef x
@@ -503,6 +627,17 @@ int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
: 0;
}
+void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ if (entry->type < BCH_JSET_ENTRY_NR) {
+ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
+ } else {
+ pr_buf(out, "(unknown type %u)", entry->type);
+ }
+}
+
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
int write)
{
@@ -592,9 +727,11 @@ static int jset_validate(struct bch_fs *c,
sector, le64_to_cpu(jset->seq)))
ret = JOURNAL_ENTRY_BAD;
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
+ bch2_fs_fatal_err_on(ret, c,
+ "error decrypting journal entry: %i", ret);
csum_done:
/* last_seq is ignored when JSET_NO_FLUSH is true */
if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
@@ -737,9 +874,12 @@ reread:
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
- .dev = ca->dev_idx,
- .offset = offset,
+ ret = journal_entry_add(c, ca, (struct journal_ptr) {
+ .dev = ca->dev_idx,
+ .bucket = bucket,
+ .bucket_offset = offset -
+ bucket_to_sector(ca, ja->buckets[bucket]),
+ .sector = offset,
}, jlist, j, ret != 0);
mutex_unlock(&jlist->lock);
@@ -766,12 +906,14 @@ static void bch2_journal_read_device(struct closure *cl)
struct journal_device *ja =
container_of(cl, struct journal_device, read);
struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+ struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
+ struct journal_replay *r;
struct journal_read_buf buf = { NULL, 0 };
u64 min_seq = U64_MAX;
unsigned i;
- int ret;
+ int ret = 0;
if (!ja->nr)
goto out;
@@ -803,11 +945,37 @@ static void bch2_journal_read_device(struct closure *cl)
* allocate
*/
while (ja->bucket_seq[ja->cur_idx] > min_seq &&
- ja->bucket_seq[ja->cur_idx] >
+ ja->bucket_seq[ja->cur_idx] ==
ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = 0;
+ ja->sectors_free = ca->mi.bucket_size;
+
+ mutex_lock(&jlist->lock);
+ list_for_each_entry(r, jlist->head, list) {
+ for (i = 0; i < r->nr_ptrs; i++) {
+ if (r->ptrs[i].dev == ca->dev_idx &&
+ sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
+ unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) +
+ vstruct_sectors(&r->j, c->block_bits);
+
+ ja->sectors_free = min(ja->sectors_free,
+ ca->mi.bucket_size - wrote);
+ }
+ }
+ }
+ mutex_unlock(&jlist->lock);
+
+ if (ja->bucket_seq[ja->cur_idx] &&
+ ja->sectors_free == ca->mi.bucket_size) {
+ bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
+ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
+ for (i = 0; i < 3; i++) {
+ unsigned idx = ja->cur_idx - 1 + i;
+ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
+ }
+ ja->sectors_free = 0;
+ }
/*
* Set dirty_idx to indicate the entire journal is full and needs to be
@@ -817,6 +985,7 @@ static void bch2_journal_read_device(struct closure *cl)
ja->discard_idx = ja->dirty_idx_ondisk =
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
+ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
@@ -828,8 +997,8 @@ err:
goto out;
}
-static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
{
unsigned i;
@@ -837,13 +1006,15 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
u64 offset;
- div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
+ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
if (i)
pr_buf(out, " ");
- pr_buf(out, "%u:%llu (offset %llu)",
+ pr_buf(out, "%u:%u:%u (sector %llu)",
j->ptrs[i].dev,
- (u64) j->ptrs[i].offset, offset);
+ j->ptrs[i].bucket,
+ j->ptrs[i].bucket_offset,
+ j->ptrs[i].sector);
}
}
@@ -854,6 +1025,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
struct journal_replay *i, *t;
struct bch_dev *ca;
unsigned iter;
+ struct printbuf buf = PRINTBUF;
size_t keys = 0, entries = 0;
bool degraded = false;
u64 seq, last_seq = 0;
@@ -912,7 +1084,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
if (!last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
- return -1;
+ ret = -1;
+ goto err;
}
/* Drop blacklisted entries and entries older than last_seq: */
@@ -944,7 +1117,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
while (seq < le64_to_cpu(i->j.seq)) {
u64 missing_start, missing_end;
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
while (seq < le64_to_cpu(i->j.seq) &&
bch2_journal_seq_is_blacklisted(c, seq, false))
@@ -960,14 +1133,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
seq++;
if (i->list.prev != list) {
- struct printbuf out = PBUF(buf1);
struct journal_replay *p = list_prev_entry(i, list);
- bch2_journal_ptrs_to_text(&out, c, p);
- pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
+ bch2_journal_ptrs_to_text(&buf1, c, p);
+ pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits));
} else
- sprintf(buf1, "(none)");
- bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
+ pr_buf(&buf1, "(none)");
+ bch2_journal_ptrs_to_text(&buf2, c, i);
missing_end = seq - 1;
fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
@@ -975,7 +1147,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
" next at %s",
missing_start, missing_end,
last_seq, *blacklist_seq - 1,
- buf1, buf2);
+ buf1.buf, buf2.buf);
+
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
}
seq++;
@@ -989,14 +1164,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
.e.nr_required = 1,
};
unsigned ptr;
- char buf[80];
if (i->ignore)
continue;
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
- goto fsck_err;
+ goto err;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
@@ -1008,15 +1182,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
* the devices - this is wrong:
*/
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, &replicas.e);
+
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
"superblock not marked as containing replicas %s",
- (bch2_replicas_entry_to_text(&PBUF(buf),
- &replicas.e), buf)))) {
+ buf.buf))) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
- return ret;
+ goto err;
}
for_each_jset_key(k, _n, entry, &i->j)
@@ -1030,7 +1206,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
if (*start_seq != *blacklist_seq)
bch_info(c, "dropped unflushed entries %llu-%llu",
*blacklist_seq, *start_seq - 1);
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -1157,49 +1335,6 @@ done:
return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}
-static void journal_write_compact(struct jset *jset)
-{
- struct jset_entry *i, *next, *prev = NULL;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each_safe(jset, i, next) {
- unsigned u64s = le16_to_cpu(i->u64s);
-
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- i->type == prev->type &&
- i->type == BCH_JSET_ENTRY_btree_keys &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
- }
-
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
/* we aren't holding j->lock: */
@@ -1225,7 +1360,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
- return j->buf + j->reservations.unwritten_idx;
+ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
static void journal_write_done(struct closure *cl)
@@ -1262,15 +1397,18 @@ static void journal_write_done(struct closure *cl)
journal_seq_pin(j, seq)->devs = w->devs_written;
if (!err) {
- j->seq_ondisk = seq;
-
if (!JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
+
+ bch2_do_discards(c);
+ closure_wake_up(&c->freelist_wait);
}
} else if (!j->err_seq || seq < j->err_seq)
j->err_seq = seq;
+ j->seq_ondisk = seq;
+
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:
@@ -1286,7 +1424,7 @@ static void journal_write_done(struct closure *cl)
v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
- BUG_ON(new.idx == new.unwritten_idx);
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
@@ -1297,13 +1435,24 @@ static void journal_write_done(struct closure *cl)
closure_wake_up(&w->wait);
journal_wake(j);
- if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
- mod_delayed_work(c->io_complete_wq, &j->write_work, 0);
- spin_unlock(&j->lock);
-
- if (new.unwritten_idx != new.idx &&
- !journal_state_count(new, new.unwritten_idx))
+ if (!journal_state_count(new, new.unwritten_idx) &&
+ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ struct journal_buf *buf = journal_cur_buf(j);
+ long delta = buf->expires - jiffies;
+
+ /*
+ * We don't close a journal entry to write it while there's
+ * previous entries still in flight - the current journal entry
+ * might want to be written now:
+ */
+
+ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+ }
+
+ spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
@@ -1385,7 +1534,7 @@ void bch2_journal_write(struct closure *cl)
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
- char *journal_debug_buf = NULL;
+ struct printbuf journal_debug_buf = PRINTBUF;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
@@ -1398,10 +1547,11 @@ void bch2_journal_write(struct closure *cl)
j->write_start_time = local_clock();
spin_lock(&j->lock);
- if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
- !w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+ if (bch2_journal_error(j) ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
@@ -1438,10 +1588,8 @@ void bch2_journal_write(struct closure *cl)
le32_add_cpu(&jset->u64s, u64s);
BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
- journal_write_compact(jset);
-
jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
+ jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
@@ -1461,9 +1609,12 @@ void bch2_journal_write(struct closure *cl)
jset_validate_for_write(c, jset))
goto err;
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting journal entry: %i", ret))
+ goto err;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
@@ -1488,11 +1639,8 @@ retry_alloc:
goto retry_alloc;
}
- if (ret) {
- journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
- if (journal_debug_buf)
- __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
- }
+ if (ret)
+ __bch2_journal_debug_to_text(&journal_debug_buf, j);
/*
* write is allocated, no longer need to account for it in
@@ -1509,8 +1657,8 @@ retry_alloc:
if (ret) {
bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf);
- kfree(journal_debug_buf);
+ journal_debug_buf.buf);
+ printbuf_exit(&journal_debug_buf);
bch2_fatal_error(c);
continue_at(cl, journal_write_done, c->io_complete_wq);
return;
@@ -1518,7 +1666,7 @@ retry_alloc:
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
- if (test_bit(JOURNAL_NOCHANGES, &j->flags))
+ if (c->opts.nochanges)
goto no_io;
for_each_rw_member(ca, c, i)
@@ -1541,16 +1689,12 @@ retry_alloc:
}
}
- bch2_bucket_seq_cleanup(c);
-
continue_at(cl, do_journal_write, c->io_complete_wq);
return;
no_io:
- bch2_bucket_seq_cleanup(c);
-
continue_at(cl, journal_write_done, c->io_complete_wq);
return;
err:
- bch2_inconsistent_error(c);
+ bch2_fatal_error(c);
continue_at(cl, journal_write_done, c->io_complete_wq);
}
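
The hunk above changes journal_last_unwritten_buf() to derive the buffer from a sequence number rather than a stored index. A minimal sketch of that mapping, assuming the JOURNAL_BUF_NR/JOURNAL_BUF_MASK ring of in-flight buffers used elsewhere in the tree (buf_for_seq is a hypothetical name):

/* Sketch only: consecutive journal sequence numbers cycle through the small
 * ring of in-flight buffers; JOURNAL_BUF_NR is a power of two and
 * JOURNAL_BUF_MASK == JOURNAL_BUF_NR - 1. */
static inline struct journal_buf *buf_for_seq(struct journal *j, u64 seq)
{
	return j->buf + (seq & JOURNAL_BUF_MASK);
}

Because the slot is pure arithmetic on the sequence number, journal_last_unwritten_buf() can be computed from journal_last_unwritten_seq() alone.
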
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index f34281a28f12..f2001835e43e 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -8,7 +8,12 @@
*/
struct journal_replay {
struct list_head list;
- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+ struct journal_ptr {
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+ } ptrs[BCH_REPLICAS_MAX];
unsigned nr_ptrs;
/* checksum error, but we may want to try using it anyways: */
@@ -40,8 +45,13 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
-int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *,
- unsigned, int, int);
+int bch2_journal_entry_validate(struct bch_fs *, const char *,
+ struct jset_entry *, unsigned, int, int);
+void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
+ struct jset_entry *);
+
+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct journal_replay *);
int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ab9a6d966d5e..a9f7d5a7feb2 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -34,10 +34,8 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja,
enum journal_space_from from)
{
- unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags)
- ? ((journal_space_from(ja, from) -
- ja->cur_idx - 1 + ja->nr) % ja->nr)
- : ja->nr;
+ unsigned available = (journal_space_from(ja, from) -
+ ja->cur_idx - 1 + ja->nr) % ja->nr;
/*
* Don't use the last bucket unless writing the new last_seq
@@ -61,25 +59,13 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
old.v, new.v)) != old.v);
}
-static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
-{
- unsigned sectors = 0;
-
- while (!sectors && *idx != j->reservations.idx) {
- sectors = j->buf[*idx].sectors;
-
- *idx = (*idx + 1) & JOURNAL_BUF_MASK;
- }
-
- return sectors;
-}
-
static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
enum journal_space_from from)
{
struct journal_device *ja = &ca->journal;
- unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
+ unsigned sectors, buckets, unwritten;
+ u64 seq;
if (from == journal_space_total)
return (struct journal_space) {
@@ -94,7 +80,14 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
- while ((unwritten = get_unwritten_sectors(j, &idx))) {
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
+
+ if (!unwritten)
+ continue;
+
/* entry won't fit on this device, skip: */
if (unwritten > ca->mi.bucket_size)
continue;
@@ -202,7 +195,7 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
- ret = cur_entry_insufficient_devices;
+ ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
@@ -216,23 +209,24 @@ void bch2_journal_space_available(struct journal *j)
total = j->space[journal_space_total].total;
if (!clean_ondisk &&
- j->reservations.idx ==
- j->reservations.unwritten_idx) {
- char *buf = kmalloc(4096, GFP_ATOMIC);
-
- bch_err(c, "journal stuck");
- if (buf) {
- __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
- pr_err("\n%s", buf);
- kfree(buf);
- }
+ journal_cur_seq(j) == j->seq_ondisk) {
+ struct printbuf buf = PRINTBUF;
+
+ __bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "journal stuck\n%s", buf.buf);
+ printbuf_exit(&buf);
+ /*
+ * Hack: bch2_fatal_error() calls bch2_journal_halt() which
+ * takes journal lock:
+ */
+ spin_unlock(&j->lock);
bch2_fatal_error(c);
- ret = cur_entry_journal_stuck;
+ spin_lock(&j->lock);
+
+ ret = JOURNAL_ERR_journal_stuck;
} else if (!j->space[journal_space_discarded].next_entry)
- ret = cur_entry_journal_full;
- else if (!fifo_free(&j->pin))
- ret = cur_entry_journal_pin_full;
+ ret = JOURNAL_ERR_journal_full;
if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
@@ -251,7 +245,7 @@ out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
- journal_check_may_get_unreserved(j);
+ journal_set_watermark(j);
if (!ret)
journal_wake(j);
@@ -286,7 +280,8 @@ void bch2_journal_do_discards(struct journal *j)
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
- if (ca->mi.discard &&
+ if (!c->opts.nochanges &&
+ ca->mi.discard &&
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
@@ -373,9 +368,6 @@ static inline void __journal_pin_drop(struct journal *j,
if (atomic_dec_and_test(&pin_list->count) &&
pin_list == &fifo_peek_front(&j->pin))
bch2_journal_reclaim_fast(j);
- else if (fifo_used(&j->pin) == 1 &&
- atomic_read(&pin_list->count) == 1)
- journal_wake(j);
}
void bch2_journal_pin_drop(struct journal *j,
@@ -489,9 +481,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
u64 seq;
int err;
- if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
- return 0;
-
lockdep_assert_held(&j->reclaim_lock);
while (1) {
@@ -671,7 +660,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
if (nr_flushed)
wake_up(&j->reclaim_wait);
- } while ((min_nr || min_key_cache) && !direct);
+ } while ((min_nr || min_key_cache) && nr_flushed && !direct);
memalloc_noreclaim_restore(flags);
@@ -688,12 +677,11 @@ static int bch2_journal_reclaim_thread(void *arg)
struct journal *j = arg;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned long delay, now;
+ bool journal_empty;
int ret = 0;
set_freezable();
- kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
-
j->last_flushed = jiffies;
while (!ret && !kthread_should_stop()) {
@@ -716,10 +704,17 @@ static int bch2_journal_reclaim_thread(void *arg)
break;
if (j->reclaim_kicked)
break;
- if (time_after_eq(jiffies, j->next_reclaim))
- break;
- freezable_schedule_timeout(j->next_reclaim - jiffies);
+ spin_lock(&j->lock);
+ journal_empty = fifo_empty(&j->pin);
+ spin_unlock(&j->lock);
+
+ if (journal_empty)
+ freezable_schedule();
+ else if (time_after(j->next_reclaim, jiffies))
+ freezable_schedule_timeout(j->next_reclaim - jiffies);
+ else
+ break;
}
__set_current_state(TASK_RUNNING);
}
@@ -771,7 +766,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0;
+ if (journal_flush_pins(j, seq_to_flush, 0, 0))
+ *did_work = true;
spin_lock(&j->lock);
/*
@@ -780,8 +776,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
*/
ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
- (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+ !fifo_used(&j->pin);
spin_unlock(&j->lock);
mutex_unlock(&j->reclaim_lock);
@@ -829,10 +824,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = 0;
spin_lock(&j->lock);
- while (!ret && seq < j->pin.back) {
+ while (!ret) {
struct bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
+ if (seq >= j->pin.back)
+ break;
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
journal_seq_pin(j, seq)->devs);
seq++;
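
bch2_journal_dev_buckets_available() above relies on modular ring arithmetic to count reusable journal buckets. A small sketch with made-up numbers (buckets_available is not a real helper):

/* nr = 8 journal buckets, cur_idx = 5 (bucket currently being written),
 * oldest_idx = 2 (oldest bucket still needed, per journal_space_from()). */
static unsigned buckets_available(unsigned oldest_idx, unsigned cur_idx, unsigned nr)
{
	/* (2 - 5 - 1 + 8) % 8 = 4 buckets may be reused */
	return (oldest_idx - cur_idx - 1 + nr) % nr;
}

The "- 1" matches the comment in the real function: the last bucket is held back so a new last_seq can always be written.
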
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
new file mode 100644
index 000000000000..8efe7b7e3dcb
--- /dev/null
+++ b/fs/bcachefs/journal_sb.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+ const u64 *l = _l;
+ const u64 *r = _r;
+
+ return cmp_int(*l, *r);
+}
+
+static int bch2_sb_journal_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+ int ret = -EINVAL;
+ unsigned nr;
+ unsigned i;
+ u64 *b;
+
+ nr = bch2_nr_journal_buckets(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ for (i = 0; i < nr; i++)
+ b[i] = le64_to_cpu(journal->buckets[i]);
+
+ sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+ if (!b[0]) {
+ pr_buf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0] < le16_to_cpu(m->first_bucket)) {
+ pr_buf(err, "journal bucket %llu before first bucket %u",
+ b[0], le16_to_cpu(m->first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1], le64_to_cpu(m->nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (b[i] == b[i + 1]) {
+ pr_buf(err, "duplicate journal buckets %llu", b[i]);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+ pr_buf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+ pr_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+ .validate = bch2_sb_journal_validate,
+ .to_text = bch2_sb_journal_to_text,
+};
+
+struct u64_range {
+ u64 start;
+ u64 end;
+};
+
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+ const struct u64_range *l = _l;
+ const struct u64_range *r = _r;
+
+ return cmp_int(l->start, r->start);
+}
+
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+ int ret = -EINVAL;
+ unsigned nr;
+ unsigned i;
+ struct u64_range *b;
+
+ nr = bch2_sb_field_journal_v2_nr_entries(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ for (i = 0; i < nr; i++) {
+ b[i].start = le64_to_cpu(journal->d[i].start);
+ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+ }
+
+ sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+ if (!b[0].start) {
+ pr_buf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0].start < le16_to_cpu(m->first_bucket)) {
+ pr_buf(err, "journal bucket %llu before first bucket %u",
+ b[0].start, le16_to_cpu(m->first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) {
+ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1].end - 1, le64_to_cpu(m->nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++) {
+ if (b[i].end == b[i + 1].start) {
+ pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu",
+ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+ goto err;
+ }
+
+ if (b[i].end > b[i + 1].start) {
+ pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+ goto err;
+ }
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+ pr_buf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ pr_buf(out, " %llu-%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+ pr_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+ .validate = bch2_sb_journal_v2_validate,
+ .to_text = bch2_sb_journal_v2_to_text,
+};
+
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal_v2 *j;
+ unsigned i, dst = 0, nr = 1;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ if (!ja->nr) {
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+ return 0;
+ }
+
+ for (i = 0; i + 1 < ja->nr; i++)
+ if (ja->buckets[i] + 1 != ja->buckets[i + 1])
+ nr++;
+
+ j = bch2_sb_resize_journal_v2(&ca->disk_sb,
+ (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64));
+ if (!j)
+ return -ENOSPC;
+
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+ j->d[dst].start = le64_to_cpu(ja->buckets[0]);
+ j->d[dst].nr = le64_to_cpu(1);
+
+ for (i = 1; i < ja->nr; i++) {
+ if (ja->buckets[i] == ja->buckets[i - 1] + 1) {
+ le64_add_cpu(&j->d[dst].nr, 1);
+ } else {
+ dst++;
+ j->d[dst].start = le64_to_cpu(ja->buckets[i]);
+ j->d[dst].nr = le64_to_cpu(1);
+ }
+ }
+
+ return 0;
+}
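
bch2_journal_buckets_to_sb() above run-length encodes the device's journal buckets into (start, nr) ranges for the new journal_v2 superblock field. A standalone sketch of the same encoding (encode_ranges and struct bucket_range are hypothetical; the real code stores little-endian on-disk fields):

struct bucket_range { u64 start, nr; };

static unsigned encode_ranges(const u64 *buckets, unsigned nr,
			      struct bucket_range *out)
{
	unsigned i, dst = 0;

	out[0].start = buckets[0];
	out[0].nr    = 1;

	for (i = 1; i < nr; i++) {
		if (buckets[i] == buckets[i - 1] + 1) {
			out[dst].nr++;			/* extend the current run */
		} else {
			dst++;				/* start a new run */
			out[dst].start = buckets[i];
			out[dst].nr    = 1;
		}
	}

	return dst + 1;
}

/* e.g. buckets {10, 11, 12, 20, 21} encode as {10, 3}, {20, 2} */
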
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
new file mode 100644
index 000000000000..a39192e9f6f4
--- /dev/null
+++ b/fs/bcachefs/journal_sb.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+ return j
+ ? (__le64 *) vstruct_end(&j->field) - j->buckets
+ : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+ if (!j)
+ return 0;
+
+ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *);
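
bch2_sb_field_journal_v2_nr_entries() above derives the entry count purely from pointer arithmetic over the variable-length superblock field. A small sketch of the idea (nr_entries is a hypothetical helper):

static unsigned nr_entries(const void *first_entry, const void *field_end,
			   size_t entry_size)
{
	/* e.g. 288 bytes of entries at 16 bytes each -> 18 entries */
	return ((const char *) field_end - (const char *) first_entry) / entry_size;
}
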
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 10bd23e969d2..3140c8731431 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -66,6 +66,12 @@ blacklist_entry_try_merge(struct bch_fs *c,
return bl;
}
+static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
+ u64 start, u64 end)
+{
+ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
+}
+
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
@@ -76,28 +82,21 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
nr = blacklist_nr_entries(bl);
- if (bl) {
- for (i = 0; i < nr; i++) {
- struct journal_seq_blacklist_entry *e =
- bl->start + i;
-
- if (start == le64_to_cpu(e->start) &&
- end == le64_to_cpu(e->end))
- goto out;
-
- if (start <= le64_to_cpu(e->start) &&
- end >= le64_to_cpu(e->end)) {
- e->start = cpu_to_le64(start);
- e->end = cpu_to_le64(end);
-
- if (i + 1 < nr)
- bl = blacklist_entry_try_merge(c,
- bl, i);
- if (i)
- bl = blacklist_entry_try_merge(c,
- bl, i - 1);
- goto out_write_sb;
- }
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e =
+ bl->start + i;
+
+ if (bl_entry_contig_or_overlaps(e, start, end)) {
+ e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
+ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
+
+ if (i + 1 < nr)
+ bl = blacklist_entry_try_merge(c,
+ bl, i);
+ if (i)
+ bl = blacklist_entry_try_merge(c,
+ bl, i - 1);
+ goto out_write_sb;
}
}
@@ -189,27 +188,34 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
return 0;
}
-static const char *
-bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_journal_seq_blacklist *bl =
field_to_type(f, journal_seq_blacklist);
- struct journal_seq_blacklist_entry *i;
- unsigned nr = blacklist_nr_entries(bl);
+ unsigned i, nr = blacklist_nr_entries(bl);
- for (i = bl->start; i < bl->start + nr; i++) {
- if (le64_to_cpu(i->start) >=
- le64_to_cpu(i->end))
- return "entry start >= end";
-
- if (i + 1 < bl->start + nr &&
- le64_to_cpu(i[0].end) >
- le64_to_cpu(i[1].start))
- return "entries out of order";
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e = bl->start + i;
+
+ if (le64_to_cpu(e->start) >=
+ le64_to_cpu(e->end)) {
+ pr_buf(err, "entry %u start >= end (%llu >= %llu)",
+ i, le64_to_cpu(e->start), le64_to_cpu(e->end));
+ return -EINVAL;
+ }
+
+ if (i + 1 < nr &&
+ le64_to_cpu(e[0].end) >
+ le64_to_cpu(e[1].start)) {
+ pr_buf(err, "entry %u out of order with next entry (%llu > %llu)",
+ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
+ return -EINVAL;
+ }
}
- return NULL;
+ return 0;
}
static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
@@ -229,9 +235,88 @@ static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
le64_to_cpu(i->start),
le64_to_cpu(i->end));
}
+ pr_newline(out);
}
const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
.validate = bch2_sb_journal_seq_blacklist_validate,
.to_text = bch2_sb_journal_seq_blacklist_to_text
};
+
+void bch2_blacklist_entries_gc(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ journal_seq_blacklist_gc_work);
+ struct journal_seq_blacklist_table *t;
+ struct bch_sb_field_journal_seq_blacklist *bl;
+ struct journal_seq_blacklist_entry *src, *dst;
+ struct btree_trans trans;
+ unsigned i, nr, new_nr;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
+ bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
+ 0, 0, BTREE_ITER_PREFETCH);
+retry:
+ bch2_trans_begin(&trans);
+
+ b = bch2_btree_iter_peek_node(&iter);
+
+ while (!(ret = PTR_ERR_OR_ZERO(b)) &&
+ b &&
+ !test_bit(BCH_FS_STOPPING, &c->flags))
+ b = bch2_btree_iter_next_node(&iter);
+
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter);
+ }
+
+ bch2_trans_exit(&trans);
+ if (ret)
+ return;
+
+ mutex_lock(&c->sb_lock);
+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+ if (!bl)
+ goto out;
+
+ nr = blacklist_nr_entries(bl);
+ dst = bl->start;
+
+ t = c->journal_seq_blacklist_table;
+ BUG_ON(nr != t->nr);
+
+ for (src = bl->start, i = eytzinger0_first(t->nr);
+ src < bl->start + nr;
+ src++, i = eytzinger0_next(i, nr)) {
+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
+
+ if (t->entries[i].dirty)
+ *dst++ = *src;
+ }
+
+ new_nr = dst - bl->start;
+
+ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
+
+ if (new_nr != nr) {
+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
+ new_nr ? sb_blacklist_u64s(new_nr) : 0);
+ BUG_ON(new_nr && !bl);
+
+ if (!new_nr)
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
+
+ bch2_write_super(c);
+ }
+out:
+ mutex_unlock(&c->sb_lock);
+}
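
bch2_journal_seq_blacklist_add() above now folds any contiguous or overlapping range into an existing entry rather than only handling exact matches and supersets. A worked sketch of that union step (plain u64s; the on-disk fields are little-endian):

static bool ranges_touch(u64 a_start, u64 a_end, u64 b_start, u64 b_end)
{
	/* contiguous or overlapping: neither range lies strictly past the other */
	return !(b_end < a_start || a_end < b_start);
}

/* Example: existing entry [100, 200], new range [190, 250]:
 * ranges_touch() is true, so the entry becomes [min(100, 190), max(200, 250)]
 * = [100, 250], and blacklist_entry_try_merge() then coalesces it with any
 * neighbour the widened entry now touches. */
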
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index b4f876a04586..afb886ec8e25 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -17,4 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
+void bch2_blacklist_entries_gc(struct work_struct *);
+
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 54cc69bde1bb..a6cdb885ad41 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -25,6 +25,8 @@ struct journal_buf {
struct closure_waitlist wait;
u64 last_seq; /* copy of data->last_seq */
+ long expires;
+ u64 flush_time;
unsigned buf_size; /* size in bytes of @data */
unsigned sectors; /* maximum size for current entry */
@@ -139,20 +141,39 @@ enum journal_space_from {
journal_space_nr,
};
-/*
- * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
- * either because something's waiting on the write to complete or because it's
- * been dirty too long and the timer's expired.
- */
-
enum {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
- JOURNAL_RECLAIM_STARTED,
- JOURNAL_NEED_WRITE,
- JOURNAL_MAY_GET_UNRESERVED,
JOURNAL_MAY_SKIP_FLUSH,
- JOURNAL_NOCHANGES,
+};
+
+#define JOURNAL_WATERMARKS() \
+ x(any) \
+ x(copygc) \
+ x(reserved)
+
+enum journal_watermark {
+#define x(n) JOURNAL_WATERMARK_##n,
+ JOURNAL_WATERMARKS()
+#undef x
+};
+
+#define JOURNAL_WATERMARK_MASK 3
+
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS() \
+ x(ok) \
+ x(blocked) \
+ x(max_in_flight) \
+ x(journal_full) \
+ x(journal_pin_full) \
+ x(journal_stuck) \
+ x(insufficient_devices)
+
+enum journal_errors {
+#define x(n) JOURNAL_ERR_##n,
+ JOURNAL_ERRORS()
+#undef x
};
/* Embedded in struct bch_fs */
@@ -162,6 +183,7 @@ struct journal {
unsigned long flags;
union journal_res_state reservations;
+ enum journal_watermark watermark;
/* Max size of current journal entry */
unsigned cur_entry_u64s;
@@ -171,14 +193,7 @@ struct journal {
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
- enum {
- cur_entry_ok,
- cur_entry_blocked,
- cur_entry_journal_full,
- cur_entry_journal_pin_full,
- cur_entry_journal_stuck,
- cur_entry_insufficient_devices,
- } cur_entry_error;
+ enum journal_errors cur_entry_error;
union journal_preres_state prereserved;
@@ -246,6 +261,10 @@ struct journal {
spinlock_t err_lock;
struct mutex reclaim_lock;
+ /*
+ * Used for waiting until journal reclaim has freed up space in the
+ * journal:
+ */
wait_queue_head_t reclaim_wait;
struct task_struct *reclaim_thread;
bool reclaim_kicked;
@@ -265,7 +284,6 @@ struct journal {
unsigned long last_flush_write;
u64 res_get_blocked_start;
- u64 need_write_time;
u64 write_start_time;
u64 nr_flush_writes;
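
JOURNAL_WATERMARKS() and JOURNAL_ERRORS() above use the same x-macro pattern as the option tables in opts.c, so the enum and any matching string table cannot drift apart. A minimal, truncated expansion sketch (SKETCH_JOURNAL_ERRORS and journal_error_strs are illustrative names; the real list lives in journal_types.h):

#define SKETCH_JOURNAL_ERRORS()		\
	x(ok)				\
	x(journal_full)			\
	x(journal_stuck)

enum sketch_journal_errors {
#define x(n)	JOURNAL_ERR_##n,
	SKETCH_JOURNAL_ERRORS()
#undef x
};

static const char * const journal_error_strs[] = {
#define x(n)	#n,
	SKETCH_JOURNAL_ERRORS()
#undef x
	NULL
};

/* JOURNAL_ERR_journal_full == 1 and journal_error_strs[1] == "journal_full" */
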
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
new file mode 100644
index 000000000000..4f0e6960e597
--- /dev/null
+++ b/fs/bcachefs/lru.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ if (bkey_val_bytes(k.k) < sizeof(*lru))
+ return "incorrect value size";
+
+ return NULL;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ pr_buf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 existing_idx;
+ int ret = 0;
+
+ if (!time)
+ return 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+ POS(id, time),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_lru) {
+ bch2_fs_inconsistent(c,
+ "pointer to nonexistent lru %llu:%llu",
+ id, time);
+ ret = -EIO;
+ goto err;
+ }
+
+ existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
+ if (existing_idx != idx) {
+ bch2_fs_inconsistent(c,
+ "lru %llu:%llu with wrong backpointer: got %llu, should be %llu",
+ id, time, existing_idx, idx);
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_lru *lru;
+ int ret = 0;
+
+ if (!*time)
+ return 0;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_lru,
+ POS(lru_id, *time),
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES, k, ret)
+ if (bkey_deleted(k.k))
+ break;
+
+ if (ret)
+ goto err;
+
+ BUG_ON(iter.pos.inode != lru_id);
+ *time = iter.pos.offset;
+
+ lru = bch2_trans_kmalloc(trans, sizeof(*lru));
+ ret = PTR_ERR_OR_ZERO(lru);
+ if (ret)
+ goto err;
+
+ bkey_lru_init(&lru->k_i);
+ lru->k.p = iter.pos;
+ lru->v.idx = cpu_to_le64(idx);
+
+ ret = bch2_trans_update(trans, &iter, &lru->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx,
+ u64 old_time, u64 *new_time)
+{
+ if (old_time == *new_time)
+ return 0;
+
+ return lru_delete(trans, id, idx, old_time) ?:
+ lru_set(trans, id, idx, new_time);
+}
+
+static int bch2_check_lru_key(struct btree_trans *trans,
+ struct btree_iter *lru_iter, bool initial)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c lru_k, k;
+ struct bch_alloc_v4 a;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ u64 idx;
+ int ret;
+
+ lru_k = bch2_btree_iter_peek(lru_iter);
+ if (!lru_k.k)
+ return 0;
+
+ ret = bkey_err(lru_k);
+ if (ret)
+ return ret;
+
+ idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(lru_k.k->p.inode, idx), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (fsck_err_on(bucket_state(a) != BUCKET_cached ||
+ a.io_time[READ] != lru_k.k->p.offset, c,
+ "incorrect lru entry %s\n"
+ " for %s",
+ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.p = lru_iter->pos;
+
+ ret = bch2_trans_update(trans, lru_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+int bch2_check_lrus(struct bch_fs *c, bool initial)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_check_lru_key(&trans, &iter, initial));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+
+}
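
The new lru btree stores one key per cached bucket at POS(lru_id, time), with the value pointing back at the bucket index, and bch2_lru_change() is the delete-then-reinsert helper used when a bucket's position in the LRU moves. A hedged usage sketch (update_bucket_lru is hypothetical; the dev/bucket/io_time[READ] mapping is inferred from bch2_check_lru_key() above):

static int update_bucket_lru(struct btree_trans *trans, unsigned dev,
			     u64 bucket, u64 old_time, u64 new_time)
{
	/* lru_delete() verifies the back pointer at (dev, old_time) before
	 * deleting; lru_set() then inserts at the first empty slot at or
	 * after (dev, new_time) and reports the slot actually used: */
	return bch2_lru_change(trans, dev, bucket, old_time, &new_time);
}
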
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
new file mode 100644
index 000000000000..4db6a8399332
--- /dev/null
+++ b/fs/bcachefs/lru.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_lru (struct bkey_ops) { \
+ .key_invalid = bch2_lru_invalid, \
+ .val_to_text = bch2_lru_to_text, \
+}
+
+int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *);
+
+int bch2_check_lrus(struct bch_fs *, bool);
+
+#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index f73be9cb7ac3..1de213506adf 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -92,10 +92,10 @@ next:
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
struct bkey_i *update;
- size_t i;
+ u32 *i;
- for (i = 0; i < s.nr; i++)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
+ darray_for_each(s.ids, i)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i))
goto next;
update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
@@ -125,7 +125,7 @@ next:
}
}
bch2_trans_iter_exit(trans, &iter);
- kfree(s.d);
+ darray_exit(s.ids);
return ret;
}
@@ -351,8 +351,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
}
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
- m->op.alloc_reserve = RESERVE_MOVINGGC;
- m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
+ m->op.alloc_reserve = RESERVE_movinggc;
} else {
/* XXX: this should probably be passed in */
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
@@ -481,25 +480,26 @@ static void move_read_endio(struct bio *bio)
atomic_sub(io->read_sectors, &ctxt->read_sectors);
io->read_completed = true;
- if (next_pending_write(ctxt))
- wake_up(&ctxt->wait);
-
+ wake_up(&ctxt->wait);
closure_put(&ctxt->cl);
}
-static void do_pending_writes(struct moving_context *ctxt)
+static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans)
{
struct moving_io *io;
+ if (trans)
+ bch2_trans_unlock(trans);
+
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
}
}
-#define move_ctxt_wait_event(_ctxt, _cond) \
+#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
do { \
- do_pending_writes(_ctxt); \
+ do_pending_writes(_ctxt, _trans); \
\
if (_cond) \
break; \
@@ -507,11 +507,12 @@ do { \
next_pending_write(_ctxt) || (_cond)); \
} while (1)
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
+ struct btree_trans *trans)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
- move_ctxt_wait_event(ctxt,
+ move_ctxt_wait_event(ctxt, trans,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
@@ -533,14 +534,6 @@ static int bch2_move_extent(struct btree_trans *trans,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->write_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
-
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->read_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
-
/* write path might have to decompress data: */
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
@@ -691,26 +684,36 @@ static int __bch2_move_data(struct bch_fs *c,
schedule_timeout(delay);
if (unlikely(freezing(current))) {
- bch2_trans_unlock(&trans);
- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
- bch2_trans_begin(&trans);
+ move_ctxt_wait_event(ctxt, &trans,
+ atomic_read(&ctxt->write_sectors) <
+ SECTORS_IN_FLIGHT_PER_DEVICE);
- k = bch2_btree_iter_peek(&iter);
+ move_ctxt_wait_event(ctxt, &trans,
+ atomic_read(&ctxt->read_sectors) <
+ SECTORS_IN_FLIGHT_PER_DEVICE);
- stats->pos = iter.pos;
+ bch2_trans_begin(&trans);
+ k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
+
ret = bkey_err(k);
+ if (ret == -EINTR)
+ continue;
if (ret)
break;
+
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
+ stats->pos = iter.pos;
+
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
@@ -745,22 +748,22 @@ static int __bch2_move_data(struct bch_fs *c,
BUG();
}
- /* unlock before doing IO: */
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- bch2_trans_unlock(&trans);
ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
data_cmd, data_opts);
if (ret2) {
- if (ret2 == -EINTR) {
- bch2_trans_begin(&trans);
+ if (ret2 == -EINTR)
continue;
- }
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
+ bch2_move_ctxt_wait_for_io(ctxt, &trans);
continue;
}
@@ -845,7 +848,7 @@ int bch2_move_data(struct bch_fs *c,
}
- move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+ move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
closure_sync(&ctxt.cl);
EBUG_ON(atomic_read(&ctxt.write_sectors));
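
The move path above copies @k into a bkey_buf before issuing the read, because __bch2_read_extent() drops btree locks and the iterator's key memory may then be reused. A small sketch of that copy pattern (read_key_copy is a hypothetical wrapper around the bkey_buf helpers used in the hunk):

static void read_key_copy(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_buf sk;

	bch2_bkey_buf_init(&sk);
	bch2_bkey_buf_reassemble(&sk, c, k);	/* deep copy of key + value */
	k = bkey_i_to_s_c(sk.k);		/* now stable across unlocks */

	/* ... hand k to the read path, which may unlock the transaction ... */

	bch2_bkey_buf_exit(&sk, c);
}
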
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 7b7eee9b1773..cb6b81678ecc 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -6,6 +6,7 @@
*/
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
@@ -29,21 +30,6 @@
#include <linux/sort.h>
#include <linux/wait.h>
-/*
- * We can't use the entire copygc reserve in one iteration of copygc: we may
- * need the buckets we're freeing up to go back into the copygc reserve to make
- * forward progress, but if the copygc reserve is full they'll be available for
- * any allocation - and it's possible that in a given iteration, we free up most
- * of the buckets we're going to free before we allocate most of the buckets
- * we're going to allocate.
- *
- * If we only use half of the reserve per iteration, then in steady state we'll
- * always have room in the reserve for the buckets we're going to need in the
- * next iteration:
- */
-#define COPYGC_BUCKETS_PER_ITER(ca) \
- ((ca)->free[RESERVE_MOVINGGC].size / 2)
-
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
const struct copygc_heap_entry *l = _l;
@@ -69,10 +55,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
.dev = p.ptr.dev,
.offset = p.ptr.offset,
};
+ ssize_t i;
- ssize_t i = eytzinger0_find_le(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, &search);
+ if (p.ptr.cached)
+ continue;
+
+ i = eytzinger0_find_le(h->data, h->used,
+ sizeof(h->data[0]),
+ bucket_offset_cmp, &search);
#if 0
/* eytzinger search verify code: */
ssize_t j = -1, k;
@@ -101,7 +91,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
data_opts->target = io_opts->background_target;
data_opts->nr_replicas = 1;
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_JOURNAL_RESERVED;
+ JOURNAL_WATERMARK_copygc;
data_opts->rewrite_dev = p.ptr.dev;
if (p.has_ec)
@@ -114,37 +104,113 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
return DATA_SKIP;
}
-static bool have_copygc_reserve(struct bch_dev *ca)
+static inline int fragmentation_cmp(copygc_heap *heap,
+ struct copygc_heap_entry l,
+ struct copygc_heap_entry r)
{
- bool ret;
+ return cmp_int(l.fragmentation, r.fragmentation);
+}
+
+static int walk_buckets_to_copygc(struct bch_fs *c)
+{
+ copygc_heap *h = &c->copygc_heap;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a;
+ int ret;
- spin_lock(&ca->fs->freelist_lock);
- ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
- ca->allocator_state != ALLOCATOR_running;
- spin_unlock(&ca->fs->freelist_lock);
+ bch2_trans_init(&trans, c, 0, 0);
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
+ struct copygc_heap_entry e;
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (a.data_type != BCH_DATA_user ||
+ a.dirty_sectors >= ca->mi.bucket_size ||
+ bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
+ continue;
+
+ e = (struct copygc_heap_entry) {
+ .dev = iter.pos.inode,
+ .gen = a.gen,
+ .replicas = 1 + a.stripe_redundancy,
+ .fragmentation = (u64) a.dirty_sectors * (1ULL << 31)
+ / ca->mi.bucket_size,
+ .sectors = a.dirty_sectors,
+ .offset = bucket_to_sector(ca, iter.pos.offset),
+ };
+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
+
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
return ret;
}
-static inline int fragmentation_cmp(copygc_heap *heap,
- struct copygc_heap_entry l,
- struct copygc_heap_entry r)
+static int bucket_inorder_cmp(const void *_l, const void *_r)
{
- return cmp_int(l.fragmentation, r.fragmentation);
+ const struct copygc_heap_entry *l = _l;
+ const struct copygc_heap_entry *r = _r;
+
+ return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
+}
+
+static int check_copygc_was_done(struct bch_fs *c,
+ u64 *sectors_not_moved,
+ u64 *buckets_not_moved)
+{
+ copygc_heap *h = &c->copygc_heap;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a;
+ struct copygc_heap_entry *i;
+ int ret = 0;
+
+ sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);
+
+ bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);
+
+ for (i = h->data; i < h->data + h->used; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+
+ bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));
+
+ ret = lockrestart_do(&trans,
+ bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
+ break;
+
+ bch2_alloc_to_v4(k, &a);
+
+ if (a.gen == i->gen && a.dirty_sectors) {
+ *sectors_not_moved += a.dirty_sectors;
+ *buckets_not_moved += 1;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
}
static int bch2_copygc(struct bch_fs *c)
{
copygc_heap *h = &c->copygc_heap;
struct copygc_heap_entry e, *i;
- struct bucket_array *buckets;
struct bch_move_stats move_stats;
u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
u64 sectors_reserved = 0;
u64 buckets_to_move, buckets_not_moved = 0;
struct bch_dev *ca;
unsigned dev_idx;
- size_t b, heap_size = 0;
+ size_t heap_size = 0;
int ret;
bch_move_stats_init(&move_stats, "copygc");
@@ -169,44 +235,25 @@ static int bch2_copygc(struct bch_fs *c)
}
for_each_rw_member(ca, c, dev_idx) {
- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
-
- spin_lock(&ca->fs->freelist_lock);
- sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
- spin_unlock(&ca->fs->freelist_lock);
-
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- struct bucket *g = buckets->b + b;
- struct bucket_mark m = READ_ONCE(g->mark);
- struct copygc_heap_entry e;
-
- if (m.owned_by_allocator ||
- m.data_type != BCH_DATA_user ||
- !bucket_sectors_used(m) ||
- bucket_sectors_used(m) >= ca->mi.bucket_size)
- continue;
-
- WARN_ON(m.stripe && !g->stripe_redundancy);
-
- e = (struct copygc_heap_entry) {
- .dev = dev_idx,
- .gen = m.gen,
- .replicas = 1 + g->stripe_redundancy,
- .fragmentation = bucket_sectors_used(m) * (1U << 15)
- / ca->mi.bucket_size,
- .sectors = bucket_sectors_used(m),
- .offset = bucket_to_sector(ca, b),
- };
- heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
- }
- up_read(&ca->bucket_lock);
+ s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc),
+ ca->mi.nbuckets >> 6);
+
+ sectors_reserved += avail * ca->mi.bucket_size;
+ }
+
+ ret = walk_buckets_to_copygc(c);
+ if (ret) {
+ bch2_fs_fatal_error(c, "error walking buckets to copygc!");
+ return ret;
+ }
+
+ if (!h->used) {
+ bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!");
+ return 0;
}
/*
- * Our btree node allocations also come out of RESERVE_MOVINGGC:
+ * Our btree node allocations also come out of RESERVE_movinggc:
*/
sectors_reserved = (sectors_reserved * 3) / 4;
if (!sectors_reserved) {
@@ -226,8 +273,11 @@ static int bch2_copygc(struct bch_fs *c)
buckets_to_move = h->used;
- if (!buckets_to_move)
+ if (!buckets_to_move) {
+ bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
+ sectors_reserved);
return 0;
+ }
eytzinger0_sort(h->data, h->used,
sizeof(h->data[0]),
@@ -240,30 +290,18 @@ static int bch2_copygc(struct bch_fs *c)
writepoint_ptr(&c->copygc_write_point),
copygc_pred, NULL,
&move_stats);
+ if (ret) {
+ bch_err(c, "error %i from bch2_move_data() in copygc", ret);
+ return ret;
+ }
- for_each_rw_member(ca, c, dev_idx) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
- for (i = h->data; i < h->data + h->used; i++) {
- struct bucket_mark m;
- size_t b;
-
- if (i->dev != dev_idx)
- continue;
-
- b = sector_to_bucket(ca, i->offset);
- m = READ_ONCE(buckets->b[b].mark);
-
- if (i->gen == m.gen &&
- bucket_sectors_used(m)) {
- sectors_not_moved += bucket_sectors_used(m);
- buckets_not_moved++;
- }
- }
- up_read(&ca->bucket_lock);
+ ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
+ if (ret) {
+ bch_err(c, "error %i from check_copygc_was_done()", ret);
+ return ret;
}
- if (sectors_not_moved && !ret)
+ if (sectors_not_moved)
bch_warn_ratelimited(c,
"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
sectors_not_moved, sectors_to_move,
@@ -301,8 +339,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
for_each_rw_member(ca, c, dev_idx) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
- ca->mi.bucket_size) >> 1);
+ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
+ ca->mi.bucket_size) >> 1);
fragmented = usage.d[BCH_DATA_user].fragmented;
wait = min(wait, max(0LL, fragmented_allowed - fragmented));
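
walk_buckets_to_copygc() above ranks buckets by a fixed-point fragmentation value. A worked example with made-up numbers:

/* bucket_size   = 512 sectors
 * dirty_sectors = 128
 *
 * fragmentation = 128 * (1ULL << 31) / 512 = 1ULL << 29
 *
 * i.e. the bucket is 25% full, expressed as a fraction of 2^31. Buckets with
 * smaller values hold less live data and are cheaper for copygc to evacuate.
 */
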
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index d9ca69f2ecde..77fbb7d2194e 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -9,7 +9,12 @@
#include "super-io.h"
#include "util.h"
-#define x(t, n) #t,
+#define x(t, n) [n] = #t,
+
+const char * const bch2_metadata_versions[] = {
+ BCH_METADATA_VERSIONS()
+ NULL
+};
const char * const bch2_error_actions[] = {
BCH_ERROR_ACTIONS()
@@ -71,6 +76,16 @@ const char * const bch2_member_states[] = {
NULL
};
+const char * const bch2_jset_entry_types[] = {
+ BCH_JSET_ENTRY_TYPES()
+ NULL
+};
+
+const char * const bch2_fs_usage_types[] = {
+ BCH_FS_USAGE_TYPES()
+ NULL
+};
+
#undef x
const char * const bch2_d_types[BCH_DT_MAX] = {
@@ -86,6 +101,16 @@ const char * const bch2_d_types[BCH_DT_MAX] = {
[DT_SUBVOL] = "subvol",
};
+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
+{
+ BUG();
+}
+
+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
+{
+ BUG();
+}
+
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
#define x(_name, ...) \
@@ -199,42 +224,43 @@ static int bch2_mount_opt_lookup(const char *name)
return bch2_opt_lookup(name);
}
-static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v)
+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
{
if (v < opt->min) {
- if (msg)
- pr_err("invalid %s%s: too small (min %llu)",
- msg, opt->attr.name, opt->min);
+ if (err)
+ pr_buf(err, "%s: too small (min %llu)",
+ opt->attr.name, opt->min);
return -ERANGE;
}
if (opt->max && v >= opt->max) {
- if (msg)
- pr_err("invalid %s%s: too big (max %llu)",
- msg, opt->attr.name, opt->max);
+ if (err)
+ pr_buf(err, "%s: too big (max %llu)",
+ opt->attr.name, opt->max);
return -ERANGE;
}
if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
- if (msg)
- pr_err("invalid %s %s: not a multiple of 512",
- msg, opt->attr.name);
+ if (err)
+ pr_buf(err, "%s: not a multiple of 512",
+ opt->attr.name);
return -EINVAL;
}
if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
- if (msg)
- pr_err("invalid %s%s: must be a power of two",
- msg, opt->attr.name);
+ if (err)
+ pr_buf(err, "%s: must be a power of two",
+ opt->attr.name);
return -EINVAL;
}
return 0;
}
-int bch2_opt_parse(struct bch_fs *c, const char *msg,
+int bch2_opt_parse(struct bch_fs *c,
const struct bch_option *opt,
- const char *val, u64 *res)
+ const char *val, u64 *res,
+ struct printbuf *err)
{
ssize_t ret;
@@ -267,10 +293,11 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg,
return ret;
}
- return bch2_opt_validate(opt, msg, *res);
+ return bch2_opt_validate(opt, *res, err);
}
-void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
+void bch2_opt_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bch_sb *sb,
const struct bch_option *opt, u64 v,
unsigned flags)
{
@@ -300,7 +327,7 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
pr_buf(out, opt->choices[v]);
break;
case BCH_OPT_FN:
- opt->to_text(out, c, v);
+ opt->to_text(out, c, sb, v);
break;
default:
BUG();
@@ -346,6 +373,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
char *copied_opts, *copied_opts_start;
char *opt, *name, *val;
int ret, id;
+ struct printbuf err = PRINTBUF;
u64 v;
if (!options)
@@ -365,8 +393,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
if (id < 0)
goto bad_opt;
- ret = bch2_opt_parse(c, "mount option ",
- &bch2_opt_table[id], val, &v);
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
if (ret < 0)
goto bad_val;
} else {
@@ -409,7 +436,7 @@ bad_opt:
ret = -1;
goto out;
bad_val:
- pr_err("Invalid value %s for mount option %s", val, name);
+ pr_err("Invalid mount option %s", err.buf);
ret = -1;
goto out;
no_val:
@@ -418,9 +445,26 @@ no_val:
goto out;
out:
kfree(copied_opts_start);
+ printbuf_exit(&err);
return ret;
}
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+{
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ v = opt->get_sb(sb);
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = 1ULL << v;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v <<= 9;
+
+ return v;
+}
+
/*
* Initial options from superblock - here we don't want any options undefined,
* any options the superblock doesn't specify are set to 0:
@@ -428,28 +472,14 @@ out:
int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
{
unsigned id;
- int ret;
for (id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
- if (opt->get_sb == NO_SB_OPT)
+ if (opt->get_sb == BCH2_NO_SB_OPT)
continue;
- v = opt->get_sb(sb);
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = 1ULL << v;
-
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v <<= 9;
-
- ret = bch2_opt_validate(opt, "superblock option ", v);
- if (ret)
- return ret;
-
- bch2_opt_set_by_id(opts, id, v);
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
}
return 0;
@@ -457,7 +487,7 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_NO_SB_OPT)
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
return;
if (opt->flags & OPT_SB_FIELD_SECTORS)
@@ -471,7 +501,7 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_NO_SB_OPT)
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
return;
mutex_lock(&c->sb_lock);
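
bch2_opt_from_sb() above centralizes how superblock-encoded option values are decoded before use. A minimal sketch of the two transforms with worked numbers (decode_sb_opt is a hypothetical helper):

static u64 decode_sb_opt(u64 stored, bool is_ilog2, bool is_sectors)
{
	u64 v = stored;

	if (is_ilog2)		/* OPT_SB_FIELD_ILOG2: stored as log2 of the value */
		v = 1ULL << v;	/* e.g. stored 7 -> 128 */

	if (is_sectors)		/* OPT_SB_FIELD_SECTORS: stored in 512-byte sectors */
		v <<= 9;	/* e.g. 128 sectors -> 65536 bytes */

	return v;
}
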
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 661eb5764f68..8bc67d07afb9 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -8,6 +8,7 @@
#include <linux/sysfs.h>
#include "bcachefs_format.h"
+extern const char * const bch2_metadata_versions[];
extern const char * const bch2_error_actions[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
@@ -20,6 +21,8 @@ extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const bch2_data_types[];
extern const char * const bch2_member_states[];
+extern const char * const bch2_jset_entry_types[];
+extern const char * const bch2_fs_usage_types[];
extern const char * const bch2_d_types[];
static inline const char *bch2_d_type_str(unsigned d_type)
@@ -40,7 +43,8 @@ static inline const char *bch2_d_type_str(unsigned d_type)
*/
/* dummy option, for options that aren't stored in the superblock */
-LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
+u64 BCH2_NO_SB_OPT(const struct bch_sb *);
+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
/* When can be set: */
enum opt_flags {
@@ -200,7 +204,7 @@ enum opt_type {
x(btree_node_mem_ptr_optimization, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, true, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
@@ -227,7 +231,7 @@ enum opt_type {
x(inline_data, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, true, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
x(acl, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
@@ -252,26 +256,26 @@ enum opt_type {
x(degraded, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in degraded mode") \
x(very_degraded, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Enable discard/TRIM support") \
x(verbose, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Extra debugging information during mount/recovery")\
x(journal_flush_delay, u32, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U32_MAX), \
+ OPT_UINT(1, U32_MAX), \
BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
NULL, "Delay in milliseconds before automatic journal commits")\
x(journal_flush_disabled, u8, \
@@ -289,94 +293,109 @@ enum opt_type {
x(fsck, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
x(fix_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Fix errors during fsck without asking") \
x(ratelimit_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
+ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
NULL, "Ratelimit error messages during fsck") \
x(nochanges, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Super read only mode - no writes at all will be issued,\n"\
"even if we have to replay the journal") \
x(norecovery, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't replay the journal") \
x(rebuild_replicas, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Rebuild the superblock replicas section") \
x(keep_journal, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't free journal entries/keys after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Read all journal entries, not just dirty ones")\
+ x(read_journal_only, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Only read the journal, skip the rest of recovery")\
+ x(journal_transaction_names, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
+ NULL, "Log transaction function names in journal") \
x(noexcl, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't open device in exclusive mode") \
x(sb, u64, \
OPT_MOUNT, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, BCH_SB_SECTOR, \
+ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
OPT_FS, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, NULL) \
x(nostart, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don\'t start filesystem, only open devices") \
x(reconstruct_alloc, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Reconstruct alloc btree") \
x(version_upgrade, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Set superblock to latest version,\n" \
"allowing any new features to be used") \
+ x(buckets_nouse, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allocate the buckets_nouse bitmap") \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, NULL) \
x(fs_size, u64, \
OPT_DEVICE, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, 0, \
+ BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(bucket, u32, \
OPT_DEVICE, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, 0, \
+ BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(durability, u8, \
OPT_DEVICE, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
- NO_SB_OPT, 1, \
+ BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times")
@@ -443,7 +462,7 @@ struct bch_option {
};
struct {
int (*parse)(struct bch_fs *, const char *, u64 *);
- void (*to_text)(struct printbuf *, struct bch_fs *, u64);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
};
};
@@ -458,18 +477,20 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
-int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *,
- const char *, u64 *);
+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
+ const char *, u64 *, struct printbuf *);
#define OPT_SHOW_FULL_LIST (1 << 0)
#define OPT_SHOW_MOUNT_STYLE (1 << 1)
-void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
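
The header changes above split option handling into a parse step and a separate bch2_opt_validate() step that reports into a printbuf, alongside the new bch2_opt_from_sb() helper and a struct bch_sb argument threaded through bch2_opt_to_text(). As a rough standalone illustration of the parse-then-validate split only (the names and types below are invented for the sketch, not the bcachefs API):

/* Standalone sketch of a parse-then-validate option helper; not the bcachefs API. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct opt_range { uint64_t min, max; };

/* Parse only: convert the string, no range checking yet. */
static int opt_parse(const char *val, uint64_t *res)
{
	char *end;

	*res = strtoull(val, &end, 10);
	return *end ? -EINVAL : 0;
}

/* Validate separately, formatting a human-readable error into err[]. */
static int opt_validate(const struct opt_range *r, uint64_t v,
			char *err, size_t errlen)
{
	if (v < r->min || v > r->max) {
		snprintf(err, errlen, "%llu is out of range (%llu-%llu)",
			 (unsigned long long) v,
			 (unsigned long long) r->min,
			 (unsigned long long) r->max);
		return -EINVAL;
	}
	return 0;
}

int main(void)
{
	struct opt_range durability = { 0, 4 };
	char err[80];
	uint64_t v;

	if (!opt_parse("7", &v) && opt_validate(&durability, v, err, sizeof(err)))
		printf("invalid option: %s\n", err);
	return 0;
}
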
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 8f8f4b0accd6..ca029a00e7b8 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -6,19 +6,55 @@
#include "subvolume.h"
#include "super-io.h"
-static const char *bch2_sb_validate_quota(struct bch_sb *sb,
- struct bch_sb_field *f)
+static const char * const bch2_quota_types[] = {
+ "user",
+ "group",
+ "project",
+};
+
+static const char * const bch2_quota_counters[] = {
+ "space",
+ "inodes",
+};
+
+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
- if (vstruct_bytes(&q->field) != sizeof(*q))
- return "invalid field quota: wrong size";
+ if (vstruct_bytes(&q->field) < sizeof(*q)) {
+ pr_buf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&q->field), sizeof(*q));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
+}
+
+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+ unsigned qtyp, counter;
+
+ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
+ pr_buf(out, "%s: flags %llx",
+ bch2_quota_types[qtyp],
+ le64_to_cpu(q->q[qtyp].flags));
+
+ for (counter = 0; counter < Q_COUNTERS; counter++)
+ pr_buf(out, " %s timelimit %u warnlimit %u",
+ bch2_quota_counters[counter],
+ le32_to_cpu(q->q[qtyp].c[counter].timelimit),
+ le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
+
+ pr_newline(out);
+ }
}
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
- .validate = bch2_sb_validate_quota,
+ .validate = bch2_sb_quota_validate,
+ .to_text = bch2_sb_quota_to_text,
};
const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
@@ -32,11 +68,6 @@ const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
-static const char * const bch2_quota_counters[] = {
- "space",
- "inodes",
-};
-
void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -570,7 +601,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_USR, 0),
POS(QTYP_USR + 1, 0),
- NULL);
+ 0, NULL);
if (ret)
return ret;
}
@@ -582,7 +613,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_GRP, 0),
POS(QTYP_GRP + 1, 0),
- NULL);
+ 0, NULL);
if (ret)
return ret;
}
@@ -594,7 +625,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_PRJ, 0),
POS(QTYP_PRJ + 1, 0),
- NULL);
+ 0, NULL);
if (ret)
return ret;
}
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index a573fede05b1..d914892f5339 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -257,35 +257,47 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
- char h1[21], h2[21];
- bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9);
- bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9);
- pr_buf(out, "fullest_dev (%i):\t%s/%s\n",
- w.dev_most_full_idx, h1, h2);
+ out->tabstops[0] = 20;
- bch2_hprint(&PBUF(h1), w.total_work << 9);
- bch2_hprint(&PBUF(h2), c->capacity << 9);
- pr_buf(out, "total work:\t\t%s/%s\n", h1, h2);
+ pr_buf(out, "fullest_dev (%i):", w.dev_most_full_idx);
+ pr_tab(out);
- pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate);
+ bch2_hprint(out, w.dev_most_full_work << 9);
+ pr_buf(out, "/");
+ bch2_hprint(out, w.dev_most_full_capacity << 9);
+ pr_newline(out);
+
+ pr_buf(out, "total work:");
+ pr_tab(out);
+
+ bch2_hprint(out, w.total_work << 9);
+ pr_buf(out, "/");
+ bch2_hprint(out, c->capacity << 9);
+ pr_newline(out);
+
+ pr_buf(out, "rate:");
+ pr_tab(out);
+ pr_buf(out, "%u", r->pd.rate.rate);
+ pr_newline(out);
switch (r->state) {
case REBALANCE_WAITING:
- pr_buf(out, "waiting\n");
+ pr_buf(out, "waiting");
break;
case REBALANCE_THROTTLED:
- bch2_hprint(&PBUF(h1),
+ pr_buf(out, "throttled for %lu sec or ",
+ (r->throttled_until_cputime - jiffies) / HZ);
+ bch2_hprint(out,
(r->throttled_until_iotime -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
- pr_buf(out, "throttled for %lu sec or %s io\n",
- (r->throttled_until_cputime - jiffies) / HZ,
- h1);
+ pr_buf(out, " io");
break;
case REBALANCE_RUNNING:
- pr_buf(out, "running\n");
+ pr_buf(out, "running");
break;
}
+ pr_newline(out);
}
void bch2_rebalance_stop(struct bch_fs *c)
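
The rebalance status output above drops the hand-rolled '\t'/'\n' strings and the hprint-into-a-stack-buffer pattern in favour of printbuf tabstops (out->tabstops[0] = 20) with pr_tab()/pr_newline(). A minimal standalone sketch of the same label/value column alignment, assuming nothing about the real printbuf internals:

/* Standalone sketch of tabstop-style column output; not the real printbuf. */
#include <stdio.h>

#define TABSTOP	20

/* Print the label, then pad with spaces out to the tabstop column. */
static void pr_field(const char *label, const char *value)
{
	printf("%-*s%s\n", TABSTOP, label, value);
}

int main(void)
{
	pr_field("fullest_dev (0):", "1.2G/10G");
	pr_field("total work:", "3.4G/40G");
	pr_field("rate:", "512");
	return 0;
}
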
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 62efcc9504ba..ca92fe84c248 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -16,6 +16,7 @@
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
+#include "lru.h"
#include "move.h"
#include "quota.h"
#include "recovery.h"
@@ -59,23 +60,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
static int __journal_key_cmp(enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
- struct journal_key *r)
+ const struct journal_key *r)
{
return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?:
bpos_cmp(l_pos, r->k->k.p));
}
-static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
- return (cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p));
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
-static size_t journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
+size_t bch2_journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
size_t l = 0, r = journal_keys->nr, m;
@@ -96,6 +95,24 @@ static size_t journal_key_search(struct journal_keys *journal_keys,
return l;
}
+struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *end = keys->d + keys->nr;
+ struct journal_key *k = keys->d +
+ bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while (k < end && k->overwritten)
+ k++;
+
+ if (k < end &&
+ k->btree_id == btree_id &&
+ k->level == level)
+ return k->k;
+ return NULL;
+}
+
static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
{
struct bkey_i *n = iter->keys->d[idx].k;
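
bch2_journal_keys_peek() above builds on bch2_journal_key_search(), a lower-bound binary search over the sorted journal_keys array, and then skips entries flagged as overwritten. A standalone sketch of that search-then-skip pattern with a simplified key type (the field names are illustrative only):

/* Standalone sketch: lower-bound search over sorted keys, skipping overwritten ones. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct key {
	unsigned		btree_id, level;
	unsigned long long	pos;
	bool			overwritten;
};

static int cmp_u64(unsigned long long l, unsigned long long r)
{
	return (l > r) - (l < r);
}

static int key_cmp(unsigned btree_id, unsigned level, unsigned long long pos,
		   const struct key *r)
{
	int c = cmp_u64(btree_id, r->btree_id);

	if (!c) c = cmp_u64(level, r->level);
	if (!c) c = cmp_u64(pos, r->pos);
	return c;
}

/* Return the index of the first key >= the search key. */
static size_t key_search(const struct key *d, size_t nr,
			 unsigned btree_id, unsigned level, unsigned long long pos)
{
	size_t l = 0, r = nr;

	while (l < r) {
		size_t m = l + (r - l) / 2;

		if (key_cmp(btree_id, level, pos, &d[m]) > 0)
			l = m + 1;
		else
			r = m;
	}
	return l;
}

/* Peek the first live (non-overwritten) key at or after pos in this btree/level. */
static const struct key *keys_peek(const struct key *d, size_t nr,
				   unsigned btree_id, unsigned level,
				   unsigned long long pos)
{
	size_t i = key_search(d, nr, btree_id, level, pos);

	while (i < nr && d[i].overwritten)
		i++;

	if (i < nr && d[i].btree_id == btree_id && d[i].level == level)
		return &d[i];
	return NULL;
}

int main(void)
{
	struct key keys[] = {
		{ 0, 0, 10, true  },	/* overwritten, will be skipped */
		{ 0, 0, 20, false },
		{ 1, 0, 5,  false },
	};
	const struct key *k = keys_peek(keys, 3, 0, 0, 10);

	printf("found pos %llu\n", k ? k->pos : 0);
	return 0;
}
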
@@ -109,17 +126,33 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign
iter->idx++;
}
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
{
struct journal_key n = {
.btree_id = id,
.level = level,
- .allocated = true
+ .k = k,
+ .allocated = true,
+ /*
+ * Ensure these keys are done last by journal replay, to unblock
+ * journal reclaim:
+ */
+ .journal_seq = U32_MAX,
};
struct journal_keys *keys = &c->journal_keys;
struct journal_iter *iter;
- unsigned idx = journal_key_search(keys, id, level, k->k.p);
+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+ BUG_ON(test_bit(BCH_FS_RW, &c->flags));
+
+ if (idx < keys->nr &&
+ journal_key_cmp(&n, &keys->d[idx]) == 0) {
+ if (keys->d[idx].allocated)
+ kfree(keys->d[idx].k);
+ keys->d[idx] = n;
+ return 0;
+ }
if (keys->nr == keys->size) {
struct journal_keys new_keys = {
@@ -140,25 +173,34 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
*keys = new_keys;
}
- n.k = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
- if (!n.k)
- return -ENOMEM;
+ array_insert_item(keys->d, keys->nr, idx, n);
- bkey_copy(n.k, k);
+ list_for_each_entry(iter, &c->journal_iters, list)
+ journal_iter_fix(c, iter, idx);
- if (idx < keys->nr &&
- journal_key_cmp(&n, &keys->d[idx]) == 0) {
- if (keys->d[idx].allocated)
- kfree(keys->d[idx].k);
- keys->d[idx] = n;
- } else {
- array_insert_item(keys->d, keys->nr, idx, n);
+ return 0;
+}
- list_for_each_entry(iter, &c->journal_iters, list)
- journal_iter_fix(c, iter, idx);
- }
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct bkey_i *n;
+ int ret;
- return 0;
+ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ bkey_copy(n, k);
+ ret = bch2_journal_key_insert_take(c, id, level, n);
+ if (ret)
+ kfree(n);
+ return ret;
}
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
@@ -172,17 +214,33 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
return bch2_journal_key_insert(c, id, level, &whiteout);
}
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (idx < keys->nr &&
+ keys->d[idx].btree_id == btree &&
+ keys->d[idx].level == level &&
+ !bpos_cmp(keys->d[idx].k->k.p, pos))
+ keys->d[idx].overwritten = true;
+}
+
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->idx - iter->keys->nr
- ? iter->keys->d + iter->idx : NULL;
+ struct journal_key *k = iter->keys->d + iter->idx;
- if (k &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level)
- return k->k;
+ while (k < iter->keys->d + iter->keys->nr &&
+ k->btree_id == iter->btree_id &&
+ k->level == iter->level) {
+ if (!k->overwritten)
+ return k->k;
+
+ iter->idx++;
+ k = iter->keys->d + iter->idx;
+ }
- iter->idx = iter->keys->nr;
return NULL;
}
@@ -205,8 +263,7 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->btree_id = id;
iter->level = level;
iter->keys = &c->journal_keys;
- iter->idx = journal_key_search(&c->journal_keys, id, level, pos);
- list_add(&iter->list, &c->journal_iters);
+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -292,106 +349,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
- struct btree *b)
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
iter->b = b;
- bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
- bch2_journal_iter_init(c, &iter->journal,
- b->c.btree_id, b->c.level, b->data->min_key);
+ iter->node_iter = node_iter;
+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ INIT_LIST_HEAD(&iter->journal.list);
}
-/* Walk btree, overlaying keys from the journal: */
-
-static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
- struct btree_and_journal_iter iter)
-{
- unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
- struct bkey_s_c k;
- struct bkey_buf tmp;
-
- BUG_ON(!b->c.level);
-
- bch2_bkey_buf_init(&tmp);
-
- while (i < nr &&
- (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
-
- bch2_btree_node_prefetch(c, NULL, NULL, tmp.k,
- b->c.btree_id, b->c.level - 1);
-
- bch2_btree_and_journal_iter_advance(&iter);
- i++;
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b,
- enum btree_id btree_id,
- btree_walk_key_fn key_fn)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf tmp;
- struct btree *child;
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
-
- child = bch2_btree_node_get_noiter(c, tmp.k,
- b->c.btree_id, b->c.level - 1,
- false);
-
- ret = PTR_ERR_OR_ZERO(child);
- if (ret)
- break;
-
- btree_and_journal_iter_prefetch(c, b, iter);
-
- ret = bch2_btree_and_journal_walk_recurse(trans, child,
- btree_id, key_fn);
- six_unlock_read(&child->c.lock);
- } else {
- ret = key_fn(trans, k);
- }
-
- if (ret)
- break;
-
- bch2_btree_and_journal_iter_advance(&iter);
- }
-
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id,
- btree_walk_key_fn key_fn)
+/*
+ * this version is used by btree_gc before filesystem has gone RW and
+ * multithreaded, so uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b)
{
- struct bch_fs *c = trans->c;
- struct btree *b = c->btree_roots[btree_id].b;
- int ret = 0;
-
- if (btree_node_fake(b))
- return 0;
+ struct btree_node_iter node_iter;
- six_lock_read(&b->c.lock, NULL, NULL);
- ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn);
- six_unlock_read(&b->c.lock);
-
- return ret;
+ bch2_btree_node_iter_init_from_start(&node_iter, b);
+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+ list_add(&iter->journal.list, &c->journal_iters);
}
/* sort and dedup all keys in the journal: */
@@ -416,9 +400,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
const struct journal_key *l = _l;
const struct journal_key *r = _r;
- return cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p) ?:
+ return journal_key_cmp(l, r) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
@@ -511,8 +493,8 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
-static int __bch2_journal_replay_key(struct btree_trans *trans,
- struct journal_key *k)
+static int bch2_journal_replay_key(struct btree_trans *trans,
+ struct journal_key *k)
{
struct btree_iter iter;
unsigned iter_flags =
@@ -521,111 +503,75 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
int ret;
if (!k->level && k->btree_id == BTREE_ID_alloc)
- iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL;
+ iter_flags |= BTREE_ITER_CACHED;
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
-{
- unsigned commit_flags =
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED;
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
- if (!k->allocated)
- commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
+ /* Must be checked with btree locked: */
+ if (k->overwritten)
+ goto out;
- return bch2_trans_do(c, NULL, NULL, commit_flags,
- __bch2_journal_replay_key(&trans, k));
+ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
- const struct journal_key *l = _l;
- const struct journal_key *r = _r;
+ const struct journal_key *l = *((const struct journal_key **)_l);
+ const struct journal_key *r = *((const struct journal_key **)_r);
- return cmp_int(r->level, l->level) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->btree_id, r->btree_id) ?:
- bpos_cmp(l->k->k.p, r->k->k.p);
+ return cmp_int(l->journal_seq, r->journal_seq);
}
-static int bch2_journal_replay(struct bch_fs *c,
- struct journal_keys keys)
+static int bch2_journal_replay(struct bch_fs *c)
{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key **keys_sorted, *k;
struct journal *j = &c->journal;
- struct journal_key *i;
- u64 seq;
+ size_t i;
int ret;
- sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
+ keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
+ if (!keys_sorted)
+ return -ENOMEM;
- if (keys.nr)
- replay_now_at(j, keys.journal_seq_base);
+ for (i = 0; i < keys->nr; i++)
+ keys_sorted[i] = &keys->d[i];
- seq = j->replay_journal_seq;
+ sort(keys_sorted, keys->nr,
+ sizeof(keys_sorted[0]),
+ journal_sort_seq_cmp, NULL);
- /*
- * First replay updates to the alloc btree - these will only update the
- * btree key cache:
- */
- for_each_journal_key(keys, i) {
- cond_resched();
+ if (keys->nr)
+ replay_now_at(j, keys->journal_seq_base);
- if (!i->level && i->btree_id == BTREE_ID_alloc) {
- j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
- ret = bch2_journal_replay_key(c, i);
- if (ret)
- goto err;
- }
- }
+ for (i = 0; i < keys->nr; i++) {
+ k = keys_sorted[i];
- /*
- * Next replay updates to interior btree nodes:
- */
- for_each_journal_key(keys, i) {
cond_resched();
- if (i->level) {
- j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
- ret = bch2_journal_replay_key(c, i);
- if (ret)
- goto err;
- }
- }
-
- /*
- * Now that the btree is in a consistent state, we can start journal
- * reclaim (which will be flushing entries from the btree key cache back
- * to the btree:
- */
- set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
- set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
- journal_reclaim_kick(j);
-
- j->replay_journal_seq = seq;
+ if (!k->allocated)
+ replay_now_at(j, keys->journal_seq_base + k->journal_seq);
- /*
- * Now replay leaf node updates:
- */
- for_each_journal_key(keys, i) {
- cond_resched();
-
- if (i->level || i->btree_id == BTREE_ID_alloc)
- continue;
-
- replay_now_at(j, keys.journal_seq_base + i->journal_seq);
-
- ret = bch2_journal_replay_key(c, i);
- if (ret)
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL|
+ (!k->allocated
+ ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved
+ : 0),
+ bch2_journal_replay_key(&trans, k));
+ if (ret) {
+ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+ ret, bch2_btree_ids[k->btree_id], k->level);
goto err;
+ }
}
replay_now_at(j, j->replay_journal_seq_end);
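
bch2_journal_replay() now builds an array of pointers into journal_keys and sorts only that index by journal_seq, so replay proceeds in journal order without disturbing the btree-order sort that the search helpers above rely on. A standalone sketch of sorting an index of pointers rather than the records themselves (simplified key struct, not the real journal_key):

/* Standalone sketch: sort pointers by journal_seq, leaving the key array in btree order. */
#include <stdio.h>
#include <stdlib.h>

struct jkey {
	unsigned		btree_id;
	unsigned long long	pos;
	unsigned		journal_seq;
};

static int seq_cmp(const void *_l, const void *_r)
{
	const struct jkey *l = *(const struct jkey * const *) _l;
	const struct jkey *r = *(const struct jkey * const *) _r;

	return (l->journal_seq > r->journal_seq) -
	       (l->journal_seq < r->journal_seq);
}

int main(void)
{
	struct jkey keys[] = {		/* sorted by (btree_id, pos) */
		{ 0, 1, 7 }, { 0, 9, 3 }, { 2, 4, 5 },
	};
	const struct jkey *sorted[3];
	size_t i;

	for (i = 0; i < 3; i++)
		sorted[i] = &keys[i];

	qsort(sorted, 3, sizeof(sorted[0]), seq_cmp);

	for (i = 0; i < 3; i++)		/* replay in journal order */
		printf("seq %u: btree %u pos %llu\n",
		       sorted[i]->journal_seq, sorted[i]->btree_id, sorted[i]->pos);
	return 0;
}
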
@@ -633,10 +579,12 @@ static int bch2_journal_replay(struct bch_fs *c,
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
- return bch2_journal_error(j);
+ ret = bch2_journal_error(j);
+
+ if (keys->nr && !ret)
+ bch2_journal_log_msg(&c->journal, "journal replay finished");
err:
- bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
- ret, bch2_btree_ids[i->btree_id], i->level);
+ kvfree(keys_sorted);
return ret;
}
@@ -674,15 +622,15 @@ static int journal_replay_entry_early(struct bch_fs *c,
container_of(entry, struct jset_entry_usage, entry);
switch (entry->btree_id) {
- case FS_USAGE_RESERVED:
+ case BCH_FS_USAGE_reserved:
if (entry->level < BCH_REPLICAS_MAX)
c->usage_base->persistent_reserved[entry->level] =
le64_to_cpu(u->v);
break;
- case FS_USAGE_INODES:
+ case BCH_FS_USAGE_inodes:
c->usage_base->nr_inodes = le64_to_cpu(u->v);
break;
- case FS_USAGE_KEY_VERSION:
+ case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,
le64_to_cpu(u->v));
break;
@@ -702,10 +650,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
- sizeof(struct jset_entry_dev_usage_type);
- unsigned i;
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
@@ -820,6 +765,8 @@ static int verify_superblock_clean(struct bch_fs *c,
{
unsigned i;
struct bch_sb_field_clean *clean = *cleanp;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
int ret = 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
@@ -832,7 +779,6 @@ static int verify_superblock_clean(struct bch_fs *c,
}
for (i = 0; i < BTREE_ID_NR; i++) {
- char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
unsigned l1 = 0, l2 = 0;
@@ -842,6 +788,19 @@ static int verify_superblock_clean(struct bch_fs *c,
if (!k1 && !k2)
continue;
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ if (k1)
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+ else
+ pr_buf(&buf1, "(none)");
+
+ if (k2)
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+ else
+ pr_buf(&buf2, "(none)");
+
mustfix_fsck_err_on(!k1 || !k2 ||
IS_ERR(k1) ||
IS_ERR(k2) ||
@@ -851,10 +810,12 @@ static int verify_superblock_clean(struct bch_fs *c,
"superblock btree root %u doesn't match journal after clean shutdown\n"
"sb: l=%u %s\n"
"journal: l=%u %s\n", i,
- l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
- l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
+ l1, buf1.buf,
+ l2, buf2.buf);
}
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
@@ -881,7 +842,7 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
return ERR_PTR(-ENOMEM);
}
- ret = bch2_sb_clean_validate(c, clean, READ);
+ ret = bch2_sb_clean_validate_late(c, clean, READ);
if (ret) {
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
@@ -977,7 +938,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_inode_unpacked inode;
@@ -991,7 +951,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
goto err;
if (!bkey_is_inode(k.k)) {
- bch_err(c, "root inode not found");
+ bch_err(trans->c, "root inode not found");
ret = -ENOENT;
goto err;
}
@@ -1069,8 +1029,8 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
c->opts.version_upgrade = true;
c->opts.fsck = true;
- } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
- bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
+ } else if (c->sb.version < bcachefs_metadata_version_alloc_v4) {
+ bch_info(c, "filesystem version is prior to alloc_v4 - upgrading");
c->opts.version_upgrade = true;
}
}
@@ -1084,6 +1044,7 @@ int bch2_fs_recovery(struct bch_fs *c)
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
struct journal_replay *i;
+ bch_verbose(c, "starting journal read");
ret = bch2_journal_read(c, &c->journal_entries,
&blacklist_seq, &journal_seq);
if (ret)
@@ -1132,6 +1093,9 @@ use_clean:
blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
+ if (c->opts.read_journal_only)
+ goto out;
+
if (c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
drop_alloc_keys(&c->journal_keys);
@@ -1143,6 +1107,16 @@ use_clean:
if (ret)
goto err;
+ /*
+	 * After an unclean shutdown, skip the next few journal sequence
+ * numbers as they may have been referenced by btree writes that
+ * happened before their corresponding journal writes - those btree
+ * writes need to be ignored, by skipping and blacklisting the next few
+ * journal sequence numbers:
+ */
+ if (!c->sb.clean)
+ journal_seq += 8;
+
if (blacklist_seq != journal_seq) {
ret = bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
@@ -1163,7 +1137,11 @@ use_clean:
bch_verbose(c, "starting alloc read");
err = "error reading allocation information";
+
+ down_read(&c->gc_lock);
ret = bch2_alloc_read(c);
+ up_read(&c->gc_lock);
+
if (ret)
goto err;
bch_verbose(c, "alloc read done");
@@ -1175,7 +1153,12 @@ use_clean:
goto err;
bch_verbose(c, "stripes_read done");
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ /*
+ * If we're not running fsck, this ensures bch2_fsck_err() calls are
+ * instead interpreted as bch2_inconsistent_err() calls:
+ */
+ if (!c->opts.fsck)
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
if (c->opts.fsck ||
!(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
@@ -1183,18 +1166,32 @@ use_clean:
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bool metadata_only = c->opts.norecovery;
- bch_info(c, "starting mark and sweep");
- err = "error in mark and sweep";
+ bch_info(c, "checking allocations");
+ err = "error checking allocations";
ret = bch2_gc(c, true, metadata_only);
if (ret)
goto err;
- bch_verbose(c, "mark and sweep done");
+ bch_verbose(c, "done checking allocations");
+ }
+
+ if (c->opts.fsck) {
+ bch_info(c, "checking need_discard and freespace btrees");
+ err = "error checking need_discard and freespace btrees";
+ ret = bch2_check_alloc_info(c, true);
+ if (ret)
+ goto err;
+
+ ret = bch2_check_lrus(c, true);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking need_discard and freespace btrees");
}
bch2_stripes_heap_start(c);
clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
/*
* Skip past versions that might have possibly been used (as nonces),
@@ -1206,29 +1203,18 @@ use_clean:
if (c->opts.norecovery)
goto out;
- bch_verbose(c, "starting journal replay");
+ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
err = "journal replay failed";
- ret = bch2_journal_replay(c, c->journal_keys);
+ ret = bch2_journal_replay(c);
if (ret)
goto err;
- bch_verbose(c, "journal replay done");
+ if (c->opts.verbose || !c->sb.clean)
+ bch_info(c, "journal replay done");
- if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
- !c->opts.nochanges) {
- /*
- * note that even when filesystem was clean there might be work
- * to do here, if we ran gc (because of fsck) which recalculated
- * oldest_gen:
- */
- bch_verbose(c, "writing allocation info");
- err = "error writing out alloc info";
- ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW);
- if (ret) {
- bch_err(c, "error writing alloc info");
- goto err;
- }
- bch_verbose(c, "alloc write done");
- }
+ err = "error initializing freespace";
+ ret = bch2_fs_freespace_init(c);
+ if (ret)
+ goto err;
if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
bch2_fs_lazy_rw(c);
@@ -1279,34 +1265,7 @@ use_clean:
bch_verbose(c, "quotas done");
}
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
- le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) {
- struct bch_move_stats stats;
-
- bch_move_stats_init(&stats, "recovery");
-
- bch_info(c, "scanning for old btree nodes");
- ret = bch2_fs_read_write(c);
- if (ret)
- goto err;
-
- ret = bch2_scan_old_btree_nodes(c, &stats);
- if (ret)
- goto err;
- bch_info(c, "scanning for old btree nodes done");
- }
-
mutex_lock(&c->sb_lock);
- /*
- * With journal replay done, we can clear the journal seq blacklist
- * table:
- */
- BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
- BUG_ON(le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written);
-
- bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0);
-
if (c->opts.version_upgrade) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
@@ -1330,6 +1289,28 @@ use_clean:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
+ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) {
+ struct bch_move_stats stats;
+
+ bch_move_stats_init(&stats, "recovery");
+
+ bch_info(c, "scanning for old btree nodes");
+ ret = bch2_fs_read_write(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_scan_old_btree_nodes(c, &stats);
+ if (ret)
+ goto err;
+ bch_info(c, "scanning for old btree nodes done");
+ }
+
+ if (c->journal_seq_blacklist_table &&
+ c->journal_seq_blacklist_table->nr > 128)
+ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
+
ret = 0;
out:
set_bit(BCH_FS_FSCK_DONE, &c->flags);
@@ -1375,15 +1356,13 @@ int bch2_fs_initialize(struct bch_fs *c)
}
mutex_unlock(&c->sb_lock);
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
- set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
-
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);
@@ -1409,6 +1388,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* Write out the superblock and journal buckets, now that we can do
* btree updates
*/
+ bch_verbose(c, "marking superblocks");
err = "error marking superblock and journal";
for_each_member_device(ca, c, i) {
ret = bch2_trans_mark_dev_sb(c, ca);
@@ -1420,6 +1400,12 @@ int bch2_fs_initialize(struct bch_fs *c)
ca->new_fs_bucket_idx = 0;
}
+ bch_verbose(c, "initializing freespace");
+ err = "error initializing freespace";
+ ret = bch2_fs_freespace_init(c);
+ if (ret)
+ goto err;
+
err = "error creating root snapshot node";
ret = bch2_fs_initialize_subvolumes(c);
if (ret)
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index e45c70b3693f..e6927a918df3 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -31,24 +31,32 @@ struct btree_and_journal_iter {
} last;
};
+size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
+ unsigned, struct bpos);
+struct bkey_i *bch2_journal_keys_peek(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos pos);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct bch_fs *, struct btree *,
+ struct btree_node_iter, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
-typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c);
-
-int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn);
-
void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_entries_free(struct list_head *);
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index c8d6d73681e0..6824730945d4 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -98,6 +98,24 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
}
+int bch2_trans_mark_reflink_v(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
+ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
+
+ if (!r->v.refcount) {
+ r->k.type = KEY_TYPE_deleted;
+ r->k.size = 0;
+ set_bkey_val_u64s(&r->k, 0);
+ return 0;
+ }
+ }
+
+ return bch2_trans_mark_extent(trans, old, new, flags);
+}
+
/* indirect inline data */
const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c,
@@ -119,6 +137,24 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
min(datalen, 32U), d.v->data);
}
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
+ struct bkey_i_indirect_inline_data *r =
+ bkey_i_to_indirect_inline_data(new);
+
+ if (!r->v.refcount) {
+ r->k.type = KEY_TYPE_deleted;
+ r->k.size = 0;
+ set_bkey_val_u64s(&r->k, 0);
+ }
+ }
+
+ return 0;
+}
+
static int bch2_make_extent_indirect(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i *orig)
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 3745873fd88d..8eb41c0292eb 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -10,27 +10,37 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
.val_to_text = bch2_reflink_p_to_text, \
- .key_merge = bch2_reflink_p_merge, \
+ .key_merge = bch2_reflink_p_merge, \
+ .trans_trigger = bch2_trans_mark_reflink_p, \
+ .atomic_trigger = bch2_mark_reflink_p, \
}
const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
+int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c,
+ struct bkey_i *, unsigned);
#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_reflink_v, \
+ .atomic_trigger = bch2_mark_extent, \
}
const char *bch2_indirect_inline_data_invalid(const struct bch_fs *,
struct bkey_s_c);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+ struct bkey_s_c, struct bkey_i *,
+ unsigned);
#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
+ .trans_trigger = bch2_trans_mark_indirect_inline_data, \
}
static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 6c5ea78d6762..c2771112d573 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -36,23 +36,40 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
+void bch2_replicas_entry_v0_to_text(struct printbuf *out,
+ struct bch_replicas_entry_v0 *e)
+{
+ unsigned i;
+
+ if (e->data_type < BCH_DATA_NR)
+ pr_buf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ pr_buf(out, "(invalid data type %u)", e->data_type);
+
+ pr_buf(out, ": %u [", e->nr_devs);
+ for (i = 0; i < e->nr_devs; i++)
+ pr_buf(out, i ? " %u" : "%u", e->devs[i]);
+ pr_buf(out, "]");
+}
+
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry *e)
{
unsigned i;
- pr_buf(out, "%s: %u/%u [",
- bch2_data_types[e->data_type],
- e->nr_required,
- e->nr_devs);
+ if (e->data_type < BCH_DATA_NR)
+ pr_buf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ pr_buf(out, "(invalid data type %u)", e->data_type);
+ pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs);
for (i = 0; i < e->nr_devs; i++)
pr_buf(out, i ? " %u" : "%u", e->devs[i]);
pr_buf(out, "]");
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
- struct bch_replicas_cpu *r)
+ struct bch_replicas_cpu *r)
{
struct bch_replicas_entry *e;
bool first = true;
@@ -413,18 +430,10 @@ err:
goto out;
}
-static int __bch2_mark_replicas(struct bch_fs *c,
- struct bch_replicas_entry *r,
- bool check)
-{
- return likely(bch2_replicas_marked(c, r)) ? 0
- : check ? -1
- : bch2_mark_replicas_slowpath(c, r);
-}
-
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
{
- return __bch2_mark_replicas(c, r, false);
+ return likely(bch2_replicas_marked(c, r))
+ ? 0 : bch2_mark_replicas_slowpath(c, r);
}
/* replicas delta list: */
@@ -808,67 +817,78 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
return 0;
}
-static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
+static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
- unsigned i;
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ unsigned i, j;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
- for (i = 0; i + 1 < cpu_r->nr; i++) {
- struct bch_replicas_entry *l =
+ for (i = 0; i < cpu_r->nr; i++) {
+ struct bch_replicas_entry *e =
cpu_replicas_entry(cpu_r, i);
- struct bch_replicas_entry *r =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
- if (!memcmp(l, r, cpu_r->entry_size))
- return "duplicate replicas entry";
- }
+ if (e->data_type >= BCH_DATA_NR) {
+ pr_buf(err, "invalid data type in entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
- return NULL;
-}
+ if (!e->nr_devs) {
+ pr_buf(err, "no devices in entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- struct bch_replicas_cpu cpu_r = { .entries = NULL };
- struct bch_replicas_entry *e;
- const char *err;
- unsigned i;
+ if (e->nr_required > 1 &&
+ e->nr_required >= e->nr_devs) {
+ pr_buf(err, "bad nr_required in entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
- for_each_replicas_entry(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
+ for (j = 0; j < e->nr_devs; j++)
+ if (!bch2_dev_exists(sb, mi, e->devs[j])) {
+ pr_buf(err, "invalid device %u in entry ", e->devs[j]);
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
- err = "invalid replicas entry: no devices";
- if (!e->nr_devs)
- goto err;
+ if (i + 1 < cpu_r->nr) {
+ struct bch_replicas_entry *n =
+ cpu_replicas_entry(cpu_r, i + 1);
- err = "invalid replicas entry: bad nr_required";
- if (e->nr_required > 1 &&
- e->nr_required >= e->nr_devs)
- goto err;
+ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr_devs; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
+ if (!memcmp(e, n, cpu_r->entry_size)) {
+ pr_buf(err, "duplicate replicas entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
+ }
}
- err = "cannot allocate memory";
+ return 0;
+}
+
+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+ struct bch_replicas_cpu cpu_r;
+ int ret;
+
if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
- goto err;
+ return -ENOMEM;
- err = check_dup_replicas_entries(&cpu_r);
-err:
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
kfree(cpu_r.entries);
- return err;
+ return ret;
}
static void bch2_sb_replicas_to_text(struct printbuf *out,
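
bch2_cpu_replicas_validate() above sorts the entries first and then checks each one (data type, device list, nr_required), catching duplicates by comparing every entry with its successor. A standalone sketch of that sort-then-compare-adjacent check over fixed-size records (the entry layout below is invented for the example):

/* Standalone sketch: sort fixed-size entries, then catch duplicates by comparing neighbours. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
	unsigned char data_type, nr_devs, devs[4];
};

static int entry_cmp(const void *l, const void *r)
{
	return memcmp(l, r, sizeof(struct entry));
}

static int validate(struct entry *e, size_t nr)
{
	size_t i;

	qsort(e, nr, sizeof(*e), entry_cmp);

	for (i = 0; i < nr; i++) {
		if (!e[i].nr_devs) {
			fprintf(stderr, "entry %zu: no devices\n", i);
			return -1;
		}
		if (i + 1 < nr && !memcmp(&e[i], &e[i + 1], sizeof(*e))) {
			fprintf(stderr, "entry %zu: duplicate\n", i);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	struct entry e[] = {
		{ 1, 2, { 0, 1 } },
		{ 1, 2, { 0, 1 } },	/* duplicate, caught after sorting */
	};

	return validate(e, 2) ? 1 : 0;
}
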
@@ -886,49 +906,50 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
bch2_replicas_entry_to_text(out, e);
}
+ pr_newline(out);
}
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
- .validate = bch2_sb_validate_replicas,
+ .validate = bch2_sb_replicas_validate,
.to_text = bch2_sb_replicas_to_text,
};
-static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f)
+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- struct bch_replicas_cpu cpu_r = { .entries = NULL };
- struct bch_replicas_entry_v0 *e;
- const char *err;
- unsigned i;
+ struct bch_replicas_cpu cpu_r;
+ int ret;
- for_each_replicas_entry_v0(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
+ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
+ return -ENOMEM;
- err = "invalid replicas entry: no devices";
- if (!e->nr_devs)
- goto err;
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+ kfree(cpu_r.entries);
+ return ret;
+}
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr_devs; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
- }
+static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_entry_v0 *e;
+ bool first = true;
- err = "cannot allocate memory";
- if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
- goto err;
+ for_each_replicas_entry(sb_r, e) {
+ if (!first)
+ pr_buf(out, " ");
+ first = false;
- err = check_dup_replicas_entries(&cpu_r);
-err:
- kfree(cpu_r.entries);
- return err;
+ bch2_replicas_entry_v0_to_text(out, e);
+ }
+ pr_newline(out);
}
const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
- .validate = bch2_sb_validate_replicas_v0,
+ .validate = bch2_sb_replicas_v0_validate,
+ .to_text = bch2_sb_replicas_v0_to_text,
};
/* Query replicas: */
@@ -969,11 +990,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
if (dflags & ~flags) {
if (print) {
- char buf[100];
+ struct printbuf buf = PRINTBUF;
- bch2_replicas_entry_to_text(&PBUF(buf), e);
+ bch2_replicas_entry_to_text(&buf, e);
bch_err(c, "insufficient devices online (%u) for replicas entry %s",
- nr_online, buf);
+ nr_online, buf.buf);
+ printbuf_exit(&buf);
}
ret = false;
break;
@@ -985,19 +1007,42 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
return ret;
}
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
- struct bch_replicas_entry *e;
- unsigned i, ret = 0;
+ struct bch_sb_field_replicas *replicas;
+ struct bch_sb_field_replicas_v0 *replicas_v0;
+ unsigned i, data_has = 0;
+
+ replicas = bch2_sb_get_replicas(sb);
+ replicas_v0 = bch2_sb_get_replicas_v0(sb);
+
+ if (replicas) {
+ struct bch_replicas_entry *r;
+
+ for_each_replicas_entry(replicas, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ } else if (replicas_v0) {
+ struct bch_replicas_entry_v0 *r;
+
+ for_each_replicas_entry_v0(replicas_v0, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ }
- percpu_down_read(&c->mark_lock);
- for_each_cpu_replicas_entry(&c->replicas, e)
- for (i = 0; i < e->nr_devs; i++)
- if (e->devs[i] == ca->dev_idx)
- ret |= 1 << e->data_type;
+ return data_has;
+}
- percpu_up_read(&c->mark_lock);
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&c->sb_lock);
return ret;
}
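
bch2_sb_dev_has_data() now derives its answer straight from the superblock's replicas (or replicas_v0) section, OR-ing one bit per data type for every entry that mentions the device, instead of walking the in-memory replicas table under mark_lock. A standalone sketch of the bitmask accumulation (toy entry layout, not the on-disk format):

/* Standalone sketch: build a per-device bitmask of data types from replicas-style entries. */
#include <stdio.h>

struct rentry {
	unsigned data_type, nr_devs, devs[4];
};

static unsigned dev_has_data(const struct rentry *r, unsigned nr, unsigned dev)
{
	unsigned i, j, data_has = 0;

	for (i = 0; i < nr; i++)
		for (j = 0; j < r[i].nr_devs; j++)
			if (r[i].devs[j] == dev)
				data_has |= 1U << r[i].data_type;
	return data_has;
}

int main(void)
{
	struct rentry entries[] = {
		{ 1, 2, { 0, 1 } },	/* say, btree data on devices 0 and 1 */
		{ 2, 1, { 1 } },	/* say, user data on device 1 */
	};

	printf("device 1 data types: 0x%x\n", dev_has_data(entries, 2, 1));
	return 0;
}
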
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index d237d7c51ccb..87820b2e1ad3 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -64,6 +64,7 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
unsigned, bool);
+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 57d636740d2f..591bbb9f8beb 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -163,12 +163,10 @@ bch2_hash_lookup(struct btree_trans *trans,
if (ret)
return ret;
- for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
BTREE_ITER_SLOTS|flags, k, ret) {
- if (iter->pos.inode != inum.inum)
- break;
-
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
return 0;
@@ -199,15 +197,12 @@ bch2_hash_hole(struct btree_trans *trans,
if (ret)
return ret;
- for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter->pos.inode != inum.inum)
- break;
-
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
if (!is_visible_key(desc, inum, k))
return 0;
- }
bch2_trans_iter_exit(trans, iter);
return ret ?: -ENOSPC;
@@ -260,14 +255,12 @@ int bch2_hash_set(struct btree_trans *trans,
if (ret)
return ret;
- for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
SPOS(inum.inum,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
+ POS(inum.inum, U64_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter.pos.inode != inum.inum)
- break;
-
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 8aeb2e417a15..cdb89ba216cc 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -139,7 +139,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans)
for_each_btree_key(trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k, ret) {
u32 id = k.k->p.offset, child[2];
- unsigned nr_live = 0, live_idx;
+ unsigned nr_live = 0, live_idx = 0;
if (k.k->type != KEY_TYPE_snapshot)
continue;
@@ -151,7 +151,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans)
for (i = 0; i < 2; i++) {
ret = snapshot_live(trans, child[i]);
if (ret < 0)
- break;
+ goto err;
if (ret)
live_idx = i;
@@ -162,6 +162,7 @@ static int bch2_snapshots_set_equiv(struct btree_trans *trans)
? snapshot_t(c, child[live_idx])->equiv
: id;
}
+err:
bch2_trans_iter_exit(trans, &iter);
if (ret)
@@ -456,10 +457,10 @@ err:
return ret;
}
-static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
{
struct btree_iter iter;
struct bkey_i_snapshot *n;
@@ -522,7 +523,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
n = bch2_trans_kmalloc(trans, sizeof(*n));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
bkey_reassemble(&n->k_i, k);
@@ -544,36 +545,21 @@ err:
return ret;
}
-static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+static int snapshot_id_add(snapshot_id_list *s, u32 id)
{
BUG_ON(snapshot_list_has_id(s, id));
- if (s->nr == s->size) {
- size_t new_size = max(8U, s->size * 2);
- void *n = krealloc(s->d,
- new_size * sizeof(s->d[0]),
- GFP_KERNEL);
- if (!n) {
- pr_err("error allocating snapshot ID list");
- return -ENOMEM;
- }
-
- s->d = n;
- s->size = new_size;
- };
-
- s->d[s->nr++] = id;
- return 0;
+ return darray_push(*s, id);
}
static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
- struct snapshot_id_list *deleted,
+ snapshot_id_list *deleted,
enum btree_id btree_id)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- struct snapshot_id_list equiv_seen = { 0 };
+ snapshot_id_list equiv_seen = { 0 };
struct bpos last_pos = POS_MIN;
int ret = 0;
@@ -620,7 +606,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
}
bch2_trans_iter_exit(trans, &iter);
- kfree(equiv_seen.d);
+ darray_exit(equiv_seen);
return ret;
}
@@ -632,7 +618,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_snapshot snap;
- struct snapshot_id_list deleted = { 0 };
+ snapshot_id_list deleted = { 0 };
u32 i, id, children[2];
int ret = 0;
@@ -712,15 +698,15 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
for (i = 0; i < deleted.nr; i++) {
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(&trans, deleted.d[i]));
+ bch2_snapshot_node_delete(&trans, deleted.data[i]));
if (ret) {
bch_err(c, "error deleting snapshot %u: %i",
- deleted.d[i], ret);
+ deleted.data[i], ret);
goto err;
}
}
err:
- kfree(deleted.d);
+ darray_exit(deleted);
bch2_trans_exit(&trans);
percpu_ref_put(&c->writes);
}
@@ -875,14 +861,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs,
snapshot_wait_for_pagecache_and_delete_work);
- struct snapshot_id_list s;
+ snapshot_id_list s;
u32 *id;
int ret = 0;
while (!ret) {
mutex_lock(&c->snapshots_unlinked_lock);
s = c->snapshots_unlinked;
- memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+ darray_init(c->snapshots_unlinked);
mutex_unlock(&c->snapshots_unlinked_lock);
if (!s.nr)
@@ -890,7 +876,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
bch2_evict_subvolume_inodes(c, &s);
- for (id = s.d; id < s.d + s.nr; id++) {
+ for (id = s.data; id < s.data + s.nr; id++) {
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_subvolume_delete(&trans, *id));
if (ret) {
@@ -899,7 +885,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
}
}
- kfree(s.d);
+ darray_exit(s);
}
percpu_ref_put(&c->writes);
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index e4c3fdcdf22f..f609291acafa 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_SUBVOLUME_H
#define _BCACHEFS_SUBVOLUME_H
+#include "darray.h"
#include "subvolume_types.h"
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -58,15 +59,13 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
struct snapshots_seen {
struct bpos pos;
- size_t nr;
- size_t size;
- u32 *d;
+ DARRAY(u32) ids;
};
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
- kfree(s->d);
- s->d = NULL;
+ kfree(s->ids.data);
+ s->ids.data = NULL;
}
static inline void snapshots_seen_init(struct snapshots_seen *s)
@@ -76,30 +75,19 @@ static inline void snapshots_seen_init(struct snapshots_seen *s)
static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
- if (s->nr == s->size) {
- size_t new_size = max(s->size, (size_t) 128) * 2;
- u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
-
- if (!d) {
- bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
- new_size);
- return -ENOMEM;
- }
-
- s->size = new_size;
- s->d = d;
- }
-
- s->d[s->nr++] = id;
- return 0;
+ int ret = darray_push(s->ids, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
}
-static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- unsigned i;
+ u32 *i;
- for (i = 0; i < s->nr; i++)
- if (id == s->d[i])
+ darray_for_each(*s, i)
+ if (*i == id)
return true;
return false;
}
@@ -122,6 +110,10 @@ int bch2_snapshot_get_subvol(struct btree_trans *, u32,
struct bch_subvolume *);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+ u32 *, u32 *, unsigned);
+
int bch2_subvolume_delete(struct btree_trans *, u32);
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32,
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 9410b9587591..f7562b5d51df 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -2,10 +2,8 @@
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
#define _BCACHEFS_SUBVOLUME_TYPES_H
-struct snapshot_id_list {
- u32 nr;
- u32 size;
- u32 *d;
-};
+#include "darray.h"
+
+typedef DARRAY(u32) snapshot_id_list;
#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
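
The snapshot ID lists are converted from an open-coded (nr, size, d) triple to the darray type pulled in from "darray.h" above (typedef DARRAY(u32) snapshot_id_list), with darray_push()/darray_for_each()/darray_exit() replacing the hand-rolled krealloc loops. A rough userspace sketch of what such a growable-array helper can look like; the in-tree macros differ in detail:

/* Standalone sketch of a darray-style growable array; the in-tree darray.h differs in detail.
 * darray_push() uses a GNU C statement expression, as kernel code commonly does. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DARRAY(type)	struct { size_t nr, size; type *data; }

typedef DARRAY(uint32_t) u32_list;

/* Push one element, doubling the allocation when full; returns 0 or -1 on allocation failure. */
#define darray_push(d, item)						\
({									\
	int _ret = 0;							\
	if ((d).nr == (d).size) {					\
		size_t _new = (d).size ? (d).size * 2 : 8;		\
		void *_p = realloc((d).data, _new * sizeof((d).data[0])); \
		if (_p) {						\
			(d).data = _p;					\
			(d).size = _new;				\
		} else {						\
			_ret = -1;					\
		}							\
	}								\
	if (!_ret)							\
		(d).data[(d).nr++] = (item);				\
	_ret;								\
})

#define darray_for_each(d, i)						\
	for ((i) = (d).data; (i) < (d).data + (d).nr; (i)++)

#define darray_exit(d)							\
	do { free((d).data); (d).data = NULL; (d).nr = (d).size = 0; } while (0)

int main(void)
{
	u32_list ids = { 0 };
	uint32_t *i;

	if (darray_push(ids, 3) || darray_push(ids, 7))
		return 1;

	darray_for_each(ids, i)
		printf("%u\n", *i);

	darray_exit(ids);
	return 0;
}
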
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index b8d2cf66a630..71abf87114df 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -10,6 +10,7 @@
#include "io.h"
#include "journal.h"
#include "journal_io.h"
+#include "journal_sb.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "quota.h"
@@ -27,8 +28,8 @@ const char * const bch2_sb_fields[] = {
NULL
};
-static const char *bch2_sb_field_validate(struct bch_sb *,
- struct bch_sb_field *);
+static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
+ struct printbuf *);
struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
enum bch_sb_field_type type)
@@ -202,22 +203,31 @@ static inline void __bch2_sb_layout_size_assert(void)
BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
}
-static const char *validate_sb_layout(struct bch_sb_layout *layout)
+static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
{
u64 offset, prev_offset, max_sectors;
unsigned i;
- if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
- return "Not a bcachefs superblock layout";
+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) {
+ pr_buf(out, "Not a bcachefs superblock layout");
+ return -EINVAL;
+ }
- if (layout->layout_type != 0)
- return "Invalid superblock layout type";
+ if (layout->layout_type != 0) {
+ pr_buf(out, "Invalid superblock layout type %u",
+ layout->layout_type);
+ return -EINVAL;
+ }
- if (!layout->nr_superblocks)
- return "Invalid superblock layout: no superblocks";
+ if (!layout->nr_superblocks) {
+ pr_buf(out, "Invalid superblock layout: no superblocks");
+ return -EINVAL;
+ }
- if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
- return "Invalid superblock layout: too many superblocks";
+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
+ pr_buf(out, "Invalid superblock layout: too many superblocks");
+ return -EINVAL;
+ }
max_sectors = 1 << layout->sb_max_size_bits;
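
validate_sb_layout() now returns an errno-style int and formats its complaint into a caller-supplied printbuf; its overlap check in the next hunk requires each superblock copy to start at least max_sectors after the previous one. A standalone sketch of that check, with snprintf standing in for the real printbuf (the names below are invented for the sketch):

/* Standalone sketch of the layout overlap check: each superblock copy must start
 * at least max_sectors after the end of the previous one. */
#include <stdio.h>

static int check_layout(const unsigned long long *sb_offset, unsigned nr,
			unsigned sb_max_size_bits, char *err, size_t errlen)
{
	unsigned long long max_sectors = 1ULL << sb_max_size_bits;
	unsigned long long prev = sb_offset[0];
	unsigned i;

	for (i = 1; i < nr; i++) {
		if (sb_offset[i] < prev + max_sectors) {
			snprintf(err, errlen,
				 "superblocks overlap: sb %u ends at %llu, next starts at %llu",
				 i - 1, prev + max_sectors, sb_offset[i]);
			return -1;
		}
		prev = sb_offset[i];
	}
	return 0;
}

int main(void)
{
	unsigned long long offsets[] = { 8, 2000 };	/* second copy starts too early */
	char err[128];

	if (check_layout(offsets, 2, 11, err, sizeof(err)))	/* max size 2^11 sectors */
		printf("%s\n", err);
	else
		printf("layout ok\n");
	return 0;
}
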
@@ -226,122 +236,163 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
for (i = 1; i < layout->nr_superblocks; i++) {
offset = le64_to_cpu(layout->sb_offset[i]);
- if (offset < prev_offset + max_sectors)
- return "Invalid superblock layout: superblocks overlap";
+ if (offset < prev_offset + max_sectors) {
+ pr_buf(out, "Invalid superblock layout: superblocks overlap\n"
+ " (sb %u ends at %llu next starts at %llu",
+		       " (sb %u ends at %llu next starts at %llu)",
+ return -EINVAL;
+ }
prev_offset = offset;
}
- return NULL;
+ return 0;
}
-const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
+ int rw)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
struct bch_sb_field_members *mi;
- const char *err;
+ enum bch_opt_id opt_id;
u32 version, version_min;
u16 block_size;
+ int ret;
version = le16_to_cpu(sb->version);
- version_min = version >= bcachefs_metadata_version_new_versioning
+ version_min = version >= bcachefs_metadata_version_bkey_renumber
? le16_to_cpu(sb->version_min)
: version;
- if (version >= bcachefs_metadata_version_max ||
- version_min < bcachefs_metadata_version_min)
- return "Unsupported superblock version";
+ if (version >= bcachefs_metadata_version_max) {
+ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+ return -EINVAL;
+ }
+
+ if (version_min < bcachefs_metadata_version_min) {
+ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+ return -EINVAL;
+ }
- if (version_min > version)
- return "Bad minimum version";
+ if (version_min > version) {
+ pr_buf(out, "Bad minimum version %u, greater than version field %u",
+ version_min, version);
+ return -EINVAL;
+ }
if (sb->features[1] ||
- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR)))
- return "Filesystem has incompatible features";
+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
+ pr_buf(out, "Filesystem has incompatible features");
+ return -EINVAL;
+ }
block_size = le16_to_cpu(sb->block_size);
- if (block_size > PAGE_SECTORS)
- return "Bad block size";
+ if (block_size > PAGE_SECTORS) {
+ pr_buf(out, "Block size too big (got %u, max %u)",
+ block_size, PAGE_SECTORS);
+ return -EINVAL;
+ }
- if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
- return "Bad user UUID";
+ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) {
+ pr_buf(out, "Bad user UUID (got zeroes)");
+ return -EINVAL;
+ }
- if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
- return "Bad internal UUID";
+ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) {
+ pr_buf(out, "Bad internal UUID (got zeroes)");
+ return -EINVAL;
+ }
if (!sb->nr_devices ||
- sb->nr_devices <= sb->dev_idx ||
- sb->nr_devices > BCH_SB_MEMBERS_MAX)
- return "Bad number of member devices";
-
- if (!BCH_SB_META_REPLICAS_WANT(sb) ||
- BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
-
- if (!BCH_SB_META_REPLICAS_REQ(sb) ||
- BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
-
- if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
- BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
- return "Invalid number of data replicas";
+ sb->nr_devices > BCH_SB_MEMBERS_MAX) {
+ pr_buf(out, "Bad number of member devices %u (max %u)",
+ sb->nr_devices, BCH_SB_MEMBERS_MAX);
+ return -EINVAL;
+ }
- if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
- BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
- return "Invalid number of data replicas";
+ if (sb->dev_idx >= sb->nr_devices) {
+ pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)",
+ sb->dev_idx, sb->nr_devices);
+ return -EINVAL;
+ }
- if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
- return "Invalid metadata checksum type";
+ if (!sb->time_precision ||
+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
+ pr_buf(out, "Invalid time precision: %u (min 1, max %lu)",
+ le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
+ return -EINVAL;
+ }
- if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
- return "Invalid metadata checksum type";
+ if (rw == READ) {
+ /*
+ * Been seeing a bug where these are getting inexplicably
+ * zeroed, so we're now validating them, but we have to be
+ * careful not to prevent people's filesystems from mounting:
+ */
+ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
+ }
- if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
- return "Invalid compression type";
+ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
+ const struct bch_option *opt = bch2_opt_table + opt_id;
- if (!BCH_SB_BTREE_NODE_SIZE(sb))
- return "Btree node size not set";
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, opt_id);
- if (BCH_SB_GC_RESERVE(sb) < 5)
- return "gc reserve percentage too small";
+ pr_buf(out, "Invalid option ");
+ ret = bch2_opt_validate(opt, v, out);
+ if (ret)
+ return ret;
- if (!sb->time_precision ||
- le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
- return "invalid time precision";
+ printbuf_reset(out);
+ }
+ }
/* validate layout */
- err = validate_sb_layout(&sb->layout);
- if (err)
- return err;
+ ret = validate_sb_layout(&sb->layout, out);
+ if (ret)
+ return ret;
vstruct_for_each(sb, f) {
- if (!f->u64s)
- return "Invalid superblock: invalid optional field";
+ if (!f->u64s) {
+ pr_buf(out, "Invalid superblock: optional field with size 0 (type %u)",
+ le32_to_cpu(f->type));
+ return -EINVAL;
+ }
- if (vstruct_next(f) > vstruct_last(sb))
- return "Invalid superblock: invalid optional field";
+ if (vstruct_next(f) > vstruct_last(sb)) {
+ pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+ le32_to_cpu(f->type));
+ return -EINVAL;
+ }
}
/* members must be validated first: */
mi = bch2_sb_get_members(sb);
- if (!mi)
- return "Invalid superblock: member info area missing";
+ if (!mi) {
+ pr_buf(out, "Invalid superblock: member info area missing");
+ return -EINVAL;
+ }
- err = bch2_sb_field_validate(sb, &mi->field);
- if (err)
- return err;
+ ret = bch2_sb_field_validate(sb, &mi->field, out);
+ if (ret)
+ return ret;
vstruct_for_each(sb, f) {
if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
continue;
- err = bch2_sb_field_validate(sb, f);
- if (err)
- return err;
+ ret = bch2_sb_field_validate(sb, f, out);
+ if (ret)
+ return ret;
}
- return NULL;
+ return 0;
}
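
A minimal sketch (not part of the patch) of the error-reporting convention this rework establishes: validators return an errno-style int and describe the failure into a caller-supplied printbuf, which the caller prints and then frees. The helper name below is made up; bch2_sb_validate() itself is now static to this file, so such a caller would live alongside it:

    static int example_validate_and_log(struct bch_sb_handle *sb)
    {
            struct printbuf err = PRINTBUF;         /* empty; allocates on first pr_buf() */
            int ret;

            ret = bch2_sb_validate(sb, &err, READ);
            if (ret)
                    pr_err("superblock validation failed: %s", err.buf);

            printbuf_exit(&err);                    /* always release, even on success */
            return ret;
    }
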
/* device open: */
@@ -403,7 +454,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
memcpy(dst->compat, src->compat, sizeof(dst->compat));
for (i = 0; i < BCH_SB_FIELD_NR; i++) {
- if (i == BCH_SB_FIELD_journal)
+ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
continue;
src_f = bch2_sb_field_get(src, i);
@@ -434,9 +485,6 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
__copy_super(&c->disk_sb, src);
- if (BCH_SB_INITIALIZED(c->disk_sb.sb))
- set_bit(BCH_FS_INITIALIZED, &c->flags);
-
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret)
return ret;
@@ -470,10 +518,12 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
/* read superblock: */
-static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
+static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
{
struct bch_csum csum;
+ u32 version, version_min;
size_t bytes;
+ int ret;
reread:
bio_reset(sb->bio);
bio_set_dev(sb->bio, sb->bdev);
@@ -481,40 +531,65 @@ reread:
bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
- if (submit_bio_wait(sb->bio))
- return "IO error";
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ pr_buf(err, "IO error: %i", ret);
+ return ret;
+ }
+
+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) {
+ pr_buf(err, "Not a bcachefs superblock");
+ return -EINVAL;
+ }
- if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
- return "Not a bcachefs superblock";
+ version = le16_to_cpu(sb->sb->version);
+ version_min = version >= bcachefs_metadata_version_bkey_renumber
+ ? le16_to_cpu(sb->sb->version_min)
+ : version;
- if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min ||
- le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max)
- return "Unsupported superblock version";
+ if (version >= bcachefs_metadata_version_max) {
+ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+ return -EINVAL;
+ }
+
+ if (version_min < bcachefs_metadata_version_min) {
+ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+ return -EINVAL;
+ }
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
- return "Bad superblock: too big";
+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
+ pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+ bytes, 512UL << sb->sb->layout.sb_max_size_bits);
+ return -EINVAL;
+ }
if (bytes > sb->buffer_size) {
if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
- return "cannot allocate memory";
+ return -ENOMEM;
goto reread;
}
- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
- return "unknown csum type";
+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+ pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+ return -EINVAL;
+ }
/* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
null_nonce(), sb->sb);
- if (bch2_crc_cmp(csum, sb->sb->csum))
- return "bad checksum reading superblock";
+ if (bch2_crc_cmp(csum, sb->sb->csum)) {
+ pr_buf(err, "bad checksum");
+ return -EINVAL;
+ }
sb->seq = le64_to_cpu(sb->sb->seq);
- return NULL;
+ return 0;
}
int bch2_read_super(const char *path, struct bch_opts *opts,
@@ -522,7 +597,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
- const char *err;
+ struct printbuf err = PRINTBUF;
__le64 *i;
int ret;
@@ -554,25 +629,28 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
goto out;
}
- err = "cannot allocate memory";
ret = bch2_sb_realloc(sb, 0);
- if (ret)
+ if (ret) {
+ pr_buf(&err, "error allocating memory for superblock");
goto err;
+ }
- ret = -EFAULT;
- err = "dynamic fault";
- if (bch2_fs_init_fault("read_super"))
+ if (bch2_fs_init_fault("read_super")) {
+ pr_buf(&err, "dynamic fault");
+ ret = -EFAULT;
goto err;
+ }
- ret = -EINVAL;
- err = read_one_super(sb, offset);
- if (!err)
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
goto got_super;
if (opt_defined(*opts, sb))
goto err;
- pr_err("error reading default superblock: %s", err);
+ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
+ path, err.buf);
+ printbuf_reset(&err);
/*
* Error reading primary superblock - read location of backup
@@ -588,13 +666,15 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
*/
bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
- err = "IO error";
- if (submit_bio_wait(sb->bio))
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ pr_buf(&err, "IO error: %i", ret);
goto err;
+ }
memcpy(&layout, sb->sb, sizeof(layout));
- err = validate_sb_layout(&layout);
- if (err)
+ ret = validate_sb_layout(&layout, &err);
+ if (ret)
goto err;
for (i = layout.sb_offset;
@@ -604,32 +684,39 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
if (offset == opt_get(*opts, sb))
continue;
- err = read_one_super(sb, offset);
- if (!err)
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
goto got_super;
}
- ret = -EINVAL;
goto err;
got_super:
- err = "Superblock block size smaller than device block size";
- ret = -EINVAL;
if (le16_to_cpu(sb->sb->block_size) << 9 <
bdev_logical_block_size(sb->bdev)) {
- pr_err("error reading superblock: Superblock block size (%u) smaller than device block size (%u)",
+ pr_buf(&err, "block size (%u) smaller than device block size (%u)",
le16_to_cpu(sb->sb->block_size) << 9,
bdev_logical_block_size(sb->bdev));
- goto err_no_print;
+ ret = -EINVAL;
+ goto err;
}
ret = 0;
sb->have_layout = true;
+
+ ret = bch2_sb_validate(sb, &err, READ);
+ if (ret) {
+ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
+ path, err.buf);
+ goto err_no_print;
+ }
out:
pr_verbose_init(*opts, "ret %i", ret);
+ printbuf_exit(&err);
return ret;
err:
- pr_err("error reading superblock: %s", err);
+ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
+ path, err.buf);
err_no_print:
bch2_free_super(sb);
goto out;
@@ -703,8 +790,8 @@ int bch2_write_super(struct bch_fs *c)
{
struct closure *cl = &c->sb_write;
struct bch_dev *ca;
+ struct printbuf err = PRINTBUF;
unsigned i, sb = 0, nr_wrote;
- const char *err;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
@@ -731,10 +818,12 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_from_fs(c, ca);
for_each_online_member(ca, c, i) {
- err = bch2_sb_validate(&ca->disk_sb);
- if (err) {
- bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
- ret = -1;
+ printbuf_reset(&err);
+
+ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
+ if (ret) {
+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
+ percpu_ref_put(&ca->io_ref);
goto out;
}
}
@@ -752,11 +841,24 @@ int bch2_write_super(struct bch_fs *c)
closure_sync(cl);
for_each_online_member(ca, c, i) {
- if (!ca->sb_write_error &&
- ca->disk_sb.seq !=
- le64_to_cpu(ca->sb_read_scratch->seq)) {
+ if (ca->sb_write_error)
+ continue;
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
+ bch2_fs_fatal_error(c,
+ "Superblock write was silently dropped! (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ percpu_ref_put(&ca->io_ref);
+ ret = -EROFS;
+ goto out;
+ }
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock modified by another process");
+ "Superblock modified by another process (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
ret = -EROFS;
goto out;
@@ -811,6 +913,7 @@ int bch2_write_super(struct bch_fs *c)
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
+ printbuf_exit(&err);
return ret;
}
@@ -825,133 +928,218 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
mutex_unlock(&c->sb_lock);
}
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
- u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
-
- return l < r ? -1 : l > r ? 1 : 0;
-}
+/* BCH_SB_FIELD_members: */
-static const char *bch2_sb_validate_journal(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_members_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
- const char *err;
- unsigned nr;
+ struct bch_sb_field_members *mi = field_to_type(f, members);
unsigned i;
- u64 *b;
-
- journal = bch2_sb_get_journal(sb);
- if (!journal)
- return NULL;
- nr = bch2_nr_journal_buckets(journal);
- if (!nr)
- return NULL;
-
- b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
- if (!b)
- return "cannot allocate memory";
+ if ((void *) (mi->members + sb->nr_devices) >
+ vstruct_end(&mi->field)) {
+ pr_buf(err, "too many devices for section size");
+ return -EINVAL;
+ }
- for (i = 0; i < nr; i++)
- b[i] = le64_to_cpu(journal->buckets[i]);
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member *m = mi->members + i;
- sort(b, nr, sizeof(u64), u64_cmp, NULL);
+ if (!bch2_member_exists(m))
+ continue;
- err = "journal bucket at sector 0";
- if (!b[0])
- goto err;
+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
+ pr_buf(err, "device %u: too many buckets (got %llu, max %lu)",
+ i, le64_to_cpu(m->nbuckets), LONG_MAX);
+ return -EINVAL;
+ }
- err = "journal bucket before first bucket";
- if (m && b[0] < le16_to_cpu(m->first_bucket))
- goto err;
+ if (le64_to_cpu(m->nbuckets) -
+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
+ pr_buf(err, "device %u: not enough buckets (got %llu, min %u)",
+ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
+ return -EINVAL;
+ }
- err = "journal bucket past end of device";
- if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
- goto err;
+ if (le16_to_cpu(m->bucket_size) <
+ le16_to_cpu(sb->block_size)) {
+ pr_buf(err, "device %u: bucket size %u smaller than block size %u",
+ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
+ return -EINVAL;
+ }
- err = "duplicate journal buckets";
- for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1])
- goto err;
+ if (le16_to_cpu(m->bucket_size) <
+ BCH_SB_BTREE_NODE_SIZE(sb)) {
+ pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu",
+ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+ return -EINVAL;
+ }
+ }
- err = NULL;
-err:
- kfree(b);
- return err;
+ return 0;
}
-static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
- .validate = bch2_sb_validate_journal,
-};
-
-/* BCH_SB_FIELD_members: */
-
-static const char *bch2_sb_validate_members(struct bch_sb *sb,
- struct bch_sb_field *f)
+static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
{
struct bch_sb_field_members *mi = field_to_type(f, members);
- struct bch_member *m;
+ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
+ unsigned i;
- if ((void *) (mi->members + sb->nr_devices) >
- vstruct_end(&mi->field))
- return "Invalid superblock: bad member info";
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member *m = mi->members + i;
+ unsigned data_have = bch2_sb_dev_has_data(sb, i);
+ u64 bucket_size = le16_to_cpu(m->bucket_size);
+ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++) {
if (!bch2_member_exists(m))
continue;
- if (le64_to_cpu(m->nbuckets) > LONG_MAX)
- return "Too many buckets";
+ pr_buf(out, "Device:");
+ pr_tab(out);
+ pr_buf(out, "%u", i);
+ pr_newline(out);
+
+ pr_indent_push(out, 2);
+
+ pr_buf(out, "UUID:");
+ pr_tab(out);
+ pr_uuid(out, m->uuid.b);
+ pr_newline(out);
+
+ pr_buf(out, "Size:");
+ pr_tab(out);
+ pr_units(out, device_size, device_size << 9);
+ pr_newline(out);
+
+ pr_buf(out, "Bucket size:");
+ pr_tab(out);
+ pr_units(out, bucket_size, bucket_size << 9);
+ pr_newline(out);
+
+ pr_buf(out, "First bucket:");
+ pr_tab(out);
+ pr_buf(out, "%u", le16_to_cpu(m->first_bucket));
+ pr_newline(out);
+
+ pr_buf(out, "Buckets:");
+ pr_tab(out);
+ pr_buf(out, "%llu", le64_to_cpu(m->nbuckets));
+ pr_newline(out);
+
+ pr_buf(out, "Last mount:");
+ pr_tab(out);
+ if (m->last_mount)
+ pr_time(out, le64_to_cpu(m->last_mount));
+ else
+ pr_buf(out, "(never)");
+ pr_newline(out);
+
+ pr_buf(out, "State:");
+ pr_tab(out);
+ pr_buf(out, "%s",
+ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
+ ? bch2_member_states[BCH_MEMBER_STATE(m)]
+ : "unknown");
+ pr_newline(out);
+
+ pr_buf(out, "Group:");
+ pr_tab(out);
+ if (BCH_MEMBER_GROUP(m)) {
+ unsigned idx = BCH_MEMBER_GROUP(m) - 1;
+
+ if (idx < disk_groups_nr(gi))
+ pr_buf(out, "%s (%u)",
+ gi->entries[idx].label, idx);
+ else
+ pr_buf(out, "(bad disk labels section)");
+ } else {
+ pr_buf(out, "(none)");
+ }
+ pr_newline(out);
- if (le64_to_cpu(m->nbuckets) -
- le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS)
- return "Not enough buckets";
+ pr_buf(out, "Data allowed:");
+ pr_tab(out);
+ if (BCH_MEMBER_DATA_ALLOWED(m))
+ bch2_flags_to_text(out, bch2_data_types,
+ BCH_MEMBER_DATA_ALLOWED(m));
+ else
+ pr_buf(out, "(none)");
+ pr_newline(out);
- if (le16_to_cpu(m->bucket_size) <
- le16_to_cpu(sb->block_size))
- return "bucket size smaller than block size";
+ pr_buf(out, "Has data:");
+ pr_tab(out);
+ if (data_have)
+ bch2_flags_to_text(out, bch2_data_types, data_have);
+ else
+ pr_buf(out, "(none)");
+ pr_newline(out);
- if (le16_to_cpu(m->bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(sb))
- return "bucket size smaller than btree node size";
- }
+ pr_buf(out, "Discard:");
+ pr_tab(out);
+ pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m));
+ pr_newline(out);
- return NULL;
+ pr_buf(out, "Freespace initialized:");
+ pr_tab(out);
+ pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
+ pr_newline(out);
+
+ pr_indent_pop(out, 2);
+ }
}
static const struct bch_sb_field_ops bch_sb_field_ops_members = {
- .validate = bch2_sb_validate_members,
+ .validate = bch2_sb_members_validate,
+ .to_text = bch2_sb_members_to_text,
};
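
Every superblock section handler in this file now follows the same two-hook shape under the reworked struct bch_sb_field_ops. A sketch of what a hypothetical new section's ops would look like (names invented for illustration, not part of the patch):

    static int bch2_sb_example_validate(struct bch_sb *sb,
                                        struct bch_sb_field *f,
                                        struct printbuf *err)
    {
            /*
             * Report only the specific problem; bch2_sb_field_validate()
             * prefixes the section name and dumps the field on error.
             */
            if (vstruct_bytes(f) < sizeof(*f)) {
                    pr_buf(err, "wrong size (got %zu)", vstruct_bytes(f));
                    return -EINVAL;
            }
            return 0;
    }

    static void bch2_sb_example_to_text(struct printbuf *out, struct bch_sb *sb,
                                        struct bch_sb_field *f)
    {
            pr_buf(out, "u64s: %u", le32_to_cpu(f->u64s));
            pr_newline(out);
    }

    static const struct bch_sb_field_ops bch_sb_field_ops_example = {
            .validate       = bch2_sb_example_validate,
            .to_text        = bch2_sb_example_to_text,
    };
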
/* BCH_SB_FIELD_crypt: */
-static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
- if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
- return "invalid field crypt: wrong size";
+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+ pr_buf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&crypt->field), sizeof(*crypt));
+ return -EINVAL;
+ }
+
+ if (BCH_CRYPT_KDF_TYPE(crypt)) {
+ pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ return -EINVAL;
+ }
- if (BCH_CRYPT_KDF_TYPE(crypt))
- return "invalid field crypt: bad kdf type";
+ return 0;
+}
- return NULL;
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ pr_buf(out, "KDF: %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ pr_newline(out);
+ pr_buf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
+ pr_newline(out);
+ pr_buf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
+ pr_newline(out);
+ pr_buf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
+ pr_newline(out);
}
static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
- .validate = bch2_sb_validate_crypt,
+ .validate = bch2_sb_crypt_validate,
+ .to_text = bch2_sb_crypt_to_text,
};
/* BCH_SB_FIELD_clean: */
-int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
{
struct jset_entry *entry;
int ret;
@@ -1027,7 +1215,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_usage, entry);
u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = FS_USAGE_INODES;
+ u->entry.btree_id = BCH_FS_USAGE_inodes;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
}
@@ -1037,7 +1225,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_usage, entry);
u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = FS_USAGE_KEY_VERSION;
+ u->entry.btree_id = BCH_FS_USAGE_key_version;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
}
@@ -1047,7 +1235,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_usage, entry);
u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = FS_USAGE_RESERVED;
+ u->entry.btree_id = BCH_FS_USAGE_reserved;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
}
@@ -1123,7 +1311,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
- sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
@@ -1140,7 +1328,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
* this should be in the write path, and we should be validating every
* superblock section:
*/
- ret = bch2_sb_clean_validate(c, sb_clean, WRITE);
+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
if (ret) {
bch_err(c, "error writing marking filesystem clean: validate error");
goto out;
@@ -1151,19 +1339,47 @@ out:
mutex_unlock(&c->sb_lock);
}
-static const char *bch2_sb_validate_clean(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_clean_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_clean *clean = field_to_type(f, clean);
- if (vstruct_bytes(&clean->field) < sizeof(*clean))
- return "invalid field crypt: wrong size";
+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
+ pr_buf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&clean->field), sizeof(*clean));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
+}
+
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+ struct jset_entry *entry;
+
+ pr_buf(out, "flags: %x", le32_to_cpu(clean->flags));
+ pr_newline(out);
+ pr_buf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
+ pr_newline(out);
+
+ for (entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+ !entry->u64s)
+ continue;
+
+ bch2_journal_entry_to_text(out, NULL, entry);
+ pr_newline(out);
+ }
}
static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
- .validate = bch2_sb_validate_clean,
+ .validate = bch2_sb_clean_validate,
+ .to_text = bch2_sb_clean_to_text,
};
static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
@@ -1173,14 +1389,27 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
#undef x
};
-static const char *bch2_sb_field_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
{
unsigned type = le32_to_cpu(f->type);
+ struct printbuf field_err = PRINTBUF;
+ int ret;
+
+ if (type >= BCH_SB_FIELD_NR)
+ return 0;
+
+ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err);
+ if (ret) {
+ pr_buf(err, "Invalid superblock section %s: %s",
+ bch2_sb_fields[type],
+ field_err.buf);
+ pr_newline(err);
+ bch2_sb_field_to_text(err, sb, f);
+ }
- return type < BCH_SB_FIELD_NR
- ? bch2_sb_field_ops[type]->validate(sb, f)
- : NULL;
+ printbuf_exit(&field_err);
+ return ret;
}
void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
@@ -1190,13 +1419,183 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR
? bch2_sb_field_ops[type] : NULL;
+ if (!out->tabstops[0])
+ out->tabstops[0] = 32;
+
if (ops)
pr_buf(out, "%s", bch2_sb_fields[type]);
else
pr_buf(out, "(unknown field %u)", type);
- pr_buf(out, " (size %llu):", vstruct_bytes(f));
+ pr_buf(out, " (size %zu):", vstruct_bytes(f));
+ pr_newline(out);
- if (ops && ops->to_text)
+ if (ops && ops->to_text) {
+ pr_indent_push(out, 2);
bch2_sb_field_ops[type]->to_text(out, sb, f);
+ pr_indent_pop(out, 2);
+ }
+}
+
+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
+{
+ unsigned i;
+
+ pr_buf(out, "Type: %u", l->layout_type);
+ pr_newline(out);
+
+ pr_buf(out, "Superblock max size: ");
+ pr_units(out,
+ 1 << l->sb_max_size_bits,
+ 512 << l->sb_max_size_bits);
+ pr_newline(out);
+
+ pr_buf(out, "Nr superblocks: %u", l->nr_superblocks);
+ pr_newline(out);
+
+ pr_buf(out, "Offsets: ");
+ for (i = 0; i < l->nr_superblocks; i++) {
+ if (i)
+ pr_buf(out, ", ");
+ pr_buf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
+ }
+ pr_newline(out);
+}
+
+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
+ bool print_layout, unsigned fields)
+{
+ struct bch_sb_field_members *mi;
+ struct bch_sb_field *f;
+ u64 fields_have = 0;
+ unsigned nr_devices = 0;
+
+ if (!out->tabstops[0])
+ out->tabstops[0] = 32;
+
+ mi = bch2_sb_get_members(sb);
+ if (mi) {
+ struct bch_member *m;
+
+ for (m = mi->members;
+ m < mi->members + sb->nr_devices;
+ m++)
+ nr_devices += bch2_member_exists(m);
+ }
+
+ pr_buf(out, "External UUID:");
+ pr_tab(out);
+ pr_uuid(out, sb->user_uuid.b);
+ pr_newline(out);
+
+ pr_buf(out, "Internal UUID:");
+ pr_tab(out);
+ pr_uuid(out, sb->uuid.b);
+ pr_newline(out);
+
+ pr_buf(out, "Device index:");
+ pr_tab(out);
+ pr_buf(out, "%u", sb->dev_idx);
+ pr_newline(out);
+
+ pr_buf(out, "Label:");
+ pr_tab(out);
+ pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+ pr_newline(out);
+
+ pr_buf(out, "Version:");
+ pr_tab(out);
+ pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]);
+ pr_newline(out);
+
+ pr_buf(out, "Oldest version on disk:");
+ pr_tab(out);
+ pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]);
+ pr_newline(out);
+
+ pr_buf(out, "Created:");
+ pr_tab(out);
+ if (sb->time_base_lo)
+ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ else
+ pr_buf(out, "(not set)");
+ pr_newline(out);
+
+ pr_buf(out, "Sequence number:");
+ pr_tab(out);
+ pr_buf(out, "%llu", le64_to_cpu(sb->seq));
+ pr_newline(out);
+
+ pr_buf(out, "Superblock size:");
+ pr_tab(out);
+ pr_buf(out, "%zu", vstruct_bytes(sb));
+ pr_newline(out);
+
+ pr_buf(out, "Clean:");
+ pr_tab(out);
+ pr_buf(out, "%llu", BCH_SB_CLEAN(sb));
+ pr_newline(out);
+
+ pr_buf(out, "Devices:");
+ pr_tab(out);
+ pr_buf(out, "%u", nr_devices);
+ pr_newline(out);
+
+ pr_buf(out, "Sections:");
+ vstruct_for_each(sb, f)
+ fields_have |= 1 << le32_to_cpu(f->type);
+ pr_tab(out);
+ bch2_flags_to_text(out, bch2_sb_fields, fields_have);
+ pr_newline(out);
+
+ pr_buf(out, "Features:");
+ pr_tab(out);
+ bch2_flags_to_text(out, bch2_sb_features,
+ le64_to_cpu(sb->features[0]));
+ pr_newline(out);
+
+ pr_buf(out, "Compat features:");
+ pr_tab(out);
+ bch2_flags_to_text(out, bch2_sb_compat,
+ le64_to_cpu(sb->compat[0]));
+ pr_newline(out);
+
+ pr_newline(out);
+ pr_buf(out, "Options:");
+ pr_newline(out);
+ pr_indent_push(out, 2);
+ {
+ enum bch_opt_id id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, id);
+
+ pr_buf(out, "%s:", opt->attr.name);
+ pr_tab(out);
+ bch2_opt_to_text(out, NULL, sb, opt, v,
+ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
+ pr_newline(out);
+ }
+ }
+ }
+
+ pr_indent_pop(out, 2);
+
+ if (print_layout) {
+ pr_newline(out);
+ pr_buf(out, "layout:");
+ pr_newline(out);
+ pr_indent_push(out, 2);
+ bch2_sb_layout_to_text(out, &sb->layout);
+ pr_indent_pop(out, 2);
+ }
+
+ vstruct_for_each(sb, f)
+ if (fields & (1 << le32_to_cpu(f->type))) {
+ pr_newline(out);
+ bch2_sb_field_to_text(out, sb, f);
+ }
}
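
bch2_sb_to_text() selects which sections to print with a bitmask matched as `fields & (1 << f->type)`, so ~0 dumps everything. A possible caller, for illustration only (not part of this patch):

    struct printbuf out = PRINTBUF;

    bch2_sb_to_text(&out, sb, true, ~0U);   /* layout plus every section */
    printk(KERN_INFO "%s\n", out.buf);
    printbuf_exit(&out);
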
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 5c264875acb4..14a25f6fe29a 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -38,9 +38,8 @@ BCH_SB_FIELDS()
extern const char * const bch2_sb_fields[];
struct bch_sb_field_ops {
- const char * (*validate)(struct bch_sb *, struct bch_sb_field *);
- void (*to_text)(struct printbuf *, struct bch_sb *,
- struct bch_sb_field *);
+ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *);
+ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
};
static inline __le64 bch2_sb_magic(struct bch_fs *c)
@@ -66,8 +65,6 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-const char *bch2_sb_validate(struct bch_sb_handle *);
-
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
void __bch2_check_set_feature(struct bch_fs *, unsigned);
@@ -78,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
__bch2_check_set_feature(c, feat);
}
-/* BCH_SB_FIELD_journal: */
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
- return j
- ? (__le64 *) vstruct_end(&j->field) - j->buckets
- : 0;
-}
-
/* BCH_SB_FIELD_members: */
static inline bool bch2_member_exists(struct bch_member *m)
@@ -115,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
.durability = BCH_MEMBER_DURABILITY(mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
};
}
@@ -124,12 +113,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
void bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry **, u64);
-int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
int bch2_fs_mark_dirty(struct bch_fs *);
void bch2_fs_mark_clean(struct bch_fs *);
void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index df6bffeffe06..4a071711d363 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -16,6 +16,7 @@
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
+#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
@@ -198,17 +199,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
bch2_journal_flush_all_pins(&c->journal);
- /*
- * If the allocator threads didn't all start up, the btree updates to
- * write out alloc info aren't going to work:
- */
- if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
- goto nowrote_alloc;
-
bch_verbose(c, "flushing journal and stopping allocators");
bch2_journal_flush_all_pins(&c->journal);
- set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
do {
clean_passes++;
@@ -233,17 +226,11 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "flushing journal and stopping allocators complete");
set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-nowrote_alloc:
+
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
flush_work(&c->btree_interior_update_work);
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_stop(ca);
-
- clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
- clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
-
bch2_fs_journal_stop(&c->journal);
/*
@@ -279,10 +266,6 @@ void bch2_fs_read_only(struct bch_fs *c)
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch2_dev_allocator_stop()).
*/
percpu_ref_kill(&c->writes);
@@ -411,19 +394,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for_each_rw_member(ca, c, i) {
- ret = bch2_dev_allocator_start(ca);
- if (ret) {
- bch_err(c, "error starting allocator threads");
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
-
- set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
- for_each_rw_member(ca, c, i)
- bch2_wake_allocator(ca);
+ bch2_do_discards(c);
if (!early) {
ret = bch2_fs_read_write_late(c);
@@ -468,6 +439,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
+ bch2_fs_buckets_waiting_for_journal_exit(c);
bch2_fs_btree_interior_update_exit(c);
bch2_fs_btree_iter_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
@@ -528,6 +500,8 @@ void __bch2_fs_stop(struct bch_fs *c)
set_bit(BCH_FS_STOPPING, &c->flags);
+ cancel_work_sync(&c->journal_seq_blacklist_gc_work);
+
down_write(&c->state_lock);
bch2_fs_read_only(c);
up_write(&c->state_lock);
@@ -670,6 +644,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
@@ -690,6 +665,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
spin_lock_init(&c->btree_write_error_lock);
+ INIT_WORK(&c->journal_seq_blacklist_gc_work,
+ bch2_blacklist_entries_gc);
+
INIT_LIST_HEAD(&c->journal_entries);
INIT_LIST_HEAD(&c->journal_iters);
@@ -737,7 +715,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
+ uuid_unparse_lower(c->sb.user_uuid.b, c->name);
/* Compat: */
if (sb->version <= bcachefs_metadata_version_inode_v2 &&
@@ -755,6 +733,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_opts_apply(&c->opts, opts);
+ /* key cache currently disabled for inodes, because of snapshots: */
+ c->opts.inodes_use_key_cache = 0;
+
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+ if (c->opts.inodes_use_key_cache)
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+
c->block_bits = ilog2(block_sectors(c));
c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
@@ -805,6 +790,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
bch2_fs_io_init(c) ?:
bch2_fs_encryption_init(c) ?:
@@ -814,9 +800,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- if (c->opts.nochanges)
- set_bit(JOURNAL_NOCHANGES, &c->journal.flags);
-
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@@ -852,12 +835,9 @@ noinline_for_stack
static void print_mount_opts(struct bch_fs *c)
{
enum bch_opt_id i;
- char buf[512];
- struct printbuf p = PBUF(buf);
+ struct printbuf p = PRINTBUF;
bool first = true;
- strcpy(buf, "(null)");
-
if (c->opts.read_only) {
pr_buf(&p, "ro");
first = false;
@@ -876,10 +856,14 @@ static void print_mount_opts(struct bch_fs *c)
if (!first)
pr_buf(&p, ",");
first = false;
- bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE);
+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
}
- bch_info(c, "mounted with opts: %s", buf);
+ if (!p.pos)
+ pr_buf(&p, "(null)");
+
+ bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf);
+ printbuf_exit(&p);
}
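
print_mount_opts() above shows the conversion pattern used throughout this patch for replacing fixed on-stack buffers: declare with PRINTBUF, append with pr_buf() and friends, read the result via .buf (guarding the empty case), and release with printbuf_exit(). The same steps as a standalone sketch, with a made-up helper name:

    static void example_log_data_types(struct bch_fs *c, unsigned data)
    {
            struct printbuf p = PRINTBUF;

            bch2_flags_to_text(&p, bch2_data_types, data);
            if (!p.pos)
                    pr_buf(&p, "(none)");

            bch_info(c, "data types: %s", p.buf);
            printbuf_exit(&p);
    }
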
int bch2_fs_start(struct bch_fs *c)
@@ -927,20 +911,6 @@ int bch2_fs_start(struct bch_fs *c)
set_bit(BCH_FS_STARTED, &c->flags);
- /*
- * Allocator threads don't start filling copygc reserve until after we
- * set BCH_FS_STARTED - wake them now:
- *
- * XXX ugly hack:
- * Need to set ca->allocator_state here instead of relying on the
- * allocator threads to do it to avoid racing with the copygc threads
- * checking it and thinking they have no alloc reserve:
- */
- for_each_online_member(ca, c, i) {
- ca->allocator_state = ALLOCATOR_running;
- bch2_wake_allocator(ca);
- }
-
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
@@ -1032,8 +1002,6 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
- bch2_dev_allocator_stop(ca);
-
cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs &&
@@ -1148,8 +1116,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
- if (opt_defined(c->opts, discard))
- ca->mi.discard = opt_get(c->opts, discard);
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
@@ -1200,12 +1168,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->fs = c;
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- bch2_dev_allocator_start(ca)) {
- bch2_dev_free(ca);
- goto err;
- }
-
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
@@ -1251,6 +1213,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
+ ca->dev = ca->disk_sb.bdev->bd_dev;
+
percpu_ref_reinit(&ca->io_ref);
return 0;
@@ -1389,14 +1353,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
- bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca);
bch2_copygc_start(c);
}
-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
@@ -1404,8 +1367,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
-
- return bch2_dev_allocator_start(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1432,7 +1393,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
mutex_unlock(&c->sb_lock);
if (new_state == BCH_MEMBER_STATE_rw)
- ret = __bch2_dev_read_write(c, ca);
+ __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
@@ -1455,30 +1416,20 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- struct btree_trans trans;
- size_t i;
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
- for (i = 0; i < ca->mi.nbuckets; i++) {
- ret = lockrestart_do(&trans,
- bch2_btree_key_cache_flush(&trans,
- BTREE_ID_alloc, POS(ca->dev_idx, i)));
- if (ret)
- break;
- }
- bch2_trans_exit(&trans);
-
- if (ret) {
+ ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_NORUN, NULL);
+ if (ret)
bch_err(c, "error %i removing dev alloc info", ret);
- return ret;
- }
- return bch2_btree_delete_range(c, BTREE_ID_alloc,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx + 1, 0),
- NULL);
+ return ret;
}
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
@@ -1543,11 +1494,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
data = bch2_dev_has_data(c, ca);
if (data) {
- char data_has_str[100];
+ struct printbuf data_has = PRINTBUF;
- bch2_flags_to_text(&PBUF(data_has_str),
- bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ bch2_flags_to_text(&data_has, bch2_data_types, data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ printbuf_exit(&data_has);
ret = -EBUSY;
goto err;
}
@@ -1596,52 +1547,58 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
+ struct printbuf errbuf = PRINTBUF;
int ret;
ret = bch2_read_super(path, &opts, &sb);
- if (ret)
- return ret;
-
- err = bch2_sb_validate(&sb);
- if (err)
- return -EINVAL;
+ if (ret) {
+ bch_err(c, "device add error: error reading super: %i", ret);
+ goto err;
+ }
dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
err = bch2_dev_may_add(sb.sb, c);
- if (err)
- return -EINVAL;
+ if (err) {
+ bch_err(c, "device add error: %s", err);
+ ret = -EINVAL;
+ goto err;
+ }
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
bch2_free_super(&sb);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err;
}
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret) {
bch2_dev_free(ca);
- return ret;
+ goto err;
}
- err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
- if (ret)
+ if (ret) {
+ bch_err(c, "device add error: journal alloc failed");
goto err;
+ }
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
- err = "insufficient space in new superblock";
ret = bch2_sb_from_fs(c, ca);
- if (ret)
+ if (ret) {
+ bch_err(c, "device add error: new device superblock too small");
goto err_unlock;
+ }
mi = bch2_sb_get_members(ca->disk_sb.sb);
if (!bch2_sb_resize_members(&ca->disk_sb,
le32_to_cpu(mi->field.u64s) +
sizeof(dev_mi) / sizeof(u64))) {
+ bch_err(c, "device add error: new device superblock too small");
ret = -ENOSPC;
goto err_unlock;
}
@@ -1654,7 +1611,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
goto have_slot;
no_slot:
- err = "no slots available in superblock";
+ bch_err(c, "device add error: already have maximum number of devices");
ret = -ENOSPC;
goto err_unlock;
@@ -1663,12 +1620,12 @@ have_slot:
u64s = (sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
- err = "no space in superblock for member info";
- ret = -ENOSPC;
-
mi = bch2_sb_resize_members(&c->disk_sb, u64s);
- if (!mi)
+ if (!mi) {
+ bch_err(c, "device add error: no room in superblock for member info");
+ ret = -ENOSPC;
goto err_unlock;
+ }
/* success: */
@@ -1684,18 +1641,22 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
- err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret)
+ if (ret) {
+ bch_err(c, "device add error: error marking new superblock: %i", ret);
goto err_late;
+ }
+
+ ret = bch2_fs_freespace_init(c);
+ if (ret) {
+ bch_err(c, "device add error: error initializing free space: %i", ret);
+ goto err_late;
+ }
ca->new_fs_bucket_idx = 0;
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret)
- goto err_late;
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return 0;
@@ -1707,12 +1668,12 @@ err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- bch_err(c, "Unable to add device: %s", err);
+ printbuf_exit(&errbuf);
return ret;
err_late:
up_write(&c->state_lock);
- bch_err(c, "Error going rw after adding device: %s", err);
- return -EINVAL;
+ ca = NULL;
+ goto err;
}
/* Hot add existing device to running filesystem: */
@@ -1755,11 +1716,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
goto err;
}
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret)
- goto err;
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
@@ -1846,20 +1804,14 @@ err:
}
/* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
struct bch_dev *ca;
- dev_t dev;
unsigned i;
- int ret;
-
- ret = lookup_bdev(path, &dev);
- if (ret)
- return ERR_PTR(ret);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL)
- if (ca->disk_sb.bdev->bd_dev == dev)
+ if (!strcmp(name, ca->name))
goto found;
ca = ERR_PTR(-ENOENT);
found:
@@ -1878,18 +1830,17 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_sb_field_members *mi;
unsigned i, best_sb = 0;
const char *err;
+ struct printbuf errbuf = PRINTBUF;
int ret = 0;
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-ENODEV);
+
pr_verbose_init(opts, "");
if (!nr_devices) {
- c = ERR_PTR(-EINVAL);
- goto out2;
- }
-
- if (!try_module_get(THIS_MODULE)) {
- c = ERR_PTR(-ENODEV);
- goto out2;
+ ret = -EINVAL;
+ goto err;
}
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
@@ -1903,9 +1854,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
if (ret)
goto err;
- err = bch2_sb_validate(&sb[i]);
- if (err)
- goto err_print;
}
for (i = 1; i < nr_devices; i++)
@@ -1960,8 +1908,8 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
}
out:
kfree(sb);
+ printbuf_exit(&errbuf);
module_put(THIS_MODULE);
-out2:
pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;
err_print:
@@ -1978,81 +1926,6 @@ err:
goto out;
}
-static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
- struct bch_opts opts)
-{
- const char *err;
- struct bch_fs *c;
- bool allocated_fs = false;
- int ret;
-
- err = bch2_sb_validate(sb);
- if (err)
- return err;
-
- mutex_lock(&bch_fs_list_lock);
- c = __bch2_uuid_to_fs(sb->sb->uuid);
- if (c) {
- closure_get(&c->cl);
-
- err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
- if (err)
- goto err;
- } else {
- allocated_fs = true;
- c = bch2_fs_alloc(sb->sb, opts);
-
- err = "bch2_fs_alloc() error";
- if (IS_ERR(c))
- goto err;
- }
-
- err = "bch2_dev_online() error";
-
- mutex_lock(&c->sb_lock);
- if (bch2_dev_attach_bdev(c, sb)) {
- mutex_unlock(&c->sb_lock);
- goto err;
- }
- mutex_unlock(&c->sb_lock);
-
- if (!c->opts.nostart && bch2_fs_may_start(c)) {
- err = "error starting filesystem";
- ret = bch2_fs_start(c);
- if (ret)
- goto err;
- }
-
- closure_put(&c->cl);
- mutex_unlock(&bch_fs_list_lock);
-
- return NULL;
-err:
- mutex_unlock(&bch_fs_list_lock);
-
- if (allocated_fs && !IS_ERR(c))
- bch2_fs_stop(c);
- else if (c)
- closure_put(&c->cl);
-
- return err;
-}
-
-const char *bch2_fs_open_incremental(const char *path)
-{
- struct bch_sb_handle sb;
- struct bch_opts opts = bch2_opts_empty();
- const char *err;
-
- if (bch2_read_super(path, &opts, &sb))
- return "error reading superblock";
-
- err = __bch2_fs_open_incremental(&sb, opts);
- bch2_free_super(&sb);
-
- return err;
-}
-
/* Global interfaces/init */
static void bcachefs_exit(void)
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index c3273e9c711d..6d3efda26e63 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -26,6 +26,12 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
return remainder;
}
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+ u32 *offset)
+{
+ return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
+
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
return !percpu_ref_is_zero(&ca->io_ref);
@@ -254,6 +260,5 @@ void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
-const char *bch2_fs_open_incremental(const char *path);
#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index d8b159a5b7f7..89419fc7930d 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -32,6 +32,7 @@ struct bch_member_cpu {
u8 discard;
u8 data_allowed;
u8 durability;
+ u8 freespace_initialized;
u8 valid;
};
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 07e9b214bcb5..2594fec4b821 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -46,8 +46,28 @@ struct sysfs_ops type ## _sysfs_ops = { \
}
#define SHOW(fn) \
+static ssize_t fn ## _to_text(struct printbuf *, \
+ struct kobject *, struct attribute *);\
+ \
static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
char *buf) \
+{ \
+ struct printbuf out = PRINTBUF; \
+ ssize_t ret = fn ## _to_text(&out, kobj, attr); \
+ \
+ if (!ret && out.allocation_failure) \
+ ret = -ENOMEM; \
+ \
+ if (!ret) { \
+ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
+ memcpy(buf, out.buf, ret); \
+ } \
+ printbuf_exit(&out); \
+ return ret; \
+} \
+ \
+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
+ struct attribute *attr)
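
With the reworked SHOW() above, an attribute handler is written as a fn##_to_text() body against `out`, and the sysfs_print()/sysfs_printf() helpers further down append to that same printbuf instead of returning early. A minimal hypothetical user, shown only to illustrate the shape (the real handlers appear later in this file):

    SHOW(example_obj)
    {
            struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

            sysfs_print(minor, c->minor);                   /* single value */

            if (attr == &sysfs_journal_debug)               /* multi-line dump */
                    bch2_journal_debug_to_text(out, &c->journal);

            return 0;
    }
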
#define STORE(fn) \
static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
@@ -64,22 +84,19 @@ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
#define sysfs_printf(file, fmt, ...) \
do { \
if (attr == &sysfs_ ## file) \
- return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
+ pr_buf(out, fmt "\n", __VA_ARGS__); \
} while (0)
#define sysfs_print(file, var) \
do { \
if (attr == &sysfs_ ## file) \
- return snprint(buf, PAGE_SIZE, var); \
+ snprint(out, var); \
} while (0)
#define sysfs_hprint(file, val) \
do { \
- if (attr == &sysfs_ ## file) { \
- bch2_hprint(&out, val); \
- pr_buf(&out, "\n"); \
- return out.pos - buf; \
- } \
+ if (attr == &sysfs_ ## file) \
+ bch2_hprint(out, val); \
} while (0)
#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
@@ -153,13 +170,10 @@ read_attribute(congested);
read_attribute(btree_avg_write_size);
-read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(journal_pins);
read_attribute(btree_updates);
-read_attribute(dirty_btree_nodes);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_transactions);
@@ -170,11 +184,11 @@ read_attribute(internal_uuid);
read_attribute(has_data);
read_attribute(alloc_debug);
-write_attribute(wake_allocator);
read_attribute(read_realloc_races);
read_attribute(extent_migrate_done);
read_attribute(extent_migrate_raced);
+read_attribute(bucket_alloc_fail);
rw_attribute(discard);
rw_attribute(label);
@@ -192,7 +206,7 @@ read_attribute(new_stripes);
read_attribute(io_timers_read);
read_attribute(io_timers_write);
-read_attribute(data_op_data_progress);
+read_attribute(data_jobs);
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
@@ -230,32 +244,20 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c)
return nr ? div64_u64(sectors, nr) : 0;
}
-static long stats_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_move_stats *stats)
-{
- pr_buf(out, "%s: data type %s btree_id %s position: ",
- stats->name,
- bch2_data_types[stats->data_type],
- bch2_btree_ids[stats->btree_id]);
- bch2_bpos_to_text(out, stats->pos);
- pr_buf(out, "%s", "\n");
-
- return 0;
-}
-
static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
{
long ret = 0;
- struct bch_move_stats *iter;
+ struct bch_move_stats *stats;
mutex_lock(&c->data_progress_lock);
-
- if (list_empty(&c->data_progress_list))
- pr_buf(out, "%s", "no progress to report\n");
- else
- list_for_each_entry(iter, &c->data_progress_list, list) {
- stats_to_text(out, c, iter);
- }
+ list_for_each_entry(stats, &c->data_progress_list, list) {
+ pr_buf(out, "%s: data type %s btree_id %s position: ",
+ stats->name,
+ bch2_data_types[stats->data_type],
+ bch2_btree_ids[stats->btree_id]);
+ bch2_bpos_to_text(out, stats->pos);
+ pr_buf(out, "%s", "\n");
+ }
mutex_unlock(&c->data_progress_lock);
return ret;
@@ -266,8 +268,12 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
+ enum btree_id id;
+ u64 nr_uncompressed_extents = 0,
nr_compressed_extents = 0,
+ nr_incompressible_extents = 0,
+ uncompressed_sectors = 0,
+ incompressible_sectors = 0,
compressed_sectors_compressed = 0,
compressed_sectors_uncompressed = 0;
int ret;
@@ -277,47 +283,72 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret)
- if (k.k->type == KEY_TYPE_extent) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!((1U << id) & BTREE_ID_HAS_PTRS))
+ continue;
+
+ for_each_btree_key(&trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
-
- extent_for_each_ptr_decode(e, p, entry) {
- if (!crc_is_compressed(p.crc)) {
- nr_uncompressed_extents++;
- uncompressed_sectors += e.k->size;
- } else {
- nr_compressed_extents++;
+ bool compressed = false, uncompressed = false, incompressible = false;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ switch (p.crc.compression_type) {
+ case BCH_COMPRESSION_TYPE_none:
+ uncompressed = true;
+ uncompressed_sectors += k.k->size;
+ break;
+ case BCH_COMPRESSION_TYPE_incompressible:
+ incompressible = true;
+ incompressible_sectors += k.k->size;
+ break;
+ default:
compressed_sectors_compressed +=
p.crc.compressed_size;
compressed_sectors_uncompressed +=
p.crc.uncompressed_size;
+ compressed = true;
+ break;
}
-
- /* only looking at the first ptr */
- break;
}
+
+ if (incompressible)
+ nr_incompressible_extents++;
+ else if (uncompressed)
+ nr_uncompressed_extents++;
+ else if (compressed)
+ nr_compressed_extents++;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(&trans, &iter);
+ }
bch2_trans_exit(&trans);
+
if (ret)
return ret;
- pr_buf(out,
- "uncompressed data:\n"
- " nr extents: %llu\n"
- " size (bytes): %llu\n"
- "compressed data:\n"
- " nr extents: %llu\n"
- " compressed size (bytes): %llu\n"
- " uncompressed size (bytes): %llu\n",
- nr_uncompressed_extents,
- uncompressed_sectors << 9,
- nr_compressed_extents,
- compressed_sectors_compressed << 9,
- compressed_sectors_uncompressed << 9);
+ pr_buf(out, "uncompressed:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents);
+ pr_buf(out, " size: ");
+ bch2_hprint(out, uncompressed_sectors << 9);
+ pr_buf(out, "\n");
+
+ pr_buf(out, "compressed:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_compressed_extents);
+ pr_buf(out, " compressed size: ");
+ bch2_hprint(out, compressed_sectors_compressed << 9);
+ pr_buf(out, "\n");
+ pr_buf(out, " uncompressed size: ");
+ bch2_hprint(out, compressed_sectors_uncompressed << 9);
+ pr_buf(out, "\n");
+
+ pr_buf(out, "incompressible:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents);
+ pr_buf(out, " size: ");
+ bch2_hprint(out, incompressible_sectors << 9);
+ pr_buf(out, "\n");
return 0;
}
@@ -331,7 +362,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
@@ -345,13 +375,13 @@ SHOW(bch2_fs)
atomic_long_read(&c->extent_migrate_done));
sysfs_print(extent_migrate_raced,
atomic_long_read(&c->extent_migrate_raced));
+ sysfs_print(bucket_alloc_fail,
+ atomic_long_read(&c->bucket_alloc_fail));
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
- if (attr == &sysfs_gc_gens_pos) {
- bch2_gc_gens_pos_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_gc_gens_pos)
+ bch2_gc_gens_pos_to_text(out, c);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
@@ -361,83 +391,48 @@ SHOW(bch2_fs)
max(0LL, c->copygc_wait -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
- if (attr == &sysfs_rebalance_work) {
- bch2_rebalance_work_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_rebalance_work)
+ bch2_rebalance_work_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
/* Debugging: */
- if (attr == &sysfs_journal_debug) {
- bch2_journal_debug_to_text(&out, &c->journal);
- return out.pos - buf;
- }
+ if (attr == &sysfs_journal_debug)
+ bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_journal_pins) {
- bch2_journal_pins_to_text(&out, &c->journal);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_updates)
+ bch2_btree_updates_to_text(out, c);
- if (attr == &sysfs_btree_updates) {
- bch2_btree_updates_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_cache)
+ bch2_btree_cache_to_text(out, c);
- if (attr == &sysfs_dirty_btree_nodes) {
- bch2_dirty_btree_nodes_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_key_cache)
+ bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
- if (attr == &sysfs_btree_cache) {
- bch2_btree_cache_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_transactions)
+ bch2_btree_trans_to_text(out, c);
- if (attr == &sysfs_btree_key_cache) {
- bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
- return out.pos - buf;
- }
+ if (attr == &sysfs_stripes_heap)
+ bch2_stripes_heap_to_text(out, c);
- if (attr == &sysfs_btree_transactions) {
- bch2_btree_trans_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c);
- if (attr == &sysfs_stripes_heap) {
- bch2_stripes_heap_to_text(&out, c);
- return out.pos - buf;
- }
-
- if (attr == &sysfs_open_buckets) {
- bch2_open_buckets_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_compression_stats)
+ bch2_compression_stats_to_text(out, c);
- if (attr == &sysfs_compression_stats) {
- bch2_compression_stats_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_new_stripes)
+ bch2_new_stripes_to_text(out, c);
- if (attr == &sysfs_new_stripes) {
- bch2_new_stripes_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_timers_read)
+ bch2_io_timers_to_text(out, &c->io_clock[READ]);
- if (attr == &sysfs_io_timers_read) {
- bch2_io_timers_to_text(&out, &c->io_clock[READ]);
- return out.pos - buf;
- }
- if (attr == &sysfs_io_timers_write) {
- bch2_io_timers_to_text(&out, &c->io_clock[WRITE]);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_timers_write)
+ bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
- if (attr == &sysfs_data_op_data_progress) {
- data_progress_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_data_jobs)
+ data_progress_to_text(out, c);
return 0;
}
@@ -482,6 +477,17 @@ STORE(bch2_fs)
/* Debugging: */
+ if (!test_bit(BCH_FS_RW, &c->flags))
+ return -EROFS;
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+ }
+
if (attr == &sysfs_trigger_gc) {
/*
* Full gc is currently incompatible with btree key cache:
@@ -495,14 +501,6 @@ STORE(bch2_fs)
#endif
}
- if (attr == &sysfs_prune_cache) {
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
- }
-
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -547,7 +545,7 @@ struct attribute *bch2_fs_files[] = {
SHOW(bch2_fs_internal)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
- return bch2_fs_show(&c->kobj, attr, buf);
+ return bch2_fs_to_text(out, &c->kobj, attr);
}
STORE(bch2_fs_internal)
@@ -559,9 +557,7 @@ SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_debug,
- &sysfs_journal_pins,
&sysfs_btree_updates,
- &sysfs_dirty_btree_nodes,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_btree_transactions,
@@ -577,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
&sysfs_extent_migrate_raced,
+ &sysfs_bucket_alloc_fail,
&sysfs_gc_gens_pos,
@@ -587,7 +584,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
- &sysfs_data_op_data_progress,
+ &sysfs_data_jobs,
&sysfs_internal_uuid,
NULL
@@ -597,39 +594,47 @@ struct attribute *bch2_fs_internal_files[] = {
SHOW(bch2_fs_opts_dir)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
int id = opt - bch2_opt_table;
u64 v = bch2_opt_get_by_id(&c->opts, id);
- bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST);
- pr_buf(&out, "\n");
+ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
+ pr_char(out, '\n');
- return out.pos - buf;
+ return 0;
}
STORE(bch2_fs_opts_dir)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
- int ret, id = opt - bch2_opt_table;
+ int ret = size, id = opt - bch2_opt_table;
char *tmp;
u64 v;
+ /*
+ * We don't need to take c->writes for correctness, but it eliminates an
+ * unsightly error message in the dmesg log when we're RO:
+ */
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return -EROFS;
+
tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto err;
+ }
- ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v);
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
kfree(tmp);
if (ret < 0)
- return ret;
+ goto err;
ret = bch2_opt_check_may_set(c, id, v);
if (ret < 0)
- return ret;
+ goto err;
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
@@ -639,8 +644,9 @@ STORE(bch2_fs_opts_dir)
bch2_rebalance_add_work(c, S64_MAX);
rebalance_wakeup(c);
}
-
- return size;
+err:
+ percpu_ref_put(&c->writes);
+ return ret;
}
SYSFS_OPS(bch2_fs_opts_dir);
@@ -670,13 +676,10 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj)
SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
#define x(name) \
- if (attr == &sysfs_time_stat_##name) { \
- bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\
- return out.pos - buf; \
- }
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
BCH_TIME_STATS()
#undef x
@@ -697,24 +700,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- enum alloc_reserve i;
-
- spin_lock(&ca->fs->freelist_lock);
-
- pr_buf(out, "free_inc:\t%zu\t%zu\n",
- fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- for (i = 0; i < RESERVE_NR; i++)
- pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
- fifo_used(&ca->free[i]),
- ca->free[i].size);
-
- spin_unlock(&ca->fs->freelist_lock);
-}
-
static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
@@ -740,9 +725,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"ec\t%16llu\n"
"available%15llu\n"
"\n"
- "free_inc\t\t%zu/%zu\n"
- "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
- "free[RESERVE_NONE]\t%zu/%zu\n"
"freelist_wait\t\t%s\n"
"open buckets allocated\t%u\n"
"open buckets this dev\t%u\n"
@@ -750,13 +732,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"open_buckets_wait\t%s\n"
"open_buckets_btree\t%u\n"
"open_buckets_user\t%u\n"
- "btree reserve cache\t%u\n"
- "thread state:\t\t%s\n",
+ "btree reserve cache\t%u\n",
stats.buckets_ec,
- __dev_buckets_available(ca, stats),
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
+ __dev_buckets_available(ca, stats, RESERVE_none),
c->freelist_wait.list.first ? "waiting" : "empty",
OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
ca->nr_open_buckets,
@@ -764,8 +742,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_btree],
nr[BCH_DATA_user],
- c->btree_reserve_cache_nr,
- bch2_allocator_states[ca->allocator_state]);
+ c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {
@@ -792,7 +769,6 @@ SHOW(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
@@ -805,58 +781,44 @@ SHOW(bch2_dev)
if (attr == &sysfs_label) {
if (ca->mi.group) {
mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(&out, &c->disk_sb,
+ bch2_disk_path_to_text(out, c->disk_sb.sb,
ca->mi.group - 1);
mutex_unlock(&c->sb_lock);
}
- pr_buf(&out, "\n");
- return out.pos - buf;
+ pr_char(out, '\n');
}
if (attr == &sysfs_has_data) {
- bch2_flags_to_text(&out, bch2_data_types,
+ bch2_flags_to_text(out, bch2_data_types,
bch2_dev_has_data(c, ca));
- pr_buf(&out, "\n");
- return out.pos - buf;
+ pr_char(out, '\n');
}
if (attr == &sysfs_state_rw) {
- bch2_string_opt_to_text(&out, bch2_member_states,
+ bch2_string_opt_to_text(out, bch2_member_states,
ca->mi.state);
- pr_buf(&out, "\n");
- return out.pos - buf;
+ pr_char(out, '\n');
}
- if (attr == &sysfs_iodone) {
- dev_iodone_to_text(&out, ca);
- return out.pos - buf;
- }
+ if (attr == &sysfs_iodone)
+ dev_iodone_to_text(out, ca);
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
- if (attr == &sysfs_io_latency_stats_read) {
- bch2_time_stats_to_text(&out, &ca->io_latency[READ]);
- return out.pos - buf;
- }
- if (attr == &sysfs_io_latency_stats_write) {
- bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_latency_stats_read)
+ bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+
+ if (attr == &sysfs_io_latency_stats_write)
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
- if (attr == &sysfs_reserve_stats) {
- reserve_stats_to_text(&out, ca);
- return out.pos - buf;
- }
- if (attr == &sysfs_alloc_debug) {
- dev_alloc_debug_to_text(&out, ca);
- return out.pos - buf;
- }
+ if (attr == &sysfs_alloc_debug)
+ dev_alloc_debug_to_text(out, ca);
return 0;
}
@@ -894,9 +856,6 @@ STORE(bch2_dev)
return ret;
}
- if (attr == &sysfs_wake_allocator)
- bch2_wake_allocator(ca);
-
return size;
}
SYSFS_OPS(bch2_dev);
@@ -922,11 +881,8 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write,
&sysfs_congested,
- &sysfs_reserve_stats,
-
/* debug: */
&sysfs_alloc_debug,
- &sysfs_wake_allocator,
NULL
};
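
Note on the sysfs conversion above: show handlers no longer format into the raw page buffer and return `out.pos - buf`; they now write into a caller-supplied `struct printbuf *out` and simply return 0 (or an errno), with the SHOW() wrapper responsible for handing the formatted output back to sysfs. A minimal sketch of the new pattern, assuming the printbuf helpers added later in this patch; example_debug_to_text() is a hypothetical helper, not part of the patch:

	/* hypothetical helper, not part of the patch: */
	static void example_debug_to_text(struct printbuf *out, struct bch_fs *c)
	{
		pr_buf(out, "uuid:\t%pU", c->sb.uuid.b);
		pr_newline(out);
	}

	/* inside a SHOW() body: write into *out, then fall through to return 0 */
	if (attr == &sysfs_internal_uuid)
		example_debug_to_text(out, c);
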
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 60ccb94e5de5..4369bfc55a94 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -4,6 +4,7 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "journal_reclaim.h"
+#include "subvolume.h"
#include "tests.h"
#include "linux/kthread.h"
@@ -14,15 +15,14 @@ static void delete_test_keys(struct bch_fs *c)
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX),
- SPOS(0, U64_MAX, U32_MAX),
+ SPOS(0, 0, U32_MAX), SPOS_MAX,
+ 0,
NULL);
BUG_ON(ret);
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- SPOS(0, U64_MAX, U32_MAX),
- NULL);
+ SPOS(0, 0, U32_MAX), SPOS_MAX,
+ 0, NULL);
BUG_ON(ret);
}
@@ -146,7 +146,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
i = 0;
for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- POS_MIN, 0, k, ret) {
+ SPOS(0, 0, U32_MAX), 0, k, ret) {
if (k.k->p.inode)
break;
@@ -202,7 +202,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
i = 0;
for_each_btree_key(&trans, iter, BTREE_ID_extents,
- POS_MIN, 0, k, ret) {
+ SPOS(0, 0, U32_MAX), 0, k, ret) {
BUG_ON(bkey_start_offset(k.k) != i);
i = k.k->p.offset;
}
@@ -256,8 +256,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
- 0, k, ret) {
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0, k, ret) {
if (k.k->p.inode)
break;
@@ -272,7 +272,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
BTREE_ITER_SLOTS, k, ret) {
BUG_ON(k.k->p.offset != i);
BUG_ON(bkey_deleted(k.k) != (i & 1));
@@ -321,8 +322,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
- 0, k, ret) {
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), 0, k, ret) {
BUG_ON(bkey_start_offset(k.k) != i + 8);
BUG_ON(k.k->size != 8);
i += 16;
@@ -335,7 +336,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX),
BTREE_ITER_SLOTS, k, ret) {
BUG_ON(bkey_deleted(k.k) != !(i % 16));
@@ -363,7 +365,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
struct bkey_s_c k;
bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
@@ -383,7 +386,8 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
struct bkey_s_c k;
bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), 0);
k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
@@ -406,8 +410,6 @@ static int insert_test_extent(struct bch_fs *c,
struct bkey_i_cookie k;
int ret;
- //pr_info("inserting %llu-%llu v %llu", start, end, test_version);
-
bkey_cookie_init(&k.k_i);
k.k_i.k.p.offset = end;
k.k_i.k.p.snapshot = U32_MAX;
@@ -459,6 +461,70 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
__test_extent_overwrite(c, 32, 64, 32, 128);
}
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie cookie;
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = snapid_hi;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, snapid_lo), 0);
+ k = bch2_btree_iter_peek(&iter);
+
+ BUG_ON(k.k->p.snapshot != U32_MAX);
+
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+ struct bkey_i_cookie cookie;
+ u32 snapids[2];
+ u32 snapid_subvols[2] = { 1, 1 };
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = U32_MAX;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_snapshot_node_create(&trans, U32_MAX,
+ snapids,
+ snapid_subvols,
+ 2));
+ if (ret)
+ return ret;
+
+ if (snapids[0] > snapids[1])
+ swap(snapids[0], snapids[1]);
+
+ ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+ if (ret) {
+ bch_err(c, "err %i from test_snapshot_filter", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
/* perf tests */
static u64 test_rand(void)
@@ -747,7 +813,8 @@ static int seq_delete(struct bch_fs *c, u64 nr)
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS_MAX, NULL);
+ SPOS(0, 0, U32_MAX), SPOS_MAX,
+ 0, NULL);
if (ret)
bch_err(c, "error in seq_delete: %i", ret);
return ret;
@@ -785,8 +852,10 @@ static int btree_perf_test_thread(void *data)
}
ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
- if (ret)
+ if (ret) {
+ bch_err(j->c, "%ps: error %i", j->fn, ret);
j->ret = ret;
+ }
if (atomic_dec_and_test(&j->done)) {
j->finish = sched_clock();
@@ -800,7 +869,9 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
u64 nr, unsigned nr_threads)
{
struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
- char name_buf[20], nr_buf[20], per_sec_buf[20];
+ char name_buf[20];
+ struct printbuf nr_buf = PRINTBUF;
+ struct printbuf per_sec_buf = PRINTBUF;
unsigned i;
u64 time;
@@ -839,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
perf_test(test_extent_overwrite_middle);
perf_test(test_extent_overwrite_all);
+ perf_test(test_snapshots);
+
if (!j.fn) {
pr_err("unknown test %s", testname);
return -EINVAL;
@@ -859,13 +932,15 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
time = j.finish - j.start;
scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
- bch2_hprint(&PBUF(nr_buf), nr);
- bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time));
+ bch2_hprint(&nr_buf, nr);
+ bch2_hprint(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
- name_buf, nr_buf, nr_threads,
+ name_buf, nr_buf.buf, nr_threads,
div_u64(time, NSEC_PER_SEC),
div_u64(time * nr_threads, nr),
- per_sec_buf);
+ per_sec_buf.buf);
+ printbuf_exit(&per_sec_buf);
+ printbuf_exit(&nr_buf);
return j.ret;
}
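
The test updates above consistently replace POS_MIN with SPOS(0, 0, U32_MAX) as the iteration start, so the starting position carries an explicit snapshot field, and bch2_btree_delete_range() now takes an extra flags argument. A minimal sketch of the iteration pattern these tests now use; count_xattr_keys() is a hypothetical helper, not taken from the patch:

	static int count_xattr_keys(struct bch_fs *c, u64 *nr)
	{
		struct btree_trans trans;
		struct btree_iter iter;
		struct bkey_s_c k;
		int ret;

		bch2_trans_init(&trans, c, 0, 0);

		*nr = 0;
		/* start at inode 0, offset 0, snapshot U32_MAX: */
		for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
				   SPOS(0, 0, U32_MAX), 0, k, ret)
			(*nr)++;
		bch2_trans_iter_exit(&trans, &iter);

		bch2_trans_exit(&trans);
		return ret;
	}
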
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 52de7c49cacb..37fc20413764 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -99,6 +99,71 @@ STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
STRTO_H(strtou64, u64)
+static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra)
+{
+ unsigned new_size;
+ char *buf;
+
+ if (out->pos + extra + 1 < out->size)
+ return 0;
+
+ new_size = roundup_pow_of_two(out->size + extra);
+ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC);
+
+ if (!buf) {
+ out->allocation_failure = true;
+ return -ENOMEM;
+ }
+
+ out->buf = buf;
+ out->size = new_size;
+ return 0;
+}
+
+void bch2_pr_buf(struct printbuf *out, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ do {
+ va_start(args, fmt);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+ va_end(args);
+ } while (len + 1 >= printbuf_remaining(out) &&
+ !bch2_printbuf_realloc(out, len + 1));
+
+ len = min_t(size_t, len,
+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+ out->pos += len;
+}
+
+void bch2_pr_tab_rjust(struct printbuf *buf)
+{
+ BUG_ON(buf->tabstop >= ARRAY_SIZE(buf->tabstops));
+
+ if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) {
+ unsigned move = buf->pos - buf->last_field;
+ unsigned shift = buf->tabstops[buf->tabstop] -
+ printbuf_linelen(buf);
+
+ bch2_printbuf_realloc(buf, shift);
+
+ if (buf->last_field + shift + 1 < buf->size) {
+ move = min(move, buf->size - 1 - buf->last_field - shift);
+
+ memmove(buf->buf + buf->last_field + shift,
+ buf->buf + buf->last_field,
+ move);
+ memset(buf->buf + buf->last_field, ' ', shift);
+ buf->pos += shift;
+ buf->buf[buf->pos] = 0;
+ }
+ }
+
+ buf->last_field = buf->pos;
+ buf->tabstop++;
+}
+
void bch2_hprint(struct printbuf *buf, s64 v)
{
int u, t = 0;
@@ -114,10 +179,25 @@ void bch2_hprint(struct printbuf *buf, s64 v)
* 103 is magic: t is in the range [-1023, 1023] and we want
* to turn it into [-9, 9]
*/
- if (u && v < 100 && v > -100)
+ if (u && t && v < 100 && v > -100)
pr_buf(buf, ".%i", t / 103);
if (u)
- pr_buf(buf, "%c", si_units[u]);
+ pr_char(buf, si_units[u]);
+}
+
+void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes)
+{
+ switch (out->units) {
+ case PRINTBUF_UNITS_RAW:
+ pr_buf(out, "%llu", raw);
+ break;
+ case PRINTBUF_UNITS_BYTES:
+ pr_buf(out, "%llu", bytes);
+ break;
+ case PRINTBUF_UNITS_HUMAN_READABLE:
+ bch2_hprint(out, bytes);
+ break;
+ }
}
void bch2_string_opt_to_text(struct printbuf *out,
@@ -136,9 +216,6 @@ void bch2_flags_to_text(struct printbuf *out,
unsigned bit, nr = 0;
bool first = true;
- if (out->pos != out->end)
- *out->pos = '\0';
-
while (list[nr])
nr++;
@@ -467,36 +544,44 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd)
pd->backpressure = 1;
}
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
{
- /* 2^64 - 1 is 20 digits, plus null byte */
- char rate[21];
- char actual[21];
- char target[21];
- char proportional[21];
- char derivative[21];
- char change[21];
- s64 next_io;
+ out->tabstops[0] = 20;
+
+ pr_buf(out, "rate:");
+ pr_tab(out);
+ bch2_hprint(out, pd->rate.rate);
+ pr_newline(out);
+
+ pr_buf(out, "target:");
+ pr_tab(out);
+ bch2_hprint(out, pd->last_target);
+ pr_newline(out);
+
+ pr_buf(out, "actual:");
+ pr_tab(out);
+ bch2_hprint(out, pd->last_actual);
+ pr_newline(out);
+
+ pr_buf(out, "proportional:");
+ pr_tab(out);
+ bch2_hprint(out, pd->last_proportional);
+ pr_newline(out);
- bch2_hprint(&PBUF(rate), pd->rate.rate);
- bch2_hprint(&PBUF(actual), pd->last_actual);
- bch2_hprint(&PBUF(target), pd->last_target);
- bch2_hprint(&PBUF(proportional), pd->last_proportional);
- bch2_hprint(&PBUF(derivative), pd->last_derivative);
- bch2_hprint(&PBUF(change), pd->last_change);
+ pr_buf(out, "derivative:");
+ pr_tab(out);
+ bch2_hprint(out, pd->last_derivative);
+ pr_newline(out);
- next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
+ pr_buf(out, "change:");
+ pr_tab(out);
+ bch2_hprint(out, pd->last_change);
+ pr_newline(out);
- return sprintf(buf,
- "rate:\t\t%s/sec\n"
- "target:\t\t%s\n"
- "actual:\t\t%s\n"
- "proportional:\t%s\n"
- "derivative:\t%s\n"
- "change:\t\t%s/sec\n"
- "next io:\t%llims\n",
- rate, target, actual, proportional,
- derivative, change, next_io);
+ pr_buf(out, "next io:");
+ pr_tab(out);
+ pr_buf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
+ pr_newline(out);
}
/* misc: */
@@ -579,19 +664,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
-void bch_scnmemcpy(struct printbuf *out,
- const char *src, size_t len)
-{
- size_t n = printbuf_remaining(out);
-
- if (n) {
- n = min(n - 1, len);
- memcpy(out->pos, src, n);
- out->pos += n;
- *out->pos = '\0';
- }
-}
-
#include "eytzinger.h"
static int alignment_ok(const void *base, size_t align)
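
bch2_pr_buf() and bch2_printbuf_realloc() above replace the old fixed-size printbufs with a buffer that is reallocated on demand (GFP_ATOMIC when out->atomic is set), which is what lets callers such as bch2_btree_perf_test() drop their stack char arrays. A minimal usage sketch, assuming only the helpers added in this patch; example_report() is illustrative, not part of the patch:

	static void example_report(const char *name, u64 sectors)
	{
		struct printbuf buf = PRINTBUF;	/* empty; nothing allocated until first print */

		pr_buf(&buf, "%s:\t", name);
		bch2_hprint(&buf, sectors << 9);	/* sectors -> human-readable bytes */
		pr_newline(&buf);

		if (!buf.allocation_failure)
			pr_info("%s", buf.buf);

		printbuf_exit(&buf);		/* frees buf.buf and poisons the pointer */
	}
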
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 80402b398442..888693703c75 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -210,9 +210,11 @@ do { \
\
BUG_ON(_i >= (h)->used); \
(h)->used--; \
- heap_swap(h, _i, (h)->used, set_backpointer); \
- heap_sift_up(h, _i, cmp, set_backpointer); \
- heap_sift_down(h, _i, cmp, set_backpointer); \
+ if ((_i) < (h)->used) { \
+ heap_swap(h, _i, (h)->used, set_backpointer); \
+ heap_sift_up(h, _i, cmp, set_backpointer); \
+ heap_sift_down(h, _i, cmp, set_backpointer); \
+ } \
} while (0)
#define heap_pop(h, d, cmp, set_backpointer) \
@@ -235,31 +237,157 @@ do { \
#define ANYSINT_MAX(t) \
((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+enum printbuf_units {
+ PRINTBUF_UNITS_RAW,
+ PRINTBUF_UNITS_BYTES,
+ PRINTBUF_UNITS_HUMAN_READABLE,
+};
+
struct printbuf {
- char *pos;
- char *end;
+ char *buf;
+ unsigned size;
+ unsigned pos;
+ unsigned last_newline;
+ unsigned last_field;
+ unsigned indent;
+ enum printbuf_units units:8;
+ u8 atomic;
+ bool allocation_failure:1;
+ u8 tabstop;
+ u8 tabstops[4];
};
+#define PRINTBUF ((struct printbuf) { NULL })
+
+static inline void printbuf_exit(struct printbuf *buf)
+{
+ kfree(buf->buf);
+ buf->buf = ERR_PTR(-EINTR); /* poison value */
+}
+
+static inline void printbuf_reset(struct printbuf *buf)
+{
+ buf->pos = 0;
+ buf->last_newline = 0;
+ buf->last_field = 0;
+ buf->indent = 0;
+ buf->tabstop = 0;
+}
+
static inline size_t printbuf_remaining(struct printbuf *buf)
{
- return buf->end - buf->pos;
+ return buf->size - buf->pos;
}
-#define _PBUF(_buf, _len) \
- ((struct printbuf) { \
- .pos = _buf, \
- .end = _buf + _len, \
- })
+static inline size_t printbuf_linelen(struct printbuf *buf)
+{
+ return buf->pos - buf->last_newline;
+}
-#define PBUF(_buf) _PBUF(_buf, sizeof(_buf))
+void bch2_pr_buf(struct printbuf *out, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
-#define pr_buf(_out, ...) \
-do { \
- (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \
- __VA_ARGS__); \
-} while (0)
+#define pr_buf(_out, ...) bch2_pr_buf(_out, __VA_ARGS__)
-void bch_scnmemcpy(struct printbuf *, const char *, size_t);
+static inline void pr_char(struct printbuf *out, char c)
+{
+ bch2_pr_buf(out, "%c", c);
+}
+
+static inline void pr_indent_push(struct printbuf *buf, unsigned spaces)
+{
+ buf->indent += spaces;
+ while (spaces--)
+ pr_char(buf, ' ');
+}
+
+static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces)
+{
+ if (buf->last_newline + buf->indent == buf->pos) {
+ buf->pos -= spaces;
+ buf->buf[buf->pos] = 0;
+ }
+ buf->indent -= spaces;
+}
+
+static inline void pr_newline(struct printbuf *buf)
+{
+ unsigned i;
+
+ pr_char(buf, '\n');
+
+ buf->last_newline = buf->pos;
+
+ for (i = 0; i < buf->indent; i++)
+ pr_char(buf, ' ');
+
+ buf->last_field = buf->pos;
+ buf->tabstop = 0;
+}
+
+static inline void pr_tab(struct printbuf *buf)
+{
+ BUG_ON(buf->tabstop >= ARRAY_SIZE(buf->tabstops));
+
+ while (printbuf_remaining(buf) > 1 &&
+ printbuf_linelen(buf) < buf->tabstops[buf->tabstop])
+ pr_char(buf, ' ');
+
+ buf->last_field = buf->pos;
+ buf->tabstop++;
+}
+
+void bch2_pr_tab_rjust(struct printbuf *);
+
+static inline void pr_tab_rjust(struct printbuf *buf)
+{
+ bch2_pr_tab_rjust(buf);
+}
+
+void bch2_pr_units(struct printbuf *, s64, s64);
+#define pr_units(...) bch2_pr_units(__VA_ARGS__)
+
+static inline void pr_sectors(struct printbuf *out, u64 v)
+{
+ bch2_pr_units(out, v, v << 9);
+}
+
+#ifdef __KERNEL__
+static inline void pr_time(struct printbuf *out, u64 time)
+{
+ pr_buf(out, "%llu", time);
+}
+#else
+#include <time.h>
+static inline void pr_time(struct printbuf *out, u64 _time)
+{
+ char time_str[64];
+ time_t time = _time;
+ struct tm *tm = localtime(&time);
+ size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
+ if (!err)
+ pr_buf(out, "(formatting error)");
+ else
+ pr_buf(out, "%s", time_str);
+}
+#endif
+
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
+{
+ sprintf(out, "%pUb", uuid);
+}
+#else
+#include <uuid/uuid.h>
+#endif
+
+static inline void pr_uuid(struct printbuf *out, u8 *uuid)
+{
+ char uuid_str[40];
+
+ uuid_unparse_lower(uuid, uuid_str);
+ pr_buf(out, uuid_str);
+}
int bch2_strtoint_h(const char *, int *);
int bch2_strtouint_h(const char *, unsigned int *);
@@ -323,8 +451,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
_r; \
})
-#define snprint(buf, size, var) \
- snprintf(buf, size, \
+#define snprint(out, var) \
+ pr_buf(out, \
type_is(var, int) ? "%i\n" \
: type_is(var, unsigned) ? "%u\n" \
: type_is(var, long) ? "%li\n" \
@@ -441,7 +569,7 @@ struct bch_pd_controller {
void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
void bch2_pd_controller_init(struct bch_pd_controller *);
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *);
+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
#define sysfs_pd_controller_attribute(name) \
rw_attribute(name##_rate); \
@@ -465,7 +593,7 @@ do { \
sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
\
if (attr == &sysfs_##name##_rate_debug) \
- return bch2_pd_controller_print_debug(var, buf); \
+ bch2_pd_controller_debug_to_text(out, var); \
} while (0)
#define sysfs_pd_controller_store(name, var) \
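
The tabstop and indent helpers declared above are what bch2_pd_controller_debug_to_text() uses for its two-column output: set out->tabstops[], print a label, then pr_tab() pads with spaces to the next stop (pr_tab_rjust() instead right-justifies whatever was emitted since the last field). A small sketch using only the helpers above; example_table_to_text() is illustrative, not part of the patch:

	static void example_table_to_text(struct printbuf *out)
	{
		out->tabstops[0] = 20;	/* second column starts at column 20 */

		pr_buf(out, "rate:");
		pr_tab(out);		/* pad with spaces up to tabstops[0] */
		pr_buf(out, "%u", 1024);
		pr_newline(out);	/* resets tabstop, re-applies indentation */
	}
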
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
index c099cdc0605f..53a694d71967 100644
--- a/fs/bcachefs/vstructs.h
+++ b/fs/bcachefs/vstructs.h
@@ -20,7 +20,7 @@
({ \
BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
\
- (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
+ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
})
#define vstruct_bytes(_s) \
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 4d7db64e3ef3..8d23b4c2449e 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -111,11 +111,11 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
else
pr_buf(out, "(unknown type %u)", xattr.v->x_type);
- bch_scnmemcpy(out, xattr.v->x_name,
- xattr.v->x_name_len);
- pr_buf(out, ":");
- bch_scnmemcpy(out, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ pr_buf(out, "%.*s:%.*s",
+ xattr.v->x_name_len,
+ xattr.v->x_name,
+ le16_to_cpu(xattr.v->x_val_len),
+ (char *) xattr_val(xattr.v));
}
static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
@@ -311,13 +311,9 @@ retry:
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
- SPOS(inum, offset, snapshot), 0, k, ret) {
- BUG_ON(k.k->p.inode < inum);
-
- if (k.k->p.inode > inum)
- break;
-
+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
+ SPOS(inum, offset, snapshot),
+ POS(inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_xattr)
continue;
@@ -426,9 +422,8 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
const struct bch_option *opt;
int id, inode_opt_id;
- char buf[512];
- struct printbuf out = PBUF(buf);
- unsigned val_len;
+ struct printbuf out = PRINTBUF;
+ int ret;
u64 v;
id = bch2_opt_lookup(name);
@@ -449,16 +444,21 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
return -ENODATA;
v = bch2_opt_get_by_id(&opts, id);
- bch2_opt_to_text(&out, c, opt, v, 0);
+ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
- val_len = out.pos - buf;
+ ret = out.pos;
- if (buffer && val_len > size)
- return -ERANGE;
+ if (out.allocation_failure) {
+ ret = -ENOMEM;
+ } else if (buffer) {
+ if (out.pos > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, out.buf, out.pos);
+ }
- if (buffer)
- memcpy(buffer, buf, val_len);
- return val_len;
+ printbuf_exit(&out);
+ return ret;
}
static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
@@ -525,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
memcpy(buf, value, size);
buf[size] = '\0';
- ret = bch2_opt_parse(c, NULL, opt, buf, &v);
+ ret = bch2_opt_parse(c, opt, buf, &v, NULL);
kfree(buf);
if (ret < 0)