author	Kent Overstreet <kent.overstreet@gmail.com>	2021-02-02 14:26:28 -0500
committer	Kent Overstreet <kent.overstreet@gmail.com>	2021-02-02 16:07:59 -0500
commit	4064aa126e2c77b86d090cd8b7731d238e73ae85 (patch)
tree	c13339a6f28cd6354317d7d727681d5db698f907
parent	7eef5f46ddbd9d5ae2152ee868b110da37279bb9 (diff)
Update bcachefs sources to 26409a8f75 bcachefs: Journal updates to dev usage
-rw-r--r--	.bcachefs_revision                   |   2
-rw-r--r--	cmd_debug.c                          |   6
-rw-r--r--	libbcachefs.c                        |   2
-rw-r--r--	libbcachefs/alloc_background.c       | 474
-rw-r--r--	libbcachefs/alloc_background.h       |  48
-rw-r--r--	libbcachefs/alloc_types.h            |  24
-rw-r--r--	libbcachefs/bcachefs.h               |  18
-rw-r--r--	libbcachefs/bcachefs_format.h        |  93
-rw-r--r--	libbcachefs/bkey.h                   |   1
-rw-r--r--	libbcachefs/btree_gc.c               |  44
-rw-r--r--	libbcachefs/btree_update_interior.c  |   5
-rw-r--r--	libbcachefs/buckets.c                | 382
-rw-r--r--	libbcachefs/buckets.h                |  16
-rw-r--r--	libbcachefs/buckets_types.h          |   5
-rw-r--r--	libbcachefs/clock.c                  |   8
-rw-r--r--	libbcachefs/clock_types.h            |   2
-rw-r--r--	libbcachefs/ec.c                     |  35
-rw-r--r--	libbcachefs/extents.c                |  21
-rw-r--r--	libbcachefs/journal.c                |   3
-rw-r--r--	libbcachefs/journal_io.c             |  90
-rw-r--r--	libbcachefs/movinggc.c               |  15
-rw-r--r--	libbcachefs/opts.h                   |   5
-rw-r--r--	libbcachefs/rebalance.c              |  10
-rw-r--r--	libbcachefs/rebalance_types.h        |   2
-rw-r--r--	libbcachefs/recovery.c               |  40
-rw-r--r--	libbcachefs/replicas.c               |  12
-rw-r--r--	libbcachefs/replicas.h               |   1
-rw-r--r--	libbcachefs/super-io.c               |  80
-rw-r--r--	libbcachefs/super-io.h               |   5
-rw-r--r--	libbcachefs/super.c                  |  43
-rw-r--r--	libbcachefs/sysfs.c                  |   4
31 files changed, 798 insertions, 698 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 953107c2..7b4e00b1 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-ea3414eed52e5d90c248453e84b2dcd91c960306
+26409a8f755b8faa620a49796d7935566204daaf
diff --git a/cmd_debug.c b/cmd_debug.c
index befd41f4..c4dd24ba 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -572,14 +572,10 @@ int cmd_list_journal(int argc, char *argv[])
printf("journal entry %8llu\n"
" version %8u\n"
" last seq %8llu\n"
- " read clock %8u\n"
- " write clock %8u\n"
,
le64_to_cpu(p->j.seq),
le32_to_cpu(p->j.version),
- le64_to_cpu(p->j.last_seq),
- le16_to_cpu(p->j.read_clock),
- le16_to_cpu(p->j.write_clock));
+ le64_to_cpu(p->j.last_seq));
for_each_jset_key(k, _n, entry, &p->j) {
char buf[200];
diff --git a/libbcachefs.c b/libbcachefs.c
index e7c1ca23..e359d48b 100644
--- a/libbcachefs.c
+++ b/libbcachefs.c
@@ -623,8 +623,6 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
printf(" flags: %x", le32_to_cpu(clean->flags));
- printf(" read clock: %x", le16_to_cpu(clean->read_clock));
- printf(" write clock: %x", le16_to_cpu(clean->write_clock));
printf(" journal seq: %llx", le64_to_cpu(clean->journal_seq));
}
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 896ec023..a91caf04 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -14,6 +14,7 @@
#include "ec.h"
#include "error.h"
#include "recovery.h"
+#include "varint.h"
#include <linux/kthread.h>
#include <linux/math64.h>
@@ -24,15 +25,12 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-static const char * const bch2_alloc_field_names[] = {
-#define x(name, bytes) #name,
- BCH_ALLOC_FIELDS()
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+ BCH_ALLOC_FIELDS_V1()
#undef x
- NULL
};
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
@@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work)
/* Persistent alloc info: */
-static inline u64 get_alloc_field(const struct bch_alloc *a,
- const void **p, unsigned field)
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+ const void **p, unsigned field)
{
- unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
u64 v;
if (!(a->fields & (1 << field)))
@@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a,
return v;
}
-static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
- unsigned field, u64 v)
+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
+ unsigned field, u64 v)
{
- unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
if (!v)
return;
@@ -127,55 +125,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
*p += bytes;
}
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
{
- struct bkey_alloc_unpacked ret = { .gen = 0 };
+ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+ const void *d = in->data;
+ unsigned idx = 0;
- if (k.k->type == KEY_TYPE_alloc) {
- const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
- const void *d = a->data;
- unsigned idx = 0;
+ out->gen = in->gen;
- ret.gen = a->gen;
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+}
-#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
- BCH_ALLOC_FIELDS()
+static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ struct bkey_i_alloc *a = bkey_alloc_init(&dst->k);
+ void *d = a->v.data;
+ unsigned bytes, idx = 0;
+
+ a->k.p = POS(src.dev, src.bucket);
+ a->v.fields = 0;
+ a->v.gen = src.gen;
+
+#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name);
+ BCH_ALLOC_FIELDS_V1()
#undef x
- }
- return ret;
+ bytes = (void *) d - (void *) &a->v;
+ set_bkey_val_bytes(&a->k, bytes);
+ memset_u64s_tail(&a->v, 0, bytes);
}
-void bch2_alloc_pack(struct bkey_i_alloc *dst,
- const struct bkey_alloc_unpacked src)
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
{
- unsigned idx = 0;
- void *d = dst->v.data;
+ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
+
+static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ u8 *out = a->v.data;
+ u8 *end = (void *) &dst[1];
+ u8 *last_nonzero_field = out;
unsigned bytes;
- dst->v.fields = 0;
- dst->v.gen = src.gen;
+ a->k.p = POS(src.dev, src.bucket);
+ a->v.gen = src.gen;
+ a->v.oldest_gen = src.oldest_gen;
+ a->v.data_type = src.data_type;
+
+#define x(_name, _bits) \
+ nr_fields++; \
+ \
+ if (src._name) { \
+ out += bch2_varint_encode(out, src._name); \
+ \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ }
-#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
- BCH_ALLOC_FIELDS()
+ BCH_ALLOC_FIELDS_V2()
#undef x
+ BUG_ON(out > end);
+
+ out = last_nonzero_field;
+ a->v.nr_fields = last_nonzero_fieldnr;
- bytes = (void *) d - (void *) &dst->v;
- set_bkey_val_bytes(&dst->k, bytes);
- memset_u64s_tail(&dst->v, 0, bytes);
+ bytes = (u8 *) out - (u8 *) &a->v;
+ set_bkey_val_bytes(&a->k, bytes);
+ memset_u64s_tail(&a->v, 0, bytes);
+}
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked ret = {
+ .dev = k.k->p.inode,
+ .bucket = k.k->p.offset,
+ .gen = 0,
+ };
+
+ if (k.k->type == KEY_TYPE_alloc_v2)
+ bch2_alloc_unpack_v2(&ret, k);
+ else if (k.k->type == KEY_TYPE_alloc)
+ bch2_alloc_unpack_v1(&ret, k);
+
+ return ret;
+}
+
+void bch2_alloc_pack(struct bch_fs *c,
+ struct bkey_alloc_buf *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))
+ bch2_alloc_pack_v2(dst, src);
+ else
+ bch2_alloc_pack_v1(dst, src);
}
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
- for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
if (a->fields & (1 << i))
- bytes += BCH_ALLOC_FIELD_BYTES[i];
+ bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
@@ -190,20 +282,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
- const void *d = a.v->data;
- unsigned i;
+ struct bkey_alloc_unpacked u;
- pr_buf(out, "gen %u", a.v->gen);
+ if (k.k->p.inode >= c->sb.nr_devices ||
+ !c->devs[k.k->p.inode])
+ return "invalid device";
+
+ if (bch2_alloc_unpack_v2(&u, k))
+ return "unpack error";
- for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
- if (a.v->fields & (1 << i))
- pr_buf(out, " %s %llu",
- bch2_alloc_field_names[i],
- get_alloc_field(a.v, &d, i));
+ return NULL;
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ pr_buf(out, "gen %u oldest_gen %u data_type %u",
+ u.gen, u.oldest_gen, u.data_type);
+#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name);
+ BCH_ALLOC_FIELDS_V2()
+#undef x
}
static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
@@ -213,7 +315,9 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
struct bucket *g;
struct bkey_alloc_unpacked u;
- if (level || k.k->type != KEY_TYPE_alloc)
+ if (level ||
+ (k.k->type != KEY_TYPE_alloc &&
+ k.k->type != KEY_TYPE_alloc_v2))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
@@ -234,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ int ret;
down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@@ -248,26 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
return ret;
}
- percpu_down_write(&c->mark_lock);
- bch2_dev_usage_from_buckets(c);
- percpu_up_write(&c->mark_lock);
-
- mutex_lock(&c->bucket_clock[READ].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, READ);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[READ].lock);
-
- mutex_lock(&c->bucket_clock[WRITE].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, WRITE);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[WRITE].lock);
-
return 0;
}
@@ -281,8 +363,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
struct bucket *g;
struct bucket_mark m;
struct bkey_alloc_unpacked old_u, new_u;
- __BKEY_PADDED(k, 8) alloc_key; /* hack: */
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf a;
int ret;
retry:
bch2_trans_begin(trans);
@@ -303,17 +384,14 @@ retry:
ca = bch_dev_bkey_exists(c, iter->pos.inode);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
- new_u = alloc_mem_to_key(g, m);
+ new_u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
if (!bkey_alloc_unpacked_cmp(old_u, new_u))
return 0;
- a = bkey_alloc_init(&alloc_key.k);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, new_u);
-
- bch2_trans_update(trans, iter, &a->k_i,
+ bch2_alloc_pack(c, &a, new_u);
+ bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|flags);
@@ -358,114 +436,6 @@ err:
/* Bucket IO clocks: */
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket *g;
- u16 max_last_io = 0;
- unsigned i;
-
- lockdep_assert_held(&c->bucket_clock[rw].lock);
-
- /* Recalculate max_last_io for this device: */
- for_each_bucket(g, buckets)
- max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
- ca->max_last_bucket_io[rw] = max_last_io;
-
- /* Recalculate global max_last_io: */
- max_last_io = 0;
-
- for_each_member_device(ca, c, i)
- max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
- clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets;
- struct bch_dev *ca;
- struct bucket *g;
- unsigned i;
-
- trace_rescale_prios(c);
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->io_time[rw] = clock->hand -
- bucket_last_io(c, g, rw) / 2;
-
- bch2_recalc_oldest_io(c, ca, rw);
-
- up_read(&ca->bucket_lock);
- }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
- return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
- struct bucket_clock *clock = container_of(timer,
- struct bucket_clock, rescale);
- struct bch_fs *c = container_of(clock,
- struct bch_fs, bucket_clock[clock->rw]);
- struct bch_dev *ca;
- u64 capacity;
- unsigned i;
-
- mutex_lock(&clock->lock);
-
- /* if clock cannot be advanced more, rescale prio */
- if (clock->max_last_io >= U16_MAX - 2)
- bch2_rescale_bucket_io_times(c, clock->rw);
-
- BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
- for_each_member_device(ca, c, i)
- ca->max_last_bucket_io[clock->rw]++;
- clock->max_last_io++;
- clock->hand++;
-
- mutex_unlock(&clock->lock);
-
- capacity = READ_ONCE(c->capacity);
-
- if (!capacity)
- return;
-
- /*
- * we only increment when 0.1% of the filesystem capacity has been read
- * or written too, this determines if it's time
- *
- * XXX: we shouldn't really be going off of the capacity of devices in
- * RW mode (that will be 0 when we're RO, yet we can still service
- * reads)
- */
- timer->expire += bucket_clock_freq(capacity);
-
- bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
-
- clock->hand = 1;
- clock->rw = rw;
- clock->rescale.fn = bch2_inc_clock_hand;
- clock->rescale.expire = bucket_clock_freq(c->capacity);
- mutex_init(&clock->lock);
-}
-
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
size_t bucket_nr, int rw)
{
@@ -473,9 +443,9 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
struct btree_iter *iter;
struct bucket *g;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
- u16 *time;
+ u64 *time, now;
int ret = 0;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@@ -486,28 +456,25 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
if (ret)
goto out;
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
percpu_down_read(&c->mark_lock);
g = bucket(ca, bucket_nr);
- u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+ u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
-
time = rw == READ ? &u.read_time : &u.write_time;
- if (*time == c->bucket_clock[rw].hand)
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (*time == now)
goto out;
- *time = c->bucket_clock[rw].hand;
-
- bch2_alloc_pack(a, u);
+ *time = now;
- ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
+ bch2_alloc_pack(c, a, u);
+ ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
out:
bch2_trans_iter_put(trans, iter);
@@ -576,23 +543,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
- size_t bucket,
- struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+ struct bucket_mark m)
{
u8 gc_gen;
- if (!is_available_bucket(mark))
+ if (!is_available_bucket(m))
return false;
- if (mark.owned_by_allocator)
+ if (m.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
- test_bit(bucket, ca->buckets_nouse))
+ test_bit(b, ca->buckets_nouse))
return false;
- gc_gen = bucket_gc_gen(ca, bucket);
+ gc_gen = bucket_gc_gen(bucket(ca, b));
if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++;
@@ -606,43 +572,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- * indication of how hot the data is -- we scale the prio so that the prio
- * farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- * indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- * smallest bucket_gc_gen() - since incrementing the same bucket's generation
- * number repeatedly forces us to run mark and sweep gc to avoid generation
- * number wraparound.
*/
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+ u64 now, u64 last_seq_ondisk)
{
- unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
- unsigned max_last_io = ca->max_last_bucket_io[READ];
+ unsigned used = bucket_sectors_used(m);
- /*
- * Time since last read, scaled to [0, 8) where larger value indicates
- * more recently read data:
- */
- unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
-
- /* How much we want to keep the data in this bucket: */
- unsigned long data_wantness =
- (hotness + 1) * bucket_sectors_used(m);
-
- unsigned long needs_journal_commit =
- bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+ if (used) {
+ /*
+ * Prefer to keep buckets that have been read more recently, and
+ * buckets that have more data in them:
+ */
+ u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+ u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
- return (data_wantness << 9) |
- (needs_journal_commit << 8) |
- (bucket_gc_gen(ca, b) / 16);
+ return -last_read_scaled;
+ } else {
+ /*
+ * Prefer to use buckets with smaller gc_gen so that we don't
+ * have to walk the btree and recalculate oldest_gen - but shift
+ * off the low bits so that buckets will still have equal sort
+ * keys when there's only a small difference, so that we can
+ * keep sequential buckets together:
+ */
+ return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+ (bucket_gc_gen(g) >> 4);
+ }
}
static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -665,16 +621,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
+ u64 now, last_seq_ondisk;
size_t b, i, nr = 0;
- ca->alloc_heap.used = 0;
-
- mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
-
- bch2_recalc_oldest_io(c, ca, READ);
+ ca->alloc_heap.used = 0;
+ now = atomic64_read(&c->io_clock[READ].now);
+ last_seq_ondisk = c->journal.last_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@@ -682,8 +637,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
* all buckets have been visited.
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- unsigned long key = bucket_sort_key(c, ca, b, m);
+ struct bucket *g = &buckets->b[b];
+ struct bucket_mark m = READ_ONCE(g->mark);
+ unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
@@ -718,7 +674,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
}
up_read(&ca->bucket_lock);
- mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -863,14 +818,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
struct btree_iter *iter,
u64 *journal_seq, unsigned flags)
{
-#if 0
- __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
-#else
- /* hack: */
- __BKEY_PADDED(k, 8) alloc_key;
-#endif
struct bch_fs *c = trans->c;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
@@ -920,8 +869,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
goto out;
}
- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
ret = bch2_btree_iter_traverse(iter);
@@ -931,7 +878,7 @@ retry:
percpu_down_read(&c->mark_lock);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
- u = alloc_mem_to_key(g, m);
+ u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
@@ -941,14 +888,11 @@ retry:
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
- u.read_time = c->bucket_clock[READ].hand;
- u.write_time = c->bucket_clock[WRITE].hand;
-
- a = bkey_alloc_init(&alloc_key.k);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
+ u.read_time = atomic64_read(&c->io_clock[READ].now);
+ u.write_time = atomic64_read(&c->io_clock[WRITE].now);
- bch2_trans_update(trans, iter, &a->k_i,
+ bch2_alloc_pack(c, &a, u);
+ bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_BUCKET_INVALIDATE);
/*
@@ -1455,8 +1399,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- bch2_bucket_clock_init(c, READ);
- bch2_bucket_clock_init(c, WRITE);
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
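The new v2 format above drops v1's fixed-width field bytes in favour of one varint per field, and bch2_alloc_pack_v2() truncates trailing zero fields so nr_fields only counts up to the last nonzero one. As a rough standalone illustration of that idea (not part of the patch, and using a plain LEB128 varint as a stand-in for whatever bch2_varint_encode() actually emits):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Toy LEB128 varint; stand-in for bch2_varint_encode(). */
static size_t leb128_encode(uint8_t *out, uint64_t v)
{
	size_t n = 0;

	do {
		uint8_t byte = v & 0x7f;

		v >>= 7;
		if (v)
			byte |= 0x80;
		out[n++] = byte;
	} while (v);

	return n;
}

/*
 * Pack an array of fields the way bch2_alloc_pack_v2() does conceptually:
 * every field is encoded, but only bytes up to the last nonzero field are
 * kept, and nr_fields records how many fields that covers.
 */
static size_t pack_fields(uint8_t *out, const uint64_t *fields, size_t nr,
			  uint8_t *nr_fields)
{
	uint8_t *p = out, *last_nonzero = out;
	size_t i, last_nonzero_idx = 0;

	for (i = 0; i < nr; i++) {
		p += leb128_encode(p, fields[i]);
		if (fields[i]) {
			last_nonzero = p;
			last_nonzero_idx = i + 1;
		}
	}

	*nr_fields = last_nonzero_idx;
	return last_nonzero - out;		/* bytes actually stored */
}

int main(void)
{
	/* e.g. read_time, write_time, dirty_sectors, then all-zero fields */
	uint64_t fields[] = { 1000, 2000, 128, 0, 0, 0 };
	uint8_t buf[64], nr_fields;
	size_t bytes = pack_fields(buf, fields, 6, &nr_fields);

	printf("packed %zu bytes, nr_fields %u\n", bytes, (unsigned) nr_fields);
	return 0;
}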
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index f60fcebf..6fededcd 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -7,12 +7,33 @@
#include "debug.h"
struct bkey_alloc_unpacked {
+ u64 bucket;
+ u8 dev;
u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS()
+ BCH_ALLOC_FIELDS_V2()
#undef x
};
+struct bkey_alloc_buf {
+ struct bkey_i k;
+
+ union {
+ struct {
+#define x(_name, _bits) + _bits / 8
+ u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
+#undef x
+ } _v1;
+ struct {
+#define x(_name, _bits) + 8 + _bits / 8
+ u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
+#undef x
+ } _v2;
+ };
+} __attribute__((packed, aligned(8)));
+
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
@@ -20,23 +41,28 @@ struct bkey_alloc_unpacked {
static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
struct bkey_alloc_unpacked r)
{
- return l.gen != r.gen
-#define x(_name, _bits) || l._name != r._name
- BCH_ALLOC_FIELDS()
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type
+#define x(_name, ...) || l._name != r._name
+ BCH_ALLOC_FIELDS_V2()
#undef x
;
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-void bch2_alloc_pack(struct bkey_i_alloc *,
+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
const struct bkey_alloc_unpacked);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
+alloc_mem_to_key(struct btree_iter *iter,
+ struct bucket *g, struct bucket_mark m)
{
return (struct bkey_alloc_unpacked) {
+ .dev = iter->pos.inode,
+ .bucket = iter->pos.offset,
.gen = m.gen,
.oldest_gen = g->oldest_gen,
.data_type = m.data_type,
@@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
-const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
- .key_invalid = bch2_alloc_invalid, \
+ .key_invalid = bch2_alloc_v1_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+}
+
+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
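The _pad arrays in struct bkey_alloc_buf above get their size from the same x-macro field lists: expanding each field as "+ bytes" inside the array bound yields a buffer large enough for the worst-case packed key. A small self-contained sketch of the trick, with made-up field names and widths:

/* Hypothetical field list, in the same shape as BCH_ALLOC_FIELDS_V1(). */
#define EXAMPLE_FIELDS()		\
	x(read_time,		64)	\
	x(write_time,		64)	\
	x(dirty_sectors,	16)

struct example_buf {
#define x(_name, _bits)	+ _bits / 8
	/* 8 bytes of header plus one fixed-width slot per field */
	unsigned char pad[8 EXAMPLE_FIELDS()];
#undef x
};

/* 8 + 64/8 + 64/8 + 16/8 == 26 bytes */
_Static_assert(sizeof(struct example_buf) == 26, "worst case packed size");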
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 1abfff52..be164d61 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -10,30 +10,6 @@
struct ec_bucket_buf;
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
- /*
- * "now" in (read/write) IO time - incremented whenever we do X amount
- * of reads or writes.
- *
- * Goes with the bucket read/write prios: when we read or write to a
- * bucket we reset the bucket's prio to the current hand; thus hand -
- * prio = time since bucket was last read/written.
- *
- * The units are some amount (bytes/sectors) of data read/written, and
- * the units can change on the fly if we need to rescale to fit
- * everything in a u16 - your only guarantee is that the units are
- * consistent.
- */
- u16 hand;
- u16 max_last_io;
-
- int rw;
-
- struct io_timer rescale;
- struct mutex lock;
-};
-
enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1,
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 91b9375f..fa36e764 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -429,7 +429,9 @@ struct bch_dev {
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
- struct bch_dev_usage __percpu *usage[2];
+ struct bch_dev_usage *usage_base;
+ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
+ struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@@ -451,9 +453,6 @@ struct bch_dev {
size_t fifo_last_bucket;
- /* last calculated minimum prio */
- u16 max_last_bucket_io[2];
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
@@ -473,6 +472,7 @@ struct bch_dev {
atomic64_t rebalance_work;
struct journal_device journal;
+ u64 prev_journal_sector;
struct work_struct io_error_work;
@@ -584,6 +584,8 @@ struct bch_fs {
struct journal_entry_res replicas_journal_res;
+ struct journal_entry_res dev_usage_journal_res;
+
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
@@ -691,14 +693,6 @@ struct bch_fs {
struct mutex usage_scratch_lock;
struct bch_fs_usage *usage_scratch;
- /*
- * When we invalidate buckets, we use both the priority and the amount
- * of good data to determine which buckets to reuse first - to weight
- * those together consistently we keep track of the smallest nonzero
- * priority of any bucket.
- */
- struct bucket_clock bucket_clock[2];
-
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 6dc150cb..30e77190 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -341,7 +341,8 @@ static inline void bkey_init(struct bkey *k)
x(reflink_v, 16) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
- x(indirect_inline_data, 19)
+ x(indirect_inline_data, 19) \
+ x(alloc_v2, 20)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -551,9 +552,11 @@ struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
block:8,
- idx:51;
+ redundancy:4,
+ idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:51,
+ __u64 idx:47,
+ redundancy:4,
block:8,
type:5;
#endif
@@ -799,35 +802,40 @@ struct bch_alloc {
__u8 data[];
} __attribute__((packed, aligned(8)));
-#define BCH_ALLOC_FIELDS() \
+#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
- x(oldest_gen, 8)
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
enum {
-#define x(name, bytes) BCH_ALLOC_FIELD_##name,
- BCH_ALLOC_FIELDS()
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
#undef x
BCH_ALLOC_FIELD_NR
};
-static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
- BCH_ALLOC_FIELDS()
-#undef x
-};
-
-#define x(name, bits) + (bits / 8)
-static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
- DIV_ROUND_UP(offsetof(struct bch_alloc, data)
- BCH_ALLOC_FIELDS(), sizeof(u64));
-#undef x
-
-#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
-
/* Quotas: */
enum quota_types {
@@ -1131,8 +1139,8 @@ struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
__le64 journal_seq;
union {
@@ -1305,6 +1313,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
/*
* Features:
@@ -1332,7 +1341,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
x(new_varint, 15) \
- x(journal_no_flush, 16)
+ x(journal_no_flush, 16) \
+ x(alloc_v2, 17)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
@@ -1340,7 +1350,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint)| \
- (1ULL << BCH_FEATURE_journal_no_flush))
+ (1ULL << BCH_FEATURE_journal_no_flush)| \
+ (1ULL << BCH_FEATURE_alloc_v2))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@@ -1493,7 +1504,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist, 3) \
x(blacklist_v2, 4) \
x(usage, 5) \
- x(data_usage, 6)
+ x(data_usage, 6) \
+ x(clock, 7) \
+ x(dev_usage, 8)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -1541,6 +1554,30 @@ struct jset_entry_data_usage {
struct bch_replicas_entry r;
} __attribute__((packed));
+struct jset_entry_clock {
+ struct jset_entry entry;
+ __u8 rw;
+ __u8 pad[7];
+ __le64 time;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage_type {
+ __le64 buckets;
+ __le64 sectors;
+ __le64 fragmented;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage {
+ struct jset_entry entry;
+ __le32 dev;
+ __u32 pad;
+
+ __le64 buckets_ec;
+ __le64 buckets_unavailable;
+
+ struct jset_entry_dev_usage_type d[];
+} __attribute__((packed));
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1563,8 +1600,8 @@ struct jset {
__u8 encrypted_start[0];
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;
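The two new journal entry types above replace the old per-jset read/write clock fields: a clock entry carries one 64-bit IO clock value per direction, and a dev_usage entry carries per-device bucket counts, with one jset_entry_dev_usage_type triple per data type in its flexible array. A rough sketch, not from the patch, of how such an entry would be sized and walked (layout copied from the structs above, names suffixed _ex to avoid clashing with the real ones, jset_entry header omitted):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct dev_usage_type_ex {
	uint64_t	buckets;
	uint64_t	sectors;
	uint64_t	fragmented;
};

struct dev_usage_entry_ex {
	uint32_t	dev;
	uint32_t	pad;
	uint64_t	buckets_ec;
	uint64_t	buckets_unavailable;
	struct dev_usage_type_ex d[];	/* one per data type */
};

static size_t dev_usage_entry_bytes(unsigned nr_types)
{
	return sizeof(struct dev_usage_entry_ex) +
		nr_types * sizeof(struct dev_usage_type_ex);
}

static void print_dev_usage(const struct dev_usage_entry_ex *u, unsigned nr_types)
{
	unsigned i;

	for (i = 0; i < nr_types; i++)
		printf("type %u: %llu buckets, %llu sectors, %llu fragmented\n",
		       i,
		       (unsigned long long) u->d[i].buckets,
		       (unsigned long long) u->d[i].sectors,
		       (unsigned long long) u->d[i].fragmented);
}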
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 2c3b73a6..48821f6c 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -530,6 +530,7 @@ BKEY_VAL_ACCESSORS(reflink_v);
BKEY_VAL_ACCESSORS(inline_data);
BKEY_VAL_ACCESSORS(btree_ptr_v2);
BKEY_VAL_ACCESSORS(indirect_inline_data);
+BKEY_VAL_ACCESSORS(alloc_v2);
/* byte order helpers */
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index bab5ebd3..c2c8a34f 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c)
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
- free_percpu(ca->usage[1]);
- ca->usage[1] = NULL;
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
}
free_percpu(c->usage_gc);
@@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c,
struct bch_dev *ca;
bool verify = (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
- unsigned i;
+ unsigned i, dev;
int ret = 0;
#define copy_field(_f, _msg, ...) \
@@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c,
}
}
- for_each_member_device(ca, c, i) {
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+
+ for_each_member_device(ca, c, dev) {
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
@@ -801,12 +804,23 @@ static int bch2_gc_done(struct bch_fs *c,
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
- };
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
+ {
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
+
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
- bch2_dev_usage_from_buckets(c);
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ }
+ }
+ };
{
unsigned nr = fs_usage_u64s(c);
@@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c)
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
- BUG_ON(ca->usage[1]);
+ BUG_ON(ca->usage_gc);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
@@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c)
return -ENOMEM;
}
- ca->usage[1] = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage[1]) {
- bch_err(c, "error allocating ca->usage[gc]");
+ ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage_gc) {
+ bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
@@ -1489,7 +1503,7 @@ static int bch2_gc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic_long_read(&clock->now);
+ unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
@@ -1510,7 +1524,7 @@ static int bch2_gc_thread(void *arg)
if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16;
- if (atomic_long_read(&clock->now) >= next)
+ if (atomic64_read(&clock->now) >= next)
break;
bch2_io_clock_schedule_timeout(clock, next);
@@ -1522,7 +1536,7 @@ static int bch2_gc_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 8919ea62..dd1b8f6e 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -222,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
+ wp = bch2_alloc_sectors_start(c,
+ c->opts.metadata_target ?:
+ c->opts.foreground_target,
+ 0,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index cb0f0e09..ef79f5ca 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
+ struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
@@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
+ for_each_member_device(ca, c, i) {
+ struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+ usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
+ ca->mi.bucket_size;
+ }
+
percpu_up_write(&c->mark_lock);
}
@@ -189,14 +198,27 @@ out_pool:
return ret;
}
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+ unsigned journal_seq,
+ bool gc)
+{
+ return this_cpu_ptr(gc
+ ? ca->usage_gc
+ : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
+ struct bch_fs *c = ca->fs;
struct bch_dev_usage ret;
+ unsigned seq, i, u64s = dev_usage_u64s();
- memset(&ret, 0, sizeof(ret));
- acc_u64s_percpu((u64 *) &ret,
- (u64 __percpu *) ca->usage[0],
- sizeof(ret) / sizeof(u64));
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
@@ -261,7 +283,8 @@ retry:
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
- unsigned u64s = fs_usage_u64s(c);
+ struct bch_dev *ca;
+ unsigned i, u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
(u64 __percpu *) c->usage[idx], u64s);
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ u64s = dev_usage_u64s();
+
+ acc_u64s_percpu((u64 *) ca->usage_base,
+ (u64 __percpu *) ca->usage[idx], u64s);
+ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+ }
+ rcu_read_unlock();
+
write_seqcount_end(&c->usage_lock);
preempt_enable();
}
@@ -454,14 +487,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
- bool gc)
+ u64 journal_seq, bool gc)
{
struct bch_dev_usage *u;
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
- u = this_cpu_ptr(ca->usage[gc]);
+ u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
account_bucket(fs_usage, u, bucket_type(old),
@@ -491,31 +524,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
-__flatten
-void bch2_dev_usage_from_buckets(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct bucket_mark old = { .v.counter = 0 };
- struct bucket_array *buckets;
- struct bucket *g;
- unsigned i;
- int cpu;
-
- c->usage_base->hidden = 0;
-
- for_each_member_device(ca, c, i) {
- for_each_possible_cpu(cpu)
- memset(per_cpu_ptr(ca->usage[0], cpu), 0,
- sizeof(*ca->usage[0]));
-
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- bch2_dev_usage_update(c, ca, c->usage_base,
- old, g->mark, false);
- }
-}
-
static inline int update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
@@ -653,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
new.owned_by_allocator = owned_by_allocator;
}));
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ /*
+ * XXX: this is wrong, this means we'll be doing updates to the percpu
+ * buckets_alloc counter that don't have an open journal buffer and
+ * we'll race with the machinery that accumulates that to ca->usage_base
+ */
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
@@ -685,7 +698,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
struct bucket_mark old_m, m;
/* We don't do anything for deletions - do we?: */
- if (new.k->type != KEY_TYPE_alloc)
+ if (new.k->type != KEY_TYPE_alloc &&
+ new.k->type != KEY_TYPE_alloc_v2)
return 0;
/*
@@ -708,6 +722,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
m.cached_sectors = u.cached_sectors;
+ m.stripe = u.stripe != 0;
if (journal_seq) {
m.journal_seq_valid = 1;
@@ -715,12 +730,14 @@ static int bch2_mark_alloc(struct bch_fs *c,
}
}));
- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
+ g->stripe = u.stripe;
+ g->stripe_redundancy = u.stripe_redundancy;
/*
* need to know if we're getting called from the invalidate path or
@@ -778,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (c)
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
- old, new, gc);
+ old, new, 0, gc);
return 0;
}
@@ -915,11 +932,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
-static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
+static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
unsigned ptr_idx,
struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags,
- bool enabled)
+ u64 journal_seq, unsigned flags)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
@@ -932,8 +948,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
char buf[200];
int ret;
- if (enabled)
- g->ec_redundancy = s->nr_redundant;
+ if (g->stripe && g->stripe != k.k->p.offset) {
+ bch2_fs_inconsistent(c,
+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+ return -EINVAL;
+ }
old = bucket_cmpxchg(g, new, ({
ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
@@ -941,23 +962,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
if (ret)
return ret;
- if (new.stripe && enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
- if (!new.stripe && !enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
- new.stripe = enabled;
-
- if ((flags & BTREE_TRIGGER_GC) && parity) {
- new.data_type = enabled ? BCH_DATA_parity : 0;
- new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
+ if (parity) {
+ new.data_type = BCH_DATA_parity;
+ new.dirty_sectors = le16_to_cpu(s->sectors);
}
if (journal_seq) {
@@ -966,10 +973,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
}
}));
- if (!enabled)
- g->ec_redundancy = 0;
+ g->stripe = k.k->p.offset;
+ g->stripe_redundancy = s->nr_redundant;
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
return 0;
}
@@ -1036,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
old.v.counter,
new.v.counter)) != old.v.counter);
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
@@ -1163,6 +1170,8 @@ static int bch2_mark_stripe(struct bch_fs *c,
unsigned i;
int ret;
+ BUG_ON(gc && old_s);
+
if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
@@ -1170,48 +1179,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
if (!new_s) {
- /* Deleting: */
- for (i = 0; i < old_s->nr_blocks; i++) {
- ret = bucket_set_stripe(c, old, i, fs_usage,
- journal_seq, flags, false);
- if (ret)
- return ret;
- }
-
- if (!gc && m->on_heap) {
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_del(c, m, idx);
- spin_unlock(&c->ec_stripes_heap_lock);
- }
-
- if (gc)
- update_replicas(c, fs_usage, &m->r.e,
- -((s64) m->sectors * m->nr_redundant));
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_del(c, m, idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
memset(m, 0, sizeof(*m));
} else {
- BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
- BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
-
- for (i = 0; i < new_s->nr_blocks; i++) {
- if (!old_s ||
- memcmp(new_s->ptrs + i,
- old_s->ptrs + i,
- sizeof(struct bch_extent_ptr))) {
-
- if (old_s) {
- bucket_set_stripe(c, old, i, fs_usage,
- journal_seq, flags, false);
- if (ret)
- return ret;
- }
- ret = bucket_set_stripe(c, new, i, fs_usage,
- journal_seq, flags, true);
- if (ret)
- return ret;
- }
- }
-
m->alive = true;
m->sectors = le16_to_cpu(new_s->sectors);
m->algorithm = new_s->algorithm;
@@ -1220,27 +1193,13 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
- unsigned s = stripe_blockcount_get(new_s, i);
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- if (!gc)
- m->block_sectors[i] = s;
- m->blocks_nonempty += !!s;
+ m->block_sectors[i] =
+ stripe_blockcount_get(new_s, i);
+ m->blocks_nonempty += !!m->block_sectors[i];
}
- if (gc && old_s)
- update_replicas(c, fs_usage, &m->r.e,
- -((s64) m->sectors * m->nr_redundant));
-
bch2_bkey_to_replicas(&m->r.e, new);
- if (gc)
- update_replicas(c, fs_usage, &m->r.e,
- ((s64) m->sectors * m->nr_redundant));
-
if (!gc) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, idx);
@@ -1248,6 +1207,25 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
}
+ if (gc) {
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(m->block_sectors, 0, sizeof(m->block_sectors));
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < new_s->nr_blocks; i++) {
+ ret = mark_stripe_bucket(c, new, i, fs_usage,
+ journal_seq, flags);
+ if (ret)
+ return ret;
+ }
+
+ update_replicas(c, fs_usage, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant));
+ }
+
return 0;
}
@@ -1271,6 +1249,7 @@ static int bch2_mark_key_locked(struct bch_fs *c,
switch (k.k->type) {
case KEY_TYPE_alloc:
+ case KEY_TYPE_alloc_v2:
ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
@@ -1539,9 +1518,10 @@ static int trans_get_key(struct btree_trans *trans,
return ret;
}
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
- const struct bch_extent_ptr *ptr,
- struct bkey_alloc_unpacked *u)
+static struct bkey_alloc_buf *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+ const struct bch_extent_ptr *ptr,
+ struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
@@ -1549,8 +1529,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
struct bucket *g;
struct btree_iter *iter;
struct bkey_s_c k;
+ struct bkey_alloc_buf *a;
int ret;
+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+ if (IS_ERR(a))
+ return a;
+
iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
if (iter) {
*u = bch2_alloc_unpack(k);
@@ -1562,17 +1547,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
ret = bch2_btree_iter_traverse(iter);
if (ret) {
bch2_trans_iter_put(trans, iter);
- return ret;
+ return ERR_PTR(ret);
}
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
- *u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+ *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
*_iter = iter;
- return 0;
+ return a;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
@@ -1582,27 +1567,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf *a;
int ret;
- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto out;
-
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i, 0);
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -1713,34 +1691,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
}
static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
- const struct bch_extent_ptr *ptr,
- s64 sectors, bool parity)
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
{
- struct bkey_i_alloc *a;
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+ struct bkey_alloc_buf *a;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
- int ret;
+ bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+ int ret = 0;
- ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
if (parity) {
+ s64 sectors = le16_to_cpu(s.v->sectors);
+
+ if (deleting)
+ sectors = -sectors;
+
u.dirty_sectors += sectors;
u.data_type = u.dirty_sectors
? BCH_DATA_parity
: 0;
}
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto err;
+ if (!deleting) {
+ if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
+ "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ u.stripe, s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i, 0);
+ u.stripe = s.k->p.offset;
+ u.stripe_redundancy = s.v->nr_redundant;
+ } else {
+ u.stripe = 0;
+ u.stripe_redundancy = 0;
+ }
+
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
err:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -1750,51 +1745,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
+ struct bkey_s_c_stripe old_s = { NULL };
+ struct bkey_s_c_stripe new_s = { NULL };
struct bch_replicas_padded r;
unsigned i;
int ret = 0;
+ if (old.k->type == KEY_TYPE_stripe)
+ old_s = bkey_s_c_to_stripe(old);
+ if (new.k->type == KEY_TYPE_stripe)
+ new_s = bkey_s_c_to_stripe(new);
+
/*
* If the pointers aren't changing, we don't need to do anything:
*/
- if (new_s && old_s &&
- !memcmp(old_s->ptrs, new_s->ptrs,
- new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ if (new_s.k && old_s.k &&
+ new_s.v->nr_blocks == old_s.v->nr_blocks &&
+ new_s.v->nr_redundant == old_s.v->nr_redundant &&
+ !memcmp(old_s.v->ptrs, new_s.v->ptrs,
+ new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
- if (new_s) {
- unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant;
- s64 sectors = le16_to_cpu(new_s->sectors);
+ if (new_s.k) {
+ s64 sectors = le16_to_cpu(new_s.v->sectors);
bch2_bkey_to_replicas(&r.e, new);
- update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
-
- for (i = 0; i < new_s->nr_blocks; i++) {
- bool parity = i >= nr_data;
+ update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
- ret = bch2_trans_mark_stripe_alloc_ref(trans,
- &new_s->ptrs[i], sectors, parity);
+ for (i = 0; i < new_s.v->nr_blocks; i++) {
+ ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
+ i, false);
if (ret)
return ret;
}
}
- if (old_s) {
- unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant;
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+ if (old_s.k) {
+ s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
bch2_bkey_to_replicas(&r.e, old);
- update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
-
- for (i = 0; i < old_s->nr_blocks; i++) {
- bool parity = i >= nr_data;
+ update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
- ret = bch2_trans_mark_stripe_alloc_ref(trans,
- &old_s->ptrs[i], sectors, parity);
+ for (i = 0; i < old_s.v->nr_blocks; i++) {
+ ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
+ i, true);
if (ret)
return ret;
}
@@ -2065,21 +2059,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf *a;
struct bch_extent_ptr ptr = {
.dev = ca->dev_idx,
.offset = bucket_to_sector(ca, b),
};
int ret = 0;
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
if (u.data_type && u.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
@@ -2112,10 +2101,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
u.data_type = type;
u.dirty_sectors = sectors;
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i, 0);
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -2422,13 +2409,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
- free_percpu(ca->usage[0]);
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ free_percpu(ca->usage[i]);
+ kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+ unsigned i;
+
+ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+ if (!ca->usage_base)
return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage[i])
+ return -ENOMEM;
+ }
+
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
}
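With usage_base plus one percpu accumulator per journal buffer, bch2_dev_usage_read() above sums everything under the usage seqcount and retries if the accumulators were folded into the base mid-read (bch2_fs_usage_acc_to_base() does the folding). A single-threaded toy model of that pattern, with the percpu machinery and memory barriers elided and every name invented for illustration:

#include <stdint.h>
#include <string.h>

#define NR_BUFS 2			/* stand-in for JOURNAL_BUF_NR */

struct usage_ex { uint64_t buckets, sectors; };

struct dev_acct_ex {
	unsigned	seq;		/* even = stable, like a seqcount */
	struct usage_ex	base;
	struct usage_ex	delta[NR_BUFS];	/* percpu per journal buf in the real code */
};

/* Writer: fold one journal buffer's deltas into the base. */
static void fold_deltas(struct dev_acct_ex *a, unsigned buf)
{
	a->seq++;					/* enter write section */
	a->base.buckets += a->delta[buf].buckets;
	a->base.sectors += a->delta[buf].sectors;
	memset(&a->delta[buf], 0, sizeof(a->delta[buf]));
	a->seq++;					/* leave write section */
}

/* Reader: sum base + all deltas, retrying if a fold raced with us. */
static struct usage_ex read_usage(const struct dev_acct_ex *a)
{
	struct usage_ex ret;
	unsigned seq, i;

	do {
		seq = a->seq;
		ret = a->base;
		for (i = 0; i < NR_BUFS; i++) {
			ret.buckets += a->delta[i].buckets;
			ret.sectors += a->delta[i].sectors;
		}
	} while ((seq & 1) || seq != a->seq);

	return ret;
}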
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 37346240..6d15c455 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
- return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
{
- struct bucket *g = bucket(ca, b);
-
return g->mark.gen - g->oldest_gen;
}
@@ -169,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
-void bch2_dev_usage_from_buckets(struct bch_fs *);
-
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@@ -214,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
READ_ONCE(c->replicas.nr);
}
+static inline unsigned dev_usage_u64s(void)
+{
+ return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 5fbe940a..404c89a7 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -37,11 +37,12 @@ struct bucket {
const struct bucket_mark mark;
};
- u16 io_time[2];
+ u64 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
- u8 ec_redundancy;
+ u8 stripe_redundancy;
+ u32 stripe;
};
struct bucket_array {
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index 1d1590de..4324cfe7 100644
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
- if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+ if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
- unsigned long now = atomic_long_add_return(sectors, &clock->now);
+ unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned i;
spin_lock(&clock->timer_lock);
- now = atomic_long_read(&clock->now);
+ now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n",
@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock)
{
- atomic_long_set(&clock->now, 0);
+ atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h
index 92c740a4..5fae0012 100644
--- a/libbcachefs/clock_types.h
+++ b/libbcachefs/clock_types.h
@@ -26,7 +26,7 @@ struct io_timer {
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
- atomic_long_t now;
+ atomic64_t now;
u16 __percpu *pcpu_buf;
unsigned max_slop;
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 086897c3..10d55fc8 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ if (!bkey_cmp(k.k->p, POS_MIN))
+ return "stripe at pos 0";
+
if (k.k->p.inode)
return "invalid stripe key";
@@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
+ char buf2[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
+
bch_err_ratelimited(c,
- "stripe checksum error at %u:%u: csum type %u, expected %llx got %llx",
- i, j, v->csum_type,
- want.lo, got.lo);
+ "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
+ (void *) _RET_IP_, i, j, v->csum_type,
+ want.lo, got.lo, buf2);
clear_bit(i, buf->valid);
break;
}
@@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
static void ec_block_endio(struct bio *bio)
{
struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+ struct bch_stripe *v = &ec_bio->buf->key.v;
+ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
@@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(ca->fs,
+ "error %s stripe: stale pointer after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
+
bio_put(&ec_bio->bio);
percpu_ref_put(&ca->io_ref);
closure_put(cl);
@@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c,
static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
- //pr_info("deleting stripe %zu", idx);
return bch2_btree_delete_range(c, BTREE_ID_EC,
POS(0, idx),
POS(0, idx + 1),
@@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
*dst = (struct bch_extent_stripe_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
.block = block,
+ .redundancy = s->key.v.nr_redundant,
.idx = s->key.k.p.offset,
};
}
@@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
if (!ob)
return;
- //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
-
ec = ob->ec;
mutex_lock(&ec->lock);
@@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c,
struct stripe *m;
size_t heap_idx;
u64 stripe_idx;
+ s64 ret = -1;
if (may_create_new_stripe(c))
return -1;
spin_lock(&c->ec_stripes_heap_lock);
for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+ /* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
@@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c,
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
bch2_stripes_heap_del(c, m, stripe_idx);
- spin_unlock(&c->ec_stripes_heap_lock);
- return stripe_idx;
+ ret = stripe_idx;
+ break;
}
}
-
spin_unlock(&c->ec_stripes_heap_lock);
- return -1;
+ return ret;
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 67ba2c21..4a3a3291 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -704,14 +704,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
if (p.ptr.cached)
continue;
- if (p.has_ec) {
- struct stripe *s =
- genradix_ptr(&c->stripes[0], p.ec.idx);
-
- WARN_ON(!s);
- if (s)
- replicas += s->nr_redundant;
- }
+ if (p.has_ec)
+ replicas += p.ec.redundancy;
replicas++;
@@ -734,16 +728,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
durability = max_t(unsigned, durability, ca->mi.durability);
- if (p.has_ec) {
- struct stripe *s =
- genradix_ptr(&c->stripes[0], p.ec.idx);
-
- if (WARN_ON(!s))
- goto out;
+ if (p.has_ec)
+ durability += p.ec.redundancy;
- durability += s->nr_redundant;
- }
-out:
return durability;
}
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index a7c5f5fd..e41f0277 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1121,6 +1121,9 @@ int bch2_fs_journal_init(struct journal *j)
j->entry_u64s_reserved +=
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
+ j->entry_u64s_reserved +=
+ 2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
+
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index eacc9b2c..2abca164 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -5,6 +5,7 @@
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
+#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
@@ -426,6 +427,69 @@ fsck_err:
return ret;
}
+static int journal_entry_validate_clock(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, "invalid journal entry clock: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(clock->rw > 1,
+ c, "invalid journal entry clock: bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+ unsigned dev;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < expected,
+ c, "invalid journal entry dev usage: bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ dev = le32_to_cpu(u->dev);
+
+ if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+ c, "invalid journal entry dev usage: bad dev")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(u->pad,
+ c, "invalid journal entry dev usage: bad pad")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
@@ -937,6 +1001,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+ bch2_replicas_entry_sort(&replicas.e);
+
/*
* If we're mounting in degraded mode - if we didn't read all
* the devices - this is wrong:
@@ -1032,16 +1098,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_devs_mask devs;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
+ unsigned target = c->opts.metadata_target ?:
+ c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
rcu_read_lock();
+retry:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
- &c->rw_devs[BCH_DATA_journal]);
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
@@ -1073,6 +1143,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
+
+ if (replicas < replicas_want && target) {
+ /* Retry from all devices: */
+ target = 0;
+ goto retry;
+ }
done:
rcu_read_unlock();
@@ -1278,6 +1354,9 @@ static void do_journal_write(struct closure *cl)
bio->bi_private = ca;
bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+ ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
if (!JSET_NO_FLUSH(w->data))
bio->bi_opf |= REQ_FUA;
if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
@@ -1348,8 +1427,8 @@ void bch2_journal_write(struct closure *cl)
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
- end = bch2_journal_super_entries_add_common(c, end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end,
+ le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1358,10 +1437,7 @@ void bch2_journal_write(struct closure *cl)
journal_write_compact(jset);
- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
-
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index d0acc1ee..f915b30a 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
data_opts->rewrite_dev = p.ptr.dev;
- if (p.has_ec) {
- struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
-
- data_opts->nr_replicas += m->nr_redundant;
- }
+ if (p.has_ec)
+ data_opts->nr_replicas += p.ec.redundancy;
return DATA_REWRITE;
}
@@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c)
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
- WARN_ON(m.stripe && !g->ec_redundancy);
+ WARN_ON(m.stripe && !g->stripe_redundancy);
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
- .replicas = 1 + g->ec_redundancy,
+ .replicas = 1 + g->stripe_redundancy,
.fragmentation = bucket_sectors_used(m) * (1U << 15)
/ ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),
@@ -301,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last, wait;
+ u64 last, wait;
set_freezable();
@@ -309,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 710a7ee6..d835a853 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -136,6 +136,11 @@ enum opt_type {
OPT_STR(bch2_str_hash_types), \
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \
NULL, "Hash function for directory entries and xattrs")\
+ x(metadata_target, u16, \
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_METADATA_TARGET, 0, \
+ "(target)", "Device or disk group for metadata writes") \
x(foreground_target, u16, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
OPT_FN(bch2_opt_target), \
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index c3373c48..d89920b8 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
- unsigned long io_start;
+ u64 io_start;
long throttle;
set_freezable();
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
(20 - w.dev_most_full_percent),
50);
- if (atomic_long_read(&clock->now) + clock->max_slop <
+ if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
- atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h
index 192c6be2..2f62a643 100644
--- a/libbcachefs/rebalance_types.h
+++ b/libbcachefs/rebalance_types.h
@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
atomic64_t work_unknown_dev;
enum rebalance_state state;
- unsigned long throttled_until_iotime;
+ u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index f470e0e2..7ba098ad 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
+
ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
+ case BCH_JSET_ENTRY_dev_usage: {
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+ unsigned i;
+
+ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
+ ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
+
+ for (i = 0; i < nr_types; i++) {
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+ }
+
+ break;
+ }
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
@@ -847,6 +868,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(bl_entry->end) + 1);
break;
}
+ case BCH_JSET_ENTRY_clock: {
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ atomic64_set(&c->io_clock[clock->rw].now, clock->time);
+ }
}
return ret;
@@ -861,9 +888,6 @@ static int journal_replay_early(struct bch_fs *c,
int ret;
if (clean) {
- c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
@@ -876,9 +900,6 @@ static int journal_replay_early(struct bch_fs *c,
if (i->ignore)
continue;
- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
-
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
@@ -942,13 +963,6 @@ static int verify_superblock_clean(struct bch_fs *c,
return 0;
}
- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock %u doesn't match journal %u after clean shutdown",
- clean->read_clock, j->read_clock);
- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock write clock %u doesn't match journal %u after clean shutdown",
- clean->write_clock, j->write_clock);
-
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index ce8b7355..3970c442 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
-static void replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
@@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
break;
}
- replicas_entry_sort(e);
+ bch2_replicas_entry_sort(e);
}
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
@@ -142,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
for (i = 0; i < devs.nr; i++)
e->devs[e->nr_devs++] = devs.devs[i];
- replicas_entry_sort(e);
+ bch2_replicas_entry_sort(e);
}
static struct bch_replicas_cpu
@@ -197,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
{
- replicas_entry_sort(search);
+ bch2_replicas_entry_sort(search);
return __replicas_entry_idx(&c->replicas, search);
}
@@ -681,7 +681,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
for_each_replicas_entry(sb_r, e) {
dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(dst, e, replicas_entry_bytes(e));
- replicas_entry_sort(dst);
+ bch2_replicas_entry_sort(dst);
}
return 0;
@@ -718,7 +718,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
dst->nr_devs = e->nr_devs;
dst->nr_required = 1;
memcpy(dst->devs, e->devs, e->nr_devs);
- replicas_entry_sort(dst);
+ bch2_replicas_entry_sort(dst);
}
return 0;
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index 8b95164f..a16ef23b 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -5,6 +5,7 @@
#include "eytzinger.h"
#include "replicas_types.h"
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 751efd28..a510a25e 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -963,31 +963,28 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
return ret;
}
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
- memset(entry, 0, u64s * sizeof(u64));
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+ memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = u64s - 1;
-}
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
- entry_init_u64s(entry, u64s);
+ *end = vstruct_next(*end);
+ return entry;
}
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry *entry,
- u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+ struct jset_entry **end,
+ u64 journal_seq)
{
- unsigned i;
+ struct bch_dev *ca;
+ unsigned i, dev;
percpu_down_write(&c->mark_lock);
@@ -1000,58 +997,77 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
-
- entry = vstruct_next(entry);
}
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
-
- entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
- entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+ struct jset_entry_data_usage, entry);
- entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
memcpy(&u->r, e, replicas_entry_bytes(e));
+ }
- entry = vstruct_next(entry);
+ for_each_member_device(ca, c, dev) {
+ unsigned b = sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+ struct jset_entry_dev_usage *u =
+ container_of(jset_entry_init(end, b),
+ struct jset_entry_dev_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_dev_usage;
+ u->dev = cpu_to_le32(dev);
+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+ u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+ }
}
percpu_up_write(&c->mark_lock);
- return entry;
+ for (i = 0; i < 2; i++) {
+ struct jset_entry_clock *clock =
+ container_of(jset_entry_init(end, sizeof(*clock)),
+ struct jset_entry_clock, entry);
+
+ clock->entry.type = BCH_JSET_ENTRY_clock;
+ clock->rw = i;
+ clock->time = atomic64_read(&c->io_clock[i].now);
+ }
}
void bch2_fs_mark_clean(struct bch_fs *c)
@@ -1080,15 +1096,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
- sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
- entry = bch2_journal_super_entries_add_common(c, entry, 0);
+ bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 7a068158..1a35124f 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_clean: */
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
- struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+ struct jset_entry **, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index f3c12d89..ac277df8 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -148,6 +148,22 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, nr = 0, u64s =
+ (sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ nr++;
+ rcu_read_unlock();
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->dev_usage_journal_res, u64s * nr);
+}
+
/* Filesystem RO/RW: */
/*
@@ -174,9 +190,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
- bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
@@ -399,9 +412,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
@@ -779,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
+ bch2_dev_usage_journal_reserve(c);
+
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@@ -1521,6 +1533,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
+
+ bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@@ -1530,19 +1544,6 @@ err:
return ret;
}
-static void dev_usage_clear(struct bch_dev *ca)
-{
- struct bucket_array *buckets;
-
- percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
-
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
- up_read(&ca->bucket_lock);
-}
-
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
@@ -1600,8 +1601,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
- dev_usage_clear(ca);
-
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1655,6 +1654,8 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ bch2_dev_usage_journal_reserve(c);
+
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 80964bdf..f934f12b 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
{
int rw = (private ? 1 : 0);
- return bucket_last_io(c, bucket(ca, b), rw);
+ return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
- return bucket_gc_gen(ca, b);
+ return bucket_gc_gen(bucket(ca, b));
}
static int unsigned_cmp(const void *_l, const void *_r)