-rw-r--r--  .bcachefs_revision                   |    2
-rw-r--r--  cmd_migrate.c                        |    3
-rw-r--r--  libbcachefs/alloc_background.c       |  255
-rw-r--r--  libbcachefs/alloc_background.h       |    9
-rw-r--r--  libbcachefs/alloc_foreground.c       |    2
-rw-r--r--  libbcachefs/bcachefs.h               |    2
-rw-r--r--  libbcachefs/bcachefs_format.h        |   15
-rw-r--r--  libbcachefs/btree_gc.c               |  178
-rw-r--r--  libbcachefs/btree_types.h            |    2
-rw-r--r--  libbcachefs/btree_update.h           |    9
-rw-r--r--  libbcachefs/btree_update_interior.c  |   29
-rw-r--r--  libbcachefs/btree_update_leaf.c      |  103
-rw-r--r--  libbcachefs/buckets.c                |  358
-rw-r--r--  libbcachefs/buckets.h                |   53
-rw-r--r--  libbcachefs/buckets_types.h          |   29
-rw-r--r--  libbcachefs/chardev.c                |    6
-rw-r--r--  libbcachefs/extents.c                |    8
-rw-r--r--  libbcachefs/fifo.h                   |    2
-rw-r--r--  libbcachefs/journal.c                |  446
-rw-r--r--  libbcachefs/journal.h                |   29
-rw-r--r--  libbcachefs/journal_io.c             |  138
-rw-r--r--  libbcachefs/journal_io.h             |    1
-rw-r--r--  libbcachefs/journal_reclaim.c        |  396
-rw-r--r--  libbcachefs/journal_reclaim.h        |    7
-rw-r--r--  libbcachefs/journal_types.h          |   27
-rw-r--r--  libbcachefs/recovery.c               |   15
-rw-r--r--  libbcachefs/replicas.c               |   61
-rw-r--r--  libbcachefs/str_hash.h               |   40
-rw-r--r--  libbcachefs/super-io.c               |    6
-rw-r--r--  libbcachefs/super.c                  |    7
-rw-r--r--  libbcachefs/sysfs.c                  |   14
31 files changed, 1218 insertions, 1034 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 39d11479..6766622b 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-09a546543006b60d44c4c51e7b40cd3ec7837a5e
+75e8a078b85703322fcf558f75a6845c0ef5dbb0
diff --git a/cmd_migrate.c b/cmd_migrate.c
index e9594ab7..4b6ceaa7 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -319,6 +319,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
struct bkey_i_extent *e;
BKEY_PADDED(k) k;
u64 b = sector_to_bucket(ca, physical);
+ struct bucket_mark m;
struct disk_reservation res;
unsigned sectors;
int ret;
@@ -337,7 +338,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
.gen = bucket(ca, b)->mark.gen,
});
- bucket_set_dirty(ca, b);
+ bucket_cmpxchg(bucket(ca, b), m, m.dirty = true);
ret = bch2_disk_reservation_get(c, &res, sectors, 1,
BCH_DISK_RESERVATION_NOFAIL);
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index ce42202f..f246319b 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -128,6 +128,34 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
*p += bytes;
}
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
+{
+ struct bkey_alloc_unpacked ret = { .gen = a->gen };
+ const void *d = a->data;
+ unsigned idx = 0;
+
+#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
+ BCH_ALLOC_FIELDS()
+#undef x
+ return ret;
+}
+
+static void bch2_alloc_pack(struct bkey_i_alloc *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ unsigned idx = 0;
+ void *d = dst->v.data;
+
+ dst->v.fields = 0;
+ dst->v.gen = src.gen;
+
+#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
+ BCH_ALLOC_FIELDS()
+#undef x
+
+ set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v);
+}
+
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
@@ -173,15 +201,24 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
{
const void *d = a->data;
- unsigned idx = 0;
+ unsigned idx = 0, data_type, dirty_sectors, cached_sectors;
+ struct bucket_mark m;
- g->_mark.gen = a->gen;
- g->gen_valid = 1;
g->io_time[READ] = get_alloc_field(a, &d, idx++);
g->io_time[WRITE] = get_alloc_field(a, &d, idx++);
- g->_mark.data_type = get_alloc_field(a, &d, idx++);
- g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++);
- g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
+ data_type = get_alloc_field(a, &d, idx++);
+ dirty_sectors = get_alloc_field(a, &d, idx++);
+ cached_sectors = get_alloc_field(a, &d, idx++);
+ g->oldest_gen = get_alloc_field(a, &d, idx++);
+
+ bucket_cmpxchg(g, m, ({
+ m.gen = a->gen;
+ m.data_type = data_type;
+ m.dirty_sectors = dirty_sectors;
+ m.cached_sectors = cached_sectors;
+ }));
+
+ g->gen_valid = 1;
}
static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
@@ -199,6 +236,7 @@ static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
put_alloc_field(a, &d, idx++, m.data_type);
put_alloc_field(a, &d, idx++, m.dirty_sectors);
put_alloc_field(a, &d, idx++, m.cached_sectors);
+ put_alloc_field(a, &d, idx++, g->oldest_gen);
set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
}
@@ -315,6 +353,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_NOMARK|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (ret)
@@ -358,7 +397,8 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
? 0
: bch2_btree_insert_at(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY,
+ BTREE_INSERT_JOURNAL_REPLAY|
+ BTREE_INSERT_NOMARK,
BTREE_INSERT_ENTRY(&iter, k));
err:
bch2_btree_iter_unlock(&iter);
@@ -824,6 +864,142 @@ static inline long next_alloc_bucket(struct bch_dev *ca)
return -1;
}
+/*
+ * returns sequence number of most recent journal entry that updated this
+ * bucket:
+ */
+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
+{
+ if (m.journal_seq_valid) {
+ u64 journal_seq = atomic64_read(&c->journal.seq);
+ u64 bucket_seq = journal_seq;
+
+ bucket_seq &= ~((u64) U16_MAX);
+ bucket_seq |= m.journal_seq;
+
+ if (bucket_seq > journal_seq)
+ bucket_seq -= 1 << 16;
+
+ return bucket_seq;
+ } else {
+ return 0;
+ }
+}
+
+static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca,
+ struct btree_iter *iter,
+ u64 *journal_seq, unsigned flags)
+{
+#if 0
+ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
+#else
+ /* hack: */
+ __BKEY_PADDED(k, 8) alloc_key;
+#endif
+ struct bkey_i_alloc *a;
+ struct bkey_alloc_unpacked u;
+ struct bucket_mark m;
+ struct bkey_s_c k;
+ bool invalidating_cached_data;
+ size_t b;
+ int ret;
+
+ BUG_ON(!ca->alloc_heap.used ||
+ !ca->alloc_heap.data[0].nr);
+ b = ca->alloc_heap.data[0].bucket;
+
+ /* first, put on free_inc and mark as owned by allocator: */
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ spin_lock(&c->freelist_lock);
+
+ verify_not_on_freelist(c, ca, b);
+
+ BUG_ON(!fifo_push(&ca->free_inc, b));
+
+ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+ m = bucket(ca, b)->mark;
+
+ spin_unlock(&c->freelist_lock);
+ percpu_up_read_preempt_enable(&c->mark_lock);
+
+ bch2_btree_iter_cond_resched(iter);
+
+ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
+
+ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
+retry:
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = btree_iter_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k && k.k->type == KEY_TYPE_alloc)
+ u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+ else
+ memset(&u, 0, sizeof(u));
+
+ invalidating_cached_data = u.cached_sectors != 0;
+
+ //BUG_ON(u.dirty_sectors);
+ u.data_type = 0;
+ u.dirty_sectors = 0;
+ u.cached_sectors = 0;
+ u.read_time = c->bucket_clock[READ].hand;
+ u.write_time = c->bucket_clock[WRITE].hand;
+ u.gen++;
+
+ a = bkey_alloc_init(&alloc_key.k);
+ a->k.p = iter->pos;
+ bch2_alloc_pack(a, u);
+
+ ret = bch2_btree_insert_at(c, NULL,
+ invalidating_cached_data ? journal_seq : NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE|
+ flags,
+ BTREE_INSERT_ENTRY(iter, &a->k_i));
+ if (ret == -EINTR)
+ goto retry;
+
+ if (!ret) {
+ /* remove from alloc_heap: */
+ struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+
+ top->bucket++;
+ top->nr--;
+
+ if (!top->nr)
+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+
+ /*
+ * Make sure we flush the last journal entry that updated this
+ * bucket (i.e. deleting the last reference) before writing to
+ * this bucket again:
+ */
+ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+ } else {
+ size_t b2;
+
+ /* remove from free_inc: */
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ spin_lock(&c->freelist_lock);
+
+ bch2_mark_alloc_bucket(c, ca, b, false,
+ gc_pos_alloc(c, NULL), 0);
+
+ BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
+ BUG_ON(b != b2);
+
+ spin_unlock(&c->freelist_lock);
+ percpu_up_read_preempt_enable(&c->mark_lock);
+ }
+
+ return ret;
+}
+
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t bucket, u64 *flush_seq)
{
@@ -844,18 +1020,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
percpu_up_read_preempt_enable(&c->mark_lock);
- if (m.journal_seq_valid) {
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u64 bucket_seq = journal_seq;
-
- bucket_seq &= ~((u64) U16_MAX);
- bucket_seq |= m.journal_seq;
-
- if (bucket_seq > journal_seq)
- bucket_seq -= 1 << 16;
-
- *flush_seq = max(*flush_seq, bucket_seq);
- }
+ *flush_seq = max(*flush_seq, bucket_journal_seq(c, m));
return m.cached_sectors != 0;
}
@@ -868,7 +1033,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
struct btree_iter iter;
u64 journal_seq = 0;
int ret = 0;
- long b;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@@ -876,14 +1040,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
/* Only use nowait if we've already invalidated at least one bucket: */
while (!ret &&
!fifo_full(&ca->free_inc) &&
- (b = next_alloc_bucket(ca)) >= 0) {
- bool must_flush =
- bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
-
- ret = __bch2_alloc_write_key(c, ca, b, &iter,
- must_flush ? &journal_seq : NULL,
- !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
- }
+ ca->alloc_heap.used)
+ ret = bch2_invalidate_one_bucket2(c, ca, &iter, &journal_seq,
+ BTREE_INSERT_GC_LOCK_HELD|
+ (!fifo_empty(&ca->free_inc)
+ ? BTREE_INSERT_NOWAIT : 0));
bch2_btree_iter_unlock(&iter);
@@ -1305,24 +1466,16 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return 0;
}
-static void flush_held_btree_writes(struct bch_fs *c)
+static bool flush_done(struct bch_fs *c)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
- bool nodes_blocked;
+ bool nodes_unwritten;
size_t i;
- struct closure cl;
-
- closure_init_stack(&cl);
-
- clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
- pr_debug("flushing dirty btree nodes");
cond_resched();
- closure_wait(&c->btree_interior_update_wait, &cl);
-
- nodes_blocked = false;
+ nodes_unwritten = false;
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
@@ -1334,24 +1487,25 @@ again:
six_unlock_read(&b->lock);
goto again;
} else {
- nodes_blocked = true;
+ nodes_unwritten = true;
}
}
rcu_read_unlock();
- if (c->btree_roots_dirty)
+ if (c->btree_roots_dirty) {
bch2_journal_meta(&c->journal);
-
- if (nodes_blocked) {
- closure_sync(&cl);
goto again;
}
- closure_wake_up(&c->btree_interior_update_wait);
- closure_sync(&cl);
+ return !nodes_unwritten &&
+ !bch2_btree_interior_updates_nr_pending(c);
+}
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+static void flush_held_btree_writes(struct bch_fs *c)
+{
+ clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+
+ closure_wait_event(&c->btree_interior_update_wait, flush_done(c));
}
static void allocator_start_issue_discards(struct bch_fs *c)
@@ -1470,7 +1624,6 @@ not_enough:
&journal_seq);
fifo_push(&ca->free[RESERVE_BTREE], bu);
- bucket_set_dirty(ca, bu);
}
}
@@ -1517,7 +1670,6 @@ int bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
- bool wrote;
int ret;
down_read(&c->gc_lock);
@@ -1536,8 +1688,7 @@ int bch2_fs_allocator_start(struct bch_fs *c)
}
set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
- return bch2_alloc_write(c, false, &wrote);
+ return 0;
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 26561b3b..65e9b373 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -5,6 +5,15 @@
#include "alloc_types.h"
#include "debug.h"
+struct bkey_alloc_unpacked {
+ u8 gen;
+#define x(_name, _bits) u##_bits _name;
+ BCH_ALLOC_FIELDS()
+#undef x
+};
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *);
+
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index f2f9015d..6568e8ac 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -723,7 +723,7 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
{
u64 stranded = c->write_points_nr * c->bucket_size_max;
- u64 free = bch2_fs_sectors_free(c);
+ u64 free = bch2_fs_usage_read_short(c).free;
return stranded * factor > free;
}
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 245d8322..052ec263 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -396,8 +396,6 @@ struct bch_dev {
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_nouse;
unsigned long *buckets_written;
- /* most out of date gen in the btree */
- u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index d020cf74..56bf69eb 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -821,11 +821,12 @@ struct bch_alloc {
} __attribute__((packed, aligned(8)));
#define BCH_ALLOC_FIELDS() \
- x(read_time, 2) \
- x(write_time, 2) \
- x(data_type, 1) \
- x(dirty_sectors, 2) \
- x(cached_sectors, 2)
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8)
enum {
#define x(name, bytes) BCH_ALLOC_FIELD_##name,
@@ -835,12 +836,12 @@ enum {
};
static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes,
+#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
BCH_ALLOC_FIELDS()
#undef x
};
-#define x(name, bytes) + bytes
+#define x(name, bits) + (bits / 8)
static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
DIV_ROUND_UP(offsetof(struct bch_alloc, data)
BCH_ALLOC_FIELDS(), sizeof(u64));
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index b1f5e8b1..5d6f6364 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -138,24 +138,24 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- size_t b = PTR_BUCKET_NR(ca, ptr);
- struct bucket *g = PTR_BUCKET(ca, ptr);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
+ struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
if (mustfix_fsck_err_on(!g->gen_valid, c,
"found ptr with missing gen in alloc btree,\n"
"type %u gen %u",
k.k->type, ptr->gen)) {
- g->_mark.gen = ptr->gen;
- g->gen_valid = 1;
- bucket_set_dirty(ca, b);
+ g2->_mark.gen = g->_mark.gen = ptr->gen;
+ g2->_mark.dirty = g->_mark.dirty = true;
+ g2->gen_valid = g->gen_valid = true;
}
if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
"%u ptr gen in the future: %u > %u",
k.k->type, ptr->gen, g->mark.gen)) {
- g->_mark.gen = ptr->gen;
- g->gen_valid = 1;
- bucket_set_dirty(ca, b);
+ g2->_mark.gen = g->_mark.gen = ptr->gen;
+ g2->_mark.dirty = g->_mark.dirty = true;
+ g2->gen_valid = g->gen_valid = true;
set_bit(BCH_FS_FIXED_GENS, &c->flags);
}
}
@@ -163,10 +163,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- size_t b = PTR_BUCKET_NR(ca, ptr);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
- if (gen_after(ca->oldest_gens[b], ptr->gen))
- ca->oldest_gens[b] = ptr->gen;
+ if (gen_after(g->oldest_gen, ptr->gen))
+ g->oldest_gen = ptr->gen;
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
@@ -230,12 +230,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
bch2_verify_btree_nr_keys(b);
+ gc_pos_set(c, gc_pos_btree_node(b));
+
ret = btree_gc_mark_node(c, b, &max_stale, initial);
if (ret)
break;
- gc_pos_set(c, gc_pos_btree_node(b));
-
if (!initial) {
if (max_stale > 64)
bch2_btree_node_rewrite(c, &iter,
@@ -483,88 +483,38 @@ static void bch2_gc_free(struct bch_fs *c)
percpu_up_write(&c->mark_lock);
}
-static void bch2_gc_done_nocheck(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
-
- {
- struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
- struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
- struct stripe *dst, *src;
-
- c->ec_stripes_heap.used = 0;
-
- while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
- (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
- *dst = *src;
-
- if (dst->alive)
- bch2_stripes_heap_insert(c, dst, dst_iter.pos);
-
- genradix_iter_advance(&dst_iter, &c->stripes[0]);
- genradix_iter_advance(&src_iter, &c->stripes[1]);
- }
- }
-
- for_each_member_device(ca, c, i) {
- struct bucket_array *src = __bucket_array(ca, 1);
-
- memcpy(__bucket_array(ca, 0), src,
- sizeof(struct bucket_array) +
- sizeof(struct bucket) * src->nbuckets);
- };
-
- for_each_member_device(ca, c, i) {
- unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
- struct bch_dev_usage *dst = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
-
- *dst = *src;
- }
-
- {
- unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
- c->replicas.nr;
- struct bch_fs_usage *dst = (void *)
- bch2_acc_percpu_u64s((void *) c->usage[0], nr);
- struct bch_fs_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) c->usage[1], nr);
-
- memcpy(&dst->s.gc_start[0],
- &src->s.gc_start[0],
- nr * sizeof(u64) - offsetof(typeof(*dst), s.gc_start));
- }
-}
-
static void bch2_gc_done(struct bch_fs *c, bool initial)
{
struct bch_dev *ca;
+ bool verify = !initial ||
+ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO));
unsigned i;
#define copy_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
- bch_err(c, _msg ": got %llu, should be %llu, fixing" \
- , ##__VA_ARGS__, dst->_f, src->_f); \
+ if (verify) \
+ bch_err(c, _msg ": got %llu, should be %llu, fixing"\
+ , ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
- bch_err_ratelimited(c, "stripe %zu has wrong "_msg \
- ": got %u, should be %u, fixing", \
- dst_iter.pos, ##__VA_ARGS__, \
- dst->_f, src->_f); \
+ if (verify) \
+ bch_err_ratelimited(c, "stripe %zu has wrong "_msg\
+ ": got %u, should be %u, fixing", \
+ dst_iter.pos, ##__VA_ARGS__, \
+ dst->_f, src->_f); \
dst->_f = src->_f; \
dst->dirty = true; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
- bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
- ": got %u, should be %u, fixing", \
- i, b, dst->b[b].mark._f, src->b[b].mark._f); \
+ if (verify) \
+ bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
+ ": got %u, should be %u, fixing", i, b, \
+ dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
+ dst->b[b]._mark.dirty = true; \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
@@ -573,12 +523,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
percpu_down_write(&c->mark_lock);
- if (initial &&
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) {
- bch2_gc_done_nocheck(c);
- goto out;
- }
-
{
struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
@@ -629,6 +573,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
+
+ if (dst->b[b].oldest_gen != src->b[b].oldest_gen) {
+ dst->b[b].oldest_gen = src->b[b].oldest_gen;
+ dst->b[b]._mark.dirty = true;
+ }
}
};
@@ -641,44 +590,46 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
- copy_dev_field(buckets[b],
- "buckets[%s]", bch2_data_types[b]);
- copy_dev_field(buckets_alloc, "buckets_alloc");
- copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets[b], "buckets[%s]",
+ bch2_data_types[b]);
+ copy_dev_field(buckets_alloc, "buckets_alloc");
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
for (b = 0; b < BCH_DATA_NR; b++)
- copy_dev_field(sectors[b],
- "sectors[%s]", bch2_data_types[b]);
- copy_dev_field(sectors_fragmented,
- "sectors_fragmented");
+ copy_dev_field(sectors[b], "sectors[%s]",
+ bch2_data_types[b]);
+ copy_dev_field(sectors_fragmented, "sectors_fragmented");
}
{
- unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
- c->replicas.nr;
+ unsigned nr = fs_usage_u64s(c);
struct bch_fs_usage *dst = (void *)
bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
bch2_acc_percpu_u64s((void *) c->usage[1], nr);
- copy_fs_field(s.hidden, "hidden");
- copy_fs_field(s.data, "data");
- copy_fs_field(s.cached, "cached");
- copy_fs_field(s.reserved, "reserved");
- copy_fs_field(s.nr_inodes, "nr_inodes");
+ copy_fs_field(hidden, "hidden");
+ copy_fs_field(data, "data");
+ copy_fs_field(cached, "cached");
+ copy_fs_field(reserved, "reserved");
+ copy_fs_field(nr_inodes, "nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
"persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
- /*
- * XXX: print out replicas entry
- */
- copy_fs_field(data[i], "data[%i]", i);
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+ char buf[80];
+
+ bch2_replicas_entry_to_text(&PBUF(buf), e);
+
+ copy_fs_field(replicas[i], "%s", buf);
}
}
-out:
+
percpu_up_write(&c->mark_lock);
#undef copy_fs_field
@@ -693,19 +644,18 @@ static int bch2_gc_start(struct bch_fs *c)
struct bch_dev *ca;
unsigned i;
+ percpu_down_write(&c->mark_lock);
+
/*
* indicate to stripe code that we need to allocate for the gc stripes
* radix tree, too
*/
gc_pos_set(c, gc_phase(GC_PHASE_START));
- percpu_down_write(&c->mark_lock);
BUG_ON(c->usage[1]);
- c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) +
- sizeof(u64) * c->replicas.nr,
- sizeof(u64),
- GFP_KERNEL);
+ c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
percpu_up_write(&c->mark_lock);
if (!c->usage[1])
@@ -740,8 +690,12 @@ static int bch2_gc_start(struct bch_fs *c)
dst->first_bucket = src->first_bucket;
dst->nbuckets = src->nbuckets;
- for (b = 0; b < src->nbuckets; b++)
- dst->b[b]._mark.gen = src->b[b].mark.gen;
+ for (b = 0; b < src->nbuckets; b++) {
+ dst->b[b]._mark.gen =
+ dst->b[b].oldest_gen =
+ src->b[b].mark.gen;
+ dst->b[b].gen_valid = src->b[b].gen_valid;
+ }
};
percpu_up_write(&c->mark_lock);
@@ -800,6 +754,8 @@ out:
if (iter++ <= 2) {
bch_info(c, "Fixed gens, restarting mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+ bch2_gc_free(c);
goto again;
}
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 18596dc8..b38722da 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -455,6 +455,7 @@ static inline bool btree_node_is_extents(struct btree *b)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
switch (type) {
+ case BKEY_TYPE_ALLOC:
case BKEY_TYPE_BTREE:
case BKEY_TYPE_EXTENTS:
case BKEY_TYPE_INODES:
@@ -489,7 +490,6 @@ enum btree_insert_ret {
/* leaf node needs to be split */
BTREE_INSERT_BTREE_NODE_FULL,
BTREE_INSERT_ENOSPC,
- BTREE_INSERT_NEED_GC_LOCK,
BTREE_INSERT_NEED_MARK_REPLICAS,
};
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 4bd07258..faacde9a 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -81,6 +81,7 @@ enum {
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
+ __BTREE_INSERT_NOMARK,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
@@ -107,12 +108,12 @@ enum {
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
-/*
- * Insert is for journal replay: don't get journal reservations, or mark extents
- * (bch_mark_key)
- */
+/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
+/* Don't call bch2_mark_key: */
+#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
+
/* Don't block on allocation failure (for new btree nodes: */
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 33b5cf40..b1b858de 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -483,7 +483,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
struct btree *b;
struct disk_reservation disk_res = { 0, 0 };
unsigned sectors = nr_nodes * c->opts.btree_node_size;
- int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
+ int ret, disk_res_flags = 0;
if (flags & BTREE_INSERT_NOFAIL)
disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
@@ -1086,8 +1086,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&old->key),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
- gc_pos_btree_root(b->btree_id));
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1188,8 +1187,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
bkey_disassemble(b, k, &tmp),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
- gc_pos_btree_node(b));
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1564,7 +1562,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
closure_init_stack(&cl);
/* Hack, because gc and splitting nodes doesn't mix yet: */
- if (!down_read_trylock(&c->gc_lock)) {
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
+ !down_read_trylock(&c->gc_lock)) {
if (flags & BTREE_INSERT_NOUNLOCK)
return -EINTR;
@@ -1607,7 +1606,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
*/
__bch2_btree_iter_downgrade(iter, 1);
out:
- up_read(&c->gc_lock);
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
}
@@ -1685,7 +1685,8 @@ retry:
}
/* We're changing btree topology, doesn't mix with gc: */
- if (!down_read_trylock(&c->gc_lock))
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
+ !down_read_trylock(&c->gc_lock))
goto err_cycle_gc_lock;
if (!bch2_btree_iter_upgrade(iter, U8_MAX,
@@ -1745,7 +1746,8 @@ retry:
bch2_btree_update_done(as);
- up_read(&c->gc_lock);
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
out:
bch2_btree_iter_verify_locks(iter);
@@ -1776,7 +1778,8 @@ err_cycle_gc_lock:
err_unlock:
six_unlock_intent(&m->lock);
- up_read(&c->gc_lock);
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
err:
BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
@@ -1942,8 +1945,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
c->opts.btree_node_size *
bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
- BCH_DISK_RESERVATION_NOFAIL|
- BCH_DISK_RESERVATION_GC_LOCK_HELD);
+ BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
parent = btree_node_parent(iter, b);
@@ -1989,8 +1991,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
- gc_pos_btree_root(b->btree_id));
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 0df894fc..da8c6987 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -415,6 +415,25 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
btree_iter_cmp(l.iter, r.iter);
}
+static bool btree_trans_relock(struct btree_insert *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_iter(trans, i)
+ return bch2_btree_iter_relock(i->iter);
+ return true;
+}
+
+static void btree_trans_unlock(struct btree_insert *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_iter(trans, i) {
+ bch2_btree_iter_unlock(i->iter);
+ break;
+ }
+}
+
/* Normal update interface: */
static enum btree_insert_ret
@@ -466,49 +485,12 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
struct btree_iter *linked;
unsigned u64s;
int ret;
-
+retry:
trans_for_each_iter(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
- /* reserve space for deferred updates */
- __trans_for_each_entry(trans, i, i->deferred) {
-
- }
-
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- u64s = 0;
- trans_for_each_entry(trans, i)
- u64s += jset_u64s(i->k->k.u64s);
-
- while ((ret = bch2_journal_res_get(&c->journal,
- &trans->journal_res, u64s,
- JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
- struct btree_iter *iter = NULL;
-
- trans_for_each_iter(trans, i)
- iter = i->iter;
-
- if (iter)
- bch2_btree_iter_unlock(iter);
-
- ret = bch2_journal_res_get(&c->journal,
- &trans->journal_res, u64s,
- JOURNAL_RES_GET_CHECK);
- if (ret)
- return ret;
-
- if (iter && !bch2_btree_iter_relock(iter)) {
- trans_restart(" (iter relock after journal res get blocked)");
- return -EINTR;
- }
- }
-
- if (ret)
- return ret;
- }
-
multi_lock_write(c, trans);
if (race_fault()) {
@@ -536,6 +518,36 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
}
}
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ u64s = 0;
+ trans_for_each_entry(trans, i)
+ u64s += jset_u64s(i->k->k.u64s);
+
+ ret = bch2_journal_res_get(&c->journal,
+ &trans->journal_res, u64s,
+ JOURNAL_RES_GET_NONBLOCK);
+ if (likely(!ret))
+ goto got_journal_res;
+ if (ret != -EAGAIN)
+ goto out;
+
+ multi_unlock_write(trans);
+ btree_trans_unlock(trans);
+
+ ret = bch2_journal_res_get(&c->journal,
+ &trans->journal_res, u64s,
+ JOURNAL_RES_GET_CHECK);
+ if (ret)
+ return ret;
+
+ if (!btree_trans_relock(trans)) {
+ trans_restart(" (iter relock after journal res get blocked)");
+ return -EINTR;
+ }
+
+ goto retry;
+ }
+got_journal_res:
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
trans_for_each_entry(trans, i)
@@ -623,6 +635,9 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
/* for the sake of sanity: */
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
+ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&c->gc_lock);
+
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
trans_for_each_entry(trans, i)
@@ -715,18 +730,6 @@ err:
ret = -EINTR;
}
break;
- case BTREE_INSERT_NEED_GC_LOCK:
- ret = -EINTR;
-
- if (!down_read_trylock(&c->gc_lock)) {
- if (flags & BTREE_INSERT_NOUNLOCK)
- goto out;
-
- bch2_btree_iter_unlock(trans->entries[0].iter);
- down_read(&c->gc_lock);
- }
- up_read(&c->gc_lock);
- break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 9f4872a9..377a8b0f 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -116,14 +116,14 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
- unsigned i, nr;
+ unsigned i;
percpu_down_write(&c->mark_lock);
- nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr;
- usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr);
+ usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0],
+ fs_usage_u64s(c));
for (i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->s.reserved += usage->persistent_reserved[i];
+ usage->reserved += usage->persistent_reserved[i];
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
@@ -132,10 +132,10 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
switch (e->data_type) {
case BCH_DATA_BTREE:
case BCH_DATA_USER:
- usage->s.data += usage->data[i];
+ usage->data += usage->replicas[i];
break;
case BCH_DATA_CACHED:
- usage->s.cached += usage->data[i];
+ usage->cached += usage->replicas[i];
break;
}
}
@@ -143,44 +143,38 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
percpu_up_write(&c->mark_lock);
}
-#define bch2_usage_read_raw(_stats) \
-({ \
- typeof(*this_cpu_ptr(_stats)) _acc; \
- \
- memset(&_acc, 0, sizeof(_acc)); \
- acc_u64s_percpu((u64 *) &_acc, \
- (u64 __percpu *) _stats, \
- sizeof(_acc) / sizeof(u64)); \
- \
- _acc; \
-})
-
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
- return bch2_usage_read_raw(ca->usage[0]);
+ struct bch_dev_usage ret;
+
+ memset(&ret, 0, sizeof(ret));
+ acc_u64s_percpu((u64 *) &ret,
+ (u64 __percpu *) ca->usage[0],
+ sizeof(ret) / sizeof(u64));
+
+ return ret;
}
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
struct bch_fs_usage *ret;
- unsigned nr = READ_ONCE(c->replicas.nr);
+ unsigned v, u64s = fs_usage_u64s(c);
retry:
- ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
+ ret = kzalloc(u64s * sizeof(u64), GFP_NOFS);
if (unlikely(!ret))
return NULL;
percpu_down_read_preempt_disable(&c->mark_lock);
- if (unlikely(nr < c->replicas.nr)) {
- nr = c->replicas.nr;
+ v = fs_usage_u64s(c);
+ if (unlikely(u64s != v)) {
+ u64s = v;
percpu_up_read_preempt_enable(&c->mark_lock);
kfree(ret);
goto retry;
}
- acc_u64s_percpu((u64 *) ret,
- (u64 __percpu *) c->usage[0],
- sizeof(*ret) / sizeof(u64) + nr);
+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
return ret;
}
@@ -197,27 +191,44 @@ static u64 avail_factor(u64 r)
return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
{
- return min(fs_usage.s.hidden +
- fs_usage.s.data +
- reserve_factor(fs_usage.s.reserved +
- fs_usage.s.online_reserved),
+ return min(fs_usage->hidden +
+ fs_usage->data +
+ reserve_factor(fs_usage->reserved +
+ fs_usage->online_reserved),
c->capacity);
}
+static struct bch_fs_usage_short
+__bch2_fs_usage_read_short(struct bch_fs *c)
+{
+ struct bch_fs_usage_short ret;
+ u64 data, reserved;
+
+ ret.capacity = c->capacity -
+ percpu_u64_get(&c->usage[0]->hidden);
+
+ data = percpu_u64_get(&c->usage[0]->data);
+ reserved = percpu_u64_get(&c->usage[0]->reserved) +
+ percpu_u64_get(&c->usage[0]->online_reserved);
+
+ ret.used = min(ret.capacity, data + reserve_factor(reserved));
+ ret.free = ret.capacity - ret.used;
+
+ ret.nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
+
+ return ret;
+}
+
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
- struct bch_fs_usage_summarized usage =
- bch2_usage_read_raw(&c->usage[0]->s);
struct bch_fs_usage_short ret;
- ret.capacity = READ_ONCE(c->capacity) - usage.hidden;
- ret.used = min(ret.capacity, usage.data +
- reserve_factor(usage.reserved +
- usage.online_reserved));
- ret.nr_inodes = usage.nr_inodes;
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ ret = __bch2_fs_usage_read_short(c);
+ percpu_up_read_preempt_enable(&c->mark_lock);
return ret;
}
@@ -254,10 +265,9 @@ static bool bucket_became_unavailable(struct bucket_mark old,
int bch2_fs_usage_apply(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
- struct disk_reservation *disk_res,
- struct gc_pos gc_pos)
+ struct disk_reservation *disk_res)
{
- s64 added = fs_usage->s.data + fs_usage->s.reserved;
+ s64 added = fs_usage->data + fs_usage->reserved;
s64 should_not_have_added;
int ret = 0;
@@ -277,19 +287,11 @@ int bch2_fs_usage_apply(struct bch_fs *c,
if (added > 0) {
disk_res->sectors -= added;
- fs_usage->s.online_reserved -= added;
+ fs_usage->online_reserved -= added;
}
acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
- (u64 *) fs_usage,
- sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
-
- if (gc_visited(c, gc_pos)) {
- BUG_ON(!c->usage[1]);
- acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
- (u64 *) fs_usage,
- sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
- }
+ (u64 *) fs_usage, fs_usage_u64s(c));
return ret;
}
@@ -300,7 +302,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
int nr, s64 size)
{
if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
- fs_usage->s.hidden += size;
+ fs_usage->hidden += size;
dev_usage->buckets[type] += nr;
}
@@ -384,10 +386,10 @@ static inline void update_replicas(struct bch_fs *c,
BUG_ON(!sectors);
if (r->data_type == BCH_DATA_CACHED)
- fs_usage->s.cached += sectors;
+ fs_usage->cached += sectors;
else
- fs_usage->s.data += sectors;
- fs_usage->data[idx] += sectors;
+ fs_usage->data += sectors;
+ fs_usage->replicas[idx] += sectors;
}
static inline void update_cached_sectors(struct bch_fs *c,
@@ -401,15 +403,28 @@ static inline void update_cached_sectors(struct bch_fs *c,
update_replicas(c, fs_usage, &r.e, sectors);
}
-static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *old,
- bool gc)
+#define do_mark_fn(fn, c, pos, flags, ...) \
+({ \
+ int gc, ret = 0; \
+ \
+ percpu_rwsem_assert_held(&c->mark_lock); \
+ \
+ for (gc = 0; gc < 2 && !ret; gc++) \
+ if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \
+ (gc && gc_visited(c, pos))) \
+ ret = fn(c, __VA_ARGS__, gc); \
+ ret; \
+})
+
+static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, struct bucket_mark *ret,
+ bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
- struct bucket_mark new;
+ struct bucket_mark old, new;
- *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
BUG_ON(!is_available_bucket(new));
new.owned_by_allocator = true;
@@ -420,26 +435,29 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
new.gen++;
}));
- if (old->cached_sectors)
+ if (old.cached_sectors)
update_cached_sectors(c, fs_usage, ca->dev_idx,
- -old->cached_sectors);
+ -((s64) old.cached_sectors));
+
+ if (!gc)
+ *ret = old;
+ return 0;
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
- percpu_rwsem_assert_held(&c->mark_lock);
-
- __bch2_invalidate_bucket(c, ca, b, old, false);
+ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
+ ca, b, old);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
}
-static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator,
- bool gc)
+static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, bool owned_by_allocator,
+ bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
@@ -451,20 +469,70 @@ static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
+
+ return 0;
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
- percpu_rwsem_assert_held(&c->mark_lock);
+ do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
+ ca, b, owned_by_allocator);
+}
- if (!(flags & BCH_BUCKET_MARK_GC))
- __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
+static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
+ bool inserting,
+ struct bch_fs_usage *fs_usage,
+ unsigned journal_seq, unsigned flags,
+ bool gc)
+{
+ struct bkey_alloc_unpacked u;
+ struct bch_dev *ca;
+ struct bucket *g;
+ struct bucket_mark old, m;
+
+ if (!inserting)
+ return 0;
+
+ /*
+ * alloc btree is read in by bch2_alloc_read, not gc:
+ */
+ if (flags & BCH_BUCKET_MARK_GC)
+ return 0;
+
+ u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = __bucket(ca, k.k->p.offset, gc);
+
+ /*
+ * this should currently only be getting called from the bucket
+ * invalidate path:
+ */
+ BUG_ON(u.dirty_sectors);
+ BUG_ON(u.cached_sectors);
+ BUG_ON(!g->mark.owned_by_allocator);
+
+ old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
+ m.gen = u.gen;
+ m.data_type = u.data_type;
+ m.dirty_sectors = u.dirty_sectors;
+ m.cached_sectors = u.cached_sectors;
+ }));
- if ((flags & BCH_BUCKET_MARK_GC) ||
- gc_visited(c, pos))
- __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
+ g->io_time[READ] = u.read_time;
+ g->io_time[WRITE] = u.write_time;
+ g->oldest_gen = u.oldest_gen;
+ g->gen_valid = 1;
+
+ if (old.cached_sectors) {
+ update_cached_sectors(c, fs_usage, ca->dev_idx,
+ -old.cached_sectors);
+ trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
+ old.cached_sectors);
+ }
+
+ return 0;
}
#define checked_add(a, b) \
@@ -474,9 +542,9 @@ do { \
BUG_ON((a) != _res); \
} while (0)
-static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type type,
- unsigned sectors, bool gc)
+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type type,
+ unsigned sectors, bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
@@ -490,6 +558,8 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
+
+ return 0;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -501,15 +571,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
type != BCH_DATA_JOURNAL);
if (likely(c)) {
- percpu_rwsem_assert_held(&c->mark_lock);
-
- if (!(flags & BCH_BUCKET_MARK_GC))
- __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
- false);
- if ((flags & BCH_BUCKET_MARK_GC) ||
- gc_visited(c, pos))
- __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
- true);
+ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
+ ca, b, type, sectors);
} else {
struct bucket *g;
struct bucket_mark new;
@@ -553,7 +616,7 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
* loop, to avoid racing with the start of gc clearing all the marks - GC does
* that with the gc pos seqlock held.
*/
-static void bch2_mark_pointer(struct bch_fs *c,
+static bool bch2_mark_pointer(struct bch_fs *c,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
@@ -581,7 +644,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
EBUG_ON(!p.ptr.cached &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
- return;
+ return true;
}
if (!p.ptr.cached)
@@ -612,6 +675,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
+
+ return false;
}
static int bch2_mark_stripe_ptr(struct bch_fs *c,
@@ -694,13 +759,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 disk_sectors = data_type == BCH_DATA_BTREE
? sectors
: ptr_disk_sectors_delta(p, sectors);
-
- bch2_mark_pointer(c, p, disk_sectors, data_type,
- fs_usage, journal_seq, flags, gc);
+ bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
+ fs_usage, journal_seq, flags, gc);
if (p.ptr.cached) {
- update_cached_sectors(c, fs_usage, p.ptr.dev,
- disk_sectors);
+ if (disk_sectors && !stale)
+ update_cached_sectors(c, fs_usage, p.ptr.dev,
+ disk_sectors);
} else if (!p.ec_nr) {
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
@@ -826,30 +891,31 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
unsigned journal_seq, unsigned flags,
bool gc)
{
- int ret = 0;
+ if (!fs_usage || gc)
+ fs_usage = this_cpu_ptr(c->usage[gc]);
switch (k.k->type) {
+ case KEY_TYPE_alloc:
+ return bch2_mark_alloc(c, k, inserting,
+ fs_usage, journal_seq, flags, gc);
case KEY_TYPE_btree_ptr:
- ret = bch2_mark_extent(c, k, inserting
- ? c->opts.btree_node_size
- : -c->opts.btree_node_size,
- BCH_DATA_BTREE,
- fs_usage, journal_seq, flags, gc);
- break;
+ return bch2_mark_extent(c, k, inserting
+ ? c->opts.btree_node_size
+ : -c->opts.btree_node_size,
+ BCH_DATA_BTREE,
+ fs_usage, journal_seq, flags, gc);
case KEY_TYPE_extent:
- ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
- fs_usage, journal_seq, flags, gc);
- break;
+ return bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
+ fs_usage, journal_seq, flags, gc);
case KEY_TYPE_stripe:
- ret = bch2_mark_stripe(c, k, inserting,
- fs_usage, journal_seq, flags, gc);
- break;
+ return bch2_mark_stripe(c, k, inserting,
+ fs_usage, journal_seq, flags, gc);
case KEY_TYPE_inode:
if (inserting)
- fs_usage->s.nr_inodes++;
+ fs_usage->nr_inodes++;
else
- fs_usage->s.nr_inodes--;
- break;
+ fs_usage->nr_inodes--;
+ return 0;
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
@@ -857,15 +923,13 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
replicas = clamp_t(unsigned, replicas, 1,
ARRAY_SIZE(fs_usage->persistent_reserved));
- fs_usage->s.reserved += sectors;
+ fs_usage->reserved += sectors;
fs_usage->persistent_reserved[replicas - 1] += sectors;
- break;
+ return 0;
}
default:
- break;
+ return 0;
}
-
- return ret;
}
int bch2_mark_key_locked(struct bch_fs *c,
@@ -875,26 +939,9 @@ int bch2_mark_key_locked(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
- int ret;
-
- if (!(flags & BCH_BUCKET_MARK_GC)) {
- ret = __bch2_mark_key(c, k, inserting, sectors,
- fs_usage ?: this_cpu_ptr(c->usage[0]),
- journal_seq, flags, false);
- if (ret)
- return ret;
- }
-
- if ((flags & BCH_BUCKET_MARK_GC) ||
- gc_visited(c, pos)) {
- ret = __bch2_mark_key(c, k, inserting, sectors,
- this_cpu_ptr(c->usage[1]),
- journal_seq, flags, true);
- if (ret)
- return ret;
- }
-
- return 0;
+ return do_mark_fn(__bch2_mark_key, c, pos, flags,
+ k, inserting, sectors, fs_usage,
+ journal_seq, flags);
}
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
@@ -932,7 +979,7 @@ void bch2_mark_update(struct btree_insert *trans,
percpu_down_read_preempt_disable(&c->mark_lock);
fs_usage = bch2_fs_usage_get_scratch(c);
- if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ if (!(trans->flags & BTREE_INSERT_NOMARK))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
bpos_min(insert->k->k.p, b->key.k.p).offset -
bkey_start_offset(&insert->k->k),
@@ -985,7 +1032,7 @@ void bch2_mark_update(struct btree_insert *trans,
bch2_btree_node_iter_advance(&node_iter, b);
}
- if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+ if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) &&
!warned_disk_usage &&
!xchg(&warned_disk_usage, 1)) {
char buf[200];
@@ -1026,13 +1073,13 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
percpu_u64_set(&c->pcpu->sectors_available, 0);
- return avail_factor(bch2_fs_sectors_free(c));
+ return avail_factor(__bch2_fs_usage_read_short(c).free);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read_preempt_disable(&c->mark_lock);
- this_cpu_sub(c->usage[0]->s.online_reserved,
+ this_cpu_sub(c->usage[0]->online_reserved,
res->sectors);
percpu_up_read_preempt_enable(&c->mark_lock);
@@ -1071,38 +1118,22 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
out:
pcpu->sectors_available -= sectors;
- this_cpu_add(c->usage[0]->s.online_reserved, sectors);
+ this_cpu_add(c->usage[0]->online_reserved, sectors);
res->sectors += sectors;
percpu_up_read_preempt_enable(&c->mark_lock);
return 0;
recalculate:
- /*
- * GC recalculates sectors_available when it starts, so that hopefully
- * we don't normally end up blocking here:
- */
-
- /*
- * Piss fuck, we can be called from extent_insert_fixup() with btree
- * locks held:
- */
-
- if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
- if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
- down_read(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock))
- return -EINTR;
- }
-
percpu_down_write(&c->mark_lock);
+
sectors_available = bch2_recalc_sectors_available(c);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
max_t(s64, 0, sectors_available - sectors));
- this_cpu_add(c->usage[0]->s.online_reserved, sectors);
+ this_cpu_add(c->usage[0]->online_reserved, sectors);
res->sectors += sectors;
ret = 0;
} else {
@@ -1112,9 +1143,6 @@ recalculate:
percpu_up_write(&c->mark_lock);
- if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
- up_read(&c->gc_lock);
-
return ret;
}
@@ -1135,7 +1163,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
struct bucket_array *buckets = NULL, *old_buckets = NULL;
unsigned long *buckets_nouse = NULL;
unsigned long *buckets_written = NULL;
- u8 *oldest_gens = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
alloc_heap alloc_heap;
@@ -1161,8 +1188,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
- !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
- GFP_KERNEL|__GFP_ZERO)) ||
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
@@ -1197,9 +1222,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
memcpy(buckets->b,
old_buckets->b,
n * sizeof(struct bucket));
- memcpy(oldest_gens,
- ca->oldest_gens,
- n * sizeof(u8));
memcpy(buckets_nouse,
ca->buckets_nouse,
BITS_TO_LONGS(n) * sizeof(unsigned long));
@@ -1211,7 +1233,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
rcu_assign_pointer(ca->buckets[0], buckets);
buckets = old_buckets;
- swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_nouse, buckets_nouse);
swap(ca->buckets_written, buckets_written);
@@ -1255,8 +1276,6 @@ err:
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(buckets_written,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
- kvpfree(oldest_gens,
- nbuckets * sizeof(u8));
if (buckets)
call_rcu(&old_buckets->rcu, buckets_free_rcu);
@@ -1276,7 +1295,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 19cf6525..0725aa94 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -16,13 +16,14 @@
#define bucket_cmpxchg(g, new, expr) \
({ \
+ struct bucket *_g = g; \
u64 _v = atomic64_read(&(g)->_mark.v); \
struct bucket_mark _old; \
\
do { \
(new).v.counter = _old.v.counter = _v; \
expr; \
- } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
+ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \
_old.v.counter, \
(new).v.counter)) != _old.v.counter);\
_old; \
@@ -56,18 +57,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
-static inline void bucket_set_dirty(struct bch_dev *ca, size_t b)
-{
- struct bucket *g;
- struct bucket_mark m;
-
- rcu_read_lock();
- g = bucket(ca, b);
- bucket_cmpxchg(g, m, m.dirty = true);
- rcu_read_unlock();
-
-}
-
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
@@ -86,7 +75,9 @@ static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
{
- return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
+ struct bucket *g = bucket(ca, b);
+
+ return g->mark.gen - g->oldest_gen;
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -96,9 +87,10 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
}
static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+ const struct bch_extent_ptr *ptr,
+ bool gc)
{
- return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
}
static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
@@ -219,31 +211,28 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
/* Filesystem usage: */
-static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
- struct bch_fs_usage *ret;
- ret = this_cpu_ptr(c->usage_scratch);
+ return sizeof(struct bch_fs_usage) / sizeof(u64) +
+ READ_ONCE(c->replicas.nr);
+}
- memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64));
+static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+{
+ struct bch_fs_usage *ret = this_cpu_ptr(c->usage_scratch);
+ memset(ret, 0, fs_usage_u64s(c) * sizeof(u64));
return ret;
}
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
-static inline u64 bch2_fs_sectors_free(struct bch_fs *c)
-{
- struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
-
- return usage.capacity - usage.used;
-}
-
/* key/bucket marking: */
void bch2_bucket_seq_cleanup(struct bch_fs *);
@@ -257,8 +246,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
-#define BCH_BUCKET_MARK_GC (1 << 1)
+#define BCH_BUCKET_MARK_GC (1 << 0)
+#define BCH_BUCKET_MARK_NOATOMIC (1 << 1)
int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c,
bool, s64, struct gc_pos,
@@ -268,7 +257,7 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *, struct gc_pos);
+ struct disk_reservation *);
/* disk reservations: */
@@ -282,8 +271,6 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
-#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1)
-#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2)
int bch2_disk_reservation_add(struct bch_fs *,
struct disk_reservation *,
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 56863c23..869a1314 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -38,6 +38,7 @@ struct bucket {
};
u16 io_time[2];
+ u8 oldest_gen;
unsigned gen_valid:1;
};
@@ -62,35 +63,33 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
- /* summarized: */
- struct bch_fs_usage_summarized {
- u64 online_reserved;
+ u64 online_reserved;
- /* fields after online_reserved are cleared/recalculated by gc: */
- u64 gc_start[0];
+ /* fields after online_reserved are cleared/recalculated by gc: */
+ u64 gc_start[0];
- u64 hidden;
- u64 data;
- u64 cached;
- u64 reserved;
- u64 nr_inodes;
+ u64 hidden;
+ u64 data;
+ u64 cached;
+ u64 reserved;
+ u64 nr_inodes;
- /* XXX: add stats for compression ratio */
+ /* XXX: add stats for compression ratio */
#if 0
- u64 uncompressed;
- u64 compressed;
+ u64 uncompressed;
+ u64 compressed;
#endif
- } s;
/* broken out: */
u64 persistent_reserved[BCH_REPLICAS_MAX];
- u64 data[];
+ u64 replicas[];
};
struct bch_fs_usage_short {
u64 capacity;
u64 used;
+ u64 free;
u64 nr_inodes;
};
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index b84ae5c9..4e33e7b8 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -402,10 +402,10 @@ static long bch2_ioctl_usage(struct bch_fs *c,
if (!src)
return -ENOMEM;
- percpu_up_read_preempt_enable(&c->mark_lock);
+ dst.used = bch2_fs_sectors_used(c, src);
+ dst.online_reserved = src->online_reserved;
- dst.used = bch2_fs_sectors_used(c, *src);
- dst.online_reserved = src->s.online_reserved;
+ percpu_up_read_preempt_enable(&c->mark_lock);
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
dst.persistent_reserved[i] =
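
The chardev.c reordering copies the usage summary out while mark_lock is still held and only then drops the lock. A generic snapshot-under-lock sketch, with a pthread mutex standing in for the percpu rwsem and made-up counter names:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t used_sectors = 1024, online_reserved = 64;

struct usage_snapshot {
	uint64_t used;
	uint64_t online_reserved;
};

static struct usage_snapshot snapshot_usage(void)
{
	struct usage_snapshot s;

	pthread_mutex_lock(&lock);
	s.used			= used_sectors;		/* read... */
	s.online_reserved	= online_reserved;	/* ...while protected */
	pthread_mutex_unlock(&lock);			/* only then unlock */

	return s;
}

int main(void)
{
	struct usage_snapshot s = snapshot_usage();

	printf("used=%llu reserved=%llu\n",
	       (unsigned long long) s.used,
	       (unsigned long long) s.online_reserved);
	return 0;
}
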
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 0f075fa1..369b100a 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -979,10 +979,8 @@ bch2_extent_can_insert(struct btree_insert *trans,
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
(sectors = bch2_extent_is_compressed(k))) {
- int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
-
- if (trans->flags & BTREE_INSERT_NOFAIL)
- flags |= BCH_DISK_RESERVATION_NOFAIL;
+ int flags = trans->flags & BTREE_INSERT_NOFAIL
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
switch (bch2_disk_reservation_add(trans->c,
trans->disk_res,
@@ -991,8 +989,6 @@ bch2_extent_can_insert(struct btree_insert *trans,
break;
case -ENOSPC:
return BTREE_INSERT_ENOSPC;
- case -EINTR:
- return BTREE_INSERT_NEED_GC_LOCK;
default:
BUG();
}
diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h
index 9715ddbd..0982af02 100644
--- a/libbcachefs/fifo.h
+++ b/libbcachefs/fifo.h
@@ -100,7 +100,7 @@ do { \
({ \
bool _r = !fifo_empty((fifo)); \
if (_r) \
- (i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \
+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
_r; \
})
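
The fifo.h change adds the semicolon that was missing after the assignment inside the statement-expression macro. A toy ring buffer showing the same macro shape (statement expressions are a GNU extension); struct fifo here is a simplified stand-in for the bcachefs fifo:

#include <stdbool.h>
#include <stdio.h>

struct fifo {
	unsigned front, back, mask;
	int data[8];
};

#define fifo_empty(f)		((f)->front == (f)->back)

/* every statement inside ({ ... }) needs its own semicolon, including
 * the assignment before the final value: */
#define fifo_pop_back(f, i)						\
({									\
	bool _r = !fifo_empty(f);					\
	if (_r)								\
		(i) = (f)->data[--(f)->back & (f)->mask];		\
	_r;								\
})

int main(void)
{
	struct fifo f = { .mask = 7 };
	int v;

	f.data[f.back++ & f.mask] = 42;		/* push one element */

	if (fifo_pop_back(&f, v))
		printf("popped %d\n", v);
	return 0;
}
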
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 8ff8cfa8..f108a282 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -17,23 +17,14 @@
#include <trace/events/bcachefs.h>
-static bool journal_entry_is_open(struct journal *j)
+static bool __journal_entry_is_open(union journal_res_state state)
{
- return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}
-void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
+static bool journal_entry_is_open(struct journal *j)
{
- struct journal_buf *w = journal_prev_buf(j);
-
- atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
-
- if (!need_write_just_set &&
- test_bit(JOURNAL_NEED_WRITE, &j->flags))
- bch2_time_stats_update(j->delay_time,
- j->need_write_time);
-
- closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+ return __journal_entry_is_open(j->reservations);
}
static void journal_pin_new_entry(struct journal *j, int count)
@@ -77,39 +68,71 @@ static inline bool journal_entry_empty(struct jset *j)
return true;
}
-static enum {
- JOURNAL_ENTRY_ERROR,
- JOURNAL_ENTRY_INUSE,
- JOURNAL_ENTRY_CLOSED,
- JOURNAL_UNLOCKED,
-} journal_buf_switch(struct journal *j, bool need_write_just_set)
+void bch2_journal_halt(struct journal *j)
+{
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return;
+
+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ journal_wake(j);
+ closure_wake_up(&journal_cur_buf(j)->wait);
+}
+
+/* journal entry close/open: */
+
+void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+{
+ if (!need_write_just_set &&
+ test_bit(JOURNAL_NEED_WRITE, &j->flags))
+ bch2_time_stats_update(j->delay_time,
+ j->need_write_time);
+
+ clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+}
+
+/*
+ * Returns true if journal entry is now closed:
+ */
+static bool __journal_entry_close(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
+ bool set_need_write = false;
+ unsigned sectors;
lockdep_assert_held(&j->lock);
do {
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return JOURNAL_ENTRY_CLOSED;
+ return true;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
/* this entry will never be written: */
closure_wake_up(&buf->wait);
- return JOURNAL_ENTRY_ERROR;
+ return true;
}
- if (new.prev_buf_unwritten)
- return JOURNAL_ENTRY_INUSE;
+ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
+ j->need_write_time = local_clock();
+ set_need_write = true;
+ }
- /*
- * avoid race between setting buf->data->u64s and
- * journal_res_put starting write:
- */
- journal_state_inc(&new);
+ if (new.prev_buf_unwritten)
+ return false;
new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
new.idx++;
@@ -119,15 +142,12 @@ static enum {
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
- j->prev_buf_sectors =
- vstruct_blocks_plus(buf->data, c->block_bits,
- buf->u64s_reserved) *
- c->opts.block_size;
- BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
+ sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+ buf->u64s_reserved) << c->block_bits;
+ BUG_ON(sectors > buf->sectors);
+ buf->sectors = sectors;
bkey_extent_init(&buf->key);
@@ -150,7 +170,6 @@ static enum {
* Hence, we want update/set last_seq on the current journal entry right
* before we open a new one:
*/
- bch2_journal_reclaim_fast(j);
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
if (journal_entry_empty(buf->data))
@@ -163,32 +182,22 @@ static enum {
bch2_journal_buf_init(j);
cancel_delayed_work(&j->write_work);
- spin_unlock(&j->lock);
- /* ugh - might be called from __journal_res_get() under wait_event() */
- __set_current_state(TASK_RUNNING);
- bch2_journal_buf_put(j, old.idx, need_write_just_set);
+ bch2_journal_space_available(j);
- return JOURNAL_UNLOCKED;
+ bch2_journal_buf_put(j, old.idx, set_need_write);
+ return true;
}
-void bch2_journal_halt(struct journal *j)
+static bool journal_entry_close(struct journal *j)
{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
+ bool ret;
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ spin_lock(&j->lock);
+ ret = __journal_entry_close(j);
+ spin_unlock(&j->lock);
- journal_wake(j);
- closure_wake_up(&journal_cur_buf(j)->wait);
- closure_wake_up(&journal_prev_buf(j)->wait);
+ return ret;
}
/*
@@ -196,46 +205,39 @@ void bch2_journal_halt(struct journal *j)
* journal reservation - journal entry is open means journal is dirty:
*
* returns:
- * 1: success
- * 0: journal currently full (must wait)
- * -EROFS: insufficient rw devices
- * -EIO: journal error
+ * 0: success
+ * -ENOSPC: journal currently full, must invoke reclaim
+ * -EAGAIN: journal blocked, must wait
+ * -EROFS: insufficient rw devices or journal error
*/
static int journal_entry_open(struct journal *j)
{
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
- ssize_t u64s;
- int sectors;
+ int u64s;
u64 v;
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
- if (!fifo_free(&j->pin))
- return 0;
+ if (j->blocked)
+ return -EAGAIN;
- sectors = bch2_journal_entry_sectors(j);
- if (sectors <= 0)
- return sectors;
+ if (j->cur_entry_error)
+ return j->cur_entry_error;
- buf->disk_sectors = sectors;
- buf->u64s_reserved = j->entry_u64s_reserved;
+ BUG_ON(!j->cur_entry_sectors);
- sectors = min_t(unsigned, sectors, buf->size >> 9);
- j->cur_buf_sectors = sectors;
-
- u64s = (sectors << 9) / sizeof(u64);
-
- /* Subtract the journal header */
- u64s -= sizeof(struct jset) / sizeof(u64);
- u64s -= buf->u64s_reserved;
- u64s = max_t(ssize_t, 0L, u64s);
+ buf->u64s_reserved = j->entry_u64s_reserved;
+ buf->disk_sectors = j->cur_entry_sectors;
+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
- BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
+ u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+ journal_entry_overhead(j);
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= le32_to_cpu(buf->data->u64s))
- return 0;
+ return -ENOSPC;
/*
* Must be set before marking the journal entry as open:
@@ -246,11 +248,14 @@ static int journal_entry_open(struct journal *j)
do {
old.v = new.v = v;
+ EBUG_ON(journal_state_count(new, new.idx));
+
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return -EIO;
+ return -EROFS;
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ journal_state_inc(&new);
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
@@ -263,37 +268,22 @@ static int journal_entry_open(struct journal *j)
&j->write_work,
msecs_to_jiffies(j->write_delay_ms));
journal_wake(j);
- return 1;
+ return 0;
}
-static bool __journal_entry_close(struct journal *j)
+static bool journal_quiesced(struct journal *j)
{
- bool set_need_write;
-
- if (!journal_entry_is_open(j)) {
- spin_unlock(&j->lock);
- return true;
- }
-
- set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags);
- if (set_need_write)
- j->need_write_time = local_clock();
+ union journal_res_state state = READ_ONCE(j->reservations);
+ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
- switch (journal_buf_switch(j, set_need_write)) {
- case JOURNAL_ENTRY_INUSE:
- spin_unlock(&j->lock);
- return false;
- default:
- spin_unlock(&j->lock);
- case JOURNAL_UNLOCKED:
- return true;
- }
+ if (!ret)
+ journal_entry_close(j);
+ return ret;
}
-static bool journal_entry_close(struct journal *j)
+static void journal_quiesce(struct journal *j)
{
- spin_lock(&j->lock);
- return __journal_entry_close(j);
+ wait_event(j->wait, journal_quiesced(j));
}
static void journal_write_work(struct work_struct *work)
@@ -337,7 +327,11 @@ retry:
if (journal_res_get_fast(j, res, flags))
return 0;
+ if (bch2_journal_error(j))
+ return -EROFS;
+
spin_lock(&j->lock);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call journal_entry_close()
@@ -355,56 +349,43 @@ retry:
*/
buf = journal_cur_buf(j);
if (journal_entry_is_open(j) &&
- buf->size >> 9 < buf->disk_sectors &&
- buf->size < JOURNAL_ENTRY_SIZE_MAX)
- j->buf_size_want = max(j->buf_size_want, buf->size << 1);
+ buf->buf_size >> 9 < buf->disk_sectors &&
+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- /*
- * Close the current journal entry if necessary, then try to start a new
- * one:
- */
- switch (journal_buf_switch(j, false)) {
- case JOURNAL_ENTRY_ERROR:
- spin_unlock(&j->lock);
- return -EROFS;
- case JOURNAL_ENTRY_INUSE:
+ if (journal_entry_is_open(j) &&
+ !__journal_entry_close(j)) {
/*
- * The current journal entry is still open, but we failed to get
- * a journal reservation because there's not enough space in it,
- * and we can't close it and start another because we haven't
- * finished writing out the previous entry:
+ * We failed to get a reservation on the current open journal
+ * entry because it's full, and we can't close it because
+ * there's still a previous one in flight:
*/
- spin_unlock(&j->lock);
trace_journal_entry_full(c);
- goto blocked;
- case JOURNAL_ENTRY_CLOSED:
- break;
- case JOURNAL_UNLOCKED:
- goto retry;
+ ret = -EAGAIN;
+ } else {
+ ret = journal_entry_open(j);
}
- /* We now have a new, closed journal buf - see if we can open it: */
- ret = journal_entry_open(j);
+ if ((ret == -EAGAIN || ret == -ENOSPC) &&
+ !j->res_get_blocked_start)
+ j->res_get_blocked_start = local_clock() ?: 1;
+
spin_unlock(&j->lock);
- if (ret < 0)
- return ret;
- if (ret)
+ if (!ret)
goto retry;
+ if (ret == -ENOSPC) {
+ /*
+ * Journal is full - can't rely on reclaim from work item due to
+ * freezing:
+ */
+ trace_journal_full(c);
+ if (!(flags & JOURNAL_RES_GET_NONBLOCK))
+ bch2_journal_reclaim_work(&j->reclaim_work.work);
+ ret = -EAGAIN;
+ }
- /* Journal's full, we have to wait */
-
- /*
- * Direct reclaim - can't rely on reclaim from work item
- * due to freezing..
- */
- bch2_journal_reclaim_work(&j->reclaim_work.work);
-
- trace_journal_full(c);
-blocked:
- if (!j->res_get_blocked_start)
- j->res_get_blocked_start = local_clock() ?: 1;
- return -EAGAIN;
+ return ret;
}
/*
@@ -422,7 +403,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
{
int ret;
- wait_event(j->wait,
+ closure_wait_event(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
@@ -441,9 +422,9 @@ void bch2_journal_entry_res_resize(struct journal *j,
j->entry_u64s_reserved += d;
if (d <= 0)
- goto out_unlock;
+ goto out;
- j->cur_entry_u64s -= d;
+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
smp_mb();
state = READ_ONCE(j->reservations);
@@ -454,15 +435,12 @@ void bch2_journal_entry_res_resize(struct journal *j,
* Not enough room in current journal entry, have to flush it:
*/
__journal_entry_close(j);
- goto out;
+ } else {
+ journal_cur_buf(j)->u64s_reserved += d;
}
-
- journal_cur_buf(j)->u64s_reserved += d;
-out_unlock:
- spin_unlock(&j->lock);
out:
+ spin_unlock(&j->lock);
res->u64s += d;
- return;
}
/* journal flushing: */
@@ -492,47 +470,47 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
int ret;
-retry:
+
spin_lock(&j->lock);
- if (seq < journal_cur_seq(j) ||
+ /*
+ * Can't try to open more than one sequence number ahead:
+ */
+ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
+
+ if (journal_cur_seq(j) > seq ||
journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return 0;
}
- if (journal_cur_seq(j) < seq) {
- switch (journal_buf_switch(j, false)) {
- case JOURNAL_ENTRY_ERROR:
- spin_unlock(&j->lock);
- return -EROFS;
- case JOURNAL_ENTRY_INUSE:
- /* haven't finished writing out the previous one: */
- trace_journal_entry_full(c);
- goto blocked;
- case JOURNAL_ENTRY_CLOSED:
- break;
- case JOURNAL_UNLOCKED:
- goto retry;
- }
- }
-
- BUG_ON(journal_cur_seq(j) < seq);
+ if (journal_cur_seq(j) < seq &&
+ !__journal_entry_close(j)) {
+ /* haven't finished writing out the previous one: */
+ trace_journal_entry_full(c);
+ ret = -EAGAIN;
+ } else {
+ BUG_ON(journal_cur_seq(j) != seq);
- ret = journal_entry_open(j);
- if (ret) {
- spin_unlock(&j->lock);
- return ret < 0 ? ret : 0;
+ ret = journal_entry_open(j);
}
-blocked:
- if (!j->res_get_blocked_start)
+
+ if ((ret == -EAGAIN || ret == -ENOSPC) &&
+ !j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
- closure_wait(&j->async_wait, cl);
+ if (ret == -EAGAIN || ret == -ENOSPC)
+ closure_wait(&j->async_wait, cl);
+
spin_unlock(&j->lock);
- bch2_journal_reclaim_work(&j->reclaim_work.work);
- return -EAGAIN;
+ if (ret == -ENOSPC) {
+ trace_journal_full(c);
+ bch2_journal_reclaim_work(&j->reclaim_work.work);
+ ret = -EAGAIN;
+ }
+
+ return ret;
}
static int journal_seq_error(struct journal *j, u64 seq)
@@ -615,8 +593,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
- else
- spin_unlock(&j->lock);
+ spin_unlock(&j->lock);
}
static int journal_seq_flushed(struct journal *j, u64 seq)
@@ -628,8 +605,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
- else
- spin_unlock(&j->lock);
+ spin_unlock(&j->lock);
return ret;
}
@@ -721,6 +697,26 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
+/* block/unlock the journal: */
+
+void bch2_journal_unblock(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked--;
+ spin_unlock(&j->lock);
+
+ journal_wake(j);
+}
+
+void bch2_journal_block(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked++;
+ spin_unlock(&j->lock);
+
+ journal_quiesce(j);
+}
+
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@@ -743,7 +739,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
goto err;
journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
+ nr + sizeof(*journal_buckets) / sizeof(u64));
if (!journal_buckets)
goto err;
@@ -806,9 +802,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
ja->nr++;
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- 0);
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB),
+ 0);
if (c) {
spin_unlock(&c->journal.lock);
@@ -859,7 +855,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
*/
if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+ bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
mutex_unlock(&c->sb_lock);
return -ENOSPC;
}
@@ -930,8 +926,7 @@ void bch2_fs_journal_stop(struct journal *j)
c->btree_roots_dirty)
bch2_journal_meta(j);
- BUG_ON(journal_entry_is_open(j) ||
- j->reservations.prev_buf_unwritten);
+ journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
test_bit(JOURNAL_NOT_EMPTY, &j->flags));
@@ -957,7 +952,7 @@ void bch2_fs_journal_start(struct journal *j)
journal_pin_new_entry(j, 0);
/*
- * journal_buf_switch() only inits the next journal entry when it
+ * __journal_entry_close() only inits the next journal entry when it
* closes an open journal entry - the very first journal entry gets
* initialized here:
*/
@@ -966,6 +961,7 @@ void bch2_fs_journal_start(struct journal *j)
c->last_bucket_seq_cleanup = journal_cur_seq(j);
+ bch2_journal_space_available(j);
spin_unlock(&j->lock);
/*
@@ -975,7 +971,7 @@ void bch2_fs_journal_start(struct journal *j)
*/
bch2_journal_seq_blacklist_write(j);
- queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
}
/* init/exit: */
@@ -1021,8 +1017,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
- kvpfree(j->buf[1].data, j->buf[1].size);
- kvpfree(j->buf[0].data, j->buf[0].size);
+ kvpfree(j->buf[1].data, j->buf[1].buf_size);
+ kvpfree(j->buf[0].data, j->buf[0].buf_size);
free_fifo(&j->pin);
}
@@ -1046,8 +1042,8 @@ int bch2_fs_journal_init(struct journal *j)
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
- j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
@@ -1060,8 +1056,8 @@ int bch2_fs_journal_init(struct journal *j)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
- !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
+ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
ret = -ENOMEM;
goto out;
}
@@ -1078,35 +1074,54 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union journal_res_state *s = &j->reservations;
+ union journal_res_state s;
struct bch_dev *ca;
unsigned iter;
rcu_read_lock();
spin_lock(&j->lock);
+ s = READ_ONCE(j->reservations);
pr_buf(&out,
"active journal entries:\t%llu\n"
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
- "reservation count:\t%u\n"
- "reservation offset:\t%u\n"
- "current entry u64s:\t%u\n"
- "io in flight:\t\t%i\n"
- "need write:\t\t%i\n"
- "dirty:\t\t\t%i\n"
- "replay done:\t\t%i\n",
+ "current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
journal_last_seq(j),
- j->last_seq_ondisk,
- journal_state_count(*s, s->idx),
- s->cur_entry_offset,
- j->cur_entry_u64s,
- s->prev_buf_unwritten,
+ j->last_seq_ondisk);
+
+ switch (s.cur_entry_offset) {
+ case JOURNAL_ENTRY_ERROR_VAL:
+ pr_buf(&out, "error\n");
+ break;
+ case JOURNAL_ENTRY_CLOSED_VAL:
+ pr_buf(&out, "closed\n");
+ break;
+ default:
+ pr_buf(&out, "%u/%u\n",
+ s.cur_entry_offset,
+ j->cur_entry_u64s);
+ break;
+ }
+
+ pr_buf(&out,
+ "current entry refs:\t%u\n"
+ "prev entry unwritten:\t",
+ journal_state_count(s, s.idx));
+
+ if (s.prev_buf_unwritten)
+ pr_buf(&out, "yes, ref %u\n",
+ journal_state_count(s, !s.idx));
+ else
+ pr_buf(&out, "no\n");
+
+ pr_buf(&out,
+ "need write:\t\t%i\n"
+ "replay done:\t\t%i\n",
test_bit(JOURNAL_NEED_WRITE, &j->flags),
- journal_entry_is_open(j),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
for_each_member_device_rcu(ca, c, iter,
@@ -1119,9 +1134,12 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
pr_buf(&out,
"dev %u:\n"
"\tnr\t\t%u\n"
+ "\tavailable\t%u:%u\n"
"\tcur_idx\t\t%u (seq %llu)\n"
"\tlast_idx\t%u (seq %llu)\n",
iter, ja->nr,
+ bch2_journal_dev_buckets_available(j, ja),
+ ja->sectors_free,
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
}
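
The journal.c rework above centres on __journal_entry_close(): the current entry offset in the reservation word doubles as a state machine, with sentinel values for "closed" and "error", and closing is a compare-exchange loop. A minimal user-space model using C11 atomics; the sentinel values and names are illustrative, not the kernel's:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRY_OFFSET_MAX	((uint32_t) ~0U)
#define ENTRY_ERROR_VAL		ENTRY_OFFSET_MAX
#define ENTRY_CLOSED_VAL	(ENTRY_OFFSET_MAX - 1)

static _Atomic uint32_t cur_entry_offset;

static bool entry_is_open(uint32_t v)
{
	return v < ENTRY_CLOSED_VAL;
}

/* Returns true once the entry is closed (or already closed/errored): */
static bool entry_close(void)
{
	uint32_t old = atomic_load(&cur_entry_offset);

	do {
		if (old == ENTRY_CLOSED_VAL || old == ENTRY_ERROR_VAL)
			return true;
		/* retry if a reservation raced in and moved the offset */
	} while (!atomic_compare_exchange_weak(&cur_entry_offset, &old,
					       ENTRY_CLOSED_VAL));

	printf("closed entry at offset %u\n", (unsigned) old);
	return true;
}

int main(void)
{
	atomic_store(&cur_entry_offset, 128);	/* pretend 128 u64s written */
	printf("open: %d\n", entry_is_open(atomic_load(&cur_entry_offset)));
	entry_close();
	printf("open: %d\n", entry_is_open(atomic_load(&cur_entry_offset)));
	return 0;
}
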
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 50d864a3..71929bd6 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -178,6 +178,11 @@ static inline unsigned jset_u64s(unsigned u64s)
return u64s + sizeof(struct jset_entry) / sizeof(u64);
}
+static inline int journal_entry_overhead(struct journal *j)
+{
+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
+
static inline struct jset_entry *
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
{
@@ -222,7 +227,7 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *
id, 0, k, k->k.u64s);
}
-void bch2_journal_buf_put_slowpath(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *, bool);
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
bool need_write_just_set)
@@ -233,17 +238,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
.buf0_count = idx == 0,
.buf1_count = idx == 1,
}).v, &j->reservations.counter);
-
- EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
-
- /*
- * Do not initiate a journal write if the journal is in an error state
- * (previous journal entry write may have failed)
- */
- if (s.idx != idx &&
- !journal_state_count(s, idx) &&
- s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
- bch2_journal_buf_put_slowpath(j, need_write_just_set);
+ if (!journal_state_count(s, idx)) {
+ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
+ __bch2_journal_buf_put(j, need_write_just_set);
+ }
}
/*
@@ -291,6 +289,8 @@ static inline int journal_res_get_fast(struct journal *j,
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
+ EBUG_ON(!journal_state_count(new, new.idx));
+
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
@@ -330,6 +330,8 @@ out:
return 0;
}
+/* journal_entry_res: */
+
void bch2_journal_entry_res_resize(struct journal *,
struct journal_entry_res *,
unsigned);
@@ -367,6 +369,9 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
set_bit(JOURNAL_REPLAY_DONE, &j->flags);
}
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
+
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
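
The reworked bch2_journal_buf_put() above relies on per-buffer reference counts packed into the single atomic reservation word, so the last put on the previous buffer can kick off the write. A toy version of that packing with C11 atomics; the layout and names are invented for illustration, not the kernel's union journal_res_state:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

union res_state {
	uint64_t v;
	struct {
		uint32_t buf0_count;
		uint32_t buf1_count;
	};
};

static _Atomic uint64_t reservations;

static void journal_buf_put(unsigned idx)
{
	union res_state delta = { .v = 0 };
	union res_state s;

	if (idx == 0)
		delta.buf0_count = 1;
	else
		delta.buf1_count = 1;

	/* fetch_sub returns the old value; subtract again to get the new one */
	s.v = atomic_fetch_sub(&reservations, delta.v) - delta.v;

	if ((idx == 0 ? s.buf0_count : s.buf1_count) == 0)
		printf("last ref on buf %u dropped: start the write\n", idx);
}

int main(void)
{
	union res_state init = { .v = 0 };

	init.buf0_count = 2;		/* two outstanding refs on buffer 0 */
	atomic_store(&reservations, init.v);

	journal_buf_put(0);		/* one ref left, nothing happens */
	journal_buf_put(0);		/* last ref: would kick off the write */
	return 0;
}
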
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 0f1f8e15..16cb6be8 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -825,7 +825,6 @@ fsck_err:
int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
- struct journal_entry_pin_list *pin_list;
struct bkey_i *k, *_n;
struct jset_entry *entry;
struct journal_replay *i, *n;
@@ -854,7 +853,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
ret = bch2_btree_insert(c, entry->btree_id, k,
&disk_res, NULL,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY);
+ BTREE_INSERT_JOURNAL_REPLAY|
+ BTREE_INSERT_NOMARK);
}
if (ret) {
@@ -866,10 +866,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
cond_resched();
}
- pin_list = journal_seq_pin(j, j->replay_journal_seq);
-
- if (atomic_dec_and_test(&pin_list->count))
- journal_wake(j);
+ bch2_journal_pin_put(j, j->replay_journal_seq);
}
j->replay_journal_seq = 0;
@@ -884,82 +881,6 @@ err:
/* journal write: */
-static unsigned journal_dev_buckets_available(struct journal *j,
- struct journal_device *ja)
-{
- unsigned next = (ja->cur_idx + 1) % ja->nr;
- unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (available &&
- journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
- --available;
-
- return available;
-}
-
-/* returns number of sectors available for next journal entry: */
-int bch2_journal_entry_sectors(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned sectors_available = UINT_MAX;
- unsigned i, nr_online = 0, nr_devs = 0;
-
- lockdep_assert_held(&j->lock);
-
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
- struct journal_device *ja = &ca->journal;
- unsigned buckets_this_device, sectors_this_device;
-
- if (!ja->nr)
- continue;
-
- buckets_this_device = journal_dev_buckets_available(j, ja);
- sectors_this_device = ja->sectors_free;
-
- nr_online++;
-
- /*
- * We that we don't allocate the space for a journal entry
- * until we write it out - thus, account for it here:
- */
- if (j->prev_buf_sectors >= sectors_this_device) {
- if (!buckets_this_device)
- continue;
-
- buckets_this_device--;
- sectors_this_device = ca->mi.bucket_size;
- }
-
- sectors_this_device -= j->prev_buf_sectors;
-
- if (buckets_this_device)
- sectors_this_device = ca->mi.bucket_size;
-
- if (!sectors_this_device)
- continue;
-
- sectors_available = min(sectors_available,
- sectors_this_device);
- nr_devs++;
- }
- rcu_read_unlock();
-
- if (nr_online < c->opts.metadata_replicas_required)
- return -EROFS;
-
- if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
- return 0;
-
- return sectors_available;
-}
-
static void __journal_write_alloc(struct journal *j,
struct journal_buf *w,
struct dev_alloc_list *devs_sorted,
@@ -1033,7 +954,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
&c->rw_devs[BCH_DATA_JOURNAL]);
- spin_lock(&j->lock);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
@@ -1049,7 +969,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
if (sectors > ja->sectors_free &&
sectors <= ca->mi.bucket_size &&
- journal_dev_buckets_available(j, ja)) {
+ bch2_journal_dev_buckets_available(j, ja)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
}
@@ -1058,10 +978,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
done:
- if (replicas >= replicas_want)
- j->prev_buf_sectors = 0;
-
- spin_unlock(&j->lock);
rcu_read_unlock();
return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
@@ -1116,17 +1032,17 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
- if (buf->size >= new_size)
+ if (buf->buf_size >= new_size)
return;
new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
if (!new_buf)
return;
- memcpy(new_buf, buf->data, buf->size);
- kvpfree(buf->data, buf->size);
+ memcpy(new_buf, buf->data, buf->buf_size);
+ kvpfree(buf->data, buf->buf_size);
buf->data = new_buf;
- buf->size = new_size;
+ buf->buf_size = new_size;
}
static void journal_write_done(struct closure *cl)
@@ -1166,7 +1082,7 @@ static void journal_write_done(struct closure *cl)
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
out:
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
@@ -1220,20 +1136,22 @@ void bch2_journal_write(struct closure *cl)
struct bch_extent_ptr *ptr;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s;
+ int ret;
+
+ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
- start = vstruct_last(w->data);
+ start = vstruct_last(jset);
end = bch2_journal_super_entries_add_common(c, start);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
- le32_add_cpu(&w->data->u64s, u64s);
- BUG_ON(vstruct_sectors(jset, c->block_bits) >
- w->disk_sectors);
+ le32_add_cpu(&jset->u64s, u64s);
+ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
journal_write_compact(jset);
@@ -1271,12 +1189,28 @@ void bch2_journal_write(struct closure *cl)
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
- BUG_ON(sectors > j->prev_buf_sectors);
+ BUG_ON(sectors > w->sectors);
+
+ bytes = vstruct_bytes(jset);
+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+
+ spin_lock(&j->lock);
+ ret = journal_write_alloc(j, w, sectors);
- bytes = vstruct_bytes(w->data);
- memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+ /*
+ * write is allocated, no longer need to account for it in
+ * bch2_journal_space_available():
+ */
+ w->sectors = 0;
+
+ /*
+ * journal entry has been compacted and allocated, recalculate space
+ * available:
+ */
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
- if (journal_write_alloc(j, w, sectors)) {
+ if (ret) {
bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
@@ -1316,7 +1250,7 @@ void bch2_journal_write(struct closure *cl)
trace_journal_write(bio);
closure_bio_submit(bio, cl);
- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
}
for_each_rw_member(ca, c, i)
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index d0a652cf..ec7b49b8 100644
--- a/libbcachefs/journal_io.h
+++ b/libbcachefs/journal_io.h
@@ -39,7 +39,6 @@ int bch2_journal_read(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);
-int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);
#endif /* _BCACHEFS_JOURNAL_IO_H */
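
The journal_io.c write path above rounds the entry up to whole sectors and zeroes the padding before allocating and submitting it. A small sketch of that rounding and padding, assuming 512-byte sectors and a plain heap buffer in place of the journal buf:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SECTOR_SHIFT	9

int main(void)
{
	size_t bytes = 1300;				/* entry payload size */
	size_t sectors = (bytes + (1 << SECTOR_SHIFT) - 1) >> SECTOR_SHIFT;
	unsigned char *buf = malloc(sectors << SECTOR_SHIFT);

	if (!buf)
		return 1;

	memset(buf, 0xaa, bytes);			/* pretend payload */
	memset(buf + bytes, 0, (sectors << SECTOR_SHIFT) - bytes);

	printf("%zu bytes -> %zu sectors, %zu bytes of zero padding\n",
	       bytes, sectors, (sectors << SECTOR_SHIFT) - bytes);
	free(buf);
	return 0;
}
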
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index a795e888..b928b8c8 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -1,15 +1,213 @@
#include "bcachefs.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
+/* Free space calculations: */
+
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
+ struct journal_device *ja)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned next = (ja->cur_idx + 1) % ja->nr;
+ unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
+
+ /*
+ * Allocator startup needs some journal space before we can do journal
+ * replay:
+ */
+ if (available &&
+ test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
+ available--;
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (available &&
+ journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
+ --available;
+
+ return available;
+}
+
+void bch2_journal_space_available(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned sectors_next_entry = UINT_MAX;
+ unsigned sectors_total = UINT_MAX;
+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
+ j->buf[1].buf_size >> 9);
+ unsigned i, nr_online = 0, nr_devs = 0;
+ unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
+ ? journal_prev_buf(j)->sectors
+ : 0;
+ int ret = 0;
+
+ lockdep_assert_held(&j->lock);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_JOURNAL]) {
+ struct journal_device *ja = &ca->journal;
+ unsigned buckets_this_device, sectors_this_device;
+
+ if (!ja->nr)
+ continue;
+
+ nr_online++;
+
+ buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
+ sectors_this_device = ja->sectors_free;
+
+ /*
+	 * Note that we don't allocate the space for a journal entry
+	 * until we write it out - thus, account for it here:
+ */
+ if (unwritten_sectors >= sectors_this_device) {
+ if (!buckets_this_device)
+ continue;
+
+ buckets_this_device--;
+ sectors_this_device = ca->mi.bucket_size;
+ }
+
+ sectors_this_device -= unwritten_sectors;
+
+ if (sectors_this_device < ca->mi.bucket_size &&
+ buckets_this_device) {
+ buckets_this_device--;
+ sectors_this_device = ca->mi.bucket_size;
+ }
+
+ if (!sectors_this_device)
+ continue;
+
+ sectors_next_entry = min(sectors_next_entry,
+ sectors_this_device);
+
+ sectors_total = min(sectors_total,
+ buckets_this_device * ca->mi.bucket_size +
+ sectors_this_device);
+
+ max_entry_size = min_t(unsigned, max_entry_size,
+ ca->mi.bucket_size);
+
+ nr_devs++;
+ }
+ rcu_read_unlock();
+
+ if (nr_online < c->opts.metadata_replicas_required) {
+ ret = -EROFS;
+ sectors_next_entry = 0;
+ } else if (!sectors_next_entry ||
+ nr_devs < min_t(unsigned, nr_online,
+ c->opts.metadata_replicas)) {
+ ret = -ENOSPC;
+ sectors_next_entry = 0;
+ } else if (!fifo_free(&j->pin)) {
+ ret = -ENOSPC;
+ sectors_next_entry = 0;
+ }
+
+ j->cur_entry_sectors = sectors_next_entry;
+ j->cur_entry_error = ret;
+
+ if (!ret)
+ journal_wake(j);
+}
+
+/* Discards - last part of journal reclaim: */
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = ja->nr &&
+ ja->last_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/*
+ * Advance ja->last_idx as long as it points to buckets that are no longer
+ * dirty, issuing discards if necessary:
+ */
+static void journal_do_discards(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned iter;
+
+ mutex_lock(&j->reclaim_lock);
+
+ for_each_rw_member(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ while (should_discard_bucket(j, ja)) {
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ ja->buckets[ja->last_idx]),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ spin_lock(&j->lock);
+ ja->last_idx = (ja->last_idx + 1) % ja->nr;
+
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+ }
+ }
+
+ mutex_unlock(&j->reclaim_lock);
+}
+
/*
* Journal entry pinning - machinery for holding a reference on a given journal
* entry, holding it open to ensure it gets replayed during recovery:
*/
+static void bch2_journal_reclaim_fast(struct journal *j)
+{
+ struct journal_entry_pin_list temp;
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
+
+ /*
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
+ */
+ while (!fifo_empty(&j->pin) &&
+ !atomic_read(&fifo_peek_front(&j->pin).count)) {
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!fifo_pop(&j->pin, temp));
+ popped = true;
+ }
+
+ if (popped)
+ bch2_journal_space_available(j);
+}
+
+void bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ if (atomic_dec_and_test(&pin_list->count)) {
+ spin_lock(&j->lock);
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+ }
+}
+
static inline void __journal_pin_add(struct journal *j,
u64 seq,
struct journal_entry_pin *pin,
@@ -24,10 +222,7 @@ static inline void __journal_pin_add(struct journal *j,
pin->seq = seq;
pin->flush = flush_fn;
- if (flush_fn)
- list_add(&pin->list, &pin_list->list);
- else
- INIT_LIST_HEAD(&pin->list);
+ list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
/*
* If the journal is currently full, we might want to call flush_fn
@@ -129,86 +324,53 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
* data off of a specific device:
*/
-/**
- * bch2_journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Cleans up after btree
- * write completions by advancing the journal pin and each cache's last_idx,
- * kicking off discards and background reclaim as necessary.
- */
-void bch2_journal_reclaim_fast(struct journal *j)
-{
- struct journal_entry_pin_list temp;
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!fifo_empty(&j->pin) &&
- !atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!fifo_pop(&j->pin, temp));
- popped = true;
- }
-
- if (popped)
- journal_wake(j);
-}
-
-static void journal_pin_mark_flushing(struct journal *j,
- struct journal_entry_pin *pin,
- u64 seq)
-{
- lockdep_assert_held(&j->reclaim_lock);
-
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
- BUG_ON(j->flush_in_progress);
- j->flush_in_progress = pin;
-}
-
-static void journal_pin_flush(struct journal *j,
- struct journal_entry_pin *pin,
- u64 seq)
-{
- pin->flush(j, pin, seq);
-
- BUG_ON(j->flush_in_progress != pin);
- j->flush_in_progress = NULL;
- wake_up(&j->pin_flush_wait);
-}
-
static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
- /* no need to iterate over empty fifo entries: */
- bch2_journal_reclaim_fast(j);
+ spin_lock(&j->lock);
+
+ BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count));
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
- if (*seq > seq_to_flush ||
+ if (*seq > max_seq ||
(ret = list_first_entry_or_null(&pin_list->list,
struct journal_entry_pin, list)))
break;
+ if (ret) {
+ list_move(&ret->list, &pin_list->flushed);
+ BUG_ON(j->flush_in_progress);
+ j->flush_in_progress = ret;
+ j->last_flushed = jiffies;
+ }
+
+ spin_unlock(&j->lock);
+
return ret;
}
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
+ unsigned min_nr)
{
- bool ret;
+ struct journal_entry_pin *pin;
+ u64 seq;
- spin_lock(&j->lock);
- ret = ja->nr &&
- (ja->last_idx != ja->cur_idx &&
- ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
- spin_unlock(&j->lock);
+ lockdep_assert_held(&j->reclaim_lock);
- return ret;
+ while ((pin = journal_get_next_pin(j, min_nr
+ ? U64_MAX : seq_to_flush, &seq))) {
+ if (min_nr)
+ min_nr--;
+
+ pin->flush(j, pin, seq);
+
+ BUG_ON(j->flush_in_progress != pin);
+ j->flush_in_progress = NULL;
+ wake_up(&j->pin_flush_wait);
+ }
}
/**
@@ -235,104 +397,44 @@ void bch2_journal_reclaim_work(struct work_struct *work)
struct bch_fs, journal.reclaim_work);
struct journal *j = &c->journal;
struct bch_dev *ca;
- struct journal_entry_pin *pin;
- u64 seq, seq_to_flush = 0;
- unsigned iter, bucket_to_flush;
- unsigned long next_flush;
- bool reclaim_lock_held = false, need_flush;
+ unsigned iter, bucket_to_flush, min_nr = 0;
+ u64 seq_to_flush = 0;
+
+ journal_do_discards(j);
+
+ mutex_lock(&j->reclaim_lock);
+ spin_lock(&j->lock);
- /*
- * Advance last_idx to point to the oldest journal entry containing
- * btree node updates that have not yet been written out
- */
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
- while (should_discard_bucket(j, ja)) {
- if (!reclaim_lock_held) {
- /*
- * ugh:
- * might be called from __journal_res_get()
- * under wait_event() - have to go back to
- * TASK_RUNNING before doing something that
- * would block, but only if we're doing work:
- */
- __set_current_state(TASK_RUNNING);
-
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- /* recheck under reclaim_lock: */
- continue;
- }
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->last_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) % ja->nr;
- spin_unlock(&j->lock);
-
- journal_wake(j);
- }
-
- /*
- * Write out enough btree nodes to free up 50% journal
- * buckets
- */
- spin_lock(&j->lock);
+ /* Try to keep the journal at most half full: */
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
- spin_unlock(&j->lock);
}
/* Also flush if the pin fifo is more than half full */
- spin_lock(&j->lock);
seq_to_flush = max_t(s64, seq_to_flush,
(s64) journal_cur_seq(j) -
(j->pin.size >> 1));
+ spin_unlock(&j->lock);
/*
* If it's been longer than j->reclaim_delay_ms since we last flushed,
* make sure to flush at least one journal pin:
*/
- next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
- need_flush = time_after(jiffies, next_flush);
-
- while ((pin = journal_get_next_pin(j, need_flush
- ? U64_MAX
- : seq_to_flush, &seq))) {
- if (!reclaim_lock_held) {
- spin_unlock(&j->lock);
- __set_current_state(TASK_RUNNING);
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- spin_lock(&j->lock);
- continue;
- }
+ if (time_after(jiffies, j->last_flushed +
+ msecs_to_jiffies(j->reclaim_delay_ms)))
+ min_nr = 1;
- journal_pin_mark_flushing(j, pin, seq);
- spin_unlock(&j->lock);
-
- journal_pin_flush(j, pin, seq);
-
- need_flush = false;
- j->last_flushed = jiffies;
+ journal_flush_pins(j, seq_to_flush, min_nr);
- spin_lock(&j->lock);
- }
-
- spin_unlock(&j->lock);
-
- if (reclaim_lock_held)
- mutex_unlock(&j->reclaim_lock);
+ mutex_unlock(&j->reclaim_lock);
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
@@ -341,8 +443,6 @@ void bch2_journal_reclaim_work(struct work_struct *work)
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
- struct journal_entry_pin *pin;
- u64 pin_seq;
int ret;
ret = bch2_journal_error(j);
@@ -350,16 +450,10 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush)
return ret;
mutex_lock(&j->reclaim_lock);
- spin_lock(&j->lock);
-
- while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) {
- journal_pin_mark_flushing(j, pin, pin_seq);
- spin_unlock(&j->lock);
- journal_pin_flush(j, pin, pin_seq);
+ journal_flush_pins(j, seq_to_flush, 0);
- spin_lock(&j->lock);
- }
+ spin_lock(&j->lock);
/*
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
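
bch2_journal_dev_buckets_available(), moved into journal_reclaim.c above, is ring arithmetic over a device's journal buckets: cur_idx is the bucket being written, last_idx the oldest bucket that is still dirty. A stand-alone version of the arithmetic with the two reserve adjustments (allocator startup, the last bucket) collapsed into one parameter; names are simplified:

#include <stdio.h>

static unsigned buckets_available(unsigned nr, unsigned cur_idx,
				  unsigned last_idx, unsigned reserve)
{
	unsigned next = (cur_idx + 1) % nr;
	unsigned available = (last_idx + nr - next) % nr;

	return available > reserve ? available - reserve : 0;
}

int main(void)
{
	/* 8-bucket ring, writing bucket 5, bucket 2 still dirty: */
	printf("available = %u\n", buckets_available(8, 5, 2, 2));
	return 0;
}
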
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index 287590cd..1d688d6f 100644
--- a/libbcachefs/journal_reclaim.h
+++ b/libbcachefs/journal_reclaim.h
@@ -3,6 +3,10 @@
#define JOURNAL_PIN (32 * 1024)
+unsigned bch2_journal_dev_buckets_available(struct journal *,
+ struct journal_device *);
+void bch2_journal_space_available(struct journal *);
+
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->seq != 0;
@@ -16,6 +20,8 @@ journal_seq_pin(struct journal *j, u64 seq)
return &j->pin.data[seq & j->pin.mask];
}
+void bch2_journal_pin_put(struct journal *, u64);
+
void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
@@ -27,7 +33,6 @@ void bch2_journal_pin_add_if_older(struct journal *,
journal_pin_flush_fn);
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
-void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
void bch2_journal_flush_pins(struct journal *, u64);
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index a91662f6..8772e53f 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -21,8 +21,10 @@ struct journal_buf {
struct closure_waitlist wait;
- unsigned size;
- unsigned disk_sectors;
+ unsigned buf_size; /* size in bytes of @data */
+ unsigned sectors; /* maximum size for current entry */
+ unsigned disk_sectors; /* maximum size entry could have been, if
+ buf_size was bigger */
unsigned u64s_reserved;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
@@ -128,9 +130,20 @@ struct journal {
unsigned long flags;
union journal_res_state reservations;
+
+ /* Max size of current journal entry */
unsigned cur_entry_u64s;
- unsigned prev_buf_sectors;
- unsigned cur_buf_sectors;
+ unsigned cur_entry_sectors;
+
+ /*
+ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
+ * insufficient devices:
+ */
+ int cur_entry_error;
+
+ /* Reserved space in journal entry to be used just prior to write */
+ unsigned entry_u64s_reserved;
+
unsigned buf_size_want;
/*
@@ -141,6 +154,9 @@ struct journal {
spinlock_t lock;
+ /* if nonzero, we may not open a new journal entry: */
+ unsigned blocked;
+
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
@@ -155,9 +171,6 @@ struct journal {
u64 seq_ondisk;
u64 last_seq_ondisk;
- /* Reserved space in journal entry to be used just prior to write */
- unsigned entry_u64s_reserved;
-
/*
* FIFO of journal entries whose btree updates have not yet been
* written out.
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 7e50547c..77ab464a 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -82,7 +82,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v));
break;
case FS_USAGE_INODES:
- percpu_u64_set(&c->usage[0]->s.nr_inodes,
+ percpu_u64_set(&c->usage[0]->nr_inodes,
le64_to_cpu(u->v));
break;
case FS_USAGE_KEY_VERSION:
@@ -406,22 +406,19 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- ret = bch2_gc(c, &journal, true);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-
err = "unable to allocate journal buckets";
- for_each_online_member(ca, c, i)
- if (bch2_dev_journal_alloc(ca)) {
+ for_each_online_member(ca, c, i) {
+ ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
+ }
/*
* journal_res_get() will crash if called before this has
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 4d0c9718..99283b10 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -244,14 +244,14 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
*dst = *src;
for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
- if (!src->data[src_idx])
+ if (!src->replicas[src_idx])
continue;
dst_idx = __replicas_entry_idx(dst_r,
cpu_replicas_entry(src_r, src_idx));
BUG_ON(dst_idx < 0);
- dst->data[dst_idx] = src->data[src_idx];
+ dst->replicas[dst_idx] = src->replicas[src_idx];
}
}
@@ -261,39 +261,37 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
static int replicas_table_update(struct bch_fs *c,
struct bch_replicas_cpu *new_r)
{
- struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL };
+ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
+ struct bch_fs_usage __percpu *new_scratch = NULL;
unsigned bytes = sizeof(struct bch_fs_usage) +
sizeof(u64) * new_r->nr;
- unsigned i;
int ret = -ENOMEM;
- for (i = 0; i < 3; i++) {
- if (i < 2 && !c->usage[i])
- continue;
-
- new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO);
- if (!new_usage[i])
- goto err;
- }
-
- for (i = 0; i < 2; i++) {
- if (!c->usage[i])
- continue;
-
- __replicas_table_update(new_usage[i], new_r,
- c->usage[i], &c->replicas);
-
- swap(c->usage[i], new_usage[i]);
- }
-
- swap(c->usage_scratch, new_usage[2]);
+ if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
+ GFP_NOIO)) ||
+ (c->usage[1] &&
+ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
+ GFP_NOIO))) ||
+ !(new_scratch = __alloc_percpu_gfp(bytes, sizeof(u64),
+ GFP_NOIO)))
+ goto err;
- swap(c->replicas, *new_r);
+ if (c->usage[0])
+ __replicas_table_update(new_usage[0], new_r,
+ c->usage[0], &c->replicas);
+ if (c->usage[1])
+ __replicas_table_update(new_usage[1], new_r,
+ c->usage[1], &c->replicas);
+
+ swap(c->usage[0], new_usage[0]);
+ swap(c->usage[1], new_usage[1]);
+ swap(c->usage_scratch, new_scratch);
+ swap(c->replicas, *new_r);
ret = 0;
err:
- for (i = 0; i < 3; i++)
- free_percpu(new_usage[i]);
+ free_percpu(new_scratch);
+ free_percpu(new_usage[1]);
+ free_percpu(new_usage[0]);
return ret;
}
@@ -456,7 +454,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
if (__replicas_has_entry(&c->replicas_gc, e))
continue;
- v = percpu_u64_get(&c->usage[0]->data[i]);
+ v = percpu_u64_get(&c->usage[0]->replicas[i]);
if (!v)
continue;
@@ -557,7 +555,7 @@ int bch2_replicas_set_usage(struct bch_fs *c,
BUG_ON(ret < 0);
}
- percpu_u64_set(&c->usage[0]->data[idx], sectors);
+ percpu_u64_set(&c->usage[0]->replicas[idx], sectors);
return 0;
}
@@ -974,5 +972,6 @@ int bch2_fs_replicas_init(struct bch_fs *c)
{
c->journal.entry_u64s_reserved +=
reserve_journal_replicas(c, &c->replicas);
- return 0;
+
+ return replicas_table_update(c, &c->replicas);
}
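
replicas_table_update() has to remap every usage counter to its entry's index in the rebuilt table, since counters are addressed by position in the replicas table. A simplified model of that remapping, where replicas entries are plain strings and the per-cpu machinery is omitted:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int entry_idx(const char **table, unsigned nr, const char *e)
{
	for (unsigned i = 0; i < nr; i++)
		if (!strcmp(table[i], e))
			return i;
	return -1;
}

int main(void)
{
	const char *old_table[] = { "btree:1", "user:2" };
	const char *new_table[] = { "journal:1", "btree:1", "user:2" };
	uint64_t old_usage[2] = { 100, 250 };
	uint64_t new_usage[3] = { 0 };

	/* copy each nonzero counter to the entry's index in the new table */
	for (unsigned i = 0; i < 2; i++) {
		int j = entry_idx(new_table, 3, old_table[i]);

		if (old_usage[i] && j >= 0)
			new_usage[j] = old_usage[i];
	}

	for (unsigned i = 0; i < 3; i++)
		printf("%-10s %llu\n", new_table[i],
		       (unsigned long long) new_usage[i]);
	return 0;
}
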
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 1f343e64..a1ca837b 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -125,7 +125,7 @@ struct bch_hash_desc {
bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
};
-static inline struct btree_iter *
+static __always_inline struct btree_iter *
bch2_hash_lookup(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
@@ -159,7 +159,7 @@ bch2_hash_lookup(struct btree_trans *trans,
return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
}
-static inline struct btree_iter *
+static __always_inline struct btree_iter *
bch2_hash_hole(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
@@ -185,10 +185,11 @@ bch2_hash_hole(struct btree_trans *trans,
return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
}
-static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *start)
+static __always_inline
+int bch2_hash_needs_whiteout(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *start)
{
struct btree_iter *iter;
struct bkey_s_c k;
@@ -211,10 +212,11 @@ static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
return btree_iter_err(k);
}
-static inline int __bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- u64 inode, struct bkey_i *insert, int flags)
+static __always_inline
+int __bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ u64 inode, struct bkey_i *insert, int flags)
{
struct btree_iter *iter, *slot = NULL;
struct bkey_s_c k;
@@ -276,10 +278,11 @@ static inline int bch2_hash_set(const struct bch_hash_desc desc,
inode, insert, flags));
}
-static inline int bch2_hash_delete_at(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter)
+static __always_inline
+int bch2_hash_delete_at(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter)
{
struct bkey_i *delete;
int ret;
@@ -300,10 +303,11 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans,
return 0;
}
-static inline int bch2_hash_delete(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- u64 inode, const void *key)
+static __always_inline
+int bch2_hash_delete(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ u64 inode, const void *key)
{
struct btree_iter *iter;
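
The str_hash.h hunks switch these helpers to __always_inline so the compile-time-constant bch_hash_desc argument is propagated into each call site. A small sketch of that pattern; the descriptor struct and hash function are made up for illustration, and __always_inline is spelled out here for user space (the kernel supplies its own definition):

#include <stdio.h>

#define __always_inline	inline __attribute__((always_inline))

struct hash_desc {
	unsigned btree_id;
	unsigned (*hash)(const void *key);
};

static unsigned hash_string(const void *key)
{
	unsigned h = 2166136261u;

	for (const char *p = key; *p; p++)
		h = (h ^ (unsigned char) *p) * 16777619u;
	return h;
}

static __always_inline unsigned
hash_lookup_slot(const struct hash_desc desc, const void *key, unsigned nr_slots)
{
	/* with desc constant at the call site, desc.hash() can be inlined too */
	return desc.hash(key) % nr_slots;
}

static const struct hash_desc dirent_desc = {
	.btree_id	= 3,
	.hash		= hash_string,
};

int main(void)
{
	printf("slot %u\n", hash_lookup_slot(dirent_desc, "hello", 64));
	return 0;
}
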
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index b88750ff..71d97c57 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -136,7 +136,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
sb->bio = bio;
}
- new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order);
if (!new_sb)
return -ENOMEM;
@@ -923,7 +923,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
percpu_down_read_preempt_disable(&c->mark_lock);
{
- u64 nr_inodes = percpu_u64_get(&c->usage[0]->s.nr_inodes);
+ u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
@@ -970,7 +970,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- u64 sectors = percpu_u64_get(&c->usage[0]->data[i]);
+ u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]);
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index a8eb1615..1528f77e 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -567,7 +567,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
- unsigned i, iter_size, fs_usage_size;
+ unsigned i, iter_size;
const char *err;
pr_verbose_init(opts, "");
@@ -661,9 +661,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(btree_blocks(c) + 1) * 2 *
sizeof(struct btree_node_iter_set);
- fs_usage_size = sizeof(struct bch_fs_usage) +
- sizeof(u64) * c->replicas.nr;
-
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcache_copygc",
@@ -680,8 +677,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
- !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
- !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 7e3aebed..b56db15d 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -243,17 +243,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity);
pr_buf(&out, "hidden:\t\t\t\t%llu\n",
- fs_usage->s.hidden);
+ fs_usage->hidden);
pr_buf(&out, "data:\t\t\t\t%llu\n",
- fs_usage->s.data);
+ fs_usage->data);
pr_buf(&out, "cached:\t\t\t\t%llu\n",
- fs_usage->s.cached);
+ fs_usage->cached);
pr_buf(&out, "reserved:\t\t\t%llu\n",
- fs_usage->s.reserved);
+ fs_usage->reserved);
pr_buf(&out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->s.nr_inodes);
+ fs_usage->nr_inodes);
pr_buf(&out, "online reserved:\t\t%llu\n",
- fs_usage->s.online_reserved);
+ fs_usage->online_reserved);
for (i = 0;
i < ARRAY_SIZE(fs_usage->persistent_reserved);
@@ -269,7 +269,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
pr_buf(&out, "\t");
bch2_replicas_entry_to_text(&out, e);
- pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+ pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]);
}
percpu_up_read_preempt_enable(&c->mark_lock);