summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.bcachefs_revision2
-rw-r--r--include/trace/events/bcachefs.h28
-rw-r--r--libbcachefs.c6
-rw-r--r--libbcachefs/alloc_background.c202
-rw-r--r--libbcachefs/alloc_background.h2
-rw-r--r--libbcachefs/alloc_foreground.c158
-rw-r--r--libbcachefs/alloc_foreground.h31
-rw-r--r--libbcachefs/alloc_types.h13
-rw-r--r--libbcachefs/bcachefs.h5
-rw-r--r--libbcachefs/bcachefs_format.h15
-rw-r--r--libbcachefs/btree_cache.c21
-rw-r--r--libbcachefs/btree_gc.c62
-rw-r--r--libbcachefs/btree_gc.h1
-rw-r--r--libbcachefs/btree_iter.c1
-rw-r--r--libbcachefs/btree_update_interior.c2
-rw-r--r--libbcachefs/buckets.c243
-rw-r--r--libbcachefs/buckets.h54
-rw-r--r--libbcachefs/buckets_types.h7
-rw-r--r--libbcachefs/ec.c48
-rw-r--r--libbcachefs/ec.h2
-rw-r--r--libbcachefs/inode.c53
-rw-r--r--libbcachefs/io.c22
-rw-r--r--libbcachefs/journal.c57
-rw-r--r--libbcachefs/journal.h1
-rw-r--r--libbcachefs/journal_io.c7
-rw-r--r--libbcachefs/journal_seq_blacklist.c78
-rw-r--r--libbcachefs/journal_seq_blacklist.h2
-rw-r--r--libbcachefs/opts.c5
-rw-r--r--libbcachefs/opts.h1
-rw-r--r--libbcachefs/recovery.c106
-rw-r--r--libbcachefs/super-io.h1
-rw-r--r--libbcachefs/super.c33
-rw-r--r--libbcachefs/super.h21
-rw-r--r--libbcachefs/super_types.h1
-rw-r--r--libbcachefs/sysfs.c119
-rw-r--r--libbcachefs/tests.c8
36 files changed, 633 insertions, 785 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 6b238443..e2012548 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-f9d2e809a8c3b3a3c6bb0f8fe8e646425f7fce8d
+078a1a596a74ade60db6eee0f0be927defb7abed
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index fce31463..5a409ee1 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -318,6 +318,34 @@ DEFINE_EVENT(btree_node, btree_set_root,
TP_ARGS(c, b)
);
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(unsigned long nr_to_scan_pages,
+ unsigned long nr_to_scan_nodes,
+ unsigned long can_free_nodes,
+ long ret),
+ TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr_to_scan_pages )
+ __field(unsigned long, nr_to_scan_nodes )
+ __field(unsigned long, can_free_nodes )
+ __field(long, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_to_scan_pages = nr_to_scan_pages;
+ __entry->nr_to_scan_nodes = nr_to_scan_nodes;
+ __entry->can_free_nodes = can_free_nodes;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li",
+ __entry->nr_to_scan_pages,
+ __entry->nr_to_scan_nodes,
+ __entry->can_free_nodes,
+ __entry->ret)
+);
+
/* Garbage collection */
DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
diff --git a/libbcachefs.c b/libbcachefs.c
index 16f15a8d..076fe667 100644
--- a/libbcachefs.c
+++ b/libbcachefs.c
@@ -231,7 +231,6 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs,
m->first_bucket = 0;
m->bucket_size = cpu_to_le16(i->bucket_size);
- SET_BCH_MEMBER_REPLACEMENT(m, BCH_CACHE_REPLACEMENT_lru);
SET_BCH_MEMBER_DISCARD(m, i->discard);
SET_BCH_MEMBER_DATA_ALLOWED(m, i->data_allowed);
SET_BCH_MEMBER_DURABILITY(m, i->durability + 1);
@@ -521,7 +520,6 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
" Has data: %s\n"
- " Replacement policy: %s\n"
" Discard: %llu\n",
i, member_uuid_str,
pr_units(le16_to_cpu(m->bucket_size) *
@@ -539,10 +537,6 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
data_allowed_str,
data_has_str,
- BCH_MEMBER_REPLACEMENT(m) < BCH_CACHE_REPLACEMENT_NR
- ? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
- : "unknown",
-
BCH_MEMBER_DISCARD(m));
}
}
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index ed919b42..2a36af5e 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -354,6 +354,7 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
g = bucket(ca, k.k->p.offset);
u = bch2_alloc_unpack(k);
+ *bucket_gen(ca, k.k->p.offset) = u.gen;
g->_mark.gen = u.gen;
g->_mark.data_type = u.data_type;
g->_mark.dirty_sectors = u.dirty_sectors;
@@ -513,6 +514,18 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
test_bit(b, ca->buckets_nouse))
return false;
+ if (ca->new_fs_bucket_idx) {
+ /*
+ * Device or filesystem is still being initialized, and we
+ * haven't fully marked superblocks & journal:
+ */
+ if (is_superblock_bucket(ca, b))
+ return false;
+
+ if (b < ca->new_fs_bucket_idx)
+ return false;
+ }
+
gc_gen = bucket_gc_gen(bucket(ca, b));
ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
@@ -581,7 +594,7 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
buckets = bucket_array(ca);
ca->alloc_heap.used = 0;
now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.last_seq_ondisk;
+ last_seq_ondisk = c->journal.flushed_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@@ -628,76 +641,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
up_read(&ca->bucket_lock);
}
-static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket_mark m;
- size_t b, start;
-
- if (ca->fifo_last_bucket < ca->mi.first_bucket ||
- ca->fifo_last_bucket >= ca->mi.nbuckets)
- ca->fifo_last_bucket = ca->mi.first_bucket;
-
- start = ca->fifo_last_bucket;
-
- do {
- ca->fifo_last_bucket++;
- if (ca->fifo_last_bucket == ca->mi.nbuckets)
- ca->fifo_last_bucket = ca->mi.first_bucket;
-
- b = ca->fifo_last_bucket;
- m = READ_ONCE(buckets->b[b].mark);
-
- if (bch2_can_invalidate_bucket(ca, b, m)) {
- struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
-
- heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
- if (heap_full(&ca->alloc_heap))
- break;
- }
-
- cond_resched();
- } while (ca->fifo_last_bucket != start);
-}
-
-static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket_mark m;
- size_t checked, i;
-
- for (checked = 0;
- checked < ca->mi.nbuckets / 2;
- checked++) {
- size_t b = bch2_rand_range(ca->mi.nbuckets -
- ca->mi.first_bucket) +
- ca->mi.first_bucket;
-
- m = READ_ONCE(buckets->b[b].mark);
-
- if (bch2_can_invalidate_bucket(ca, b, m)) {
- struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
-
- heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
- if (heap_full(&ca->alloc_heap))
- break;
- }
-
- cond_resched();
- }
-
- sort(ca->alloc_heap.data,
- ca->alloc_heap.used,
- sizeof(ca->alloc_heap.data[0]),
- bucket_idx_cmp, NULL);
-
- /* remove duplicates: */
- for (i = 0; i + 1 < ca->alloc_heap.used; i++)
- if (ca->alloc_heap.data[i].bucket ==
- ca->alloc_heap.data[i + 1].bucket)
- ca->alloc_heap.data[i].nr = 0;
-}
-
static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
size_t i, nr = 0;
@@ -705,17 +648,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
- switch (ca->mi.replacement) {
- case BCH_CACHE_REPLACEMENT_lru:
- find_reclaimable_buckets_lru(c, ca);
- break;
- case BCH_CACHE_REPLACEMENT_fifo:
- find_reclaimable_buckets_fifo(c, ca);
- break;
- case BCH_CACHE_REPLACEMENT_random:
- find_reclaimable_buckets_random(c, ca);
- break;
- }
+ find_reclaimable_buckets_lru(c, ca);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
@@ -725,33 +658,11 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
return nr;
}
-/*
- * returns sequence number of most recent journal entry that updated this
- * bucket:
- */
-static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
-{
- if (m.journal_seq_valid) {
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u64 bucket_seq = journal_seq;
-
- bucket_seq &= ~((u64) U16_MAX);
- bucket_seq |= m.journal_seq;
-
- if (bucket_seq > journal_seq)
- bucket_seq -= 1 << 16;
-
- return bucket_seq;
- } else {
- return 0;
- }
-}
-
static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b)
+ struct bch_dev *ca, u64 b,
+ struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
- struct bkey_alloc_unpacked u;
struct btree_iter iter;
int ret;
@@ -765,16 +676,16 @@ static int bucket_invalidate_btree(struct btree_trans *trans,
if (ret)
goto err;
- u = alloc_mem_to_key(c, &iter);
+ *u = alloc_mem_to_key(c, &iter);
- u.gen++;
- u.data_type = 0;
- u.dirty_sectors = 0;
- u.cached_sectors = 0;
- u.read_time = atomic64_read(&c->io_clock[READ].now);
- u.write_time = atomic64_read(&c->io_clock[WRITE].now);
+ u->gen++;
+ u->data_type = 0;
+ u->dirty_sectors = 0;
+ u->cached_sectors = 0;
+ u->read_time = atomic64_read(&c->io_clock[READ].now);
+ u->write_time = atomic64_read(&c->io_clock[WRITE].now);
- ret = bch2_alloc_write(trans, &iter, &u,
+ ret = bch2_alloc_write(trans, &iter, u,
BTREE_TRIGGER_BUCKET_INVALIDATE);
err:
bch2_trans_iter_exit(trans, &iter);
@@ -784,21 +695,23 @@ err:
static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 *journal_seq, unsigned flags)
{
- struct bucket *g;
- struct bucket_mark m;
+ struct bkey_alloc_unpacked u;
size_t b;
int ret = 0;
+ /*
+ * If the read-only path is trying to shut down, we can't be generating
+ * new btree updates:
+ */
+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
+ return 1;
+
BUG_ON(!ca->alloc_heap.used ||
!ca->alloc_heap.data[0].nr);
b = ca->alloc_heap.data[0].bucket;
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read(&c->mark_lock);
- g = bucket(ca, b);
- m = READ_ONCE(g->mark);
-
- BUG_ON(m.dirty_sectors);
bch2_mark_alloc_bucket(c, ca, b, true);
@@ -807,37 +720,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(!fifo_push(&ca->free_inc, b));
spin_unlock(&c->freelist_lock);
- /*
- * If we're not invalidating cached data, we only increment the bucket
- * gen in memory here, the incremented gen will be updated in the btree
- * by bch2_trans_mark_pointer():
- */
- if (!m.cached_sectors &&
- !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
- BUG_ON(m.data_type);
- bucket_cmpxchg(g, m, m.gen++);
- percpu_up_read(&c->mark_lock);
- goto out;
- }
-
percpu_up_read(&c->mark_lock);
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
- ret = 1;
- goto out;
- }
-
ret = bch2_trans_do(c, NULL, journal_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RESERVED|
flags,
- bucket_invalidate_btree(&trans, ca, b));
-out:
+ bucket_invalidate_btree(&trans, ca, b, &u));
+
if (!ret) {
/* remove from alloc_heap: */
struct alloc_heap_entry e, *top = ca->alloc_heap.data;
@@ -853,7 +744,7 @@ out:
* bucket (i.e. deleting the last reference) before writing to
* this bucket again:
*/
- *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+ *journal_seq = max(*journal_seq, u.journal_seq);
} else {
size_t b2;
@@ -1133,7 +1024,7 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
ob++) {
spin_lock(&ob->lock);
if (ob->valid && !ob->on_partial_list &&
- ob->ptr.dev == ca->dev_idx)
+ ob->dev == ca->dev_idx)
ret = true;
spin_unlock(&ob->lock);
}
@@ -1280,22 +1171,3 @@ void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
}
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct open_bucket *ob;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list) {
- pr_buf(out, "%zu ref %u type %s\n",
- ob - c->open_buckets,
- atomic_read(&ob->pin),
- bch2_data_types[ob->type]);
- }
- spin_unlock(&ob->lock);
- }
-
-}
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index e3cdb8bc..86b64177 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -142,6 +142,4 @@ int bch2_dev_allocator_start(struct bch_dev *);
int bch2_alloc_write_all(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
-
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index dce77cc2..0a634125 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -43,9 +43,32 @@
* reference _after_ doing the index update that makes its allocation reachable.
*/
+static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ ob->hash = *slot;
+ *slot = idx;
+}
+
+static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ while (*slot != idx) {
+ BUG_ON(!*slot);
+ slot = &c->open_buckets[*slot].hash;
+ }
+
+ *slot = ob->hash;
+ ob->hash = 0;
+}
+
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
if (ob->ec) {
bch2_ec_bucket_written(c, ob);
@@ -55,14 +78,16 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false);
+ bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
ob->valid = false;
- ob->type = 0;
+ ob->data_type = 0;
spin_unlock(&ob->lock);
percpu_up_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
+ bch2_open_bucket_hash_remove(c, ob);
+
ob->freelist = c->open_buckets_freelist;
c->open_buckets_freelist = ob - c->open_buckets;
@@ -81,8 +106,7 @@ void bch2_open_bucket_write_error(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, obs, ob, i)
- if (ob->ptr.dev == dev &&
- ob->ec)
+ if (ob->dev == dev && ob->ec)
bch2_ec_bucket_cancel(c, ob);
}
@@ -95,7 +119,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
ob = c->open_buckets + c->open_buckets_freelist;
c->open_buckets_freelist = ob->freelist;
atomic_set(&ob->pin, 1);
- ob->type = 0;
+ ob->data_type = 0;
c->open_buckets_nr_free--;
return ob;
@@ -105,8 +129,8 @@ static void open_bucket_free_unused(struct bch_fs *c,
struct write_point *wp,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- bool may_realloc = wp->type == BCH_DATA_user;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ bool may_realloc = wp->data_type == BCH_DATA_user;
BUG_ON(ca->open_buckets_partial_nr >
ARRAY_SIZE(ca->open_buckets_partial));
@@ -133,32 +157,28 @@ static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs)
struct open_bucket *ob;
unsigned i;
+ rcu_read_lock();
open_bucket_for_each(c, obs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
- BUG_ON(ptr_stale(ca, &ob->ptr));
+ BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen);
}
+ rcu_read_unlock();
#endif
}
/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
- struct bucket_array *buckets;
- ssize_t b;
+ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
+ u64 b = ca->new_fs_bucket_idx++;
- rcu_read_lock();
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
- if (is_available_bucket(buckets->b[b].mark) &&
- (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)) &&
- !buckets->b[b].mark.owned_by_allocator)
- goto success;
- b = -1;
-success:
- rcu_read_unlock();
- return b;
+ if (!is_superblock_bucket(ca, b) &&
+ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
+ return b;
+ }
+
+ return -1;
}
static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
@@ -252,15 +272,14 @@ out:
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
- ob->ptr = (struct bch_extent_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = bucket(ca, b)->mark.gen,
- .offset = bucket_to_sector(ca, b),
- .dev = ca->dev_idx,
- };
-
+ ob->dev = ca->dev_idx;
+ ob->gen = *bucket_gen(ca, b);
+ ob->bucket = b;
spin_unlock(&ob->lock);
+ ca->nr_open_buckets++;
+ bch2_open_bucket_hash_add(c, ob);
+
if (c->blocked_allocate_open_bucket) {
bch2_time_stats_update(
&c->times[BCH_TIME_blocked_allocate_open_bucket],
@@ -275,7 +294,6 @@ out:
c->blocked_allocate = 0;
}
- ca->nr_open_buckets++;
spin_unlock(&c->freelist_lock);
bch2_wake_allocator(ca);
@@ -339,9 +357,9 @@ static void add_new_bucket(struct bch_fs *c,
struct open_bucket *ob)
{
unsigned durability =
- bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability;
+ bch_dev_bkey_exists(c, ob->dev)->mi.durability;
- __clear_bit(ob->ptr.dev, devs_may_alloc->d);
+ __clear_bit(ob->dev, devs_may_alloc->d);
*nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY)
? durability : 1;
*have_cache |= !durability;
@@ -451,13 +469,13 @@ static int bucket_alloc_from_stripe(struct bch_fs *c,
continue;
ob = c->open_buckets + h->s->blocks[ec_idx];
- if (ob->ptr.dev == devs_sorted.devs[i] &&
+ if (ob->dev == devs_sorted.devs[i] &&
!test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
}
goto out_put_head;
got_bucket:
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ ca = bch_dev_bkey_exists(c, ob->dev);
ob->ec_idx = ec_idx;
ob->ec = h->s;
@@ -487,12 +505,12 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
if (*nr_effective < nr_replicas &&
- test_bit(ob->ptr.dev, devs_may_alloc->d) &&
+ test_bit(ob->dev, devs_may_alloc->d) &&
(ca->mi.durability ||
- (wp->type == BCH_DATA_user && !*have_cache)) &&
+ (wp->data_type == BCH_DATA_user && !*have_cache)) &&
(ob->ec || !need_ec)) {
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache,
@@ -524,7 +542,7 @@ static int open_bucket_add_buckets(struct bch_fs *c,
unsigned i;
rcu_read_lock();
- devs = target_rw_devs(c, wp->type, target);
+ devs = target_rw_devs(c, wp->data_type, target);
rcu_read_unlock();
/* Don't allocate from devices we already have pointers to: */
@@ -532,7 +550,7 @@ static int open_bucket_add_buckets(struct bch_fs *c,
__clear_bit(devs_have->devs[i], devs.d);
open_bucket_for_each(c, ptrs, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
+ __clear_bit(ob->dev, devs.d);
if (erasure_code) {
if (!ec_open_bucket(c, ptrs)) {
@@ -592,7 +610,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
unsigned i, j;
open_bucket_for_each(c, obs, ob, i) {
- bool drop = !ca || ob->ptr.dev == ca->dev_idx;
+ bool drop = !ca || ob->dev == ca->dev_idx;
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
@@ -601,7 +619,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
continue;
ob2 = c->open_buckets + ob->ec->blocks[j];
- drop |= ob2->ptr.dev == ca->dev_idx;
+ drop |= ob2->dev == ca->dev_idx;
}
mutex_unlock(&ob->ec->lock);
}
@@ -785,11 +803,11 @@ retry:
wp = writepoint_find(c, write_point.v);
- if (wp->type == BCH_DATA_user)
+ if (wp->data_type == BCH_DATA_user)
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
/* metadata may not allocate on cache devices: */
- if (wp->type != BCH_DATA_user)
+ if (wp->data_type != BCH_DATA_user)
have_cache = true;
if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
@@ -867,12 +885,27 @@ err:
}
}
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ return (struct bch_extent_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .gen = ob->gen,
+ .dev = ob->dev,
+ .offset = bucket_to_sector(ca, ob->bucket) +
+ ca->mi.bucket_size -
+ ob->sectors_free,
+ };
+}
+
/*
* Append pointers to the space we just allocated to @k, and mark @sectors space
* as allocated out of @ob
*/
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
- struct bkey_i *k, unsigned sectors)
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
{
struct open_bucket *ob;
@@ -882,14 +915,14 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
wp->sectors_free -= sectors;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- struct bch_extent_ptr tmp = ob->ptr;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
- tmp.cached = !ca->mi.durability &&
- wp->type == BCH_DATA_user;
+ ptr.cached = cached ||
+ (!ca->mi.durability &&
+ wp->data_type == BCH_DATA_user);
- tmp.offset += ca->mi.bucket_size - ob->sectors_free;
- bch2_bkey_append_ptr(k, tmp);
+ bch2_bkey_append_ptr(k, ptr);
BUG_ON(sectors > ob->sectors_free);
ob->sectors_free -= sectors;
@@ -919,7 +952,7 @@ static inline void writepoint_init(struct write_point *wp,
enum bch_data_type type)
{
mutex_init(&wp->lock);
- wp->type = type;
+ wp->data_type = type;
}
void bch2_fs_allocator_foreground_init(struct bch_fs *c)
@@ -956,3 +989,22 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
writepoint_hash(c, wp->write_point));
}
}
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list) {
+ pr_buf(out, "%zu ref %u type %s\n",
+ ob - c->open_buckets,
+ atomic_read(&ob->pin),
+ bch2_data_types[ob->data_type]);
+ }
+ spin_unlock(&ob->lock);
+ }
+
+}
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 2e81712b..d466bda9 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -85,12 +85,36 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- ob->type = wp->type;
+ ob->data_type = wp->data_type;
atomic_inc(&ob->pin);
ob_push(c, ptrs, ob);
}
}
+static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
+ unsigned dev, u64 bucket)
+{
+ return c->open_buckets_hash +
+ (jhash_3words(dev, bucket, bucket >> 32, 0) &
+ (OPEN_BUCKETS_COUNT - 1));
+}
+
+static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
+
+ while (slot) {
+ struct open_bucket *ob = &c->open_buckets[slot];
+
+ if (ob->dev == dev && ob->bucket == bucket)
+ return true;
+
+ slot = ob->hash;
+ }
+
+ return false;
+}
+
int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
unsigned, unsigned *, bool *, enum alloc_reserve,
@@ -105,8 +129,9 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
unsigned,
struct closure *);
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
- struct bkey_i *, unsigned);
+ struct bkey_i *, unsigned, bool);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
@@ -127,4 +152,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
void bch2_fs_allocator_foreground_init(struct bch_fs *);
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 4a1cd8b7..409232e3 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -37,24 +37,31 @@ typedef FIFO(long) alloc_fifo;
#define WRITE_POINT_HASH_NR 32
#define WRITE_POINT_MAX 32
+/*
+ * 0 is never a valid open_bucket_idx_t:
+ */
typedef u16 open_bucket_idx_t;
struct open_bucket {
spinlock_t lock;
atomic_t pin;
open_bucket_idx_t freelist;
+ open_bucket_idx_t hash;
/*
* When an open bucket has an ec_stripe attached, this is the index of
* the block in the stripe this open_bucket corresponds to:
*/
u8 ec_idx;
- u8 type;
+ enum bch_data_type data_type:3;
unsigned valid:1;
unsigned on_partial_list:1;
int alloc_reserve:3;
+
unsigned sectors_free;
- struct bch_extent_ptr ptr;
+ u8 dev;
+ u8 gen;
+ u64 bucket;
struct ec_stripe_new *ec;
};
@@ -74,7 +81,7 @@ struct write_point {
struct mutex lock;
u64 last_used;
unsigned long write_point;
- enum bch_data_type type;
+ enum bch_data_type data_type;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 943487f2..3ada85ac 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -445,6 +445,7 @@ struct bch_dev {
* Or rcu_read_lock(), but only for ptr_stale():
*/
struct bucket_array __rcu *buckets[2];
+ struct bucket_gens *bucket_gens;
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
@@ -453,6 +454,7 @@ struct bch_dev {
struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
+ u64 new_fs_bucket_idx;
struct task_struct __rcu *alloc_thread;
/*
@@ -748,17 +750,18 @@ struct bch_fs {
/* JOURNAL SEQ BLACKLIST */
struct journal_seq_blacklist_table *
journal_seq_blacklist_table;
- struct work_struct journal_seq_blacklist_gc_work;
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
u64 blocked_allocate;
u64 blocked_allocate_open_bucket;
+
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
struct write_point btree_write_point;
struct write_point rebalance_write_point;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 495f4d19..a053fca7 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1063,8 +1063,7 @@ struct bch_member {
};
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
-/* 4-10 unused, was TIER, HAS_(META)DATA */
-LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
@@ -1088,18 +1087,6 @@ enum bch_member_state {
BCH_MEMBER_STATE_NR
};
-#define BCH_CACHE_REPLACEMENT_POLICIES() \
- x(lru, 0) \
- x(fifo, 1) \
- x(random, 2)
-
-enum bch_cache_replacement_policies {
-#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n,
- BCH_CACHE_REPLACEMENT_POLICIES()
-#undef x
- BCH_CACHE_REPLACEMENT_NR
-};
-
struct bch_sb_field_members {
struct bch_sb_field field;
struct bch_member members[0];
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 3411d5a0..b13563d7 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -274,6 +274,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned long freed = 0;
unsigned i, flags;
+ unsigned long ret = SHRINK_STOP;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@@ -282,7 +283,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (sc->gfp_mask & __GFP_FS)
mutex_lock(&bc->lock);
else if (!mutex_trylock(&bc->lock))
- return -1;
+ goto out_norestore;
flags = memalloc_nofs_save();
@@ -299,13 +300,19 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
i = 0;
list_for_each_entry_safe(b, t, &bc->freeable, list) {
+ /*
+ * Leave a few nodes on the freeable list, so that a btree split
+ * won't have to hit the system allocator:
+ */
+ if (++i <= 3)
+ continue;
+
touched++;
if (touched >= nr)
break;
- if (++i > 3 &&
- !btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -351,8 +358,14 @@ restart:
mutex_unlock(&bc->lock);
out:
+ ret = (unsigned long) freed * btree_pages(c);
memalloc_nofs_restore(flags);
- return (unsigned long) freed * btree_pages(c);
+out_norestore:
+ trace_btree_cache_scan(sc->nr_to_scan,
+ sc->nr_to_scan / btree_pages(c),
+ btree_cache_can_free(bc),
+ ret);
+ return ret;
}
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 91c69a9f..d1883701 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -170,10 +170,10 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- if (ret) {
- kfree(new);
+ kfree(new);
+
+ if (ret)
return ret;
- }
bch2_btree_node_drop_keys_outside_node(b);
@@ -199,10 +199,10 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- if (ret) {
- kfree(new);
+ kfree(new);
+
+ if (ret)
return ret;
- }
bch2_btree_node_drop_keys_outside_node(b);
@@ -504,8 +504,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
*/
bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
- struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ struct bucket *g2 = PTR_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
if (fsck_err_on(!g->gen_valid, c,
@@ -643,14 +643,14 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
ptr->gen = g->mark.gen;
}
} else {
bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
(ptr->cached &&
@@ -691,9 +691,9 @@ found:
}
ret = bch2_journal_key_insert(c, btree_id, level, new);
- if (ret)
- kfree(new);
- else
+ kfree(new);
+
+ if (!ret)
*k = bkey_i_to_s_c(new);
}
fsck_err:
@@ -737,7 +737,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
ptrs = bch2_bkey_ptrs_c(*k);
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
if (gen_after(g->oldest_gen, ptr->gen))
g->oldest_gen = ptr->gen;
@@ -1056,23 +1056,13 @@ static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
} while (start < end);
}
-void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
- unsigned flags)
+static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+ unsigned flags)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
unsigned i;
u64 b;
- /*
- * This conditional is kind of gross, but we may be called from the
- * device add path, before the new device has actually been added to the
- * running filesystem:
- */
- if (c) {
- lockdep_assert_held(&c->sb_lock);
- percpu_down_read(&c->mark_lock);
- }
-
for (i = 0; i < layout->nr_superblocks; i++) {
u64 offset = le64_to_cpu(layout->sb_offset[i]);
@@ -1091,9 +1081,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB), flags);
}
-
- if (c)
- percpu_up_read(&c->mark_lock);
}
static void bch2_mark_superblocks(struct bch_fs *c)
@@ -1283,7 +1270,6 @@ static int bch2_gc_start(struct bch_fs *c,
{
struct bch_dev *ca = NULL;
unsigned i;
- int ret;
BUG_ON(c->usage_gc);
@@ -1315,12 +1301,6 @@ static int bch2_gc_start(struct bch_fs *c,
}
}
- ret = bch2_ec_mem_alloc(c, true);
- if (ret) {
- bch_err(c, "error allocating ec gc mem");
- return ret;
- }
-
percpu_down_write(&c->mark_lock);
/*
@@ -1403,8 +1383,7 @@ static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
}
ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
- if (ret)
- kfree(new);
+ kfree(new);
}
fsck_err:
return ret;
@@ -1529,8 +1508,7 @@ inconsistent:
stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i);
- if (ret)
- kfree(new);
+ kfree(new);
}
fsck_err:
return ret;
@@ -1768,7 +1746,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
percpu_down_read(&c->mark_lock);
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, false);
+ struct bucket *g = PTR_BUCKET(ca, ptr);
if (gen_after(g->mark.gen, ptr->gen) > 16) {
percpu_up_read(&c->mark_lock);
@@ -1778,7 +1756,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, false);
+ struct bucket *g = PTR_BUCKET(ca, ptr);
if (gen_after(g->gc_gen, ptr->gen))
g->gc_gen = ptr->gen;
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 59dfb069..0665f594 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -8,7 +8,6 @@ int bch2_gc(struct bch_fs *, bool, bool);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
-void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
/*
* For concurrent mark and sweep (with other index updates), we define a total
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index e9091f8a..65ab2cd6 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -746,6 +746,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
k.k->p.snapshot));
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+ BTREE_ITER_NOPRESERVE|
BTREE_ITER_ALL_SNAPSHOTS);
prev = bch2_btree_iter_prev(&copy);
if (!prev.k)
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 9dca694b..6872e56b 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -236,7 +236,7 @@ retry:
}
bkey_btree_ptr_v2_init(&tmp.k);
- bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c));
+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
bch2_open_bucket_get(c, wp, &ob);
bch2_alloc_sectors_done(c, wp);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index c73abe69..738ce67d 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -50,7 +50,7 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
void bch2_bucket_seq_cleanup(struct bch_fs *c)
{
u64 journal_seq = atomic64_read(&c->journal.seq);
- u16 last_seq_ondisk = c->journal.last_seq_ondisk;
+ u16 last_seq_ondisk = c->journal.flushed_seq_ondisk;
struct bch_dev *ca;
struct bucket_array *buckets;
struct bucket *g;
@@ -340,13 +340,6 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
: m.data_type;
}
-static bool bucket_became_unavailable(struct bucket_mark old,
- struct bucket_mark new)
-{
- return is_available_bucket(old) &&
- !is_available_bucket(new);
-}
-
static inline void account_bucket(struct bch_fs_usage *fs_usage,
struct bch_dev_usage *dev_usage,
enum bch_data_type type,
@@ -532,19 +525,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
update_replicas_list(trans, &r.e, sectors);
}
-#define do_mark_fn(fn, c, pos, flags, ...) \
-({ \
- int gc, ret = 0; \
- \
- percpu_rwsem_assert_held(&c->mark_lock); \
- \
- for (gc = 0; gc < 2 && !ret; gc++) \
- if (!gc == !(flags & BTREE_TRIGGER_GC) || \
- (gc && gc_visited(c, pos))) \
- ret = fn(c, __VA_ARGS__, gc); \
- ret; \
-})
-
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator)
{
@@ -558,6 +538,20 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(owned_by_allocator == old.owned_by_allocator);
}
+static inline u8 bkey_alloc_gen(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_alloc:
+ return bkey_s_c_to_alloc(k).v->gen;
+ case KEY_TYPE_alloc_v2:
+ return bkey_s_c_to_alloc_v2(k).v->gen;
+ case KEY_TYPE_alloc_v3:
+ return bkey_s_c_to_alloc_v3(k).v->gen;
+ default:
+ return 0;
+ }
+}
+
static int bch2_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
@@ -565,16 +559,13 @@ static int bch2_mark_alloc(struct btree_trans *trans,
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bkey_alloc_unpacked u;
+ struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
+ struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new);
struct bch_dev *ca;
struct bucket *g;
struct bucket_mark old_m, m;
int ret = 0;
- /* We don't do anything for deletions - do we?: */
- if (!bkey_is_alloc(new.k))
- return 0;
-
/*
* alloc btree is read in by bch2_alloc_read, not gc:
*/
@@ -582,13 +573,21 @@ static int bch2_mark_alloc(struct btree_trans *trans,
!(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
return 0;
- if (flags & BTREE_TRIGGER_INSERT) {
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ !old_u.data_type != !new_u.data_type &&
+ new.k->type == KEY_TYPE_alloc_v3) {
struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
BUG_ON(!journal_seq);
- BUG_ON(new.k->type != KEY_TYPE_alloc_v3);
- v->journal_seq = cpu_to_le64(journal_seq);
+ /*
+ * If the btree updates referring to a bucket weren't flushed
+ * before the bucket became empty again, then we don't have
+ * to wait on a journal flush before we can reuse the bucket:
+ */
+ v->journal_seq = !new_u.data_type &&
+ bch2_journal_noflush_seq(&c->journal, journal_seq)
+ ? 0 : cpu_to_le64(journal_seq);
}
ca = bch_dev_bkey_exists(c, new.k->p.inode);
@@ -597,15 +596,17 @@ static int bch2_mark_alloc(struct btree_trans *trans,
return 0;
percpu_down_read(&c->mark_lock);
+ if (!gc && new_u.gen != bkey_alloc_gen(old))
+ *bucket_gen(ca, new.k->p.offset) = new_u.gen;
+
g = __bucket(ca, new.k->p.offset, gc);
- u = bch2_alloc_unpack(new);
old_m = bucket_cmpxchg(g, m, ({
- m.gen = u.gen;
- m.data_type = u.data_type;
- m.dirty_sectors = u.dirty_sectors;
- m.cached_sectors = u.cached_sectors;
- m.stripe = u.stripe != 0;
+ m.gen = new_u.gen;
+ m.data_type = new_u.data_type;
+ m.dirty_sectors = new_u.dirty_sectors;
+ m.cached_sectors = new_u.cached_sectors;
+ m.stripe = new_u.stripe != 0;
if (journal_seq) {
m.journal_seq_valid = 1;
@@ -615,12 +616,12 @@ static int bch2_mark_alloc(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
+ g->io_time[READ] = new_u.read_time;
+ g->io_time[WRITE] = new_u.write_time;
+ g->oldest_gen = new_u.oldest_gen;
g->gen_valid = 1;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
+ g->stripe = new_u.stripe;
+ g->stripe_redundancy = new_u.stripe_redundancy;
percpu_up_read(&c->mark_lock);
/*
@@ -655,17 +656,27 @@ static int bch2_mark_alloc(struct btree_trans *trans,
overflow; \
})
-static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type data_type,
- unsigned sectors, bool gc)
+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type data_type,
+ unsigned sectors, struct gc_pos pos,
+ unsigned flags)
{
- struct bucket *g = __bucket(ca, b, gc);
+ struct bucket *g;
struct bucket_mark old, new;
bool overflow;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
BUG_ON(data_type != BCH_DATA_sb &&
data_type != BCH_DATA_journal);
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return;
+
+ percpu_down_read(&c->mark_lock);
+ g = gc_bucket(ca, b);
old = bucket_cmpxchg(g, new, ({
new.data_type = data_type;
overflow = checked_add(new.dirty_sectors, sectors);
@@ -683,32 +694,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
bch2_data_types[old.data_type ?: data_type],
old.dirty_sectors, sectors);
- if (c)
- bch2_dev_usage_update(c, ca, old, new, 0, gc);
-
- return 0;
-}
-
-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type type,
- unsigned sectors, struct gc_pos pos,
- unsigned flags)
-{
- BUG_ON(type != BCH_DATA_sb &&
- type != BCH_DATA_journal);
-
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return;
-
- if (likely(c)) {
- do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
- ca, b, type, sectors);
- } else {
- __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
- }
+ bch2_dev_usage_update(c, ca, old, new, 0, true);
+ percpu_up_read(&c->mark_lock);
}
static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
@@ -809,17 +796,18 @@ static int mark_stripe_bucket(struct btree_trans *trans,
enum bch_data_type data_type = parity ? BCH_DATA_parity : 0;
s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
- bool gc = flags & BTREE_TRIGGER_GC;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g;
struct bucket_mark new, old;
char buf[200];
int ret = 0;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
/* * XXX doesn't handle deletion */
percpu_down_read(&c->mark_lock);
- g = PTR_BUCKET(ca, ptr, gc);
+ g = PTR_GC_BUCKET(ca, ptr);
if (g->mark.dirty_sectors ||
(g->stripe && g->stripe != k.k->p.offset)) {
@@ -853,7 +841,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
- bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
+ bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
err:
percpu_up_read(&c->mark_lock);
@@ -889,18 +877,19 @@ static int bch2_mark_pointer(struct btree_trans *trans,
s64 sectors, enum bch_data_type data_type,
unsigned flags)
{
- bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
+ struct bucket *g;
u8 bucket_data_type;
u64 v;
int ret = 0;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
percpu_down_read(&c->mark_lock);
- g = PTR_BUCKET(ca, &p.ptr, gc);
+ g = PTR_GC_BUCKET(ca, &p.ptr);
v = atomic64_read(&g->_mark.v);
do {
@@ -930,9 +919,7 @@ static int bch2_mark_pointer(struct btree_trans *trans,
old.v.counter,
new.v.counter)) != old.v.counter);
- bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
-
- BUG_ON(!gc && bucket_became_unavailable(old, new));
+ bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
err:
percpu_up_read(&c->mark_lock);
@@ -946,37 +933,35 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
s64 sectors,
unsigned flags)
{
- bool gc = flags & BTREE_TRIGGER_GC;
struct bch_fs *c = trans->c;
struct bch_replicas_padded r;
+ struct gc_stripe *m;
- if (!gc) {
- BUG();
- } else {
- struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
- if (!m)
- return -ENOMEM;
+ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
- spin_lock(&c->ec_stripes_heap_lock);
-
- if (!m || !m->alive) {
- spin_unlock(&c->ec_stripes_heap_lock);
- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
- (u64) p.idx);
- bch2_inconsistent_error(c);
- return -EIO;
- }
+ if (!m)
+ return -ENOMEM;
- m->block_sectors[p.block] += sectors;
+ spin_lock(&c->ec_stripes_heap_lock);
- r = m->r;
+ if (!m || !m->alive) {
spin_unlock(&c->ec_stripes_heap_lock);
-
- r.e.data_type = data_type;
- update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, gc);
+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
+ (u64) p.idx);
+ bch2_inconsistent_error(c);
+ return -EIO;
}
+ m->block_sectors[p.block] += sectors;
+
+ r = m->r;
+ spin_unlock(&c->ec_stripes_heap_lock);
+
+ r.e.data_type = data_type;
+ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+
return 0;
}
@@ -984,7 +969,6 @@ static int bch2_mark_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
- bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
@@ -1002,6 +986,8 @@ static int bch2_mark_extent(struct btree_trans *trans,
bool stale;
int ret;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
r.e.data_type = data_type;
r.e.nr_devs = 0;
r.e.nr_required = 1;
@@ -1022,7 +1008,7 @@ static int bch2_mark_extent(struct btree_trans *trans,
if (p.ptr.cached) {
if (!stale) {
ret = update_cached_sectors(c, k, p.ptr.dev,
- disk_sectors, journal_seq, gc);
+ disk_sectors, journal_seq, true);
if (ret) {
bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
return ret;
@@ -1047,7 +1033,7 @@ static int bch2_mark_extent(struct btree_trans *trans,
}
if (r.e.nr_devs) {
- ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, gc);
+ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
if (ret) {
char buf[200];
@@ -1114,7 +1100,11 @@ static int bch2_mark_stripe(struct btree_trans *trans,
spin_unlock(&c->ec_stripes_heap_lock);
}
} else {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, idx);
+ struct gc_stripe *m =
+ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+ if (!m)
+ return -ENOMEM;
/*
* This will be wrong when we bring back runtime gc: we should
@@ -1198,6 +1188,8 @@ static int bch2_mark_reservation(struct btree_trans *trans,
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
if (flags & BTREE_TRIGGER_OVERWRITE)
sectors = -sectors;
sectors *= replicas;
@@ -1247,19 +1239,13 @@ not_found:
*/
if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
- struct bkey_i_error *new;
-
- new = kmalloc(sizeof(*new), GFP_KERNEL);
- if (!new) {
- bch_err(c, "%s: error allocating new key", __func__);
- return -ENOMEM;
- }
+ struct bkey_i_error new;
- bkey_init(&new->k);
- new->k.type = KEY_TYPE_error;
- new->k.p = p.k->p;
- new->k.size = p.k->size;
- ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i);
+ bkey_init(&new.k);
+ new.k.type = KEY_TYPE_error;
+ new.k.p = p.k->p;
+ new.k.size = p.k->size;
+ ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i);
}
fsck_err:
return ret;
@@ -1278,6 +1264,8 @@ static int bch2_mark_reflink_p(struct btree_trans *trans,
u64 end = le64_to_cpu(p.v->idx) + p.k->size;
int ret = 0;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) {
idx -= le32_to_cpu(p.v->front_pad);
end += le32_to_cpu(p.v->back_pad);
@@ -2170,9 +2158,18 @@ static void buckets_free_rcu(struct rcu_head *rcu)
buckets->nbuckets * sizeof(struct bucket));
}
+static void bucket_gens_free_rcu(struct rcu_head *rcu)
+{
+ struct bucket_gens *buckets =
+ container_of(rcu, struct bucket_gens, rcu);
+ /* RCU callback: size must mirror the allocation (struct bucket_gens header + one u8 gen per bucket) */
+ kvpfree(buckets, sizeof(struct bucket_gens) + buckets->nbuckets);
+}
+
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_array *buckets = NULL, *old_buckets = NULL;
+ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
unsigned long *buckets_nouse = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
@@ -2196,6 +2193,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
+ !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+ GFP_KERNEL|__GFP_ZERO)) ||
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
@@ -2208,6 +2207,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = nbuckets;
+ bucket_gens->first_bucket = ca->mi.first_bucket;
+ bucket_gens->nbuckets = nbuckets;
bch2_copygc_stop(c);
@@ -2218,6 +2219,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
}
old_buckets = bucket_array(ca);
+ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
@@ -2225,13 +2227,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
memcpy(buckets->b,
old_buckets->b,
n * sizeof(struct bucket));
+ memcpy(bucket_gens->b,
+ old_bucket_gens->b,
+ n);
memcpy(buckets_nouse,
ca->buckets_nouse,
BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->buckets[0], buckets);
- buckets = old_buckets;
+ rcu_assign_pointer(ca->bucket_gens, bucket_gens);
+ buckets = old_buckets;
+ bucket_gens = old_bucket_gens;
swap(ca->buckets_nouse, buckets_nouse);
@@ -2265,6 +2272,8 @@ err:
free_fifo(&free[i]);
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+ if (bucket_gens) /* queue the gens array's own rcu head; the callback container_of()s into struct bucket_gens */
+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
if (buckets)
call_rcu(&old_buckets->rcu, buckets_free_rcu);
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index ac9b554a..45c6d230 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -53,11 +53,34 @@ static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
return buckets->b + b;
}
+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
+{
+ return __bucket(ca, b, true);
+}
+
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
{
return __bucket(ca, b, false);
}
+static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
+{
+ return rcu_dereference_check(ca->bucket_gens,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
+
+}
+
+static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
+{
+ struct bucket_gens *gens = bucket_gens(ca);
+
+ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);
+ return gens->b + b;
+}
+
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
@@ -75,10 +98,15 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
}
static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr,
- bool gc)
+ const struct bch_extent_ptr *ptr)
{
- return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
+ return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
}
static inline enum bch_data_type ptr_data_type(const struct bkey *k,
@@ -91,18 +119,6 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k,
return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
}
-static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- struct bucket_mark m;
-
- rcu_read_lock();
- m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark);
- rcu_read_unlock();
-
- return m;
-}
-
static inline int gen_cmp(u8 a, u8 b)
{
return (s8) (a - b);
@@ -122,7 +138,13 @@ static inline int gen_after(u8 a, u8 b)
static inline u8 ptr_stale(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
- return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
+ u8 ret;
+
+ rcu_read_lock();
+ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ rcu_read_unlock();
+
+ return ret;
}
/* bucket gc marks */
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index b2de2995..18bca269 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -52,6 +52,13 @@ struct bucket_array {
struct bucket b[];
};
+struct bucket_gens {
+ struct rcu_head rcu;
+ u16 first_bucket;
+ size_t nbuckets;
+ u8 b[];
+};
+
struct bch_dev_usage {
u64 buckets_ec;
u64 buckets_unavailable;
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 7d78672d..3cccd1fa 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -1063,7 +1063,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
if (!ob)
return NULL;
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ ca = bch_dev_bkey_exists(c, ob->dev);
offset = ca->mi.bucket_size - ob->sectors_free;
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
@@ -1318,7 +1318,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
h->s->blocks[j] = buckets.v[i];
- h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, h->s->blocks_gotten);
}
@@ -1346,7 +1346,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
BUG_ON(j >= h->s->nr_data);
h->s->blocks[j] = buckets.v[i];
- h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, h->s->blocks_gotten);
}
@@ -1535,7 +1535,7 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
continue;
ob = c->open_buckets + h->s->blocks[i];
- if (ob->ptr.dev == ca->dev_idx)
+ if (ob->dev == ca->dev_idx)
goto found;
}
goto unlock;
@@ -1608,46 +1608,6 @@ int bch2_stripes_read(struct bch_fs *c)
return ret;
}
-int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- size_t i, idx = 0;
- int ret = 0;
-
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0);
-
- k = bch2_btree_iter_prev(&iter);
- ret = bkey_err(k);
- if (!ret && k.k)
- idx = k.k->p.offset + 1;
-
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
- if (ret)
- return ret;
-
- if (!idx)
- return 0;
-
- if (!gc &&
- !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
- GFP_KERNEL))
- return -ENOMEM;
-#if 0
- ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
-#else
- for (i = 0; i < idx; i++)
- if (!gc
- ? !genradix_ptr_alloc(&c->stripes, i, GFP_KERNEL)
- : !genradix_ptr_alloc(&c->gc_stripes, i, GFP_KERNEL))
- return -ENOMEM;
-#endif
- return 0;
-}
-
void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index 46814107..78d468c7 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -217,8 +217,6 @@ void bch2_stripes_heap_start(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);
-int bch2_ec_mem_alloc(struct bch_fs *, bool);
-
void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 99b2a77e..3a7c1468 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -4,6 +4,7 @@
#include "btree_key_cache.h"
#include "bkey_methods.h"
#include "btree_update.h"
+#include "buckets.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@@ -584,59 +585,49 @@ found_slot:
static int bch2_inode_delete_keys(struct btree_trans *trans,
subvol_inum inum, enum btree_id id)
{
- u64 offset = 0;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ u32 snapshot;
int ret = 0;
- while (!ret || ret == -EINTR) {
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i delete;
- u32 snapshot;
+ /*
+ * We're never going to be deleting extents, no need to use an extent
+ * iterator:
+ */
+ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ while (1) {
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- continue;
+ goto err;
- bch2_trans_iter_init(trans, &iter, id,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
-
- if (!k.k || iter.pos.inode != inum.inum) {
- bch2_trans_iter_exit(trans, &iter);
- break;
- }
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+ k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
goto err;
+ if (!k.k || iter.pos.inode != inum.inum)
+ break;
+
bkey_init(&delete.k);
delete.k.p = iter.pos;
- if (btree_node_type_is_extents(iter.btree_id)) {
- unsigned max_sectors =
- min_t(u64, U64_MAX - iter.pos.offset,
- KEY_SIZE_MAX & (~0 << trans->c->block_bits));
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
-
- ret = bch2_extent_trim_atomic(trans, &iter, &delete);
- if (ret)
- goto err;
- }
-
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
+ if (ret && ret != -EINTR)
+ break;
}
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 1b954dff..50b90b72 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -665,11 +665,7 @@ static void init_append_extent(struct bch_write_op *op,
{
struct bch_fs *c = op->c;
struct bkey_i_extent *e;
- struct open_bucket *ob;
- unsigned i;
- BUG_ON(crc.compressed_size > wp->sectors_free);
- wp->sectors_free -= crc.compressed_size;
op->pos.offset += crc.uncompressed_size;
e = bkey_extent_init(op->insert_keys.top);
@@ -682,22 +678,8 @@ static void init_append_extent(struct bch_write_op *op,
crc.nonce)
bch2_extent_crc_append(&e->k_i, crc);
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- union bch_extent_entry *end =
- bkey_val_end(bkey_i_to_s(&e->k_i));
-
- end->ptr = ob->ptr;
- end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- end->ptr.cached = !ca->mi.durability ||
- (op->flags & BCH_WRITE_CACHED) != 0;
- end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
-
- e->k.u64s++;
-
- BUG_ON(crc.compressed_size > ob->sectors_free);
- ob->sectors_free -= crc.compressed_size;
- }
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size,
+ op->flags & BCH_WRITE_CACHED);
bch2_keylist_push(&op->insert_keys);
}
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 40e7cb62..158df42e 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -705,6 +705,44 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ u64 unwritten_seq;
+ bool ret = false;
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+ return false;
+
+ if (seq <= c->journal.flushed_seq_ondisk)
+ return false;
+
+ spin_lock(&j->lock);
+ if (seq <= c->journal.flushed_seq_ondisk)
+ goto out;
+
+ for (unwritten_seq = last_unwritten_seq(j);
+ unwritten_seq < seq;
+ unwritten_seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+ /* journal write is already in flight, and was a flush write: */
+ if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush)
+ goto out;
+
+ buf->noflush = true;
+ }
+
+ ret = true;
+out:
+ spin_unlock(&j->lock);
+ return ret;
+}
+
/* block/unlock the journal: */
void bch2_journal_unblock(struct journal *j)
@@ -775,11 +813,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
long b;
if (new_fs) {
- if (c)
- percpu_down_read(&c->mark_lock);
b = bch2_bucket_alloc_new_fs(ca);
if (b < 0) {
- percpu_up_read(&c->mark_lock);
ret = -ENOSPC;
goto err;
}
@@ -793,7 +828,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
goto err;
}
- b = sector_to_bucket(ca, ob->ptr.offset);
+ b = ob->bucket;
}
if (c)
@@ -827,14 +862,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (c)
spin_unlock(&c->journal.lock);
- if (new_fs) {
- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- 0);
- if (c)
- percpu_up_read(&c->mark_lock);
- } else {
+ if (!new_fs) {
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_trans_mark_metadata_bucket(&trans, ca,
b, BCH_DATA_journal,
@@ -1000,11 +1028,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
j->replay_journal_seq = last_seq;
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
- j->flushed_seq_ondisk = last_seq;
+ j->flushed_seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
+ if (list_empty(journal_entries))
+ j->last_empty_seq = cur_seq - 1;
+
fifo_for_each_entry_ptr(p, &j->pin, seq)
journal_pin_list_init(p, 1);
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index c39cbbf1..b2988732 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -477,6 +477,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
void bch2_journal_halt(struct journal *);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index e161e86e..77201a0e 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1399,9 +1399,10 @@ void bch2_journal_write(struct closure *cl)
spin_lock(&j->lock);
if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
- !w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+ (w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index 79bc0e49..10bd23e9 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -235,81 +235,3 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
.validate = bch2_sb_journal_seq_blacklist_validate,
.to_text = bch2_sb_journal_seq_blacklist_to_text
};
-
-void bch2_blacklist_entries_gc(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs,
- journal_seq_blacklist_gc_work);
- struct journal_seq_blacklist_table *t;
- struct bch_sb_field_journal_seq_blacklist *bl;
- struct journal_seq_blacklist_entry *src, *dst;
- struct btree_trans trans;
- unsigned i, nr, new_nr;
- int ret;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_iter iter;
- struct btree *b;
-
- bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
- 0, 0, BTREE_ITER_PREFETCH);
-retry:
- bch2_trans_begin(&trans);
-
- b = bch2_btree_iter_peek_node(&iter);
-
- while (!(ret = PTR_ERR_OR_ZERO(b)) &&
- b &&
- !test_bit(BCH_FS_STOPPING, &c->flags))
- b = bch2_btree_iter_next_node(&iter);
-
- if (ret == -EINTR)
- goto retry;
-
- bch2_trans_iter_exit(&trans, &iter);
- }
-
- bch2_trans_exit(&trans);
- if (ret)
- return;
-
- mutex_lock(&c->sb_lock);
- bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
- if (!bl)
- goto out;
-
- nr = blacklist_nr_entries(bl);
- dst = bl->start;
-
- t = c->journal_seq_blacklist_table;
- BUG_ON(nr != t->nr);
-
- for (src = bl->start, i = eytzinger0_first(t->nr);
- src < bl->start + nr;
- src++, i = eytzinger0_next(i, nr)) {
- BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
- BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
-
- if (t->entries[i].dirty)
- *dst++ = *src;
- }
-
- new_nr = dst - bl->start;
-
- bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
-
- if (new_nr != nr) {
- bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
- new_nr ? sb_blacklist_u64s(new_nr) : 0);
- BUG_ON(new_nr && !bl);
-
- if (!new_nr)
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
-
- bch2_write_super(c);
- }
-out:
- mutex_unlock(&c->sb_lock);
-}
diff --git a/libbcachefs/journal_seq_blacklist.h b/libbcachefs/journal_seq_blacklist.h
index afb886ec..b4f876a0 100644
--- a/libbcachefs/journal_seq_blacklist.h
+++ b/libbcachefs/journal_seq_blacklist.h
@@ -17,6 +17,4 @@ int bch2_blacklist_table_initialize(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
-void bch2_blacklist_entries_gc(struct work_struct *);
-
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 9b75c852..d9ca69f2 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -66,11 +66,6 @@ const char * const bch2_data_types[] = {
NULL
};
-const char * const bch2_cache_replacement_policies[] = {
- BCH_CACHE_REPLACEMENT_POLICIES()
- NULL
-};
-
const char * const bch2_member_states[] = {
BCH_MEMBER_STATES()
NULL
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index aadd3958..661eb576 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -19,7 +19,6 @@ extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const bch2_data_types[];
-extern const char * const bch2_cache_replacement_policies[];
extern const char * const bch2_member_states[];
extern const char * const bch2_d_types[];
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 29fe6260..8b0e468f 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -115,21 +115,12 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
struct journal_key n = {
.btree_id = id,
.level = level,
- .k = k,
.allocated = true
};
struct journal_keys *keys = &c->journal_keys;
struct journal_iter *iter;
unsigned idx = journal_key_search(keys, id, level, k->k.p);
- if (idx < keys->nr &&
- journal_key_cmp(&n, &keys->d[idx]) == 0) {
- if (keys->d[idx].allocated)
- kfree(keys->d[idx].k);
- keys->d[idx] = n;
- return 0;
- }
-
if (keys->nr == keys->size) {
struct journal_keys new_keys = {
.nr = keys->nr,
@@ -149,10 +140,23 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
*keys = new_keys;
}
- array_insert_item(keys->d, keys->nr, idx, n);
+ n.k = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+ if (!n.k)
+ return -ENOMEM;
+
+ bkey_copy(n.k, k);
+
+ if (idx < keys->nr &&
+ journal_key_cmp(&n, &keys->d[idx]) == 0) {
+ if (keys->d[idx].allocated)
+ kfree(keys->d[idx].k);
+ keys->d[idx] = n;
+ } else {
+ array_insert_item(keys->d, keys->nr, idx, n);
- list_for_each_entry(iter, &c->journal_iters, list)
- journal_iter_fix(c, iter, idx);
+ list_for_each_entry(iter, &c->journal_iters, list)
+ journal_iter_fix(c, iter, idx);
+ }
return 0;
}
@@ -160,22 +164,12 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
unsigned level, struct bpos pos)
{
- struct bkey_i *whiteout =
- kmalloc(sizeof(struct bkey), GFP_KERNEL);
- int ret;
-
- if (!whiteout) {
- bch_err(c, "%s: error allocating new key", __func__);
- return -ENOMEM;
- }
+ struct bkey_i whiteout;
- bkey_init(&whiteout->k);
- whiteout->k.p = pos;
+ bkey_init(&whiteout.k);
+ whiteout.k.p = pos;
- ret = bch2_journal_key_insert(c, id, level, whiteout);
- if (ret)
- kfree(whiteout);
- return ret;
+ return bch2_journal_key_insert(c, id, level, &whiteout);
}
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
@@ -1149,16 +1143,6 @@ use_clean:
if (ret)
goto err;
- /*
- * After an unclean shutdown, skip then next few journal sequence
- * numbers as they may have been referenced by btree writes that
- * happened before their corresponding journal writes - those btree
- * writes need to be ignored, by skipping and blacklisting the next few
- * journal sequence numbers:
- */
- if (!c->sb.clean)
- journal_seq += 8;
-
if (blacklist_seq != journal_seq) {
ret = bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
@@ -1295,24 +1279,15 @@ use_clean:
bch_verbose(c, "quotas done");
}
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
- struct bch_move_stats stats;
-
- bch_move_stats_init(&stats, "recovery");
-
- bch_info(c, "scanning for old btree nodes");
- ret = bch2_fs_read_write(c);
- if (ret)
- goto err;
-
- ret = bch2_scan_old_btree_nodes(c, &stats);
- if (ret)
- goto err;
- bch_info(c, "scanning for old btree nodes done");
- }
-
mutex_lock(&c->sb_lock);
+ /*
+ * With journal replay done, we can clear the journal seq blacklist
+ * table:
+ */
+ BUG_ON(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
+ if (le16_to_cpu(c->sb.version_min) >= bcachefs_metadata_version_btree_ptr_sectors_written)
+ bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 0);
+
if (c->opts.version_upgrade) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
@@ -1336,9 +1311,23 @@ use_clean:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (c->journal_seq_blacklist_table &&
- c->journal_seq_blacklist_table->nr > 128)
- queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
+ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) {
+ struct bch_move_stats stats;
+
+ bch_move_stats_init(&stats, "recovery");
+
+ bch_info(c, "scanning for old btree nodes");
+ ret = bch2_fs_read_write(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_scan_old_btree_nodes(c, &stats);
+ if (ret)
+ goto err;
+ bch_info(c, "scanning for old btree nodes done");
+ }
ret = 0;
out:
@@ -1383,9 +1372,6 @@ int bch2_fs_initialize(struct bch_fs *c)
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
bch2_write_super(c);
}
-
- for_each_online_member(ca, c, i)
- bch2_mark_dev_superblock(c, ca, 0);
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
@@ -1429,6 +1415,8 @@ int bch2_fs_initialize(struct bch_fs *c)
percpu_ref_put(&ca->ref);
goto err;
}
+
+ ca->new_fs_bucket_idx = 0;
}
err = "error creating root snapshot node";
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index b64ac2fb..5c264875 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -110,7 +110,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
.bucket_size = le16_to_cpu(mi->bucket_size),
.group = BCH_MEMBER_GROUP(mi),
.state = BCH_MEMBER_STATE(mi),
- .replacement = BCH_MEMBER_REPLACEMENT(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
.durability = BCH_MEMBER_DURABILITY(mi)
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 58bc2903..df6bffef 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -528,8 +528,6 @@ void __bch2_fs_stop(struct bch_fs *c)
set_bit(BCH_FS_STOPPING, &c->flags);
- cancel_work_sync(&c->journal_seq_blacklist_gc_work);
-
down_write(&c->state_lock);
bch2_fs_read_only(c);
up_write(&c->state_lock);
@@ -692,9 +690,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
spin_lock_init(&c->btree_write_error_lock);
- INIT_WORK(&c->journal_seq_blacklist_gc_work,
- bch2_blacklist_entries_gc);
-
INIT_LIST_HEAD(&c->journal_entries);
INIT_LIST_HEAD(&c->journal_iters);
@@ -1600,8 +1595,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_dev *ca = NULL;
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
- struct bucket_array *buckets;
- struct bucket *g;
unsigned dev_idx, nr_devices, u64s;
int ret;
@@ -1631,20 +1624,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
return ret;
}
- /*
- * We want to allocate journal on the new device before adding the new
- * device to the filesystem because allocating after we attach requires
- * spinning up the allocator thread, and the allocator thread requires
- * doing btree writes, which if the existing devices are RO isn't going
- * to work
- *
- * So we have to mark where the superblocks are, but marking allocated
- * data normally updates the filesystem usage too, so we have to mark,
- * allocate the journal, reset all the marks, then remark after we
- * attach...
- */
- bch2_mark_dev_superblock(NULL, ca, 0);
-
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
if (ret)
@@ -1705,21 +1684,13 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
- /*
- * Clear marks before marking transactionally in the btree, so that
- * per-device accounting gets done correctly:
- */
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
- for_each_bucket(g, buckets)
- atomic64_set(&g->_mark.v, 0);
- up_read(&ca->bucket_lock);
-
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret)
goto err_late;
+ ca->new_fs_bucket_idx = 0;
+
if (ca->mi.state == BCH_MEMBER_STATE_rw) {
ret = __bch2_dev_read_write(c, ca);
if (ret)
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 739e8fd1..c3273e9c 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -194,6 +194,27 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
return devs;
}
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 b_offset = bucket_to_sector(ca, b);
+ u64 b_end = bucket_to_sector(ca, b + 1);
+ unsigned i;
+
+ if (!b)
+ return true;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ u64 end = offset + (1 << layout->sb_max_size_bits);
+
+ if (!(offset >= b_end || end <= b_offset))
+ return true;
+ }
+
+ return false;
+}
+
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 96023f37..d8b159a5 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -29,7 +29,6 @@ struct bch_member_cpu {
u16 bucket_size; /* sectors */
u16 group;
u8 state;
- u8 replacement;
u8 discard;
u8 data_allowed;
u8 durability;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 0a0798ba..6d159632 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -10,6 +10,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
+#include "alloc_foreground.h"
#include "sysfs.h"
#include "btree_cache.h"
#include "btree_io.h"
@@ -131,7 +132,6 @@ do { \
return strtoi_h(buf, &var) ?: (ssize_t) size; \
} while (0)
-write_attribute(trigger_journal_flush);
write_attribute(trigger_gc);
write_attribute(prune_cache);
rw_attribute(btree_gc_periodic);
@@ -177,7 +177,6 @@ read_attribute(extent_migrate_done);
read_attribute(extent_migrate_raced);
rw_attribute(discard);
-rw_attribute(cache_replacement_policy);
rw_attribute(label);
rw_attribute(copy_gc_enabled);
@@ -267,8 +266,12 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
+ enum btree_id id;
+ u64 nr_uncompressed_extents = 0,
nr_compressed_extents = 0,
+ nr_incompressible_extents = 0,
+ uncompressed_sectors = 0,
+ incompressible_sectors = 0,
compressed_sectors_compressed = 0,
compressed_sectors_uncompressed = 0;
int ret;
@@ -278,47 +281,72 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret)
- if (k.k->type == KEY_TYPE_extent) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!((1U << id) & BTREE_ID_HAS_PTRS))
+ continue;
+
+ for_each_btree_key(&trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
-
- extent_for_each_ptr_decode(e, p, entry) {
- if (!crc_is_compressed(p.crc)) {
- nr_uncompressed_extents++;
- uncompressed_sectors += e.k->size;
- } else {
- nr_compressed_extents++;
+ bool compressed = false, uncompressed = false, incompressible = false;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ switch (p.crc.compression_type) {
+ case BCH_COMPRESSION_TYPE_none:
+ uncompressed = true;
+ uncompressed_sectors += k.k->size;
+ break;
+ case BCH_COMPRESSION_TYPE_incompressible:
+ incompressible = true;
+ incompressible_sectors += k.k->size;
+ break;
+ default:
compressed_sectors_compressed +=
p.crc.compressed_size;
compressed_sectors_uncompressed +=
p.crc.uncompressed_size;
+ compressed = true;
+ break;
}
-
- /* only looking at the first ptr */
- break;
}
+
+ if (incompressible)
+ nr_incompressible_extents++;
+ else if (uncompressed)
+ nr_uncompressed_extents++;
+ else if (compressed)
+ nr_compressed_extents++;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(&trans, &iter);
+ }
bch2_trans_exit(&trans);
+
if (ret)
return ret;
- pr_buf(out,
- "uncompressed data:\n"
- " nr extents: %llu\n"
- " size (bytes): %llu\n"
- "compressed data:\n"
- " nr extents: %llu\n"
- " compressed size (bytes): %llu\n"
- " uncompressed size (bytes): %llu\n",
- nr_uncompressed_extents,
- uncompressed_sectors << 9,
- nr_compressed_extents,
- compressed_sectors_compressed << 9,
- compressed_sectors_uncompressed << 9);
+ pr_buf(out, "uncompressed:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents);
+ pr_buf(out, " size: ");
+ bch2_hprint(out, uncompressed_sectors << 9);
+ pr_buf(out, "\n");
+
+ pr_buf(out, "compressed:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_compressed_extents);
+ pr_buf(out, " compressed size: ");
+ bch2_hprint(out, compressed_sectors_compressed << 9);
+ pr_buf(out, "\n");
+ pr_buf(out, " uncompressed size: ");
+ bch2_hprint(out, compressed_sectors_uncompressed << 9);
+ pr_buf(out, "\n");
+
+ pr_buf(out, "incompressible:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents);
+ pr_buf(out, " size: ");
+ bch2_hprint(out, incompressible_sectors << 9);
+ pr_buf(out, "\n");
return 0;
}
@@ -483,9 +511,6 @@ STORE(bch2_fs)
/* Debugging: */
- if (attr == &sysfs_trigger_journal_flush)
- bch2_journal_meta(&c->journal);
-
if (attr == &sysfs_trigger_gc) {
/*
* Full gc is currently incompatible with btree key cache:
@@ -575,7 +600,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_io_timers_read,
&sysfs_io_timers_write,
- &sysfs_trigger_journal_flush,
&sysfs_trigger_gc,
&sysfs_prune_cache,
@@ -729,7 +753,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
memset(nr, 0, sizeof(nr));
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].type]++;
+ nr[c->open_buckets[i].data_type]++;
pr_buf(out,
"\t\t buckets\t sectors fragmented\n"
@@ -826,14 +850,6 @@ SHOW(bch2_dev)
return out.pos - buf;
}
- if (attr == &sysfs_cache_replacement_policy) {
- bch2_string_opt_to_text(&out,
- bch2_cache_replacement_policies,
- ca->mi.replacement);
- pr_buf(&out, "\n");
- return out.pos - buf;
- }
-
if (attr == &sysfs_state_rw) {
bch2_string_opt_to_text(&out, bch2_member_states,
ca->mi.state);
@@ -893,22 +909,6 @@ STORE(bch2_dev)
mutex_unlock(&c->sb_lock);
}
- if (attr == &sysfs_cache_replacement_policy) {
- ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
-
- if (v < 0)
- return v;
-
- mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
-
- if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
- SET_BCH_MEMBER_REPLACEMENT(mi, v);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
- }
-
if (attr == &sysfs_label) {
char *tmp;
int ret;
@@ -939,7 +939,6 @@ struct attribute *bch2_dev_files[] = {
/* settings: */
&sysfs_discard,
- &sysfs_cache_replacement_policy,
&sysfs_state_rw,
&sysfs_label,
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 478c00a5..60ccb94e 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -618,7 +618,6 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
static int __do_delete(struct btree_trans *trans, struct bpos pos)
{
struct btree_iter iter;
- struct bkey_i delete;
struct bkey_s_c k;
int ret = 0;
@@ -632,10 +631,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
if (!k.k)
goto err;
- bkey_init(&delete.k);
- delete.k.p = k.k->p;
-
- ret = bch2_trans_update(trans, &iter, &delete, 0);
+ ret = bch2_btree_delete_at(trans, &iter, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -650,7 +646,7 @@ static int rand_delete(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < nr; i++) {
- struct bpos pos = POS(0, test_rand());
+ struct bpos pos = SPOS(0, test_rand(), U32_MAX);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
__do_delete(&trans, pos));