-rw-r--r--  .bcachefs_revision                 2
-rw-r--r--  libbcachefs/alloc_background.c   115
-rw-r--r--  libbcachefs/alloc_background.h     1
-rw-r--r--  libbcachefs/alloc_foreground.c    40
-rw-r--r--  libbcachefs/bkey.h                 5
-rw-r--r--  libbcachefs/btree_gc.c           104
-rw-r--r--  libbcachefs/btree_gc.h             2
-rw-r--r--  libbcachefs/btree_io.c            17
-rw-r--r--  libbcachefs/btree_io.h             1
-rw-r--r--  libbcachefs/btree_update_leaf.c   26
-rw-r--r--  libbcachefs/buckets.c            255
-rw-r--r--  libbcachefs/buckets.h             26
-rw-r--r--  libbcachefs/buckets_types.h       13
-rw-r--r--  libbcachefs/chardev.c              6
-rw-r--r--  libbcachefs/ec.c                 221
-rw-r--r--  libbcachefs/ec.h                   6
-rw-r--r--  libbcachefs/ec_types.h             3
-rw-r--r--  libbcachefs/extent_update.c       15
-rw-r--r--  libbcachefs/fs-io.c               30
-rw-r--r--  libbcachefs/io.c                   3
-rw-r--r--  libbcachefs/journal.c             21
-rw-r--r--  libbcachefs/journal.h              5
-rw-r--r--  libbcachefs/journal_io.c         107
-rw-r--r--  libbcachefs/journal_types.h        1
-rw-r--r--  libbcachefs/move.c                 2
-rw-r--r--  libbcachefs/movinggc.c             2
-rw-r--r--  libbcachefs/recovery.c            18
-rw-r--r--  libbcachefs/replicas.c            16
-rw-r--r--  libbcachefs/super.c               28
-rw-r--r--  libbcachefs/sysfs.c               91
30 files changed, 663 insertions(+), 519 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 79c81a65..ee5b7e5c 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-7d57e9b703cf8bda52c3894b5a18e74329914823
+313b24b652d521c6ba4a965f7033c73575923a91
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 60c2c38b..896ec023 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -54,10 +54,10 @@ static void pd_controllers_update(struct work_struct *work)
* reclaimed by copy GC
*/
fragmented += max_t(s64, 0, (bucket_to_sector(ca,
- stats.buckets[BCH_DATA_user] +
- stats.buckets[BCH_DATA_cached]) -
- (stats.sectors[BCH_DATA_user] +
- stats.sectors[BCH_DATA_cached])) << 9);
+ stats.d[BCH_DATA_user].buckets +
+ stats.d[BCH_DATA_cached].buckets) -
+ (stats.d[BCH_DATA_user].sectors +
+ stats.d[BCH_DATA_cached].sectors)) << 9);
}
bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
@@ -217,7 +217,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = __bucket(ca, k.k->p.offset, 0);
+ g = bucket(ca, k.k->p.offset);
u = bch2_alloc_unpack(k);
g->_mark.gen = u.gen;
@@ -278,7 +278,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bch_dev *ca;
- struct bucket_array *ba;
struct bucket *g;
struct bucket_mark m;
struct bkey_alloc_unpacked old_u, new_u;
@@ -302,9 +301,7 @@ retry:
percpu_down_read(&c->mark_lock);
ca = bch_dev_bkey_exists(c, iter->pos.inode);
- ba = bucket_array(ca);
-
- g = &ba->b[iter->pos.offset];
+ g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
new_u = alloc_mem_to_key(g, m);
percpu_up_read(&c->mark_lock);
@@ -326,54 +323,36 @@ err:
return ret;
}
-int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags)
+int bch2_alloc_write(struct bch_fs *c, unsigned flags)
{
struct btree_trans trans;
struct btree_iter *iter;
- u64 first_bucket, nbuckets;
+ struct bch_dev *ca;
+ unsigned i;
int ret = 0;
- percpu_down_read(&c->mark_lock);
- first_bucket = bucket_array(ca)->first_bucket;
- nbuckets = bucket_array(ca)->nbuckets;
- percpu_up_read(&c->mark_lock);
-
- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
- POS(ca->dev_idx, first_bucket),
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- while (iter->pos.offset < nbuckets) {
- bch2_trans_cond_resched(&trans);
-
- ret = bch2_alloc_write_key(&trans, iter, flags);
- if (ret)
- break;
- bch2_btree_iter_next_slot(iter);
- }
-
- bch2_trans_exit(&trans);
-
- return ret;
-}
+ for_each_member_device(ca, c, i) {
+ bch2_btree_iter_set_pos(iter,
+ POS(ca->dev_idx, ca->mi.first_bucket));
-int bch2_alloc_write(struct bch_fs *c, unsigned flags)
-{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ while (iter->pos.offset < ca->mi.nbuckets) {
+ bch2_trans_cond_resched(&trans);
- for_each_member_device(ca, c, i) {
- bch2_dev_alloc_write(c, ca, flags);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- break;
+ ret = bch2_alloc_write_key(&trans, iter, flags);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ goto err;
+ }
+ bch2_btree_iter_next_slot(iter);
}
}
-
+err:
+ bch2_trans_exit(&trans);
return ret;
}
@@ -552,7 +531,8 @@ out:
static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
unsigned long gc_count = c->gc_count;
- u64 available;
+ s64 available;
+ unsigned i;
int ret = 0;
ca->allocator_state = ALLOCATOR_BLOCKED;
@@ -568,8 +548,15 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
if (gc_count != c->gc_count)
ca->inc_gen_really_needs_gc = 0;
- available = max_t(s64, 0, dev_buckets_available(ca) -
- ca->inc_gen_really_needs_gc);
+ available = dev_buckets_available(ca);
+ available -= ca->inc_gen_really_needs_gc;
+
+ spin_lock(&c->freelist_lock);
+ for (i = 0; i < RESERVE_NR; i++)
+ available -= fifo_used(&ca->free[i]);
+ spin_unlock(&c->freelist_lock);
+
+ available = max(available, 0LL);
if (available > fifo_free(&ca->free_inc) ||
(available &&
@@ -598,6 +585,9 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
if (!is_available_bucket(mark))
return false;
+ if (mark.owned_by_allocator)
+ return false;
+
if (ca->buckets_nouse &&
test_bit(bucket, ca->buckets_nouse))
return false;
@@ -894,34 +884,33 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
-
- verify_not_on_freelist(c, ca, b);
-
- BUG_ON(!fifo_push(&ca->free_inc, b));
-
g = bucket(ca, b);
m = READ_ONCE(g->mark);
- invalidating_cached_data = m.cached_sectors != 0;
+ BUG_ON(m.dirty_sectors);
+
+ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+
+ spin_lock(&c->freelist_lock);
+ verify_not_on_freelist(c, ca, b);
+ BUG_ON(!fifo_push(&ca->free_inc, b));
+ spin_unlock(&c->freelist_lock);
/*
* If we're not invalidating cached data, we only increment the bucket
* gen in memory here, the incremented gen will be updated in the btree
* by bch2_trans_mark_pointer():
*/
+ if (!m.cached_sectors &&
+ !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
+ BUG_ON(m.data_type);
+ bucket_cmpxchg(g, m, m.gen++);
+ percpu_up_read(&c->mark_lock);
+ goto out;
+ }
- if (!invalidating_cached_data)
- bch2_invalidate_bucket(c, ca, b, &m);
- else
- bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
-
- spin_unlock(&c->freelist_lock);
percpu_up_read(&c->mark_lock);
- if (!invalidating_cached_data)
- goto out;
-
/*
* If the read-only path is trying to shut down, we can't be generating
* new btree updates:
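
A minimal sketch of the accounting wait_buckets_available() now does, written as a hypothetical standalone helper (names as in the hunks above): buckets already pushed onto the ca->free[] lists are in flight to the allocator, so they are subtracted before clamping to zero. For example, 100 buckets available on disk, 10 pending gen increments and 30 buckets already on freelists leaves 60 truly available.

    static s64 truly_available_buckets(struct bch_fs *c, struct bch_dev *ca)
    {
            s64 available = dev_buckets_available(ca) -
                    ca->inc_gen_really_needs_gc;
            unsigned i;

            spin_lock(&c->freelist_lock);
            for (i = 0; i < RESERVE_NR; i++)
                    available -= fifo_used(&ca->free[i]);
            spin_unlock(&c->freelist_lock);

            return max(available, 0LL);
    }
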
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index d10ff56e..f60fcebf 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -98,7 +98,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned);
int bch2_alloc_write(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index dcbe0404..8f0b94f5 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
rcu_read_lock();
buckets = bucket_array(ca);
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
- if (is_available_bucket(buckets->b[b].mark))
+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
+ if (is_available_bucket(buckets->b[b].mark) &&
+ !buckets->b[b].mark.owned_by_allocator)
goto success;
b = -1;
success:
@@ -224,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bool may_alloc_partial,
struct closure *cl)
{
- struct bucket_array *buckets;
struct open_bucket *ob;
- long bucket = 0;
+ long b = 0;
spin_lock(&c->freelist_lock);
@@ -260,13 +260,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
return ERR_PTR(-OPEN_BUCKETS_EMPTY);
}
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
goto out;
switch (reserve) {
case RESERVE_BTREE_MOVINGGC:
case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
goto out;
break;
default:
@@ -284,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
trace_bucket_alloc_fail(ca, reserve);
return ERR_PTR(-FREELIST_EMPTY);
out:
- verify_not_on_freelist(c, ca, bucket);
+ verify_not_on_freelist(c, ca, b);
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
- buckets = bucket_array(ca);
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->ptr = (struct bch_extent_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = buckets->b[bucket].mark.gen,
- .offset = bucket_to_sector(ca, bucket),
+ .gen = bucket(ca, b)->mark.gen,
+ .offset = bucket_to_sector(ca, b),
.dev = ca->dev_idx,
};
@@ -489,16 +488,20 @@ bucket_alloc_from_stripe(struct bch_fs *c,
devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
for (i = 0; i < devs_sorted.nr; i++)
- open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
+ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+ if (!h->s->blocks[ec_idx])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[ec_idx];
if (ob->ptr.dev == devs_sorted.devs[i] &&
- !test_and_set_bit(h->s->data_block_idx[ec_idx],
- h->s->blocks_allocated))
+ !test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
+ }
goto out_put_head;
got_bucket:
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- ob->ec_idx = h->s->data_block_idx[ec_idx];
+ ob->ec_idx = ec_idx;
ob->ec = h->s;
add_new_bucket(c, ptrs, devs_may_alloc,
@@ -636,10 +639,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
- open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
- drop |= ob2->ptr.dev == ca->dev_idx;
- open_bucket_for_each(c, &ob->ec->parity, ob2, j)
+ for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
+ if (!ob->ec->blocks[j])
+ continue;
+
+ ob2 = c->open_buckets + ob->ec->blocks[j];
drop |= ob2->ptr.dev == ca->dev_idx;
+ }
mutex_unlock(&ob->ec->lock);
}
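
Stripe code now references its buckets as open_bucket_idx_t indices into c->open_buckets (see the ec.h hunk below), with index 0 meaning "no bucket in this slot". A minimal sketch of the iteration pattern both hunks above use, under that assumption:

    unsigned i;
    struct open_bucket *ob;

    for (i = 0; i < s->new_stripe.key.v.nr_blocks; i++) {
            if (!s->blocks[i])      /* 0 == slot not allocated yet */
                    continue;

            ob = c->open_buckets + s->blocks[i];
            /* ... inspect ob->ptr.dev, ob->ptr.gen, ... */
    }
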
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 2d2c6403..2c3b73a6 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -170,6 +170,11 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r)
return bkey_cmp(l, r) < 0 ? l : r;
}
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{
+ return bkey_cmp(l, r) > 0 ? l : r;
+}
+
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
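
bpos_max() is the mirror of the existing bpos_min(); the ec.c hunk further down uses it to clamp the stripe-index allocation hint so the search never starts at offset 0:

    struct bpos min_pos   = POS(0, 1);
    struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
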
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index d0635a08..efeaec3d 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -205,13 +205,12 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
- bool initial, bool metadata_only)
+ bool initial)
{
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
- unsigned depth = metadata_only ? 1
- : bch2_expensive_debug_checks ? 0
+ unsigned depth = bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
@@ -326,13 +325,11 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
static int bch2_gc_btree_init(struct bch_fs *c,
struct journal_keys *journal_keys,
- enum btree_id btree_id,
- bool metadata_only)
+ enum btree_id btree_id)
{
struct btree *b;
- unsigned target_depth = metadata_only ? 1
- : bch2_expensive_debug_checks ? 0
- : !btree_node_type_needs_gc(btree_id) ? 1
+ unsigned target_depth = bch2_expensive_debug_checks ? 0
+ : !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
int ret = 0;
@@ -377,7 +374,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
}
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
- bool initial, bool metadata_only)
+ bool initial)
{
enum btree_id ids[BTREE_ID_NR];
unsigned i;
@@ -390,8 +387,8 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
enum btree_id id = ids[i];
int ret = initial
? bch2_gc_btree_init(c, journal_keys,
- id, metadata_only)
- : bch2_gc_btree(c, id, initial, metadata_only);
+ id)
+ : bch2_gc_btree(c, id, initial);
if (ret)
return ret;
}
@@ -558,12 +555,11 @@ static void bch2_gc_free(struct bch_fs *c)
}
static int bch2_gc_done(struct bch_fs *c,
- bool initial, bool metadata_only)
+ bool initial)
{
struct bch_dev *ca;
- bool verify = !metadata_only &&
- (!initial ||
- (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
+ bool verify = (!initial ||
+ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
unsigned i;
int ret = 0;
@@ -580,10 +576,9 @@ static int bch2_gc_done(struct bch_fs *c,
if (verify) \
fsck_err(c, "stripe %zu has wrong "_msg \
": got %u, should be %u", \
- dst_iter.pos, ##__VA_ARGS__, \
+ iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
- dst->dirty = true; \
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_bucket_field(_f) \
@@ -602,29 +597,32 @@ static int bch2_gc_done(struct bch_fs *c,
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
- if (!metadata_only) {
- struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
- struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
+ {
+ struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
- while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
- (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
- BUG_ON(src_iter.pos != dst_iter.pos);
-
- copy_stripe_field(alive, "alive");
- copy_stripe_field(sectors, "sectors");
- copy_stripe_field(algorithm, "algorithm");
- copy_stripe_field(nr_blocks, "nr_blocks");
- copy_stripe_field(nr_redundant, "nr_redundant");
- copy_stripe_field(blocks_nonempty,
- "blocks_nonempty");
+ while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
+ dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
+
+ if (dst->alive != src->alive ||
+ dst->sectors != src->sectors ||
+ dst->algorithm != src->algorithm ||
+ dst->nr_blocks != src->nr_blocks ||
+ dst->nr_redundant != src->nr_redundant) {
+ bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused");
+ ret = -EINVAL;
+ goto fsck_err;
+ }
for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
copy_stripe_field(block_sectors[i],
"block_sectors[%u]", i);
- genradix_iter_advance(&dst_iter, &c->stripes[0]);
- genradix_iter_advance(&src_iter, &c->stripes[1]);
+ dst->blocks_nonempty = 0;
+ for (i = 0; i < dst->nr_blocks; i++)
+ dst->blocks_nonempty += dst->block_sectors[i] != 0;
+
+ genradix_iter_advance(&iter, &c->stripes[1]);
}
}
@@ -658,28 +656,20 @@ static int bch2_gc_done(struct bch_fs *c,
copy_fs_field(hidden, "hidden");
copy_fs_field(btree, "btree");
+ copy_fs_field(data, "data");
+ copy_fs_field(cached, "cached");
+ copy_fs_field(reserved, "reserved");
+ copy_fs_field(nr_inodes,"nr_inodes");
- if (!metadata_only) {
- copy_fs_field(data, "data");
- copy_fs_field(cached, "cached");
- copy_fs_field(reserved, "reserved");
- copy_fs_field(nr_inodes,"nr_inodes");
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(persistent_reserved[i],
- "persistent_reserved[%i]", i);
- }
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ copy_fs_field(persistent_reserved[i],
+ "persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
char buf[80];
- if (metadata_only &&
- (e->data_type == BCH_DATA_user ||
- e->data_type == BCH_DATA_cached))
- continue;
-
bch2_replicas_entry_to_text(&PBUF(buf), e);
copy_fs_field(replicas[i], "%s", buf);
@@ -695,8 +685,7 @@ fsck_err:
return ret;
}
-static int bch2_gc_start(struct bch_fs *c,
- bool metadata_only)
+static int bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
@@ -760,13 +749,6 @@ static int bch2_gc_start(struct bch_fs *c,
d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
d->gen_valid = s->gen_valid;
-
- if (metadata_only &&
- (s->mark.data_type == BCH_DATA_user ||
- s->mark.data_type == BCH_DATA_cached)) {
- d->_mark = s->mark;
- d->_mark.owned_by_allocator = 0;
- }
}
};
@@ -794,7 +776,7 @@ static int bch2_gc_start(struct bch_fs *c,
* uses, GC could skip past them
*/
int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
- bool initial, bool metadata_only)
+ bool initial)
{
struct bch_dev *ca;
u64 start_time = local_clock();
@@ -810,13 +792,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
again:
- ret = bch2_gc_start(c, metadata_only);
+ ret = bch2_gc_start(c);
if (ret)
goto out;
bch2_mark_superblocks(c);
- ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
+ ret = bch2_gc_btrees(c, journal_keys, initial);
if (ret)
goto out;
@@ -855,7 +837,7 @@ out:
bch2_journal_block(&c->journal);
percpu_down_write(&c->mark_lock);
- ret = bch2_gc_done(c, initial, metadata_only);
+ ret = bch2_gc_done(c, initial);
bch2_journal_unblock(&c->journal);
} else {
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 3694a3df..f0435a58 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -7,7 +7,7 @@
void bch2_coalesce(struct bch_fs *);
struct journal_keys;
-int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool);
+int bch2_gc(struct bch_fs *, struct journal_keys *, bool);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index b94f0807..65f7e366 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1828,23 +1828,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c)
__bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
-void bch2_btree_verify_flushed(struct bch_fs *c)
-{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos) {
- unsigned long flags = READ_ONCE(b->flags);
-
- BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
- (flags & (1 << BTREE_NODE_write_in_flight)));
- }
- rcu_read_unlock();
-}
-
void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bucket_table *tbl;
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 1a4b11e9..3b61555e 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -185,7 +185,6 @@ do { \
void bch2_btree_flush_all_reads(struct bch_fs *);
void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_btree_verify_flushed(struct bch_fs *);
void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index c490df47..967e1e4d 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -836,7 +836,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
int ret = 0;
if (!trans->nr_updates)
- goto out_noupdates;
+ goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&trans->c->gc_lock);
@@ -850,7 +850,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
unlikely(!percpu_ref_tryget(&trans->c->writes))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
- return ret;
+ goto out_reset;
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -962,7 +962,7 @@ out:
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
-out_noupdates:
+out_reset:
bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
return ret;
@@ -981,10 +981,22 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
.trigger_flags = flags, .iter = iter, .k = k
};
- EBUG_ON(bkey_cmp(iter->pos,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bkey_start_pos(&k->k)
- : k->k.p));
+#ifdef CONFIG_BCACHEFS_DEBUG
+ BUG_ON(bkey_cmp(iter->pos,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_start_pos(&k->k)
+ : k->k.p));
+
+ trans_for_each_update(trans, i) {
+ BUG_ON(bkey_cmp(i->iter->pos,
+ (i->iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_start_pos(&i->k->k)
+ : i->k->k.p));
+
+ BUG_ON(i != trans->updates &&
+ btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0);
+ }
+#endif
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index ed07dfee..cb0f0e09 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -376,15 +376,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m)
return !is_available_bucket(m);
}
-static inline int is_fragmented_bucket(struct bucket_mark m,
- struct bch_dev *ca)
-{
- if (!m.owned_by_allocator &&
- m.data_type == BCH_DATA_user &&
- bucket_sectors_used(m))
- return max_t(int, 0, (int) ca->mi.bucket_size -
- bucket_sectors_used(m));
- return 0;
+static inline int bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bucket_mark m)
+{
+ return bucket_sectors_used(m)
+ ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+ : 0;
}
static inline int is_stripe_data_bucket(struct bucket_mark m)
@@ -392,11 +389,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m)
return m.stripe && m.data_type != BCH_DATA_parity;
}
-static inline int bucket_stripe_sectors(struct bucket_mark m)
-{
- return is_stripe_data_bucket(m) ? m.dirty_sectors : 0;
-}
-
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
@@ -456,7 +448,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
if (type == BCH_DATA_sb || type == BCH_DATA_journal)
fs_usage->hidden += size;
- dev_usage->buckets[type] += nr;
+ dev_usage->d[type].buckets += nr;
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
@@ -481,19 +473,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
+ u->buckets_ec += (int) new.stripe - (int) old.stripe;
u->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
- u->buckets_ec += (int) new.stripe - (int) old.stripe;
- u->sectors_ec += bucket_stripe_sectors(new) -
- bucket_stripe_sectors(old);
-
- u->sectors[old.data_type] -= old.dirty_sectors;
- u->sectors[new.data_type] += new.dirty_sectors;
- u->sectors[BCH_DATA_cached] +=
+ u->d[old.data_type].sectors -= old.dirty_sectors;
+ u->d[new.data_type].sectors += new.dirty_sectors;
+ u->d[BCH_DATA_cached].sectors +=
(int) new.cached_sectors - (int) old.cached_sectors;
- u->sectors_fragmented +=
- is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
+
+ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
+ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
@@ -650,46 +641,6 @@ unwind:
ret; \
})
-static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *ret,
- bool gc)
-{
- struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
- struct bucket *g = __bucket(ca, b, gc);
- struct bucket_mark old, new;
-
- old = bucket_cmpxchg(g, new, ({
- BUG_ON(!is_available_bucket(new));
-
- new.owned_by_allocator = true;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- new.gen++;
- }));
-
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
-
- if (old.cached_sectors)
- update_cached_sectors(c, fs_usage, ca->dev_idx,
- -((s64) old.cached_sectors));
-
- if (!gc)
- *ret = old;
- return 0;
-}
-
-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *old)
-{
- do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
- ca, b, old);
-
- if (!old->owned_by_allocator && old->cached_sectors)
- trace_invalidate(ca, bucket_to_sector(ca, b),
- old->cached_sectors);
-}
-
static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
@@ -1269,9 +1220,15 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
- m->block_sectors[i] =
- stripe_blockcount_get(new_s, i);
- m->blocks_nonempty += !!m->block_sectors[i];
+ unsigned s = stripe_blockcount_get(new_s, i);
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ if (!gc)
+ m->block_sectors[i] = s;
+ m->blocks_nonempty += !!s;
}
if (gc && old_s)
@@ -2100,6 +2057,168 @@ int bch2_trans_mark_update(struct btree_trans *trans,
return ret;
}
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter;
+ struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc *a;
+ struct bch_extent_ptr ptr = {
+ .dev = ca->dev_idx,
+ .offset = bucket_to_sector(ca, b),
+ };
+ int ret = 0;
+
+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+ if (ret)
+ return ret;
+
+ if (u.data_type && u.data_type != type) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ bch2_data_types[u.data_type],
+ bch2_data_types[type],
+ bch2_data_types[type]);
+ ret = -EIO;
+ goto out;
+ }
+
+ if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
+ "while marking %s",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ bch2_data_types[u.data_type ?: type],
+ u.dirty_sectors, sectors, ca->mi.bucket_size,
+ bch2_data_types[type]);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (u.data_type == type &&
+ u.dirty_sectors == sectors)
+ goto out;
+
+ u.data_type = type;
+ u.dirty_sectors = sectors;
+
+ bkey_alloc_init(&a->k_i);
+ a->k.p = iter->pos;
+ bch2_alloc_pack(a, u);
+ bch2_trans_update(trans, iter, &a->k_i, 0);
+out:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ return __bch2_trans_do(trans, res, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b,
+ type, sectors));
+}
+
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca,
+ u64 start, u64 end,
+ enum bch_data_type type,
+ u64 *bucket, unsigned *bucket_sectors)
+{
+ int ret = 0;
+
+ do {
+ u64 b = sector_to_bucket(ca, start);
+ unsigned sectors =
+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+ if (b != *bucket) {
+ if (*bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ *bucket, type, *bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ *bucket = b;
+ *bucket_sectors = 0;
+ }
+
+ *bucket_sectors += sectors;
+ start += sectors;
+ } while (!ret && start < end);
+
+ return 0;
+}
+
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 bucket = 0;
+ unsigned i, bucket_sectors = 0;
+ int ret;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR) {
+ ret = bch2_trans_mark_metadata_sectors(trans, res, ca,
+ 0, BCH_SB_SECTOR,
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ if (bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ bucket, BCH_DATA_sb, bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < ca->journal.nr; i++) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ ca->journal.buckets[i],
+ BCH_DATA_journal, ca->mi.bucket_size);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_trans_mark_dev_sb(struct bch_fs *c,
+ struct disk_reservation *res,
+ struct bch_dev *ca)
+{
+ return bch2_trans_do(c, res, NULL, 0,
+ __bch2_trans_mark_dev_sb(&trans, res, ca));
+}
+
/* Disk reservations: */
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
@@ -2115,7 +2234,7 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- unsigned sectors, int flags)
+ u64 sectors, int flags)
{
struct bch_fs_pcpu *pcpu;
u64 old, v, get;
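
bch2_trans_mark_metadata_sectors() above batches an arbitrary sector range into one alloc-key update per bucket it crosses. A worked example with hypothetical geometry (256-sector buckets, marking superblock sectors [8, 520)):

    unsigned long long start = 8, end = 520, bucket_size = 256;

    while (start < end) {
            unsigned long long b    = start / bucket_size;    /* sector_to_bucket() */
            unsigned long long next = (b + 1) * bucket_size;  /* bucket_to_sector(b + 1) */
            unsigned sectors = (next < end ? next : end) - start;

            /* visits b = 0 (248 sectors), b = 1 (256), b = 2 (8) */
            start += sectors;
    }
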
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 3a5ed1fc..37346240 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -153,18 +153,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark)
return mark.dirty_sectors + mark.cached_sectors;
}
-static inline bool bucket_unused(struct bucket_mark mark)
-{
- return !mark.owned_by_allocator &&
- !mark.data_type &&
- !bucket_sectors_used(mark);
-}
-
static inline bool is_available_bucket(struct bucket_mark mark)
{
- return (!mark.owned_by_allocator &&
- !mark.dirty_sectors &&
- !mark.stripe);
+ return !mark.dirty_sectors && !mark.stripe;
}
static inline bool bucket_needs_journal_commit(struct bucket_mark m,
@@ -245,8 +236,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
- size_t, struct bucket_mark *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
size_t, bool, struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
@@ -270,6 +259,12 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
struct bkey_i *insert, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
+int bch2_trans_mark_metadata_bucket(struct btree_trans *,
+ struct disk_reservation *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *,
+ struct bch_dev *);
+
/* disk reservations: */
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
@@ -284,8 +279,8 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
int bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- unsigned, int);
+ struct disk_reservation *,
+ u64, int);
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
@@ -302,8 +297,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
static inline int bch2_disk_reservation_get(struct bch_fs *c,
struct disk_reservation *res,
- unsigned sectors,
- unsigned nr_replicas,
+ u64 sectors, unsigned nr_replicas,
int flags)
{
*res = bch2_disk_reservation_init(c, nr_replicas);
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index d6057d22..5fbe940a 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -52,16 +52,15 @@ struct bucket_array {
};
struct bch_dev_usage {
- u64 buckets[BCH_DATA_NR];
u64 buckets_alloc;
+ u64 buckets_ec;
u64 buckets_unavailable;
- /* _compressed_ sectors: */
- u64 sectors[BCH_DATA_NR];
- u64 sectors_fragmented;
-
- u64 buckets_ec;
- u64 sectors_ec;
+ struct {
+ u64 buckets;
+ u64 sectors; /* _compressed_ sectors: */
+ u64 fragmented;
+ } d[BCH_DATA_NR];
};
struct bch_fs_usage {
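
The flat buckets[]/sectors[] arrays are folded into one per-data-type struct; this is what the alloc_background.c, movinggc.c and sysfs.c hunks index as stats.d[type]. A minimal sketch of reading fragmentation with the new layout (bch2_dev_usage_read() assumed as the accessor):

    struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
    u64 fragmented = stats.d[BCH_DATA_user].fragmented +
                     stats.d[BCH_DATA_cached].fragmented;
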
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index e7c8969a..49842ec8 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -477,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
arg.available_buckets = arg.nr_buckets - src.buckets_unavailable;
arg.ec_buckets = src.buckets_ec;
- arg.ec_sectors = src.sectors_ec;
+ arg.ec_sectors = 0;
for (i = 0; i < BCH_DATA_NR; i++) {
- arg.buckets[i] = src.buckets[i];
- arg.sectors[i] = src.sectors[i];
+ arg.buckets[i] = src.d[i].buckets;
+ arg.sectors[i] = src.d[i].sectors;
}
percpu_ref_put(&ca->ref);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 75f39e99..9c7cc788 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -684,13 +684,14 @@ static void ec_stripe_delete_work(struct work_struct *work)
/* stripe creation: */
static int ec_stripe_bkey_insert(struct bch_fs *c,
- struct ec_stripe_new *s,
- struct bkey_i_stripe *stripe)
+ struct bkey_i_stripe *stripe,
+ struct disk_reservation *res)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bpos start_pos = POS(0, c->ec_stripe_hint);
+ struct bpos min_pos = POS(0, 1);
+ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
bch2_trans_init(&trans, c, 0, 0);
@@ -701,7 +702,7 @@ retry:
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
if (start_pos.offset) {
- start_pos = POS_MIN;
+ start_pos = min_pos;
bch2_btree_iter_set_pos(iter, start_pos);
continue;
}
@@ -726,7 +727,7 @@ found_slot:
bch2_trans_update(&trans, iter, &stripe->k_i, 0);
- ret = bch2_trans_commit(&trans, &s->res, NULL,
+ ret = bch2_trans_commit(&trans, res, NULL,
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_put(&trans, iter);
@@ -740,6 +741,47 @@ err:
return ret;
}
+static int ec_stripe_bkey_update(struct btree_trans *trans,
+ struct bkey_i_stripe *new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ const struct bch_stripe *existing;
+ unsigned i;
+ int ret;
+
+ iter = bch2_trans_get_iter(trans, BTREE_ID_EC,
+ new->k.p, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || k.k->type != KEY_TYPE_stripe) {
+ bch_err(c, "error updating stripe: not found");
+ ret = -ENOENT;
+ goto err;
+ }
+
+ existing = bkey_s_c_to_stripe(k).v;
+
+ if (existing->nr_blocks != new->v.nr_blocks) {
+ bch_err(c, "error updating stripe: nr_blocks does not match");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i,
+ stripe_blockcount_get(existing, i));
+
+ bch2_trans_update(trans, iter, &new->k_i, 0);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
static void extent_stripe_ptr_add(struct bkey_s_extent e,
struct ec_stripe_buf *s,
struct bch_extent_ptr *ptr,
@@ -866,9 +908,6 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (!percpu_ref_tryget(&c->writes))
goto err;
- BUG_ON(bitmap_weight(s->blocks_allocated,
- s->blocks.nr) != s->blocks.nr);
-
ec_generate_ec(&s->new_stripe);
ec_generate_checksums(&s->new_stripe);
@@ -884,9 +923,9 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
ret = s->have_existing_stripe
- ? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i,
- &s->res, NULL, BTREE_INSERT_NOFAIL)
- : ec_stripe_bkey_insert(c, s, &s->new_stripe.key);
+ ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+ ec_stripe_bkey_update(&trans, &s->new_stripe.key))
+ : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res);
if (ret) {
bch_err(c, "error creating stripe: error creating stripe key");
goto err_put_writes;
@@ -902,11 +941,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
spin_lock(&c->ec_stripes_heap_lock);
m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
-#if 0
- pr_info("created a %s stripe %llu",
- s->have_existing_stripe ? "existing" : "new",
- s->stripe.key.k.p.offset);
-#endif
+
BUG_ON(m->on_heap);
bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
spin_unlock(&c->ec_stripes_heap_lock);
@@ -915,12 +950,17 @@ err_put_writes:
err:
bch2_disk_reservation_put(c, &s->res);
- open_bucket_for_each(c, &s->blocks, ob, i) {
- ob->ec = NULL;
- __bch2_open_bucket_put(c, ob);
- }
-
- bch2_open_buckets_put(c, &s->parity);
+ for (i = 0; i < v->nr_blocks; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (i < nr_data) {
+ ob->ec = NULL;
+ __bch2_open_bucket_put(c, ob);
+ } else {
+ bch2_open_bucket_put(c, ob);
+ }
+ }
bch2_keylist_free(&s->keys, s->inline_keys);
@@ -1179,7 +1219,7 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
if (h->s &&
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
- h->s->blocks.nr) == h->s->blocks.nr)
+ h->s->nr_data) == h->s->nr_data)
ec_stripe_set_pending(c, h);
mutex_unlock(&h->lock);
@@ -1216,64 +1256,82 @@ static enum bucket_alloc_ret
new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
struct closure *cl)
{
- struct bch_devs_mask devs;
+ struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
- unsigned i, nr_have, nr_data =
- min_t(unsigned, h->nr_active_devs,
- BCH_BKEY_PTRS_MAX) - h->redundancy;
+ struct open_buckets buckets;
+ unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
enum bucket_alloc_ret ret = ALLOC_SUCCESS;
- devs = h->devs;
-
- for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) {
- __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
- --nr_data;
+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+ if (test_bit(i, h->s->blocks_gotten)) {
+ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
+ if (i < h->s->nr_data)
+ nr_have_data++;
+ else
+ nr_have_parity++;
+ }
}
- BUG_ON(h->s->blocks.nr > nr_data);
- BUG_ON(h->s->parity.nr > h->redundancy);
-
- open_bucket_for_each(c, &h->s->parity, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
- open_bucket_for_each(c, &h->s->blocks, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
+ BUG_ON(nr_have_data > h->s->nr_data);
+ BUG_ON(nr_have_parity > h->s->nr_parity);
percpu_down_read(&c->mark_lock);
rcu_read_lock();
- if (h->s->parity.nr < h->redundancy) {
- nr_have = h->s->parity.nr;
-
- ret = bch2_bucket_alloc_set(c, &h->s->parity,
+ buckets.nr = 0;
+ if (nr_have_parity < h->s->nr_parity) {
+ ret = bch2_bucket_alloc_set(c, &buckets,
&h->parity_stripe,
&devs,
- h->redundancy,
- &nr_have,
+ h->s->nr_parity,
+ &nr_have_parity,
&have_cache,
h->copygc
? RESERVE_MOVINGGC
: RESERVE_NONE,
0,
cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data + h->s->nr_parity,
+ h->s->nr_data);
+ BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+
+ h->s->blocks[j] = buckets.v[i];
+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
if (ret)
goto err;
}
- if (h->s->blocks.nr < nr_data) {
- nr_have = h->s->blocks.nr;
-
- ret = bch2_bucket_alloc_set(c, &h->s->blocks,
+ buckets.nr = 0;
+ if (nr_have_data < h->s->nr_data) {
+ ret = bch2_bucket_alloc_set(c, &buckets,
&h->block_stripe,
&devs,
- nr_data,
- &nr_have,
+ h->s->nr_data,
+ &nr_have_data,
&have_cache,
h->copygc
? RESERVE_MOVINGGC
: RESERVE_NONE,
0,
cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data, 0);
+ BUG_ON(j >= h->s->nr_data);
+
+ h->s->blocks[j] = buckets.v[i];
+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
if (ret)
goto err;
}
@@ -1325,8 +1383,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
struct closure *cl)
{
struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i, data_idx = 0;
+ unsigned i;
s64 idx;
int ret;
@@ -1361,9 +1418,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
BUG();
}
+ BUG_ON(h->s->existing_stripe.size != h->blocksize);
+ BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors);
+
for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) {
- if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i))
+ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) {
+ __set_bit(i, h->s->blocks_gotten);
__set_bit(i, h->s->blocks_allocated);
+ }
ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
}
@@ -1401,20 +1463,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
goto out;
}
- open_bucket_for_each(c, &h->s->blocks, ob, i) {
- data_idx = find_next_zero_bit(h->s->blocks_allocated,
- h->s->nr_data, data_idx);
- BUG_ON(data_idx >= h->s->nr_data);
-
- h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr;
- h->s->data_block_idx[i] = data_idx;
- data_idx++;
- }
-
- open_bucket_for_each(c, &h->s->parity, ob, i)
- h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;
-
- //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
h->s->allocated = true;
}
out:
@@ -1434,12 +1482,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
if (!h->s)
goto unlock;
- open_bucket_for_each(c, &h->s->blocks, ob, i)
- if (ob->ptr.dev == ca->dev_idx)
- goto found;
- open_bucket_for_each(c, &h->s->parity, ob, i)
+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+ if (!h->s->blocks[i])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[i];
if (ob->ptr.dev == ca->dev_idx)
goto found;
+ }
goto unlock;
found:
h->s->err = -EROFS;
@@ -1466,7 +1516,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
size_t idx,
struct bkey_i_stripe *new_key)
{
- struct bch_fs *c = trans->c;
+ const struct bch_stripe *v;
struct bkey_s_c k;
unsigned i;
int ret;
@@ -1481,16 +1531,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_stripe)
return -EIO;
+ v = bkey_s_c_to_stripe(k).v;
+ for (i = 0; i < v->nr_blocks; i++)
+ if (m->block_sectors[i] != stripe_blockcount_get(v, i))
+ goto write;
+ return 0;
+write:
bkey_reassemble(&new_key->k_i, k);
- spin_lock(&c->ec_stripes_heap_lock);
-
for (i = 0; i < new_key->v.nr_blocks; i++)
stripe_blockcount_set(&new_key->v, i,
m->block_sectors[i]);
- m->dirty = false;
-
- spin_unlock(&c->ec_stripes_heap_lock);
bch2_trans_update(trans, iter, &new_key->k_i, 0);
return 0;
@@ -1514,7 +1565,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
genradix_for_each(&c->stripes[0], giter, m) {
- if (!m->dirty)
+ if (!m->alive)
continue;
ret = __bch2_trans_do(&trans, NULL, NULL,
@@ -1624,19 +1675,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
h->target, h->algo, h->redundancy);
if (h->s)
- pr_buf(out, "\tpending: blocks %u allocated %u\n",
- h->s->blocks.nr,
+ pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
+ h->s->nr_data, h->s->nr_parity,
bitmap_weight(h->s->blocks_allocated,
- h->s->blocks.nr));
+ h->s->nr_data));
}
mutex_unlock(&c->ec_stripe_head_lock);
mutex_lock(&c->ec_stripe_new_lock);
list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
- s->blocks.nr,
- bitmap_weight(s->blocks_allocated,
- s->blocks.nr),
+ pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
+ s->nr_data, s->nr_parity,
atomic_read(&s->pin));
}
mutex_unlock(&c->ec_stripe_new_lock);
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index f124582f..765baa9d 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -143,11 +143,9 @@ struct ec_stripe_new {
bool pending;
bool have_existing_stripe;
+ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-
- struct open_buckets blocks;
- u8 data_block_idx[BCH_BKEY_PTRS_MAX];
- struct open_buckets parity;
+ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
struct keylist keys;
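
The separate blocks/parity open-bucket lists collapse into one fixed-size index array plus two bitmaps. Their intended semantics, as used by the ec.c hunks (a sketch, not authoritative documentation):

    /*
     * blocks_gotten:    we hold a bucket (or reuse an existing stripe's
     *                   block) for this index -- set at allocation time.
     * blocks_allocated: the block's data has actually been allocated /
     *                   written.
     */
    if (h->s->allocated &&
        bitmap_weight(h->s->blocks_allocated, h->s->nr_data) == h->s->nr_data)
            ec_stripe_set_pending(c, h);    /* every data block in use */
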
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index 5b688b43..84777016 100644
--- a/libbcachefs/ec_types.h
+++ b/libbcachefs/ec_types.h
@@ -18,8 +18,7 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
- unsigned alive:1;
- unsigned dirty:1;
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
unsigned on_heap:1;
u8 blocks_nonempty;
u16 block_sectors[BCH_BKEY_PTRS_MAX];
diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c
index 1faca4bc..5c43678e 100644
--- a/libbcachefs/extent_update.c
+++ b/libbcachefs/extent_update.c
@@ -192,18 +192,13 @@ bch2_extent_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter = l->iter;
- struct bkey_packed *_k;
struct bkey_s_c k;
- struct bkey unpacked;
- int sectors;
+ int ret, sectors;
- _k = bch2_btree_node_iter_peek(&node_iter, l->b);
- if (!_k)
- return BTREE_INSERT_OK;
-
- k = bkey_disassemble(l->b, _k, &unpacked);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
/* Check if we're splitting a compressed extent: */
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 959eff4c..af7f8791 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -84,6 +84,7 @@ struct dio_read {
struct closure cl;
struct kiocb *req;
long ret;
+ bool should_dirty;
struct bch_read_bio rbio;
};
@@ -1619,12 +1620,22 @@ again:
/* O_DIRECT reads */
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+ if (check_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
static void bch2_dio_read_complete(struct closure *cl)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
dio->req->ki_complete(dio->req, dio->ret, 0);
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}
static void bch2_direct_IO_read_endio(struct bio *bio)
@@ -1639,8 +1650,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio)
static void bch2_direct_IO_read_split_endio(struct bio *bio)
{
+ struct dio_read *dio = bio->bi_private;
+ bool should_dirty = dio->should_dirty;
+
bch2_direct_IO_read_endio(bio);
- bio_check_pages_dirty(bio); /* transfers ownership */
+ bio_check_or_release(bio, should_dirty);
}
static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
@@ -1694,6 +1708,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
dio->req = req;
dio->ret = ret;
+ /*
+ * This is one of the sketchier things I've encountered: we have to skip
+ * the dirtying of requests that are internal from the kernel (i.e. from
+ * loopback), because we'll deadlock on page_lock.
+ */
+ dio->should_dirty = iter_is_iovec(iter);
goto start;
while (iter->count) {
@@ -1715,7 +1735,9 @@ start:
}
offset += bio->bi_iter.bi_size;
- bio_set_pages_dirty(bio);
+
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
if (iter->count)
closure_get(&dio->cl);
@@ -1729,7 +1751,7 @@ start:
closure_sync(&dio->cl);
closure_debug_destroy(&dio->cl);
ret = dio->ret;
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
return ret;
} else {
return -EIOCBQUEUED;
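
Condensed, the new O_DIRECT read rule above: only user-backed iovecs get their pages dirtied (kernel-internal iterators, e.g. from the loop driver, would deadlock on the page lock), and completion either re-checks dirtiness or simply releases the pages:

    dio->should_dirty = iter_is_iovec(iter);

    if (dio->should_dirty)
            bio_set_pages_dirty(bio);

    /* and on completion: */
    bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
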
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 4c4ba07c..5f74583f 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -499,9 +499,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
- if (!journal_flushes_device(ca))
- n->bio.bi_opf |= REQ_FUA;
-
if (likely(n->have_ioref)) {
this_cpu_add(ca->io_done->sectors[WRITE][type],
bio_sectors(&n->bio));
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 04c94e57..d6273c8d 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -9,6 +9,7 @@
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_gc.h"
+#include "btree_update.h"
#include "buckets.h"
#include "journal.h"
#include "journal_io.h"
@@ -82,6 +83,7 @@ static void bch2_journal_buf_init(struct journal *j)
bkey_extent_init(&buf->key);
buf->noflush = false;
buf->must_flush = false;
+ buf->separate_flush = false;
memset(buf->has_inode, 0, sizeof(buf->has_inode));
@@ -823,18 +825,28 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- 0);
+ if (!c || new_fs)
+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB),
+ 0);
if (c) {
spin_unlock(&c->journal.lock);
percpu_up_read(&c->mark_lock);
}
+ if (c && !new_fs)
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_trans_mark_metadata_bucket(&trans, NULL, ca,
+ bucket, BCH_DATA_journal,
+ ca->mi.bucket_size));
+
if (!new_fs)
bch2_open_bucket_put(c, ob);
+
+ if (ret)
+ goto err;
}
err:
bch2_sb_resize_journal(&ca->disk_sb,
@@ -953,6 +965,7 @@ void bch2_fs_journal_stop(struct journal *j)
journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
(journal_entry_is_open(j) ||
j->last_empty_seq + 1 != journal_cur_seq(j)));
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 1db1f190..bda8cb97 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -494,11 +494,6 @@ static inline int bch2_journal_error(struct journal *j)
struct bch_dev;
-static inline bool journal_flushes_device(struct bch_dev *ca)
-{
- return true;
-}
-
static inline void bch2_journal_set_replay_done(struct journal *j)
{
BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 385cb4d5..750f6fab 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1189,6 +1189,53 @@ static void journal_write_endio(struct bio *bio)
percpu_ref_put(&ca->io_ref);
}
+static void do_journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_extent_ptr *ptr;
+ struct bio *bio;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
+
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
+ sectors);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
+
+ if (!JSET_NO_FLUSH(w->data))
+ bio->bi_opf |= REQ_FUA;
+ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+ bio->bi_opf |= REQ_PREFLUSH;
+
+ bch2_bio_map(bio, w->data, sectors << 9);
+
+ trace_journal_write(bio);
+ closure_bio_submit(bio, cl);
+
+ ca->journal.bucket_seq[ca->journal.cur_idx] =
+ le64_to_cpu(w->data->seq);
+ }
+
+ continue_at(cl, journal_write_done, system_highpri_wq);
+ return;
+}
+
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
@@ -1198,9 +1245,8 @@ void bch2_journal_write(struct closure *cl)
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
- struct bch_extent_ptr *ptr;
bool validate_before_checksum = false;
- unsigned i, sectors, bytes, u64s;
+ unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
@@ -1330,49 +1376,30 @@ retry_alloc:
if (c->opts.nochanges)
goto no_io;
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!percpu_ref_tryget(&ca->io_ref)) {
- /* XXX: fix this */
- bch_err(c, "missing device for journal write\n");
- continue;
- }
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
- sectors);
+ for_each_rw_member(ca, c, i)
+ nr_rw_members++;
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
- if (!JSET_NO_FLUSH(jset))
- bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
- bch2_bio_map(bio, jset, sectors << 9);
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
- trace_journal_write(bio);
- closure_bio_submit(bio, cl);
+ if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+ for_each_rw_member(ca, c, i) {
+ percpu_ref_get(&ca->io_ref);
- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_opf = REQ_OP_FLUSH;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
}
- if (!JSET_NO_FLUSH(jset)) {
- for_each_rw_member(ca, c, i)
- if (journal_flushes_device(ca) &&
- !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
- percpu_ref_get(&ca->io_ref);
-
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_FLUSH;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
- }
+ bch2_bucket_seq_cleanup(c);
+
+ continue_at(cl, do_journal_write, system_highpri_wq);
+ return;
no_io:
bch2_bucket_seq_cleanup(c);
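
The net behaviour of the reworked journal write path, summarized (a sketch of the flag logic in do_journal_write() and bch2_journal_write() above):

    /*
     * One rw device:   the journal data write itself carries
     *                  REQ_PREFLUSH|REQ_FUA.
     * Several devices: w->separate_flush is set; an empty REQ_OP_FLUSH
     *                  bio goes to every rw member first, then each
     *                  journal data write carries REQ_FUA only.
     * JSET_NO_FLUSH:   neither flag -- nothing to make durable yet.
     */
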
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 9953663e..d17a1ff8 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -31,6 +31,7 @@ struct journal_buf {
unsigned u64s_reserved;
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
+ bool separate_flush;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 9505eab9..b4c315cf 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -154,7 +154,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
if (ret)
goto err;
- if (disk_sectors_delta > (s64) &op->res.sectors) {
+ if (disk_sectors_delta > (s64) op->res.sectors) {
ret = bch2_disk_reservation_add(c, &op->res,
disk_sectors_delta - op->res.sectors,
!should_check_enospc
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index efa7f38e..d0acc1ee 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -291,7 +291,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
fragmented_allowed += ((__dev_buckets_available(ca, usage) *
ca->mi.bucket_size) >> 1);
- fragmented += usage.sectors_fragmented;
+ fragmented += usage.d[BCH_DATA_user].fragmented;
}
return max_t(s64, 0, fragmented_allowed - fragmented);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 8c67f146..422f2fbe 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -1099,27 +1099,13 @@ use_clean:
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
- if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) &&
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) {
- /*
- * interior btree node updates aren't consistent with the
- * journal; after an unclean shutdown we have to walk all
- * pointers to metadata:
- */
- bch_info(c, "starting metadata mark and sweep");
- err = "error in mark and sweep";
- ret = bch2_gc(c, &c->journal_keys, true, true);
- if (ret)
- goto err;
- bch_verbose(c, "mark and sweep done");
- }
-
if (c->opts.fsck ||
!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bch_info(c, "starting mark and sweep");
err = "error in mark and sweep";
- ret = bch2_gc(c, &c->journal_keys, true, false);
+ ret = bch2_gc(c, &c->journal_keys, true);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index b1d8db67..ce8b7355 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -159,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
BUG_ON(!new_entry->data_type);
verify_replicas_entry(new_entry);
- new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
+ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
if (!new.entries)
return new;
@@ -282,13 +282,13 @@ static int replicas_table_update(struct bch_fs *c,
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
- sizeof(u64), GFP_NOIO)))
+ sizeof(u64), GFP_KERNEL)))
goto err;
- if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
- !(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
+ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
+ !(new_scratch = kmalloc(bytes, GFP_KERNEL)) ||
(c->usage_gc &&
- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
goto err;
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
@@ -548,7 +548,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
c->replicas_gc.entry_size,
- GFP_NOIO);
+ GFP_KERNEL);
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
@@ -671,7 +671,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
nr++;
}
- cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -ENOMEM;
@@ -703,7 +703,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
entry_size += sizeof(struct bch_replicas_entry) -
sizeof(struct bch_replicas_entry_v0);
- cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -ENOMEM;
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 651fbc5d..00681533 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -235,10 +235,7 @@ nowrote_alloc:
* the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal:
*/
- if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- bch2_btree_flush_all_writes(c);
- else
- bch2_btree_verify_flushed(c);
+ bch2_btree_flush_all_writes(c);
/*
* After stopping journal:
@@ -1222,13 +1219,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
- if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) &&
- !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) {
- mutex_lock(&c->sb_lock);
- bch2_mark_dev_superblock(ca->fs, ca, 0);
- mutex_unlock(&c->sb_lock);
- }
-
bch2_dev_sysfs_online(c, ca);
if (c->sb.nr_devices == 1)
@@ -1602,7 +1592,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
- bch2_mark_dev_superblock(ca->fs, ca, 0);
+ bch2_mark_dev_superblock(NULL, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
@@ -1661,15 +1651,13 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
- bch2_mark_dev_superblock(c, ca, 0);
-
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- err = "alloc write failed";
- ret = bch2_dev_alloc_write(c, ca, 0);
+ err = "error marking superblock";
+ ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)
- goto err;
+ goto err_late;
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
@@ -1690,6 +1678,7 @@ err:
bch_err(c, "Unable to add device: %s", err);
return ret;
err_late:
+ up_write(&c->state_lock);
bch_err(c, "Error going rw after adding device: %s", err);
return -EINVAL;
}
@@ -1724,6 +1713,11 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
goto err;
}
+ if (bch2_trans_mark_dev_sb(c, NULL, ca)) {
+ err = "bch2_trans_mark_dev_sb() error";
+ goto err;
+ }
+
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index bfae0d71..4fc5777e 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -797,61 +797,42 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
nr[c->open_buckets[i].type]++;
pr_buf(out,
- "free_inc: %zu/%zu\n"
- "free[RESERVE_MOVINGGC]: %zu/%zu\n"
- "free[RESERVE_NONE]: %zu/%zu\n"
- "buckets:\n"
- " capacity: %llu\n"
- " alloc: %llu\n"
- " sb: %llu\n"
- " journal: %llu\n"
- " meta: %llu\n"
- " user: %llu\n"
- " cached: %llu\n"
- " erasure coded: %llu\n"
- " available: %lli\n"
- "sectors:\n"
- " sb: %llu\n"
- " journal: %llu\n"
- " meta: %llu\n"
- " user: %llu\n"
- " cached: %llu\n"
- " erasure coded: %llu\n"
- " fragmented: %llu\n"
- " copygc threshold: %llu\n"
- "freelist_wait: %s\n"
- "open buckets: %u/%u (reserved %u)\n"
- "open_buckets_wait: %s\n"
- "open_buckets_btree: %u\n"
- "open_buckets_user: %u\n"
- "btree reserve cache: %u\n",
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
- ca->mi.nbuckets - ca->mi.first_bucket,
- stats.buckets_alloc,
- stats.buckets[BCH_DATA_sb],
- stats.buckets[BCH_DATA_journal],
- stats.buckets[BCH_DATA_btree],
- stats.buckets[BCH_DATA_user],
- stats.buckets[BCH_DATA_cached],
- stats.buckets_ec,
- __dev_buckets_available(ca, stats),
- stats.sectors[BCH_DATA_sb],
- stats.sectors[BCH_DATA_journal],
- stats.sectors[BCH_DATA_btree],
- stats.sectors[BCH_DATA_user],
- stats.sectors[BCH_DATA_cached],
- stats.sectors_ec,
- stats.sectors_fragmented,
- c->copygc_threshold,
- c->freelist_wait.list.first ? "waiting" : "empty",
- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
- BTREE_NODE_OPEN_BUCKET_RESERVE,
- c->open_buckets_wait.list.first ? "waiting" : "empty",
- nr[BCH_DATA_btree],
- nr[BCH_DATA_user],
- c->btree_reserve_cache_nr);
+ "\t\t buckets\t sectors fragmented\n"
+ "capacity%16llu\n",
+ ca->mi.nbuckets - ca->mi.first_bucket);
+
+ for (i = 1; i < BCH_DATA_NR; i++)
+ pr_buf(out, "%-8s%16llu%16llu%16llu\n",
+ bch2_data_types[i], stats.d[i].buckets,
+ stats.d[i].sectors, stats.d[i].fragmented);
+
+ pr_buf(out,
+ "ec\t%16llu\n"
+ "available%15llu\n"
+ "alloc\t%16llu\n"
+ "\n"
+ "free_inc\t\t%zu/%zu\n"
+ "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
+ "free[RESERVE_NONE]\t%zu/%zu\n"
+ "freelist_wait\t\t%s\n"
+ "open buckets\t\t%u/%u (reserved %u)\n"
+ "open_buckets_wait\t%s\n"
+ "open_buckets_btree\t%u\n"
+ "open_buckets_user\t%u\n"
+ "btree reserve cache\t%u\n",
+ stats.buckets_ec,
+ __dev_buckets_available(ca, stats),
+ stats.buckets_alloc,
+ fifo_used(&ca->free_inc), ca->free_inc.size,
+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
+ c->freelist_wait.list.first ? "waiting" : "empty",
+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
+ BTREE_NODE_OPEN_BUCKET_RESERVE,
+ c->open_buckets_wait.list.first ? "waiting" : "empty",
+ nr[BCH_DATA_btree],
+ nr[BCH_DATA_user],
+ c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {
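
With the format strings above, the per-device allocation debug file now renders as a table, roughly like this (illustrative values, alignment approximate):

                     buckets         sectors      fragmented
    capacity          238464
    sb                    13            6149             507
    journal                8            4096               0
    btree                 64          524288               0
    user              180000        91000000          200000
    cached             12000         2150000               0
    ...
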