-rw-r--r--  .bcachefs_revision                     2
-rw-r--r--  include/trace/events/bcachefs.h      106
-rw-r--r--  libbcachefs/bcachefs.h                21
-rw-r--r--  libbcachefs/bcachefs_format.h          1
-rw-r--r--  libbcachefs/bset.c                    12
-rw-r--r--  libbcachefs/btree_gc.c               202
-rw-r--r--  libbcachefs/btree_io.c               306
-rw-r--r--  libbcachefs/btree_io.h                 4
-rw-r--r--  libbcachefs/btree_iter.c               2
-rw-r--r--  libbcachefs/btree_update_interior.c   36
-rw-r--r--  libbcachefs/btree_update_interior.h    4
-rw-r--r--  libbcachefs/buckets.c                176
-rw-r--r--  libbcachefs/fs-common.c                3
-rw-r--r--  libbcachefs/fs-ioctl.c                60
-rw-r--r--  libbcachefs/fs.c                       2
-rw-r--r--  libbcachefs/inode.c                   21
-rw-r--r--  libbcachefs/inode.h                    2
-rw-r--r--  libbcachefs/io.c                       9
-rw-r--r--  libbcachefs/io.h                       2
-rw-r--r--  libbcachefs/journal.c                 15
-rw-r--r--  libbcachefs/journal_io.c              44
-rw-r--r--  libbcachefs/journal_reclaim.c          4
-rw-r--r--  libbcachefs/journal_seq_blacklist.c    6
-rw-r--r--  libbcachefs/journal_types.h            1
-rw-r--r--  libbcachefs/move.c                     9
-rw-r--r--  libbcachefs/movinggc.c                 3
-rw-r--r--  libbcachefs/opts.h                     7
-rw-r--r--  libbcachefs/recovery.c                18
-rw-r--r--  libbcachefs/reflink.c                 54
-rw-r--r--  libbcachefs/reflink.h                 24
-rw-r--r--  libbcachefs/super-io.c                14
-rw-r--r--  libbcachefs/super.c                   16
-rw-r--r--  libbcachefs/sysfs.c                    2
33 files changed, 954 insertions, 234 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8da505a4..93876ae2 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-ac3ab6a511717db1644ded49a6f417304abba048
+3913e0cac34e0993ab6dde67a2dec1ea485a2e28
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index c79338c8..7c90ba01 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -49,14 +49,14 @@ DECLARE_EVENT_CLASS(bch_fs,
TP_ARGS(c),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
+ __field(dev_t, dev )
),
TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->dev = c->dev;
),
- TP_printk("%pU", __entry->uuid)
+ TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);
DECLARE_EVENT_CLASS(bio,
@@ -131,7 +131,7 @@ TRACE_EVENT(journal_reclaim_start,
btree_key_cache_dirty, btree_key_cache_total),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
+ __field(dev_t, dev )
__field(u64, min_nr )
__field(u64, prereserved )
__field(u64, prereserved_total )
@@ -142,7 +142,7 @@ TRACE_EVENT(journal_reclaim_start,
),
TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->dev = c->dev;
__entry->min_nr = min_nr;
__entry->prereserved = prereserved;
__entry->prereserved_total = prereserved_total;
@@ -152,8 +152,8 @@ TRACE_EVENT(journal_reclaim_start,
__entry->btree_key_cache_total = btree_key_cache_total;
),
- TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
- __entry->uuid,
+ TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->min_nr,
__entry->prereserved,
__entry->prereserved_total,
@@ -168,16 +168,18 @@ TRACE_EVENT(journal_reclaim_finish,
TP_ARGS(c, nr_flushed),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(u64, nr_flushed )
+ __field(dev_t, dev )
+ __field(u64, nr_flushed )
),
TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->nr_flushed = nr_flushed;
+ __entry->dev = c->dev;
+ __entry->nr_flushed = nr_flushed;
),
- TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+ TP_printk("%d%d flushed %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_flushed)
);
/* bset.c: */
@@ -194,7 +196,7 @@ DECLARE_EVENT_CLASS(btree_node,
TP_ARGS(c, b),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
+ __field(dev_t, dev )
__field(u8, level )
__field(u8, id )
__field(u64, inode )
@@ -202,15 +204,16 @@ DECLARE_EVENT_CLASS(btree_node,
),
TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->dev = c->dev;
__entry->level = b->c.level;
__entry->id = b->c.btree_id;
__entry->inode = b->key.k.p.inode;
__entry->offset = b->key.k.p.offset;
),
- TP_printk("%pU %u id %u %llu:%llu",
- __entry->uuid, __entry->level, __entry->id,
+ TP_printk("%d,%d %u id %u %llu:%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->level, __entry->id,
__entry->inode, __entry->offset)
);
@@ -254,32 +257,17 @@ DEFINE_EVENT(btree_node, btree_node_reap,
TP_ARGS(c, b)
);
-DECLARE_EVENT_CLASS(btree_node_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- ),
-
- TP_printk("%pU", __entry->uuid)
-);
-
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
@@ -294,18 +282,19 @@ TRACE_EVENT(btree_reserve_get_fail,
TP_ARGS(c, required, cl),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
+ __field(dev_t, dev )
__field(size_t, required )
__field(struct closure *, cl )
),
TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->dev = c->dev;
__entry->required = required;
__entry->cl = cl;
),
- TP_printk("%pU required %zu by %p", __entry->uuid,
+ TP_printk("%d,%d required %zu by %p",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->required, __entry->cl)
);
@@ -483,19 +472,20 @@ TRACE_EVENT(move_data,
TP_ARGS(c, sectors_moved, keys_moved),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
+ __field(dev_t, dev )
__field(u64, sectors_moved )
__field(u64, keys_moved )
),
TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->dev = c->dev;
__entry->sectors_moved = sectors_moved;
__entry->keys_moved = keys_moved;
),
- TP_printk("%pU sectors_moved %llu keys_moved %llu",
- __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
+ TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->sectors_moved, __entry->keys_moved)
);
TRACE_EVENT(copygc,
@@ -507,7 +497,7 @@ TRACE_EVENT(copygc,
buckets_moved, buckets_not_moved),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
+ __field(dev_t, dev )
__field(u64, sectors_moved )
__field(u64, sectors_not_moved )
__field(u64, buckets_moved )
@@ -515,17 +505,39 @@ TRACE_EVENT(copygc,
),
TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->dev = c->dev;
__entry->sectors_moved = sectors_moved;
__entry->sectors_not_moved = sectors_not_moved;
__entry->buckets_moved = buckets_moved;
__entry->buckets_not_moved = buckets_not_moved;
),
- TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
- __entry->uuid,
- __entry->sectors_moved, __entry->sectors_not_moved,
- __entry->buckets_moved, __entry->buckets_not_moved)
+ TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->sectors_moved, __entry->sectors_not_moved,
+ __entry->buckets_moved, __entry->buckets_not_moved)
+);
+
+TRACE_EVENT(copygc_wait,
+ TP_PROTO(struct bch_fs *c,
+ u64 wait_amount, u64 until),
+ TP_ARGS(c, wait_amount, until),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, wait_amount )
+ __field(u64, until )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->wait_amount = wait_amount;
+ __entry->until = until;
+ ),
+
+ TP_printk("%d,%u waiting for %llu sectors until %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->wait_amount, __entry->until)
);
TRACE_EVENT(trans_get_iter,
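
Note: the events above switch from logging the 16-byte filesystem UUID to the
much smaller dev_t of the mounted block device. For reference, this is the
kernel-internal dev_t layout the MAJOR()/MINOR() calls decode (as defined in
include/linux/kdev_t.h):

	/* kernel-internal dev_t: 12-bit major, 20-bit minor */
	#define MINORBITS	20
	#define MINORMASK	((1U << MINORBITS) - 1)

	#define MAJOR(dev)	((unsigned int) ((dev) >> MINORBITS))
	#define MINOR(dev)	((unsigned int) ((dev) & MINORMASK))
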
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 24aa2cc7..8be95d81 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -263,7 +263,10 @@ do { \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
- "done in memory")
+ "done in memory") \
+ BCH_DEBUG_PARAM(verify_all_btree_replicas, \
+ "When reading btree nodes, read all replicas and " \
+ "compare them")
/* Parameters that should only be compiled in in debug mode: */
#define BCH_DEBUG_PARAMS_DEBUG() \
@@ -387,6 +390,14 @@ struct gc_pos {
unsigned level;
};
+struct reflink_gc {
+ u64 offset;
+ u32 size;
+ u32 refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
@@ -564,6 +575,7 @@ struct bch_fs {
int minor;
struct device *chardev;
struct super_block *vfs_sb;
+ dev_t dev;
char name[40];
/* ro/rw, add/remove/resize devices: */
@@ -623,6 +635,7 @@ struct bch_fs {
/* BTREE CACHE */
struct bio_set btree_bio;
+ struct workqueue_struct *io_complete_wq;
struct btree_root btree_roots[BTREE_ID_NR];
struct mutex btree_root_lock;
@@ -660,7 +673,8 @@ struct bch_fs {
struct btree_key_cache btree_key_cache;
- struct workqueue_struct *wq;
+ struct workqueue_struct *btree_update_wq;
+ struct workqueue_struct *btree_error_wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
@@ -799,6 +813,9 @@ struct bch_fs {
/* REFLINK */
u64 reflink_hint;
+ reflink_gc_table reflink_gc_table;
+ size_t reflink_gc_nr;
+ size_t reflink_gc_idx;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
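
Note: the new verify_all_btree_replicas knob is a BCH_DEBUG_PARAM; assuming
the standard debug-param plumbing (which exposes each one as a module
parameter), it would be toggled like any other bcachefs debug option:

	/* Hypothetical usage, assuming BCH_DEBUG_PARAMs are surfaced as
	 * module parameters (they normally are):
	 *
	 *	modprobe bcachefs verify_all_btree_replicas=1
	 *	echo 1 > /sys/module/bcachefs/parameters/verify_all_btree_replicas
	 */
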
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index d640a311..79c0876a 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1344,6 +1344,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
+LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
/*
* Features:
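
Note: BCH_SB_SHARD_INUMS claims a single bit (bit 28) of the little-endian
flags[3] superblock word. A simplified sketch of the accessor pair a
LE64_BITMASK() line like this generates, assuming the usual bcachefs
LE_BITMASK definition:

	static inline __u64 BCH_SB_SHARD_INUMS(const struct bch_sb *k)
	{
		return (__le64_to_cpu(k->flags[3]) >> 28) & ~(~0ULL << (29 - 28));
	}

	static inline void SET_BCH_SB_SHARD_INUMS(struct bch_sb *k, __u64 v)
	{
		__u64 new = __le64_to_cpu(k->flags[3]);

		new &= ~(~(~0ULL << (29 - 28)) << 28);		/* clear bit 28 */
		new |= (v & ~(~0ULL << (29 - 28))) << 28;	/* set from v   */
		k->flags[3] = __cpu_to_le64(new);
	}
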
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 26203a5d..8a149e21 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -1193,13 +1193,11 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b,
static inline void prefetch_four_cachelines(void *p)
{
-#if (CONFIG_X86_64 && !defined(__clang__))
- asm(".intel_syntax noprefix;"
- "prefetcht0 [%0 - 127 + 64 * 0];"
- "prefetcht0 [%0 - 127 + 64 * 1];"
- "prefetcht0 [%0 - 127 + 64 * 2];"
- "prefetcht0 [%0 - 127 + 64 * 3];"
- ".att_syntax prefix;"
+#ifdef CONFIG_X86_64
+ asm("prefetcht0 (-127 + 64 * 0)(%0);"
+ "prefetcht0 (-127 + 64 * 1)(%0);"
+ "prefetcht0 (-127 + 64 * 2)(%0);"
+ "prefetcht0 (-127 + 64 * 3)(%0);"
:
: "r" (p + 127));
#else
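
Note: the rewrite drops the Intel-syntax wrapper in favor of plain AT&T
syntax, which clang's integrated assembler also accepts (hence dropping the
!defined(__clang__) guard). The odd-looking p + 127 base with -127
displacements is deliberate: the four effective displacements (-127, -63, 1,
65) all fit in a signed byte, so each prefetcht0 encodes with a 1-byte rather
than 4-byte displacement. A portable sketch of the same idea using the
GCC/clang builtin, likely close to what the #else branch does:

	static inline void prefetch_four_cachelines_generic(void *p)
	{
		/* assumes 64-byte cachelines */
		__builtin_prefetch(p + 64 * 0);
		__builtin_prefetch(p + 64 * 1);
		__builtin_prefetch(p + 64 * 2);
		__builtin_prefetch(p + 64 * 3);
	}
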
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index e28292e0..b03432c1 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -23,6 +23,7 @@
#include "keylist.h"
#include "move.h"
#include "recovery.h"
+#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
@@ -1282,6 +1283,201 @@ static int bch2_gc_start(struct bch_fs *c,
return 0;
}
+static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct reflink_gc *r;
+ const __le64 *refcount = bkey_refcount_c(k);
+ char buf[200];
+ int ret = 0;
+
+ if (!refcount)
+ return 0;
+
+ r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
+	if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ return -EINVAL;
+ }
+
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+ r->refcount)) {
+ struct bkey_i *new;
+
+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ goto fsck_err;
+ }
+
+ bkey_reassemble(new, k);
+
+ if (!r->refcount) {
+ new->k.type = KEY_TYPE_deleted;
+ new->k.size = 0;
+ } else {
+ *bkey_refcount(new) = cpu_to_le64(r->refcount);
+ }
+
+ ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
+ if (ret)
+ kfree(new);
+ }
+fsck_err:
+ return ret;
+}
+
+static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
+ bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ struct reflink_gc *r;
+ size_t idx = 0;
+ char buf[200];
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ if (initial) {
+ c->reflink_gc_idx = 0;
+
+ ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+ bch2_gc_reflink_done_initial_fn);
+ goto out;
+ }
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ continue;
+
+ r = genradix_ptr(&c->reflink_gc_table, idx);
+ if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ ret = -EINVAL;
+ break;
+ }
+
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+ r->refcount)) {
+ struct bkey_i *new;
+
+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ bkey_reassemble(new, k);
+
+ if (!r->refcount)
+ new->k.type = KEY_TYPE_deleted;
+ else
+ *bkey_refcount(new) = cpu_to_le64(r->refcount);
+
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
+ kfree(new);
+
+ if (ret)
+ break;
+ }
+ }
+fsck_err:
+ bch2_trans_iter_put(&trans, iter);
+ bch2_trans_exit(&trans);
+out:
+ genradix_free(&c->reflink_gc_table);
+ c->reflink_gc_nr = 0;
+ return ret;
+}
+
+static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+
+ struct reflink_gc *r;
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ return 0;
+
+ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+ GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ return 0;
+}
+
+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+ bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ struct reflink_gc *r;
+ int ret;
+
+ if (metadata_only)
+ return 0;
+
+ genradix_free(&c->reflink_gc_table);
+ c->reflink_gc_nr = 0;
+
+ if (initial)
+ return bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+ bch2_gc_reflink_start_initial_fn);
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ continue;
+
+ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+ GFP_KERNEL);
+ if (!r) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ }
+ bch2_trans_iter_put(&trans, iter);
+
+ bch2_trans_exit(&trans);
+	return ret;
+}
+
/**
* bch2_gc - walk _all_ references to buckets, and recompute them:
*
@@ -1316,7 +1512,8 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
again:
- ret = bch2_gc_start(c, metadata_only);
+ ret = bch2_gc_start(c, metadata_only) ?:
+ bch2_gc_reflink_start(c, initial, metadata_only);
if (ret)
goto out;
@@ -1378,7 +1575,8 @@ out:
bch2_journal_block(&c->journal);
percpu_down_write(&c->mark_lock);
- ret = bch2_gc_done(c, initial, metadata_only);
+ ret = bch2_gc_reflink_done(c, initial, metadata_only) ?:
+ bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal);
} else {
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 094285bd..47cfd8a0 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -521,7 +521,7 @@ enum btree_validate_ret {
\
switch (write) { \
case READ: \
- bch_err(c, "%s", _buf2); \
+ bch_err(c, "%s", _buf2); \
\
switch (type) { \
case BTREE_ERR_FIXABLE: \
@@ -815,6 +815,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
+ unsigned nonblacklisted_written = 0;
int ret, retry_read = 0, write = READ;
b->version_ondisk = U16_MAX;
@@ -934,15 +935,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sort_iter_add(iter,
vstruct_idx(i, whiteout_u64s),
vstruct_last(i));
+
+ nonblacklisted_written = b->written;
}
for (bne = write_block(b);
bset_byte_offset(b, bne) < btree_bytes(c);
bne = (void *) bne + block_bytes(c))
- btree_err_on(bne->keys.seq == b->data->keys.seq,
+ btree_err_on(bne->keys.seq == b->data->keys.seq &&
+ !bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq),
+ true),
BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
"found bset signature after last bset");
+ /*
+ * Blacklisted bsets are those that were written after the most recent
+ * (flush) journal write. Since there wasn't a flush, they may not have
+ * made it to all devices - which means we shouldn't write new bsets
+ * after them, as that could leave a gap and then reads from that device
+ * wouldn't find all the bsets in that btree node - which means it's
+ * important that we start writing new bsets after the most recent _non_
+ * blacklisted bset:
+ */
+ b->written = nonblacklisted_written;
+
sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
sorted->keys.u64s = 0;
@@ -1027,8 +1044,8 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
+ struct btree *b = rb->b;
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
- struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
char buf[200];
@@ -1101,7 +1118,263 @@ static void btree_node_read_endio(struct bio *bio)
bch2_latency_acct(ca, rb->start_time, READ);
}
- queue_work(system_unbound_wq, &rb->work);
+ queue_work(c->io_complete_wq, &rb->work);
+}
+
+struct btree_node_read_all {
+ struct closure cl;
+ struct bch_fs *c;
+ struct btree *b;
+ unsigned nr;
+ void *buf[BCH_REPLICAS_MAX];
+ struct bio *bio[BCH_REPLICAS_MAX];
+ int err[BCH_REPLICAS_MAX];
+};
+
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+ unsigned offset = 0;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return 0;
+
+ while (offset < c->opts.btree_node_size) {
+ if (!offset) {
+ offset += vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = data + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ offset += vstruct_sectors(bne, c->block_bits);
+ }
+ }
+
+ return offset;
+}
+
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+
+ if (!offset)
+ return false;
+
+ while (offset < c->opts.btree_node_size) {
+ bne = data + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq)
+ return true;
+ offset++;
+ }
+
+ return false;
+}
+
+static void btree_node_read_all_replicas_done(struct closure *cl)
+{
+ struct btree_node_read_all *ra =
+ container_of(cl, struct btree_node_read_all, cl);
+ struct bch_fs *c = ra->c;
+ struct btree *b = ra->b;
+ bool have_good_copy = false;
+ bool dump_bset_maps = false;
+ bool have_retry = false;
+ int ret = 0, write = READ;
+ unsigned i, written, written2;
+ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+
+ for (i = 0; i < ra->nr; i++) {
+ if (ra->err[i])
+ continue;
+
+ if (!have_good_copy) {
+ memcpy(b->data, ra->buf[i], btree_bytes(c));
+ have_good_copy = true;
+ written = btree_node_sectors_written(c, b->data);
+ }
+
+ /* Try to get the right btree node: */
+ if (have_good_copy &&
+ seq &&
+ b->data->keys.seq != seq &&
+ ((struct btree_node *) ra->buf[i])->keys.seq == seq) {
+ memcpy(b->data, ra->buf[i], btree_bytes(c));
+ written = btree_node_sectors_written(c, b->data);
+ }
+
+ written2 = btree_node_sectors_written(c, ra->buf[i]);
+ if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+ "btree node sectors written mismatch: %u != %u",
+ written, written2) ||
+ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
+ BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+ "found bset signature after last bset") ||
+ btree_err_on(memcmp(b->data, ra->buf[i], written << 9),
+ BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+ "btree node replicas content mismatch"))
+ dump_bset_maps = true;
+
+ if (written2 > written) {
+ written = written2;
+ memcpy(b->data, ra->buf[i], btree_bytes(c));
+ }
+ }
+fsck_err:
+ if (dump_bset_maps) {
+ for (i = 0; i < ra->nr; i++) {
+ char buf[200];
+ struct printbuf out = PBUF(buf);
+ struct btree_node *bn = ra->buf[i];
+ struct btree_node_entry *bne = NULL;
+ unsigned offset = 0, sectors;
+ bool gap = false;
+
+ if (ra->err[i])
+ continue;
+
+ while (offset < c->opts.btree_node_size) {
+ if (!offset) {
+ sectors = vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ pr_buf(&out, " %u-%u", offset, offset + sectors);
+ if (bne && bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ pr_buf(&out, "*");
+ offset += sectors;
+ }
+
+ while (offset < c->opts.btree_node_size) {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq) {
+ if (!gap)
+ pr_buf(&out, " GAP");
+ gap = true;
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ pr_buf(&out, " %u-%u", offset, offset + sectors);
+ if (bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ pr_buf(&out, "*");
+ }
+ offset++;
+ }
+
+ bch_err(c, "replica %u:%s", i, buf);
+ }
+ }
+
+ if (have_good_copy)
+ bch2_btree_node_read_done(c, NULL, b, false);
+ else
+ set_btree_node_read_error(b);
+
+ for (i = 0; i < ra->nr; i++) {
+ mempool_free(ra->buf[i], &c->btree_bounce_pool);
+ bio_put(ra->bio[i]);
+ }
+
+ closure_debug_destroy(&ra->cl);
+ kfree(ra);
+
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_all_replicas_endio(struct bio *bio)
+{
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
+ struct btree_node_read_all *ra = rb->ra;
+
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
+
+ ra->err[rb->idx] = bio->bi_status;
+ closure_put(&ra->cl);
+}
+
+/*
+ * XXX This allocates multiple times from the same mempools, and can deadlock
+ * under sufficient memory pressure (but is only a debug path)
+ */
+static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
+{
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded pick;
+ struct btree_node_read_all *ra;
+ unsigned i;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+
+ closure_init(&ra->cl, NULL);
+ ra->c = c;
+ ra->b = b;
+ ra->nr = bch2_bkey_nr_ptrs(k);
+
+ for (i = 0; i < ra->nr; i++) {
+ ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i],
+ btree_bytes(c)),
+ &c->btree_bio);
+ }
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct btree_read_bio *rb =
+ container_of(ra->bio[i], struct btree_read_bio, bio);
+ rb->c = c;
+ rb->b = b;
+ rb->ra = ra;
+ rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->idx = i;
+ rb->pick = pick;
+ rb->bio.bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
+ rb->bio.bi_iter.bi_sector = pick.ptr.offset;
+ rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+ bio_sectors(&rb->bio));
+ bio_set_dev(&rb->bio, ca->disk_sb.bdev);
+
+ closure_get(&ra->cl);
+ submit_bio(&rb->bio);
+ } else {
+ ra->err[i] = BLK_STS_REMOVED;
+ }
+
+ i++;
+ }
+
+ if (sync) {
+ closure_sync(&ra->cl);
+ btree_node_read_all_replicas_done(&ra->cl);
+ } else {
+ continue_at(&ra->cl, btree_node_read_all_replicas_done,
+ c->io_complete_wq);
+ }
+
+ return 0;
}
void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
@@ -1117,6 +1390,12 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
btree_pos_to_text(&PBUF(buf), c, b);
trace_btree_read(c, b);
+ set_btree_node_read_in_flight(b);
+
+ if (bch2_verify_all_btree_replicas &&
+ !btree_node_read_all_replicas(c, b, sync))
+ return;
+
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick);
if (bch2_fs_fatal_err_on(ret <= 0, c,
@@ -1133,6 +1412,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
&c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
+ rb->b = b;
+ rb->ra = NULL;
rb->start_time = local_clock();
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
@@ -1140,11 +1421,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
- bio->bi_private = b;
bch2_bio_map(bio, b->data, btree_bytes(c));
- set_btree_node_read_in_flight(b);
-
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(bio));
@@ -1153,7 +1431,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
if (sync) {
submit_bio_wait(bio);
- bio->bi_private = b;
btree_node_read_work(&rb->work);
} else {
submit_bio(bio);
@@ -1164,8 +1441,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
if (sync)
btree_node_read_work(&rb->work);
else
- queue_work(system_unbound_wq, &rb->work);
-
+ queue_work(c->io_complete_wq, &rb->work);
}
}
@@ -1332,7 +1608,7 @@ static void btree_node_write_work(struct work_struct *work)
bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
- queue_work(c->wq, &c->btree_write_error_work);
+ queue_work(c->btree_error_wq, &c->btree_write_error_work);
return;
}
@@ -1371,7 +1647,7 @@ static void btree_node_write_endio(struct bio *bio)
container_of(orig, struct btree_write_bio, wbio);
INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(system_unbound_wq, &wb->work);
+ queue_work(c->io_complete_wq, &wb->work);
}
}
@@ -1441,6 +1717,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
return;
if (old & (1 << BTREE_NODE_write_in_flight)) {
+ /*
+ * XXX waiting on btree writes with btree locks held -
+ * this can deadlock, and we hit the write error path
+ */
btree_node_wait_on_io(b);
continue;
}
@@ -1631,7 +1911,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
atomic64_add(sectors_to_write, &c->btree_writes_sectors);
INIT_WORK(&wbio->work, btree_write_submit);
- schedule_work(&wbio->work);
+ queue_work(c->io_complete_wq, &wbio->work);
return;
err:
set_btree_node_noevict(b);
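
Note: btree_node_sectors_written() and the bset-map dumps above both walk the
same on-disk layout - one btree_node header at sector 0, then block-aligned
btree_node_entry headers whose keys.seq must match the first bset's. Roughly:

	/*
	 * sector 0                  block aligned             block aligned
	 * +------------------------+------------------------+--------------
	 * | struct btree_node      | struct btree_node_entry| ...
	 * |  magic, keys.seq       |  keys.seq (must match) |
	 * |  bset 0                |  bset 1                |
	 * +------------------------+------------------------+--------------
	 *
	 * vstruct_sectors() computes each entry's size from its u64s count;
	 * the walk ends at the first entry whose keys.seq differs, and any
	 * later block with a matching seq is the "bset signature after last
	 * bset" the checks above complain about.
	 */
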
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index cadcf7f8..abbc4675 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -13,6 +13,7 @@ struct bch_fs;
struct btree_write;
struct btree;
struct btree_iter;
+struct btree_node_read_all;
static inline bool btree_node_dirty(struct btree *b)
{
@@ -33,8 +34,11 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
struct btree_read_bio {
struct bch_fs *c;
+ struct btree *b;
+ struct btree_node_read_all *ra;
u64 start_time;
unsigned have_ioref:1;
+ unsigned idx:7;
struct extent_ptr_decoded pick;
struct work_struct work;
struct bio bio;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 7f86a39b..bdb068e9 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -2260,6 +2260,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
unsigned expected_nr_iters,
size_t expected_mem_bytes)
+ __acquires(&c->btree_trans_barrier)
{
memset(trans, 0, sizeof(*trans));
trans->c = c;
@@ -2292,6 +2293,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
}
int bch2_trans_exit(struct btree_trans *trans)
+ __releases(&c->btree_trans_barrier)
{
struct bch_fs *c = trans->c;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index bee7ee69..b0484c7a 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -551,6 +551,22 @@ static void btree_update_nodes_written(struct btree_update *as)
BUG_ON(!journal_pin_active(&as->journal));
/*
+ * Wait for any in flight writes to finish before we free the old nodes
+ * on disk:
+ */
+ for (i = 0; i < as->nr_old_nodes; i++) {
+ struct btree_node *bn = READ_ONCE(as->old_nodes[i]->data);
+
+ /*
+	 * This is technically a use after free, but it's only a read -
+	 * though it might still cause problems in userspace, where
+	 * freeing the buffer may unmap it:
+ */
+ if (bn && bn->keys.seq == as->old_nodes_seq[i])
+ btree_node_wait_on_io(as->old_nodes[i]);
+ }
+
+ /*
* We did an update to a parent node where the pointers we added pointed
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
@@ -889,13 +905,9 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
btree_update_will_delete_key(as, &b->key);
- /*
- * XXX: Waiting on io with btree node locks held, we don't want to be
- * doing this. We can't have btree writes happening after the space has
- * been freed, but we really only need to block before
- * btree_update_nodes_written_trans() happens.
- */
- btree_node_wait_on_io(b);
+ as->old_nodes[as->nr_old_nodes] = b;
+ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
+ as->nr_old_nodes++;
}
void bch2_btree_update_done(struct btree_update *as)
@@ -908,7 +920,8 @@ void bch2_btree_update_done(struct btree_update *as)
bch2_btree_reserve_put(as);
- continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
+ continue_at(&as->cl, btree_update_set_nodes_written,
+ as->c->btree_interior_update_worker);
}
struct btree_update *
@@ -1826,7 +1839,10 @@ void async_btree_node_rewrite_work(struct work_struct *work)
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
- struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
+ struct async_btree_rewrite *a;
+
+ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+ return;
if (!percpu_ref_tryget(&c->writes))
return;
@@ -1844,7 +1860,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
a->seq = b->data->keys.seq;
INIT_WORK(&a->work, async_btree_node_rewrite_work);
- queue_work(system_long_wq, &a->work);
+ queue_work(c->btree_interior_update_worker, &a->work);
}
static void __bch2_btree_node_update_key(struct bch_fs *c,
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 7eef3dbb..7ed67b47 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -92,6 +92,10 @@ struct btree_update {
struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
unsigned nr_new_nodes;
+ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
+ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
+ unsigned nr_old_nodes;
+
open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
BCH_REPLICAS_MAX];
open_bucket_idx_t nr_open_buckets;
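
Note: old_nodes/old_nodes_seq support the deferred wait in
btree_update_nodes_written(): instead of blocking with btree locks held when
a node is freed, the update records the node and a snapshot of its keys.seq,
and only waits later if the (possibly freed and reused) node still carries
the same seq. Condensed from the two hunks above:

	/* at "will free" time, under lock: */
	as->old_nodes[i]	= b;
	as->old_nodes_seq[i]	= b->data->keys.seq;

	/* later, in btree_update_nodes_written(): */
	bn = READ_ONCE(as->old_nodes[i]->data);		/* may be freed memory */
	if (bn && bn->keys.seq == as->old_nodes_seq[i])
		btree_node_wait_on_io(as->old_nodes[i]);
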
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index cbd295e4..d07085a2 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -14,6 +14,7 @@
#include "ec.h"
#include "error.h"
#include "movinggc.h"
+#include "reflink.h"
#include "replicas.h"
#include <linux/preempt.h>
@@ -1072,6 +1073,124 @@ static int bch2_mark_stripe(struct bch_fs *c,
return 0;
}
+static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p,
+ u64 p_start, u64 p_end,
+ u64 v_start, u64 v_end)
+{
+ if (p_start == p_end)
+ return false;
+
+ p_start += le64_to_cpu(p.v->idx);
+ p_end += le64_to_cpu(p.v->idx);
+
+ if (p_end <= v_start)
+ return false;
+ if (p_start >= v_end)
+ return false;
+ return true;
+}
+
+static int reflink_p_frag_references(struct bkey_s_c_reflink_p p,
+ u64 start, u64 end,
+ struct bkey_s_c k)
+{
+ return __reflink_p_frag_references(p, start, end,
+ bkey_start_offset(k.k),
+ k.k->p.offset);
+}
+
+static int __bch2_mark_reflink_p(struct bch_fs *c,
+ struct bkey_s_c_reflink_p p,
+ u64 idx, unsigned sectors,
+ unsigned front_frag,
+ unsigned back_frag,
+ unsigned flags,
+ size_t *r_idx)
+{
+ struct reflink_gc *r;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ int frags_referenced;
+
+ while (1) {
+ if (*r_idx >= c->reflink_gc_nr)
+ goto not_found;
+ r = genradix_ptr(&c->reflink_gc_table, *r_idx);
+ BUG_ON(!r);
+
+ if (r->offset > idx)
+ break;
+ (*r_idx)++;
+ }
+
+ frags_referenced =
+ __reflink_p_frag_references(p, 0, front_frag,
+ r->offset - r->size, r->offset) +
+ __reflink_p_frag_references(p, back_frag, p.k->size,
+ r->offset - r->size, r->offset);
+
+ if (frags_referenced == 2) {
+ BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT));
+ add = -add;
+ } else if (frags_referenced == 1) {
+ BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE));
+ add = 0;
+ }
+
+ BUG_ON((s64) r->refcount + add < 0);
+
+ r->refcount += add;
+ return min_t(u64, sectors, r->offset - idx);
+not_found:
+ bch2_fs_inconsistent(c,
+ "%llu:%llu len %u points to nonexistent indirect extent %llu",
+ p.k->p.inode, p.k->p.offset, p.k->size, idx);
+ bch2_inconsistent_error(c);
+ return -EIO;
+}
+
+static int bch2_mark_reflink_p(struct bch_fs *c,
+ struct bkey_s_c_reflink_p p, unsigned offset,
+ s64 sectors, unsigned flags)
+{
+ u64 idx = le64_to_cpu(p.v->idx) + offset;
+ struct reflink_gc *ref;
+ size_t l, r, m;
+ unsigned front_frag, back_frag;
+ s64 ret = 0;
+
+ if (sectors < 0)
+ sectors = -sectors;
+
+ BUG_ON(offset + sectors > p.k->size);
+
+ front_frag = offset;
+ back_frag = offset + sectors;
+
+ l = 0;
+ r = c->reflink_gc_nr;
+ while (l < r) {
+ m = l + (r - l) / 2;
+
+ ref = genradix_ptr(&c->reflink_gc_table, m);
+ if (ref->offset <= idx)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ while (sectors) {
+ ret = __bch2_mark_reflink_p(c, p, idx, sectors,
+ front_frag, back_frag, flags, &l);
+ if (ret < 0)
+ return ret;
+
+ idx += ret;
+ sectors -= ret;
+ }
+
+ return 0;
+}
+
static int bch2_mark_key_locked(struct bch_fs *c,
struct bkey_s_c old,
struct bkey_s_c new,
@@ -1127,6 +1246,10 @@ static int bch2_mark_key_locked(struct bch_fs *c,
fs_usage->persistent_reserved[replicas - 1] += sectors;
break;
}
+ case KEY_TYPE_reflink_p:
+ ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k),
+ offset, sectors, flags);
+ break;
}
preempt_enable();
@@ -1689,35 +1812,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
-static __le64 *bkey_refcount(struct bkey_i *k)
-{
- switch (k->k.type) {
- case KEY_TYPE_reflink_v:
- return &bkey_i_to_reflink_v(k)->v.refcount;
- case KEY_TYPE_indirect_inline_data:
- return &bkey_i_to_indirect_inline_data(k)->v.refcount;
- default:
- return NULL;
- }
-}
-
-static bool reflink_p_frag_references(struct bkey_s_c_reflink_p p,
- u64 start, u64 end,
- struct bkey_s_c k)
-{
- if (start == end)
- return false;
-
- start += le64_to_cpu(p.v->idx);
- end += le64_to_cpu(p.v->idx);
-
- if (end <= bkey_start_offset(k.k))
- return false;
- if (start >= k.k->p.offset)
- return false;
- return true;
-}
-
static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 idx, unsigned sectors,
@@ -1731,6 +1825,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_i *n;
__le64 *refcount;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ int frags_referenced;
s64 ret;
ret = trans_get_key(trans, BTREE_ID_reflink,
@@ -1738,18 +1833,20 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
if (ret < 0)
return ret;
- if (reflink_p_frag_references(p, 0, front_frag, k) &&
- reflink_p_frag_references(p, back_frag, p.k->size, k)) {
+ sectors = min_t(u64, sectors, k.k->p.offset - idx);
+
+ frags_referenced =
+ reflink_p_frag_references(p, 0, front_frag, k) +
+ reflink_p_frag_references(p, back_frag, p.k->size, k);
+
+ if (frags_referenced == 2) {
BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT));
add = -add;
- } else if (reflink_p_frag_references(p, 0, front_frag, k) ||
- reflink_p_frag_references(p, back_frag, p.k->size, k)) {
+ } else if (frags_referenced == 1) {
BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE));
goto out;
}
- sectors = min_t(u64, sectors, k.k->p.offset - idx);
-
n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
@@ -1804,14 +1901,13 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors,
front_frag, back_frag, flags);
if (ret < 0)
- break;
+ return ret;
- idx += ret;
- sectors = max_t(s64, 0LL, sectors - ret);
- ret = 0;
+ idx += ret;
+ sectors -= ret;
}
- return ret;
+ return 0;
}
int bch2_trans_mark_key(struct btree_trans *trans,
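
Note: bch2_mark_reflink_p() above does a textbook lower-bound search:
reflink_gc entries are sorted by offset, which stores the *end* of each
indirect extent (the extent covers [offset - size, offset)), so the first
entry with offset > idx is the candidate containing idx. A standalone model
of the search, with a purely illustrative flat-array layout:

	struct reflink_gc_ent { u64 offset; u32 size; u32 refcount; };

	/* Returns the index of the first entry with offset > idx, or nr if
	 * there is none (the "not found" case in __bch2_mark_reflink_p()). */
	static size_t reflink_gc_lower_bound(struct reflink_gc_ent *tbl,
					     size_t nr, u64 idx)
	{
		size_t l = 0, r = nr, m;

		while (l < r) {
			m = l + (r - l) / 2;
			if (tbl[m].offset <= idx)
				l = m + 1;
			else
				r = m;
		}
		return l;
	}
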
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 08c6af88..00a63fec 100644
--- a/libbcachefs/fs-common.c
+++ b/libbcachefs/fs-common.c
@@ -23,6 +23,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
struct btree_iter *inode_iter = NULL;
struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
u64 now = bch2_current_time(c);
+ u64 cpu = raw_smp_processor_id();
u64 dir_offset = 0;
int ret;
@@ -36,7 +37,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
if (!name)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
- inode_iter = bch2_inode_create(trans, new_inode, U32_MAX);
+ inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu);
ret = PTR_ERR_OR_ZERO(inode_iter);
if (ret)
goto err;
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index eb871634..d8cc32e0 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -13,6 +13,9 @@
#include <linux/mount.h>
#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
+#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
+#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
+#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
struct flags_set {
unsigned mask;
@@ -247,11 +250,54 @@ err1:
return ret;
}
+static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
+{
+ u32 flags;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, arg))
+ return -EFAULT;
+
+ bch_notice(c, "shutdown by ioctl type %u", flags);
+
+ down_write(&c->vfs_sb->s_umount);
+
+ switch (flags) {
+ case FSOP_GOING_FLAGS_DEFAULT:
+ ret = freeze_bdev(c->vfs_sb->s_bdev);
+ if (ret)
+ goto err;
+
+ bch2_journal_flush(&c->journal);
+ c->vfs_sb->s_flags |= SB_RDONLY;
+ bch2_fs_emergency_read_only(c);
+ thaw_bdev(c->vfs_sb->s_bdev);
+ break;
+
+ case FSOP_GOING_FLAGS_LOGFLUSH:
+ bch2_journal_flush(&c->journal);
+ fallthrough;
+
+ case FSOP_GOING_FLAGS_NOLOGFLUSH:
+ c->vfs_sb->s_flags |= SB_RDONLY;
+ bch2_fs_emergency_read_only(c);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+err:
+ up_write(&c->vfs_sb->s_umount);
+ return ret;
+}
+
long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct bch_inode_info *inode = file_bch_inode(file);
- struct super_block *sb = inode->v.i_sb;
- struct bch_fs *c = sb->s_fs_info;
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
switch (cmd) {
case FS_IOC_GETFLAGS:
@@ -276,15 +322,7 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -ENOTTY;
case FS_IOC_GOINGDOWN:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- down_write(&sb->s_umount);
- sb->s_flags |= SB_RDONLY;
- if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only due to ioctl");
- up_write(&sb->s_umount);
- return 0;
+ return bch2_ioc_goingdown(c, (u32 __user *) arg);
default:
return bch2_fs_ioctl(c, cmd, (void __user *) arg);
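
Note: for completeness, a hypothetical userspace caller of the new shutdown
ioctl (constants copied from the definitions added above; any open fd on the
filesystem works):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>

	#define FS_IOC_GOINGDOWN		_IOR('X', 125, __u32)
	#define FSOP_GOING_FLAGS_LOGFLUSH	0x1	/* flush journal, not data */

	int main(int argc, char **argv)
	{
		__u32 flags = FSOP_GOING_FLAGS_LOGFLUSH;
		int fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);

		if (fd < 0 || ioctl(fd, FS_IOC_GOINGDOWN, &flags) < 0) {
			perror("FS_IOC_GOINGDOWN");
			return 1;
		}
		close(fd);
		return 0;
	}
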
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 25a9fc14..e8a329c9 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1578,6 +1578,8 @@ got_sb:
break;
}
+ c->dev = sb->s_dev;
+
#ifdef CONFIG_BCACHEFS_POSIX_ACL
if (c->opts.acl)
sb->s_flags |= SB_POSIXACL;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index c5892e42..6b43a971 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -472,23 +472,28 @@ static inline u32 bkey_generation(struct bkey_s_c k)
struct btree_iter *bch2_inode_create(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
- u32 snapshot)
+ u32 snapshot, u64 cpu)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 min, max, start, pos, *hint;
int ret = 0;
+ unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
- u64 cpu = raw_smp_processor_id();
- unsigned bits = (c->opts.inodes_32bit
- ? 31 : 63) - c->inode_shard_bits;
+ if (c->opts.shard_inode_numbers) {
+ bits -= c->inode_shard_bits;
- min = (cpu << bits);
- max = (cpu << bits) | ~(ULLONG_MAX << bits);
+ min = (cpu << bits);
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
- min = max_t(u64, min, BLOCKDEV_INODE_MAX);
- hint = c->unused_inode_hints + cpu;
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+ hint = c->unused_inode_hints + cpu;
+ } else {
+ min = BLOCKDEV_INODE_MAX;
+ max = ~(ULLONG_MAX << bits);
+ hint = c->unused_inode_hints;
+ }
start = READ_ONCE(*hint);
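
Note: with shard_inode_numbers enabled, each CPU allocates from a disjoint
slice of the inode number space, so concurrent creates don't contend on
neighboring btree keys. A worked example (64-bit inums, so bits starts at 63;
inode_shard_bits = 3 is an assumed value here):

	/*
	 *	bits = 63 - 3 = 60
	 *
	 *	cpu 0: [max(BLOCKDEV_INODE_MAX, 0 << 60), (0 << 60) | ~(~0ULL << 60)]
	 *	cpu 5: [5 << 60,                          (5 << 60) | ~(~0ULL << 60)]
	 *
	 * i.e. CPU 5 only hands out inums in [5 << 60, 6 << 60); with the
	 * option off, everyone shares the single range starting at
	 * BLOCKDEV_INODE_MAX and a single allocation hint.
	 */
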
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 558d5464..2cb081ae 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -70,7 +70,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
struct bch_inode_unpacked *);
struct btree_iter *bch2_inode_create(struct btree_trans *,
- struct bch_inode_unpacked *, u32);
+ struct bch_inode_unpacked *, u32, u64);
int bch2_inode_rm(struct bch_fs *, u64, bool);
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 9b6aece7..157b2a0f 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1439,7 +1439,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
bch2_migrate_read_done(&op->write, rbio);
closure_init(cl, NULL);
- closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
+ closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl);
closure_return_with_destructor(cl, promote_done);
}
@@ -1822,6 +1822,13 @@ static void __bch2_read_endio(struct work_struct *work)
if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
goto csum_err;
+ /*
+ * XXX
+ * We need to rework the narrow_crcs path to deliver the read completion
+ * first, and then punt to a different workqueue, otherwise we're
+ * holding up reads while doing btree updates which is bad for memory
+ * reclaim.
+ */
if (unlikely(rbio->narrow_crcs))
bch2_rbio_narrow_crcs(rbio);
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 144dc934..bc0a0bd6 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -58,7 +58,7 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
? op->c->copygc_wq
- : op->c->wq;
+ : op->c->btree_update_wq;
}
int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 52efa463..af5386d9 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -118,7 +118,9 @@ void bch2_journal_halt(struct journal *j)
void __bch2_journal_buf_put(struct journal *j)
{
- closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
}
/*
@@ -304,7 +306,7 @@ static int journal_entry_open(struct journal *j)
j->res_get_blocked_start);
j->res_get_blocked_start = 0;
- mod_delayed_work(system_freezable_wq,
+ mod_delayed_work(c->io_complete_wq,
&j->write_work,
msecs_to_jiffies(j->write_delay_ms));
journal_wake(j);
@@ -805,10 +807,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
long b;
if (new_fs) {
- percpu_down_read(&c->mark_lock);
b = bch2_bucket_alloc_new_fs(ca);
if (b < 0) {
- percpu_up_read(&c->mark_lock);
ret = -ENOSPC;
goto err;
}
@@ -825,7 +825,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
b = sector_to_bucket(ca, ob->ptr.offset);
}
- spin_lock(&c->journal.lock);
+ if (c)
+ spin_lock(&c->journal.lock);
/*
* XXX
@@ -852,14 +853,14 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- spin_unlock(&c->journal.lock);
+ if (c)
+ spin_unlock(&c->journal.lock);
if (new_fs) {
bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
0);
- percpu_up_read(&c->mark_lock);
} else {
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_trans_mark_metadata_bucket(&trans, ca,
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 635cceb4..2da6839f 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -834,7 +834,7 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
unsigned i;
for (i = 0; i < j->nr_ptrs; i++) {
- struct bch_dev *ca = c->devs[j->ptrs[i].dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
u64 offset;
div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
@@ -1233,8 +1233,6 @@ static void journal_write_done(struct closure *cl)
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bch_devs_list devs =
- bch2_bkey_devs(bkey_i_to_s_c(&w->key));
struct bch_replicas_padded replicas;
union journal_res_state old, new;
u64 v, seq;
@@ -1242,11 +1240,12 @@ static void journal_write_done(struct closure *cl)
bch2_time_stats_update(j->write_time, j->write_start_time);
- if (!devs.nr) {
+ if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
err = -EIO;
} else {
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
if (bch2_mark_replicas(c, &replicas.e))
err = -EIO;
}
@@ -1258,7 +1257,7 @@ static void journal_write_done(struct closure *cl)
seq = le64_to_cpu(w->data->seq);
if (seq >= j->pin.front)
- journal_seq_pin(j, seq)->devs = devs;
+ journal_seq_pin(j, seq)->devs = w->devs_written;
j->seq_ondisk = seq;
if (err && (!j->err_seq || seq < j->err_seq))
@@ -1296,27 +1295,27 @@ static void journal_write_done(struct closure *cl)
journal_wake(j);
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
- mod_delayed_work(system_freezable_wq, &j->write_work, 0);
+ mod_delayed_work(c->io_complete_wq, &j->write_work, 0);
spin_unlock(&j->lock);
if (new.unwritten_idx != new.idx &&
!journal_state_count(new, new.unwritten_idx))
- closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
}
static void journal_write_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ unsigned long flags;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
+ le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
- struct journal_buf *w = journal_last_unwritten_buf(j);
- unsigned long flags;
-
spin_lock_irqsave(&j->err_lock, flags);
- bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
+ bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
@@ -1370,7 +1369,7 @@ static void do_journal_write(struct closure *cl)
le64_to_cpu(w->data->seq);
}
- continue_at(cl, journal_write_done, system_highpri_wq);
+ continue_at(cl, journal_write_done, c->io_complete_wq);
return;
}
@@ -1402,7 +1401,8 @@ void bch2_journal_write(struct closure *cl)
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
- jset->last_seq = w->last_seq = 0;
+ jset->last_seq = 0;
+ w->last_seq = 0;
j->nr_noflush_writes++;
} else {
@@ -1509,14 +1509,12 @@ retry_alloc:
journal_debug_buf);
kfree(journal_debug_buf);
bch2_fatal_error(c);
- continue_at(cl, journal_write_done, system_highpri_wq);
+ continue_at(cl, journal_write_done, c->io_complete_wq);
return;
}
- /*
- * XXX: we really should just disable the entire journal in nochanges
- * mode
- */
+ w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+
if (c->opts.nochanges)
goto no_io;
@@ -1542,14 +1540,14 @@ retry_alloc:
bch2_bucket_seq_cleanup(c);
- continue_at(cl, do_journal_write, system_highpri_wq);
+ continue_at(cl, do_journal_write, c->io_complete_wq);
return;
no_io:
bch2_bucket_seq_cleanup(c);
- continue_at(cl, journal_write_done, system_highpri_wq);
+ continue_at(cl, journal_write_done, c->io_complete_wq);
return;
err:
bch2_inconsistent_error(c);
- continue_at(cl, journal_write_done, system_highpri_wq);
+ continue_at(cl, journal_write_done, c->io_complete_wq);
}
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 427be2da..7a0ae5d3 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -93,6 +93,10 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
* until we write it out - thus, account for it here:
*/
while ((unwritten = get_unwritten_sectors(j, &idx))) {
+ /* entry won't fit on this device, skip: */
+ if (unwritten > ca->mi.bucket_size)
+ continue;
+
if (unwritten >= sectors) {
if (!buckets) {
sectors = 0;
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index e1b63f38..f2060f90 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -111,8 +111,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
bl->start[nr].start = cpu_to_le64(start);
bl->start[nr].end = cpu_to_le64(end);
out_write_sb:
- c->disk_sb.sb->features[0] |=
- 1ULL << BCH_FEATURE_journal_seq_blacklist_v3;
+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
ret = bch2_write_super(c);
out:
@@ -298,8 +297,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
BUG_ON(new_nr && !bl);
if (!new_nr)
- c->disk_sb.sb->features[0] &=
- ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
bch2_write_super(c);
}
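
Note: the two blacklist changes are endianness fixes - features[0] is a
__le64, so the mask has to be byte-swapped on big-endian hosts before being
OR-ed in. Sketch (the feature's bit number is illustrative):

	__le64 f = sb->features[0];

	f |= cpu_to_le64(1ULL << 40);	/* sets on-disk bit 40 on any host */
	f |= 1ULL << 40;		/* big-endian host: byte 5 of the BE
					 * value lands in byte 2 on disk -
					 * wrong bit */
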
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index cacab22a..61674ae1 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -21,6 +21,7 @@ struct journal_buf {
struct jset *data;
__BKEY_PADDED(key, BCH_REPLICAS_MAX);
+ struct bch_devs_list devs_written;
struct closure_waitlist wait;
u64 last_seq; /* copy of data->last_seq */
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 778ff72c..2fa763e3 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -523,6 +523,11 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos,
if (ret)
goto err;
+ if (!k.k || bkey_cmp(k.k->p, pos)) {
+ ret = -ENOENT;
+ goto err;
+ }
+
ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
if (ret)
goto err;
@@ -921,8 +926,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
c->disk_sb.sb->version_min = c->disk_sb.sb->version;
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 61c5901f..2acca0dd 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -317,6 +317,8 @@ static int bch2_copygc_thread(void *arg)
set_freezable();
while (!kthread_should_stop()) {
+ cond_resched();
+
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
@@ -324,6 +326,7 @@ static int bch2_copygc_thread(void *arg)
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
+ trace_copygc_wait(c, wait, last + wait);
c->copygc_wait = last + wait;
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 001e865c..1e2fc5de 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -165,8 +165,13 @@ enum opt_type {
x(inodes_32bit, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- BCH_SB_INODE_32BIT, false, \
+ BCH_SB_INODE_32BIT, true, \
NULL, "Constrain inode numbers to 32 bits") \
+ x(shard_inode_numbers, u8, \
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_SHARD_INUMS, false, \
+ NULL, "Shard new inode numbers by CPU id") \
x(gc_reserve_percent, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index cd538ecc..9bd63488 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -716,7 +716,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_dev_usage: {
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
- struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
sizeof(struct jset_entry_dev_usage_type);
@@ -755,7 +755,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
- atomic64_set(&c->io_clock[clock->rw].now, clock->time);
+ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
}
}
@@ -1217,13 +1217,13 @@ use_clean:
mutex_lock(&c->sb_lock);
if (c->opts.version_upgrade) {
- c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
write_sb = true;
}
if (!test_bit(BCH_FS_ERROR, &c->flags)) {
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
write_sb = true;
}
@@ -1278,12 +1278,12 @@ int bch2_fs_initialize(struct bch_fs *c)
bch_notice(c, "initializing new filesystem");
mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
if (c->opts.version_upgrade) {
- c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
bch2_write_super(c);
}
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index c624fabe..a4207292 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
- refcount = (void *) &r_v->v;
+ refcount = bkey_refcount(r_v);
*refcount = 0;
memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
@@ -181,18 +181,19 @@ err:
static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
{
- struct bkey_s_c k = bch2_btree_iter_peek(iter);
+ struct bkey_s_c k;
int ret;
for_each_btree_key_continue(iter, 0, k, ret) {
if (bkey_cmp(iter->pos, end) >= 0)
- return bkey_s_c_null;
+ break;
if (bkey_extent_is_data(k.k))
- break;
+ return k;
}
- return k;
+ bch2_btree_iter_set_pos(iter, end);
+ return bkey_s_c_null;
}
s64 bch2_remap_range(struct bch_fs *c,
@@ -205,8 +206,8 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bkey_s_c src_k;
struct bkey_buf new_dst, new_src;
struct bpos dst_end = dst_start, src_end = src_start;
- struct bpos dst_want, src_want;
- u64 src_done, dst_done;
+ struct bpos src_want;
+ u64 dst_done;
int ret = 0, ret2 = 0;
if (!percpu_ref_tryget(&c->writes))
@@ -226,7 +227,8 @@ s64 bch2_remap_range(struct bch_fs *c,
dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
BTREE_ITER_INTENT);
- while (ret == 0 || ret == -EINTR) {
+ while ((ret == 0 || ret == -EINTR) &&
+ bkey_cmp(dst_iter->pos, dst_end) < 0) {
struct disk_reservation disk_res = { 0 };
bch2_trans_begin(&trans);
@@ -236,32 +238,29 @@ s64 bch2_remap_range(struct bch_fs *c,
break;
}
+ dst_done = dst_iter->pos.offset - dst_start.offset;
+ src_want = POS(src_start.inode, src_start.offset + dst_done);
+ bch2_btree_iter_set_pos(src_iter, src_want);
+
src_k = get_next_src(src_iter, src_end);
ret = bkey_err(src_k);
if (ret)
continue;
- src_done = bpos_min(src_iter->pos, src_end).offset -
- src_start.offset;
- dst_want = POS(dst_start.inode, dst_start.offset + src_done);
-
- if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
- ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
- journal_seq, i_sectors_delta);
+ if (bkey_cmp(src_want, src_iter->pos) < 0) {
+ ret = bch2_fpunch_at(&trans, dst_iter,
+ bpos_min(dst_end,
+ POS(dst_iter->pos.inode, dst_iter->pos.offset +
+ src_iter->pos.offset - src_want.offset)),
+ journal_seq, i_sectors_delta);
continue;
}
- BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
-
- if (!bkey_cmp(dst_iter->pos, dst_end))
- break;
-
if (src_k.k->type != KEY_TYPE_reflink_p) {
bch2_bkey_buf_reassemble(&new_src, c, src_k);
src_k = bkey_i_to_s_c(new_src.k);
- bch2_cut_front(src_iter->pos, new_src.k);
- bch2_cut_back(src_end, new_src.k);
+ bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k));
ret = bch2_make_extent_indirect(&trans, src_iter,
new_src.k);
@@ -278,7 +277,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bkey_reflink_p_init(new_dst.k);
u64 offset = le64_to_cpu(src_p.v->idx) +
- (src_iter->pos.offset -
+ (src_want.offset -
bkey_start_offset(src_k.k));
dst_p->v.idx = cpu_to_le64(offset);
@@ -288,20 +287,13 @@ s64 bch2_remap_range(struct bch_fs *c,
new_dst.k->k.p = dst_iter->pos;
bch2_key_resize(&new_dst.k->k,
- min(src_k.k->p.offset - src_iter->pos.offset,
+ min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter->pos.offset));
-
ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
&disk_res, journal_seq,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);
- if (ret)
- continue;
-
- dst_done = dst_iter->pos.offset - dst_start.offset;
- src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(src_iter, src_want);
}
bch2_trans_iter_put(&trans, dst_iter);
bch2_trans_iter_put(&trans, src_iter);
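
The rewritten loop makes destination progress the single source of truth: each pass derives src_want from how far dst_iter has advanced, repositions src_iter there, and treats any gap before the next source extent as a hole to punch at the destination. The standalone sketch below (illustrative names, plain integers standing in for struct bpos offsets, and the bpos_min() clamp against dst_end omitted) models just the hole-size arithmetic from the bch2_fpunch_at() call above:

	#include <stdint.h>
	#include <stdio.h>

	/* Sectors of hole between where we wanted the source iterator to
	 * be and where the next data extent actually starts. */
	static uint64_t hole_sectors(uint64_t src_want, uint64_t next_extent_start)
	{
		return next_extent_start - src_want;
	}

	int main(void)
	{
		uint64_t dst_pos = 100, src_want = 40, next_extent = 56;

		/* Mirrors POS(dst_iter->pos.inode, dst_iter->pos.offset +
		 * src_iter->pos.offset - src_want.offset): */
		printf("punch dst sectors [%llu, %llu)\n",
		       (unsigned long long) dst_pos,
		       (unsigned long long) (dst_pos +
				hole_sectors(src_want, next_extent)));
		return 0;
	}
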
diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h
index 9d5e7dc5..bfc78561 100644
--- a/libbcachefs/reflink.h
+++ b/libbcachefs/reflink.h
@@ -34,6 +34,30 @@ void bch2_indirect_inline_data_to_text(struct printbuf *,
.val_to_text = bch2_indirect_inline_data_to_text, \
}
+static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_s_c_to_reflink_v(k).v->refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
+ default:
+ return NULL;
+ }
+}
+
+static inline __le64 *bkey_refcount(struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_i_to_reflink_v(k)->v.refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+ default:
+ return NULL;
+ }
+}
+
s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
u64, u64 *, u64, s64 *);
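
These accessors exist because two key types now carry a refcount in the same leading value slot, reflink_v and indirect_inline_data; bch2_make_extent_indirect() above already uses the mutable variant instead of casting &r_v->v directly. A hedged read-side sketch:

	/* Print an indirect extent's refcount if the key type has one;
	 * bkey_refcount_c() returns NULL for everything else, so the
	 * caller needs no switch on key type. */
	const __le64 *refcount = bkey_refcount_c(k);

	if (refcount)
		pr_buf(out, " refcount %llu", (u64) le64_to_cpu(*refcount));
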
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 74a75ced..97788516 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -982,7 +982,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS;
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -999,7 +999,7 @@ static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
- entry->u64s = u64s - 1;
+ entry->u64s = cpu_to_le16(u64s - 1);
*end = vstruct_next(*end);
return entry;
@@ -1092,7 +1092,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
clock->entry.type = BCH_JSET_ENTRY_clock;
clock->rw = i;
- clock->time = atomic64_read(&c->io_clock[i].now);
+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
}
}
@@ -1109,10 +1109,10 @@ void bch2_fs_mark_clean(struct bch_fs *c)
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata;
- c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
- c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
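
One subtlety in the bch2_fs_mark_clean() hunk: clearing a feature bit by masking with cpu_to_le64(~mask) is exactly equivalent to ~cpu_to_le64(mask), because byte-swapping commutes with bitwise NOT. The little self-check below demonstrates the identity with a plain byteswap:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t mask = 1ULL << 7;	/* any single feature bit */

		/* NOT commutes with byteswap: complementing before or after
		 * the swap clears the same on-disk bit. */
		assert(__builtin_bswap64(~mask) == ~__builtin_bswap64(mask));
		return 0;
	}
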
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 3b1e9203..4c679363 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -509,10 +509,14 @@ static void __bch2_fs_free(struct bch_fs *c)
kfree(c->unused_inode_hints);
free_heap(&c->copygc_heap);
+	if (c->io_complete_wq)
+		destroy_workqueue(c->io_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
- if (c->wq)
- destroy_workqueue(c->wq);
+ if (c->btree_error_wq)
+ destroy_workqueue(c->btree_error_wq);
+ if (c->btree_update_wq)
+ destroy_workqueue(c->btree_update_wq);
bch2_free_super(&c->disk_sb);
kvpfree(c, sizeof(*c));
@@ -760,10 +764,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
- if (!(c->wq = alloc_workqueue("bcachefs",
+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ !(c->btree_error_wq = alloc_workqueue("bcachefs_error",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
+ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
@@ -1437,7 +1445,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
/* Device add/removal: */
-int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
struct btree_trans trans;
size_t i;
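
The old multi-purpose c->wq is split three ways: btree_update_wq and btree_error_wq keep the single-threaded WQ_CPU_INTENSIVE setup, while the new io_complete_wq is WQ_HIGHPRI so I/O completions are not queued behind long-running btree work. The NULL checks added to __bch2_fs_free() mirror the allocations, keeping teardown of a partially constructed bch_fs safe. A minimal sketch of that alloc/free pairing, with hypothetical names:

	#include <linux/errno.h>
	#include <linux/workqueue.h>

	struct example_ctx {
		struct workqueue_struct *io_wq;
	};

	static int example_alloc(struct example_ctx *c)
	{
		c->io_wq = alloc_workqueue("example_io",
				WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1);
		return c->io_wq ? 0 : -ENOMEM;
	}

	static void example_free(struct example_ctx *c)
	{
		/* Safe even if example_alloc() failed partway. */
		if (c->io_wq)
			destroy_workqueue(c->io_wq);
	}
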
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 21ef7719..84a7acb0 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -312,7 +312,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
return 0;
}
-void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
+static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
{
pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
bch2_bpos_to_text(out, c->gc_gens_pos);