summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2018-05-04 14:04:31 -0400
committerKent Overstreet <kent.overstreet@gmail.com>2018-05-04 14:05:32 -0400
commit018de5aa899937a9dc3bc8cb9819cb218a59abf3 (patch)
tree554b99e5dafe04f5bf9201a3c54bd1b0f39f77f3
parentc598d91dcb0c7e95abdacb2711898ae14ab52ca1 (diff)
Update bcachefs sources to ed4aea2ad4 bcachefs: fix gcc warning
-rw-r--r--.bcachefs_revision2
-rw-r--r--libbcachefs/alloc.c1
-rw-r--r--libbcachefs/bcachefs.h35
-rw-r--r--libbcachefs/bcachefs_format.h53
-rw-r--r--libbcachefs/btree_io.c118
-rw-r--r--libbcachefs/btree_io.h2
-rw-r--r--libbcachefs/btree_iter.c4
-rw-r--r--libbcachefs/btree_update_interior.c1
-rw-r--r--libbcachefs/btree_update_leaf.c13
-rw-r--r--libbcachefs/buckets.h3
-rw-r--r--libbcachefs/debug.c12
-rw-r--r--libbcachefs/error.c10
-rw-r--r--libbcachefs/error.h6
-rw-r--r--libbcachefs/extents.c89
-rw-r--r--libbcachefs/extents.h12
-rw-r--r--libbcachefs/extents_types.h1
-rw-r--r--libbcachefs/fs-io.c271
-rw-r--r--libbcachefs/fs.c1
-rw-r--r--libbcachefs/fs.h2
-rw-r--r--libbcachefs/io.c832
-rw-r--r--libbcachefs/io.h33
-rw-r--r--libbcachefs/io_types.h21
-rw-r--r--libbcachefs/journal.c2486
-rw-r--r--libbcachefs/journal.h110
-rw-r--r--libbcachefs/journal_io.c1423
-rw-r--r--libbcachefs/journal_io.h45
-rw-r--r--libbcachefs/journal_reclaim.c411
-rw-r--r--libbcachefs/journal_reclaim.h36
-rw-r--r--libbcachefs/journal_seq_blacklist.c358
-rw-r--r--libbcachefs/journal_seq_blacklist.h13
-rw-r--r--libbcachefs/journal_types.h8
-rw-r--r--libbcachefs/move.c31
-rw-r--r--libbcachefs/super.c49
-rw-r--r--libbcachefs/super.h21
-rw-r--r--libbcachefs/sysfs.c100
-rw-r--r--libbcachefs/util.c201
-rw-r--r--libbcachefs/util.h101
37 files changed, 3916 insertions, 2999 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index a7c36b9..37d51b2 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-edf5f38218f699e53913a549465f35d36c4418f7
+ed4aea2ad4fa1b3891684cbd071d1a1ae9094342
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 16bdc48..256adb5 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -69,6 +69,7 @@
#include "extents.h"
#include "io.h"
#include "journal.h"
+#include "journal_io.h"
#include "super-io.h"
#include <linux/blkdev.h>
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index bc10324..206c30f 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -271,17 +271,19 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
-/* name, frequency_units, duration_units */
-#define BCH_TIME_STATS() \
- BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \
- BCH_TIME_STAT(btree_gc, sec, ms) \
- BCH_TIME_STAT(btree_split, sec, us) \
- BCH_TIME_STAT(btree_sort, ms, us) \
- BCH_TIME_STAT(btree_read, ms, us) \
- BCH_TIME_STAT(journal_write, us, us) \
- BCH_TIME_STAT(journal_delay, ms, us) \
- BCH_TIME_STAT(journal_blocked, sec, ms) \
- BCH_TIME_STAT(journal_flush_seq, us, us)
+#define BCH_TIME_STATS() \
+ BCH_TIME_STAT(btree_node_mem_alloc) \
+ BCH_TIME_STAT(btree_gc) \
+ BCH_TIME_STAT(btree_split) \
+ BCH_TIME_STAT(btree_sort) \
+ BCH_TIME_STAT(btree_read) \
+ BCH_TIME_STAT(data_write) \
+ BCH_TIME_STAT(data_read) \
+ BCH_TIME_STAT(data_promote) \
+ BCH_TIME_STAT(journal_write) \
+ BCH_TIME_STAT(journal_delay) \
+ BCH_TIME_STAT(journal_blocked) \
+ BCH_TIME_STAT(journal_flush_seq)
#include "alloc_types.h"
#include "buckets_types.h"
@@ -416,7 +418,12 @@ struct bch_dev {
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
- atomic_t latency[2];
+ atomic64_t cur_latency[2];
+ struct time_stats io_latency[2];
+
+#define CONGESTED_MAX 1024
+ atomic_t congested;
+ u64 congested_last;
struct io_count __percpu *io_done;
};
@@ -644,6 +651,7 @@ struct bch_fs {
struct bio_set bio_write;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
+ struct rhashtable promote_table;
mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_NR];
@@ -708,12 +716,13 @@ struct bch_fs {
unsigned copy_gc_enabled:1;
unsigned rebalance_enabled:1;
unsigned rebalance_percent;
+ bool promote_whole_extents;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
+#define BCH_TIME_STAT(name) \
struct time_stats name##_time;
BCH_TIME_STATS()
#undef BCH_TIME_STAT
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index eed6fb8..48d14a3 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1088,13 +1088,14 @@ LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
- struct bch_sb, flags[1], 28, 32);
LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
+ struct bch_sb, flags[2], 0, 4);
+
/* Features: */
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
@@ -1193,29 +1194,41 @@ struct jset_entry {
};
};
+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
+
+#define BCH_JSET_ENTRY_TYPES() \
+ x(btree_keys, 0) \
+ x(btree_root, 1) \
+ x(prio_ptrs, 2) \
+ x(blacklist, 3) \
+ x(blacklist_v2, 4)
+
+enum {
+#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
+ BCH_JSET_ENTRY_TYPES()
+#undef x
+ BCH_JSET_ENTRY_NR
+};
+
+/*
+ * Journal sequence numbers can be blacklisted: bsets record the max sequence
+ * number of all the journal entries they contain updates for, so that on
+ * recovery we can ignore those bsets that contain index updates newer that what
+ * made it into the journal.
+ *
+ * This means that we can't reuse that journal_seq - we have to skip it, and
+ * then record that we skipped it so that the next time we crash and recover we
+ * don't think there was a missing journal entry.
+ */
struct jset_entry_blacklist {
struct jset_entry entry;
__le64 seq;
};
-#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
-
-enum {
- JOURNAL_ENTRY_BTREE_KEYS = 0,
- JOURNAL_ENTRY_BTREE_ROOT = 1,
- JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */
-
- /*
- * Journal sequence numbers can be blacklisted: bsets record the max
- * sequence number of all the journal entries they contain updates for,
- * so that on recovery we can ignore those bsets that contain index
- * updates newer that what made it into the journal.
- *
- * This means that we can't reuse that journal_seq - we have to skip it,
- * and then record that we skipped it so that the next time we crash and
- * recover we don't think there was a missing journal entry.
- */
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
+struct jset_entry_blacklist_v2 {
+ struct jset_entry entry;
+ __le64 start;
+ __le64 end;
};
/*
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 0525c3b..1aa9422 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -13,7 +13,8 @@
#include "error.h"
#include "extents.h"
#include "io.h"
-#include "journal.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
#include "super-io.h"
#include <trace/events/bcachefs.h>
@@ -947,6 +948,7 @@ enum btree_validate_ret {
#define btree_err(type, c, b, i, msg, ...) \
({ \
+ __label__ out; \
char _buf[300], *out = _buf, *end = out + sizeof(_buf); \
\
out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
@@ -956,7 +958,11 @@ enum btree_validate_ret {
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
mustfix_fsck_err(c, "%s", _buf); \
- } else { \
+ goto out; \
+ } \
+ \
+ switch (write) { \
+ case READ: \
bch_err(c, "%s", _buf); \
\
switch (type) { \
@@ -976,7 +982,17 @@ enum btree_validate_ret {
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
+ break; \
+ case WRITE: \
+ bch_err(c, "corrupt metadata before write: %s", _buf); \
+ \
+ if (bch2_fs_inconsistent(c)) { \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+ break; \
} \
+out: \
true; \
})
@@ -1323,37 +1339,48 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_devs_mask avoid;
+ bool can_retry;
memset(&avoid, 0, sizeof(avoid));
goto start;
- do {
+ while (1) {
bch_info(c, "retrying read");
+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio);
- bio_set_dev(bio, rb->pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
- submit_bio_wait(bio);
+
+ if (rb->have_ioref) {
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ submit_bio_wait(bio);
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+ }
start:
- bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
- percpu_ref_put(&rb->pick.ca->io_ref);
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+ if (rb->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+ rb->have_ioref = false;
- __set_bit(rb->pick.ca->dev_idx, avoid.d);
- rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
+ __set_bit(rb->pick.ptr.dev, avoid.d);
+ can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
- goto out;
- } while (!IS_ERR_OR_NULL(rb->pick.ca));
+ !bch2_btree_node_read_done(c, b, can_retry))
+ break;
- set_btree_node_read_error(b);
-out:
- if (!IS_ERR_OR_NULL(rb->pick.ca))
- percpu_ref_put(&rb->pick.ca->io_ref);
+ if (!can_retry) {
+ set_btree_node_read_error(b);
+ break;
+ }
+ }
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bio_put(&rb->bio);
@@ -1365,10 +1392,13 @@ static void btree_node_read_endio(struct bio *bio)
{
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
- bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
- INIT_WORK(&rb->work, btree_node_read_work);
queue_work(system_unbound_wq, &rb->work);
}
@@ -1377,41 +1407,58 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
{
struct extent_pick_ptr pick;
struct btree_read_bio *rb;
+ struct bch_dev *ca;
struct bio *bio;
+ int ret;
trace_btree_read(c, b);
- pick = bch2_btree_pick_ptr(c, b, NULL);
- if (bch2_fs_fatal_err_on(!pick.ca, c,
+ ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
+ if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from")) {
set_btree_node_read_error(b);
return;
}
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
- bio_set_dev(bio, pick.ca->disk_sb.bdev);
+ INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = b;
bch2_bio_map(bio, b->data);
- this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
- bio_sectors(bio));
-
set_btree_node_read_in_flight(b);
- if (sync) {
- submit_bio_wait(bio);
- bio->bi_private = b;
- btree_node_read_work(&rb->work);
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+ bio_sectors(bio));
+ bio_set_dev(bio, ca->disk_sb.bdev);
+
+ if (sync) {
+ submit_bio_wait(bio);
+
+ bio->bi_private = b;
+ btree_node_read_work(&rb->work);
+ } else {
+ submit_bio(bio);
+ }
} else {
- bio->bi_end_io = btree_node_read_endio;
- bio->bi_private = b;
- submit_bio(bio);
+ bio->bi_status = BLK_STS_REMOVED;
+
+ if (sync)
+ btree_node_read_work(&rb->work);
+ else
+ queue_work(system_unbound_wq, &rb->work);
+
}
}
@@ -1593,20 +1640,21 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->ca;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;
- bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+ if (wbio->have_ioref)
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+ bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
- if (wbio->have_io_ref)
+ if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
if (parent) {
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 01df817..947685f 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -12,8 +12,8 @@ struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
- unsigned submit_time_us;
u64 start_time;
+ unsigned have_ioref:1;
struct extent_pick_ptr pick;
struct work_struct work;
struct bio bio;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 465aadb..69cad3b 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -748,7 +748,9 @@ static void btree_iter_prefetch(struct btree_iter *iter)
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
- unsigned nr = iter->level > 1 ? 1 : 8;
+ unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags)
+ ? (iter->level > 1 ? 0 : 2)
+ : (iter->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(iter, iter->level);
while (nr) {
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 6369692..adba309 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -12,6 +12,7 @@
#include "buckets.h"
#include "extents.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"
#include "super-io.h"
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 53b39de..92fb5f6 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -8,6 +8,7 @@
#include "debug.h"
#include "extents.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include <linux/sort.h>
@@ -137,7 +138,7 @@ void bch2_btree_journal_key(struct btree_insert *trans,
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
- if (likely(trans->journal_res.ref)) {
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
@@ -155,12 +156,16 @@ void bch2_btree_journal_key(struct btree_insert *trans,
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}
- if (unlikely(!journal_pin_active(&w->journal)))
- bch2_journal_pin_add(j, &trans->journal_res,
- &w->journal,
+ if (unlikely(!journal_pin_active(&w->journal))) {
+ u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ ? trans->journal_res.seq
+ : j->replay_journal_seq;
+
+ bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
+ }
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 399a853..01f0b31 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -142,7 +142,8 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
if (WARN_ONCE(stats.buckets_unavailable > total,
- "buckets_unavailable overflow\n"))
+ "buckets_unavailable overflow (%llu > %llu)\n",
+ stats.buckets_unavailable, total))
return 0;
return total - stats.buckets_unavailable;
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 7190990..71f649b 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -36,6 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
struct bset *sorted, *inmemory;
struct extent_pick_ptr pick;
+ struct bch_dev *ca;
struct bio *bio;
if (c->opts.nochanges)
@@ -54,12 +55,15 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
v->btree_id = b->btree_id;
bch2_btree_keys_init(v, &c->expensive_debug_checks);
- pick = bch2_btree_pick_ptr(c, b, NULL);
- if (IS_ERR_OR_NULL(pick.ca))
+ if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
+ return;
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ if (!bch2_dev_get_ioref(ca, READ))
return;
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
- bio_set_dev(bio, pick.ca->disk_sb.bdev);
+ bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
@@ -68,7 +72,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
submit_bio_wait(bio);
bio_put(bio);
- percpu_ref_put(&pick.ca->io_ref);
+ percpu_ref_put(&ca->io_ref);
memcpy(n_ondisk, n_sorted, btree_bytes(c));
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index ca2a06e..2a357fc 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -3,20 +3,22 @@
#include "io.h"
#include "super.h"
-void bch2_inconsistent_error(struct bch_fs *c)
+bool bch2_inconsistent_error(struct bch_fs *c)
{
set_bit(BCH_FS_ERROR, &c->flags);
switch (c->opts.errors) {
case BCH_ON_ERROR_CONTINUE:
- break;
+ return false;
case BCH_ON_ERROR_RO:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "emergency read only");
- break;
+ return true;
case BCH_ON_ERROR_PANIC:
panic(bch2_fmt(c, "panic after error"));
- break;
+ return true;
+ default:
+ BUG();
}
}
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index ac3e96d..ababaee 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -45,13 +45,13 @@ do { \
* BCH_ON_ERROR_CONTINUE mode
*/
-void bch2_inconsistent_error(struct bch_fs *);
+bool bch2_inconsistent_error(struct bch_fs *);
#define bch2_fs_inconsistent(c, ...) \
-do { \
+({ \
bch_err(c, __VA_ARGS__); \
bch2_inconsistent_error(c); \
-} while (0)
+})
#define bch2_fs_inconsistent_on(cond, c, ...) \
({ \
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index c5d1e7c..9efaa1f 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -588,58 +588,51 @@ out:
return out - buf;
}
-static inline bool dev_latency_better(struct bch_dev *dev1,
- struct bch_dev *dev2)
+static inline bool dev_latency_better(struct bch_fs *c,
+ const struct bch_extent_ptr *ptr1,
+ const struct bch_extent_ptr *ptr2)
{
- unsigned l1 = atomic_read(&dev1->latency[READ]);
- unsigned l2 = atomic_read(&dev2->latency[READ]);
+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev);
+ struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev);
+ u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
+ u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
}
-static void extent_pick_read_device(struct bch_fs *c,
- struct bkey_s_c_extent e,
- struct bch_devs_mask *avoid,
- struct extent_pick_ptr *pick)
+static int extent_pick_read_device(struct bch_fs *c,
+ struct bkey_s_c_extent e,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *pick)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
+ struct bch_dev *ca;
+ int ret = 0;
extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
- if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
+ if (avoid && test_bit(ptr->dev, avoid->d))
continue;
- if (avoid) {
- if (test_bit(ca->dev_idx, avoid->d))
- continue;
-
- if (pick->ca &&
- test_bit(pick->ca->dev_idx, avoid->d))
- goto use;
- }
-
- if (pick->ca && !dev_latency_better(ca, pick->ca))
- continue;
-use:
- if (!percpu_ref_tryget(&ca->io_ref))
+ if (ret && !dev_latency_better(c, ptr, &pick->ptr))
continue;
- if (pick->ca)
- percpu_ref_put(&pick->ca->io_ref);
-
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
.crc = crc,
- .ca = ca,
};
+
+ ret = 1;
}
+
+ return ret;
}
/* Btree ptrs */
@@ -759,16 +752,12 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
#undef p
}
-struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
- struct bch_devs_mask *avoid)
+int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *pick)
{
- struct extent_pick_ptr pick = { .ca = NULL };
-
- extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
- avoid, &pick);
-
- return pick;
+ return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
+ avoid, pick);
}
/* Extents */
@@ -2057,37 +2046,33 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
* Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
* other devices, it will still pick a pointer from avoid.
*/
-void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
- struct bch_devs_mask *avoid,
- struct extent_pick_ptr *ret)
+int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *pick)
{
- struct bkey_s_c_extent e;
+ int ret;
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
case KEY_TYPE_COOKIE:
- ret->ca = NULL;
- return;
+ return 0;
case KEY_TYPE_ERROR:
- ret->ca = ERR_PTR(-EIO);
- return;
+ return -EIO;
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
- ret->ca = NULL;
+ ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
+ avoid, pick);
- extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
+ if (!ret && !bkey_extent_is_cached(k.k))
+ ret = -EIO;
- if (!ret->ca && !bkey_extent_is_cached(e.k))
- ret->ca = ERR_PTR(-EIO);
- return;
+ return ret;
case BCH_RESERVATION:
- ret->ca = NULL;
- return;
+ return 0;
default:
BUG();
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 8dc1548..338e9e0 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -53,13 +53,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct btree *,
struct btree_node_iter_large *);
-struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
- struct bch_devs_mask *avoid);
+int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
+ struct bch_devs_mask *avoid,
+ struct extent_pick_ptr *);
-void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
- struct bch_devs_mask *,
- struct extent_pick_ptr *);
+int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
+ struct bch_devs_mask *,
+ struct extent_pick_ptr *);
enum btree_insert_ret
bch2_insert_fixup_extent(struct btree_insert *,
diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h
index 15805cd..76139f9 100644
--- a/libbcachefs/extents_types.h
+++ b/libbcachefs/extents_types.h
@@ -21,7 +21,6 @@ struct bch_extent_crc_unpacked {
struct extent_pick_ptr {
struct bch_extent_ptr ptr;
struct bch_extent_crc_unpacked crc;
- struct bch_dev *ca;
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index d1473f2..a2455b4 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -20,6 +20,7 @@
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
+#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
#include <linux/writeback.h>
@@ -124,13 +125,13 @@ static void bch2_quota_reservation_put(struct bch_fs *c,
if (!res->sectors)
return;
- mutex_lock(&inode->ei_update_lock);
+ mutex_lock(&inode->ei_quota_lock);
BUG_ON(res->sectors > inode->ei_quota_reserved);
bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-((s64) res->sectors), BCH_QUOTA_PREALLOC);
inode->ei_quota_reserved -= res->sectors;
- mutex_unlock(&inode->ei_update_lock);
+ mutex_unlock(&inode->ei_quota_lock);
res->sectors = 0;
}
@@ -143,14 +144,14 @@ static int bch2_quota_reservation_add(struct bch_fs *c,
{
int ret;
- mutex_lock(&inode->ei_update_lock);
+ mutex_lock(&inode->ei_quota_lock);
ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK);
if (likely(!ret)) {
inode->ei_quota_reserved += sectors;
res->sectors += sectors;
}
- mutex_unlock(&inode->ei_update_lock);
+ mutex_unlock(&inode->ei_quota_lock);
return ret;
}
@@ -195,9 +196,10 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
return __bch2_write_inode(c, inode, inode_set_size, &new_size);
}
-static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, int sectors)
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, int sectors)
{
+ mutex_lock(&inode->ei_quota_lock);
#ifdef CONFIG_BCACHEFS_QUOTA
if (quota_res && sectors > 0) {
BUG_ON(sectors > quota_res->sectors);
@@ -210,14 +212,7 @@ static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
}
#endif
inode->v.i_blocks += sectors;
-}
-
-static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, int sectors)
-{
- mutex_lock(&inode->ei_update_lock);
- __i_sectors_acct(c, inode, quota_res, sectors);
- mutex_unlock(&inode->ei_update_lock);
+ mutex_unlock(&inode->ei_quota_lock);
}
/* i_sectors accounting: */
@@ -265,7 +260,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
if (h->new_i_size != U64_MAX)
i_size_write(&h->inode->v, h->new_i_size);
- __i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
+ i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
mutex_unlock(&h->inode->ei_update_lock);
@@ -773,6 +768,7 @@ void bch2_invalidatepage(struct page *page, unsigned int offset,
int bch2_releasepage(struct page *page, gfp_t gfp_mask)
{
+ /* XXX: this can't take locks that are held while we allocate memory */
EBUG_ON(!PageLocked(page));
EBUG_ON(PageWriteback(page));
@@ -881,10 +877,12 @@ static int readpage_add_page(struct readpages_iter *iter, struct page *page)
int ret;
prefetchw(&page->flags);
- page_state_init_for_read(page);
ret = add_to_page_cache_lru(page, iter->mapping,
page->index, GFP_NOFS);
+ if (!ret)
+ page_state_init_for_read(page);
+
put_page(page);
return ret;
}
@@ -992,12 +990,13 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
+ rbio->c = c;
+ rbio->start_time = local_clock();
+
while (1) {
- struct extent_pick_ptr pick;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
unsigned bytes;
- bool is_last;
bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
@@ -1016,45 +1015,37 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_unlock(iter);
k = bkey_i_to_s_c(&tmp.k);
- bch2_extent_pick_ptr(c, k, NULL, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, bio, "no device to read from");
- bio_endio(bio);
- return;
- }
+ if (readpages_iter) {
+ bool want_full_extent = false;
+
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ want_full_extent |= !!crc.csum_type |
+ !!crc.compression_type;
+ }
- if (readpages_iter)
readpage_bio_extend(readpages_iter,
bio, k.k->p.offset,
- pick.ca &&
- (pick.crc.csum_type ||
- pick.crc.compression_type));
+ want_full_extent);
+ }
bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
bio->bi_iter.bi_sector) << 9;
- is_last = bytes == bio->bi_iter.bi_size;
swap(bio->bi_iter.bi_size, bytes);
+ if (bytes == bio->bi_iter.bi_size)
+ flags |= BCH_READ_LAST_FRAGMENT;
+
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(bio, k);
- if (pick.ca) {
- if (!is_last) {
- bio_inc_remaining(&rbio->bio);
- flags |= BCH_READ_MUST_CLONE;
- trace_read_split(&rbio->bio);
- }
+ bch2_read_extent(c, rbio, k, flags);
- bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
- &pick, flags);
- } else {
- zero_fill_bio(bio);
-
- if (is_last)
- bio_endio(bio);
- }
-
- if (is_last)
+ if (flags & BCH_READ_LAST_FRAGMENT)
return;
swap(bio->bi_iter.bi_size, bytes);
@@ -1487,6 +1478,194 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
return copied;
}
+#define WRITE_BATCH_PAGES 32
+
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+ struct address_space *mapping,
+ struct iov_iter *iter,
+ loff_t pos, unsigned len)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct page *pages[WRITE_BATCH_PAGES];
+ unsigned long index = pos >> PAGE_SHIFT;
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+ unsigned i, copied = 0, nr_pages_copied = 0;
+ int ret = 0;
+
+ BUG_ON(!len);
+ BUG_ON(nr_pages > ARRAY_SIZE(pages));
+
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
+ if (!pages[i]) {
+ nr_pages = i;
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (offset && !PageUptodate(pages[0])) {
+ ret = bch2_read_single_page(pages[0], mapping);
+ if (ret)
+ goto out;
+ }
+
+ if ((pos + len) & (PAGE_SIZE - 1) &&
+ !PageUptodate(pages[nr_pages - 1])) {
+ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
+ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
+ } else {
+ ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
+ if (ret)
+ goto out;
+ }
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ ret = bch2_get_page_reservation(c, inode, pages[i], true);
+
+ if (ret && !PageUptodate(pages[i])) {
+ ret = bch2_read_single_page(pages[i], mapping);
+ if (ret)
+ goto out;
+
+ ret = bch2_get_page_reservation(c, inode, pages[i], true);
+ }
+
+ if (ret)
+ goto out;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ for (i = 0; i < nr_pages; i++)
+ flush_dcache_page(pages[i]);
+
+ while (copied < len) {
+ struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
+ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
+ unsigned pg_bytes = min_t(unsigned, len - copied,
+ PAGE_SIZE - pg_offset);
+ unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
+ iter, pg_offset, pg_bytes);
+
+ if (!pg_copied)
+ break;
+
+ flush_dcache_page(page);
+ iov_iter_advance(iter, pg_copied);
+ copied += pg_copied;
+ }
+
+ if (!copied)
+ goto out;
+
+ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
+ inode->ei_last_dirtied = (unsigned long) current;
+
+ if (pos + copied > inode->v.i_size)
+ i_size_write(&inode->v, pos + copied);
+
+ if (copied < len &&
+ ((offset + copied) & (PAGE_SIZE - 1))) {
+ struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
+
+ if (!PageUptodate(page)) {
+ zero_user(page, 0, PAGE_SIZE);
+ copied -= (offset + copied) & (PAGE_SIZE - 1);
+ }
+ }
+out:
+ for (i = 0; i < nr_pages_copied; i++) {
+ if (!PageUptodate(pages[i]))
+ SetPageUptodate(pages[i]);
+ if (!PageDirty(pages[i]))
+ set_page_dirty(pages[i]);
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+
+ for (i = nr_pages_copied; i < nr_pages; i++) {
+ if (!PageDirty(pages[i]))
+ bch2_put_page_reservation(c, inode, pages[i]);
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+
+ return copied ?: ret;
+}
+
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ int ret = 0;
+
+ pagecache_add_get(&mapping->add_lock);
+
+ do {
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
+ PAGE_SIZE * WRITE_BATCH_PAGES - offset);
+again:
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+ bytes = min_t(unsigned long, iov_iter_count(iter),
+ PAGE_SIZE - offset);
+
+ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ if (unlikely(fatal_signal_pending(current))) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ if (unlikely(ret < 0))
+ break;
+
+ cond_resched();
+
+ if (unlikely(ret == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_single_seg_count(iter));
+ goto again;
+ }
+ pos += ret;
+ written += ret;
+
+ balance_dirty_pages_ratelimited(mapping);
+ } while (iov_iter_count(iter));
+
+ pagecache_add_put(&mapping->add_lock);
+
+ return written ? written : ret;
+}
+
/* O_DIRECT reads */
static void bch2_dio_read_complete(struct closure *cl)
@@ -1822,7 +2001,7 @@ static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = iocb->ki_flags & IOCB_DIRECT
? bch2_direct_write(iocb, from)
- : generic_perform_write(file, from, iocb->ki_pos);
+ : bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index c7e842e..fb30f0d 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1028,6 +1028,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
+ mutex_init(&inode->ei_quota_lock);
inode->ei_journal_seq = 0;
return &inode->v;
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index fddfb2d..fbbc7a3 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -15,6 +15,8 @@ struct bch_inode_info {
u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
+
+ struct mutex ei_quota_lock;
struct bch_qid ei_qid;
struct bch_hash_info ei_str_hash;
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 27e4508..bb65652 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -14,6 +14,7 @@
#include "compress.h"
#include "clock.h"
#include "debug.h"
+#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "io.h"
@@ -30,14 +31,71 @@
#include <trace/events/bcachefs.h>
-/* Allocate, free from mempool: */
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ const struct bch_devs_mask *devs;
+ unsigned d, nr = 0, total = 0;
+ u64 now = local_clock(), last;
+ s64 congested;
+ struct bch_dev *ca;
+
+ if (!target)
+ return false;
+
+ rcu_read_lock();
+ devs = bch2_target_to_mask(c, target);
+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+ ca = rcu_dereference(c->devs[d]);
+ if (!ca)
+ continue;
+
+ congested = atomic_read(&ca->congested);
+ last = READ_ONCE(ca->congested_last);
+ if (time_after64(now, last))
+ congested -= (now - last) >> 12;
+
+ total += max(congested, 0LL);
+ nr++;
+ }
+ rcu_read_unlock();
-void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw)
+ return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+ u64 now, int rw)
+{
+ u64 latency_capable =
+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+ /* ideally we'd be taking into account the device's variance here: */
+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+ s64 latency_over = io_latency - latency_threshold;
+
+ if (latency_threshold && latency_over > 0) {
+ /*
+ * bump up congested by approximately latency_over * 4 /
+ * latency_threshold - we don't need much accuracy here so don't
+ * bother with the divide:
+ */
+ if (atomic_read(&ca->congested) < CONGESTED_MAX)
+ atomic_add(latency_over >>
+ max_t(int, ilog2(latency_threshold) - 2, 0),
+ &ca->congested);
+
+ ca->congested_last = now;
+ } else if (atomic_read(&ca->congested) > 0) {
+ atomic_dec(&ca->congested);
+ }
+}
+
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
+ atomic64_t *latency = &ca->cur_latency[rw];
u64 now = local_clock();
- unsigned io_latency = (now >> 10) - submit_time_us;
- atomic_t *latency = &ca->latency[rw];
- unsigned old, new, v = atomic_read(latency);
+ u64 io_latency = time_after64(now, submit_time)
+ ? now - submit_time
+ : 0;
+ u64 old, new, v = atomic64_read(latency);
do {
old = v;
@@ -51,10 +109,16 @@ void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw)
now & ~(~0 << 5))
break;
- new = ewma_add((u64) old, io_latency, 6);
- } while ((v = atomic_cmpxchg(latency, old, new)) != old);
+ new = ewma_add(old, io_latency, 5);
+ } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+
+ bch2_congested_acct(ca, io_latency, now, rw);
+
+ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
}
+/* Allocate, free from mempool: */
+
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
struct bio_vec *bv;
@@ -169,22 +233,21 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
}
n->c = c;
- n->ca = ca;
- n->submit_time_us = local_clock_us();
+ n->dev = ptr->dev;
+ n->have_ioref = bch2_dev_get_ioref(ca, WRITE);
+ n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
if (!journal_flushes_device(ca))
n->bio.bi_opf |= REQ_FUA;
- if (likely(percpu_ref_tryget(&ca->io_ref))) {
+ if (likely(n->have_ioref)) {
this_cpu_add(ca->io_done->sectors[WRITE][type],
bio_sectors(&n->bio));
- n->have_io_ref = true;
bio_set_dev(&n->bio, ca->disk_sb.bdev);
submit_bio(&n->bio);
} else {
- n->have_io_ref = false;
n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
@@ -196,15 +259,18 @@ static void __bch2_write(struct closure *);
static void bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_fs *c = op->c;
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
- op->error = bch2_journal_error(&op->c->journal);
+ op->error = bch2_journal_error(&c->journal);
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
- bch2_disk_reservation_put(op->c, &op->res);
- percpu_ref_put(&op->c->writes);
+ bch2_disk_reservation_put(c, &op->res);
+ percpu_ref_put(&c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
+ bch2_time_stats_update(&c->data_write_time, op->start_time);
+
closure_return(cl);
}
@@ -318,15 +384,15 @@ static void bch2_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->ca;
-
- bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
- set_bit(ca->dev_idx, op->failed.d);
+ set_bit(wbio->dev, op->failed.d);
- if (wbio->have_io_ref)
+ if (wbio->have_ioref) {
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
percpu_ref_put(&ca->io_ref);
+ }
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -821,6 +887,8 @@ void bch2_write(struct closure *cl)
BUG_ON(!bkey_cmp(op->pos, POS_MAX));
BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
+ op->start_time = local_clock();
+
memset(&op->failed, 0, sizeof(op->failed));
bch2_keylist_init(&op->insert_keys, op->inline_keys);
@@ -844,19 +912,72 @@ void bch2_write(struct closure *cl)
struct promote_op {
struct closure cl;
+ u64 start_time;
+
+ struct rhash_head hash;
+ struct bpos pos;
+
struct migrate_write write;
struct bio_vec bi_inline_vecs[0]; /* must be last */
};
+static const struct rhashtable_params bch_promote_params = {
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+};
+
+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
+ struct bpos pos,
+ struct bch_io_opts opts,
+ unsigned flags)
+{
+ if (!opts.promote_target)
+ return false;
+
+ if (!(flags & BCH_READ_MAY_PROMOTE))
+ return false;
+
+ if (percpu_ref_is_dying(&c->writes))
+ return false;
+
+ if (!bkey_extent_is_data(k.k))
+ return false;
+
+ if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
+ return false;
+
+ if (bch2_target_congested(c, opts.promote_target))
+ return false;
+
+ if (rhashtable_lookup_fast(&c->promote_table, &pos,
+ bch_promote_params))
+ return false;
+
+ return true;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+ int ret;
+
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ percpu_ref_put(&c->writes);
+ kfree(op);
+}
+
static void promote_done(struct closure *cl)
{
struct promote_op *op =
container_of(cl, struct promote_op, cl);
struct bch_fs *c = op->write.op.c;
- percpu_ref_put(&c->writes);
+ bch2_time_stats_update(&c->data_promote_time, op->start_time);
+
bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
- kfree(op);
+ promote_free(c, op);
}
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
@@ -865,17 +986,15 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
struct closure *cl = &op->cl;
struct bio *bio = &op->write.op.wbio.bio;
- BUG_ON(!rbio->split || !rbio->bounce);
-
- if (!percpu_ref_tryget(&c->writes))
- return;
-
trace_promote(&rbio->bio);
/* we now own pages: */
+ BUG_ON(!rbio->bounce);
BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
- rbio->promote = NULL;
bch2_migrate_read_done(&op->write, rbio);
@@ -884,69 +1003,120 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
closure_return_with_destructor(cl, promote_done);
}
-/*
- * XXX: multiple promotes can race with each other, wastefully. Keep a list of
- * outstanding promotes?
- */
-static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
- struct bkey_s_c k)
+noinline
+static struct promote_op *__promote_alloc(struct bch_fs *c,
+ struct bpos pos,
+ struct extent_pick_ptr *pick,
+ struct bch_io_opts opts,
+ unsigned rbio_sectors,
+ struct bch_read_bio **rbio)
{
- struct bch_fs *c = rbio->c;
- struct promote_op *op;
+ struct promote_op *op = NULL;
struct bio *bio;
+ unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
/* data might have to be decompressed in the write path: */
- unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
- PAGE_SECTORS);
+ unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
+ PAGE_SECTORS);
int ret;
- BUG_ON(!rbio->bounce);
- BUG_ON(pages < rbio->bio.bi_vcnt);
+ if (!percpu_ref_tryget(&c->writes))
+ return NULL;
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages,
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
GFP_NOIO);
if (!op)
- return NULL;
+ goto err;
- bio = &op->write.op.wbio.bio;
- bio_init(bio, bio->bi_inline_vecs, pages);
+ op->start_time = local_clock();
+ op->pos = pos;
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+ /*
+ * promotes require bouncing, but if the extent isn't
+ * checksummed/compressed it might be too big for the mempool:
+ */
+ if (rbio_sectors > c->sb.encoded_extent_max) {
+ *rbio = kzalloc(sizeof(struct bch_read_bio) +
+ sizeof(struct bio_vec) * rbio_pages,
+ GFP_NOIO);
+ if (!*rbio)
+ goto err;
+
+ rbio_init(&(*rbio)->bio, opts);
+ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs,
+ rbio_pages);
+
+ (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
+ bch2_bio_map(&(*rbio)->bio, NULL);
+
+ if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
+ goto err;
+
+ (*rbio)->bounce = true;
+ (*rbio)->split = true;
+ (*rbio)->kmalloc = true;
+ }
+
+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+ bch_promote_params))
+ goto err;
+
+ bio = &op->write.op.wbio.bio;
+ bio_init(bio, bio->bi_inline_vecs, wbio_pages);
ret = bch2_migrate_write_init(c, &op->write,
writepoint_hashed((unsigned long) current),
- rbio->opts,
+ opts,
DATA_PROMOTE,
(struct data_opts) {
- .target = rbio->opts.promote_target
+ .target = opts.promote_target
},
- k);
+ bkey_s_c_null);
BUG_ON(ret);
return op;
+err:
+ if (*rbio)
+ bio_free_pages(&(*rbio)->bio);
+ kfree(*rbio);
+ *rbio = NULL;
+ kfree(op);
+ percpu_ref_put(&c->writes);
+ return NULL;
}
-static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
- unsigned flags, u16 target)
+static inline struct promote_op *promote_alloc(struct bch_fs *c,
+ struct bvec_iter iter,
+ struct bkey_s_c k,
+ struct extent_pick_ptr *pick,
+ struct bch_io_opts opts,
+ unsigned flags,
+ struct bch_read_bio **rbio,
+ bool *bounce,
+ bool *read_full)
{
- if (!target)
- return false;
-
- if (!(flags & BCH_READ_MAY_PROMOTE))
- return false;
+ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+ unsigned sectors = promote_full
+ ? pick->crc.compressed_size
+ : bvec_iter_sectors(iter);
+ struct bpos pos = promote_full
+ ? bkey_start_pos(k.k)
+ : POS(k.k->p.inode, iter.bi_sector);
+ struct promote_op *promote;
+
+ if (!should_promote(c, k, pos, opts, flags))
+ return NULL;
- if (percpu_ref_is_dying(&c->writes))
- return false;
+ promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
+ if (!promote)
+ return NULL;
- return bch2_extent_has_target(c, e, target) == NULL;
+ *bounce = true;
+ *read_full = promote_full;
+ return promote;
}
/* Read */
-static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *,
- struct bvec_iter, u64,
- struct bch_devs_mask *, unsigned);
-
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
@@ -979,38 +1149,144 @@ static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
- struct bch_read_bio *parent = rbio->parent;
-
- BUG_ON(!rbio->split);
+ BUG_ON(rbio->bounce && !rbio->split);
if (rbio->promote)
- kfree(rbio->promote);
+ promote_free(rbio->c, rbio->promote);
+ rbio->promote = NULL;
+
if (rbio->bounce)
bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
- bio_put(&rbio->bio);
- return parent;
+ if (rbio->split) {
+ struct bch_read_bio *parent = rbio->parent;
+
+ if (rbio->kmalloc)
+ kfree(rbio);
+ else
+ bio_put(&rbio->bio);
+
+ rbio = parent;
+ }
+
+ return rbio;
}
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
- if (rbio->promote)
- kfree(rbio->promote);
- rbio->promote = NULL;
-
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
+ bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time);
bio_endio(&rbio->bio);
}
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
+{
+ struct btree_iter iter;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_c k;
+ int ret;
+
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+ rbio->pos, BTREE_ITER_SLOTS);
+retry:
+ rbio->bio.bi_status = 0;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ if (btree_iter_err(k)) {
+ bch2_btree_iter_unlock(&iter);
+ goto err;
+ }
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ if (!bkey_extent_is_data(k.k) ||
+ !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset)) {
+ /* extent we wanted to read no longer exists: */
+ rbio->hole = true;
+ goto out;
+ }
+
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
+ if (ret == READ_RETRY)
+ goto retry;
+ if (ret)
+ goto err;
+ goto out;
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+out:
+ bch2_rbio_done(rbio);
+}
+
+static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+retry:
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector),
+ BTREE_ITER_SLOTS, k) {
+ BKEY_PADDED(k) tmp;
+ unsigned bytes;
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ bytes = min_t(unsigned, bvec_iter.bi_size,
+ (k.k->p.offset - bvec_iter.bi_sector) << 9);
+ swap(bvec_iter.bi_size, bytes);
+
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags);
+ switch (ret) {
+ case READ_RETRY:
+ goto retry;
+ case READ_ERR:
+ goto err;
+ };
+
+ if (bytes == bvec_iter.bi_size)
+ goto out;
+
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+ }
+
+ /*
+ * If we get here, it better have been because there was an error
+ * reading a btree node
+ */
+ ret = bch2_btree_iter_unlock(&iter);
+ BUG_ON(!ret);
+ __bcache_io_error(c, "btree IO error %i", ret);
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+out:
+ bch2_rbio_done(rbio);
+}
+
static void bch2_rbio_retry(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bvec_iter iter = rbio->bvec_iter;
- unsigned flags = rbio->flags;
- u64 inode = rbio->pos.inode;
+ struct bch_fs *c = rbio->c;
+ struct bvec_iter iter = rbio->bvec_iter;
+ unsigned flags = rbio->flags;
+ u64 inode = rbio->pos.inode;
struct bch_devs_mask avoid;
trace_read_retry(&rbio->bio);
@@ -1018,26 +1294,19 @@ static void bch2_rbio_retry(struct work_struct *work)
memset(&avoid, 0, sizeof(avoid));
if (rbio->retry == READ_RETRY_AVOID)
- __set_bit(rbio->pick.ca->dev_idx, avoid.d);
+ __set_bit(rbio->pick.ptr.dev, avoid.d);
- if (rbio->promote)
- kfree(rbio->promote);
- rbio->promote = NULL;
+ rbio->bio.bi_status = 0;
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
- else
- rbio->bio.bi_status = 0;
+ rbio = bch2_rbio_free(rbio);
- if (!(flags & BCH_READ_NODECODE))
- flags |= BCH_READ_MUST_CLONE;
flags |= BCH_READ_IN_RETRY;
flags &= ~BCH_READ_MAY_PROMOTE;
if (flags & BCH_READ_NODECODE)
- bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags);
+ bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags);
else
- __bch2_read(c, rbio, iter, inode, &avoid, flags);
+ bch2_read_retry(c, rbio, iter, inode, &avoid, flags);
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
@@ -1049,7 +1318,9 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
return;
if (retry == READ_ERR) {
- bch2_rbio_parent(rbio)->bio.bi_status = error;
+ rbio = bch2_rbio_free(rbio);
+
+ rbio->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
bch2_rbio_punt(rbio, bch2_rbio_retry,
@@ -1121,12 +1392,13 @@ out:
bch2_btree_iter_unlock(&iter);
}
-static bool should_narrow_crcs(struct bkey_s_c_extent e,
+static bool should_narrow_crcs(struct bkey_s_c k,
struct extent_pick_ptr *pick,
unsigned flags)
{
return !(flags & BCH_READ_IN_RETRY) &&
- bch2_can_narrow_extent_crcs(e, pick->crc);
+ bkey_extent_is_data(k.k) &&
+ bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
}
/* Inner part that may run in process context */
@@ -1134,8 +1406,10 @@ static void __bch2_read_endio(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
@@ -1191,10 +1465,13 @@ static void __bch2_read_endio(struct work_struct *work)
*/
bch2_encrypt_bio(c, crc.csum_type, nonce, src);
promote_start(rbio->promote, rbio);
+ rbio->promote = NULL;
}
nodecode:
- if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+ rbio = bch2_rbio_free(rbio);
bch2_rbio_done(rbio);
+ }
return;
csum_err:
/*
@@ -1208,7 +1485,7 @@ csum_err:
return;
}
- bch2_dev_io_error(rbio->pick.ca,
+ bch2_dev_io_error(ca,
"data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
@@ -1227,25 +1504,27 @@ static void bch2_read_endio(struct bio *bio)
{
struct bch_read_bio *rbio =
container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
- bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ);
-
- percpu_ref_put(&rbio->pick.ca->io_ref);
+ if (rbio->have_ioref) {
+ bch2_latency_acct(ca, rbio->submit_time, READ);
+ percpu_ref_put(&ca->io_ref);
+ }
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
if (rbio->pick.ptr.cached &&
(((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) {
+ ptr_stale(ca, &rbio->pick.ptr))) {
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -1266,76 +1545,97 @@ static void bch2_read_endio(struct bio *bio)
}
int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c_extent e,
- struct extent_pick_ptr *pick, unsigned flags)
+ struct bvec_iter iter, struct bkey_s_c k,
+ struct bch_devs_mask *avoid, unsigned flags)
{
- struct bch_read_bio *rbio;
- bool split = false, bounce = false, read_full = false;
- bool promote = false, narrow_crcs = false;
- struct bpos pos = bkey_start_pos(e.k);
- int ret = 0;
+ struct extent_pick_ptr pick;
+ struct bch_read_bio *rbio = NULL;
+ struct bch_dev *ca;
+ struct promote_op *promote = NULL;
+ bool bounce = false, read_full = false, narrow_crcs = false;
+ struct bpos pos = bkey_start_pos(k.k);
+ int pick_ret;
- lg_local_lock(&c->usage_lock);
- bucket_io_clock_reset(c, pick->ca,
- PTR_BUCKET_NR(pick->ca, &pick->ptr), READ);
- lg_local_unlock(&c->usage_lock);
+ pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick);
+
+ /* hole or reservation - just zero fill: */
+ if (!pick_ret)
+ goto hole;
- narrow_crcs = should_narrow_crcs(e, pick, flags);
+ if (pick_ret < 0)
+ goto no_device;
+
+ if (pick_ret > 0)
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
if (flags & BCH_READ_NODECODE) {
- BUG_ON(iter.bi_size < pick->crc.compressed_size << 9);
- iter.bi_size = pick->crc.compressed_size << 9;
+ /*
+ * can happen if we retry, and the extent we were going to read
+ * has been merged in the meantime:
+ */
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ goto hole;
+
+ iter.bi_sector = pos.offset;
+ iter.bi_size = pick.crc.compressed_size << 9;
goto noclone;
}
+ if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+ bio_flagged(&orig->bio, BIO_CHAIN))
+ flags |= BCH_READ_MUST_CLONE;
+
+ narrow_crcs = should_narrow_crcs(k, &pick, flags);
+
if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
flags |= BCH_READ_MUST_BOUNCE;
- EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector ||
- e.k->p.offset < bvec_iter_end_sector(iter));
+ EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
+ k.k->p.offset < bvec_iter_end_sector(iter));
- if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
- (pick->crc.csum_type != BCH_CSUM_NONE &&
- (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick->crc.csum_type) &&
+ if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
+ (pick.crc.csum_type != BCH_CSUM_NONE &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
(flags & BCH_READ_MUST_BOUNCE)))) {
read_full = true;
bounce = true;
}
- promote = should_promote(c, e, flags, orig->opts.promote_target);
- /* could also set read_full */
- if (promote)
- bounce = true;
+ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+ &rbio, &bounce, &read_full);
if (!read_full) {
- EBUG_ON(pick->crc.compression_type);
- EBUG_ON(pick->crc.csum_type &&
- (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
- bvec_iter_sectors(iter) != pick->crc.live_size ||
- pick->crc.offset ||
+ EBUG_ON(pick.crc.compression_type);
+ EBUG_ON(pick.crc.csum_type &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ bvec_iter_sectors(iter) != pick.crc.live_size ||
+ pick.crc.offset ||
iter.bi_sector != pos.offset));
- pick->ptr.offset += pick->crc.offset +
+ pick.ptr.offset += pick.crc.offset +
(iter.bi_sector - pos.offset);
- pick->crc.compressed_size = bvec_iter_sectors(iter);
- pick->crc.uncompressed_size = bvec_iter_sectors(iter);
- pick->crc.offset = 0;
- pick->crc.live_size = bvec_iter_sectors(iter);
+ pick.crc.compressed_size = bvec_iter_sectors(iter);
+ pick.crc.uncompressed_size = bvec_iter_sectors(iter);
+ pick.crc.offset = 0;
+ pick.crc.live_size = bvec_iter_sectors(iter);
pos.offset = iter.bi_sector;
}
- if (bounce) {
- unsigned sectors = pick->crc.compressed_size;
+ if (rbio) {
+ /* promote already allocated bounce rbio */
+ } else if (bounce) {
+ unsigned sectors = pick.crc.compressed_size;
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split),
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ &c->bio_read_split),
orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
- split = true;
+ rbio->bounce = true;
+ rbio->split = true;
} else if (flags & BCH_READ_MUST_CLONE) {
/*
* Have to clone if there were any splits, due to error
@@ -1349,156 +1649,138 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
&c->bio_read_split),
orig->opts);
rbio->bio.bi_iter = iter;
- split = true;
+ rbio->split = true;
} else {
noclone:
rbio = orig;
rbio->bio.bi_iter = iter;
- split = false;
BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
- BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size);
+ BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
rbio->c = c;
- if (split)
+ rbio->submit_time = local_clock();
+ if (rbio->split)
rbio->parent = orig;
else
rbio->end_io = orig->bio.bi_end_io;
rbio->bvec_iter = iter;
- rbio->submit_time_us = local_clock_us();
rbio->flags = flags;
- rbio->bounce = bounce;
- rbio->split = split;
+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
rbio->narrow_crcs = narrow_crcs;
+ rbio->hole = 0;
rbio->retry = 0;
rbio->context = 0;
- rbio->devs_have = bch2_extent_devs(e);
- rbio->pick = *pick;
+ rbio->devs_have = bch2_bkey_devs(k);
+ rbio->pick = pick;
rbio->pos = pos;
- rbio->version = e.k->version;
- rbio->promote = promote ? promote_alloc(rbio, e.s_c) : NULL;
+ rbio->version = k.k->version;
+ rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
- bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
- if (bounce)
+ if (rbio->bounce)
trace_read_bounce(&rbio->bio);
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
+
+ if (!rbio->have_ioref)
+ goto no_device_postclone;
+
+ lg_local_lock(&c->usage_lock);
+ bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
+ lg_local_unlock(&c->usage_lock);
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ if (!(flags & BCH_READ_LAST_FRAGMENT)) {
+ bio_inc_remaining(&orig->bio);
+ trace_read_split(&orig->bio);
+ }
+
submit_bio(&rbio->bio);
+ return 0;
} else {
+ int ret;
+
submit_bio_wait(&rbio->bio);
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
- if (rbio->split)
- rbio = bch2_rbio_free(rbio);
- if (!ret)
- bch2_rbio_done(rbio);
- }
-
- return ret;
-}
-
-static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_devs_mask *avoid, unsigned flags)
-{
- struct extent_pick_ptr pick;
- struct btree_iter iter;
- BKEY_PADDED(k) tmp;
- struct bkey_s_c k;
- int ret;
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS);
-retry:
- k = bch2_btree_iter_peek_slot(&iter);
- if (btree_iter_err(k)) {
- bch2_btree_iter_unlock(&iter);
- goto err;
- }
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
- bch2_btree_iter_unlock(&iter);
+ rbio = bch2_rbio_free(rbio);
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
- rbio->pick.ptr,
- rbio->pos.offset -
- rbio->pick.crc.offset) ||
- bkey_start_offset(k.k) != bvec_iter.bi_sector)
- goto err;
+ if (ret == READ_RETRY_AVOID) {
+ __set_bit(pick.ptr.dev, avoid->d);
+ ret = READ_RETRY;
+ }
- bch2_extent_pick_ptr(c, k, avoid, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, &rbio->bio, "no device to read from");
- bio_endio(&rbio->bio);
- return;
+ return ret;
}
- if (!pick.ca)
- goto err;
+no_device_postclone:
+ if (!rbio->split)
+ rbio->bio.bi_end_io = rbio->end_io;
+ bch2_rbio_free(rbio);
+no_device:
+ __bcache_io_error(c, "no device to read from");
- if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) {
- percpu_ref_put(&pick.ca->io_ref);
- goto err;
+ if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ orig->bio.bi_status = BLK_STS_IOERR;
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bch2_rbio_done(orig);
+ return 0;
+ } else {
+ return READ_ERR;
}
- ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k),
- &pick, flags);
- switch (ret) {
- case READ_RETRY_AVOID:
- __set_bit(pick.ca->dev_idx, avoid->d);
- case READ_RETRY:
- goto retry;
- case READ_ERR:
- bio_endio(&rbio->bio);
- return;
- };
-
- return;
-err:
+hole:
/*
- * extent we wanted to read no longer exists, or
- * was merged or partially overwritten (and thus
- * possibly bigger than the memory that was
- * originally allocated)
+ * won't normally happen in the BCH_READ_NODECODE
+ * (bch2_move_extent()) path, but if we retry and the extent we wanted
+ * to read no longer exists we have to signal that:
*/
- rbio->bio.bi_status = BLK_STS_AGAIN;
- bio_endio(&rbio->bio);
- return;
+ if (flags & BCH_READ_NODECODE)
+ orig->hole = true;
+
+ zero_fill_bio_iter(&orig->bio, iter);
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bch2_rbio_done(orig);
+ return 0;
}
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_devs_mask *avoid, unsigned flags)
+void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
{
struct btree_iter iter;
struct bkey_s_c k;
+ unsigned flags = BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE|
+ BCH_READ_USER_MAPPED;
int ret;
- EBUG_ON(flags & BCH_READ_NODECODE);
-retry:
+ BUG_ON(rbio->_state);
+ BUG_ON(flags & BCH_READ_NODECODE);
+ BUG_ON(flags & BCH_READ_IN_RETRY);
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector),
+ POS(inode, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_SLOTS, k) {
BKEY_PADDED(k) tmp;
- struct extent_pick_ptr pick;
- struct bvec_iter fragment;
+ unsigned bytes;
/*
* Unlock the iterator while the btree node's lock is still in
@@ -1508,49 +1790,20 @@ retry:
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&iter);
- bch2_extent_pick_ptr(c, k, avoid, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, &rbio->bio, "no device to read from");
- bio_endio(&rbio->bio);
- return;
- }
-
- fragment = bvec_iter;
- fragment.bi_size = (min_t(u64, k.k->p.offset,
- bvec_iter_end_sector(bvec_iter)) -
- bvec_iter.bi_sector) << 9;
+ bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
+ (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
- if (pick.ca) {
- if (fragment.bi_size != bvec_iter.bi_size) {
- bio_inc_remaining(&rbio->bio);
- flags |= BCH_READ_MUST_CLONE;
- trace_read_split(&rbio->bio);
- }
+ if (rbio->bio.bi_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
- ret = __bch2_read_extent(c, rbio, fragment,
- bkey_s_c_to_extent(k),
- &pick, flags);
- switch (ret) {
- case READ_RETRY_AVOID:
- __set_bit(pick.ca->dev_idx, avoid->d);
- case READ_RETRY:
- goto retry;
- case READ_ERR:
- rbio->bio.bi_status = BLK_STS_IOERR;
- bio_endio(&rbio->bio);
- return;
- };
- } else {
- zero_fill_bio_iter(&rbio->bio, fragment);
-
- if (fragment.bi_size == bvec_iter.bi_size)
- bio_endio(&rbio->bio);
- }
+ bch2_read_extent(c, rbio, k, flags);
- if (fragment.bi_size == bvec_iter.bi_size)
+ if (flags & BCH_READ_LAST_FRAGMENT)
return;
- bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+ bio_advance(&rbio->bio, bytes);
}
/*
@@ -1560,5 +1813,34 @@ retry:
ret = bch2_btree_iter_unlock(&iter);
BUG_ON(!ret);
bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
- bio_endio(&rbio->bio);
+ bch2_rbio_done(rbio);
+}
+
+void bch2_fs_io_exit(struct bch_fs *c)
+{
+ if (c->promote_table.tbl)
+ rhashtable_destroy(&c->promote_table);
+ mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->bio_write);
+ bioset_exit(&c->bio_read_split);
+ bioset_exit(&c->bio_read);
+}
+
+int bch2_fs_io_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->opts.btree_node_size,
+ c->sb.encoded_extent_max) /
+ PAGE_SECTORS, 0) ||
+ rhashtable_init(&c->promote_table, &bch_promote_params))
+ return -ENOMEM;
+
+ return 0;
}
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index a0c795a..8b1411c 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -16,7 +16,7 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
-void bch2_latency_acct(struct bch_dev *, unsigned, int);
+void bch2_latency_acct(struct bch_dev *, u64, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
@@ -99,40 +99,28 @@ struct cache_promote_op;
struct extent_pick_ptr;
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- struct bkey_s_c_extent e, struct extent_pick_ptr *,
- unsigned);
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- u64, struct bch_devs_mask *, unsigned);
+ struct bkey_s_c, struct bch_devs_mask *, unsigned);
+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
BCH_READ_NODECODE = 1 << 3,
+ BCH_READ_LAST_FRAGMENT = 1 << 4,
/* internal: */
- BCH_READ_MUST_BOUNCE = 1 << 4,
- BCH_READ_MUST_CLONE = 1 << 5,
- BCH_READ_IN_RETRY = 1 << 6,
+ BCH_READ_MUST_BOUNCE = 1 << 5,
+ BCH_READ_MUST_CLONE = 1 << 6,
+ BCH_READ_IN_RETRY = 1 << 7,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
- struct bkey_s_c_extent e,
- struct extent_pick_ptr *pick,
+ struct bkey_s_c k,
unsigned flags)
{
- __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
-}
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inode)
-{
- BUG_ON(rbio->_state);
- __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE|
- BCH_READ_USER_MAPPED);
+ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
}
static inline struct bch_read_bio *rbio_init(struct bio *bio,
@@ -146,4 +134,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
return rbio;
}
+void bch2_fs_io_exit(struct bch_fs *);
+int bch2_fs_io_init(struct bch_fs *);
+
#endif /* _BCACHEFS_IO_H */
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index a022ab3..28281ea 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -14,6 +14,8 @@
struct bch_read_bio {
struct bch_fs *c;
+ u64 start_time;
+ u64 submit_time;
/*
* Reads will often have to be split, and if the extent being read from
@@ -35,17 +37,19 @@ struct bch_read_bio {
*/
struct bvec_iter bvec_iter;
- unsigned submit_time_us;
- u8 flags;
+ u16 flags;
union {
struct {
- u8 bounce:1,
+ u16 bounce:1,
split:1,
+ kmalloc:1,
+ have_ioref:1,
narrow_crcs:1,
+ hole:1,
retry:2,
context:2;
};
- u8 _state;
+ u16 _state;
};
struct bch_devs_list devs_have;
@@ -66,20 +70,20 @@ struct bch_read_bio {
struct bch_write_bio {
struct bch_fs *c;
- struct bch_dev *ca;
struct bch_write_bio *parent;
+ u64 submit_time;
+
struct bch_devs_list failed;
u8 order;
+ u8 dev;
unsigned split:1,
bounce:1,
put_bio:1,
- have_io_ref:1,
+ have_ioref:1,
used_mempool:1;
- unsigned submit_time_us;
-
struct bio bio;
};
@@ -87,6 +91,7 @@ struct bch_write_op {
struct closure cl;
struct bch_fs *c;
struct workqueue_struct *io_wq;
+ u64 start_time;
unsigned written; /* sectors */
u16 flags;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index b525a85..ea67af3 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -7,1160 +7,16 @@
#include "bcachefs.h"
#include "alloc.h"
#include "bkey_methods.h"
-#include "buckets.h"
#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "checksum.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "keylist.h"
+#include "buckets.h"
#include "journal.h"
-#include "replicas.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
#include "super-io.h"
-#include "vstructs.h"
#include <trace/events/bcachefs.h>
-static void journal_write(struct closure *);
-static void journal_reclaim_fast(struct journal *);
-static void journal_pin_add_entry(struct journal *,
- struct journal_entry_pin_list *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-
-static inline void journal_wake(struct journal *j)
-{
- wake_up(&j->wait);
- closure_wake_up(&j->async_wait);
-}
-
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- return j->buf + j->reservations.idx;
-}
-
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
- return j->buf + !j->reservations.idx;
-}
-
-/* Sequence number of oldest dirty journal entry */
-
-static inline u64 journal_last_seq(struct journal *j)
-{
- return j->pin.front;
-}
-
-static inline u64 journal_cur_seq(struct journal *j)
-{
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
-
- return j->pin.back - 1;
-}
-
-static inline u64 journal_pin_seq(struct journal *j,
- struct journal_entry_pin_list *pin_list)
-{
- return fifo_entry_idx_abs(&j->pin, pin_list);
-}
-
-u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
-{
- u64 ret = 0;
-
- spin_lock(&j->lock);
- if (journal_pin_active(pin))
- ret = journal_pin_seq(j, pin->pin_list);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf,
- unsigned type, enum btree_id id,
- unsigned level,
- const void *data, size_t u64s)
-{
- struct jset *jset = buf->data;
-
- bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s),
- type, id, level, data, u64s);
- le32_add_cpu(&jset->u64s, jset_u64s(u64s));
-}
-
-static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type,
- enum btree_id id)
-{
- struct jset_entry *entry;
-
- for_each_jset_entry_type(entry, j, type)
- if (entry->btree_id == id)
- return entry;
-
- return NULL;
-}
-
-struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j,
- enum btree_id id, unsigned *level)
-{
- struct bkey_i *k;
- struct jset_entry *entry =
- bch2_journal_find_entry(j, JOURNAL_ENTRY_BTREE_ROOT, id);
-
- if (!entry)
- return NULL;
-
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
-
- k = entry->start;
- *level = entry->level;
- *level = entry->level;
- return k;
-}
-
-static void bch2_journal_add_btree_root(struct journal_buf *buf,
- enum btree_id id, struct bkey_i *k,
- unsigned level)
-{
- bch2_journal_add_entry_noreservation(buf,
- JOURNAL_ENTRY_BTREE_ROOT, id, level,
- k, k->k.u64s);
-}
-
-static void journal_seq_blacklist_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
-{
- struct bch_fs *c =
- container_of(j, struct bch_fs, journal);
- struct journal_seq_blacklist *bl =
- container_of(pin, struct journal_seq_blacklist, pin);
- struct blacklisted_node n;
- struct closure cl;
- unsigned i;
- int ret;
-
- closure_init_stack(&cl);
-
- for (i = 0;; i++) {
- struct btree_iter iter;
- struct btree *b;
-
- mutex_lock(&j->blacklist_lock);
- if (i >= bl->nr_entries) {
- mutex_unlock(&j->blacklist_lock);
- break;
- }
- n = bl->entries[i];
- mutex_unlock(&j->blacklist_lock);
-
- __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
-
- b = bch2_btree_iter_peek_node(&iter);
-
- /* The node might have already been rewritten: */
-
- if (b->data->keys.seq == n.seq) {
- ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
- bch2_fs_fatal_error(c,
- "error %i rewriting btree node with blacklisted journal seq",
- ret);
- bch2_journal_halt(j);
- return;
- }
- }
-
- bch2_btree_iter_unlock(&iter);
- }
-
- for (i = 0;; i++) {
- struct btree_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&j->blacklist_lock);
- if (i >= bl->nr_entries) {
- mutex_unlock(&j->blacklist_lock);
- break;
- }
- n = bl->entries[i];
- mutex_unlock(&j->blacklist_lock);
-redo_wait:
- mutex_lock(&c->btree_interior_update_lock);
-
- /*
- * Is the node on the list of pending interior node updates -
- * being freed? If so, wait for that to finish:
- */
- for_each_pending_btree_node_free(c, as, d)
- if (n.seq == d->seq &&
- n.btree_id == d->btree_id &&
- !d->level &&
- !bkey_cmp(n.pos, d->key.k.p)) {
- closure_wait(&as->wait, &cl);
- mutex_unlock(&c->btree_interior_update_lock);
- closure_sync(&cl);
- goto redo_wait;
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
- }
-
- mutex_lock(&j->blacklist_lock);
-
- bch2_journal_pin_drop(j, &bl->pin);
- list_del(&bl->list);
- kfree(bl->entries);
- kfree(bl);
-
- mutex_unlock(&j->blacklist_lock);
-}
-
-static struct journal_seq_blacklist *
-journal_seq_blacklist_find(struct journal *j, u64 seq)
-{
- struct journal_seq_blacklist *bl;
-
- lockdep_assert_held(&j->blacklist_lock);
-
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (seq == bl->seq)
- return bl;
-
- return NULL;
-}
-
-static struct journal_seq_blacklist *
-bch2_journal_seq_blacklisted_new(struct journal *j, u64 seq)
-{
- struct journal_seq_blacklist *bl;
-
- lockdep_assert_held(&j->blacklist_lock);
-
- /*
- * When we start the journal, bch2_journal_start() will skip over @seq:
- */
-
- bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl)
- return NULL;
-
- bl->seq = seq;
- list_add_tail(&bl->list, &j->seq_blacklist);
- return bl;
-}
-
-/*
- * Returns true if @seq is newer than the most recent journal entry that got
- * written, and data corresponding to @seq should be ignored - also marks @seq
- * as blacklisted so that on future restarts the corresponding data will still
- * be ignored:
- */
-int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
-{
- struct journal *j = &c->journal;
- struct journal_seq_blacklist *bl = NULL;
- struct blacklisted_node *n;
- u64 journal_seq, i;
- int ret = 0;
-
- if (!seq)
- return 0;
-
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
- spin_unlock(&j->lock);
-
- /* Interier updates aren't journalled: */
- BUG_ON(b->level);
- BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
-
- /*
- * Decrease this back to j->seq + 2 when we next rev the on disk format:
- * increasing it temporarily to work around bug in old kernels
- */
- bch2_fs_inconsistent_on(seq > journal_seq + 4, c,
- "bset journal seq too far in the future: %llu > %llu",
- seq, journal_seq);
-
- if (seq <= journal_seq &&
- list_empty_careful(&j->seq_blacklist))
- return 0;
-
- mutex_lock(&j->blacklist_lock);
-
- if (seq <= journal_seq) {
- bl = journal_seq_blacklist_find(j, seq);
- if (!bl)
- goto out;
- } else {
- bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
- b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
-
- for (i = journal_seq + 1; i <= seq; i++) {
- bl = journal_seq_blacklist_find(j, i) ?:
- bch2_journal_seq_blacklisted_new(j, i);
- if (!bl) {
- ret = -ENOMEM;
- goto out;
- }
- }
- }
-
- for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
- if (b->data->keys.seq == n->seq &&
- b->btree_id == n->btree_id &&
- !bkey_cmp(b->key.k.p, n->pos))
- goto found_entry;
-
- if (!bl->nr_entries ||
- is_power_of_2(bl->nr_entries)) {
- n = krealloc(bl->entries,
- max(bl->nr_entries * 2, 8UL) * sizeof(*n),
- GFP_KERNEL);
- if (!n) {
- ret = -ENOMEM;
- goto out;
- }
- bl->entries = n;
- }
-
- bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
- .seq = b->data->keys.seq,
- .btree_id = b->btree_id,
- .pos = b->key.k.p,
- };
-found_entry:
- ret = 1;
-out:
- mutex_unlock(&j->blacklist_lock);
- return ret;
-}
-
-/*
- * Journal replay/recovery:
- *
- * This code is all driven from bch2_fs_start(); we first read the journal
- * entries, do some other stuff, then we mark all the keys in the journal
- * entries (same as garbage collection would), then we replay them - reinserting
- * them into the cache in precisely the same order as they appear in the
- * journal.
- *
- * We only journal keys that go in leaf nodes, which simplifies things quite a
- * bit.
- */
-
-struct journal_list {
- struct closure cl;
- struct mutex lock;
- struct list_head *head;
- int ret;
-};
-
-#define JOURNAL_ENTRY_ADD_OK 0
-#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
-
-/*
- * Given a journal entry we just read, add it to the list of journal entries to
- * be replayed:
- */
-static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct journal_list *jlist, struct jset *j)
-{
- struct journal_replay *i, *pos;
- struct list_head *where;
- size_t bytes = vstruct_bytes(j);
- __le64 last_seq;
- int ret;
-
- last_seq = !list_empty(jlist->head)
- ? list_last_entry(jlist->head, struct journal_replay,
- list)->j.last_seq
- : 0;
-
- /* Is this entry older than the range we need? */
- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
- }
-
- /* Drop entries we don't need anymore */
- list_for_each_entry_safe(i, pos, jlist->head, list) {
- if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
- break;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
-
- list_for_each_entry_reverse(i, jlist->head, list) {
- /* Duplicate? */
- if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
- fsck_err_on(bytes != vstruct_bytes(&i->j) ||
- memcmp(j, &i->j, bytes), c,
- "found duplicate but non identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
- goto found;
- }
-
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
- }
- }
-
- where = jlist->head;
-add:
- i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i) {
- ret = -ENOMEM;
- goto out;
- }
-
- list_add(&i->list, where);
- i->devs.nr = 0;
- memcpy(&i->j, j, bytes);
-found:
- if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
- bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
- else
- fsck_err_on(1, c, "duplicate journal entries on same device");
- ret = JOURNAL_ENTRY_ADD_OK;
-out:
-fsck_err:
- return ret;
-}
-
-static struct nonce journal_nonce(const struct jset *jset)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = ((__le32 *) &jset->seq)[0],
- [2] = ((__le32 *) &jset->seq)[1],
- [3] = BCH_NONCE_JOURNAL,
- }};
-}
-
-/* this fills in a range with empty jset_entries: */
-static void journal_entry_null_range(void *start, void *end)
-{
- struct jset_entry *entry;
-
- for (entry = start; entry != end; entry = vstruct_next(entry))
- memset(entry, 0, sizeof(*entry));
-}
-
-static int journal_validate_key(struct bch_fs *c, struct jset *jset,
- struct jset_entry *entry,
- struct bkey_i *k, enum bkey_type key_type,
- const char *type)
-{
- void *next = vstruct_next(entry);
- const char *invalid;
- char buf[160];
- int ret = 0;
-
- if (mustfix_fsck_err_on(!k->k.u64s, c,
- "invalid %s in journal: k->u64s 0", type)) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- if (mustfix_fsck_err_on((void *) bkey_next(k) >
- (void *) vstruct_next(entry), c,
- "invalid %s in journal: extends past end of journal entry",
- type)) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- if (mustfix_fsck_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in journal: bad format %u",
- type, k->k.format)) {
- le16_add_cpu(&entry->u64s, -k->k.u64s);
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(key_type, NULL, bkey_to_packed(k));
-
- invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k));
- if (invalid) {
- bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf),
- bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
- type, invalid, buf);
-
- le16_add_cpu(&entry->u64s, -k->k.u64s);
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-fsck_err:
- return ret;
-}
-
-#define JOURNAL_ENTRY_REREAD 5
-#define JOURNAL_ENTRY_NONE 6
-#define JOURNAL_ENTRY_BAD 7
-
-#define journal_entry_err(c, msg, ...) \
-({ \
- if (write == READ) { \
- mustfix_fsck_err(c, msg, ##__VA_ARGS__); \
- } else { \
- bch_err(c, "detected corrupt metadata before write:\n" \
- msg, ##__VA_ARGS__); \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
- goto fsck_err; \
- } \
- true; \
-})
-
-#define journal_entry_err_on(cond, c, msg, ...) \
- ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
-
-static int journal_entry_validate_entries(struct bch_fs *c, struct jset *jset,
- int write)
-{
- struct jset_entry *entry;
- int ret = 0;
-
- vstruct_for_each(jset, entry) {
- void *next = vstruct_next(entry);
- struct bkey_i *k;
-
- if (journal_entry_err_on(vstruct_next(entry) >
- vstruct_last(jset), c,
- "journal entry extends past end of jset")) {
- jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
- break;
- }
-
- switch (entry->type) {
- case JOURNAL_ENTRY_BTREE_KEYS:
- vstruct_for_each(entry, k) {
- ret = journal_validate_key(c, jset, entry, k,
- bkey_type(entry->level,
- entry->btree_id),
- "key");
- if (ret)
- goto fsck_err;
- }
- break;
-
- case JOURNAL_ENTRY_BTREE_ROOT:
- k = entry->start;
-
- if (journal_entry_err_on(!entry->u64s ||
- le16_to_cpu(entry->u64s) != k->k.u64s, c,
- "invalid btree root journal entry: wrong number of keys")) {
- /*
- * we don't want to null out this jset_entry,
- * just the contents, so that later we can tell
- * we were _supposed_ to have a btree root
- */
- entry->u64s = 0;
- journal_entry_null_range(vstruct_next(entry), next);
- continue;
- }
-
- ret = journal_validate_key(c, jset, entry, k,
- BKEY_TYPE_BTREE, "btree root");
- if (ret)
- goto fsck_err;
- break;
-
- case JOURNAL_ENTRY_PRIO_PTRS:
- break;
-
- case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED:
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
- "invalid journal seq blacklist entry: bad size")) {
- journal_entry_null_range(entry,
- vstruct_next(entry));
- }
-
- break;
- default:
- journal_entry_err(c, "invalid journal entry type %u",
- entry->type);
- journal_entry_null_range(entry, vstruct_next(entry));
- break;
- }
- }
-
-fsck_err:
- return ret;
-}
-
-static int journal_entry_validate(struct bch_fs *c,
- struct jset *jset, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read,
- int write)
-{
- size_t bytes = vstruct_bytes(jset);
- struct bch_csum csum;
- int ret = 0;
-
- if (le64_to_cpu(jset->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) {
- bch_err(c, "unknown journal entry version %u",
- le32_to_cpu(jset->version));
- return BCH_FSCK_UNKNOWN_VERSION;
- }
-
- if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
- "journal entry too big (%zu bytes), sector %lluu",
- bytes, sector)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
-
- if (bytes > sectors_read << 9)
- return JOURNAL_ENTRY_REREAD;
-
- if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
- "journal entry with unknown csum type %llu sector %lluu",
- JSET_CSUM_TYPE(jset), sector))
- return JOURNAL_ENTRY_BAD;
-
- csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
- if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
- "journal checksum bad, sector %llu", sector)) {
- /* XXX: retry IO, when we start retrying checksum errors */
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
-
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
-
- if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
- "invalid journal entry: last_seq > seq"))
- jset->last_seq = jset->seq;
-
- return 0;
-fsck_err:
- return ret;
-}
-
-struct journal_read_buf {
- void *data;
- size_t size;
-};
-
-static int journal_read_buf_realloc(struct journal_read_buf *b,
- size_t new_size)
-{
- void *n;
-
- /* the bios are sized for this many pages, max: */
- if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -ENOMEM;
-
- new_size = roundup_pow_of_two(new_size);
- n = kvpmalloc(new_size, GFP_KERNEL);
- if (!n)
- return -ENOMEM;
-
- kvpfree(b->data, b->size);
- b->data = n;
- b->size = new_size;
- return 0;
-}
-
-static int journal_read_bucket(struct bch_dev *ca,
- struct journal_read_buf *buf,
- struct journal_list *jlist,
- unsigned bucket, u64 *seq, bool *entries_found)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct bio *bio = ja->bio;
- struct jset *j = NULL;
- unsigned sectors, sectors_read = 0;
- u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
- end = offset + ca->mi.bucket_size;
- bool saw_bad = false;
- int ret = 0;
-
- pr_debug("reading %u", bucket);
-
- while (offset < end) {
- if (!sectors_read) {
-reread: sectors_read = min_t(unsigned,
- end - offset, buf->size >> 9);
-
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = offset;
- bio->bi_iter.bi_size = sectors_read << 9;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bch2_bio_map(bio, buf->data);
-
- ret = submit_bio_wait(bio);
-
- if (bch2_dev_io_err_on(ret, ca,
- "journal read from sector %llu",
- offset) ||
- bch2_meta_read_fault("journal"))
- return -EIO;
-
- j = buf->data;
- }
-
- ret = journal_entry_validate(c, j, offset,
- end - offset, sectors_read,
- READ);
- switch (ret) {
- case BCH_FSCK_OK:
- break;
- case JOURNAL_ENTRY_REREAD:
- if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(buf,
- vstruct_bytes(j));
- if (ret)
- return ret;
- }
- goto reread;
- case JOURNAL_ENTRY_NONE:
- if (!saw_bad)
- return 0;
- sectors = c->opts.block_size;
- goto next_block;
- case JOURNAL_ENTRY_BAD:
- saw_bad = true;
- sectors = c->opts.block_size;
- goto next_block;
- default:
- return ret;
- }
-
- /*
- * This happens sometimes if we don't have discards on -
- * when we've partially overwritten a bucket with new
- * journal entries. We don't need the rest of the
- * bucket:
- */
- if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
-
- ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
- mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, jlist, j);
- mutex_unlock(&jlist->lock);
-
- switch (ret) {
- case JOURNAL_ENTRY_ADD_OK:
- *entries_found = true;
- break;
- case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
- break;
- default:
- return ret;
- }
-
- if (le64_to_cpu(j->seq) > *seq)
- *seq = le64_to_cpu(j->seq);
-
- sectors = vstruct_sectors(j, c->block_bits);
-next_block:
- pr_debug("next");
- offset += sectors;
- sectors_read -= sectors;
- j = ((void *) j) + (sectors << 9);
- }
-
- return 0;
-}
-
-static void bch2_journal_read_device(struct closure *cl)
-{
-#define read_bucket(b) \
- ({ \
- bool entries_found = false; \
- ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
- &entries_found); \
- if (ret) \
- goto err; \
- __set_bit(b, bitmap); \
- entries_found; \
- })
-
- struct journal_device *ja =
- container_of(cl, struct journal_device, read);
- struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
- struct journal_list *jlist =
- container_of(cl->parent, struct journal_list, cl);
- struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
- struct journal_read_buf buf = { NULL, 0 };
-
- DECLARE_BITMAP(bitmap, ja->nr);
- unsigned i, l, r;
- u64 seq = 0;
- int ret;
-
- if (!ja->nr)
- goto out;
-
- bitmap_zero(bitmap, ja->nr);
- ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
- if (ret)
- goto err;
-
- pr_debug("%u journal buckets", ja->nr);
-
- /*
- * If the device supports discard but not secure discard, we can't do
- * the fancy fibonacci hash/binary search because the live journal
- * entries might not form a contiguous range:
- */
- for (i = 0; i < ja->nr; i++)
- read_bucket(i);
- goto search_done;
-
- if (!blk_queue_nonrot(q))
- goto linear_scan;
-
- /*
- * Read journal buckets ordered by golden ratio hash to quickly
- * find a sequence of buckets with valid journal entries
- */
- for (i = 0; i < ja->nr; i++) {
- l = (i * 2654435769U) % ja->nr;
-
- if (test_bit(l, bitmap))
- break;
-
- if (read_bucket(l))
- goto bsearch;
- }
-
- /*
- * If that fails, check all the buckets we haven't checked
- * already
- */
- pr_debug("falling back to linear search");
-linear_scan:
- for (l = find_first_zero_bit(bitmap, ja->nr);
- l < ja->nr;
- l = find_next_zero_bit(bitmap, ja->nr, l + 1))
- if (read_bucket(l))
- goto bsearch;
-
- /* no journal entries on this device? */
- if (l == ja->nr)
- goto out;
-bsearch:
- /* Binary search */
- r = find_next_bit(bitmap, ja->nr, l + 1);
- pr_debug("starting binary search, l %u r %u", l, r);
-
- while (l + 1 < r) {
- unsigned m = (l + r) >> 1;
- u64 cur_seq = seq;
-
- read_bucket(m);
-
- if (cur_seq != seq)
- l = m;
- else
- r = m;
- }
-
-search_done:
- /*
- * Find the journal bucket with the highest sequence number:
- *
- * If there's duplicate journal entries in multiple buckets (which
- * definitely isn't supposed to happen, but...) - make sure to start
- * cur_idx at the last of those buckets, so we don't deadlock trying to
- * allocate
- */
- seq = 0;
-
- for (i = 0; i < ja->nr; i++)
- if (ja->bucket_seq[i] >= seq &&
- ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
- /*
- * When journal_next_bucket() goes to allocate for
- * the first time, it'll use the bucket after
- * ja->cur_idx
- */
- ja->cur_idx = i;
- seq = ja->bucket_seq[i];
- }
-
- /*
- * Set last_idx to indicate the entire journal is full and needs to be
- * reclaimed - journal reclaim will immediately reclaim whatever isn't
- * pinned when it first runs:
- */
- ja->last_idx = (ja->cur_idx + 1) % ja->nr;
-
- /*
- * Read buckets in reverse order until we stop finding more journal
- * entries:
- */
- for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
- i != ja->cur_idx;
- i = (i + ja->nr - 1) % ja->nr)
- if (!test_bit(i, bitmap) &&
- !read_bucket(i))
- break;
-out:
- kvpfree(buf.data, buf.size);
- percpu_ref_put(&ca->io_ref);
- closure_return(cl);
-err:
- mutex_lock(&jlist->lock);
- jlist->ret = ret;
- mutex_unlock(&jlist->lock);
- goto out;
-#undef read_bucket
-}
-
-void bch2_journal_entries_free(struct list_head *list)
-{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
-}
-
-static int journal_seq_blacklist_read(struct journal *j,
- struct journal_replay *i,
- struct journal_entry_pin_list *p)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *entry;
- struct journal_seq_blacklist *bl;
- u64 seq;
-
- for_each_jset_entry_type(entry, &i->j,
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
- struct jset_entry_blacklist *bl_entry =
- container_of(entry, struct jset_entry_blacklist, entry);
- seq = le64_to_cpu(bl_entry->seq);
-
- bch_verbose(c, "blacklisting existing journal seq %llu", seq);
-
- bl = bch2_journal_seq_blacklisted_new(j, seq);
- if (!bl)
- return -ENOMEM;
-
- journal_pin_add_entry(j, p, &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- return 0;
-}
-
-static inline bool journal_has_keys(struct list_head *list)
-{
- struct journal_replay *i;
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
-
- list_for_each_entry(i, list, list)
- for_each_jset_key(k, _n, entry, &i->j)
- return true;
-
- return false;
-}
-
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct journal_list jlist;
- struct journal_replay *i;
- struct journal_entry_pin_list *p;
- struct bch_dev *ca;
- u64 cur_seq, end_seq, seq;
- unsigned iter, keys = 0, entries = 0;
- size_t nr;
- bool degraded = false;
- int ret = 0;
-
- closure_init_stack(&jlist.cl);
- mutex_init(&jlist.lock);
- jlist.head = list;
- jlist.ret = 0;
-
- for_each_member_device(ca, c, iter) {
- if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
- continue;
-
- if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
- ca->mi.state == BCH_MEMBER_STATE_RO) &&
- percpu_ref_tryget(&ca->io_ref))
- closure_call(&ca->journal.read,
- bch2_journal_read_device,
- system_unbound_wq,
- &jlist.cl);
- else
- degraded = true;
- }
-
- closure_sync(&jlist.cl);
-
- if (jlist.ret)
- return jlist.ret;
-
- if (list_empty(list)){
- bch_err(c, "no journal entries found");
- return BCH_FSCK_REPAIR_IMPOSSIBLE;
- }
-
- fsck_err_on(c->sb.clean && journal_has_keys(list), c,
- "filesystem marked clean but journal has keys to replay");
-
- list_for_each_entry(i, list, list) {
- ret = journal_entry_validate_entries(c, &i->j, READ);
- if (ret)
- goto fsck_err;
-
- /*
- * If we're mounting in degraded mode - if we didn't read all
- * the devices - this is wrong:
- */
-
- if (!degraded &&
- (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
- i->devs), c,
- "superblock not marked as containing replicas (type %u)",
- BCH_DATA_JOURNAL))) {
- ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
- if (ret)
- return ret;
- }
- }
-
- i = list_last_entry(list, struct journal_replay, list);
-
- nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
-
- if (nr > j->pin.size) {
- free_fifo(&j->pin);
- init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
- return -ENOMEM;
- }
- }
-
- atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
- j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
-
- j->pin.front = le64_to_cpu(i->j.last_seq);
- j->pin.back = le64_to_cpu(i->j.seq) + 1;
-
- fifo_for_each_entry_ptr(p, &j->pin, seq) {
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 0);
- p->devs.nr = 0;
- }
-
- mutex_lock(&j->blacklist_lock);
-
- list_for_each_entry(i, list, list) {
- p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
-
- atomic_set(&p->count, 1);
- p->devs = i->devs;
-
- if (journal_seq_blacklist_read(j, i, p)) {
- mutex_unlock(&j->blacklist_lock);
- return -ENOMEM;
- }
- }
-
- mutex_unlock(&j->blacklist_lock);
-
- cur_seq = journal_last_seq(j);
- end_seq = le64_to_cpu(list_last_entry(list,
- struct journal_replay, list)->j.seq);
-
- list_for_each_entry(i, list, list) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
- bool blacklisted;
-
- mutex_lock(&j->blacklist_lock);
- while (cur_seq < le64_to_cpu(i->j.seq) &&
- journal_seq_blacklist_find(j, cur_seq))
- cur_seq++;
-
- blacklisted = journal_seq_blacklist_find(j,
- le64_to_cpu(i->j.seq));
- mutex_unlock(&j->blacklist_lock);
-
- fsck_err_on(blacklisted, c,
- "found blacklisted journal entry %llu",
- le64_to_cpu(i->j.seq));
-
- fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- cur_seq, le64_to_cpu(i->j.seq) - 1,
- journal_last_seq(j), end_seq);
-
- cur_seq = le64_to_cpu(i->j.seq) + 1;
-
- for_each_jset_key(k, _n, entry, &i->j)
- keys++;
- entries++;
- }
-
- bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
- keys, entries, journal_cur_seq(j));
-fsck_err:
- return ret;
-}
-
-int bch2_journal_mark(struct bch_fs *c, struct list_head *list)
-{
- struct bkey_i *k, *n;
- struct jset_entry *j;
- struct journal_replay *r;
- int ret;
-
- list_for_each_entry(r, list, list)
- for_each_jset_key(k, n, j, &r->j) {
- enum bkey_type type = bkey_type(j->level, j->btree_id);
- struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
-
- if (btree_type_has_ptrs(type)) {
- ret = bch2_btree_mark_key_initial(c, type, k_s_c);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
static bool journal_entry_is_open(struct journal *j)
{
return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
@@ -1174,15 +30,15 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
- __bch2_time_stats_update(j->delay_time,
- j->need_write_time);
+ bch2_time_stats_update(j->delay_time,
+ j->need_write_time);
#if 0
- closure_call(&j->io, journal_write, NULL, NULL);
+ closure_call(&j->io, bch2_journal_write, NULL, NULL);
#else
/* Shut sparse up: */
closure_init(&j->io, NULL);
- set_closure_fn(&j->io, journal_write, NULL);
- journal_write(&j->io);
+ set_closure_fn(&j->io, bch2_journal_write, NULL);
+ bch2_journal_write(&j->io);
#endif
}
@@ -1269,8 +125,8 @@ static enum {
c->opts.block_size;
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
- journal_reclaim_fast(j);
- /* XXX: why set this here, and not in journal_write()? */
+ bch2_journal_reclaim_fast(j);
+ /* XXX: why set this here, and not in bch2_journal_write()? */
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
journal_pin_new_entry(j, 1);
@@ -1285,6 +141,8 @@ static enum {
bch2_bucket_seq_cleanup(c);
}
+ c->bucket_journal_seq++;
+
/* ugh - might be called from __journal_res_get() under wait_event() */
__set_current_state(TASK_RUNNING);
bch2_journal_buf_put(j, old.idx, need_write_just_set);
@@ -1311,96 +169,6 @@ void bch2_journal_halt(struct journal *j)
closure_wake_up(&journal_prev_buf(j)->wait);
}
-static unsigned journal_dev_buckets_available(struct journal *j,
- struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
- unsigned next = (ja->cur_idx + 1) % ja->nr;
- unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
-
- /*
- * Hack to avoid a deadlock during journal replay:
- * journal replay might require setting a new btree
- * root, which requires writing another journal entry -
- * thus, if the journal is full (and this happens when
- * replaying the first journal bucket's entries) we're
- * screwed.
- *
- * So don't let the journal fill up unless we're in
- * replay:
- */
- if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
- available = max((int) available - 2, 0);
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
- available = max((int) available - 1, 0);
-
- return available;
-}
-
-/* returns number of sectors available for next journal entry: */
-static int journal_entry_sectors(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
- unsigned sectors_available = UINT_MAX;
- unsigned i, nr_online = 0, nr_devs = 0;
-
- lockdep_assert_held(&j->lock);
-
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
- struct journal_device *ja = &ca->journal;
- unsigned buckets_required = 0;
-
- if (!ja->nr)
- continue;
-
- sectors_available = min_t(unsigned, sectors_available,
- ca->mi.bucket_size);
-
- /*
- * Note that we don't allocate the space for a journal entry
- * until we write it out - thus, if we haven't started the write
- * for the previous entry we have to make sure we have space for
- * it too:
- */
- if (bch2_extent_has_device(e.c, ca->dev_idx)) {
- if (j->prev_buf_sectors > ja->sectors_free)
- buckets_required++;
-
- if (j->prev_buf_sectors + sectors_available >
- ja->sectors_free)
- buckets_required++;
- } else {
- if (j->prev_buf_sectors + sectors_available >
- ca->mi.bucket_size)
- buckets_required++;
-
- buckets_required++;
- }
-
- if (journal_dev_buckets_available(j, ca) >= buckets_required)
- nr_devs++;
- nr_online++;
- }
- rcu_read_unlock();
-
- if (nr_online < c->opts.metadata_replicas_required)
- return -EROFS;
-
- if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
- return 0;
-
- return sectors_available;
-}
-
/*
* should _only_ called from journal_res_get() - when we actually want a
* journal reservation - journal entry is open means journal is dirty:
@@ -1425,7 +193,7 @@ static int journal_entry_open(struct journal *j)
if (!fifo_free(&j->pin))
return 0;
- sectors = journal_entry_sectors(j);
+ sectors = bch2_journal_entry_sectors(j);
if (sectors <= 0)
return sectors;
@@ -1468,8 +236,8 @@ static int journal_entry_open(struct journal *j)
old.v, new.v)) != old.v);
if (j->res_get_blocked_start)
- __bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
+ bch2_time_stats_update(j->blocked_time,
+ j->res_get_blocked_start);
j->res_get_blocked_start = 0;
mod_delayed_work(system_freezable_wq,
@@ -1479,977 +247,6 @@ static int journal_entry_open(struct journal *j)
return 1;
}
-void bch2_journal_start(struct bch_fs *c)
-{
- struct journal *j = &c->journal;
- struct journal_seq_blacklist *bl;
- u64 new_seq = 0;
-
- list_for_each_entry(bl, &j->seq_blacklist, list)
- new_seq = max(new_seq, bl->seq);
-
- spin_lock(&j->lock);
-
- set_bit(JOURNAL_STARTED, &j->flags);
-
- while (journal_cur_seq(j) < new_seq)
- journal_pin_new_entry(j, 0);
-
- /*
- * journal_buf_switch() only inits the next journal entry when it
- * closes an open journal entry - the very first journal entry gets
- * initialized here:
- */
- journal_pin_new_entry(j, 1);
- bch2_journal_buf_init(j);
-
- spin_unlock(&j->lock);
-
- /*
- * Adding entries to the next journal entry before allocating space on
- * disk for the next journal entry - this is ok, because these entries
- * only have to go down with the next journal entry we write:
- */
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (!bl->written) {
- bch2_journal_add_entry_noreservation(journal_cur_buf(j),
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
- 0, 0, &bl->seq, 1);
-
- journal_pin_add_entry(j,
- &fifo_peek_back(&j->pin),
- &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
-}
-
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct bkey_i *k, *_n;
- struct jset_entry *entry;
- struct journal_replay *i, *n;
- int ret = 0;
-
- list_for_each_entry_safe(i, n, list, list) {
- j->replay_pin_list =
- journal_seq_pin(j, le64_to_cpu(i->j.seq));
-
- for_each_jset_key(k, _n, entry, &i->j) {
-
- if (entry->btree_id == BTREE_ID_ALLOC) {
- /*
- * allocation code handles replay for
- * BTREE_ID_ALLOC keys:
- */
- ret = bch2_alloc_replay_key(c, k->k.p);
- } else {
- /*
- * We might cause compressed extents to be
- * split, so we need to pass in a
- * disk_reservation:
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
-
- ret = bch2_btree_insert(c, entry->btree_id, k,
- &disk_res, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY);
- }
-
- if (ret) {
- bch_err(c, "journal replay: error %d while replaying key",
- ret);
- goto err;
- }
-
- cond_resched();
- }
-
- if (atomic_dec_and_test(&j->replay_pin_list->count))
- journal_wake(j);
- }
-
- j->replay_pin_list = NULL;
-
- bch2_journal_set_replay_done(j);
- ret = bch2_journal_flush_all_pins(j);
-err:
- bch2_journal_entries_free(list);
- return ret;
-}
-
-static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
- bool new_fs, struct closure *cl)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- u64 *new_bucket_seq = NULL, *new_buckets = NULL;
- int ret = 0;
-
- /* don't handle reducing nr of buckets yet: */
- if (nr <= ja->nr)
- return 0;
-
- ret = -ENOMEM;
- new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq)
- goto err;
-
- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
- goto err;
-
- if (c)
- spin_lock(&c->journal.lock);
-
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
-
- if (c)
- spin_unlock(&c->journal.lock);
-
- while (ja->nr < nr) {
- struct open_bucket *ob = NULL;
- long bucket;
-
- if (new_fs) {
- bucket = bch2_bucket_alloc_new_fs(ca);
- if (bucket < 0) {
- ret = -ENOSPC;
- goto err;
- }
- } else {
- int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl);
- if (ob_idx < 0) {
- ret = cl ? -EAGAIN : -ENOSPC;
- goto err;
- }
-
- ob = c->open_buckets + ob_idx;
- bucket = sector_to_bucket(ca, ob->ptr.offset);
- }
-
- if (c)
- spin_lock(&c->journal.lock);
-
- __array_insert_item(ja->buckets, ja->nr, ja->last_idx);
- __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
- __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
-
- ja->buckets[ja->last_idx] = bucket;
- ja->bucket_seq[ja->last_idx] = 0;
- journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
-
- if (ja->last_idx < ja->nr) {
- if (ja->cur_idx >= ja->last_idx)
- ja->cur_idx++;
- ja->last_idx++;
- }
- ja->nr++;
-
- if (c)
- spin_unlock(&c->journal.lock);
-
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- new_fs
- ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
- : 0);
-
- if (!new_fs)
- bch2_open_bucket_put(c, ob);
- }
-
- ret = 0;
-err:
- kfree(new_bucket_seq);
- kfree(new_buckets);
-
- return ret;
-}
-
-/*
- * Allocate more journal space at runtime - not currently making use if it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
-{
- struct journal_device *ja = &ca->journal;
- struct closure cl;
- unsigned current_nr;
- int ret;
-
- closure_init_stack(&cl);
-
- do {
- struct disk_reservation disk_res = { 0, 0 };
-
- closure_sync(&cl);
-
- mutex_lock(&c->sb_lock);
- current_nr = ja->nr;
-
- /*
- * note: journal buckets aren't really counted as _sectors_ used yet, so
- * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
- * when space used goes up without a reservation - but we do need the
- * reservation to ensure we'll actually be able to allocate:
- */
-
- if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
- mutex_unlock(&c->sb_lock);
- return -ENOSPC;
- }
-
- ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
-
- bch2_disk_reservation_put(c, &disk_res);
-
- if (ja->nr != current_nr)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- } while (ret == -EAGAIN);
-
- return ret;
-}
-
-int bch2_dev_journal_alloc(struct bch_dev *ca)
-{
- unsigned nr;
-
- if (dynamic_fault("bcachefs:add:journal_alloc"))
- return -ENOMEM;
-
- /*
- * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
- * is smaller:
- */
- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
- BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size));
-
- return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
-}
-
-/* Journalling */
-
-/**
- * journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Cleans up after btree
- * write completions by advancing the journal pin and each cache's last_idx,
- * kicking off discards and background reclaim as necessary.
- */
-static void journal_reclaim_fast(struct journal *j)
-{
- struct journal_entry_pin_list temp;
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!fifo_pop(&j->pin, temp));
- popped = true;
- }
-
- if (popped)
- journal_wake(j);
-}
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, marking it as dirty:
- */
-
-static inline void __journal_pin_add(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- BUG_ON(journal_pin_active(pin));
- BUG_ON(!atomic_read(&pin_list->count));
-
- atomic_inc(&pin_list->count);
- pin->pin_list = pin_list;
- pin->flush = flush_fn;
-
- if (flush_fn)
- list_add(&pin->list, &pin_list->list);
- else
- INIT_LIST_HEAD(&pin->list);
-
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- journal_wake(j);
-}
-
-static void journal_pin_add_entry(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_add(struct journal *j,
- struct journal_res *res,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- struct journal_entry_pin_list *pin_list = res->ref
- ? journal_seq_pin(j, res->seq)
- : j->replay_pin_list;
-
- spin_lock(&j->lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock(&j->lock);
-}
-
-static inline void __journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct journal_entry_pin_list *pin_list = pin->pin_list;
-
- if (!journal_pin_active(pin))
- return;
-
- pin->pin_list = NULL;
- list_del_init(&pin->list);
-
- /*
- * Unpinning a journal entry make make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- */
- if (atomic_dec_and_test(&pin_list->count) &&
- pin_list == &fifo_peek_front(&j->pin))
- journal_reclaim_fast(j);
-}
-
-void bch2_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- spin_lock(&j->lock);
- __journal_pin_drop(j, pin);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_add_if_older(struct journal *j,
- struct journal_entry_pin *src_pin,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- if (journal_pin_active(src_pin) &&
- (!journal_pin_active(pin) ||
- journal_pin_seq(j, src_pin->pin_list) <
- journal_pin_seq(j, pin->pin_list))) {
- __journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
- }
-
- spin_unlock(&j->lock);
-}
-
-static struct journal_entry_pin *
-__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret;
- u64 iter;
-
- /* no need to iterate over empty fifo entries: */
- journal_reclaim_fast(j);
-
- fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
- if (iter > seq_to_flush)
- break;
-
- ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list);
- if (ret) {
- /* must be list_del_init(), see bch2_journal_pin_drop() */
- list_move(&ret->list, &pin_list->flushed);
- *seq = iter;
- return ret;
- }
- }
-
- return NULL;
-}
-
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
-{
- struct journal_entry_pin *ret;
-
- spin_lock(&j->lock);
- ret = __journal_get_next_pin(j, seq_to_flush, seq);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-static int journal_flush_done(struct journal *j, u64 seq_to_flush,
- struct journal_entry_pin **pin,
- u64 *pin_seq)
-{
- int ret;
-
- *pin = NULL;
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- spin_lock(&j->lock);
- /*
- * If journal replay hasn't completed, the unreplayed journal entries
- * hold refs on their corresponding sequence numbers
- */
- ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
- !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
- journal_last_seq(j) > seq_to_flush ||
- (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin *pin;
- u64 pin_seq;
- bool flush;
-
- if (!test_bit(JOURNAL_STARTED, &j->flags))
- return 0;
-again:
- wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
- if (pin) {
- /* flushing a journal pin might cause a new one to be added: */
- pin->flush(j, pin, pin_seq);
- goto again;
- }
-
- spin_lock(&j->lock);
- flush = journal_last_seq(j) != j->last_seq_ondisk ||
- (seq_to_flush == U64_MAX && c->btree_roots_dirty);
- spin_unlock(&j->lock);
-
- return flush ? bch2_journal_meta(j) : 0;
-}
-
-int bch2_journal_flush_all_pins(struct journal *j)
-{
- return bch2_journal_flush_pins(j, U64_MAX);
-}
-
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- bool ret;
-
- spin_lock(&j->lock);
- ret = ja->nr &&
- (ja->last_idx != ja->cur_idx &&
- ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/**
- * journal_reclaim_work - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-static void journal_reclaim_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(to_delayed_work(work),
- struct bch_fs, journal.reclaim_work);
- struct journal *j = &c->journal;
- struct bch_dev *ca;
- struct journal_entry_pin *pin;
- u64 seq, seq_to_flush = 0;
- unsigned iter, bucket_to_flush;
- unsigned long next_flush;
- bool reclaim_lock_held = false, need_flush;
-
- /*
- * Advance last_idx to point to the oldest journal entry containing
- * btree node updates that have not yet been written out
- */
- for_each_rw_member(ca, c, iter) {
- struct journal_device *ja = &ca->journal;
-
- if (!ja->nr)
- continue;
-
- while (should_discard_bucket(j, ja)) {
- if (!reclaim_lock_held) {
- /*
- * ugh:
- * might be called from __journal_res_get()
- * under wait_event() - have to go back to
- * TASK_RUNNING before doing something that
- * would block, but only if we're doing work:
- */
- __set_current_state(TASK_RUNNING);
-
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- /* recheck under reclaim_lock: */
- continue;
- }
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->last_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) % ja->nr;
- spin_unlock(&j->lock);
-
- journal_wake(j);
- }
-
- /*
- * Write out enough btree nodes to free up 50% journal
- * buckets
- */
- spin_lock(&j->lock);
- bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
- seq_to_flush = max_t(u64, seq_to_flush,
- ja->bucket_seq[bucket_to_flush]);
- spin_unlock(&j->lock);
- }
-
- if (reclaim_lock_held)
- mutex_unlock(&j->reclaim_lock);
-
- /* Also flush if the pin fifo is more than half full */
- spin_lock(&j->lock);
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
-
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
- need_flush = time_after(jiffies, next_flush);
-
- while ((pin = journal_get_next_pin(j, need_flush
- ? U64_MAX
- : seq_to_flush, &seq))) {
- __set_current_state(TASK_RUNNING);
- pin->flush(j, pin, seq);
- need_flush = false;
-
- j->last_flushed = jiffies;
- }
-
- if (!test_bit(BCH_FS_RO, &c->flags))
- queue_delayed_work(system_freezable_wq, &j->reclaim_work,
- msecs_to_jiffies(j->reclaim_delay_ms));
-}
-
-/**
- * journal_next_bucket - move on to the next journal bucket if possible
- */
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
- unsigned sectors)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
- struct journal_device *ja;
- struct bch_dev *ca;
- struct dev_alloc_list devs_sorted;
- unsigned i, replicas, replicas_want =
- READ_ONCE(c->opts.metadata_replicas);
-
- spin_lock(&j->lock);
- e = bkey_i_to_s_extent(&j->key);
-
- /*
- * Drop any pointers to devices that have been removed, are no longer
- * empty, or filled up their current journal bucket:
- *
- * Note that a device may have had a small amount of free space (perhaps
- * one sector) that wasn't enough for the smallest possible journal
- * entry - that's why we drop pointers to devices <= current free space,
- * i.e. whichever device was limiting the current journal entry size.
- */
- extent_for_each_ptr_backwards(e, ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
-
- if (ca->mi.state != BCH_MEMBER_STATE_RW ||
- ca->journal.sectors_free <= sectors)
- __bch2_extent_drop_ptr(e, ptr);
- else
- ca->journal.sectors_free -= sectors;
- }
-
- replicas = bch2_extent_nr_ptrs(e.c);
-
- rcu_read_lock();
- devs_sorted = bch2_wp_alloc_list(c, &j->wp,
- &c->rw_devs[BCH_DATA_JOURNAL]);
-
- for (i = 0; i < devs_sorted.nr; i++) {
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
- if (!ca)
- continue;
-
- if (!ca->mi.durability)
- continue;
-
- ja = &ca->journal;
- if (!ja->nr)
- continue;
-
- if (replicas >= replicas_want)
- break;
-
- /*
- * Check that we can use this device, and aren't already using
- * it:
- */
- if (bch2_extent_has_device(e.c, ca->dev_idx) ||
- !journal_dev_buckets_available(j, ca) ||
- sectors > ca->mi.bucket_size)
- continue;
-
- j->wp.next_alloc[ca->dev_idx] += U32_MAX;
- bch2_wp_rescale(c, ca, &j->wp);
-
- ja->sectors_free = ca->mi.bucket_size - sectors;
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
-
- extent_ptr_append(bkey_i_to_extent(&j->key),
- (struct bch_extent_ptr) {
- .offset = bucket_to_sector(ca,
- ja->buckets[ja->cur_idx]),
- .dev = ca->dev_idx,
- });
-
- replicas += ca->mi.durability;
- }
- rcu_read_unlock();
-
- j->prev_buf_sectors = 0;
-
- bkey_copy(&w->key, &j->key);
- spin_unlock(&j->lock);
-
- if (replicas < c->opts.metadata_replicas_required)
- return -EROFS;
-
- BUG_ON(!replicas);
-
- return 0;
-}
-
-static void journal_write_compact(struct jset *jset)
-{
- struct jset_entry *i, *next, *prev = NULL;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each_safe(jset, i, next) {
- unsigned u64s = le16_to_cpu(i->u64s);
-
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- i->type == prev->type &&
- i->type == JOURNAL_ENTRY_BTREE_KEYS &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
- }
-
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
-static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
-{
- /* we aren't holding j->lock: */
- unsigned new_size = READ_ONCE(j->buf_size_want);
- void *new_buf;
-
- if (buf->size >= new_size)
- return;
-
- new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
- if (!new_buf)
- return;
-
- memcpy(new_buf, buf->data, buf->size);
- kvpfree(buf->data, buf->size);
- buf->data = new_buf;
- buf->size = new_size;
-}
-
-static void journal_write_done(struct closure *cl)
-{
- struct journal *j = container_of(cl, struct journal, io);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_prev_buf(j);
- struct bch_devs_list devs =
- bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
-
- if (!devs.nr) {
- bch_err(c, "unable to write journal to sufficient devices");
- goto err;
- }
-
- if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
- goto err;
-out:
- __bch2_time_stats_update(j->write_time, j->write_start_time);
-
- spin_lock(&j->lock);
- j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
-
- journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = devs;
-
- /*
- * Updating last_seq_ondisk may let journal_reclaim_work() discard more
- * buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
-
- /* also must come before signalling write completion: */
- closure_debug_destroy(cl);
-
- BUG_ON(!j->reservations.prev_buf_unwritten);
- atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
- &j->reservations.counter);
-
- closure_wake_up(&w->wait);
- journal_wake(j);
-
- if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
- mod_delayed_work(system_freezable_wq, &j->write_work, 0);
- spin_unlock(&j->lock);
- return;
-err:
- bch2_fatal_error(c);
- bch2_journal_halt(j);
- goto out;
-}
-
-static void journal_write_endio(struct bio *bio)
-{
- struct bch_dev *ca = bio->bi_private;
- struct journal *j = &ca->fs->journal;
-
- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
- bch2_meta_write_fault("journal")) {
- struct journal_buf *w = journal_prev_buf(j);
- unsigned long flags;
-
- spin_lock_irqsave(&j->err_lock, flags);
- bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx);
- spin_unlock_irqrestore(&j->err_lock, flags);
- }
-
- closure_put(&j->io);
- percpu_ref_put(&ca->io_ref);
-}
-
-static void journal_write(struct closure *cl)
-{
- struct journal *j = container_of(cl, struct journal, io);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_prev_buf(j);
- struct jset *jset;
- struct bio *bio;
- struct bch_extent_ptr *ptr;
- unsigned i, sectors, bytes;
-
- journal_buf_realloc(j, w);
- jset = w->data;
-
- j->write_start_time = local_clock();
- mutex_lock(&c->btree_root_lock);
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_root *r = &c->btree_roots[i];
-
- if (r->alive)
- bch2_journal_add_btree_root(w, i, &r->key, r->level);
- }
- c->btree_roots_dirty = false;
- mutex_unlock(&c->btree_root_lock);
-
- journal_write_compact(jset);
-
- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
- jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
-
- SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
- SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
-
- if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- journal_entry_validate_entries(c, jset, WRITE))
- goto err;
-
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
-
- jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
- journal_nonce(jset), jset);
-
- if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- journal_entry_validate_entries(c, jset, WRITE))
- goto err;
-
- sectors = vstruct_sectors(jset, c->block_bits);
- BUG_ON(sectors > j->prev_buf_sectors);
-
- bytes = vstruct_bytes(w->data);
- memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
-
- if (journal_write_alloc(j, w, sectors)) {
- bch2_journal_halt(j);
- bch_err(c, "Unable to allocate journal write");
- bch2_fatal_error(c);
- continue_at(cl, journal_write_done, system_highpri_wq);
- }
-
- /*
- * XXX: we really should just disable the entire journal in nochanges
- * mode
- */
- if (c->opts.nochanges)
- goto no_io;
-
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!percpu_ref_tryget(&ca->io_ref)) {
- /* XXX: fix this */
- bch_err(c, "missing device for journal write\n");
- continue;
- }
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL],
- sectors);
-
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_iter.bi_size = sectors << 9;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE,
- REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch2_bio_map(bio, jset);
-
- trace_journal_write(bio);
- closure_bio_submit(bio, cl);
-
- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
- }
-
- for_each_rw_member(ca, c, i)
- if (journal_flushes_device(ca) &&
- !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
- percpu_ref_get(&ca->io_ref);
-
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_FLUSH;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
-
-no_io:
- extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
- ptr->offset += sectors;
-
- continue_at(cl, journal_write_done, system_highpri_wq);
-err:
- bch2_inconsistent_error(c);
- continue_at(cl, journal_write_done, system_highpri_wq);
-}
-
/*
* returns true if there's nothing to flush and no journal write still in flight
*/
@@ -2572,7 +369,7 @@ retry:
* Direct reclaim - can't rely on reclaim from work item
* due to freezing..
*/
- journal_reclaim_work(&j->reclaim_work.work);
+ bch2_journal_reclaim_work(&j->reclaim_work.work);
trace_journal_full(c);
blocked:
@@ -2615,6 +412,14 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
return seq;
}
+/**
+ * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
+ * open yet, or wait if we cannot
+ *
+ * used by the btree interior update machinery, when it needs to write a new
+ * btree root - every journal entry contains the roots of all the btrees, so it
+ * doesn't need to bother with getting a journal reservation
+ */
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
{
int ret;
@@ -2634,11 +439,19 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare
spin_unlock(&j->lock);
if (!ret)
- journal_reclaim_work(&j->reclaim_work.work);
+ bch2_journal_reclaim_work(&j->reclaim_work.work);
return ret;
}
+/**
+ * bch2_journal_wait_on_seq - wait for a journal entry to be written
+ *
+ * does _not_ cause @seq to be written immediately - if there is no other
+ * activity to cause the relevant journal entry to be filled up or flushed it
+ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
+ * configurable).
+ */
void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
{
spin_lock(&j->lock);
@@ -2669,6 +482,12 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent
spin_unlock(&j->lock);
}
+/**
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ *
+ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * necessary
+ */
void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
{
struct journal_buf *buf;
@@ -2787,6 +606,9 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
return ret ?: ret2 < 0 ? ret2 : 0;
}
+/**
+ * bch2_journal_meta_async - force a journal entry to be written
+ */
void bch2_journal_meta_async(struct journal *j, struct closure *parent)
{
struct journal_res res;
@@ -2817,6 +639,10 @@ int bch2_journal_meta(struct journal *j)
return bch2_journal_flush_seq(j, res.seq);
}
+/*
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
+ * still being written, write it and wait for the write to complete
+ */
void bch2_journal_flush_async(struct journal *j, struct closure *parent)
{
u64 seq, journal_seq;
@@ -2857,49 +683,170 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
-int bch2_journal_flush_device(struct journal *j, int dev_idx)
+/* allocate journal on a device: */
+
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
+ bool new_fs, struct closure *cl)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin_list *p;
- struct bch_devs_list devs;
- u64 iter, seq = 0;
+ struct bch_fs *c = ca->fs;
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets;
+ u64 *new_bucket_seq = NULL, *new_buckets = NULL;
int ret = 0;
- spin_lock(&j->lock);
- fifo_for_each_entry_ptr(p, &j->pin, iter)
- if (dev_idx >= 0
- ? bch2_dev_list_has_dev(p->devs, dev_idx)
- : p->devs.nr < c->opts.metadata_replicas)
- seq = iter;
- spin_unlock(&j->lock);
+ /* don't handle reducing nr of buckets yet: */
+ if (nr <= ja->nr)
+ return 0;
- ret = bch2_journal_flush_pins(j, seq);
- if (ret)
- return ret;
+ ret = -ENOMEM;
+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+ if (!new_buckets || !new_bucket_seq)
+ goto err;
+
+ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
+ nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (!journal_buckets)
+ goto err;
+
+ if (c)
+ spin_lock(&c->journal.lock);
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
- seq = 0;
+ if (c)
+ spin_unlock(&c->journal.lock);
- spin_lock(&j->lock);
- while (!ret && seq < j->pin.back) {
- seq = max(seq, journal_last_seq(j));
- devs = journal_seq_pin(j, seq)->devs;
- seq++;
+ while (ja->nr < nr) {
+ struct open_bucket *ob = NULL;
+ long bucket;
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
- spin_lock(&j->lock);
+ if (new_fs) {
+ bucket = bch2_bucket_alloc_new_fs(ca);
+ if (bucket < 0) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ } else {
+ int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl);
+ if (ob_idx < 0) {
+ ret = cl ? -EAGAIN : -ENOSPC;
+ goto err;
+ }
+
+ ob = c->open_buckets + ob_idx;
+ bucket = sector_to_bucket(ca, ob->ptr.offset);
+ }
+
+ if (c)
+ spin_lock(&c->journal.lock);
+
+ __array_insert_item(ja->buckets, ja->nr, ja->last_idx);
+ __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
+ __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
+
+ ja->buckets[ja->last_idx] = bucket;
+ ja->bucket_seq[ja->last_idx] = 0;
+ journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
+
+ if (ja->last_idx < ja->nr) {
+ if (ja->cur_idx >= ja->last_idx)
+ ja->cur_idx++;
+ ja->last_idx++;
+ }
+ ja->nr++;
+
+ if (c)
+ spin_unlock(&c->journal.lock);
+
+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB),
+ new_fs
+ ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
+ : 0);
+
+ if (!new_fs)
+ bch2_open_bucket_put(c, ob);
}
- spin_unlock(&j->lock);
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
+ ret = 0;
+err:
+ kfree(new_bucket_seq);
+ kfree(new_buckets);
+
+ return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use if it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr)
+{
+ struct journal_device *ja = &ca->journal;
+ struct closure cl;
+ unsigned current_nr;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ do {
+ struct disk_reservation disk_res = { 0, 0 };
+
+ closure_sync(&cl);
+
+ mutex_lock(&c->sb_lock);
+ current_nr = ja->nr;
+
+ /*
+ * note: journal buckets aren't really counted as _sectors_ used yet, so
+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+ * when space used goes up without a reservation - but we do need the
+ * reservation to ensure we'll actually be able to allocate:
+ */
+
+ if (bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+ mutex_unlock(&c->sb_lock);
+ return -ENOSPC;
+ }
+
+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+
+ bch2_disk_reservation_put(c, &disk_res);
+
+ if (ja->nr != current_nr)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ } while (ret == -EAGAIN);
return ret;
}
+int bch2_dev_journal_alloc(struct bch_dev *ca)
+{
+ unsigned nr;
+
+ if (dynamic_fault("bcachefs:add:journal_alloc"))
+ return -ENOMEM;
+
+ /*
+ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+ * is smaller:
+ */
+ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ BCH_JOURNAL_BUCKETS_MIN,
+ min(1 << 10,
+ (1 << 20) / ca->mi.bucket_size));
+
+ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+}
+
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
@@ -2936,6 +883,43 @@ void bch2_fs_journal_stop(struct journal *j)
cancel_delayed_work_sync(&j->reclaim_work);
}
+void bch2_fs_journal_start(struct journal *j)
+{
+ struct journal_seq_blacklist *bl;
+ u64 blacklist = 0;
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ blacklist = max(blacklist, bl->end);
+
+ spin_lock(&j->lock);
+
+ set_bit(JOURNAL_STARTED, &j->flags);
+
+ while (journal_cur_seq(j) < blacklist)
+ journal_pin_new_entry(j, 0);
+
+ /*
+ * journal_buf_switch() only inits the next journal entry when it
+ * closes an open journal entry - the very first journal entry gets
+ * initialized here:
+ */
+ journal_pin_new_entry(j, 1);
+ bch2_journal_buf_init(j);
+
+ spin_unlock(&j->lock);
+
+ /*
+ * Adding entries to the next journal entry before allocating space on
+ * disk for the next journal entry - this is ok, because these entries
+ * only have to go down with the next journal entry we write:
+ */
+ bch2_journal_seq_blacklist_write(j);
+
+ queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+}
+
+/* init/exit: */
+
void bch2_dev_journal_exit(struct bch_dev *ca)
{
kfree(ca->journal.bio);
@@ -2994,7 +978,7 @@ int bch2_fs_journal_init(struct journal *j)
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
mutex_init(&j->blacklist_lock);
INIT_LIST_HEAD(&j->seq_blacklist);
mutex_init(&j->reclaim_lock);
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index cf5cc9b..4cec7bb 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -112,72 +112,37 @@
#include "journal_types.h"
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
- struct list_head list;
- struct bch_devs_list devs;
- /* must be last: */
- struct jset j;
-};
-
-static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
- struct jset_entry *entry, unsigned type)
-{
- while (entry < vstruct_last(jset)) {
- if (entry->type == type)
- return entry;
-
- entry = vstruct_next(entry);
- }
+struct bch_fs;
- return NULL;
+static inline void journal_wake(struct journal *j)
+{
+ wake_up(&j->wait);
+ closure_wake_up(&j->async_wait);
}
-#define for_each_jset_entry_type(entry, jset, type) \
- for (entry = (jset)->start; \
- (entry = __jset_entry_type_next(jset, entry, type)); \
- entry = vstruct_next(entry))
-
-#define for_each_jset_key(k, _n, entry, jset) \
- for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
- vstruct_for_each_safe(entry, k, _n)
-
-#define JOURNAL_PIN (32 * 1024)
-
-static inline bool journal_pin_active(struct journal_entry_pin *pin)
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
{
- return pin->pin_list != NULL;
+ return j->buf + j->reservations.idx;
}
-static inline struct journal_entry_pin_list *
-journal_seq_pin(struct journal *j, u64 seq)
+static inline struct journal_buf *journal_prev_buf(struct journal *j)
{
- return &j->pin.data[seq & j->pin.mask];
+ return j->buf + !j->reservations.idx;
}
-u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
+/* Sequence number of oldest dirty journal entry */
-void bch2_journal_pin_add(struct journal *, struct journal_res *,
- struct journal_entry_pin *, journal_pin_flush_fn);
-void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void bch2_journal_pin_add_if_older(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-int bch2_journal_flush_pins(struct journal *, u64);
-int bch2_journal_flush_all_pins(struct journal *);
-
-struct closure;
-struct bch_fs;
-struct keylist;
+static inline u64 journal_last_seq(struct journal *j)
+{
+ return j->pin.front;
+}
-struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
- enum btree_id, unsigned *);
+static inline u64 journal_cur_seq(struct journal *j)
+{
+ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
-int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
+ return j->pin.back - 1;
+}
u64 bch2_inode_journal_seq(struct journal *, u64);
@@ -213,21 +178,18 @@ static inline unsigned jset_u64s(unsigned u64s)
return u64s + sizeof(struct jset_entry) / sizeof(u64);
}
-static inline void bch2_journal_add_entry_at(struct journal_buf *buf,
- unsigned offset,
- unsigned type, enum btree_id id,
- unsigned level,
- const void *data, size_t u64s)
+static inline struct jset_entry *
+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
{
- struct jset_entry *entry = vstruct_idx(buf->data, offset);
+ struct jset *jset = buf->data;
+ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
memset(entry, 0, sizeof(*entry));
- entry->u64s = cpu_to_le16(u64s);
- entry->btree_id = id;
- entry->level = level;
- entry->type = type;
+ entry->u64s = cpu_to_le16(u64s);
- memcpy_u64s(entry->_data, data, u64s);
+ le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+
+ return entry;
}
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
@@ -236,21 +198,27 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res
const void *data, unsigned u64s)
{
struct journal_buf *buf = &j->buf[res->idx];
+ struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
unsigned actual = jset_u64s(u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
- bch2_journal_add_entry_at(buf, res->offset, type,
- id, level, data, u64s);
res->offset += actual;
res->u64s -= actual;
+
+ memset(entry, 0, sizeof(*entry));
+ entry->u64s = cpu_to_le16(u64s);
+ entry->type = type;
+ entry->btree_id = id;
+ entry->level = level;
+ memcpy_u64s(entry->_data, data, u64s);
}
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
enum btree_id id, const struct bkey_i *k)
{
- bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS,
+ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
id, 0, k, k->k.u64s);
}
@@ -292,7 +260,7 @@ static inline void bch2_journal_res_put(struct journal *j,
while (res->u64s)
bch2_journal_add_entry(j, res,
- JOURNAL_ENTRY_BTREE_KEYS,
+ BCH_JSET_ENTRY_btree_keys,
0, 0, NULL, 0);
bch2_journal_buf_put(j, res->idx, false);
@@ -368,7 +336,6 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
-int bch2_journal_flush_device(struct journal *, int);
void bch2_journal_halt(struct journal *);
@@ -385,10 +352,8 @@ static inline bool journal_flushes_device(struct bch_dev *ca)
return true;
}
-void bch2_journal_start(struct bch_fs *);
int bch2_journal_mark(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
-int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);
static inline void bch2_journal_set_replay_done(struct journal *j)
@@ -404,6 +369,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
+void bch2_fs_journal_start(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
new file mode 100644
index 0000000..2fd0d64
--- /dev/null
+++ b/libbcachefs/journal_io.c
@@ -0,0 +1,1423 @@
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "replicas.h"
+
+#include <trace/events/bcachefs.h>
+
+static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type,
+ enum btree_id id)
+{
+ struct jset_entry *entry;
+
+ for_each_jset_entry_type(entry, j, type)
+ if (entry->btree_id == id)
+ return entry;
+
+ return NULL;
+}
+
+struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry =
+ bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id);
+
+ if (!entry)
+ return NULL;
+
+ if (!entry->u64s)
+ return ERR_PTR(-EINVAL);
+
+ k = entry->start;
+ *level = entry->level;
+ *level = entry->level;
+ return k;
+}
+
+struct journal_list {
+ struct closure cl;
+ struct mutex lock;
+ struct list_head *head;
+ int ret;
+};
+
+#define JOURNAL_ENTRY_ADD_OK 0
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
+
+/*
+ * Given a journal entry we just read, add it to the list of journal entries to
+ * be replayed:
+ */
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct journal_list *jlist, struct jset *j)
+{
+ struct journal_replay *i, *pos;
+ struct list_head *where;
+ size_t bytes = vstruct_bytes(j);
+ __le64 last_seq;
+ int ret;
+
+ last_seq = !list_empty(jlist->head)
+ ? list_last_entry(jlist->head, struct journal_replay,
+ list)->j.last_seq
+ : 0;
+
+ /* Is this entry older than the range we need? */
+ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+ goto out;
+ }
+
+ /* Drop entries we don't need anymore */
+ list_for_each_entry_safe(i, pos, jlist->head, list) {
+ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+ break;
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+ }
+
+ list_for_each_entry_reverse(i, jlist->head, list) {
+ /* Duplicate? */
+ if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
+ fsck_err_on(bytes != vstruct_bytes(&i->j) ||
+ memcmp(j, &i->j, bytes), c,
+ "found duplicate but non identical journal entries (seq %llu)",
+ le64_to_cpu(j->seq));
+ goto found;
+ }
+
+ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
+ where = &i->list;
+ goto add;
+ }
+ }
+
+ where = jlist->head;
+add:
+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ if (!i) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ list_add(&i->list, where);
+ i->devs.nr = 0;
+ memcpy(&i->j, j, bytes);
+found:
+ if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
+ bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
+ else
+ fsck_err_on(1, c, "duplicate journal entries on same device");
+ ret = JOURNAL_ENTRY_ADD_OK;
+out:
+fsck_err:
+ return ret;
+}
+
+static struct nonce journal_nonce(const struct jset *jset)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = ((__le32 *) &jset->seq)[0],
+ [2] = ((__le32 *) &jset->seq)[1],
+ [3] = BCH_NONCE_JOURNAL,
+ }};
+}
+
+/* this fills in a range with empty jset_entries: */
+static void journal_entry_null_range(void *start, void *end)
+{
+ struct jset_entry *entry;
+
+ for (entry = start; entry != end; entry = vstruct_next(entry))
+ memset(entry, 0, sizeof(*entry));
+}
+
+#define JOURNAL_ENTRY_REREAD 5
+#define JOURNAL_ENTRY_NONE 6
+#define JOURNAL_ENTRY_BAD 7
+
+#define journal_entry_err(c, msg, ...) \
+({ \
+ switch (write) { \
+ case READ: \
+ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \
+ break; \
+ case WRITE: \
+ bch_err(c, "corrupt metadata before write:\n" \
+ msg, ##__VA_ARGS__); \
+ if (bch2_fs_inconsistent(c)) { \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+ break; \
+ } \
+ true; \
+})
+
+#define journal_entry_err_on(cond, c, msg, ...) \
+ ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+
+static int journal_validate_key(struct bch_fs *c, struct jset *jset,
+ struct jset_entry *entry,
+ struct bkey_i *k, enum bkey_type key_type,
+ const char *type, int write)
+{
+ void *next = vstruct_next(entry);
+ const char *invalid;
+ char buf[160];
+ int ret = 0;
+
+ if (journal_entry_err_on(!k->k.u64s, c,
+ "invalid %s in journal: k->u64s 0", type)) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(vstruct_next(entry), next);
+ return 0;
+ }
+
+ if (journal_entry_err_on((void *) bkey_next(k) >
+ (void *) vstruct_next(entry), c,
+ "invalid %s in journal: extends past end of journal entry",
+ type)) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(vstruct_next(entry), next);
+ return 0;
+ }
+
+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
+ "invalid %s in journal: bad format %u",
+ type, k->k.format)) {
+ le16_add_cpu(&entry->u64s, -k->k.u64s);
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(vstruct_next(entry), next);
+ return 0;
+ }
+
+ if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
+ bch2_bkey_swab(key_type, NULL, bkey_to_packed(k));
+
+ invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k));
+ if (invalid) {
+ bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf),
+ bkey_i_to_s_c(k));
+ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
+ type, invalid, buf);
+
+ le16_add_cpu(&entry->u64s, -k->k.u64s);
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(vstruct_next(entry), next);
+ return 0;
+ }
+fsck_err:
+ return ret;
+}
+
+static int journal_entry_validate_btree_keys(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct bkey_i *k;
+
+ vstruct_for_each(entry, k) {
+ int ret = journal_validate_key(c, jset, entry, k,
+ bkey_type(entry->level,
+ entry->btree_id),
+ "key", write);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int journal_entry_validate_btree_root(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct bkey_i *k = entry->start;
+ int ret = 0;
+
+ if (journal_entry_err_on(!entry->u64s ||
+ le16_to_cpu(entry->u64s) != k->k.u64s, c,
+ "invalid btree root journal entry: wrong number of keys")) {
+ void *next = vstruct_next(entry);
+ /*
+ * we don't want to null out this jset_entry,
+ * just the contents, so that later we can tell
+ * we were _supposed_ to have a btree root
+ */
+ entry->u64s = 0;
+ journal_entry_null_range(vstruct_next(entry), next);
+ return 0;
+ }
+
+ return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE,
+ "btree root", write);
+fsck_err:
+ return ret;
+}
+
+static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ /* obsolete, don't care: */
+ return 0;
+}
+
+static int journal_entry_validate_blacklist(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ int ret = 0;
+
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
+ "invalid journal seq blacklist entry: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ }
+fsck_err:
+ return ret;
+}
+
+static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct jset_entry_blacklist_v2 *bl_entry;
+ int ret = 0;
+
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
+ "invalid journal seq blacklist entry: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ }
+
+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
+ le64_to_cpu(bl_entry->end), c,
+ "invalid journal seq blacklist entry: start > end")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ }
+
+fsck_err:
+ return ret;
+}
+
+struct jset_entry_ops {
+ int (*validate)(struct bch_fs *, struct jset *,
+ struct jset_entry *, int);
+};
+
+const struct jset_entry_ops bch2_jset_entry_ops[] = {
+#define x(f, nr) \
+ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
+ .validate = journal_entry_validate_##f, \
+ },
+ BCH_JSET_ENTRY_TYPES()
+#undef x
+};
+
+static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
+ struct jset_entry *entry, int write)
+{
+ int ret = 0;
+
+ if (entry->type >= BCH_JSET_ENTRY_NR) {
+ journal_entry_err(c, "invalid journal entry type %u",
+ entry->type);
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return 0;
+ }
+
+ ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write);
+fsck_err:
+ return ret;
+}
+
+static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
+ int write)
+{
+ struct jset_entry *entry;
+ int ret = 0;
+
+ vstruct_for_each(jset, entry) {
+ if (journal_entry_err_on(vstruct_next(entry) >
+ vstruct_last(jset), c,
+ "journal entry extends past end of jset")) {
+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
+ break;
+ }
+
+ ret = journal_entry_validate(c, jset, entry, write);
+ if (ret)
+ break;
+ }
+fsck_err:
+ return ret;
+}
+
+static int jset_validate(struct bch_fs *c,
+ struct jset *jset, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read,
+ int write)
+{
+ size_t bytes = vstruct_bytes(jset);
+ struct bch_csum csum;
+ int ret = 0;
+
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
+
+ if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) {
+ bch_err(c, "unknown journal entry version %u",
+ le32_to_cpu(jset->version));
+ return BCH_FSCK_UNKNOWN_VERSION;
+ }
+
+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
+ "journal entry too big (%zu bytes), sector %lluu",
+ bytes, sector)) {
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ if (bytes > sectors_read << 9)
+ return JOURNAL_ENTRY_REREAD;
+
+ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+ "journal entry with unknown csum type %llu sector %lluu",
+ JSET_CSUM_TYPE(jset), sector))
+ return JOURNAL_ENTRY_BAD;
+
+ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
+ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
+ "journal checksum bad, sector %llu", sector)) {
+ /* XXX: retry IO, when we start retrying checksum errors */
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ jset->encrypted_start,
+ vstruct_end(jset) - (void *) jset->encrypted_start);
+
+ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+ "invalid journal entry: last_seq > seq"))
+ jset->last_seq = jset->seq;
+
+ return 0;
+fsck_err:
+ return ret;
+}
+
+struct journal_read_buf {
+ void *data;
+ size_t size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+ size_t new_size)
+{
+ void *n;
+
+ /* the bios are sized for this many pages, max: */
+ if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+ return -ENOMEM;
+
+ new_size = roundup_pow_of_two(new_size);
+ n = kvpmalloc(new_size, GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ kvpfree(b->data, b->size);
+ b->data = n;
+ b->size = new_size;
+ return 0;
+}
+
+static int journal_read_bucket(struct bch_dev *ca,
+ struct journal_read_buf *buf,
+ struct journal_list *jlist,
+ unsigned bucket, u64 *seq, bool *entries_found)
+{
+ struct bch_fs *c = ca->fs;
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = ja->bio;
+ struct jset *j = NULL;
+ unsigned sectors, sectors_read = 0;
+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+ end = offset + ca->mi.bucket_size;
+ bool saw_bad = false;
+ int ret = 0;
+
+ pr_debug("reading %u", bucket);
+
+ while (offset < end) {
+ if (!sectors_read) {
+reread: sectors_read = min_t(unsigned,
+ end - offset, buf->size >> 9);
+
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_iter.bi_sector = offset;
+ bio->bi_iter.bi_size = sectors_read << 9;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bch2_bio_map(bio, buf->data);
+
+ ret = submit_bio_wait(bio);
+
+ if (bch2_dev_io_err_on(ret, ca,
+ "journal read from sector %llu",
+ offset) ||
+ bch2_meta_read_fault("journal"))
+ return -EIO;
+
+ j = buf->data;
+ }
+
+ ret = jset_validate(c, j, offset,
+ end - offset, sectors_read,
+ READ);
+ switch (ret) {
+ case BCH_FSCK_OK:
+ break;
+ case JOURNAL_ENTRY_REREAD:
+ if (vstruct_bytes(j) > buf->size) {
+ ret = journal_read_buf_realloc(buf,
+ vstruct_bytes(j));
+ if (ret)
+ return ret;
+ }
+ goto reread;
+ case JOURNAL_ENTRY_NONE:
+ if (!saw_bad)
+ return 0;
+ sectors = c->opts.block_size;
+ goto next_block;
+ case JOURNAL_ENTRY_BAD:
+ saw_bad = true;
+ sectors = c->opts.block_size;
+ goto next_block;
+ default:
+ return ret;
+ }
+
+ /*
+ * This happens sometimes if we don't have discards on -
+ * when we've partially overwritten a bucket with new
+ * journal entries. We don't need the rest of the
+ * bucket:
+ */
+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+ return 0;
+
+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+
+ mutex_lock(&jlist->lock);
+ ret = journal_entry_add(c, ca, jlist, j);
+ mutex_unlock(&jlist->lock);
+
+ switch (ret) {
+ case JOURNAL_ENTRY_ADD_OK:
+ *entries_found = true;
+ break;
+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+ break;
+ default:
+ return ret;
+ }
+
+ if (le64_to_cpu(j->seq) > *seq)
+ *seq = le64_to_cpu(j->seq);
+
+ sectors = vstruct_sectors(j, c->block_bits);
+next_block:
+ pr_debug("next");
+ offset += sectors;
+ sectors_read -= sectors;
+ j = ((void *) j) + (sectors << 9);
+ }
+
+ return 0;
+}
+
+static void bch2_journal_read_device(struct closure *cl)
+{
+#define read_bucket(b) \
+ ({ \
+ bool entries_found = false; \
+ ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
+ &entries_found); \
+ if (ret) \
+ goto err; \
+ __set_bit(b, bitmap); \
+ entries_found; \
+ })
+
+ struct journal_device *ja =
+ container_of(cl, struct journal_device, read);
+ struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+ struct journal_list *jlist =
+ container_of(cl->parent, struct journal_list, cl);
+ struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+ struct journal_read_buf buf = { NULL, 0 };
+
+ DECLARE_BITMAP(bitmap, ja->nr);
+ unsigned i, l, r;
+ u64 seq = 0;
+ int ret;
+
+ if (!ja->nr)
+ goto out;
+
+ bitmap_zero(bitmap, ja->nr);
+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+ if (ret)
+ goto err;
+
+ pr_debug("%u journal buckets", ja->nr);
+
+ /*
+ * If the device supports discard but not secure discard, we can't do
+ * the fancy fibonacci hash/binary search because the live journal
+ * entries might not form a contiguous range:
+ */
+ for (i = 0; i < ja->nr; i++)
+ read_bucket(i);
+ goto search_done;
+
+ if (!blk_queue_nonrot(q))
+ goto linear_scan;
+
+ /*
+ * Read journal buckets ordered by golden ratio hash to quickly
+ * find a sequence of buckets with valid journal entries
+ */
+ for (i = 0; i < ja->nr; i++) {
+ l = (i * 2654435769U) % ja->nr;
+
+ if (test_bit(l, bitmap))
+ break;
+
+ if (read_bucket(l))
+ goto bsearch;
+ }
+
+ /*
+ * If that fails, check all the buckets we haven't checked
+ * already
+ */
+ pr_debug("falling back to linear search");
+linear_scan:
+ for (l = find_first_zero_bit(bitmap, ja->nr);
+ l < ja->nr;
+ l = find_next_zero_bit(bitmap, ja->nr, l + 1))
+ if (read_bucket(l))
+ goto bsearch;
+
+ /* no journal entries on this device? */
+ if (l == ja->nr)
+ goto out;
+bsearch:
+ /* Binary search */
+ r = find_next_bit(bitmap, ja->nr, l + 1);
+ pr_debug("starting binary search, l %u r %u", l, r);
+
+ while (l + 1 < r) {
+ unsigned m = (l + r) >> 1;
+ u64 cur_seq = seq;
+
+ read_bucket(m);
+
+ if (cur_seq != seq)
+ l = m;
+ else
+ r = m;
+ }
+
+search_done:
+ /*
+ * Find the journal bucket with the highest sequence number:
+ *
+ * If there's duplicate journal entries in multiple buckets (which
+ * definitely isn't supposed to happen, but...) - make sure to start
+ * cur_idx at the last of those buckets, so we don't deadlock trying to
+ * allocate
+ */
+ seq = 0;
+
+ for (i = 0; i < ja->nr; i++)
+ if (ja->bucket_seq[i] >= seq &&
+ ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
+ /*
+ * When journal_next_bucket() goes to allocate for
+ * the first time, it'll use the bucket after
+ * ja->cur_idx
+ */
+ ja->cur_idx = i;
+ seq = ja->bucket_seq[i];
+ }
+
+ /*
+ * Set last_idx to indicate the entire journal is full and needs to be
+ * reclaimed - journal reclaim will immediately reclaim whatever isn't
+ * pinned when it first runs:
+ */
+ ja->last_idx = (ja->cur_idx + 1) % ja->nr;
+
+ /*
+ * Read buckets in reverse order until we stop finding more journal
+ * entries:
+ */
+ for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
+ i != ja->cur_idx;
+ i = (i + ja->nr - 1) % ja->nr)
+ if (!test_bit(i, bitmap) &&
+ !read_bucket(i))
+ break;
+out:
+ kvpfree(buf.data, buf.size);
+ percpu_ref_put(&ca->io_ref);
+ closure_return(cl);
+err:
+ mutex_lock(&jlist->lock);
+ jlist->ret = ret;
+ mutex_unlock(&jlist->lock);
+ goto out;
+#undef read_bucket
+}
+
+void bch2_journal_entries_free(struct list_head *list)
+{
+
+ while (!list_empty(list)) {
+ struct journal_replay *i =
+ list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+ }
+}
+
+static inline bool journal_has_keys(struct list_head *list)
+{
+ struct journal_replay *i;
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
+
+ list_for_each_entry(i, list, list)
+ for_each_jset_key(k, _n, entry, &i->j)
+ return true;
+
+ return false;
+}
+
+int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+{
+ struct journal *j = &c->journal;
+ struct journal_list jlist;
+ struct journal_replay *i;
+ struct journal_entry_pin_list *p;
+ struct bch_dev *ca;
+ u64 cur_seq, end_seq, seq;
+ unsigned iter, keys = 0, entries = 0;
+ size_t nr;
+ bool degraded = false;
+ int ret = 0;
+
+ closure_init_stack(&jlist.cl);
+ mutex_init(&jlist.lock);
+ jlist.head = list;
+ jlist.ret = 0;
+
+ for_each_member_device(ca, c, iter) {
+ if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
+ continue;
+
+ if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
+ ca->mi.state == BCH_MEMBER_STATE_RO) &&
+ percpu_ref_tryget(&ca->io_ref))
+ closure_call(&ca->journal.read,
+ bch2_journal_read_device,
+ system_unbound_wq,
+ &jlist.cl);
+ else
+ degraded = true;
+ }
+
+ closure_sync(&jlist.cl);
+
+ if (jlist.ret)
+ return jlist.ret;
+
+ if (list_empty(list)){
+ bch_err(c, "no journal entries found");
+ return BCH_FSCK_REPAIR_IMPOSSIBLE;
+ }
+
+ fsck_err_on(c->sb.clean && journal_has_keys(list), c,
+ "filesystem marked clean but journal has keys to replay");
+
+ list_for_each_entry(i, list, list) {
+ ret = jset_validate_entries(c, &i->j, READ);
+ if (ret)
+ goto fsck_err;
+
+ /*
+ * If we're mounting in degraded mode - if we didn't read all
+ * the devices - this is wrong:
+ */
+
+ if (!degraded &&
+ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+ fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
+ i->devs), c,
+ "superblock not marked as containing replicas (type %u)",
+ BCH_DATA_JOURNAL))) {
+ ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+ if (ret)
+ return ret;
+ }
+ }
+
+ i = list_last_entry(list, struct journal_replay, list);
+
+ nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
+
+ if (nr > j->pin.size) {
+ free_fifo(&j->pin);
+ init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
+ if (!j->pin.data) {
+ bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
+ return -ENOMEM;
+ }
+ }
+
+ atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
+ j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
+
+ j->pin.front = le64_to_cpu(i->j.last_seq);
+ j->pin.back = le64_to_cpu(i->j.seq) + 1;
+
+ fifo_for_each_entry_ptr(p, &j->pin, seq) {
+ INIT_LIST_HEAD(&p->list);
+ INIT_LIST_HEAD(&p->flushed);
+ atomic_set(&p->count, 0);
+ p->devs.nr = 0;
+ }
+
+ mutex_lock(&j->blacklist_lock);
+
+ list_for_each_entry(i, list, list) {
+ p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
+
+ atomic_set(&p->count, 1);
+ p->devs = i->devs;
+
+ if (bch2_journal_seq_blacklist_read(j, i)) {
+ mutex_unlock(&j->blacklist_lock);
+ return -ENOMEM;
+ }
+ }
+
+ mutex_unlock(&j->blacklist_lock);
+
+ cur_seq = journal_last_seq(j);
+ end_seq = le64_to_cpu(list_last_entry(list,
+ struct journal_replay, list)->j.seq);
+
+ list_for_each_entry(i, list, list) {
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
+ bool blacklisted;
+
+ mutex_lock(&j->blacklist_lock);
+ while (cur_seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_blacklist_find(j, cur_seq))
+ cur_seq++;
+
+ blacklisted = bch2_journal_seq_blacklist_find(j,
+ le64_to_cpu(i->j.seq));
+ mutex_unlock(&j->blacklist_lock);
+
+ fsck_err_on(blacklisted, c,
+ "found blacklisted journal entry %llu",
+ le64_to_cpu(i->j.seq));
+
+ fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
+ "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ cur_seq, le64_to_cpu(i->j.seq) - 1,
+ journal_last_seq(j), end_seq);
+
+ cur_seq = le64_to_cpu(i->j.seq) + 1;
+
+ for_each_jset_key(k, _n, entry, &i->j)
+ keys++;
+ entries++;
+ }
+
+ bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
+ keys, entries, journal_cur_seq(j));
+fsck_err:
+ return ret;
+}
+
+/* journal replay: */
+
+int bch2_journal_mark(struct bch_fs *c, struct list_head *list)
+{
+ struct bkey_i *k, *n;
+ struct jset_entry *j;
+ struct journal_replay *r;
+ int ret;
+
+ list_for_each_entry(r, list, list)
+ for_each_jset_key(k, n, j, &r->j) {
+ enum bkey_type type = bkey_type(j->level, j->btree_id);
+ struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
+
+ if (btree_type_has_ptrs(type)) {
+ ret = bch2_btree_mark_key_initial(c, type, k_s_c);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
+{
+ struct journal *j = &c->journal;
+ struct journal_entry_pin_list *pin_list;
+ struct bkey_i *k, *_n;
+ struct jset_entry *entry;
+ struct journal_replay *i, *n;
+ int ret = 0;
+
+ list_for_each_entry_safe(i, n, list, list) {
+
+ j->replay_journal_seq = le64_to_cpu(i->j.seq);
+
+ for_each_jset_key(k, _n, entry, &i->j) {
+
+ if (entry->btree_id == BTREE_ID_ALLOC) {
+ /*
+ * allocation code handles replay for
+ * BTREE_ID_ALLOC keys:
+ */
+ ret = bch2_alloc_replay_key(c, k->k.p);
+ } else {
+ /*
+ * We might cause compressed extents to be
+ * split, so we need to pass in a
+ * disk_reservation:
+ */
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+
+ ret = bch2_btree_insert(c, entry->btree_id, k,
+ &disk_res, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_REPLAY);
+ }
+
+ if (ret) {
+ bch_err(c, "journal replay: error %d while replaying key",
+ ret);
+ goto err;
+ }
+
+ cond_resched();
+ }
+
+ pin_list = journal_seq_pin(j, j->replay_journal_seq);
+
+ if (atomic_dec_and_test(&pin_list->count))
+ journal_wake(j);
+ }
+
+ j->replay_journal_seq = 0;
+
+ bch2_journal_set_replay_done(j);
+ ret = bch2_journal_flush_all_pins(j);
+err:
+ bch2_journal_entries_free(list);
+ return ret;
+}
+
+/* journal write: */
+
+static void bch2_journal_add_btree_root(struct journal_buf *buf,
+ enum btree_id id, struct bkey_i *k,
+ unsigned level)
+{
+ struct jset_entry *entry;
+
+ entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s);
+ entry->type = BCH_JSET_ENTRY_btree_root;
+ entry->btree_id = id;
+ entry->level = level;
+ memcpy_u64s(entry->_data, k, k->k.u64s);
+}
+
+static unsigned journal_dev_buckets_available(struct journal *j,
+ struct bch_dev *ca)
+{
+ struct journal_device *ja = &ca->journal;
+ unsigned next = (ja->cur_idx + 1) % ja->nr;
+ unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
+
+ /*
+ * Hack to avoid a deadlock during journal replay:
+ * journal replay might require setting a new btree
+ * root, which requires writing another journal entry -
+ * thus, if the journal is full (and this happens when
+ * replaying the first journal bucket's entries) we're
+ * screwed.
+ *
+ * So don't let the journal fill up unless we're in
+ * replay:
+ */
+ if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
+ available = max((int) available - 2, 0);
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
+ available = max((int) available - 1, 0);
+
+ return available;
+}
+
+/* returns number of sectors available for next journal entry: */
+int bch2_journal_entry_sectors(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
+ unsigned sectors_available = UINT_MAX;
+ unsigned i, nr_online = 0, nr_devs = 0;
+
+ lockdep_assert_held(&j->lock);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_JOURNAL]) {
+ struct journal_device *ja = &ca->journal;
+ unsigned buckets_required = 0;
+
+ if (!ja->nr)
+ continue;
+
+ sectors_available = min_t(unsigned, sectors_available,
+ ca->mi.bucket_size);
+
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, if we haven't started the write
+ * for the previous entry we have to make sure we have space for
+ * it too:
+ */
+ if (bch2_extent_has_device(e.c, ca->dev_idx)) {
+ if (j->prev_buf_sectors > ja->sectors_free)
+ buckets_required++;
+
+ if (j->prev_buf_sectors + sectors_available >
+ ja->sectors_free)
+ buckets_required++;
+ } else {
+ if (j->prev_buf_sectors + sectors_available >
+ ca->mi.bucket_size)
+ buckets_required++;
+
+ buckets_required++;
+ }
+
+ if (journal_dev_buckets_available(j, ca) >= buckets_required)
+ nr_devs++;
+ nr_online++;
+ }
+ rcu_read_unlock();
+
+ if (nr_online < c->opts.metadata_replicas_required)
+ return -EROFS;
+
+ if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
+ return 0;
+
+ return sectors_available;
+}
+
+/**
+ * journal_next_bucket - move on to the next journal bucket if possible
+ */
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
+ unsigned sectors)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct journal_device *ja;
+ struct bch_dev *ca;
+ struct dev_alloc_list devs_sorted;
+ unsigned i, replicas, replicas_want =
+ READ_ONCE(c->opts.metadata_replicas);
+
+ spin_lock(&j->lock);
+ e = bkey_i_to_s_extent(&j->key);
+
+ /*
+ * Drop any pointers to devices that have been removed, are no longer
+ * empty, or filled up their current journal bucket:
+ *
+ * Note that a device may have had a small amount of free space (perhaps
+ * one sector) that wasn't enough for the smallest possible journal
+ * entry - that's why we drop pointers to devices <= current free space,
+ * i.e. whichever device was limiting the current journal entry size.
+ */
+ extent_for_each_ptr_backwards(e, ptr) {
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ca->mi.state != BCH_MEMBER_STATE_RW ||
+ ca->journal.sectors_free <= sectors)
+ __bch2_extent_drop_ptr(e, ptr);
+ else
+ ca->journal.sectors_free -= sectors;
+ }
+
+ replicas = bch2_extent_nr_ptrs(e.c);
+
+ rcu_read_lock();
+ devs_sorted = bch2_wp_alloc_list(c, &j->wp,
+ &c->rw_devs[BCH_DATA_JOURNAL]);
+
+ for (i = 0; i < devs_sorted.nr; i++) {
+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+ if (!ca)
+ continue;
+
+ if (!ca->mi.durability)
+ continue;
+
+ ja = &ca->journal;
+ if (!ja->nr)
+ continue;
+
+ if (replicas >= replicas_want)
+ break;
+
+ /*
+ * Check that we can use this device, and aren't already using
+ * it:
+ */
+ if (bch2_extent_has_device(e.c, ca->dev_idx) ||
+ !journal_dev_buckets_available(j, ca) ||
+ sectors > ca->mi.bucket_size)
+ continue;
+
+ j->wp.next_alloc[ca->dev_idx] += U32_MAX;
+ bch2_wp_rescale(c, ca, &j->wp);
+
+ ja->sectors_free = ca->mi.bucket_size - sectors;
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+
+ extent_ptr_append(bkey_i_to_extent(&j->key),
+ (struct bch_extent_ptr) {
+ .offset = bucket_to_sector(ca,
+ ja->buckets[ja->cur_idx]),
+ .dev = ca->dev_idx,
+ });
+
+ replicas += ca->mi.durability;
+ }
+ rcu_read_unlock();
+
+ j->prev_buf_sectors = 0;
+
+ bkey_copy(&w->key, &j->key);
+ spin_unlock(&j->lock);
+
+ if (replicas < c->opts.metadata_replicas_required)
+ return -EROFS;
+
+ BUG_ON(!replicas);
+
+ return 0;
+}
+
+static void journal_write_compact(struct jset *jset)
+{
+ struct jset_entry *i, *next, *prev = NULL;
+
+ /*
+ * Simple compaction, dropping empty jset_entries (from journal
+ * reservations that weren't fully used) and merging jset_entries that
+ * can be.
+ *
+ * If we wanted to be really fancy here, we could sort all the keys in
+ * the jset and drop keys that were overwritten - probably not worth it:
+ */
+ vstruct_for_each_safe(jset, i, next) {
+ unsigned u64s = le16_to_cpu(i->u64s);
+
+ /* Empty entry: */
+ if (!u64s)
+ continue;
+
+ /* Can we merge with previous entry? */
+ if (prev &&
+ i->btree_id == prev->btree_id &&
+ i->level == prev->level &&
+ i->type == prev->type &&
+ i->type == BCH_JSET_ENTRY_btree_keys &&
+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+ memmove_u64s_down(vstruct_next(prev),
+ i->_data,
+ u64s);
+ le16_add_cpu(&prev->u64s, u64s);
+ continue;
+ }
+
+ /* Couldn't merge, move i into new position (after prev): */
+ prev = prev ? vstruct_next(prev) : jset->start;
+ if (i != prev)
+ memmove_u64s_down(prev, i, jset_u64s(u64s));
+ }
+
+ prev = prev ? vstruct_next(prev) : jset->start;
+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+}
+
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+ /* we aren't holding j->lock: */
+ unsigned new_size = READ_ONCE(j->buf_size_want);
+ void *new_buf;
+
+ if (buf->size >= new_size)
+ return;
+
+ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
+ if (!new_buf)
+ return;
+
+ memcpy(new_buf, buf->data, buf->size);
+ kvpfree(buf->data, buf->size);
+ buf->data = new_buf;
+ buf->size = new_size;
+}
+
+static void journal_write_done(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *w = journal_prev_buf(j);
+ struct bch_devs_list devs =
+ bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
+ u64 seq = le64_to_cpu(w->data->seq);
+
+ if (!devs.nr) {
+ bch_err(c, "unable to write journal to sufficient devices");
+ goto err;
+ }
+
+ if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+ goto err;
+out:
+ bch2_time_stats_update(j->write_time, j->write_start_time);
+
+ spin_lock(&j->lock);
+ j->last_seq_ondisk = seq;
+ if (seq >= j->pin.front)
+ journal_seq_pin(j, seq)->devs = devs;
+
+ /*
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+
+ /* also must come before signalling write completion: */
+ closure_debug_destroy(cl);
+
+ BUG_ON(!j->reservations.prev_buf_unwritten);
+ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
+ &j->reservations.counter);
+
+ closure_wake_up(&w->wait);
+ journal_wake(j);
+
+ if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
+ mod_delayed_work(system_freezable_wq, &j->write_work, 0);
+ spin_unlock(&j->lock);
+ return;
+err:
+ bch2_fatal_error(c);
+ bch2_journal_halt(j);
+ goto out;
+}
+
+static void journal_write_endio(struct bio *bio)
+{
+ struct bch_dev *ca = bio->bi_private;
+ struct journal *j = &ca->fs->journal;
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
+ bch2_meta_write_fault("journal")) {
+ struct journal_buf *w = journal_prev_buf(j);
+ unsigned long flags;
+
+ spin_lock_irqsave(&j->err_lock, flags);
+ bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx);
+ spin_unlock_irqrestore(&j->err_lock, flags);
+ }
+
+ closure_put(&j->io);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_prev_buf(j);
+ struct jset *jset;
+ struct bio *bio;
+ struct bch_extent_ptr *ptr;
+ unsigned i, sectors, bytes;
+
+ journal_buf_realloc(j, w);
+ jset = w->data;
+
+ j->write_start_time = local_clock();
+ mutex_lock(&c->btree_root_lock);
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_root *r = &c->btree_roots[i];
+
+ if (r->alive)
+ bch2_journal_add_btree_root(w, i, &r->key, r->level);
+ }
+ c->btree_roots_dirty = false;
+ mutex_unlock(&c->btree_root_lock);
+
+ journal_write_compact(jset);
+
+ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
+ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
+ jset->magic = cpu_to_le64(jset_magic(c));
+ jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
+
+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+
+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+ jset_validate_entries(c, jset, WRITE))
+ goto err;
+
+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ jset->encrypted_start,
+ vstruct_end(jset) - (void *) jset->encrypted_start);
+
+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+ journal_nonce(jset), jset);
+
+ if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+ jset_validate_entries(c, jset, WRITE))
+ goto err;
+
+ sectors = vstruct_sectors(jset, c->block_bits);
+ BUG_ON(sectors > j->prev_buf_sectors);
+
+ bytes = vstruct_bytes(w->data);
+ memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+
+ if (journal_write_alloc(j, w, sectors)) {
+ bch2_journal_halt(j);
+ bch_err(c, "Unable to allocate journal write");
+ bch2_fatal_error(c);
+ continue_at(cl, journal_write_done, system_highpri_wq);
+ }
+
+ /*
+ * XXX: we really should just disable the entire journal in nochanges
+ * mode
+ */
+ if (c->opts.nochanges)
+ goto no_io;
+
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
+
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL],
+ sectors);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_iter.bi_size = sectors << 9;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ bio_set_op_attrs(bio, REQ_OP_WRITE,
+ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+ bch2_bio_map(bio, jset);
+
+ trace_journal_write(bio);
+ closure_bio_submit(bio, cl);
+
+ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+ }
+
+ for_each_rw_member(ca, c, i)
+ if (journal_flushes_device(ca) &&
+ !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
+ percpu_ref_get(&ca->io_ref);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_opf = REQ_OP_FLUSH;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+
+no_io:
+ extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
+ ptr->offset += sectors;
+
+ continue_at(cl, journal_write_done, system_highpri_wq);
+err:
+ bch2_inconsistent_error(c);
+ continue_at(cl, journal_write_done, system_highpri_wq);
+}
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
new file mode 100644
index 0000000..4236b7f
--- /dev/null
+++ b/libbcachefs/journal_io.h
@@ -0,0 +1,45 @@
+#ifndef _BCACHEFS_JOURNAL_IO_H
+#define _BCACHEFS_JOURNAL_IO_H
+
+struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
+ enum btree_id, unsigned *);
+
+/*
+ * Only used for holding the journal entries we read in btree_journal_read()
+ * during cache_registration
+ */
+struct journal_replay {
+ struct list_head list;
+ struct bch_devs_list devs;
+ /* must be last: */
+ struct jset j;
+};
+
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+ struct jset_entry *entry, unsigned type)
+{
+ while (entry < vstruct_last(jset)) {
+ if (entry->type == type)
+ return entry;
+
+ entry = vstruct_next(entry);
+ }
+
+ return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type) \
+ for (entry = (jset)->start; \
+ (entry = __jset_entry_type_next(jset, entry, type)); \
+ entry = vstruct_next(entry))
+
+#define for_each_jset_key(k, _n, entry, jset) \
+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
+ vstruct_for_each_safe(entry, k, _n)
+
+int bch2_journal_read(struct bch_fs *, struct list_head *);
+
+int bch2_journal_entry_sectors(struct journal *);
+void bch2_journal_write(struct closure *);
+
+#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
new file mode 100644
index 0000000..0e3e5b6
--- /dev/null
+++ b/libbcachefs/journal_reclaim.c
@@ -0,0 +1,411 @@
+
+#include "bcachefs.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "super.h"
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, holding it open to ensure it gets replayed during recovery:
+ */
+
+static inline u64 journal_pin_seq(struct journal *j,
+ struct journal_entry_pin_list *pin_list)
+{
+ return fifo_entry_idx_abs(&j->pin, pin_list);
+}
+
+u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
+{
+ u64 ret = 0;
+
+ spin_lock(&j->lock);
+ if (journal_pin_active(pin))
+ ret = journal_pin_seq(j, pin->pin_list);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+static inline void __journal_pin_add(struct journal *j,
+ struct journal_entry_pin_list *pin_list,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ BUG_ON(journal_pin_active(pin));
+ BUG_ON(!atomic_read(&pin_list->count));
+
+ atomic_inc(&pin_list->count);
+ pin->pin_list = pin_list;
+ pin->flush = flush_fn;
+
+ if (flush_fn)
+ list_add(&pin->list, &pin_list->list);
+ else
+ INIT_LIST_HEAD(&pin->list);
+
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
+
+void bch2_journal_pin_add(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock(&j->lock);
+ __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
+ spin_unlock(&j->lock);
+}
+
+static inline void __journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ struct journal_entry_pin_list *pin_list = pin->pin_list;
+
+ if (!journal_pin_active(pin))
+ return;
+
+ pin->pin_list = NULL;
+ list_del_init(&pin->list);
+
+ /*
+ * Unpinning a journal entry make make journal_next_bucket() succeed, if
+ * writing a new last_seq will now make another bucket available:
+ */
+ if (atomic_dec_and_test(&pin_list->count) &&
+ pin_list == &fifo_peek_front(&j->pin))
+ bch2_journal_reclaim_fast(j);
+}
+
+void bch2_journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ spin_lock(&j->lock);
+ __journal_pin_drop(j, pin);
+ spin_unlock(&j->lock);
+}
+
+void bch2_journal_pin_add_if_older(struct journal *j,
+ struct journal_entry_pin *src_pin,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock(&j->lock);
+
+ if (journal_pin_active(src_pin) &&
+ (!journal_pin_active(pin) ||
+ journal_pin_seq(j, src_pin->pin_list) <
+ journal_pin_seq(j, pin->pin_list))) {
+ __journal_pin_drop(j, pin);
+ __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
+ }
+
+ spin_unlock(&j->lock);
+}
+
+/*
+ * Journal reclaim: flush references to open journal entries to reclaim space in
+ * the journal
+ *
+ * May be done by the journal code in the background as needed to free up space
+ * for more journal entries, or as part of doing a clean shutdown, or to migrate
+ * data off of a specific device:
+ */
+
+/**
+ * bch2_journal_reclaim_fast - do the fast part of journal reclaim
+ *
+ * Called from IO submission context, does not block. Cleans up after btree
+ * write completions by advancing the journal pin and each cache's last_idx,
+ * kicking off discards and background reclaim as necessary.
+ */
+void bch2_journal_reclaim_fast(struct journal *j)
+{
+ struct journal_entry_pin_list temp;
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
+
+ /*
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
+ */
+ while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!fifo_pop(&j->pin, temp));
+ popped = true;
+ }
+
+ if (popped)
+ journal_wake(j);
+}
+
+static struct journal_entry_pin *
+__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *ret;
+ u64 iter;
+
+ /* no need to iterate over empty fifo entries: */
+ bch2_journal_reclaim_fast(j);
+
+ fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
+ if (iter > seq_to_flush)
+ break;
+
+ ret = list_first_entry_or_null(&pin_list->list,
+ struct journal_entry_pin, list);
+ if (ret) {
+ /* must be list_del_init(), see bch2_journal_pin_drop() */
+ list_move(&ret->list, &pin_list->flushed);
+ *seq = iter;
+ return ret;
+ }
+ }
+
+ return NULL;
+}
+
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+{
+ struct journal_entry_pin *ret;
+
+ spin_lock(&j->lock);
+ ret = __journal_get_next_pin(j, seq_to_flush, seq);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = ja->nr &&
+ (ja->last_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/**
+ * bch2_journal_reclaim_work - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+void bch2_journal_reclaim_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(to_delayed_work(work),
+ struct bch_fs, journal.reclaim_work);
+ struct journal *j = &c->journal;
+ struct bch_dev *ca;
+ struct journal_entry_pin *pin;
+ u64 seq, seq_to_flush = 0;
+ unsigned iter, bucket_to_flush;
+ unsigned long next_flush;
+ bool reclaim_lock_held = false, need_flush;
+
+ /*
+ * Advance last_idx to point to the oldest journal entry containing
+ * btree node updates that have not yet been written out
+ */
+ for_each_rw_member(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ if (!ja->nr)
+ continue;
+
+ while (should_discard_bucket(j, ja)) {
+ if (!reclaim_lock_held) {
+ /*
+ * ugh:
+ * might be called from __journal_res_get()
+ * under wait_event() - have to go back to
+ * TASK_RUNNING before doing something that
+ * would block, but only if we're doing work:
+ */
+ __set_current_state(TASK_RUNNING);
+
+ mutex_lock(&j->reclaim_lock);
+ reclaim_lock_held = true;
+ /* recheck under reclaim_lock: */
+ continue;
+ }
+
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ ja->buckets[ja->last_idx]),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ spin_lock(&j->lock);
+ ja->last_idx = (ja->last_idx + 1) % ja->nr;
+ spin_unlock(&j->lock);
+
+ journal_wake(j);
+ }
+
+ /*
+ * Write out enough btree nodes to free up 50% journal
+ * buckets
+ */
+ spin_lock(&j->lock);
+ bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
+ seq_to_flush = max_t(u64, seq_to_flush,
+ ja->bucket_seq[bucket_to_flush]);
+ spin_unlock(&j->lock);
+ }
+
+ if (reclaim_lock_held)
+ mutex_unlock(&j->reclaim_lock);
+
+ /* Also flush if the pin fifo is more than half full */
+ spin_lock(&j->lock);
+ seq_to_flush = max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
+ spin_unlock(&j->lock);
+
+ /*
+ * If it's been longer than j->reclaim_delay_ms since we last flushed,
+ * make sure to flush at least one journal pin:
+ */
+ next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+ need_flush = time_after(jiffies, next_flush);
+
+ while ((pin = journal_get_next_pin(j, need_flush
+ ? U64_MAX
+ : seq_to_flush, &seq))) {
+ __set_current_state(TASK_RUNNING);
+ pin->flush(j, pin, seq);
+ need_flush = false;
+
+ j->last_flushed = jiffies;
+ }
+
+ if (!test_bit(BCH_FS_RO, &c->flags))
+ queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+ msecs_to_jiffies(j->reclaim_delay_ms));
+}
+
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+ struct journal_entry_pin **pin,
+ u64 *pin_seq)
+{
+ int ret;
+
+ *pin = NULL;
+
+ ret = bch2_journal_error(j);
+ if (ret)
+ return ret;
+
+ spin_lock(&j->lock);
+ /*
+ * If journal replay hasn't completed, the unreplayed journal entries
+ * hold refs on their corresponding sequence numbers
+ */
+ ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
+ !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ journal_last_seq(j) > seq_to_flush ||
+ (fifo_used(&j->pin) == 1 &&
+ atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin *pin;
+ u64 pin_seq;
+ bool flush;
+
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return 0;
+again:
+ wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
+ if (pin) {
+ /* flushing a journal pin might cause a new one to be added: */
+ pin->flush(j, pin, pin_seq);
+ goto again;
+ }
+
+ spin_lock(&j->lock);
+ flush = journal_last_seq(j) != j->last_seq_ondisk ||
+ (seq_to_flush == U64_MAX && c->btree_roots_dirty);
+ spin_unlock(&j->lock);
+
+ return flush ? bch2_journal_meta(j) : 0;
+}
+
+int bch2_journal_flush_all_pins(struct journal *j)
+{
+ return bch2_journal_flush_pins(j, U64_MAX);
+}
+
+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct bch_devs_list devs;
+ u64 iter, seq = 0;
+ int ret = 0;
+
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
+ if (dev_idx >= 0
+ ? bch2_dev_list_has_dev(p->devs, dev_idx)
+ : p->devs.nr < c->opts.metadata_replicas)
+ seq = iter;
+ spin_unlock(&j->lock);
+
+ ret = bch2_journal_flush_pins(j, seq);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+ seq = 0;
+
+ spin_lock(&j->lock);
+ while (!ret && seq < j->pin.back) {
+ seq = max(seq, journal_last_seq(j));
+ devs = journal_seq_pin(j, seq)->devs;
+ seq++;
+
+ spin_unlock(&j->lock);
+ ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+ spin_lock(&j->lock);
+ }
+ spin_unlock(&j->lock);
+
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+}
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
new file mode 100644
index 0000000..7d460c3
--- /dev/null
+++ b/libbcachefs/journal_reclaim.h
@@ -0,0 +1,36 @@
+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
+#define _BCACHEFS_JOURNAL_RECLAIM_H
+
+#define JOURNAL_PIN (32 * 1024)
+
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+ return pin->pin_list != NULL;
+}
+
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+ BUG_ON(seq < j->pin.front || seq >= j->pin.back);
+
+ return &j->pin.data[seq & j->pin.mask];
+}
+
+u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+void bch2_journal_pin_add_if_older(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
+void bch2_journal_reclaim_fast(struct journal *);
+void bch2_journal_reclaim_work(struct work_struct *);
+
+int bch2_journal_flush_pins(struct journal *, u64);
+int bch2_journal_flush_all_pins(struct journal *);
+int bch2_journal_flush_device_pins(struct journal *, int);
+
+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
new file mode 100644
index 0000000..b5301d9
--- /dev/null
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -0,0 +1,358 @@
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+
+/*
+ * journal_seq_blacklist machinery:
+ *
+ * To guarantee order of btree updates after a crash, we need to detect when a
+ * btree node entry (bset) is newer than the newest journal entry that was
+ * successfully written, and ignore it - effectively ignoring any btree updates
+ * that didn't make it into the journal.
+ *
+ * If we didn't do this, we might have two btree nodes, a and b, both with
+ * updates that weren't written to the journal yet: if b was updated after a,
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
+ * to b happened, but not the updates to a that happened before it.
+ *
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
+ * because everything they contain will also have been journalled - and must
+ * still be present in the journal on disk until a journal entry has been
+ * written _after_ that bset was written.
+ *
+ * To accomplish this, bsets record the newest journal sequence number they
+ * contain updates for; then, on startup, the btree code queries the journal
+ * code to ask "Is this sequence number newer than the newest journal entry? If
+ * so, ignore it."
+ *
+ * When this happens, we must blacklist that journal sequence number: the
+ * journal must not write any entries with that sequence number, and it must
+ * record that it was blacklisted so that a) on recovery we don't think we have
+ * missing journal entries and b) so that the btree code continues to ignore
+ * that bset, until that btree node is rewritten.
+ *
+ * Blacklisted journal sequence numbers are themselves recorded as entries in
+ * the journal.
+ */
+
+/*
+ * Called when journal needs to evict a blacklist entry to reclaim space: find
+ * any btree nodes that refer to the blacklist journal sequence numbers, and
+ * rewrite them:
+ */
+static void journal_seq_blacklist_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
+{
+ struct bch_fs *c =
+ container_of(j, struct bch_fs, journal);
+ struct journal_seq_blacklist *bl =
+ container_of(pin, struct journal_seq_blacklist, pin);
+ struct blacklisted_node n;
+ struct closure cl;
+ unsigned i;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ for (i = 0;; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+
+ __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
+
+ b = bch2_btree_iter_peek_node(&iter);
+
+ /* The node might have already been rewritten: */
+
+ if (b->data->keys.seq == n.seq) {
+ ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
+ if (ret) {
+ bch2_btree_iter_unlock(&iter);
+ bch2_fs_fatal_error(c,
+ "error %i rewriting btree node with blacklisted journal seq",
+ ret);
+ bch2_journal_halt(j);
+ return;
+ }
+ }
+
+ bch2_btree_iter_unlock(&iter);
+ }
+
+ for (i = 0;; i++) {
+ struct btree_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+redo_wait:
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /*
+ * Is the node on the list of pending interior node updates -
+ * being freed? If so, wait for that to finish:
+ */
+ for_each_pending_btree_node_free(c, as, d)
+ if (n.seq == d->seq &&
+ n.btree_id == d->btree_id &&
+ !d->level &&
+ !bkey_cmp(n.pos, d->key.k.p)) {
+ closure_wait(&as->wait, &cl);
+ mutex_unlock(&c->btree_interior_update_lock);
+ closure_sync(&cl);
+ goto redo_wait;
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+ }
+
+ mutex_lock(&j->blacklist_lock);
+
+ bch2_journal_pin_drop(j, &bl->pin);
+ list_del(&bl->list);
+ kfree(bl->entries);
+ kfree(bl);
+
+ mutex_unlock(&j->blacklist_lock);
+}
+
+/*
+ * Determine if a particular sequence number is blacklisted - if so, return
+ * blacklist entry:
+ */
+struct journal_seq_blacklist *
+bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ if (seq >= bl->start && seq <= bl->end)
+ return bl;
+
+ return NULL;
+}
+
+/*
+ * Allocate a new, in memory blacklist entry:
+ */
+static struct journal_seq_blacklist *
+bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ /*
+ * When we start the journal, bch2_journal_start() will skip over @seq:
+ */
+
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return NULL;
+
+ bl->start = start;
+ bl->end = end;
+
+ list_add_tail(&bl->list, &j->seq_blacklist);
+ return bl;
+}
+
+/*
+ * Returns true if @seq is newer than the most recent journal entry that got
+ * written, and data corresponding to @seq should be ignored - also marks @seq
+ * as blacklisted so that on future restarts the corresponding data will still
+ * be ignored:
+ */
+int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
+{
+ struct journal *j = &c->journal;
+ struct journal_seq_blacklist *bl = NULL;
+ struct blacklisted_node *n;
+ u64 journal_seq;
+ int ret = 0;
+
+ if (!seq)
+ return 0;
+
+ spin_lock(&j->lock);
+ journal_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
+
+ /* Interier updates aren't journalled: */
+ BUG_ON(b->level);
+ BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
+
+ /*
+ * Decrease this back to j->seq + 2 when we next rev the on disk format:
+ * increasing it temporarily to work around bug in old kernels
+ */
+ fsck_err_on(seq > journal_seq + 4, c,
+ "bset journal seq too far in the future: %llu > %llu",
+ seq, journal_seq);
+
+ if (seq <= journal_seq &&
+ list_empty_careful(&j->seq_blacklist))
+ return 0;
+
+ mutex_lock(&j->blacklist_lock);
+
+ if (seq <= journal_seq) {
+ bl = bch2_journal_seq_blacklist_find(j, seq);
+ if (!bl)
+ goto out;
+ } else {
+ bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
+ b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
+
+ if (!j->new_blacklist) {
+ j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
+ journal_seq + 1,
+ journal_seq + 1);
+ if (!j->new_blacklist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ bl = j->new_blacklist;
+ bl->end = max(bl->end, seq);
+ }
+
+ for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
+ if (b->data->keys.seq == n->seq &&
+ b->btree_id == n->btree_id &&
+ !bkey_cmp(b->key.k.p, n->pos))
+ goto found_entry;
+
+ if (!bl->nr_entries ||
+ is_power_of_2(bl->nr_entries)) {
+ n = krealloc(bl->entries,
+ max(bl->nr_entries * 2, 8UL) * sizeof(*n),
+ GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ bl->entries = n;
+ }
+
+ bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
+ .seq = b->data->keys.seq,
+ .btree_id = b->btree_id,
+ .pos = b->key.k.p,
+ };
+found_entry:
+ ret = 1;
+out:
+fsck_err:
+ mutex_unlock(&j->blacklist_lock);
+ return ret;
+}
+
+static int __bch2_journal_seq_blacklist_read(struct journal *j,
+ struct journal_replay *i,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_seq_blacklist *bl;
+
+ bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
+ start, end);
+
+ bl = bch2_journal_seq_blacklisted_new(j, start, end);
+ if (!bl)
+ return -ENOMEM;
+
+ bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
+ journal_seq_blacklist_flush);
+ return 0;
+}
+
+/*
+ * After reading the journal, find existing journal seq blacklist entries and
+ * read them into memory:
+ */
+int bch2_journal_seq_blacklist_read(struct journal *j,
+ struct journal_replay *i)
+{
+ struct jset_entry *entry;
+ int ret = 0;
+
+ vstruct_for_each(&i->j, entry) {
+ switch (entry->type) {
+ case BCH_JSET_ENTRY_blacklist: {
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ ret = __bch2_journal_seq_blacklist_read(j, i,
+ le64_to_cpu(bl_entry->seq),
+ le64_to_cpu(bl_entry->seq));
+ break;
+ }
+ case BCH_JSET_ENTRY_blacklist_v2: {
+ struct jset_entry_blacklist_v2 *bl_entry =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ ret = __bch2_journal_seq_blacklist_read(j, i,
+ le64_to_cpu(bl_entry->start),
+ le64_to_cpu(bl_entry->end));
+ break;
+ }
+ }
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * After reading the journal and walking the btree, we might have new journal
+ * sequence numbers to blacklist - add entries to the next journal entry to be
+ * written:
+ */
+void bch2_journal_seq_blacklist_write(struct journal *j)
+{
+ struct journal_seq_blacklist *bl = j->new_blacklist;
+ struct jset_entry_blacklist_v2 *bl_entry;
+ struct jset_entry *entry;
+
+ if (!bl)
+ return;
+
+ entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
+ (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
+
+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+ bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
+ bl_entry->start = cpu_to_le64(bl->start);
+ bl_entry->end = cpu_to_le64(bl->end);
+
+ bch2_journal_pin_add(j,
+ journal_cur_seq(j),
+ &bl->pin,
+ journal_seq_blacklist_flush);
+
+ j->new_blacklist = NULL;
+}
diff --git a/libbcachefs/journal_seq_blacklist.h b/libbcachefs/journal_seq_blacklist.h
new file mode 100644
index 0000000..95ea6e9
--- /dev/null
+++ b/libbcachefs/journal_seq_blacklist.h
@@ -0,0 +1,13 @@
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+
+struct journal_replay;
+
+struct journal_seq_blacklist *
+bch2_journal_seq_blacklist_find(struct journal *, u64);
+int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
+int bch2_journal_seq_blacklist_read(struct journal *,
+ struct journal_replay *);
+void bch2_journal_seq_blacklist_write(struct journal *);
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index e39b18f..a27e054 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -59,8 +59,9 @@ struct blacklisted_node {
struct journal_seq_blacklist {
struct list_head list;
- u64 seq;
- bool written;
+ u64 start;
+ u64 end;
+
struct journal_entry_pin pin;
struct blacklisted_node *entries;
@@ -171,10 +172,11 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
- struct journal_entry_pin_list *replay_pin_list;
+ u64 replay_journal_seq;
struct mutex blacklist_lock;
struct list_head seq_blacklist;
+ struct journal_seq_blacklist *new_blacklist;
BKEY_PADDED(key);
struct write_point wp;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 87e6e80..0431fb8 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -5,6 +5,7 @@
#include "buckets.h"
#include "inode.h"
#include "io.h"
+#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
@@ -22,7 +23,6 @@ struct moving_io {
struct closure cl;
bool read_completed;
- unsigned read_dev;
unsigned read_sectors;
unsigned write_sectors;
@@ -42,7 +42,7 @@ struct moving_context {
struct list_head reads;
/* in flight sectors: */
- atomic_t read_sectors[BCH_SB_MEMBERS_MAX];
+ atomic_t read_sectors;
atomic_t write_sectors;
wait_queue_head_t wait;
@@ -306,7 +306,8 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
- if (likely(!io->rbio.bio.bi_status)) {
+ if (likely(!io->rbio.bio.bi_status &&
+ !io->rbio.hole)) {
bch2_migrate_read_done(&io->write, &io->rbio);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
@@ -330,7 +331,7 @@ static void move_read_endio(struct bio *bio)
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
- atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
+ atomic_sub(io->read_sectors, &ctxt->read_sectors);
io->read_completed = true;
if (next_pending_write(ctxt))
@@ -376,7 +377,6 @@ static int bch2_move_extent(struct bch_fs *c,
enum data_cmd data_cmd,
struct data_opts data_opts)
{
- struct extent_pick_ptr pick;
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
@@ -387,12 +387,8 @@ static int bch2_move_extent(struct bch_fs *c,
atomic_read(&ctxt->write_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
- bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
- if (IS_ERR_OR_NULL(pick.ca))
- return pick.ca ? PTR_ERR(pick.ca) : 0;
-
move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) <
+ atomic_read(&ctxt->read_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
/* write path might have to decompress data: */
@@ -406,8 +402,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
io->write.ctxt = ctxt;
- io->read_dev = pick.ca->dev_idx;
- io->read_sectors = pick.crc.uncompressed_size;
+ io->read_sectors = e.k->size;
io->write_sectors = e.k->size;
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
@@ -421,6 +416,7 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
+ io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -438,7 +434,7 @@ static int bch2_move_extent(struct bch_fs *c,
trace_move_extent(e.k);
- atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
+ atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
/*
@@ -446,14 +442,15 @@ static int bch2_move_extent(struct bch_fs *c,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE);
+ bch2_read_extent(c, &io->rbio, e.s_c,
+ BCH_READ_NODECODE|
+ BCH_READ_LAST_FRAGMENT);
return 0;
err_free_pages:
bio_free_pages(&io->write.op.wbio.bio);
err_free:
kfree(io);
err:
- percpu_ref_put(&pick.ca->io_ref);
trace_move_alloc_fail(e.k);
return ret;
}
@@ -728,7 +725,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
stats->data_type = BCH_DATA_JOURNAL;
- ret = bch2_journal_flush_device(&c->journal, -1);
+ ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
@@ -745,7 +742,7 @@ int bch2_data_job(struct bch_fs *c,
return -EINVAL;
stats->data_type = BCH_DATA_JOURNAL;
- ret = bch2_journal_flush_device(&c->journal, op.migrate.dev);
+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 05910c4..16b8cbf 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -26,6 +26,8 @@
#include "inode.h"
#include "io.h"
#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "migrate.h"
@@ -396,9 +398,15 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
+#define BCH_TIME_STAT(name) \
+ bch2_time_stats_exit(&c->name##_time);
+ BCH_TIME_STATS()
+#undef BCH_TIME_STAT
+
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_encryption_exit(c);
+ bch2_fs_io_exit(c);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
@@ -407,10 +415,6 @@ static void bch2_fs_free(struct bch_fs *c)
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
- mempool_exit(&c->bio_bounce_pages);
- bioset_exit(&c->bio_write);
- bioset_exit(&c->bio_read_split);
- bioset_exit(&c->bio_read);
bioset_exit(&c->btree_bio);
mempool_exit(&c->btree_interior_update_pool);
mempool_exit(&c->btree_reserve_pool);
@@ -561,8 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- spin_lock_init(&c->name##_time.lock);
+#define BCH_TIME_STAT(name) \
+ bch2_time_stats_init(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@@ -587,9 +591,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
seqcount_init(&c->gc_pos_lock);
- c->copy_gc_enabled = 1;
- c->rebalance_enabled = 1;
- c->rebalance_percent = 10;
+ c->copy_gc_enabled = 1;
+ c->rebalance_enabled = 1;
+ c->rebalance_percent = 10;
+ c->promote_whole_extents = true;
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
@@ -640,17 +645,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS) ||
- mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->sb.encoded_extent_max) /
- PAGE_SECTORS, 0) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
@@ -658,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
bch2_fs_btree_cache_init(c) ||
+ bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
bch2_fs_fsio_init(c))
@@ -774,11 +769,11 @@ const char *bch2_fs_start(struct bch_fs *c)
goto recovery_done;
/*
- * bch2_journal_start() can't happen sooner, or btree_gc_finish()
+ * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
* this is a hack but oh well.
*/
- bch2_journal_start(c);
+ bch2_fs_journal_start(&c->journal);
err = "error starting allocator";
if (bch2_fs_allocator_start(c))
@@ -834,7 +829,7 @@ const char *bch2_fs_start(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_journal_start(c);
+ bch2_fs_journal_start(&c->journal);
bch2_journal_set_replay_done(&c->journal);
err = "error starting allocator";
@@ -993,6 +988,9 @@ static void bch2_dev_free(struct bch_dev *ca)
bioset_exit(&ca->replica_set);
bch2_dev_buckets_free(ca);
+ bch2_time_stats_exit(&ca->io_latency[WRITE]);
+ bch2_time_stats_exit(&ca->io_latency[READ]);
+
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
kobject_put(&ca->kobj);
@@ -1089,6 +1087,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
+ bch2_time_stats_init(&ca->io_latency[READ]);
+ bch2_time_stats_init(&ca->io_latency[WRITE]);
+
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
@@ -1421,7 +1422,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
- ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
if (ret) {
bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index a52ee3b..231bc52 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -27,7 +27,26 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
- return ca->disk_sb.bdev != NULL;
+ return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+ return bch2_dev_is_online(ca) &&
+ ca->mi.state != BCH_MEMBER_STATE_FAILED;
+}
+
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+ if (!percpu_ref_tryget(&ca->io_ref))
+ return false;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_RW ||
+ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
+ return true;
+
+ percpu_ref_put(&ca->io_ref);
+ return false;
}
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index e8089db..65345d8 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -141,11 +141,19 @@ read_attribute(btree_node_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(durability);
-read_attribute(iostats);
-read_attribute(last_read_quantiles);
-read_attribute(last_write_quantiles);
-read_attribute(fragmentation_quantiles);
-read_attribute(oldest_gen_quantiles);
+read_attribute(iodone);
+
+read_attribute(io_latency_read);
+read_attribute(io_latency_write);
+read_attribute(io_latency_stats_read);
+read_attribute(io_latency_stats_write);
+read_attribute(congested);
+
+read_attribute(bucket_quantiles_last_read);
+read_attribute(bucket_quantiles_last_write);
+read_attribute(bucket_quantiles_fragmentation);
+read_attribute(bucket_quantiles_oldest_gen);
+
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@@ -177,6 +185,7 @@ sysfs_pd_controller_attribute(copy_gc);
rw_attribute(rebalance_enabled);
rw_attribute(rebalance_percent);
sysfs_pd_controller_attribute(rebalance);
+rw_attribute(promote_whole_extents);
rw_attribute(pd_controllers_update_seconds);
@@ -189,8 +198,9 @@ read_attribute(data_replicas_have);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- sysfs_time_stats_attribute(name, frequency_units, duration_units);
+#define BCH_TIME_STAT(_name) \
+ static struct attribute sysfs_time_stat_##_name = \
+ { .name = #_name, .mode = S_IRUGO };
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@@ -332,9 +342,10 @@ SHOW(bch2_fs)
sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
sysfs_print(rebalance_percent, c->rebalance_percent);
-
sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
+ sysfs_print(promote_whole_extents, c->promote_whole_extents);
+
sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
@@ -406,6 +417,8 @@ STORE(__bch2_fs)
sysfs_strtoul(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
+ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
+
/* Debugging: */
#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
@@ -462,6 +475,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_reclaim_delay_ms,
&sysfs_rebalance_percent,
+ &sysfs_promote_whole_extents,
&sysfs_compression_stats,
NULL
@@ -531,9 +545,16 @@ STORE(bch2_fs_opts_dir)
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
int ret, id = opt - bch2_opt_table;
+ char *tmp;
u64 v;
- ret = bch2_opt_parse(c, opt, buf, &v);
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v);
+ kfree(tmp);
+
if (ret < 0)
return ret;
@@ -592,9 +613,9 @@ SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- sysfs_print_time_stats(&c->name##_time, name, \
- frequency_units, duration_units);
+#define BCH_TIME_STAT(name) \
+ if (attr == &sysfs_time_stat_##name) \
+ return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@@ -603,23 +624,15 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- sysfs_clear_time_stats(&c->name##_time, name);
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
-
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
struct attribute *bch2_fs_time_stats_files[] = {
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- sysfs_time_stats_attribute_list(name, frequency_units, duration_units)
+#define BCH_TIME_STAT(name) \
+ &sysfs_time_stat_##name,
BCH_TIME_STATS()
#undef BCH_TIME_STAT
-
NULL
};
@@ -774,7 +787,7 @@ static const char * const bch2_rw[] = {
NULL
};
-static ssize_t show_dev_iostats(struct bch_dev *ca, char *buf)
+static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf)
{
char *out = buf, *end = buf + PAGE_SIZE;
int rw, i, cpu;
@@ -851,16 +864,28 @@ SHOW(bch2_dev)
return out - buf;
}
- if (attr == &sysfs_iostats)
- return show_dev_iostats(ca, buf);
+ if (attr == &sysfs_iodone)
+ return show_dev_iodone(ca, buf);
- if (attr == &sysfs_last_read_quantiles)
+ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
+ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
+
+ if (attr == &sysfs_io_latency_stats_read)
+ return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE);
+ if (attr == &sysfs_io_latency_stats_write)
+ return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE);
+
+ sysfs_printf(congested, "%u%%",
+ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
+ * 100 / CONGESTED_MAX);
+
+ if (attr == &sysfs_bucket_quantiles_last_read)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
- if (attr == &sysfs_last_write_quantiles)
+ if (attr == &sysfs_bucket_quantiles_last_write)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
- if (attr == &sysfs_fragmentation_quantiles)
+ if (attr == &sysfs_bucket_quantiles_fragmentation)
return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
- if (attr == &sysfs_oldest_gen_quantiles)
+ if (attr == &sysfs_bucket_quantiles_oldest_gen)
return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
if (attr == &sysfs_reserve_stats)
@@ -944,13 +969,20 @@ struct attribute *bch2_dev_files[] = {
&sysfs_label,
&sysfs_has_data,
- &sysfs_iostats,
+ &sysfs_iodone,
+
+ &sysfs_io_latency_read,
+ &sysfs_io_latency_write,
+ &sysfs_io_latency_stats_read,
+ &sysfs_io_latency_stats_write,
+ &sysfs_congested,
/* alloc info - other stats: */
- &sysfs_last_read_quantiles,
- &sysfs_last_write_quantiles,
- &sysfs_fragmentation_quantiles,
- &sysfs_oldest_gen_quantiles,
+ &sysfs_bucket_quantiles_last_read,
+ &sysfs_bucket_quantiles_last_write,
+ &sysfs_bucket_quantiles_fragmentation,
+ &sysfs_bucket_quantiles_oldest_gen,
+
&sysfs_reserve_stats,
/* debug: */
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index fa85375..1f2c23b 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -13,12 +13,15 @@
#include <linux/kthread.h>
#include <linux/log2.h>
#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/sched/clock.h>
+#include "eytzinger.h"
#include "util.h"
#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
@@ -200,59 +203,189 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-void bch2_time_stats_clear(struct time_stats *stats)
+void bch2_quantiles_update(struct quantiles *q, u64 v)
{
- spin_lock(&stats->lock);
+ unsigned i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned, e->step / 2, 1);
- stats->count = 0;
- stats->last_duration = 0;
- stats->max_duration = 0;
- stats->average_duration = 0;
- stats->average_frequency = 0;
- stats->last = 0;
+ if (v >= e->m)
+ break;
- spin_unlock(&stats->lock);
+ i = eytzinger0_child(i, v > e->m);
+ }
}
-void __bch2_time_stats_update(struct time_stats *stats, u64 start_time)
+/* time stats: */
+
+static void bch2_time_stats_update_one(struct time_stats *stats,
+ u64 start, u64 end)
{
- u64 now, duration, last;
+ u64 duration, freq;
+
+ duration = time_after64(end, start)
+ ? end - start : 0;
+ freq = time_after64(end, stats->last_event)
+ ? end - stats->last_event : 0;
stats->count++;
- now = local_clock();
- duration = time_after64(now, start_time)
- ? now - start_time : 0;
- last = time_after64(now, stats->last)
- ? now - stats->last : 0;
+ stats->average_duration = stats->average_duration
+ ? ewma_add(stats->average_duration, duration, 6)
+ : duration;
+
+ stats->average_frequency = stats->average_frequency
+ ? ewma_add(stats->average_frequency, freq, 6)
+ : freq;
- stats->last_duration = duration;
stats->max_duration = max(stats->max_duration, duration);
- if (stats->last) {
- stats->average_duration = ewma_add(stats->average_duration,
- duration << 8, 3);
+ stats->last_event = end;
- if (stats->average_frequency)
- stats->average_frequency =
- ewma_add(stats->average_frequency,
- last << 8, 3);
- else
- stats->average_frequency = last << 8;
+ bch2_quantiles_update(&stats->quantiles, duration);
+}
+
+void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ bch2_time_stats_update_one(stats, start, end);
+
+ if (stats->average_frequency < 32 &&
+ stats->count > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
} else {
- stats->average_duration = duration << 8;
+ struct time_stat_buffer_entry *i;
+ struct time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (b->nr == ARRAY_SIZE(b->entries)) {
+ spin_lock_irqsave(&stats->lock, flags);
+ for (i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ spin_unlock_irqrestore(&stats->lock, flags);
+
+ b->nr = 0;
+ }
+
+ preempt_enable();
+ }
+}
+
+static const struct time_unit {
+ const char *name;
+ u32 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "sec", NSEC_PER_SEC },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+static size_t pr_time_units(char *buf, size_t len, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
+size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len)
+{
+ char *out = buf, *end = buf + len;
+ const struct time_unit *u;
+ u64 freq = READ_ONCE(stats->average_frequency);
+ u64 q, last_q = 0;
+ int i;
+
+ out += scnprintf(out, end - out, "count:\t\t%llu\n",
+ stats->count);
+ out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n",
+ freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
+
+ out += scnprintf(out, end - out, "frequency:\t");
+ out += pr_time_units(out, end - out, freq);
+
+ out += scnprintf(out, end - out, "\navg duration:\t");
+ out += pr_time_units(out, end - out, stats->average_duration);
+
+ out += scnprintf(out, end - out, "\nmax duration:\t");
+ out += pr_time_units(out, end - out, stats->max_duration);
+
+ i = eytzinger0_first(NR_QUANTILES);
+ u = pick_time_units(stats->quantiles.entries[i].m);
+
+ out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ q = max(stats->quantiles.entries[i].m, last_q);
+ out += scnprintf(out, end - out, "%llu%s",
+ div_u64(q, u->nsecs),
+ is_last ? "\n" : " ");
+ last_q = q;
}
- stats->last = now ?: 1;
+ return out - buf;
}
-void bch2_time_stats_update(struct time_stats *stats, u64 start_time)
+void bch2_time_stats_exit(struct time_stats *stats)
{
- spin_lock(&stats->lock);
- __bch2_time_stats_update(stats, start_time);
- spin_unlock(&stats->lock);
+ free_percpu(stats->buffer);
}
+void bch2_time_stats_init(struct time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ spin_lock_init(&stats->lock);
+}
+
+/* ratelimit: */
+
/**
* bch2_ratelimit_delay() - return how long to delay until the next time to do
* some work
@@ -310,6 +443,8 @@ int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
}
}
+/* pd controller: */
+
/*
* Updates pd_controller. Attempts to scale inputed values to units per second.
* @target: desired value
@@ -404,6 +539,8 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
derivative, change, next_io);
}
+/* misc: */
+
void bch2_bio_map(struct bio *bio, void *base)
{
size_t size = bio->bi_iter.bi_size;
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index cc89da1..7c7264f 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -371,87 +371,50 @@ ssize_t bch2_read_string_list(const char *, const char * const[]);
ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64);
u64 bch2_read_flag_list(char *, const char * const[]);
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+ struct quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+ unsigned nr;
+ struct time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[32];
+};
+
struct time_stats {
spinlock_t lock;
u64 count;
- /*
- * all fields are in nanoseconds, averages are ewmas stored left shifted
- * by 8
- */
- u64 last_duration;
- u64 max_duration;
+ /* all fields are in nanoseconds */
u64 average_duration;
u64 average_frequency;
- u64 last;
+ u64 max_duration;
+ u64 last_event;
+ struct quantiles quantiles;
+
+ struct time_stat_buffer __percpu *buffer;
};
-void bch2_time_stats_clear(struct time_stats *stats);
-void __bch2_time_stats_update(struct time_stats *stats, u64 time);
-void bch2_time_stats_update(struct time_stats *stats, u64 time);
+void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
-static inline unsigned local_clock_us(void)
+static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
{
- return local_clock() >> 10;
+ __bch2_time_stats_update(stats, start, local_clock());
}
-#define NSEC_PER_ns 1L
-#define NSEC_PER_us NSEC_PER_USEC
-#define NSEC_PER_ms NSEC_PER_MSEC
-#define NSEC_PER_sec NSEC_PER_SEC
-
-#define __print_time_stat(stats, name, stat, units) \
- sysfs_print(name ## _ ## stat ## _ ## units, \
- div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
-
-#define sysfs_print_time_stats(stats, name, \
- frequency_units, \
- duration_units) \
-do { \
- __print_time_stat(stats, name, \
- average_frequency, frequency_units); \
- __print_time_stat(stats, name, \
- average_duration, duration_units); \
- sysfs_print(name ## _ ##count, (stats)->count); \
- sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \
- div_u64((stats)->last_duration, \
- NSEC_PER_ ## duration_units)); \
- sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
- div_u64((stats)->max_duration, \
- NSEC_PER_ ## duration_units)); \
- \
- sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
- ? div_s64(local_clock() - (stats)->last, \
- NSEC_PER_ ## frequency_units) \
- : -1LL); \
-} while (0)
-
-#define sysfs_clear_time_stats(stats, name) \
-do { \
- if (attr == &sysfs_ ## name ## _clear) \
- bch2_time_stats_clear(stats); \
-} while (0)
+size_t bch2_time_stats_print(struct time_stats *, char *, size_t);
-#define sysfs_time_stats_attribute(name, \
- frequency_units, \
- duration_units) \
-write_attribute(name ## _clear); \
-read_attribute(name ## _count); \
-read_attribute(name ## _average_frequency_ ## frequency_units); \
-read_attribute(name ## _average_duration_ ## duration_units); \
-read_attribute(name ## _last_duration_ ## duration_units); \
-read_attribute(name ## _max_duration_ ## duration_units); \
-read_attribute(name ## _last_ ## frequency_units)
-
-#define sysfs_time_stats_attribute_list(name, \
- frequency_units, \
- duration_units) \
-&sysfs_ ## name ## _clear, \
-&sysfs_ ## name ## _count, \
-&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
-&sysfs_ ## name ## _average_duration_ ## duration_units, \
-&sysfs_ ## name ## _last_duration_ ## duration_units, \
-&sysfs_ ## name ## _max_duration_ ## duration_units, \
-&sysfs_ ## name ## _last_ ## frequency_units,
+void bch2_time_stats_exit(struct time_stats *);
+void bch2_time_stats_init(struct time_stats *);
#define ewma_add(ewma, val, weight) \
({ \