summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Kent Overstreet <kent.overstreet@gmail.com> 2020-11-13 14:41:06 -0500
committer Kent Overstreet <kent.overstreet@gmail.com> 2020-11-16 18:23:54 -0500
commit 13f53aa228c83731226f4a359983215f1f7c2a47 (patch)
tree f13df1911f064fe507afbd2420d49be797904e63
parent 3420d86959401e5884627efdf3c2361e50b05eaa (diff)
Update bcachefs sources to d1fd471830 bcachefs: Add more debug checks
-rw-r--r-- .bcachefs_revision 2
-rw-r--r-- include/linux/bitops.h 11
-rw-r--r-- include/linux/kernel.h 2
-rw-r--r-- include/linux/srcu.h 31
-rw-r--r-- include/linux/types.h 1
-rw-r--r-- include/trace/events/bcachefs.h 10
-rw-r--r-- libbcachefs/bcachefs.h 3
-rw-r--r-- libbcachefs/bkey_methods.c 18
-rw-r--r-- libbcachefs/bset.c 94
-rw-r--r-- libbcachefs/btree_cache.c 8
-rw-r--r-- libbcachefs/btree_io.c 9
-rw-r--r-- libbcachefs/btree_io.h 17
-rw-r--r-- libbcachefs/btree_iter.c 28
-rw-r--r-- libbcachefs/btree_key_cache.c 130
-rw-r--r-- libbcachefs/btree_types.h 9
-rw-r--r-- libbcachefs/btree_update_interior.c 39
-rw-r--r-- libbcachefs/btree_update_interior.h 3
-rw-r--r-- libbcachefs/btree_update_leaf.c 2
-rw-r--r-- libbcachefs/buckets.c 2
-rw-r--r-- libbcachefs/fs-io.c 68
-rw-r--r-- libbcachefs/fs.c 8
-rw-r--r-- libbcachefs/fs.h 1
-rw-r--r-- libbcachefs/inode.c 15
-rw-r--r-- libbcachefs/journal.c 230
-rw-r--r-- libbcachefs/journal.h 7
-rw-r--r-- libbcachefs/journal_io.c 117
-rw-r--r-- libbcachefs/journal_reclaim.c 7
-rw-r--r-- libbcachefs/journal_types.h 3
-rw-r--r-- libbcachefs/recovery.c 3
-rw-r--r-- libbcachefs/sysfs.c 2
30 files changed, 548 insertions, 332 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index dc583047..9c20ba85 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-1d669389f79de8571732c13fdf4d23039e2308fd
+d1fd47183051729471bce1c9f84fa63cb84dc557
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index f2183d54..2fe736e9 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -85,6 +85,17 @@ static inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
return (old & mask) != 0;
}
+static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+ unsigned long old;
+
+ old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);
+
+ return (old & mask) != 0;
+}
+
static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 10d94c5e..4b45306d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -219,4 +219,6 @@ struct qstr {
#define POISON_FREE 0x6b
+static inline void dump_stack(void) {}
+
#endif
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
new file mode 100644
index 00000000..75823cf2
--- /dev/null
+++ b/include/linux/srcu.h
@@ -0,0 +1,31 @@
+#ifndef __TOOLS_LINUX_SRCU_H
+#define __TOOLS_LINUX_SRCU_H
+
+struct srcu_struct {
+};
+
+static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) {}
+
+static inline int srcu_read_lock(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+static inline bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ return false;
+}
+
+static inline unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {}
+
+static inline int init_srcu_struct(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+#endif /* __TOOLS_LINUX_SRCU_H */
diff --git a/include/linux/types.h b/include/linux/types.h
index 387c3831..1e125550 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -31,6 +31,7 @@ typedef unsigned gfp_t;
#define __GFP_IO 0
#define __GFP_NOWARN 0
#define __GFP_NORETRY 0
+#define __GFP_NOFAIL 0
#define __GFP_ZERO 1
#define PAGE_ALLOC_COSTLY_ORDER 6
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index ba2c5555..a8b8c5b6 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -513,7 +513,7 @@ TRACE_EVENT(transaction_restart_ip,
__entry->ip = ip;
),
- TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip)
+ TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
);
DECLARE_EVENT_CLASS(transaction_restart,
@@ -528,7 +528,7 @@ DECLARE_EVENT_CLASS(transaction_restart,
__entry->ip = ip;
),
- TP_printk("%pf", (void *) __entry->ip)
+ TP_printk("%ps", (void *) __entry->ip)
);
DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
@@ -568,7 +568,7 @@ TRACE_EVENT(trans_restart_would_deadlock,
__entry->want_iter_type = want_iter_type;
),
- TP_printk("%pF %pF because %u have %u:%u want %u:%u",
+ TP_printk("%ps %pS because %u have %u:%u want %u:%u",
(void *) __entry->trans_ip,
(void *) __entry->caller_ip,
__entry->reason,
@@ -592,7 +592,7 @@ TRACE_EVENT(trans_restart_iters_realloced,
__entry->nr = nr;
),
- TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr)
+ TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr)
);
TRACE_EVENT(trans_restart_mem_realloced,
@@ -609,7 +609,7 @@ TRACE_EVENT(trans_restart_mem_realloced,
__entry->bytes = bytes;
),
- TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes)
+ TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes)
);
DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 35311dbb..b20895a4 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -193,6 +193,7 @@
#include <linux/semaphore.h>
#include <linux/seqlock.h>
#include <linux/shrinker.h>
+#include <linux/srcu.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
@@ -642,6 +643,8 @@ struct bch_fs {
mempool_t btree_iters_pool;
struct btree_iter_buf __percpu *btree_iters_bufs;
+ struct srcu_struct btree_trans_barrier;
+
struct btree_key_cache btree_key_cache;
struct workqueue_struct *wq;
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 99b7fce2..f5779795 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -181,8 +181,12 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
{
if (k) {
- pr_buf(out, "u64s %u type %s ", k->u64s,
- bch2_bkey_types[k->type]);
+ pr_buf(out, "u64s %u type ", k->u64s);
+
+ if (k->type < KEY_TYPE_MAX)
+ pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+ else
+ pr_buf(out, "%u ", k->type);
bch2_bpos_to_text(out, k->p);
@@ -196,10 +200,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+ if (k.k->type < KEY_TYPE_MAX) {
+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
- if (likely(ops->val_to_text))
- ops->val_to_text(out, c, k);
+ if (likely(ops->val_to_text))
+ ops->val_to_text(out, c, k);
+ } else {
+ pr_buf(out, "(invalid type %u)", k.k->type);
+ }
}
void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 26716657..1c7318c6 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -604,53 +604,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
return (u16) v;
}
-static void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
+__always_inline
+static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
{
struct bkey_float *f = bkey_float(b, t, j);
struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *l, *r;
+ struct bkey_packed *l = is_power_of_2(j)
+ ? min_key
+ : tree_to_prev_bkey(b, t, j >> ffs(j));
+ struct bkey_packed *r = is_power_of_2(j + 1)
+ ? max_key
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
unsigned mantissa;
int shift, exponent, high_bit;
- if (is_power_of_2(j)) {
- l = min_key;
-
- if (!l->u64s) {
- if (!bkey_pack_pos(l, b->data->min_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = b->data->min_key;
- bkey_copy(l, &tmp);
- }
- }
- } else {
- l = tree_to_prev_bkey(b, t, j >> ffs(j));
-
- EBUG_ON(m < l);
- }
-
- if (is_power_of_2(j + 1)) {
- r = max_key;
-
- if (!r->u64s) {
- if (!bkey_pack_pos(r, t->max_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = t->max_key;
- bkey_copy(r, &tmp);
- }
- }
- } else {
- r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- EBUG_ON(m > r);
- }
-
/*
* for failed bfloats, the lookup code falls back to comparing against
* the original key.
@@ -707,6 +677,30 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
f->mantissa = mantissa;
}
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
+{
+ struct bkey_i *k;
+
+ if (is_power_of_2(j) &&
+ !min_key->u64s) {
+ k = (void *) min_key;
+ bkey_init(&k->k);
+ k->k.p = b->data->min_key;
+ }
+
+ if (is_power_of_2(j + 1) &&
+ !max_key->u64s) {
+ k = (void *) max_key;
+ bkey_init(&k->k);
+ k->k.p = t->max_key;
+ }
+
+ __make_bfloat(b, t, j, min_key, max_key);
+}
+
/* bytes remaining - only valid for last bset: */
static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
@@ -726,7 +720,7 @@ static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_t
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *k;
@@ -745,15 +739,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
}
}
-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
- struct bkey_packed min_key, max_key;
+ struct bkey_i min_key, max_key;
unsigned j, cacheline = 1;
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
-
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
retry:
@@ -789,9 +780,16 @@ retry:
t->max_key = bkey_unpack_pos(b, prev);
+ bkey_init(&min_key.k);
+ min_key.k.p = b->data->min_key;
+ bkey_init(&max_key.k);
+ max_key.k.p = t->max_key;
+
/* Then we build the tree */
eytzinger1_for_each(j, t->size)
- make_bfloat(b, t, j, &min_key, &max_key);
+ __make_bfloat(b, t, j,
+ bkey_to_packed(&min_key),
+ bkey_to_packed(&max_key));
}
static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 325a1661..5bceff48 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -328,9 +328,9 @@ restart:
clear_btree_node_accessed(b);
}
- memalloc_nofs_restore(flags);
mutex_unlock(&bc->lock);
out:
+ memalloc_nofs_restore(flags);
return (unsigned long) freed * btree_pages(c);
}
@@ -381,11 +381,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
btree_node_data_free(c, b);
}
+ BUG_ON(atomic_read(&c->btree_cache.dirty));
+
while (!list_empty(&bc->freed)) {
b = list_first_entry(&bc->freed, struct btree, list);
list_del(&b->list);
@@ -445,7 +447,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bc->shrink.scan_objects = bch2_btree_cache_scan;
bc->shrink.seeks = 4;
bc->shrink.batch = btree_pages(c) * 2;
- register_shrinker(&bc->shrink);
+ ret = register_shrinker(&bc->shrink);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 10a00085..2406745f 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1442,8 +1442,10 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
- if (ret)
+ if (ret) {
bch2_inconsistent_error(c);
+ dump_stack();
+ }
return ret;
}
@@ -1498,6 +1500,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
+ atomic_dec(&c->btree_cache.dirty);
+
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);
@@ -1530,6 +1534,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
seq = max(seq, le64_to_cpu(i->journal_seq));
}
+ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+ bytes += 8;
+
data = btree_bounce_alloc(c, bytes, &used_mempool);
if (!b->written) {
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 626d0f07..1a4b11e9 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -14,6 +14,23 @@ struct btree_write;
struct btree;
struct btree_iter;
+static inline bool btree_node_dirty(struct btree *b)
+{
+ return test_bit(BTREE_NODE_dirty, &b->flags);
+}
+
+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_inc(&c->btree_cache.dirty);
+}
+
+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_dec(&c->btree_cache.dirty);
+}
+
struct btree_read_bio {
struct bch_fs *c;
u64 start_time;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 58f1a3dd..96cc5394 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -2342,12 +2342,15 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
unsigned new_size = BTREE_ITER_MAX;
size_t iters_bytes = sizeof(struct btree_iter) * new_size;
size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size;
- void *p;
+ void *p = NULL;
BUG_ON(trans->used_mempool);
- p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
- mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+#ifdef __KERNEL__
+ p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+#endif
+ if (!p)
+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
trans->iters = p; p += iters_bytes;
trans->updates = p; p += updates_bytes;
@@ -2369,8 +2372,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
*/
bch2_trans_alloc_iters(trans, c);
- if (expected_mem_bytes)
- bch2_trans_preload_mem(trans, expected_mem_bytes);
+ if (expected_mem_bytes) {
+ trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
+ trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+ }
+
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
#ifdef CONFIG_BCACHEFS_DEBUG
trans->pid = current->pid;
@@ -2392,12 +2399,19 @@ int bch2_trans_exit(struct btree_trans *trans)
mutex_unlock(&trans->c->btree_trans_lock);
#endif
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
kfree(trans->fs_usage_deltas);
kfree(trans->mem);
+#ifdef __KERNEL__
+ /*
+ * Userspace doesn't have a real percpu implementation:
+ */
trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+#endif
if (trans->iters)
mempool_free(trans->iters, &trans->c->btree_iters_pool);
@@ -2474,6 +2488,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
mempool_exit(&c->btree_iters_pool);
+ cleanup_srcu_struct(&c->btree_trans_barrier);
}
int bch2_fs_btree_iter_init(struct bch_fs *c)
@@ -2483,7 +2498,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
INIT_LIST_HEAD(&c->btree_trans_list);
mutex_init(&c->btree_trans_lock);
- return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+ return init_srcu_struct(&c->btree_trans_barrier) ?:
+ mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
sizeof(struct btree_insert_entry) * nr +
sizeof(struct btree_insert_entry) * nr);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 0ee4f78c..d605ff18 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -9,6 +9,7 @@
#include "journal.h"
#include "journal_reclaim.h"
+#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -66,12 +67,19 @@ static void bkey_cached_evict(struct btree_key_cache *c,
BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params));
memset(&ck->key, ~0, sizeof(ck->key));
+
+ c->nr_keys--;
}
-static void bkey_cached_free(struct btree_key_cache *c,
+static void bkey_cached_free(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
- list_move(&ck->list, &c->freed);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ list_move(&ck->list, &bc->freed);
kfree(ck->k);
ck->k = NULL;
@@ -135,6 +143,8 @@ btree_key_cache_create(struct btree_key_cache *c,
return NULL;
}
+ c->nr_keys++;
+
list_move(&ck->list, &c->clean);
six_unlock_write(&ck->c.lock);
@@ -355,10 +365,14 @@ err:
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
if (!evict) {
mutex_lock(&c->btree_key_cache.lock);
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ c->btree_key_cache.nr_dirty--;
+ }
+
list_move_tail(&ck->list, &c->btree_key_cache.clean);
mutex_unlock(&c->btree_key_cache.lock);
} else {
@@ -371,6 +385,11 @@ evict:
six_lock_write(&ck->c.lock, NULL, NULL);
mutex_lock(&c->btree_key_cache.lock);
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ c->btree_key_cache.nr_dirty--;
+ }
+
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free(&c->btree_key_cache, ck);
mutex_unlock(&c->btree_key_cache.lock);
@@ -391,19 +410,23 @@ static void btree_key_cache_journal_flush(struct journal *j,
struct bkey_cached_key key;
struct btree_trans trans;
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+
six_lock_read(&ck->c.lock, NULL, NULL);
key = ck->key;
if (ck->journal.seq != seq ||
!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
six_unlock_read(&ck->c.lock);
- return;
+ goto unlock;
}
six_unlock_read(&ck->c.lock);
bch2_trans_init(&trans, c, 0, 0);
btree_key_cache_flush_pos(&trans, key, seq, false);
bch2_trans_exit(&trans);
+unlock:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
}
/*
@@ -448,9 +471,10 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
mutex_lock(&c->btree_key_cache.lock);
- list_del_init(&ck->list);
+ list_move(&ck->list, &c->btree_key_cache.dirty);
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ c->btree_key_cache.nr_dirty++;
mutex_unlock(&c->btree_key_cache.lock);
}
@@ -467,20 +491,97 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
}
#endif
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c)
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
+ btree_key_cache.shrink);
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck, *t;
+ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+ unsigned flags;
+
+ /* Return -1 if we can't do anything right now */
+ if (sc->gfp_mask & __GFP_FS)
+ mutex_lock(&bc->lock);
+ else if (!mutex_trylock(&bc->lock))
+ return -1;
+
+ flags = memalloc_nofs_save();
+
+ list_for_each_entry_safe(ck, t, &bc->freed, list) {
+ scanned++;
+
+ if (poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq)) {
+ list_del(&ck->list);
+ kfree(ck);
+ freed++;
+ }
+
+ if (scanned >= nr)
+ goto out;
+ }
+
+ list_for_each_entry_safe(ck, t, &bc->clean, list) {
+ scanned++;
+
+ if (bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+ }
+
+ if (scanned >= nr) {
+ if (&t->list != &bc->clean)
+ list_move_tail(&bc->clean, &t->list);
+ goto out;
+ }
+ }
+out:
+ memalloc_nofs_restore(flags);
+ mutex_unlock(&bc->lock);
+
+ return freed;
+}
+
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
+ btree_key_cache.shrink);
+ struct btree_key_cache *bc = &c->btree_key_cache;
+
+ return bc->nr_keys;
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bkey_cached *ck, *n;
- mutex_lock(&c->lock);
- list_for_each_entry_safe(ck, n, &c->clean, list) {
+ if (bc->shrink.list.next)
+ unregister_shrinker(&bc->shrink);
+
+ mutex_lock(&bc->lock);
+ list_splice(&bc->dirty, &bc->clean);
+
+ list_for_each_entry_safe(ck, n, &bc->clean, list) {
+ bch2_journal_pin_drop(&c->journal, &ck->journal);
+ bch2_journal_preres_put(&c->journal, &ck->res);
+
kfree(ck->k);
kfree(ck);
+ bc->nr_keys--;
}
- list_for_each_entry_safe(ck, n, &c->freed, list)
+
+ BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
+ BUG_ON(bc->nr_keys);
+
+ list_for_each_entry_safe(ck, n, &bc->freed, list)
kfree(ck);
- mutex_unlock(&c->lock);
+ mutex_unlock(&bc->lock);
- rhashtable_destroy(&c->table);
+ rhashtable_destroy(&bc->table);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
@@ -488,11 +589,16 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed);
INIT_LIST_HEAD(&c->clean);
+ INIT_LIST_HEAD(&c->dirty);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
- return rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+ c->shrink.count_objects = bch2_btree_key_cache_count;
+ c->shrink.scan_objects = bch2_btree_key_cache_scan;
+
+ return register_shrinker(&c->shrink) ?:
+ rhashtable_init(&c->table, &bch2_btree_key_cache_params);
}
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 93721fbc..6013c916 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -158,6 +158,7 @@ struct btree_cache {
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ atomic_t dirty;
struct shrinker shrink;
/*
@@ -294,6 +295,11 @@ struct btree_key_cache {
struct rhashtable table;
struct list_head freed;
struct list_head clean;
+ struct list_head dirty;
+ struct shrinker shrink;
+
+ size_t nr_keys;
+ size_t nr_dirty;
};
struct bkey_cached_key {
@@ -309,6 +315,7 @@ struct bkey_cached {
unsigned long flags;
u8 u64s;
bool valid;
+ u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;
@@ -345,6 +352,7 @@ struct btree_trans {
pid_t pid;
#endif
unsigned long ip;
+ int srcu_idx;
u64 iters_linked;
u64 iters_live;
@@ -411,7 +419,6 @@ enum btree_flags {
BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 4ddd1697..d4f3dd7a 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -11,6 +11,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
+#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
@@ -149,7 +150,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
b->ob.nr = 0;
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
@@ -264,7 +265,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
b = as->prealloc_nodes[--as->nr_prealloc_nodes];
set_btree_node_accessed(b);
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
@@ -523,6 +524,7 @@ static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
struct btree *b = as->b;
+ struct btree_trans trans;
u64 journal_seq = 0;
unsigned i;
int ret;
@@ -540,14 +542,16 @@ static void btree_update_nodes_written(struct btree_update *as)
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
- ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED,
- btree_update_nodes_written_trans(&trans, as));
+ bch2_trans_init(&trans, c, 0, 512);
+ ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ BTREE_INSERT_JOURNAL_RESERVED,
+ btree_update_nodes_written_trans(&trans, as));
+ bch2_trans_exit(&trans);
BUG_ON(ret && !bch2_journal_error(&c->journal));
if (b) {
@@ -827,7 +831,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
closure_wake_up(&c->btree_interior_update_wait);
}
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
clear_btree_node_need_write(b);
/*
@@ -1018,7 +1022,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
struct bkey_i *insert,
struct btree_node_iter *node_iter)
{
+ struct bch_fs *c = as->c;
struct bkey_packed *k;
+ const char *invalid;
+
+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b));
+ if (invalid) {
+ char buf[160];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
+ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+ dump_stack();
+ }
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
@@ -1034,7 +1049,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
bch2_btree_node_iter_advance(node_iter, b);
bch2_btree_bset_insert_key(iter, b, node_iter, insert);
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
}
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 7668225e..41854fc3 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -237,6 +237,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
b->whiteout_u64s;
ssize_t total = c->opts.btree_node_size << 6;
+ /* Always leave one extra u64 for bch2_varint_decode: */
+ used++;
+
return total - used;
}
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index e386f8ed..a2ca31e7 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -191,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
if (unlikely(!btree_node_dirty(b)))
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 82f1cc4c..be65f2e7 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -323,7 +323,7 @@ static u64 reserve_factor(u64 r)
static u64 avail_factor(u64 r)
{
- return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
+ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
}
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1eb69ed3..389f23ee 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -35,6 +35,22 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
+static inline struct address_space *faults_disabled_mapping(void)
+{
+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+ current->faults_disabled_mapping =
+ (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+ return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
struct quota_res {
u64 sectors;
};
@@ -493,10 +509,35 @@ static void bch2_set_page_dirty(struct bch_fs *c,
vm_fault_t bch2_page_fault(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct address_space *fdm = faults_disabled_mapping();
struct bch_inode_info *inode = file_bch_inode(file);
int ret;
+ if (fdm == mapping)
+ return VM_FAULT_SIGBUS;
+
+ /* Lock ordering: */
+ if (fdm > mapping) {
+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+ if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+ goto got_lock;
+
+ bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+ bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+
+ /* Signal that lock has been dropped: */
+ set_fdm_dropped_locks();
+ return VM_FAULT_SIGBUS;
+ }
+
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+got_lock:
ret = filemap_fault(vmf);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
@@ -1742,14 +1783,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bio *bio = &dio->op.wbio.bio;
struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned unaligned;
- bool sync = dio->sync;
+ unsigned unaligned, iter_count;
+ bool sync = dio->sync, dropped_locks;
long ret;
if (dio->loop)
goto loop;
while (1) {
+ iter_count = dio->iter.count;
+
if (kthread)
kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
@@ -1757,13 +1800,34 @@ static long bch2_dio_write_loop(struct dio_write *dio)
ret = bio_iov_iter_get_pages(bio, &dio->iter);
+ dropped_locks = fdm_dropped_locks();
+
current->faults_disabled_mapping = NULL;
if (kthread)
kthread_unuse_mm(dio->mm);
+ /*
+ * If the fault handler returned an error but also signalled
+ * that it dropped & retook ei_pagecache_lock, we just need to
+ * re-shoot down the page cache and retry:
+ */
+ if (dropped_locks && ret)
+ ret = 0;
+
if (unlikely(ret < 0))
goto err;
+ if (unlikely(dropped_locks)) {
+ ret = write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter_count - 1);
+ if (unlikely(ret))
+ goto err;
+
+ if (!bio->bi_iter.bi_size)
+ continue;
+ }
+
unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
bio->bi_iter.bi_size -= unaligned;
iov_iter_revert(&dio->iter, unaligned);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 3ac57ba2..6e3d4bea 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -91,6 +91,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock)
__pagecache_lock_put(lock, 1);
}
+bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
+{
+ return __pagecache_lock_tryget(lock, 1);
+}
+
void bch2_pagecache_add_get(struct pagecache_lock *lock)
{
__pagecache_lock_get(lock, 1);
@@ -271,7 +276,8 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
if (!tmpfile)
mutex_lock(&dir->ei_update_lock);
- bch2_trans_init(&trans, c, 8, 1024);
+ bch2_trans_init(&trans, c, 8,
+ 2048 + (!tmpfile ? dentry->d_name.len : 0));
retry:
bch2_trans_begin(&trans);
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index eda903a4..4ee1ac99 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -26,6 +26,7 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock)
}
void bch2_pagecache_add_put(struct pagecache_lock *);
+bool bch2_pagecache_add_tryget(struct pagecache_lock *);
void bch2_pagecache_add_get(struct pagecache_lock *);
void bch2_pagecache_block_put(struct pagecache_lock *);
void bch2_pagecache_block_get(struct pagecache_lock *);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 42371de7..823a1dde 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -537,7 +537,9 @@ found_slot:
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
- return bch2_inode_write(trans, iter, inode_u);
+ ret = bch2_inode_write(trans, iter, inode_u);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
@@ -574,16 +576,9 @@ retry:
bi_generation = 0;
- ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr));
- if (ret) {
- if (ret != -EINTR)
- bch_err(c, "error flushing btree key cache: %i", ret);
- goto err;
- }
-
iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_cached(iter);
ret = bkey_err(k);
if (ret)
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index c2cafd38..e99faad8 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -18,7 +18,19 @@
#include <trace/events/bcachefs.h>
-static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64);
+static u64 last_unwritten_seq(struct journal *j)
+{ /* Returns journal_cur_seq(j) minus the prev_buf_unwritten field of j->reservations (presumably 0 or 1 - TODO confirm) */
+ union journal_res_state s = READ_ONCE(j->reservations); /* take one snapshot of the reservation state */
+
+ lockdep_assert_held(&j->lock); /* caller must hold j->lock */
+
+ return journal_cur_seq(j) - s.prev_buf_unwritten;
+}
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{ /* True when seq is at or past last_unwritten_seq(j); that helper asserts j->lock is held */
+ return seq >= last_unwritten_seq(j);
+}
static bool __journal_entry_is_open(union journal_res_state state)
{
@@ -30,6 +42,22 @@ static bool journal_entry_is_open(struct journal *j)
return __journal_entry_is_open(j->reservations);
}
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{ /* Maps seq to its journal buffer in j->buf; returns NULL when journal_seq_unwritten() is false for seq */
+ struct journal_buf *buf = NULL; /* returned unchanged when seq is not unwritten */
+
+ EBUG_ON(seq > journal_cur_seq(j)); /* seq may not be ahead of the current journal seq */
+ EBUG_ON(seq == journal_cur_seq(j) && /* ...and if it is the current seq, that entry must not be in the closed state */
+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+
+ if (journal_seq_unwritten(j, seq)) {
+ buf = j->buf + (seq & 1); /* slot is chosen by the low bit of seq */
+ EBUG_ON(le64_to_cpu(buf->data->seq) != seq); /* the slot's jset must carry this seq */
+ }
+ return buf;
+}
+
static void journal_pin_new_entry(struct journal *j, int count)
{
struct journal_entry_pin_list *p;
@@ -51,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
{
struct journal_buf *buf = journal_cur_buf(j);
+ bkey_extent_init(&buf->key);
+
memset(buf->has_inode, 0, sizeof(buf->has_inode));
memset(buf->data, 0, sizeof(*buf->data));
@@ -72,6 +102,7 @@ void bch2_journal_halt(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ j->err_seq = journal_cur_seq(j);
journal_wake(j);
closure_wake_up(&journal_cur_buf(j)->wait);
}
@@ -139,8 +170,6 @@ static bool __journal_entry_close(struct journal *j)
BUG_ON(sectors > buf->sectors);
buf->sectors = sectors;
- bkey_extent_init(&buf->key);
-
/*
* We have to set last_seq here, _before_ opening a new journal entry:
*
@@ -162,11 +191,6 @@ static bool __journal_entry_close(struct journal *j)
*/
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
- if (journal_entry_empty(buf->data))
- clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
- else
- set_bit(JOURNAL_NOT_EMPTY, &j->flags);
-
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
@@ -391,8 +415,17 @@ unlock:
goto retry;
if (ret == -ENOSPC) {
- WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
- "JOURNAL_RES_GET_RESERVED set but journal full");
+ if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
+ "JOURNAL_RES_GET_RESERVED set but journal full")) {
+ char *buf;
+
+ buf = kmalloc(4096, GFP_NOFS);
+ if (buf) {
+ bch2_journal_debug_to_text(&PBUF(buf), j);
+ pr_err("\n%s", buf);
+ kfree(buf);
+ }
+ }
/*
* Journal is full - can't rely on reclaim from work item due to
@@ -503,146 +536,28 @@ out:
/* journal flushing: */
-u64 bch2_journal_last_unwritten_seq(struct journal *j)
-{
- u64 seq;
-
- spin_lock(&j->lock);
- seq = journal_cur_seq(j);
- if (j->reservations.prev_buf_unwritten)
- seq--;
- spin_unlock(&j->lock);
-
- return seq;
-}
-
-/**
- * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
- * open yet, or wait if we cannot
- *
- * used by the btree interior update machinery, when it needs to write a new
- * btree root - every journal entry contains the roots of all the btrees, so it
- * doesn't need to bother with getting a journal reservation
- */
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int ret;
-
- spin_lock(&j->lock);
-
- /*
- * Can't try to open more than one sequence number ahead:
- */
- BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
-
- if (journal_cur_seq(j) > seq ||
- journal_entry_is_open(j)) {
- spin_unlock(&j->lock);
- return 0;
- }
-
- if (journal_cur_seq(j) < seq &&
- !__journal_entry_close(j)) {
- /* haven't finished writing out the previous one: */
- trace_journal_entry_full(c);
- ret = -EAGAIN;
- } else {
- BUG_ON(journal_cur_seq(j) != seq);
-
- ret = journal_entry_open(j);
- }
-
- if ((ret == -EAGAIN || ret == -ENOSPC) &&
- !j->res_get_blocked_start)
- j->res_get_blocked_start = local_clock() ?: 1;
-
- if (ret == -EAGAIN || ret == -ENOSPC)
- closure_wait(&j->async_wait, cl);
-
- spin_unlock(&j->lock);
-
- if (ret == -ENOSPC) {
- trace_journal_full(c);
- bch2_journal_reclaim_work(&j->reclaim_work.work);
- ret = -EAGAIN;
- }
-
- return ret;
-}
-
-static int journal_seq_error(struct journal *j, u64 seq)
-{
- union journal_res_state state = READ_ONCE(j->reservations);
-
- if (seq == journal_cur_seq(j))
- return bch2_journal_error(j);
-
- if (seq + 1 == journal_cur_seq(j) &&
- !state.prev_buf_unwritten &&
- seq > j->seq_ondisk)
- return -EIO;
-
- return 0;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
- /* seq should be for a journal entry that has been opened: */
- BUG_ON(seq > journal_cur_seq(j));
- BUG_ON(seq == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
-
- if (seq == journal_cur_seq(j))
- return journal_cur_buf(j);
- if (seq + 1 == journal_cur_seq(j) &&
- j->reservations.prev_buf_unwritten)
- return journal_prev_buf(j);
- return NULL;
-}
-
-/**
- * bch2_journal_wait_on_seq - wait for a journal entry to be written
- *
- * does _not_ cause @seq to be written immediately - if there is no other
- * activity to cause the relevant journal entry to be filled up or flushed it
- * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
- * configurable).
- */
-void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
- struct closure *parent)
-{
- struct journal_buf *buf;
-
- spin_lock(&j->lock);
-
- if ((buf = journal_seq_to_buf(j, seq))) {
- if (!closure_wait(&buf->wait, parent))
- BUG();
-
- if (seq == journal_cur_seq(j)) {
- smp_mb();
- if (bch2_journal_error(j))
- closure_wake_up(&buf->wait);
- }
- }
-
- spin_unlock(&j->lock);
-}
-
/**
* bch2_journal_flush_seq_async - wait for a journal entry to be written
*
* like bch2_journal_wait_on_seq, except that it triggers a write immediately if
* necessary
*/
-void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
struct closure *parent)
{
struct journal_buf *buf;
+ int ret = 0;
spin_lock(&j->lock);
+ if (seq <= j->err_seq) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (seq <= j->seq_ondisk) {
+ ret = 1;
+ goto out;
+ }
if (parent &&
(buf = journal_seq_to_buf(j, seq)))
@@ -651,20 +566,8 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
+out:
spin_unlock(&j->lock);
-}
-
-static int journal_seq_flushed(struct journal *j, u64 seq)
-{
- int ret;
-
- spin_lock(&j->lock);
- ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq);
-
- if (seq == journal_cur_seq(j))
- __journal_entry_close(j);
- spin_unlock(&j->lock);
-
return ret;
}
@@ -673,28 +576,13 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
u64 start_time = local_clock();
int ret, ret2;
- ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
+ ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
bch2_time_stats_update(j->flush_seq_time, start_time);
return ret ?: ret2 < 0 ? ret2 : 0;
}
-/**
- * bch2_journal_meta_async - force a journal entry to be written
- */
-void bch2_journal_meta_async(struct journal *j, struct closure *parent)
-{
- struct journal_res res;
-
- memset(&res, 0, sizeof(res));
-
- bch2_journal_res_get(j, &res, jset_u64s(0), 0);
- bch2_journal_res_put(j, &res);
-
- bch2_journal_flush_seq_async(j, res.seq, parent);
-}
-
int bch2_journal_meta(struct journal *j)
{
struct journal_res res;
@@ -989,7 +877,8 @@ void bch2_fs_journal_stop(struct journal *j)
journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+ (journal_entry_is_open(j) ||
+ j->last_empty_seq + 1 != journal_cur_seq(j)));
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);
@@ -1047,6 +936,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
set_bit(JOURNAL_STARTED, &j->flags);
journal_pin_new_entry(j, 1);
+
+ j->reservations.idx = journal_cur_seq(j);
+
bch2_journal_buf_init(j);
c->last_bucket_seq_cleanup = journal_cur_seq(j);
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index f60bc964..25c68767 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -464,13 +464,8 @@ void bch2_journal_entry_res_resize(struct journal *,
struct journal_entry_res *,
unsigned);
-u64 bch2_journal_last_unwritten_seq(struct journal *);
-int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
-
-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
-void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index bd0e6b37..7c157bc5 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -161,6 +161,8 @@ static void journal_entry_null_range(void *start, void *end)
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+#define FSCK_DELETED_KEY 5 /* journal_validate_key() result: the bad key was removed, or the entry was cut short at it */
+
static int journal_validate_key(struct bch_fs *c, struct jset *jset,
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
@@ -173,28 +175,42 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
int ret = 0;
if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in journal: k->u64s 0", type)) {
+ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0",
+ type, le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry), c,
- "invalid %s in journal: extends past end of journal entry",
- type)) {
+ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry",
+ type, le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in journal: bad format %u",
- type, k->k.format)) {
- le16_add_cpu(&entry->u64s, -k->k.u64s);
+ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u",
+ type, le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s),
+ k->k.format)) {
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (!write)
@@ -208,13 +224,18 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
- type, invalid, buf);
-
- le16_add_cpu(&entry->u64s, -k->k.u64s);
+ mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s",
+ type, le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s),
+ invalid, buf);
+
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (write)
@@ -230,15 +251,17 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
struct jset_entry *entry,
int write)
{
- struct bkey_i *k;
+ struct bkey_i *k = entry->start;
- vstruct_for_each(entry, k) {
+ while (k != vstruct_last(entry)) {
int ret = journal_validate_key(c, jset, entry,
entry->level,
entry->btree_id,
k, "key", write);
- if (ret)
- return ret;
+ if (ret == FSCK_DELETED_KEY)
+ continue;
+
+ k = bkey_next(k);
}
return 0;
@@ -432,46 +455,45 @@ static int jset_validate(struct bch_fs *c,
"%s sector %llu seq %llu: unknown journal entry version %u",
ca->name, sector, le64_to_cpu(jset->seq),
version)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
+ /* don't try to continue: */
+ return EINVAL;
}
+ if (bytes > (sectors_read << 9) &&
+ sectors_read < bucket_sectors_left)
+ return JOURNAL_ENTRY_REREAD;
+
if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
+ ret = JOURNAL_ENTRY_BAD;
+ le32_add_cpu(&jset->u64s,
+ -((bytes - (bucket_sectors_left << 9)) / 8));
}
- if (bytes > sectors_read << 9)
- return JOURNAL_ENTRY_REREAD;
-
if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
ca->name, sector, le64_to_cpu(jset->seq),
- JSET_CSUM_TYPE(jset)))
- return JOURNAL_ENTRY_BAD;
+ JSET_CSUM_TYPE(jset))) {
+ ret = JOURNAL_ENTRY_BAD;
+ goto bad_csum_type;
+ }
csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
"%s sector %llu seq %llu: journal checksum bad",
- ca->name, sector, le64_to_cpu(jset->seq))) {
- /* XXX: retry IO, when we start retrying checksum errors */
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
+ ca->name, sector, le64_to_cpu(jset->seq)))
+ ret = JOURNAL_ENTRY_BAD;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
-
+bad_csum_type:
if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
"invalid journal entry: last_seq > seq")) {
jset->last_seq = jset->seq;
return JOURNAL_ENTRY_BAD;
}
-
- return 0;
fsck_err:
return ret;
}
@@ -939,24 +961,29 @@ static void journal_write_done(struct closure *cl)
struct bch_replicas_padded replicas;
u64 seq = le64_to_cpu(w->data->seq);
u64 last_seq = le64_to_cpu(w->data->last_seq);
+ int err = 0;
bch2_time_stats_update(j->write_time, j->write_start_time);
if (!devs.nr) {
bch_err(c, "unable to write journal to sufficient devices");
- goto err;
+ err = -EIO;
+ } else {
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
+ if (bch2_mark_replicas(c, &replicas.e))
+ err = -EIO;
}
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
-
- if (bch2_mark_replicas(c, &replicas.e))
- goto err;
+ if (err)
+ bch2_fatal_error(c);
spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = devs;
j->seq_ondisk = seq;
+ if (err && (!j->err_seq || seq < j->err_seq))
+ j->err_seq = seq;
j->last_seq_ondisk = last_seq;
bch2_journal_space_available(j);
@@ -968,7 +995,7 @@ static void journal_write_done(struct closure *cl)
* bch2_fs_journal_stop():
*/
mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
-out:
+
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
@@ -982,11 +1009,6 @@ out:
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
spin_unlock(&j->lock);
- return;
-err:
- bch2_fatal_error(c);
- spin_lock(&j->lock);
- goto out;
}
static void journal_write_endio(struct bio *bio)
@@ -1067,6 +1089,9 @@ void bch2_journal_write(struct closure *cl)
SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+ if (journal_entry_empty(jset))
+ j->last_empty_seq = le64_to_cpu(jset->seq);
+
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 18e45296..7a04d06b 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -263,6 +263,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;
}
@@ -547,6 +548,12 @@ void bch2_journal_reclaim(struct journal *j)
if (j->prereserved.reserved * 2 > j->prereserved.remaining)
min_nr = 1;
+
+ if ((atomic_read(&c->btree_cache.dirty) * 4 >
+ c->btree_cache.used * 3) ||
+ (c->btree_key_cache.nr_dirty * 4 >
+ c->btree_key_cache.nr_keys))
+ min_nr = 1;
} while (journal_flush_pins(j, seq_to_flush, min_nr));
if (!bch2_journal_error(j))
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 154b51b8..9757e3d5 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -127,7 +127,6 @@ enum {
JOURNAL_STARTED,
JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
- JOURNAL_NOT_EMPTY,
JOURNAL_MAY_GET_UNRESERVED,
};
@@ -181,6 +180,8 @@ struct journal {
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 last_seq_ondisk;
+ u64 err_seq;
+ u64 last_empty_seq;
/*
* FIFO of journal entries whose btree updates have not yet been
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 1745cfac..67500636 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -456,6 +456,7 @@ retry:
__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split,
BTREE_TRIGGER_NORUN);
+ bch2_trans_iter_put(&trans, split_iter);
bch2_btree_iter_set_pos(iter, split->k.p);
@@ -481,6 +482,8 @@ retry:
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY);
err:
+ bch2_trans_iter_put(&trans, iter);
+
if (ret == -EINTR)
goto retry;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index d7ad293a..58c00e26 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -458,7 +458,7 @@ STORE(bch2_fs)
/* Debugging: */
if (attr == &sysfs_trigger_journal_flush)
- bch2_journal_meta_async(&c->journal, NULL);
+ bch2_journal_meta(&c->journal);
if (attr == &sysfs_trigger_btree_coalesce)
bch2_coalesce(c);