summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Kent Overstreet <kent.overstreet@gmail.com> 2020-11-29 23:55:51 -0500
committer: Kent Overstreet <kent.overstreet@gmail.com> 2020-11-30 00:06:46 -0500
commit: 1e574cb1aa07ab3a796c7d6c5501b96f3056ef4d (patch)
tree: ebcb516a753e19eb83f2dd4ed9b4f851f8afd911
parent: 41bec63b265a38dd9fa168b6042ea5bf07135048 (diff)
Update bcachefs sources to 021e62a098 bcachefs: Fix error in filesystem initialization
-rw-r--r--.bcachefs_revision2
-rw-r--r--include/linux/sched/mm.h15
-rw-r--r--include/linux/slab.h31
-rw-r--r--include/trace/events/bcachefs.h69
-rw-r--r--libbcachefs/alloc_background.c2
-rw-r--r--libbcachefs/bcachefs.h1
-rw-r--r--libbcachefs/btree_cache.c6
-rw-r--r--libbcachefs/btree_cache.h1
-rw-r--r--libbcachefs/btree_gc.c2
-rw-r--r--libbcachefs/btree_key_cache.c138
-rw-r--r--libbcachefs/btree_key_cache.h21
-rw-r--r--libbcachefs/btree_types.h6
-rw-r--r--libbcachefs/btree_update.h4
-rw-r--r--libbcachefs/btree_update_interior.c27
-rw-r--r--libbcachefs/btree_update_interior.h1
-rw-r--r--libbcachefs/btree_update_leaf.c96
-rw-r--r--libbcachefs/buckets.c10
-rw-r--r--libbcachefs/buckets.h6
-rw-r--r--libbcachefs/chardev.c3
-rw-r--r--libbcachefs/fs-io.c2
-rw-r--r--libbcachefs/fs.c2
-rw-r--r--libbcachefs/fsck.c2
-rw-r--r--libbcachefs/inode.c34
-rw-r--r--libbcachefs/inode.h2
-rw-r--r--libbcachefs/journal.c115
-rw-r--r--libbcachefs/journal_io.c4
-rw-r--r--libbcachefs/journal_reclaim.c158
-rw-r--r--libbcachefs/journal_reclaim.h15
-rw-r--r--libbcachefs/journal_types.h14
-rw-r--r--libbcachefs/movinggc.c2
-rw-r--r--libbcachefs/rebalance.c2
-rw-r--r--libbcachefs/recovery.c1
-rw-r--r--libbcachefs/super.c18
-rw-r--r--libbcachefs/sysfs.c7
34 files changed, 577 insertions, 242 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index ec3e6587..6ba1c9af 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-b1107114caf6aa6f725170f3d75b072badcfa573
+021e62a098d9fa7e558ae935180e2fb16bb50a3a
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 347105c6..03feda7a 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -1,7 +1,8 @@
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H
-#define PF_MEMALLOC_NOFS 0
+#define PF_MEMALLOC 0x00000800 /* Allocating memory */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
static inline unsigned int memalloc_nofs_save(void)
{
@@ -15,4 +16,16 @@ static inline void memalloc_nofs_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
}
+static inline unsigned int memalloc_noreclaim_save(void) /* set PF_MEMALLOC on current; returns the previous state of that bit */
+{
+ unsigned int flags = current->flags & PF_MEMALLOC; /* remember only the PF_MEMALLOC bit, not the whole flags word */
+ current->flags |= PF_MEMALLOC;
+ return flags; /* caller hands this back to memalloc_noreclaim_restore() */
+}
+
+static inline void memalloc_noreclaim_restore(unsigned int flags) /* flags: value previously returned by memalloc_noreclaim_save() */
+{
+ current->flags = (current->flags & ~PF_MEMALLOC) | flags; /* clear PF_MEMALLOC, then put back the saved bit */
+}
+
#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ff342b65..b8a1235b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -132,4 +132,35 @@ static inline void *kmemdup(const void *src, size_t len, gfp_t gfp)
return p;
}
+struct kmem_cache { /* userspace stand-in for a slab cache: only remembers the object size */
+ size_t obj_size; /* number of bytes kmem_cache_alloc() requests from kmalloc() per object */
+};
+
+static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp) /* allocate one object of c->obj_size bytes; returns whatever kmalloc() returns */
+{
+ return kmalloc(c->obj_size, gfp); /* gfp is forwarded unchanged; c must be non-NULL */
+}
+
+static inline void kmem_cache_free(struct kmem_cache *c, void *p) /* release an object obtained from kmem_cache_alloc() */
+{
+ kfree(p); /* the cache argument c is not used by this shim */
+}
+
+static inline void kmem_cache_destroy(struct kmem_cache *p) /* free the cache descriptor created by kmem_cache_create() */
+{
+ kfree(p); /* only the descriptor is freed here; objects allocated from the cache are not tracked */
+}
+
+static inline struct kmem_cache *kmem_cache_create(size_t obj_size) /* create a cache descriptor for objects of obj_size bytes; NULL on allocation failure */
+{
+ struct kmem_cache *p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return NULL; /* descriptor allocation failed */
+
+ p->obj_size = obj_size; /* the only state the shim keeps */
+ return p;
+}
+
+#define KMEM_CACHE(_struct, _flags) kmem_cache_create(sizeof(struct _struct)) /* _flags is accepted but ignored; only sizeof(struct _struct) is used */
+
#endif /* __TOOLS_LINUX_SLAB_H */
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index a8b8c5b6..d4cb7a29 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write,
TP_ARGS(bio)
);
+TRACE_EVENT(journal_reclaim_start, /* records the fs UUID plus seven u64 counters supplied by the caller; presumably emitted when a journal reclaim pass begins (call site not shown here) */
+ TP_PROTO(struct bch_fs *c, u64 min_nr,
+ u64 prereserved, u64 prereserved_total,
+ u64 btree_cache_dirty, u64 btree_cache_total,
+ u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+ TP_ARGS(c, min_nr, prereserved, prereserved_total,
+ btree_cache_dirty, btree_cache_total,
+ btree_key_cache_dirty, btree_key_cache_total),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, min_nr )
+ __field(u64, prereserved )
+ __field(u64, prereserved_total )
+ __field(u64, btree_cache_dirty )
+ __field(u64, btree_cache_total )
+ __field(u64, btree_key_cache_dirty )
+ __field(u64, btree_key_cache_total )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); /* copies 16 bytes of the superblock user UUID into the record */
+ __entry->min_nr = min_nr;
+ __entry->prereserved = prereserved;
+ __entry->prereserved_total = prereserved_total;
+ __entry->btree_cache_dirty = btree_cache_dirty;
+ __entry->btree_cache_total = btree_cache_total;
+ __entry->btree_key_cache_dirty = btree_key_cache_dirty;
+ __entry->btree_key_cache_total = btree_key_cache_total;
+ ),
+
+ TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", /* each x/y pair prints the value followed by its matching *_total */
+ __entry->uuid,
+ __entry->min_nr,
+ __entry->prereserved,
+ __entry->prereserved_total,
+ __entry->btree_cache_dirty,
+ __entry->btree_cache_total,
+ __entry->btree_key_cache_dirty,
+ __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish, /* records the fs UUID and the nr_flushed value supplied by the caller */
+ TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+ TP_ARGS(c, nr_flushed),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, nr_flushed )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); /* copies 16 bytes of the superblock user UUID into the record */
+ __entry->nr_flushed = nr_flushed;
+ ),
+
+ TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+);
+
/* bset.c: */
DEFINE_EVENT(bpos, bkey_pack_pos_fail,
@@ -622,6 +681,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get,
TP_ARGS(ip)
);
+DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, /* new event of the transaction_restart class, same (unsigned long ip) shape as its sibling events */
+ TP_PROTO(unsigned long ip),
+ TP_ARGS(ip)
+);
+
DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
TP_PROTO(unsigned long ip),
TP_ARGS(ip)
@@ -657,11 +721,6 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse,
TP_ARGS(ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_atomic,
- TP_PROTO(unsigned long ip),
- TP_ARGS(ip)
-);
-
DECLARE_EVENT_CLASS(node_lock_fail,
TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
TP_ARGS(level, iter_seq, node, node_seq),
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 97508de9..2dd8a37f 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -1456,7 +1456,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return 0;
p = kthread_create(bch2_allocator_thread, ca,
- "bch_alloc[%s]", ca->name);
+ "bch-alloc/%s", ca->name);
if (IS_ERR(p))
return PTR_ERR(p);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b20895a4..6d54defc 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -650,7 +650,6 @@ struct bch_fs {
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
- struct workqueue_struct *journal_reclaim_wq;
/* ALLOCATION */
struct delayed_work pd_controllers_update;
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 5bceff48..09774f56 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -1064,3 +1064,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
stats.floats,
stats.failed);
}
+
+void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) /* append two "name:\tvalue" lines describing c->btree_cache to out */
+{
+ pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); /* value of the btree_cache.used field */
+ pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); /* snapshot of the atomic dirty counter */
+}
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 8a19e60e..e766ef55 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -100,5 +100,6 @@ static inline unsigned btree_blocks(struct bch_fs *c)
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index ba4acc11..ac81c9b9 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -1427,7 +1427,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
BUG_ON(c->gc_thread);
- p = kthread_create(bch2_gc_thread, c, "bch_gc");
+ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
if (IS_ERR(p))
return PTR_ERR(p);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index d605ff18..a21dc485 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -12,6 +12,8 @@
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
+static struct kmem_cache *bch2_key_cache;
+
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
@@ -76,10 +78,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
- list_move(&ck->list, &bc->freed);
+ list_move_tail(&ck->list, &bc->freed);
+ bc->nr_freed++;
kfree(ck->k);
ck->k = NULL;
@@ -94,9 +99,20 @@ bkey_cached_alloc(struct btree_key_cache *c)
{
struct bkey_cached *ck;
- list_for_each_entry(ck, &c->freed, list)
- if (bkey_cached_lock_for_evict(ck))
+ list_for_each_entry_reverse(ck, &c->freed, list)
+ if (bkey_cached_lock_for_evict(ck)) {
+ c->nr_freed--;
return ck;
+ }
+
+ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
+ if (likely(ck)) {
+ INIT_LIST_HEAD(&ck->list);
+ six_lock_init(&ck->c.lock);
+ BUG_ON(!six_trylock_intent(&ck->c.lock));
+ BUG_ON(!six_trylock_write(&ck->c.lock));
+ return ck;
+ }
list_for_each_entry(ck, &c->clean, list)
if (bkey_cached_lock_for_evict(ck)) {
@@ -104,16 +120,7 @@ bkey_cached_alloc(struct btree_key_cache *c)
return ck;
}
- ck = kzalloc(sizeof(*ck), GFP_NOFS);
- if (!ck)
- return NULL;
-
- INIT_LIST_HEAD(&ck->list);
- six_lock_init(&ck->c.lock);
- BUG_ON(!six_trylock_intent(&ck->c.lock));
- BUG_ON(!six_trylock_write(&ck->c.lock));
-
- return ck;
+ return NULL;
}
static struct bkey_cached *
@@ -132,8 +139,7 @@ btree_key_cache_create(struct btree_key_cache *c,
ck->key.btree_id = btree_id;
ck->key.pos = pos;
ck->valid = false;
-
- BUG_ON(ck->flags);
+ ck->flags = 1U << BKEY_CACHED_ACCESSED;
if (rhashtable_lookup_insert_fast(&c->table,
&ck->hash,
@@ -290,6 +296,9 @@ fill:
goto err;
}
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
iter->uptodate = BTREE_ITER_NEED_PEEK;
bch2_btree_iter_downgrade(iter);
return ret;
@@ -451,6 +460,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) iter->l[0].b;
+ bool kick_reclaim = false;
BUG_ON(insert->u64s > ck->u64s);
@@ -475,11 +485,18 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
c->btree_key_cache.nr_dirty++;
+
+ if (bch2_nr_btree_keys_need_flush(c))
+ kick_reclaim = true;
+
mutex_unlock(&c->btree_key_cache.lock);
}
bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
&ck->journal, btree_key_cache_journal_flush);
+
+ if (kick_reclaim)
+ journal_reclaim_kick(&c->journal);
return true;
}
@@ -509,28 +526,34 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
flags = memalloc_nofs_save();
+ /*
+ * Newest freed entries are at the end of the list - once we hit one
+ * that's too new to be freed, we can bail out:
+ */
list_for_each_entry_safe(ck, t, &bc->freed, list) {
- scanned++;
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
- if (poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq)) {
- list_del(&ck->list);
- kfree(ck);
- freed++;
- }
-
- if (scanned >= nr)
- goto out;
+ list_del(&ck->list);
+ kmem_cache_free(bch2_key_cache, ck);
+ bc->nr_freed--;
+ scanned++;
+ freed++;
}
- list_for_each_entry_safe(ck, t, &bc->clean, list) {
- scanned++;
+ if (scanned >= nr)
+ goto out;
- if (bkey_cached_lock_for_evict(ck)) {
+ list_for_each_entry_safe(ck, t, &bc->clean, list) {
+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ else if (bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
}
+ scanned++;
if (scanned >= nr) {
if (&t->list != &bc->clean)
list_move_tail(&bc->clean, &t->list);
@@ -570,18 +593,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
bch2_journal_preres_put(&c->journal, &ck->res);
kfree(ck->k);
- kfree(ck);
+ list_del(&ck->list);
+ kmem_cache_free(bch2_key_cache, ck);
bc->nr_keys--;
}
BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
BUG_ON(bc->nr_keys);
- list_for_each_entry_safe(ck, n, &bc->freed, list)
- kfree(ck);
+ list_for_each_entry_safe(ck, n, &bc->freed, list) {
+ list_del(&ck->list);
+ kmem_cache_free(bch2_key_cache, ck);
+ }
mutex_unlock(&bc->lock);
- rhashtable_destroy(&bc->table);
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
@@ -594,33 +621,42 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
+ int ret;
+
+ c->shrink.seeks = 1;
c->shrink.count_objects = bch2_btree_key_cache_count;
c->shrink.scan_objects = bch2_btree_key_cache_scan;
- return register_shrinker(&c->shrink) ?:
- rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+ ret = register_shrinker(&c->shrink);
+ if (ret)
+ return ret;
+
+ ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+ if (ret)
+ return ret;
+
+ c->table_init_done = true;
+ return 0;
}
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- struct rhash_head *pos;
- size_t i;
+ pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
+ pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys);
+ pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty);
+}
- mutex_lock(&c->lock);
- tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+void bch2_btree_key_cache_exit(void) /* destroy the file-scope bch2_key_cache if it was created */
+{
+ if (bch2_key_cache) /* stays NULL when bch2_btree_key_cache_init() was never run or failed */
+ kmem_cache_destroy(bch2_key_cache); /* NOTE(review): the pointer is not reset to NULL afterwards */
+}
- for (i = 0; i < tbl->size; i++) {
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- pr_buf(out, "%s:",
- bch2_btree_ids[ck->key.btree_id]);
- bch2_bpos_to_text(out, ck->key.pos);
+int __init bch2_btree_key_cache_init(void)
+{
+ bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
+ if (!bch2_key_cache)
+ return -ENOMEM;
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
- pr_buf(out, " journal seq %llu", ck->journal.seq);
- pr_buf(out, "\n");
- }
- }
- mutex_unlock(&c->lock);
+ return 0;
}
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index d448264a..d7d31a06 100644
--- a/libbcachefs/btree_key_cache.h
+++ b/libbcachefs/btree_key_cache.h
@@ -1,6 +1,24 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) /* returns how many dirty cached keys exceed the flush threshold, or 0 when at or below it */
+{
+ size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
+ size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); /* must be the total key count, not nr_dirty, or the threshold stops scaling with cache size */
+ size_t max_dirty = 4096 + nr_keys / 2; /* allow 4096 dirty keys plus half of all cached keys */
+
+ return max_t(ssize_t, 0, nr_dirty - max_dirty); /* the size_t difference is evaluated as ssize_t, so a shortfall clamps to 0 */
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) /* true when the dirty key count is above the hard limit computed below */
+{
+ size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
+ size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); /* must be the total key count, not nr_dirty, or the limit degenerates to a fixed nr_dirty > 16384 */
+ size_t max_dirty = 4096 + (nr_keys * 3) / 4; /* allow 4096 dirty keys plus three quarters of all cached keys */
+
+ return nr_dirty > max_dirty;
+}
+
struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
@@ -25,4 +43,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 6013c916..cf59f122 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -293,11 +293,13 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
+ bool table_init_done;
struct list_head freed;
struct list_head clean;
struct list_head dirty;
struct shrinker shrink;
+ size_t nr_freed;
size_t nr_keys;
size_t nr_dirty;
};
@@ -307,7 +309,8 @@ struct bkey_cached_key {
struct bpos pos;
} __attribute__((packed, aligned(4)));
-#define BKEY_CACHED_DIRTY 0
+#define BKEY_CACHED_ACCESSED 0
+#define BKEY_CACHED_DIRTY 1
struct bkey_cached {
struct btree_bkey_cached_common c;
@@ -647,6 +650,7 @@ enum btree_insert_ret {
BTREE_INSERT_ENOSPC,
BTREE_INSERT_NEED_MARK_REPLICAS,
BTREE_INSERT_NEED_JOURNAL_RES,
+ BTREE_INSERT_NEED_JOURNAL_RECLAIM,
};
enum btree_gc_coalesce_fail_reason {
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index e0b1bde3..adb07043 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -67,8 +67,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
-int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *,
- struct bpos, u64 *);
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, u64 *);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index d4f3dd7a..5143896e 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -49,12 +49,27 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
break;
bp = bkey_s_c_to_btree_ptr_v2(k);
- BUG_ON(bkey_cmp(next_node, bp.v->min_key));
+ if (bkey_cmp(next_node, bp.v->min_key)) {
+ bch2_dump_btree_node(c, b);
+ panic("expected next min_key %llu:%llu got %llu:%llu\n",
+ next_node.inode,
+ next_node.offset,
+ bp.v->min_key.inode,
+ bp.v->min_key.offset);
+ }
bch2_btree_node_iter_advance(&iter, b);
if (bch2_btree_node_iter_end(&iter)) {
- BUG_ON(bkey_cmp(k.k->p, b->key.k.p));
+
+ if (bkey_cmp(k.k->p, b->key.k.p)) {
+ bch2_dump_btree_node(c, b);
+ panic("expected end %llu:%llu got %llu:%llu\n",
+ b->key.k.p.inode,
+ b->key.k.p.offset,
+ k.k->p.inode,
+ k.k->p.offset);
+ }
break;
}
@@ -1026,7 +1041,8 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
struct bkey_packed *k;
const char *invalid;
- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b));
+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
if (invalid) {
char buf[160];
@@ -1368,9 +1384,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
- if (as->must_rewrite)
- goto split;
-
bch2_btree_node_lock_for_insert(c, b, iter);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
@@ -1378,6 +1391,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
goto split;
}
+ btree_node_interior_verify(c, b);
+
bch2_btree_insert_keys_interior(as, b, iter, keys);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 41854fc3..45d21273 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -47,7 +47,6 @@ struct btree_update {
BTREE_INTERIOR_UPDATING_AS,
} mode;
- unsigned must_rewrite:1;
unsigned nodes_written:1;
enum btree_id btree_id;
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index a2ca31e7..bbc6d512 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -286,6 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans,
BUG_ON(iter->level);
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(trans->c))
+ return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
+
if (u64s <= ck->u64s)
return BTREE_INSERT_OK;
@@ -642,20 +646,24 @@ int bch2_trans_commit_error(struct btree_trans *trans,
trace_trans_restart_journal_res_get(trans->ip);
ret = -EINTR;
break;
- default:
- BUG_ON(ret >= 0);
- break;
- }
-
- if (ret == -EINTR) {
- int ret2 = bch2_btree_iter_traverse_all(trans);
+ case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
+ bch2_trans_unlock(trans);
- if (ret2) {
- trace_trans_restart_traverse(trans->ip);
- return ret2;
+ while (bch2_btree_key_cache_must_wait(c)) {
+ mutex_lock(&c->journal.reclaim_lock);
+ bch2_journal_reclaim(&c->journal);
+ mutex_unlock(&c->journal.reclaim_lock);
}
- trace_trans_restart_atomic(trans->ip);
+ if (bch2_trans_relock(trans))
+ return 0;
+
+ trace_trans_restart_journal_reclaim(trans->ip);
+ ret = -EINTR;
+ break;
+ default:
+ BUG_ON(ret >= 0);
+ break;
}
return ret;
@@ -1076,13 +1084,32 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
__bch2_btree_insert(&trans, id, k));
}
-int bch2_btree_delete_at_range(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end,
- u64 *journal_seq)
+int bch2_btree_delete_at(struct btree_trans *trans, /* queue a freshly initialized key at iter->pos and commit; returns bch2_trans_commit()'s result */
+ struct btree_iter *iter, unsigned flags) /* flags: extra commit flags OR'd into the fixed set below */
+{
+ struct bkey_i k;
+
+ bkey_init(&k.k); /* no value is filled in after this; presumably leaves a deleted-type key - confirm against bkey_init() */
+ k.k.p = iter->pos; /* target the iterator's current position */
+
+ bch2_trans_update(trans, iter, &k, 0); /* NOTE(review): any return value of bch2_trans_update() is not checked here */
+ return bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|flags); /* NOFAIL and USE_RESERVE are always set */
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+ struct bpos start, struct bpos end,
+ u64 *journal_seq)
{
+ struct btree_iter *iter;
struct bkey_s_c k;
int ret = 0;
+
+ iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ return ret;
retry:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
@@ -1094,6 +1121,10 @@ retry:
bkey_init(&delete.k);
/*
+ * This could probably be more efficient for extents:
+ */
+
+ /*
* For extents, iter.pos won't necessarily be the same as
* bkey_start_pos(k.k) (for non extents they always will be the
* same). It's important that we delete starting from iter.pos
@@ -1132,22 +1163,8 @@ retry:
goto retry;
}
+ bch2_trans_iter_put(trans, iter);
return ret;
-
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_i k;
-
- bkey_init(&k.k);
- k.k.p = iter->pos;
-
- bch2_trans_update(trans, iter, &k, 0);
- return bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|flags);
}
/*
@@ -1159,21 +1176,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start, struct bpos end,
u64 *journal_seq)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- int ret = 0;
-
- /*
- * XXX: whether we need mem/more iters depends on whether this btree id
- * has triggers
- */
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
-
- iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);
-
- ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq);
- ret = bch2_trans_exit(&trans) ?: ret;
-
- BUG_ON(ret == -EINTR);
- return ret;
+ return bch2_trans_do(c, NULL, journal_seq, 0,
+ bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
}
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index be65f2e7..f7bdb143 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -2044,16 +2044,6 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c)
return avail_factor(__bch2_fs_usage_read_short(c).free);
}
-void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
-{
- percpu_down_read(&c->mark_lock);
- this_cpu_sub(c->usage[0]->online_reserved,
- res->sectors);
- percpu_up_read(&c->mark_lock);
-
- res->sectors = 0;
-}
-
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index a3873bec..856dc5a8 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -272,13 +272,11 @@ void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
/* disk reservations: */
-void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
-
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
- if (res->sectors)
- __bch2_disk_reservation_put(c, res);
+ this_cpu_sub(c->usage[0]->online_reserved, res->sectors);
+ res->sectors = 0;
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 4663784d..e7c8969a 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -341,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
- ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+ ctx->thread = kthread_create(bch2_data_thread, ctx,
+ "bch-data/%s", c->name);
if (IS_ERR(ctx->thread)) {
ret = PTR_ERR(ctx->thread);
goto err;
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 389f23ee..7d193ce4 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -684,7 +684,7 @@ static int readpages_iter_init(struct readpages_iter *iter,
if (!iter->pages)
return -ENOMEM;
- __readahead_batch(ractl, iter->pages, nr_pages);
+ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
for (i = 0; i < nr_pages; i++) {
__bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
put_page(iter->pages[i]);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 6e3d4bea..f3f6fe6c 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1252,7 +1252,7 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode->v.i_ino);
+ bch2_inode_rm(c, inode->v.i_ino, true);
}
}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 0c503527..09ce6c29 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1254,7 +1254,7 @@ static int check_inode(struct btree_trans *trans,
bch2_fs_lazy_rw(c);
- ret = bch2_inode_rm(c, u.bi_inum);
+ ret = bch2_inode_rm(c, u.bi_inum, false);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
return ret;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 823a1dde..82099e5a 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -542,7 +542,7 @@ found_slot:
return ret;
}
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -553,6 +553,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
u64 bi_generation;
int ret;
+ bch2_trans_init(&trans, c, 0, 0);
+
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
@@ -561,30 +563,34 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
* XXX: the dirent could ideally would delete whiteouts when they're no
* longer needed
*/
- ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
- start, end, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_XATTRS,
- start, end, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
- start, end, NULL);
+ ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS,
+ start, end, NULL) ?:
+ bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS,
+ start, end, NULL) ?:
+ bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS,
+ start, end, NULL);
if (ret)
- return ret;
-
- bch2_trans_init(&trans, c, 0, 0);
+ goto err;
retry:
bch2_trans_begin(&trans);
bi_generation = 0;
- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
- BTREE_ITER_CACHED|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_cached(iter);
+ if (cached) {
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_cached(iter);
+ } else {
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ }
ret = bkey_err(k);
if (ret)
goto err;
- bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
+ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c,
"inode %llu not found when deleting",
inode_nr);
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index ef7e885d..dbdfcf63 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -71,7 +71,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
-int bch2_inode_rm(struct bch_fs *, u64);
+int bch2_inode_rm(struct bch_fs *, u64, bool);
int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
struct bch_inode_unpacked *);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 1b3f249b..5874a9ff 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -226,16 +226,19 @@ static bool journal_entry_close(struct journal *j)
*/
static int journal_entry_open(struct journal *j)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
int u64s;
u64 v;
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
if (j->blocked)
- return -EAGAIN;
+ return cur_entry_blocked;
if (j->cur_entry_error)
return j->cur_entry_error;
@@ -251,7 +254,7 @@ static int journal_entry_open(struct journal *j)
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= le32_to_cpu(buf->data->u64s))
- return -ENOSPC;
+ return cur_entry_journal_full;
/*
* Must be set before marking the journal entry as open:
@@ -263,7 +266,7 @@ static int journal_entry_open(struct journal *j)
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return -EROFS;
+ return cur_entry_insufficient_devices;
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
@@ -376,7 +379,7 @@ retry:
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
- ret = -ENOSPC;
+ ret = cur_entry_journal_full;
goto unlock;
}
@@ -399,14 +402,16 @@ retry:
* there's still a previous one in flight:
*/
trace_journal_entry_full(c);
- ret = -EAGAIN;
+ ret = cur_entry_blocked;
} else {
ret = journal_entry_open(j);
}
unlock:
- if ((ret == -EAGAIN || ret == -ENOSPC) &&
- !j->res_get_blocked_start)
+ if ((ret && ret != cur_entry_insufficient_devices) &&
+ !j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
+ trace_journal_full(c);
+ }
can_discard = j->can_discard;
spin_unlock(&j->lock);
@@ -414,41 +419,39 @@ unlock:
if (!ret)
goto retry;
- if (ret == -ENOSPC) {
- if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
- "JOURNAL_RES_GET_RESERVED set but journal full")) {
- char *buf;
-
- buf = kmalloc(4096, GFP_NOFS);
- if (buf) {
- bch2_journal_debug_to_text(&PBUF(buf), j);
- pr_err("\n%s", buf);
- kfree(buf);
- }
+ if (WARN_ONCE(ret == cur_entry_journal_full &&
+ !can_discard &&
+ (flags & JOURNAL_RES_GET_RESERVED),
+ "JOURNAL_RES_GET_RESERVED set but journal full")) {
+ char *buf;
+
+ buf = kmalloc(4096, GFP_NOFS);
+ if (buf) {
+ bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
+ pr_err("\n%s", buf);
+ kfree(buf);
}
+ }
- /*
- * Journal is full - can't rely on reclaim from work item due to
- * freezing:
- */
- trace_journal_full(c);
-
- if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
- if (can_discard) {
- bch2_journal_do_discards(j);
- goto retry;
- }
-
- if (mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
+ /*
+ * Journal is full - can't rely on reclaim from work item due to
+ * freezing:
+ */
+ if ((ret == cur_entry_journal_full ||
+ ret == cur_entry_journal_pin_full) &&
+ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+ if (can_discard) {
+ bch2_journal_do_discards(j);
+ goto retry;
}
- ret = -EAGAIN;
+ if (mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
}
- return ret;
+ return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
}
/*
@@ -481,8 +484,10 @@ static bool journal_preres_available(struct journal *j,
{
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags);
- if (!ret)
- bch2_journal_reclaim_work(&j->reclaim_work.work);
+ if (!ret && mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
return ret;
}
@@ -543,12 +548,20 @@ out:
* necessary
*/
int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
- struct closure *parent)
+ struct closure *parent)
{
struct journal_buf *buf;
int ret = 0;
+ if (seq <= j->err_seq)
+ return -EIO;
+
+ if (seq <= j->seq_ondisk)
+ return 1;
+
spin_lock(&j->lock);
+
+ /* Recheck under lock: */
if (seq <= j->err_seq) {
ret = -EIO;
goto out;
@@ -678,16 +691,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (nr <= ja->nr)
return 0;
- ret = -ENOMEM;
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq)
+ if (!new_buckets || !new_bucket_seq) {
+ ret = -ENOMEM;
goto err;
+ }
journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
+ if (!journal_buckets) {
+ ret = -ENOSPC;
goto err;
+ }
/*
* We may be called from the device add path, before the new device has
@@ -716,8 +732,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
goto err;
}
} else {
+ rcu_read_lock();
ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
false, cl);
+ rcu_read_unlock();
if (IS_ERR(ob)) {
ret = cl ? -EAGAIN : -ENOSPC;
goto err;
@@ -769,8 +787,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (!new_fs)
bch2_open_bucket_put(c, ob);
}
-
- ret = 0;
err:
bch2_sb_resize_journal(&ca->disk_sb,
ja->nr + sizeof(*journal_buckets) / sizeof(u64));
@@ -889,7 +905,7 @@ void bch2_fs_journal_stop(struct journal *j)
j->last_empty_seq + 1 != journal_cur_seq(j)));
cancel_delayed_work_sync(&j->write_work);
- cancel_delayed_work_sync(&j->reclaim_work);
+ bch2_journal_reclaim_stop(j);
}
int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
@@ -1017,7 +1033,6 @@ int bch2_fs_journal_init(struct journal *j)
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
init_waitqueue_head(&j->pin_flush_wait);
mutex_init(&j->reclaim_lock);
mutex_init(&j->discard_lock);
@@ -1069,7 +1084,10 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
+ "nr direct reclaim:\t%llu\n"
+ "nr background reclaim:\t%llu\n"
"current entry sectors:\t%u\n"
+ "current entry error:\t%u\n"
"current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
@@ -1077,7 +1095,10 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
- j->cur_entry_sectors);
+ j->nr_direct_reclaim,
+ j->nr_background_reclaim,
+ j->cur_entry_sectors,
+ j->cur_entry_error);
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 7c157bc5..d1367cf0 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -994,7 +994,7 @@ static void journal_write_done(struct closure *cl)
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
+ journal_reclaim_kick(&c->journal);
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
@@ -1045,6 +1045,8 @@ void bch2_journal_write(struct closure *cl)
unsigned i, sectors, bytes, u64s;
int ret;
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
journal_buf_realloc(j, w);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 7a04d06b..66f5dcce 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -1,12 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_key_cache.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+#include <trace/events/bcachefs.h>
+
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
@@ -164,12 +169,12 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
- ret = -EROFS;
+ ret = cur_entry_insufficient_devices;
goto out;
}
if (!fifo_free(&j->pin)) {
- ret = -ENOSPC;
+ ret = cur_entry_journal_pin_full;
goto out;
}
@@ -180,7 +185,7 @@ void bch2_journal_space_available(struct journal *j)
clean = __journal_space_available(j, nr_devs_want, journal_space_clean);
if (!discarded.next_entry)
- ret = -ENOSPC;
+ ret = cur_entry_journal_full;
overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
journal_entry_overhead(j);
@@ -432,7 +437,6 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
list_move(&ret->list, &pin_list->flushed);
BUG_ON(j->flush_in_progress);
j->flush_in_progress = ret;
- j->last_flushed = jiffies;
}
spin_unlock(&j->lock);
@@ -441,17 +445,24 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
}
-/* returns true if we did work */
+/* returns the number of pins flushed */
-static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
- unsigned min_nr)
+static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
+ unsigned min_nr)
{
struct journal_entry_pin *pin;
- bool ret = false;
- u64 seq;
+ u64 seq, ret = 0;
lockdep_assert_held(&j->reclaim_lock);
- while ((pin = journal_get_next_pin(j, min_nr
- ? U64_MAX : seq_to_flush, &seq))) {
+ while (1) {
+ cond_resched();
+
+ j->last_flushed = jiffies;
+
+ pin = journal_get_next_pin(j, min_nr
+ ? U64_MAX : seq_to_flush, &seq);
+ if (!pin)
+ break;
+
if (min_nr)
min_nr--;
@@ -460,7 +471,7 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
BUG_ON(j->flush_in_progress != pin);
j->flush_in_progress = NULL;
wake_up(&j->pin_flush_wait);
- ret = true;
+ ret++;
}
return ret;
@@ -524,15 +535,27 @@ static u64 journal_seq_to_flush(struct journal *j)
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
-void bch2_journal_reclaim(struct journal *j)
+static void __bch2_journal_reclaim(struct journal *j, bool direct)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned min_nr = 0;
- u64 seq_to_flush = 0;
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ u64 seq_to_flush, nr_flushed = 0;
+ size_t min_nr;
+ unsigned flags;
+ /*
+ * We can't invoke memory reclaim while holding the reclaim_lock -
+ * journal reclaim is required to make progress for memory reclaim
+ * (cleaning the caches), so we can't get stuck in memory reclaim while
+ * we're holding the reclaim lock:
+ */
lockdep_assert_held(&j->reclaim_lock);
+ flags = memalloc_noreclaim_save();
do {
+ if (kthread && kthread_should_stop())
+ break;
+
bch2_journal_do_discards(j);
seq_to_flush = journal_seq_to_flush(j);
@@ -549,26 +572,103 @@ void bch2_journal_reclaim(struct journal *j)
if (j->prereserved.reserved * 2 > j->prereserved.remaining)
min_nr = 1;
- if ((atomic_read(&c->btree_cache.dirty) * 4 >
- c->btree_cache.used * 3) ||
- (c->btree_key_cache.nr_dirty * 4 >
- c->btree_key_cache.nr_keys))
+ if (atomic_read(&c->btree_cache.dirty) * 4 >
+ c->btree_cache.used * 3)
min_nr = 1;
- } while (journal_flush_pins(j, seq_to_flush, min_nr));
- if (!bch2_journal_error(j))
- queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
- msecs_to_jiffies(j->reclaim_delay_ms));
+ min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c));
+
+ trace_journal_reclaim_start(c,
+ min_nr,
+ j->prereserved.reserved,
+ j->prereserved.remaining,
+ atomic_read(&c->btree_cache.dirty),
+ c->btree_cache.used,
+ c->btree_key_cache.nr_dirty,
+ c->btree_key_cache.nr_keys);
+
+ nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);
+
+ if (direct)
+ j->nr_direct_reclaim += nr_flushed;
+ else
+ j->nr_background_reclaim += nr_flushed;
+ trace_journal_reclaim_finish(c, nr_flushed);
+ } while (min_nr);
+
+ memalloc_noreclaim_restore(flags);
}
-void bch2_journal_reclaim_work(struct work_struct *work)
+void bch2_journal_reclaim(struct journal *j)
{
- struct journal *j = container_of(to_delayed_work(work),
- struct journal, reclaim_work);
+ __bch2_journal_reclaim(j, true);
+}
- mutex_lock(&j->reclaim_lock);
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
+/*
+ * Main loop of the journal reclaim thread.
+ *
+ * @arg is the struct journal to run reclaim for. Repeatedly runs background
+ * reclaim under j->reclaim_lock, then sleeps until it is kicked, asked to
+ * stop, or j->reclaim_delay_ms has passed since j->last_flushed.
+ *
+ * Always returns 0.
+ */
+static int bch2_journal_reclaim_thread(void *arg)
+{
+ struct journal *j = arg;
+ unsigned long next;
+
+ set_freezable();
+
+ /* Don't start reclaiming until JOURNAL_RECLAIM_STARTED is set in j->flags: */
+ kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
+
+ while (!kthread_should_stop()) {
+ /* Clear the kick before doing work, so a kick that arrives meanwhile is seen below: */
+ j->reclaim_kicked = false;
+
+ mutex_lock(&j->reclaim_lock);
+ /* direct == false: flushed pins are counted as background reclaim */
+ __bch2_journal_reclaim(j, false);
+ mutex_unlock(&j->reclaim_lock);
+
+ /* Next periodic run, measured from the last flush rather than from now: */
+ next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+
+ while (1) {
+ /*
+ * The task state is set before the wake conditions are
+ * tested, so a wake_up_process() that lands between the
+ * tests and schedule_timeout() is not lost:
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+ if (j->reclaim_kicked)
+ break;
+ if (time_after_eq(jiffies, next))
+ break;
+ schedule_timeout(next - jiffies);
+ try_to_freeze();
+
+ }
+ /* We may have left the loop without sleeping; undo TASK_INTERRUPTIBLE: */
+ __set_current_state(TASK_RUNNING);
+ }
+
+ return 0;
+}
+
+/*
+ * Stop the journal reclaim thread, if one was started.
+ *
+ * j->reclaim_thread is cleared first; the thread is then stopped and the task
+ * reference held on it is dropped.
+ */
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+	struct task_struct *thread = j->reclaim_thread;
+
+	j->reclaim_thread = NULL;
+
+	if (!thread)
+		return;
+
+	kthread_stop(thread);
+	put_task_struct(thread);
+}
+
+/*
+ * Create and wake the journal reclaim thread, unless one is already recorded
+ * in j->reclaim_thread.
+ *
+ * Returns 0 on success or if a thread already exists, otherwise the error
+ * encoded in the pointer returned by kthread_create().
+ */
+int bch2_journal_reclaim_start(struct journal *j)
+{
+	struct task_struct *thread;
+	struct bch_fs *c;
+
+	if (j->reclaim_thread)
+		return 0;
+
+	c = container_of(j, struct bch_fs, journal);
+
+	thread = kthread_create(bch2_journal_reclaim_thread, j,
+				"bch-reclaim/%s", c->name);
+	if (IS_ERR(thread))
+		return PTR_ERR(thread);
+
+	/* Hold a task reference; bch2_journal_reclaim_stop() drops it: */
+	get_task_struct(thread);
+	j->reclaim_thread = thread;
+	wake_up_process(thread);
+	return 0;
}
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
@@ -582,7 +682,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- *did_work = journal_flush_pins(j, seq_to_flush, 0);
+ *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0;
spin_lock(&j->lock);
/*
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index 8128907a..bae2c921 100644
--- a/libbcachefs/journal_reclaim.h
+++ b/libbcachefs/journal_reclaim.h
@@ -10,6 +10,17 @@ enum journal_space_from {
journal_space_clean,
};
+/*
+ * Ask the journal reclaim thread to run again.
+ *
+ * Does nothing if no reclaim thread is recorded in j->reclaim_thread, or if a
+ * kick is already pending (j->reclaim_kicked is set). Otherwise marks the kick
+ * as pending and wakes the thread.
+ */
+static inline void journal_reclaim_kick(struct journal *j)
+{
+	/* Read the pointer once, so the NULL test and the wakeup use the same value: */
+	struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+	if (p && !j->reclaim_kicked) {
+		j->reclaim_kicked = true;
+		/* p is known to be non-NULL inside this branch */
+		wake_up_process(p);
+	}
+}
+
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
enum journal_space_from);
@@ -55,7 +66,9 @@ void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
void bch2_journal_do_discards(struct journal *);
void bch2_journal_reclaim(struct journal *);
-void bch2_journal_reclaim_work(struct work_struct *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
bool bch2_journal_flush_pins(struct journal *, u64);
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 9757e3d5..4640bb86 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -146,7 +146,13 @@ struct journal {
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
- int cur_entry_error;
+ enum {
+ cur_entry_ok,
+ cur_entry_blocked,
+ cur_entry_journal_full,
+ cur_entry_journal_pin_full,
+ cur_entry_insufficient_devices,
+ } cur_entry_error;
union journal_preres_state prereserved;
@@ -210,8 +216,12 @@ struct journal {
struct write_point wp;
spinlock_t err_lock;
- struct delayed_work reclaim_work;
struct mutex reclaim_lock;
+ struct task_struct *reclaim_thread;
+ bool reclaim_kicked;
+ u64 nr_direct_reclaim;
+ u64 nr_background_reclaim;
+
unsigned long last_flushed;
struct journal_entry_pin *flush_in_progress;
wait_queue_head_t pin_flush_wait;
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index ddfda1ef..4834f41f 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -345,7 +345,7 @@ int bch2_copygc_start(struct bch_fs *c)
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
- t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
+ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
if (IS_ERR(t))
return PTR_ERR(t);
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 44d2651b..c3373c48 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -314,7 +314,7 @@ int bch2_rebalance_start(struct bch_fs *c)
if (c->opts.nochanges)
return 0;
- p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
if (IS_ERR(p))
return PTR_ERR(p);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 67500636..0b3521c9 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -616,6 +616,7 @@ static int bch2_journal_replay(struct bch_fs *c,
*/
set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
+ journal_reclaim_kick(j);
j->replay_journal_seq = seq;
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 8673e974..e3bbd0b0 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -49,7 +49,6 @@
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/idr.h>
-#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
@@ -259,7 +258,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
void bch2_fs_read_only(struct bch_fs *c)
{
if (!test_bit(BCH_FS_RW, &c->flags)) {
- cancel_delayed_work_sync(&c->journal.reclaim_work);
+ BUG_ON(c->journal.reclaim_thread);
return;
}
@@ -417,6 +416,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret) {
+ bch_err(c, "error starting journal reclaim: %i", ret);
+ return ret;
+ }
+
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
@@ -425,9 +430,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
percpu_ref_reinit(&c->writes);
set_bit(BCH_FS_RW, &c->flags);
-
- queue_delayed_work(c->journal_reclaim_wq,
- &c->journal.reclaim_work, 0);
return 0;
err:
__bch2_fs_read_only(c);
@@ -495,8 +497,6 @@ static void __bch2_fs_free(struct bch_fs *c)
kfree(c->unused_inode_hints);
free_heap(&c->copygc_heap);
- if (c->journal_reclaim_wq)
- destroy_workqueue(c->journal_reclaim_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->wq)
@@ -750,8 +750,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
@@ -2018,6 +2016,7 @@ static void bcachefs_exit(void)
bch2_debug_exit();
bch2_vfs_exit();
bch2_chardev_exit();
+ bch2_btree_key_cache_exit();
if (bcachefs_kset)
kset_unregister(bcachefs_kset);
}
@@ -2027,6 +2026,7 @@ static int __init bcachefs_init(void)
bch2_bkey_pack_test();
if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+ bch2_btree_key_cache_init() ||
bch2_chardev_init() ||
bch2_vfs_init() ||
bch2_debug_init())
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 58c00e26..900eda88 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -165,6 +165,7 @@ read_attribute(journal_debug);
read_attribute(journal_pins);
read_attribute(btree_updates);
read_attribute(dirty_btree_nodes);
+read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_transactions);
read_attribute(stripes_heap);
@@ -374,6 +375,11 @@ SHOW(bch2_fs)
return out.pos - buf;
}
+ if (attr == &sysfs_btree_cache) {
+ bch2_btree_cache_to_text(&out, c);
+ return out.pos - buf;
+ }
+
if (attr == &sysfs_btree_key_cache) {
bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
return out.pos - buf;
@@ -550,6 +556,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_pins,
&sysfs_btree_updates,
&sysfs_dirty_btree_nodes,
+ &sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_btree_transactions,
&sysfs_stripes_heap,