Diffstat (limited to 'libbcachefs')
-rw-r--r--  libbcachefs/alloc.c            171
-rw-r--r--  libbcachefs/alloc_types.h        7
-rw-r--r--  libbcachefs/bcachefs.h          10
-rw-r--r--  libbcachefs/bcachefs_format.h    3
-rw-r--r--  libbcachefs/btree_gc.c          68
-rw-r--r--  libbcachefs/btree_update.c      80
-rw-r--r--  libbcachefs/btree_update.h       7
-rw-r--r--  libbcachefs/buckets_types.h      8
-rw-r--r--  libbcachefs/checksum.c          30
-rw-r--r--  libbcachefs/compress.c          38
-rw-r--r--  libbcachefs/extents.c            9
-rw-r--r--  libbcachefs/fs-io.c              9
-rw-r--r--  libbcachefs/fs-io.h              2
-rw-r--r--  libbcachefs/io.c                 4
-rw-r--r--  libbcachefs/journal.c           12
-rw-r--r--  libbcachefs/migrate.c           21
-rw-r--r--  libbcachefs/super-io.c           2
-rw-r--r--  libbcachefs/super.c              6
-rw-r--r--  libbcachefs/sysfs.c             12
-rw-r--r--  libbcachefs/util.c               1
-rw-r--r--  libbcachefs/util.h               2
21 files changed, 275 insertions, 227 deletions
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index a4e412ea..f3ded7b4 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -71,6 +71,8 @@
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static void __bch2_bucket_free(struct bch_dev *, struct bucket *);
@@ -283,8 +285,8 @@ int bch2_prio_write(struct bch_dev *ca)
r < ca->mi.nbuckets && d < end;
r++, d++) {
g = ca->buckets + r;
- d->read_prio = cpu_to_le16(g->read_prio);
- d->write_prio = cpu_to_le16(g->write_prio);
+ d->prio[READ] = cpu_to_le16(g->prio[READ]);
+ d->prio[WRITE] = cpu_to_le16(g->prio[WRITE]);
d->gen = ca->buckets[r].mark.gen;
}
@@ -445,8 +447,8 @@ int bch2_prio_read(struct bch_dev *ca)
d = p->data;
}
- ca->buckets[b].read_prio = le16_to_cpu(d->read_prio);
- ca->buckets[b].write_prio = le16_to_cpu(d->write_prio);
+ ca->buckets[b].prio[READ] = le16_to_cpu(d->prio[READ]);
+ ca->buckets[b].prio[WRITE] = le16_to_cpu(d->prio[WRITE]);
bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen);
}
@@ -469,9 +471,9 @@ fsck_err:
* If there aren't enough available buckets to fill up free_inc, wait until
* there are.
*/
-static int wait_buckets_available(struct bch_dev *ca)
+static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
+ unsigned long gc_count = c->gc_count;
int ret = 0;
while (1) {
@@ -481,27 +483,18 @@ static int wait_buckets_available(struct bch_dev *ca)
break;
}
- if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) {
- if (c->gc_thread) {
- trace_gc_cannot_inc_gens(ca->fs);
- atomic_inc(&c->kick_gc);
- wake_up_process(ca->fs->gc_thread);
- }
+ if (gc_count != c->gc_count)
+ ca->inc_gen_really_needs_gc = 0;
- /*
- * We are going to wait for GC to wake us up, even if
- * bucket counters tell us enough buckets are available,
- * because we are actually waiting for GC to rewrite
- * nodes with stale pointers
- */
- } else if (dev_buckets_available(ca) >=
- fifo_free(&ca->free_inc))
+ if ((ssize_t) (dev_buckets_available(ca) -
+ ca->inc_gen_really_needs_gc) >=
+ (ssize_t) fifo_free(&ca->free_inc))
break;
- up_read(&ca->fs->gc_lock);
+ up_read(&c->gc_lock);
schedule();
try_to_freeze();
- down_read(&ca->fs->gc_lock);
+ down_read(&c->gc_lock);
}
__set_current_state(TASK_RUNNING);
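
The rewritten check above subtracts the buckets that cannot be invalidated again
until gc runs (inc_gen_really_needs_gc) from the available count, and does the
comparison in signed arithmetic so an underflow fails the test instead of
wrapping. A minimal sketch of that predicate, with the parameters standing in
for dev_buckets_available(), ca->inc_gen_really_needs_gc and
fifo_free(&ca->free_inc):

	#include <stdbool.h>
	#include <sys/types.h>	/* ssize_t */

	static bool enough_buckets_available(size_t available,
					     size_t really_needs_gc,
					     size_t free_inc_free)
	{
		/*
		 * If really_needs_gc > available, the unsigned subtraction
		 * would wrap to a huge value; the ssize_t casts make it go
		 * negative instead, so we keep waiting for gc.
		 */
		return (ssize_t) (available - really_needs_gc) >=
		       (ssize_t) free_inc_free;
	}
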
@@ -639,9 +632,12 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
if (!is_available_bucket(mark))
return false;
- if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1)
+ if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++;
+ if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX)
+ ca->inc_gen_really_needs_gc++;
+
return can_inc_bucket_gen(ca, g);
}
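
bch2_can_invalidate_bucket() now maintains two watermarks instead of one:
buckets past half the gc gen limit merely make gc desirable, while buckets at
the limit cannot have their gen incremented at all until mark-and-sweep runs.
A sketch of the accounting, with a stand-in value for BUCKET_GC_GEN_MAX:

	#include <stddef.h>

	#define GC_GEN_MAX 96	/* stand-in; the real limit is BUCKET_GC_GEN_MAX */

	static void account_gc_gen(unsigned gc_gen,
				   size_t *needs_gc, size_t *really_needs_gc)
	{
		if (gc_gen >= GC_GEN_MAX / 2)
			(*needs_gc)++;		/* gc would be worthwhile soon */
		if (gc_gen >= GC_GEN_MAX)
			(*really_needs_gc)++;	/* stuck until gc actually runs */
	}
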
@@ -651,8 +647,8 @@ static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
bch2_invalidate_bucket(ca, g);
- g->read_prio = ca->fs->prio_clock[READ].hand;
- g->write_prio = ca->fs->prio_clock[WRITE].hand;
+ g->prio[READ] = ca->fs->prio_clock[READ].hand;
+ g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
verify_not_on_freelist(ca, g - ca->buckets);
BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
@@ -672,40 +668,34 @@ static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
* - The number of sectors of cached data in the bucket, which gives us an
* indication of the cost in cache misses this eviction will cause.
*
- * - The difference between the bucket's current gen and oldest gen of any
- * pointer into it, which gives us an indication of the cost of an eventual
- * btree GC to rewrite nodes with stale pointers.
+ * - If hotness * sectors used compares equal, we pick the bucket with the
+ * smallest bucket_gc_gen() - since incrementing the same bucket's generation
+ * number repeatedly forces us to run mark and sweep gc to avoid generation
+ * number wraparound.
*/
-static unsigned long bucket_sort_key(bucket_heap *h,
- struct bucket_heap_entry e)
+static unsigned long bucket_sort_key(struct bch_dev *ca,
+ struct bucket *g,
+ struct bucket_mark m)
{
- struct bch_dev *ca = container_of(h, struct bch_dev, alloc_heap);
- struct bucket *g = ca->buckets + e.bucket;
- unsigned long prio = g->read_prio - ca->min_prio[READ];
- prio = (prio * 7) / (ca->fs->prio_clock[READ].hand -
- ca->min_prio[READ]);
+ unsigned long hotness =
+ (g->prio[READ] - ca->min_prio[READ]) * 7 /
+ (ca->fs->prio_clock[READ].hand - ca->min_prio[READ]);
- return (prio + 1) * bucket_sectors_used(e.mark);
-}
-
-static inline int bucket_alloc_cmp(bucket_heap *h,
- struct bucket_heap_entry l,
- struct bucket_heap_entry r)
-{
- return bucket_sort_key(h, l) - bucket_sort_key(h, r);
+ return (((hotness + 1) * bucket_sectors_used(m)) << 8) |
+ bucket_gc_gen(ca, g);
}
-static inline long bucket_idx_cmp(bucket_heap *h,
- struct bucket_heap_entry l,
- struct bucket_heap_entry r)
+static inline int bucket_alloc_cmp(alloc_heap *h,
+ struct alloc_heap_entry l,
+ struct alloc_heap_entry r)
{
- return l.bucket - r.bucket;
+ return (l.key > r.key) - (l.key < r.key);
}
static void invalidate_buckets_lru(struct bch_dev *ca)
{
- struct bucket_heap_entry e;
+ struct alloc_heap_entry e;
struct bucket *g;
ca->alloc_heap.used = 0;
@@ -721,23 +711,26 @@ static void invalidate_buckets_lru(struct bch_dev *ca)
*/
for_each_bucket(g, ca) {
struct bucket_mark m = READ_ONCE(g->mark);
- struct bucket_heap_entry e = { g - ca->buckets, m };
if (!bch2_can_invalidate_bucket(ca, g, m))
continue;
+ e = (struct alloc_heap_entry) {
+ .bucket = g - ca->buckets,
+ .key = bucket_sort_key(ca, g, m)
+ };
+
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
}
- /* Sort buckets by physical location on disk for better locality */
- heap_resort(&ca->alloc_heap, bucket_idx_cmp);
+ heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
/*
* If we run out of buckets to invalidate, bch2_allocator_thread() will
* kick stuff and retry us
*/
while (!fifo_full(&ca->free_inc) &&
- heap_pop(&ca->alloc_heap, e, bucket_idx_cmp))
+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]);
mutex_unlock(&ca->fs->bucket_lock);
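
The heap entries now carry a precomputed key, and heap_resort()/heap_pop()
reuse the same comparator, so the old "resort by physical index" pass is gone.
The key packs the eviction cost into the high bits and bucket_gc_gen() into the
low 8 bits, so one comparison orders buckets by cost first and by wraparound
pressure second. A standalone reconstruction of the key (assuming, as the code
above does, that the read hand is ahead of min_prio and gc gens fit in 8 bits):

	#include <stdint.h>

	static unsigned long alloc_sort_key(unsigned long read_prio,
					    unsigned long min_prio,
					    unsigned long read_hand,
					    unsigned long sectors_used,
					    uint8_t gc_gen)
	{
		/* Scale bucket hotness to 0..7 across the current prio window */
		unsigned long hotness =
			(read_prio - min_prio) * 7 / (read_hand - min_prio);

		/* Cost in the high bits, gc gen as the low-bits tiebreaker */
		return (((hotness + 1) * sectors_used) << 8) | gc_gen;
	}
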
@@ -790,6 +783,7 @@ static void invalidate_buckets_random(struct bch_dev *ca)
static void invalidate_buckets(struct bch_dev *ca)
{
ca->inc_gen_needs_gc = 0;
+ ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
case CACHE_REPLACEMENT_LRU:
@@ -852,8 +846,8 @@ static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
spin_lock(&ca->freelist_lock);
bch2_mark_alloc_bucket(ca, g, true);
- g->read_prio = c->prio_clock[READ].hand;
- g->write_prio = c->prio_clock[WRITE].hand;
+ g->prio[READ] = c->prio_clock[READ].hand;
+ g->prio[WRITE] = c->prio_clock[WRITE].hand;
verify_not_on_freelist(ca, g - ca->buckets);
BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
@@ -866,6 +860,13 @@ static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
}
}
+static int size_t_cmp(const void *_l, const void *_r)
+{
+ const size_t *l = _l, *r = _r;
+
+ return (*l > *r) - (*l < *r);
+}
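
Both bucket_alloc_cmp() above and size_t_cmp() here use the branchless
(a > b) - (a < b) idiom: it yields -1, 0 or 1 for any operand width, where the
old subtraction-style comparator could truncate or wrap when an unsigned long
key difference was narrowed to int. size_t_cmp() is used further down to sort
free_inc so invalidated buckets are handed back in on-disk order. Illustration:

	#include <assert.h>
	#include <stddef.h>

	/* Three-way compare without subtraction: safe for any unsigned width */
	static int cmp_size_t(size_t l, size_t r)
	{
		return (l > r) - (l < r);
	}

	int main(void)
	{
		assert(cmp_size_t(1, 2) == -1);
		assert(cmp_size_t(2, 2) ==  0);
		/* subtraction would have wrapped here; the idiom does not */
		assert(cmp_size_t((size_t) -1, 0) == 1);
		return 0;
	}
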
+
/**
* bch_allocator_thread - move buckets from free_inc to reserves
*
@@ -923,27 +924,13 @@ static int bch2_allocator_thread(void *arg)
__set_current_state(TASK_RUNNING);
}
- down_read(&c->gc_lock);
-
- /*
- * See if we have buckets we can reuse without invalidating them
- * or forcing a journal commit:
- */
- //bch2_find_empty_buckets(c, ca);
-
- if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
- up_read(&c->gc_lock);
- continue;
- }
-
/* We've run out of free buckets! */
- while (!fifo_full(&ca->free_inc)) {
- if (wait_buckets_available(ca)) {
- up_read(&c->gc_lock);
- goto out;
- }
+ BUG_ON(fifo_used(&ca->free_inc));
+ ca->free_inc.front = ca->free_inc.back = 0;
+ down_read(&c->gc_lock);
+ while (1) {
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@@ -952,12 +939,38 @@ static int bch2_allocator_thread(void *arg)
*/
invalidate_buckets(ca);
+
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
- ca->free_inc.size);
- }
+ ca->free_inc.size);
+
+ if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
+ (!fifo_full(&ca->free_inc) &&
+ ca->inc_gen_really_needs_gc >=
+ fifo_free(&ca->free_inc))) &&
+ c->gc_thread) {
+ atomic_inc(&c->kick_gc);
+ wake_up_process(c->gc_thread);
+ }
+
+ if (fifo_full(&ca->free_inc))
+ break;
+ if (wait_buckets_available(c, ca)) {
+ up_read(&c->gc_lock);
+ goto out;
+ }
+ }
up_read(&c->gc_lock);
+ BUG_ON(ca->free_inc.front);
+
+ spin_lock(&ca->freelist_lock);
+ sort(ca->free_inc.data,
+ ca->free_inc.back,
+ sizeof(ca->free_inc.data[0]),
+ size_t_cmp, NULL);
+ spin_unlock(&ca->freelist_lock);
+
/*
* free_inc is full of newly-invalidated buckets, must write out
* prios and gens before they can be re-used
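
The gc kick, previously buried in wait_buckets_available(), now lives in the
allocator loop itself: gc is woken when at least a free_inc's worth of buckets
is sitting past the halfway gen mark, or when the buckets that cannot be
invalidated at all are what is keeping free_inc from filling. The predicate,
restated as a sketch:

	#include <stdbool.h>
	#include <stddef.h>

	static bool should_kick_gc(size_t needs_gc, size_t really_needs_gc,
				   size_t free_inc_size, size_t free_inc_free,
				   bool free_inc_full)
	{
		return needs_gc >= free_inc_size ||
		       (!free_inc_full &&
			really_needs_gc >= free_inc_free);
	}

Once free_inc is full, it is sorted by bucket index (size_t_cmp) so that the
prio/gen writeback and any later discards touch the device sequentially.
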
@@ -1022,8 +1035,8 @@ out:
g = ca->buckets + r;
- g->read_prio = ca->fs->prio_clock[READ].hand;
- g->write_prio = ca->fs->prio_clock[WRITE].hand;
+ g->prio[READ] = ca->fs->prio_clock[READ].hand;
+ g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
return r;
}
@@ -1031,9 +1044,6 @@ out:
static void __bch2_bucket_free(struct bch_dev *ca, struct bucket *g)
{
bch2_mark_free_bucket(ca, g);
-
- g->read_prio = ca->fs->prio_clock[READ].hand;
- g->write_prio = ca->fs->prio_clock[WRITE].hand;
}
enum bucket_alloc_ret {
@@ -1614,8 +1624,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
unsigned i, j;
for_each_online_member(ca, c, i) {
- struct backing_dev_info *bdi =
- blk_get_backing_dev_info(ca->disk_sb.bdev);
+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
ra_pages += bdi->ra_pages;
}
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 1bf48ef9..ae58d083 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -99,4 +99,11 @@ struct write_point {
*/
};
+struct alloc_heap_entry {
+ size_t bucket;
+ unsigned long key;
+};
+
+typedef HEAP(struct alloc_heap_entry) alloc_heap;
+
#endif /* _BCACHE_ALLOC_TYPES_H */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 6259b50e..977ac364 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -240,8 +240,6 @@ do { \
"btree node it traverses") \
BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
"Disables rewriting of btree nodes during mark and sweep")\
- BCH_DEBUG_PARAM(btree_gc_coalesce_disabled, \
- "Disables coalescing of btree nodes") \
BCH_DEBUG_PARAM(btree_shrinker_disabled, \
"Disables the shrinker callback for the btree node cache")
@@ -273,7 +271,6 @@ do { \
#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \
BCH_TIME_STAT(btree_gc, sec, ms) \
- BCH_TIME_STAT(btree_coalesce, sec, ms) \
BCH_TIME_STAT(btree_split, sec, us) \
BCH_TIME_STAT(btree_sort, ms, us) \
BCH_TIME_STAT(btree_read, ms, us) \
@@ -417,8 +414,9 @@ struct bch_dev {
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
+ size_t inc_gen_really_needs_gc;
- bucket_heap alloc_heap;
+ alloc_heap alloc_heap;
bucket_heap copygc_heap;
/* Moving GC: */
@@ -681,6 +679,7 @@ struct bch_fs {
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
+ unsigned long gc_count;
/*
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
@@ -716,7 +715,7 @@ struct bch_fs {
mempool_t compression_bounce[2];
struct crypto_shash *sha256;
- struct crypto_blkcipher *chacha20;
+ struct crypto_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
@@ -762,6 +761,7 @@ struct bch_fs {
/* The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races;
+ unsigned btree_gc_periodic:1;
unsigned foreground_write_ratelimit_enabled:1;
unsigned copy_gc_enabled:1;
unsigned tiering_enabled:1;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index ef854fb1..2d64bcae 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1208,8 +1208,7 @@ struct prio_set {
__le64 next_bucket;
struct bucket_disk {
- __le16 read_prio;
- __le16 write_prio;
+ __le16 prio[2];
__u8 gen;
} __attribute__((packed)) data[];
} __attribute__((packed, aligned(8)));
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 99d28f64..d907ef58 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -166,14 +166,14 @@ fsck_err:
return ret;
}
-static bool btree_gc_mark_node(struct bch_fs *c, struct btree *b)
+static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
{
- if (btree_node_has_ptrs(b)) {
- struct btree_node_iter iter;
- struct bkey unpacked;
- struct bkey_s_c k;
- u8 stale = 0;
+ struct btree_node_iter iter;
+ struct bkey unpacked;
+ struct bkey_s_c k;
+ u8 stale = 0;
+ if (btree_node_has_ptrs(b))
for_each_btree_node_key_unpack(b, k, &iter,
btree_node_is_extents(b),
&unpacked) {
@@ -182,17 +182,7 @@ static bool btree_gc_mark_node(struct bch_fs *c, struct btree *b)
btree_node_type(b), k));
}
- if (btree_gc_rewrite_disabled(c))
- return false;
-
- if (stale > 10)
- return true;
- }
-
- if (btree_gc_always_rewrite(c))
- return true;
-
- return false;
+ return stale;
}
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
@@ -212,10 +202,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
{
struct btree_iter iter;
struct btree *b;
- bool should_rewrite;
struct range_checks r;
unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
- int ret;
+ unsigned max_stale;
+ int ret = 0;
/*
* if expensive_debug_checks is on, run range_checks on all leaf nodes:
@@ -231,12 +221,21 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
bch2_verify_btree_nr_keys(b);
- should_rewrite = btree_gc_mark_node(c, b);
+ max_stale = btree_gc_mark_node(c, b);
gc_pos_set(c, gc_pos_btree_node(b));
- if (should_rewrite)
- bch2_btree_node_rewrite(&iter, b, NULL);
+ if (max_stale > 32)
+ bch2_btree_node_rewrite(c, &iter,
+ b->data->keys.seq,
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_GC_LOCK_HELD);
+ else if (!btree_gc_rewrite_disabled(c) &&
+ (btree_gc_always_rewrite(c) || max_stale > 16))
+ bch2_btree_node_rewrite(c, &iter,
+ b->data->keys.seq,
+ BTREE_INSERT_NOWAIT|
+ BTREE_INSERT_GC_LOCK_HELD);
bch2_btree_iter_cond_resched(&iter);
}
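
btree_gc_mark_node() now reports how stale the node's most stale pointer is,
and bch2_gc_btree() turns that into a two-tier policy: very stale nodes must be
rewritten, so the rewrite may dip into the btree reserve, while mildly stale
nodes are rewritten opportunistically with BTREE_INSERT_NOWAIT, giving up
rather than blocking on allocation. Both paths pass BTREE_INSERT_GC_LOCK_HELD
since gc already holds gc_lock. The decision restated (thresholds from the
patch, flag values illustrative):

	#include <stdbool.h>

	enum {
		USE_RESERVE  = 1 << 0,	/* stands in for BTREE_INSERT_USE_RESERVE */
		NOWAIT       = 1 << 1,	/* stands in for BTREE_INSERT_NOWAIT */
		GC_LOCK_HELD = 1 << 2,	/* stands in for BTREE_INSERT_GC_LOCK_HELD */
	};

	/* Returns rewrite flags, or -1 to leave the node alone */
	static int gc_rewrite_flags(unsigned max_stale,
				    bool rewrite_disabled, bool always_rewrite)
	{
		if (max_stale > 32)		/* must succeed eventually */
			return USE_RESERVE | GC_LOCK_HELD;
		if (!rewrite_disabled &&
		    (always_rewrite || max_stale > 16))
			return NOWAIT | GC_LOCK_HELD;	/* best effort only */
		return -1;
	}
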
@@ -507,6 +506,7 @@ void bch2_gc(struct bch_fs *c)
/* Indicates that gc is no longer in progress: */
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
+ c->gc_count++;
up_write(&c->gc_lock);
trace_gc_end(c);
@@ -835,7 +835,6 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
*/
void bch2_coalesce(struct bch_fs *c)
{
- u64 start_time;
enum btree_id id;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
@@ -843,7 +842,6 @@ void bch2_coalesce(struct bch_fs *c)
down_read(&c->gc_lock);
trace_gc_coalesce_start(c);
- start_time = local_clock();
for (id = 0; id < BTREE_ID_NR; id++) {
int ret = c->btree_roots[id].b
@@ -858,7 +856,6 @@ void bch2_coalesce(struct bch_fs *c)
}
}
- bch2_time_stats_update(&c->btree_coalesce_time, start_time);
trace_gc_coalesce_end(c);
up_read(&c->gc_lock);
}
@@ -873,9 +870,7 @@ static int bch2_gc_thread(void *arg)
set_freezable();
while (1) {
- unsigned long next = last + c->capacity / 16;
-
- while (atomic_long_read(&clock->now) < next) {
+ while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
@@ -883,21 +878,28 @@ static int bch2_gc_thread(void *arg)
return 0;
}
- if (atomic_read(&c->kick_gc) != last_kick) {
- __set_current_state(TASK_RUNNING);
+ if (atomic_read(&c->kick_gc) != last_kick)
break;
+
+ if (c->btree_gc_periodic) {
+ unsigned long next = last + c->capacity / 16;
+
+ if (atomic_long_read(&clock->now) >= next)
+ break;
+
+ bch2_io_clock_schedule_timeout(clock, next);
+ } else {
+ schedule();
}
- bch2_io_clock_schedule_timeout(clock, next);
try_to_freeze();
}
+ __set_current_state(TASK_RUNNING);
last = atomic_long_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
bch2_gc(c);
- if (!btree_gc_coalesce_disabled(c))
- bch2_coalesce(c);
debug_check_no_locks_held();
}
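
With coalescing no longer driven from the gc thread, its wait loop has two
wake sources: an explicit kick (the allocator bumping kick_gc) always wakes it,
while the capacity-based deadline on the io clock applies only when the new
btree_gc_periodic knob is set; otherwise the thread sleeps until kicked. The
wake condition, as a sketch:

	#include <stdbool.h>

	static bool gc_should_wake(int kick, int last_kick, bool periodic,
				   unsigned long io_clock_now,
				   unsigned long last_gc,
				   unsigned long capacity)
	{
		if (kick != last_kick)		/* explicit kick always wins */
			return true;

		/* In periodic mode, run after 1/16th of capacity has passed */
		return periodic && io_clock_now >= last_gc + capacity / 16;
	}
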
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 2f67c092..8a4ee6d1 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -533,6 +533,9 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
if (flags & BTREE_INSERT_NOFAIL)
disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
+ if (flags & BTREE_INSERT_NOWAIT)
+ cl = NULL;
+
/*
* This check isn't necessary for correctness - it's just to potentially
* prevent us from doing a lot of work that'll end up being wasted:
@@ -2279,30 +2282,13 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
return ret;
}
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- *
- * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
- * btree_check_reserve() has to wait)
- */
-int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
- struct closure *cl)
+static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+ struct btree *b, unsigned flags,
+ struct closure *cl)
{
- struct bch_fs *c = iter->c;
struct btree *n, *parent = iter->nodes[b->level + 1];
struct btree_reserve *reserve;
struct btree_interior_update *as;
- unsigned flags = BTREE_INSERT_NOFAIL;
-
- /*
- * if caller is going to wait if allocating reserve fails, then this is
- * a rewrite that must succeed:
- */
- if (cl)
- flags |= BTREE_INSERT_USE_RESERVE;
-
- if (!bch2_btree_iter_set_locks_want(iter, U8_MAX))
- return -EINTR;
reserve = bch2_btree_reserve_get(c, b, 0, flags, cl);
if (IS_ERR(reserve)) {
@@ -2341,3 +2327,57 @@ int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
bch2_btree_reserve_put(c, reserve);
return 0;
}
+
+/**
+ * bch2_btree_node_rewrite - Rewrite/move a btree node
+ *
+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
+ * btree_check_reserve() has to wait)
+ */
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+ __le64 seq, unsigned flags)
+{
+ unsigned locks_want = iter->locks_want;
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ flags |= BTREE_INSERT_NOFAIL;
+
+ closure_init_stack(&cl);
+
+ bch2_btree_iter_set_locks_want(iter, U8_MAX);
+
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
+ if (!down_read_trylock(&c->gc_lock)) {
+ bch2_btree_iter_unlock(iter);
+ down_read(&c->gc_lock);
+ }
+ }
+
+ while (1) {
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ break;
+
+ b = bch2_btree_iter_peek_node(iter);
+ if (!b || b->data->keys.seq != seq)
+ break;
+
+ ret = __btree_node_rewrite(c, iter, b, flags, &cl);
+ if (ret != -EAGAIN &&
+ ret != -EINTR)
+ break;
+
+ bch2_btree_iter_unlock(iter);
+ closure_sync(&cl);
+ }
+
+ bch2_btree_iter_set_locks_want(iter, locks_want);
+
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
+
+ closure_sync(&cl);
+ return ret;
+}
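
Callers of the reworked bch2_btree_node_rewrite() now identify the node by its
keys.seq instead of passing a btree pointer and a caller-owned closure; the
helper re-traverses after every -EINTR/-EAGAIN, and the seq check lets it
notice that the node was already rewritten out from under it (which is why the
retry loops in journal.c and migrate.c below collapse). Typical call-site shape
after this patch (sketch; error handling trimmed):

	/* iter is positioned at the node of interest */
	struct btree *b = bch2_btree_iter_peek_node(&iter);

	if (b)
		ret = bch2_btree_node_rewrite(c, &iter,
					      b->data->keys.seq, 0);
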
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index a933d5a9..7c4abe4a 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -380,6 +380,10 @@ int __bch2_btree_insert_at(struct btree_insert *);
*/
#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
+/* Don't block on allocation failure (for new btree nodes) */
+#define BTREE_INSERT_NOWAIT (1 << 4)
+#define BTREE_INSERT_GC_LOCK_HELD (1 << 5)
+
int bch2_btree_delete_at(struct btree_iter *, unsigned);
int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
@@ -416,7 +420,8 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct disk_reservation *,
struct extent_insert_hook *, u64 *);
-int bch2_btree_node_rewrite(struct btree_iter *, struct btree *, struct closure *);
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
+ __le64, unsigned);
#endif /* _BCACHE_BTREE_INSERT_H */
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 18bf1713..68f863f3 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -50,13 +50,7 @@ struct bucket_mark {
};
struct bucket {
- union {
- struct {
- u16 read_prio;
- u16 write_prio;
- };
- u16 prio[2];
- };
+ u16 prio[2];
union {
struct bucket_mark _mark;
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 4545a499..f2883e1f 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -178,18 +178,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_blkcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- struct blkcipher_desc desc = { .tfm = tfm, .info = nonce.d };
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+
+ ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
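
The blkcipher API this code used was removed upstream; its skcipher
replacement drives the cipher through a request object rather than a desc. An
annotated copy of the converted helper (kernel context assumed):

	static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
					 struct nonce nonce,
					 struct scatterlist *sg, size_t len)
	{
		/* Request on the stack: fine for a synchronous tfm */
		SKCIPHER_REQUEST_ON_STACK(req, tfm);
		int ret;

		skcipher_request_set_tfm(req, tfm);
		/* src == dst: the cipher runs in place; nonce.d is the IV */
		skcipher_request_set_crypt(req, sg, sg, len, nonce.d);

		ret = crypto_skcipher_encrypt(req);
		BUG_ON(ret);	/* a sync stream cipher only fails on bugs */
	}
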
-static inline void do_encrypt(struct crypto_blkcipher *tfm,
+static inline void do_encrypt(struct crypto_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -202,20 +205,20 @@ static inline void do_encrypt(struct crypto_blkcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_blkcipher *chacha20 =
- crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
+ struct crypto_skcipher *chacha20 =
+ crypto_alloc_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20)
return PTR_ERR(chacha20);
- ret = crypto_blkcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
if (ret)
goto err;
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_blkcipher(chacha20);
+ crypto_free_skcipher(chacha20);
return ret;
}
@@ -377,7 +380,7 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
return PTR_ERR(keyring_key);
down_read(&keyring_key->sem);
- ukp = user_key_payload(keyring_key);
+ ukp = dereference_key_locked(keyring_key);
if (ukp->datalen == sizeof(*key)) {
memcpy(key, ukp->data, ukp->datalen);
ret = 0;
@@ -454,8 +457,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_blkcipher("chacha20", 0,
- CRYPTO_ALG_ASYNC);
+ c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20))
return PTR_ERR(c->chacha20);
@@ -532,7 +534,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_blkcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(c->chacha20,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -560,7 +562,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_blkcipher(c->chacha20);
+ crypto_free_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -587,7 +589,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto err;
- ret = crypto_blkcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(c->chacha20,
(void *) &key.key, sizeof(key.key));
err:
memzero_explicit(&key, sizeof(key));
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 547ea732..62b42042 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -148,9 +148,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
switch (crc.compression_type) {
case BCH_COMPRESSION_LZ4:
- ret = lz4_decompress(src_data, &src_len,
- dst_data, dst_len);
- if (ret) {
+ ret = LZ4_decompress_safe(src_data, dst_data,
+ src_len, dst_len);
+
+ if (ret != dst_len) {
ret = -EIO;
goto err;
}
@@ -286,32 +287,27 @@ static int __bio_compress(struct bch_fs *c,
switch (compression_type) {
case BCH_COMPRESSION_LZ4: {
void *workspace;
-
- *dst_len = dst->bi_iter.bi_size;
- *src_len = src->bi_iter.bi_size;
+ int srclen = src->bi_iter.bi_size;
+ ret = 0;
workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
- while (*src_len > block_bytes(c) &&
- (ret = lz4_compress(src_data, *src_len,
- dst_data, dst_len,
- workspace))) {
- /*
- * On error, the compressed data was bigger than
- * dst_len, and -ret is the amount of data we were able
- * to compress - round down to nearest block and try
- * again:
- */
- BUG_ON(ret > 0);
- BUG_ON(-ret >= *src_len);
-
- *src_len = round_down(-ret, block_bytes(c));
+ while (srclen > block_bytes(c) &&
+ (ret = LZ4_compress_destSize(src_data, dst_data,
+ &srclen, dst->bi_iter.bi_size,
+ workspace)) &&
+ (srclen & (block_bytes(c) - 1))) {
+ /* Round down to nearest block and try again: */
+ srclen = round_down(srclen, block_bytes(c));
}
mempool_free(workspace, &c->lz4_workspace_pool);
- if (ret)
+ if (!ret)
goto err;
+
+ *src_len = srclen;
+ *dst_len = ret;
break;
}
case BCH_COMPRESSION_GZIP: {
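
The lz4 kernel interface changed with the updated library import:
LZ4_decompress_safe() returns the decompressed byte count (so anything other
than dst_len is treated as corruption), and LZ4_compress_destSize() compresses
as much input as fits into the destination, writing the amount of input
consumed back through its pointer argument. The loop above only retries when
that consumed count is not block-aligned. A userspace sketch of the same loop
against the regular lz4 library (which lacks the kernel's extra workspace
argument):

	#include <lz4.h>

	/*
	 * Compress as much of src as fits in dst, accepting only a
	 * block-aligned amount of consumed input. Returns the compressed
	 * size (0: nothing compressed); *consumed = input bytes used.
	 * block_bytes is assumed to be a power of two.
	 */
	static int compress_block_aligned(const char *src, int src_len,
					  char *dst, int dst_capacity,
					  int block_bytes, int *consumed)
	{
		int n = src_len, written = 0;

		while (n > block_bytes &&
		       (written = LZ4_compress_destSize(src, dst,
							&n, dst_capacity)) &&
		       (n & (block_bytes - 1))) {
			/* Round consumed input down to a block and retry */
			n &= ~(block_bytes - 1);
		}

		*consumed = n;
		return written;
	}
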
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 219b60a3..57bfb4a6 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -559,10 +559,10 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
return;
err:
bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
- bch2_fs_bug(c, "%s btree pointer %s: bucket %zi prio %i "
+ bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
"gen %i last_gc %i mark %08x",
err, buf, PTR_BUCKET_NR(ca, ptr),
- g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
+ PTR_BUCKET(ca, ptr)->mark.gen,
ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
(unsigned) g->mark.counter);
}
@@ -1769,10 +1769,9 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
bad_ptr:
bch2_bkey_val_to_text(c, btree_node_type(b), buf,
sizeof(buf), e.s_c);
- bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
+ bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
"gen %i last_gc %i mark 0x%08x",
- buf, PTR_BUCKET_NR(ca, ptr),
- g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
+ buf, PTR_BUCKET_NR(ca, ptr), PTR_BUCKET(ca, ptr)->mark.gen,
ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
(unsigned) g->mark.counter);
return;
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 4a680ade..803611d1 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -757,7 +757,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
flags |= BCH_READ_IS_LAST;
if (pick.ca) {
- PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+ PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] =
c->prio_clock[READ].hand;
bch2_read_extent(c, rbio, k, &pick, flags);
@@ -1775,16 +1775,17 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ret;
}
-int bch2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+int bch2_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = file_inode(vma->vm_file);
+ struct file *file = vmf->vma->vm_file;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
struct bch_fs *c = inode->i_sb->s_fs_info;
int ret = VM_FAULT_LOCKED;
sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
+ file_update_time(file);
/*
* Not strictly necessary, but helps avoid dio writes livelocking in
diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h
index f3fcf947..3fcc1e7d 100644
--- a/libbcachefs/fs-io.h
+++ b/libbcachefs/fs-io.h
@@ -29,7 +29,7 @@ long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
loff_t bch2_llseek(struct file *, loff_t, int);
-int bch2_page_mkwrite(struct vm_area_struct *, struct vm_fault *);
+int bch2_page_mkwrite(struct vm_fault *);
void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
int bch2_releasepage(struct page *, gfp_t);
int bch2_migrate_page(struct address_space *, struct page *,
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index d588f6ab..1145a190 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1300,11 +1300,11 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
flags |= BCH_READ_IS_LAST;
if (pick.ca) {
- PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+ PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] =
c->prio_clock[READ].hand;
bch2_read_extent_iter(c, rbio, bvec_iter,
- k, &pick, flags);
+ k, &pick, flags);
flags &= ~BCH_READ_MAY_REUSE_BIO;
} else {
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 510066a2..92364fea 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -164,21 +164,15 @@ static void journal_seq_blacklist_flush(struct journal *j,
mutex_unlock(&j->blacklist_lock);
__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
-redo_peek:
+
b = bch2_btree_iter_peek_node(&iter);
/* The node might have already been rewritten: */
if (b->data->keys.seq == n.seq) {
- ret = bch2_btree_node_rewrite(&iter, b, &cl);
+ ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
if (ret) {
bch2_btree_iter_unlock(&iter);
- closure_sync(&cl);
-
- if (ret == -EAGAIN ||
- ret == -EINTR)
- goto redo_peek;
-
bch2_fs_fatal_error(c,
"error %i rewriting btree node with blacklisted journal seq",
ret);
@@ -190,8 +184,6 @@ redo_peek:
bch2_btree_iter_unlock(&iter);
}
- closure_sync(&cl);
-
for (i = 0;; i++) {
struct btree_interior_update *as;
struct pending_btree_node_free *d;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 8680b100..8c9e3c25 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -156,9 +156,9 @@ next:
* This walks the btree, and for any node on the relevant device it moves the
* node elsewhere.
*/
-static int bch2_move_btree_off(struct bch_dev *ca, enum btree_id id)
+static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_id id)
{
- struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct closure cl;
struct btree *b;
@@ -170,22 +170,11 @@ static int bch2_move_btree_off(struct bch_dev *ca, enum btree_id id)
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-retry:
+
if (!bch2_extent_has_device(e, ca->dev_idx))
continue;
- ret = bch2_btree_node_rewrite(&iter, b, &cl);
- if (ret == -EINTR || ret == -ENOSPC) {
- /*
- * Drop locks to upgrade locks or wait on
- * reserve: after retaking, recheck in case we
- * raced.
- */
- bch2_btree_iter_unlock(&iter);
- closure_sync(&cl);
- b = bch2_btree_iter_peek_node(&iter);
- goto retry;
- }
+ ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
if (ret) {
bch2_btree_iter_unlock(&iter);
return ret;
@@ -268,7 +257,7 @@ int bch2_move_metadata_off_device(struct bch_dev *ca)
/* 1st, Move the btree nodes off the device */
for (i = 0; i < BTREE_ID_NR; i++) {
- ret = bch2_move_btree_off(ca, i);
+ ret = bch2_move_btree_off(c, ca, i);
if (ret)
return ret;
}
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index fa020af3..130b130f 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -442,7 +442,7 @@ static const char *bch2_blkdev_open(const char *path, fmode_t mode,
return "failed to open device";
if (mode & FMODE_WRITE)
- bdev_get_queue(bdev)->backing_dev_info.capabilities
+ bdev_get_queue(bdev)->backing_dev_info->capabilities
|= BDI_CAP_STABLE_WRITES;
*ret = bdev;
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 528538b5..2a3947e2 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -148,7 +148,7 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
if (bdi_bits & (1 << WB_sync_congested)) {
/* Reads - check all devices: */
for_each_readable_member(ca, c, i) {
- bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+ bdi = ca->disk_sb.bdev->bd_bdi;
if (bdi_congested(bdi, bdi_bits)) {
ret = 1;
@@ -162,7 +162,7 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
rcu_read_lock();
group_for_each_dev(ca, grp, i) {
- bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+ bdi = ca->disk_sb.bdev->bd_bdi;
if (bdi_congested(bdi, bdi_bits)) {
ret = 1;
@@ -1144,7 +1144,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
movinggc_reserve, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
!init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
- !init_heap(&ca->alloc_heap, heap_size, GFP_KERNEL) ||
+ !init_heap(&ca->alloc_heap, free_inc_reserve, GFP_KERNEL) ||
!init_heap(&ca->copygc_heap, heap_size, GFP_KERNEL) ||
!(ca->oldest_gens = kvpmalloc(ca->mi.nbuckets *
sizeof(u8),
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 3c47f1cb..edfa85b0 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -24,6 +24,7 @@
#include <linux/blkdev.h>
#include <linux/sort.h>
+#include <linux/sched/clock.h>
#include "util.h"
@@ -124,6 +125,7 @@ write_attribute(trigger_journal_flush);
write_attribute(trigger_btree_coalesce);
write_attribute(trigger_gc);
write_attribute(prune_cache);
+rw_attribute(btree_gc_periodic);
read_attribute(uuid);
read_attribute(minor);
@@ -319,6 +321,8 @@ SHOW(bch2_fs)
sysfs_print(read_realloc_races,
atomic_long_read(&c->read_realloc_races));
+ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
+
sysfs_printf(foreground_write_ratelimit_enabled, "%i",
c->foreground_write_ratelimit_enabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
@@ -367,6 +371,14 @@ STORE(__bch2_fs)
sysfs_strtoul(foreground_write_ratelimit_enabled,
c->foreground_write_ratelimit_enabled);
+ if (attr == &sysfs_btree_gc_periodic) {
+ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
+ ?: (ssize_t) size;
+
+ wake_up_process(c->gc_thread);
+ return ret;
+ }
+
if (attr == &sysfs_copy_gc_enabled) {
struct bch_dev *ca;
unsigned i;
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 6ffc9811..5400dec5 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -17,6 +17,7 @@
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/sched/clock.h>
#include "util.h"
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index d7511aeb..927aa3a9 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -5,9 +5,9 @@
#include <linux/blkdev.h>
#include <linux/closure.h>
#include <linux/errno.h>
-#include <linux/blkdev.h>
#include <linux/freezer.h>
#include <linux/kernel.h>
+#include <linux/sched/clock.h>
#include <linux/llist.h>
#include <linux/log2.h>
#include <linux/ratelimit.h>