Diffstat (limited to 'libbcachefs')
 libbcachefs/alloc.c           | 171
 libbcachefs/alloc_types.h     |   7
 libbcachefs/bcachefs.h        |  10
 libbcachefs/bcachefs_format.h |   3
 libbcachefs/btree_gc.c        |  68
 libbcachefs/btree_update.c    |  80
 libbcachefs/btree_update.h    |   7
 libbcachefs/buckets_types.h   |   8
 libbcachefs/checksum.c        |  30
 libbcachefs/compress.c        |  38
 libbcachefs/extents.c         |   9
 libbcachefs/fs-io.c           |   9
 libbcachefs/fs-io.h           |   2
 libbcachefs/io.c              |   4
 libbcachefs/journal.c         |  12
 libbcachefs/migrate.c         |  21
 libbcachefs/super-io.c        |   2
 libbcachefs/super.c           |   6
 libbcachefs/sysfs.c           |  12
 libbcachefs/util.c            |   1
 libbcachefs/util.h            |   2
 21 files changed, 275 insertions(+), 227 deletions(-)
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index a4e412ea..f3ded7b4 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -71,6 +71,8 @@
 #include <linux/math64.h>
 #include <linux/random.h>
 #include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
 static void __bch2_bucket_free(struct bch_dev *, struct bucket *);
@@ -283,8 +285,8 @@ int bch2_prio_write(struct bch_dev *ca)
 		     r < ca->mi.nbuckets && d < end;
 		     r++, d++) {
 			g = ca->buckets + r;
-			d->read_prio	= cpu_to_le16(g->read_prio);
-			d->write_prio	= cpu_to_le16(g->write_prio);
+			d->prio[READ]	= cpu_to_le16(g->prio[READ]);
+			d->prio[WRITE]	= cpu_to_le16(g->prio[WRITE]);
 			d->gen		= ca->buckets[r].mark.gen;
 		}
 
@@ -445,8 +447,8 @@ int bch2_prio_read(struct bch_dev *ca)
 			d = p->data;
 		}
 
-		ca->buckets[b].read_prio	= le16_to_cpu(d->read_prio);
-		ca->buckets[b].write_prio	= le16_to_cpu(d->write_prio);
+		ca->buckets[b].prio[READ]	= le16_to_cpu(d->prio[READ]);
+		ca->buckets[b].prio[WRITE]	= le16_to_cpu(d->prio[WRITE]);
 
 		bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen);
 	}
@@ -469,9 +471,9 @@ fsck_err:
  * If there aren't enough available buckets to fill up free_inc, wait until
  * there are.
  */
-static int wait_buckets_available(struct bch_dev *ca)
+static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
 {
-	struct bch_fs *c = ca->fs;
+	unsigned long gc_count = c->gc_count;
 	int ret = 0;
 
 	while (1) {
@@ -481,27 +483,18 @@ static int wait_buckets_available(struct bch_dev *ca)
 			break;
 		}
 
-		if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) {
-			if (c->gc_thread) {
-				trace_gc_cannot_inc_gens(ca->fs);
-				atomic_inc(&c->kick_gc);
-				wake_up_process(ca->fs->gc_thread);
-			}
+		if (gc_count != c->gc_count)
+			ca->inc_gen_really_needs_gc = 0;
 
-			/*
-			 * We are going to wait for GC to wake us up, even if
-			 * bucket counters tell us enough buckets are available,
-			 * because we are actually waiting for GC to rewrite
-			 * nodes with stale pointers
-			 */
-		} else if (dev_buckets_available(ca) >=
-			   fifo_free(&ca->free_inc))
+		if ((ssize_t) (dev_buckets_available(ca) -
+			       ca->inc_gen_really_needs_gc) >=
+		    (ssize_t) fifo_free(&ca->free_inc))
 			break;
 
-		up_read(&ca->fs->gc_lock);
+		up_read(&c->gc_lock);
 		schedule();
 		try_to_freeze();
-		down_read(&ca->fs->gc_lock);
+		down_read(&c->gc_lock);
 	}
 
 	__set_current_state(TASK_RUNNING);
@@ -639,9 +632,12 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
 	if (!is_available_bucket(mark))
 		return false;
 
-	if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1)
+	if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX / 2)
 		ca->inc_gen_needs_gc++;
 
+	if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX)
+		ca->inc_gen_really_needs_gc++;
+
 	return can_inc_bucket_gen(ca, g);
 }
 
@@ -651,8 +647,8 @@ static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
 
 	bch2_invalidate_bucket(ca, g);
 
-	g->read_prio = ca->fs->prio_clock[READ].hand;
-	g->write_prio = ca->fs->prio_clock[WRITE].hand;
+	g->prio[READ] = ca->fs->prio_clock[READ].hand;
+	g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
 
 	verify_not_on_freelist(ca, g - ca->buckets);
 	BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
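A note on the new wait condition in wait_buckets_available(): dev_buckets_available() and fifo_free() both yield unsigned size_t values, so without the (ssize_t) casts the subtraction would wrap to a huge positive number whenever inc_gen_really_needs_gc exceeds the available-bucket count, and the allocator would stop waiting exactly when it most needs to. A minimal standalone sketch (illustrative values, not bcachefs code):

    #include <stdio.h>
    #include <sys/types.h>

    int main(void)
    {
    	size_t available = 3, really_needs_gc = 10, free_slots = 8;

    	/* unsigned: 3 - 10 wraps to SIZE_MAX - 6, so this is true */
    	printf("unsigned: %d\n", available - really_needs_gc >= free_slots);

    	/* signed, as in the patch: -7 >= 8 is false, so we keep waiting */
    	printf("signed:   %d\n",
    	       (ssize_t) (available - really_needs_gc) >= (ssize_t) free_slots);
    	return 0;
    }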
@@ -672,40 +668,34 @@ static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
  * - The number of sectors of cached data in the bucket, which gives us an
  *   indication of the cost in cache misses this eviction will cause.
  *
- * - The difference between the bucket's current gen and oldest gen of any
- *   pointer into it, which gives us an indication of the cost of an eventual
- *   btree GC to rewrite nodes with stale pointers.
+ * - If hotness * sectors used compares equal, we pick the bucket with the
+ *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
+ *   number repeatedly forces us to run mark and sweep gc to avoid generation
+ *   number wraparound.
  */
-static unsigned long bucket_sort_key(bucket_heap *h,
-				     struct bucket_heap_entry e)
+static unsigned long bucket_sort_key(struct bch_dev *ca,
+				     struct bucket *g,
+				     struct bucket_mark m)
 {
-	struct bch_dev *ca = container_of(h, struct bch_dev, alloc_heap);
-	struct bucket *g = ca->buckets + e.bucket;
-	unsigned long prio = g->read_prio - ca->min_prio[READ];
-	prio = (prio * 7) / (ca->fs->prio_clock[READ].hand -
-			     ca->min_prio[READ]);
+	unsigned long hotness =
+		(g->prio[READ] - ca->min_prio[READ]) * 7 /
+		(ca->fs->prio_clock[READ].hand - ca->min_prio[READ]);
 
-	return (prio + 1) * bucket_sectors_used(e.mark);
-}
-
-static inline int bucket_alloc_cmp(bucket_heap *h,
-				   struct bucket_heap_entry l,
-				   struct bucket_heap_entry r)
-{
-	return bucket_sort_key(h, l) - bucket_sort_key(h, r);
+	return (((hotness + 1) * bucket_sectors_used(m)) << 8) |
+		bucket_gc_gen(ca, g);
 }
 
-static inline long bucket_idx_cmp(bucket_heap *h,
-				  struct bucket_heap_entry l,
-				  struct bucket_heap_entry r)
+static inline int bucket_alloc_cmp(alloc_heap *h,
+				   struct alloc_heap_entry l,
+				   struct alloc_heap_entry r)
 {
-	return l.bucket - r.bucket;
+	return (l.key > r.key) - (l.key < r.key);
 }
 
 static void invalidate_buckets_lru(struct bch_dev *ca)
 {
-	struct bucket_heap_entry e;
+	struct alloc_heap_entry e;
 	struct bucket *g;
 
 	ca->alloc_heap.used = 0;
@@ -721,23 +711,26 @@ static void invalidate_buckets_lru(struct bch_dev *ca)
 	 */
 	for_each_bucket(g, ca) {
 		struct bucket_mark m = READ_ONCE(g->mark);
-		struct bucket_heap_entry e = { g - ca->buckets, m };
 
 		if (!bch2_can_invalidate_bucket(ca, g, m))
 			continue;
 
+		e = (struct alloc_heap_entry) {
+			.bucket = g - ca->buckets,
+			.key	= bucket_sort_key(ca, g, m)
+		};
+
 		heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
 	}
 
-	/* Sort buckets by physical location on disk for better locality */
-	heap_resort(&ca->alloc_heap, bucket_idx_cmp);
+	heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
 
 	/*
 	 * If we run out of buckets to invalidate, bch2_allocator_thread() will
 	 * kick stuff and retry us
 	 */
 	while (!fifo_full(&ca->free_inc) &&
-	       heap_pop(&ca->alloc_heap, e, bucket_idx_cmp))
+	       heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
 		bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]);
 
 	mutex_unlock(&ca->fs->bucket_lock);
@@ -790,6 +783,7 @@ static void invalidate_buckets_random(struct bch_dev *ca)
 static void invalidate_buckets(struct bch_dev *ca)
 {
 	ca->inc_gen_needs_gc = 0;
+	ca->inc_gen_really_needs_gc = 0;
 
 	switch (ca->mi.replacement) {
 	case CACHE_REPLACEMENT_LRU:
@@ -852,8 +846,8 @@ static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
 			spin_lock(&ca->freelist_lock);
 
 			bch2_mark_alloc_bucket(ca, g, true);
-			g->read_prio = c->prio_clock[READ].hand;
-			g->write_prio = c->prio_clock[WRITE].hand;
+			g->prio[READ] = c->prio_clock[READ].hand;
+			g->prio[WRITE] = c->prio_clock[WRITE].hand;
 
 			verify_not_on_freelist(ca, g - ca->buckets);
 			BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
@@ -866,6 +860,13 @@ static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
 	}
 }
 
+static int size_t_cmp(const void *_l, const void *_r)
+{
+	const size_t *l = _l, *r = _r;
+
+	return (*l > *r) - (*l < *r);
+}
+
 /**
  * bch_allocator_thread - move buckets from free_inc to reserves
  *
@@ -923,27 +924,13 @@ static int bch2_allocator_thread(void *arg)
 			__set_current_state(TASK_RUNNING);
 		}
 
-		down_read(&c->gc_lock);
-
-		/*
-		 * See if we have buckets we can reuse without invalidating them
-		 * or forcing a journal commit:
-		 */
-		//bch2_find_empty_buckets(c, ca);
-
-		if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
-			up_read(&c->gc_lock);
-			continue;
-		}
-
 		/* We've run out of free buckets! */
 
-		while (!fifo_full(&ca->free_inc)) {
-			if (wait_buckets_available(ca)) {
-				up_read(&c->gc_lock);
-				goto out;
-			}
+		BUG_ON(fifo_used(&ca->free_inc));
+		ca->free_inc.front = ca->free_inc.back = 0;
 
+		down_read(&c->gc_lock);
+		while (1) {
 			/*
 			 * Find some buckets that we can invalidate, either
 			 * they're completely unused, or only contain clean data
@@ -952,12 +939,38 @@ static int bch2_allocator_thread(void *arg)
 			 */
 			invalidate_buckets(ca);
+
 			trace_alloc_batch(ca, fifo_used(&ca->free_inc),
-					  ca->free_inc.size);
-		}
+					  ca->free_inc.size);
+
+			if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
+			     (!fifo_full(&ca->free_inc) &&
+			      ca->inc_gen_really_needs_gc >=
+			      fifo_free(&ca->free_inc))) &&
+			    c->gc_thread) {
+				atomic_inc(&c->kick_gc);
+				wake_up_process(c->gc_thread);
+			}
+
+			if (fifo_full(&ca->free_inc))
+				break;
 
+			if (wait_buckets_available(c, ca)) {
+				up_read(&c->gc_lock);
+				goto out;
+			}
+		}
 		up_read(&c->gc_lock);
 
+		BUG_ON(ca->free_inc.front);
+
+		spin_lock(&ca->freelist_lock);
+		sort(ca->free_inc.data,
+		     ca->free_inc.back,
+		     sizeof(ca->free_inc.data[0]),
+		     size_t_cmp, NULL);
+		spin_unlock(&ca->freelist_lock);
+
 		/*
 		 * free_inc is full of newly-invalidated buckets, must write out
 		 * prios and gens before they can be re-used
@@ -1022,8 +1035,8 @@ out:
 
 	g = ca->buckets + r;
 
-	g->read_prio = ca->fs->prio_clock[READ].hand;
-	g->write_prio = ca->fs->prio_clock[WRITE].hand;
+	g->prio[READ] = ca->fs->prio_clock[READ].hand;
+	g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand;
 
 	return r;
 }
@@ -1031,9 +1044,6 @@ out:
 static void __bch2_bucket_free(struct bch_dev *ca, struct bucket *g)
 {
 	bch2_mark_free_bucket(ca, g);
-
-	g->read_prio = ca->fs->prio_clock[READ].hand;
-	g->write_prio = ca->fs->prio_clock[WRITE].hand;
 }
 
 enum bucket_alloc_ret {
@@ -1614,8 +1624,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 	unsigned i, j;
 
 	for_each_online_member(ca, c, i) {
-		struct backing_dev_info *bdi =
-			blk_get_backing_dev_info(ca->disk_sb.bdev);
+		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
 
 		ra_pages += bdi->ra_pages;
 	}
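The new bucket_sort_key() packs two quantities into one unsigned long so a single comparison orders the heap. A sketch of the layout (assuming bucket_gc_gen() fits in the low 8 bits, which the BUCKET_GC_GEN_MAX checks above imply; example_key is hypothetical):

    /*
     *   bits 8..BITS_PER_LONG-1: (hotness + 1) * sectors used - the
     *                            cheapest evictions (cold, mostly-empty
     *                            buckets) sort first
     *   bits 0..7:               bucket_gc_gen(), so ties break toward
     *                            buckets whose gens are closest to
     *                            forcing a mark-and-sweep GC
     */
    static unsigned long example_key(unsigned long hotness,
    				 unsigned long sectors_used,
    				 unsigned char gc_gen)
    {
    	return (((hotness + 1) * sectors_used) << 8) | gc_gen;
    }

The comparator's (l.key > r.key) - (l.key < r.key) idiom matters for the same reason size_t_cmp uses it: returning l.key - r.key would truncate an unsigned long difference to the comparator's int return type, which can flip its sign - the replaced bucket_idx_cmp had exactly that hazard.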
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 1bf48ef9..ae58d083 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -99,4 +99,11 @@ struct write_point {
 	 */
 };
 
+struct alloc_heap_entry {
+	size_t		bucket;
+	unsigned long	key;
+};
+
+typedef HEAP(struct alloc_heap_entry) alloc_heap;
+
 #endif /* _BCACHE_ALLOC_TYPES_H */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 6259b50e..977ac364 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -240,8 +240,6 @@ do {							\
 		"btree node it traverses")		\
 	BCH_DEBUG_PARAM(btree_gc_rewrite_disabled,	\
 		"Disables rewriting of btree nodes during mark and sweep")\
-	BCH_DEBUG_PARAM(btree_gc_coalesce_disabled,	\
-		"Disables coalescing of btree nodes")	\
 	BCH_DEBUG_PARAM(btree_shrinker_disabled,	\
 		"Disables the shrinker callback for the btree node cache")
 
@@ -273,7 +271,6 @@ do {							\
 #define BCH_TIME_STATS()				\
 	BCH_TIME_STAT(btree_node_mem_alloc, sec, us)	\
 	BCH_TIME_STAT(btree_gc,		sec, ms)	\
-	BCH_TIME_STAT(btree_coalesce,	sec, ms)	\
 	BCH_TIME_STAT(btree_split,	sec, us)	\
 	BCH_TIME_STAT(btree_sort,	ms, us)		\
 	BCH_TIME_STAT(btree_read,	ms, us)		\
@@ -417,8 +414,9 @@ struct bch_dev {
 	atomic_long_t		saturated_count;
 	size_t			inc_gen_needs_gc;
+	size_t			inc_gen_really_needs_gc;
 
-	bucket_heap		alloc_heap;
+	alloc_heap		alloc_heap;
 	bucket_heap		copygc_heap;
 
 	/* Moving GC: */
@@ -681,6 +679,7 @@ struct bch_fs {
 	/* GARBAGE COLLECTION */
 	struct task_struct	*gc_thread;
 	atomic_t		kick_gc;
+	unsigned long		gc_count;
 
 	/*
 	 * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
@@ -716,7 +715,7 @@ struct bch_fs {
 	mempool_t		compression_bounce[2];
 
 	struct crypto_shash	*sha256;
-	struct crypto_blkcipher	*chacha20;
+	struct crypto_skcipher	*chacha20;
 	struct crypto_shash	*poly1305;
 
 	atomic64_t		key_version;
@@ -762,6 +761,7 @@ struct bch_fs {
 	/* The rest of this all shows up in sysfs */
 	atomic_long_t		read_realloc_races;
 
+	unsigned		btree_gc_periodic:1;
 	unsigned		foreground_write_ratelimit_enabled:1;
 	unsigned		copy_gc_enabled:1;
 	unsigned		tiering_enabled:1;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index ef854fb1..2d64bcae 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1208,8 +1208,7 @@ struct prio_set {
 	__le64			next_bucket;
 
 	struct bucket_disk {
-		__le16		read_prio;
-		__le16		write_prio;
+		__le16		prio[2];
 		__u8		gen;
 	} __attribute__((packed)) data[];
 } __attribute__((packed, aligned(8)));
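Collapsing read_prio/write_prio into prio[2] - both in the on-disk struct bucket_disk here and in the in-memory struct bucket in buckets_types.h below - leans on the kernel's convention that READ is 0 and WRITE is 1, so an I/O direction can index the array directly. A hedged sketch of the resulting call-site pattern (copy_prios_to_disk is hypothetical; bch2_prio_write() above writes the two entries out explicitly):

    /* READ == 0 and WRITE == 1 in the kernel, so a direction doubles
     * as an array index: */
    static void copy_prios_to_disk(struct bucket_disk *d,
    			       const struct bucket *g)
    {
    	unsigned rw;

    	for (rw = READ; rw <= WRITE; rw++)
    		d->prio[rw] = cpu_to_le16(g->prio[rw]);
    }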
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 99d28f64..d907ef58 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -166,14 +166,14 @@ fsck_err:
 	return ret;
 }
 
-static bool btree_gc_mark_node(struct bch_fs *c, struct btree *b)
+static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
 {
-	if (btree_node_has_ptrs(b)) {
-		struct btree_node_iter iter;
-		struct bkey unpacked;
-		struct bkey_s_c k;
-		u8 stale = 0;
+	struct btree_node_iter iter;
+	struct bkey unpacked;
+	struct bkey_s_c k;
+	u8 stale = 0;
 
+	if (btree_node_has_ptrs(b))
 		for_each_btree_node_key_unpack(b, k, &iter,
 					       btree_node_is_extents(b),
 					       &unpacked) {
@@ -182,17 +182,7 @@ static bool btree_gc_mark_node(struct bch_fs *c, struct btree *b)
 						  btree_node_type(b), k));
 		}
 
-		if (btree_gc_rewrite_disabled(c))
-			return false;
-
-		if (stale > 10)
-			return true;
-	}
-
-	if (btree_gc_always_rewrite(c))
-		return true;
-
-	return false;
+	return stale;
 }
 
 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
@@ -212,10 +202,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
 {
 	struct btree_iter iter;
 	struct btree *b;
-	bool should_rewrite;
 	struct range_checks r;
 	unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
-	int ret;
+	unsigned max_stale;
+	int ret = 0;
 
 	/*
 	 * if expensive_debug_checks is on, run range_checks on all leaf nodes:
@@ -231,12 +221,21 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
 
 		bch2_verify_btree_nr_keys(b);
 
-		should_rewrite = btree_gc_mark_node(c, b);
+		max_stale = btree_gc_mark_node(c, b);
 
 		gc_pos_set(c, gc_pos_btree_node(b));
 
-		if (should_rewrite)
-			bch2_btree_node_rewrite(&iter, b, NULL);
+		if (max_stale > 32)
+			bch2_btree_node_rewrite(c, &iter,
+					b->data->keys.seq,
+					BTREE_INSERT_USE_RESERVE|
+					BTREE_INSERT_GC_LOCK_HELD);
+		else if (!btree_gc_rewrite_disabled(c) &&
+			 (btree_gc_always_rewrite(c) || max_stale > 16))
+			bch2_btree_node_rewrite(c, &iter,
+					b->data->keys.seq,
+					BTREE_INSERT_NOWAIT|
+					BTREE_INSERT_GC_LOCK_HELD);
 
 		bch2_btree_iter_cond_resched(&iter);
 	}
@@ -507,6 +506,7 @@ void bch2_gc(struct bch_fs *c)
 	/* Indicates that gc is no longer in progress: */
 	gc_pos_set(c, gc_phase(GC_PHASE_DONE));
+	c->gc_count++;
 
 	up_write(&c->gc_lock);
 	trace_gc_end(c);
@@ -835,7 +835,6 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
  */
 void bch2_coalesce(struct bch_fs *c)
 {
-	u64 start_time;
 	enum btree_id id;
 
 	if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
@@ -843,7 +842,6 @@ void bch2_coalesce(struct bch_fs *c)
 
 	down_read(&c->gc_lock);
 	trace_gc_coalesce_start(c);
-	start_time = local_clock();
 
 	for (id = 0; id < BTREE_ID_NR; id++) {
 		int ret = c->btree_roots[id].b
@@ -858,7 +856,6 @@ void bch2_coalesce(struct bch_fs *c)
 		}
 	}
 
-	bch2_time_stats_update(&c->btree_coalesce_time, start_time);
 	trace_gc_coalesce_end(c);
 	up_read(&c->gc_lock);
 }
@@ -873,9 +870,7 @@ static int bch2_gc_thread(void *arg)
 	set_freezable();
 
 	while (1) {
-		unsigned long next = last + c->capacity / 16;
-
-		while (atomic_long_read(&clock->now) < next) {
+		while (1) {
 			set_current_state(TASK_INTERRUPTIBLE);
 
 			if (kthread_should_stop()) {
@@ -883,21 +878,28 @@ static int bch2_gc_thread(void *arg)
 				return 0;
 			}
 
-			if (atomic_read(&c->kick_gc) != last_kick) {
-				__set_current_state(TASK_RUNNING);
+			if (atomic_read(&c->kick_gc) != last_kick)
 				break;
+
+			if (c->btree_gc_periodic) {
+				unsigned long next = last + c->capacity / 16;
+
+				if (atomic_long_read(&clock->now) >= next)
+					break;
+
+				bch2_io_clock_schedule_timeout(clock, next);
+			} else {
+				schedule();
 			}
 
-			bch2_io_clock_schedule_timeout(clock, next);
 			try_to_freeze();
 		}
 
+		__set_current_state(TASK_RUNNING);
+
 		last = atomic_long_read(&clock->now);
 		last_kick = atomic_read(&c->kick_gc);
 
 		bch2_gc(c);
-		if (!btree_gc_coalesce_disabled(c))
-			bch2_coalesce(c);
 
 		debug_check_no_locks_held();
 	}
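btree_gc_mark_node() now reports how stale the node's most-stale pointer is instead of a yes/no rewrite decision, and bch2_gc_btree() turns that into a two-tier policy. Restated as a hypothetical helper (the logic lives inline in the hunk above):

    static unsigned gc_rewrite_flags(struct bch_fs *c, unsigned max_stale)
    {
    	/* Badly stale: the rewrite must happen, so dip into the btree
    	 * reserve and block until it succeeds: */
    	if (max_stale > 32)
    		return BTREE_INSERT_USE_RESERVE|BTREE_INSERT_GC_LOCK_HELD;

    	/* Mildly stale: the rewrite is opportunistic, so NOWAIT - GC
    	 * must not block on an allocator that may be waiting on GC: */
    	if (!btree_gc_rewrite_disabled(c) &&
    	    (btree_gc_always_rewrite(c) || max_stale > 16))
    		return BTREE_INSERT_NOWAIT|BTREE_INSERT_GC_LOCK_HELD;

    	return 0; /* leave the node alone */
    }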
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 2f67c092..8a4ee6d1 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -533,6 +533,9 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
 	if (flags & BTREE_INSERT_NOFAIL)
 		disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
 
+	if (flags & BTREE_INSERT_NOWAIT)
+		cl = NULL;
+
 	/*
 	 * This check isn't necessary for correctness - it's just to potentially
 	 * prevent us from doing a lot of work that'll end up being wasted:
@@ -2279,30 +2282,13 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
 	return ret;
 }
 
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- *
- * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
- * btree_check_reserve() has to wait)
- */
-int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
-			    struct closure *cl)
+static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+				struct btree *b, unsigned flags,
+				struct closure *cl)
 {
-	struct bch_fs *c = iter->c;
 	struct btree *n, *parent = iter->nodes[b->level + 1];
 	struct btree_reserve *reserve;
 	struct btree_interior_update *as;
-	unsigned flags = BTREE_INSERT_NOFAIL;
-
-	/*
-	 * if caller is going to wait if allocating reserve fails, then this is
-	 * a rewrite that must succeed:
-	 */
-	if (cl)
-		flags |= BTREE_INSERT_USE_RESERVE;
-
-	if (!bch2_btree_iter_set_locks_want(iter, U8_MAX))
-		return -EINTR;
 
 	reserve = bch2_btree_reserve_get(c, b, 0, flags, cl);
 	if (IS_ERR(reserve)) {
@@ -2341,3 +2327,57 @@ int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
 	bch2_btree_reserve_put(c, reserve);
 	return 0;
 }
+
+/**
+ * bch_btree_node_rewrite - Rewrite/move a btree node
+ *
+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
+ * btree_check_reserve() has to wait)
+ */
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+			    __le64 seq, unsigned flags)
+{
+	unsigned locks_want = iter->locks_want;
+	struct closure cl;
+	struct btree *b;
+	int ret;
+
+	flags |= BTREE_INSERT_NOFAIL;
+
+	closure_init_stack(&cl);
+
+	bch2_btree_iter_set_locks_want(iter, U8_MAX);
+
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
+		if (!down_read_trylock(&c->gc_lock)) {
+			bch2_btree_iter_unlock(iter);
+			down_read(&c->gc_lock);
+		}
+	}
+
+	while (1) {
+		ret = bch2_btree_iter_traverse(iter);
+		if (ret)
+			break;
+
+		b = bch2_btree_iter_peek_node(iter);
+		if (!b || b->data->keys.seq != seq)
+			break;
+
+		ret = __btree_node_rewrite(c, iter, b, flags, &cl);
+		if (ret != -EAGAIN &&
+		    ret != -EINTR)
+			break;
+
+		bch2_btree_iter_unlock(iter);
+		closure_sync(&cl);
+	}
+
+	bch2_btree_iter_set_locks_want(iter, locks_want);
+
+	if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+		up_read(&c->gc_lock);
+
+	closure_sync(&cl);
+	return ret;
+}
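The rewritten bch2_btree_node_rewrite() identifies its target by the node's keys.seq rather than by a struct btree pointer. That is what lets the retry loop drop every lock while waiting on a reserve, then re-traverse and simply give up if someone else rewrote the node first - the callers in journal.c and migrate.c below lose their hand-rolled redo_peek/retry labels as a result. The shape of the loop, simplified from the hunk above:

    while (1) {
    	ret = bch2_btree_iter_traverse(iter);	/* retake locks */
    	if (ret)
    		break;

    	b = bch2_btree_iter_peek_node(iter);
    	if (!b || b->data->keys.seq != seq)
    		break;				/* already rewritten: done */

    	ret = __btree_node_rewrite(c, iter, b, flags, &cl);
    	if (ret != -EAGAIN && ret != -EINTR)
    		break;				/* success or hard error */

    	bch2_btree_iter_unlock(iter);		/* drop locks, then wait */
    	closure_sync(&cl);
    }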
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index a933d5a9..7c4abe4a 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -380,6 +380,10 @@ int __bch2_btree_insert_at(struct btree_insert *);
  */
 #define BTREE_INSERT_JOURNAL_REPLAY	(1 << 3)
 
+/* Don't block on allocation failure (for new btree nodes: */
+#define BTREE_INSERT_NOWAIT		(1 << 4)
+#define BTREE_INSERT_GC_LOCK_HELD	(1 << 5)
+
 int bch2_btree_delete_at(struct btree_iter *, unsigned);
 
 int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
@@ -416,7 +420,8 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
 			    struct disk_reservation *,
 			    struct extent_insert_hook *, u64 *);
 
-int bch2_btree_node_rewrite(struct btree_iter *, struct btree *, struct closure *);
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
+			    __le64, unsigned);
 
 #endif /* _BCACHE_BTREE_INSERT_H */
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 18bf1713..68f863f3 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -50,13 +50,7 @@ struct bucket_mark {
 };
 
 struct bucket {
-	union {
-		struct {
-			u16	read_prio;
-			u16	write_prio;
-		};
-		u16		prio[2];
-	};
+	u16			prio[2];
 
 	union {
 		struct bucket_mark _mark;
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 4545a499..f2883e1f 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -178,18 +178,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
 	}
 }
 
-static inline void do_encrypt_sg(struct crypto_blkcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
 				 struct nonce nonce,
 				 struct scatterlist *sg, size_t len)
 {
-	struct blkcipher_desc desc = { .tfm = tfm, .info = nonce.d };
+	SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	int ret;
 
-	ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+	skcipher_request_set_tfm(req, tfm);
+	skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+
+	ret = crypto_skcipher_encrypt(req);
 	BUG_ON(ret);
 }
 
-static inline void do_encrypt(struct crypto_blkcipher *tfm,
+static inline void do_encrypt(struct crypto_skcipher *tfm,
 			      struct nonce nonce,
 			      void *buf, size_t len)
 {
@@ -202,20 +205,20 @@ static inline void do_encrypt(struct crypto_blkcipher *tfm,
 int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
 			    void *buf, size_t len)
 {
-	struct crypto_blkcipher *chacha20 =
-		crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
+	struct crypto_skcipher *chacha20 =
+		crypto_alloc_skcipher("chacha20", 0, 0);
 	int ret;
 
 	if (!chacha20)
 		return PTR_ERR(chacha20);
 
-	ret = crypto_blkcipher_setkey(chacha20, (void *) key, sizeof(*key));
+	ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
 	if (ret)
 		goto err;
 
 	do_encrypt(chacha20, nonce, buf, len);
 err:
-	crypto_free_blkcipher(chacha20);
+	crypto_free_skcipher(chacha20);
 	return ret;
 }
 
@@ -377,7 +380,7 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
 		return PTR_ERR(keyring_key);
 
 	down_read(&keyring_key->sem);
-	ukp = user_key_payload(keyring_key);
+	ukp = dereference_key_locked(keyring_key);
 	if (ukp->datalen == sizeof(*key)) {
 		memcpy(key, ukp->data, ukp->datalen);
 		ret = 0;
@@ -454,8 +457,7 @@ err:
 static int bch2_alloc_ciphers(struct bch_fs *c)
 {
 	if (!c->chacha20)
-		c->chacha20 = crypto_alloc_blkcipher("chacha20", 0,
-						     CRYPTO_ALG_ASYNC);
+		c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
 	if (IS_ERR(c->chacha20))
 		return PTR_ERR(c->chacha20);
 
@@ -532,7 +534,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
 			goto err;
 	}
 
-	ret = crypto_blkcipher_setkey(c->chacha20,
+	ret = crypto_skcipher_setkey(c->chacha20,
 			(void *) &key.key, sizeof(key.key));
 	if (ret)
 		goto err;
@@ -560,7 +562,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
 	if (!IS_ERR_OR_NULL(c->poly1305))
 		crypto_free_shash(c->poly1305);
 	if (!IS_ERR_OR_NULL(c->chacha20))
-		crypto_free_blkcipher(c->chacha20);
+		crypto_free_skcipher(c->chacha20);
 	if (!IS_ERR_OR_NULL(c->sha256))
 		crypto_free_shash(c->sha256);
 }
@@ -587,7 +589,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
 	if (ret)
 		goto err;
 
-	ret = crypto_blkcipher_setkey(c->chacha20,
+	ret = crypto_skcipher_setkey(c->chacha20,
 			(void *) &key.key, sizeof(key.key));
 err:
 	memzero_explicit(&key, sizeof(key));
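The crypto churn here is the 4.11-era migration from the removed blkcipher interface to skcipher. For a synchronous cipher like "chacha20" allocated with crypto_alloc_skcipher(name, 0, 0), a one-shot encrypt looks like the following sketch - a slightly fuller variant of do_encrypt_sg() above, where the set_callback/zero calls are conventional extras the patch omits:

    static int chacha20_encrypt_one(struct crypto_skcipher *tfm,
    				struct scatterlist *sg, unsigned len,
    				void *iv)
    {
    	SKCIPHER_REQUEST_ON_STACK(req, tfm);	/* sync request, no kmalloc */
    	int ret;

    	skcipher_request_set_tfm(req, tfm);
    	skcipher_request_set_callback(req, 0, NULL, NULL);
    	skcipher_request_set_crypt(req, sg, sg, len, iv); /* in-place */

    	ret = crypto_skcipher_encrypt(req);
    	skcipher_request_zero(req);		/* scrub the stack request */
    	return ret;
    }

The dereference_key_locked() change appears to track the same kernel version's key API split: the caller holds key->sem via down_read(), which is exactly the precondition that accessor asserts.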
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 547ea732..62b42042 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -148,9 +148,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 
 	switch (crc.compression_type) {
 	case BCH_COMPRESSION_LZ4:
-		ret = lz4_decompress(src_data, &src_len,
-				     dst_data, dst_len);
-		if (ret) {
+		ret = LZ4_decompress_safe(src_data, dst_data,
+					  src_len, dst_len);
+
+		if (ret != dst_len) {
 			ret = -EIO;
 			goto err;
 		}
@@ -286,32 +287,27 @@ static int __bio_compress(struct bch_fs *c,
 	switch (compression_type) {
 	case BCH_COMPRESSION_LZ4: {
 		void *workspace;
-
-		*dst_len = dst->bi_iter.bi_size;
-		*src_len = src->bi_iter.bi_size;
+		int srclen = src->bi_iter.bi_size;
+		ret = 0;
 
 		workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
 
-		while (*src_len > block_bytes(c) &&
-		       (ret = lz4_compress(src_data, *src_len,
-					   dst_data, dst_len,
-					   workspace))) {
-			/*
-			 * On error, the compressed data was bigger than
-			 * dst_len, and -ret is the amount of data we were able
-			 * to compress - round down to nearest block and try
-			 * again:
-			 */
-			BUG_ON(ret > 0);
-			BUG_ON(-ret >= *src_len);
-
-			*src_len = round_down(-ret, block_bytes(c));
+		while (srclen > block_bytes(c) &&
+		       (ret = LZ4_compress_destSize(src_data, dst_data,
+						    &srclen, dst->bi_iter.bi_size,
+						    workspace)) &&
+		       (srclen & (block_bytes(c) - 1))) {
+			/* Round down to nearest block and try again: */
+			srclen = round_down(srclen, block_bytes(c));
 		}
 
 		mempool_free(workspace, &c->lz4_workspace_pool);
 
-		if (ret)
+		if (!ret)
 			goto err;
+
+		*src_len = srclen;
+		*dst_len = ret;
 		break;
 	}
 	case BCH_COMPRESSION_GZIP: {
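The LZ4 rework tracks the kernel's 4.11 switch to the upstream LZ4 API, whose return conventions are inverted relative to the old wrappers - hence the `if (ret)` to `if (!ret)` flip. Restated (kernel lib/lz4 prototypes, abbreviated):

    /*
     * int LZ4_compress_destSize(const char *src, char *dst,
     *			      int *srcSizePtr, int targetDstSize,
     *			      void *wrkmem);
     *   - returns bytes written to dst, or 0 on failure
     *   - shrinks *srcSizePtr to the input actually consumed
     *
     * int LZ4_decompress_safe(const char *src, char *dst,
     *			    int compressedSize, int maxDecompressedSize);
     *   - returns bytes decompressed, negative on error - which is why
     *     __bio_uncompress() now checks ret != dst_len
     */

The compress loop keeps calling LZ4_compress_destSize() until the consumed length lands on a block boundary, rounding srclen down each pass: an extent must decompress to a whole number of filesystem blocks, so a partially consumed trailing block is useless.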
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 219b60a3..57bfb4a6 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -559,10 +559,10 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 	return;
 err:
 	bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
-	bch2_fs_bug(c, "%s btree pointer %s: bucket %zi prio %i "
+	bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
 		    "gen %i last_gc %i mark %08x",
 		    err, buf, PTR_BUCKET_NR(ca, ptr),
-		    g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
+		    PTR_BUCKET(ca, ptr)->mark.gen,
 		    ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
 		    (unsigned) g->mark.counter);
 }
@@ -1769,10 +1769,9 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 bad_ptr:
 	bch2_bkey_val_to_text(c, btree_node_type(b), buf,
 			      sizeof(buf), e.s_c);
-	bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
+	bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
 		    "gen %i last_gc %i mark 0x%08x",
-		    buf, PTR_BUCKET_NR(ca, ptr),
-		    g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
+		    buf, PTR_BUCKET_NR(ca, ptr), PTR_BUCKET(ca, ptr)->mark.gen,
 		    ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
 		    (unsigned) g->mark.counter);
 	return;
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 4a680ade..803611d1 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -757,7 +757,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
 			flags |= BCH_READ_IS_LAST;
 
 		if (pick.ca) {
-			PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+			PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] =
 				c->prio_clock[READ].hand;
 
 			bch2_read_extent(c, rbio, k, &pick, flags);
@@ -1775,16 +1775,17 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	return ret;
 }
 
-int bch2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+int bch2_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct file *file = vmf->vma->vm_file;
+	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
 	struct bch_fs *c = inode->i_sb->s_fs_info;
 	int ret = VM_FAULT_LOCKED;
 
 	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	file_update_time(file);
 
 	/*
 	 * Not strictly necessary, but helps avoid dio writes livelocking in
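The bch2_page_mkwrite() change is purely the 4.11 fault-handler API: vm_operations_struct callbacks lost their struct vm_area_struct argument, and the vma now travels in struct vm_fault. The handler shape after the change (illustrative skeleton, not the full function):

    static int example_page_mkwrite(struct vm_fault *vmf)
    {
    	struct file *file = vmf->vma->vm_file;	/* was: vma->vm_file */
    	struct inode *inode = file_inode(file);

    	sb_start_pagefault(inode->i_sb);
    	file_update_time(file);
    	/* ... lock vmf->page, mark it dirty ... */
    	sb_end_pagefault(inode->i_sb);
    	return VM_FAULT_LOCKED;
    }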
diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h
index f3fcf947..3fcc1e7d 100644
--- a/libbcachefs/fs-io.h
+++ b/libbcachefs/fs-io.h
@@ -29,7 +29,7 @@ long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
 
 loff_t bch2_llseek(struct file *, loff_t, int);
 
-int bch2_page_mkwrite(struct vm_area_struct *, struct vm_fault *);
+int bch2_page_mkwrite(struct vm_fault *);
 void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
 int bch2_releasepage(struct page *, gfp_t);
 int bch2_migrate_page(struct address_space *, struct page *,
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index d588f6ab..1145a190 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1300,11 +1300,11 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
 			flags |= BCH_READ_IS_LAST;
 
 		if (pick.ca) {
-			PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
+			PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] =
 				c->prio_clock[READ].hand;
 
 			bch2_read_extent_iter(c, rbio, bvec_iter,
-					      k, &pick, flags);
+					k, &pick, flags);
 
 			flags &= ~BCH_READ_MAY_REUSE_BIO;
 		} else {
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 510066a2..92364fea 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -164,21 +164,15 @@ static void journal_seq_blacklist_flush(struct journal *j,
 		mutex_unlock(&j->blacklist_lock);
 
 		__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
-redo_peek:
+
 		b = bch2_btree_iter_peek_node(&iter);
 
 		/* The node might have already been rewritten: */
 
 		if (b->data->keys.seq == n.seq) {
-			ret = bch2_btree_node_rewrite(&iter, b, &cl);
+			ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
 			if (ret) {
 				bch2_btree_iter_unlock(&iter);
-				closure_sync(&cl);
-
-				if (ret == -EAGAIN ||
-				    ret == -EINTR)
-					goto redo_peek;
-
 				bch2_fs_fatal_error(c,
 					"error %i rewriting btree node with blacklisted journal seq",
 					ret);
@@ -190,8 +184,6 @@ redo_peek:
 		bch2_btree_iter_unlock(&iter);
 	}
 
-	closure_sync(&cl);
-
 	for (i = 0;; i++) {
 		struct btree_interior_update *as;
 		struct pending_btree_node_free *d;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 8680b100..8c9e3c25 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -156,9 +156,9 @@ next:
  * This walks the btree, and for any node on the relevant device it moves the
  * node elsewhere.
  */
-static int bch2_move_btree_off(struct bch_dev *ca, enum btree_id id)
+static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
+			       enum btree_id id)
 {
-	struct bch_fs *c = ca->fs;
 	struct btree_iter iter;
 	struct closure cl;
 	struct btree *b;
@@ -170,22 +170,11 @@ static int bch2_move_btree_off(struct bch_dev *ca, enum btree_id id)
 
 	for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
 		struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-retry:
+
 		if (!bch2_extent_has_device(e, ca->dev_idx))
 			continue;
 
-		ret = bch2_btree_node_rewrite(&iter, b, &cl);
-		if (ret == -EINTR || ret == -ENOSPC) {
-			/*
-			 * Drop locks to upgrade locks or wait on
-			 * reserve: after retaking, recheck in case we
-			 * raced.
-			 */
-			bch2_btree_iter_unlock(&iter);
-			closure_sync(&cl);
-			b = bch2_btree_iter_peek_node(&iter);
-			goto retry;
-		}
+		ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
 		if (ret) {
 			bch2_btree_iter_unlock(&iter);
 			return ret;
@@ -268,7 +257,7 @@ int bch2_move_metadata_off_device(struct bch_dev *ca)
 	/* 1st, Move the btree nodes off the device */
 
 	for (i = 0; i < BTREE_ID_NR; i++) {
-		ret = bch2_move_btree_off(ca, i);
+		ret = bch2_move_btree_off(c, ca, i);
 		if (ret)
 			return ret;
 	}
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index fa020af3..130b130f 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -442,7 +442,7 @@ static const char *bch2_blkdev_open(const char *path, fmode_t mode,
 		return "failed to open device";
 
 	if (mode & FMODE_WRITE)
-		bdev_get_queue(bdev)->backing_dev_info.capabilities
+		bdev_get_queue(bdev)->backing_dev_info->capabilities
 			|= BDI_CAP_STABLE_WRITES;
 
 	*ret = bdev;
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 528538b5..2a3947e2 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -148,7 +148,7 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
 	if (bdi_bits & (1 << WB_sync_congested)) {
 		/* Reads - check all devices: */
 		for_each_readable_member(ca, c, i) {
-			bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+			bdi = ca->disk_sb.bdev->bd_bdi;
 
 			if (bdi_congested(bdi, bdi_bits)) {
 				ret = 1;
@@ -162,7 +162,7 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
 		rcu_read_lock();
 		group_for_each_dev(ca, grp, i) {
-			bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+			bdi = ca->disk_sb.bdev->bd_bdi;
 
 			if (bdi_congested(bdi, bdi_bits)) {
 				ret = 1;
@@ -1144,7 +1144,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 			    movinggc_reserve, GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
 	    !init_fifo(&ca->free_inc,	free_inc_reserve, GFP_KERNEL) ||
-	    !init_heap(&ca->alloc_heap,	heap_size, GFP_KERNEL) ||
+	    !init_heap(&ca->alloc_heap,	free_inc_reserve, GFP_KERNEL) ||
 	    !init_heap(&ca->copygc_heap,heap_size, GFP_KERNEL) ||
 	    !(ca->oldest_gens	= kvpmalloc(ca->mi.nbuckets *
 					    sizeof(u8),
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 3c47f1cb..edfa85b0 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -24,6 +24,7 @@
 #include <linux/blkdev.h>
 #include <linux/sort.h>
+#include <linux/sched/clock.h>
 
 #include "util.h"
 
@@ -124,6 +125,7 @@ write_attribute(trigger_journal_flush);
 write_attribute(trigger_btree_coalesce);
 write_attribute(trigger_gc);
 write_attribute(prune_cache);
+rw_attribute(btree_gc_periodic);
 
 read_attribute(uuid);
 read_attribute(minor);
@@ -319,6 +321,8 @@ SHOW(bch2_fs)
 	sysfs_print(read_realloc_races,
 		    atomic_long_read(&c->read_realloc_races));
 
+	sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
+
 	sysfs_printf(foreground_write_ratelimit_enabled, "%i",
 		     c->foreground_write_ratelimit_enabled);
 	sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
@@ -367,6 +371,14 @@ STORE(__bch2_fs)
 	sysfs_strtoul(foreground_write_ratelimit_enabled,
 		      c->foreground_write_ratelimit_enabled);
 
+	if (attr == &sysfs_btree_gc_periodic) {
+		ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
+			?: (ssize_t) size;
+
+		wake_up_process(c->gc_thread);
+		return ret;
+	}
+
 	if (attr == &sysfs_copy_gc_enabled) {
 		struct bch_dev *ca;
 		unsigned i;
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 6ffc9811..5400dec5 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -17,6 +17,7 @@
 #include <linux/seq_file.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/sched/clock.h>
 
 #include "util.h"
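One idiom worth calling out in the sysfs hunk above: strtoul_safe() evaluates to 0 on success or a negative errno, so `strtoul_safe(buf, var) ?: (ssize_t) size` (the GNU binary `?:` extension) collapses to "propagate the error, otherwise report the whole buffer consumed". Spelled out without the extension:

    /* Equivalent store logic, without the GNU ?: shorthand: */
    ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic);
    if (!ret)
    	ret = (ssize_t) size;	/* success: whole buffer consumed */
    wake_up_process(c->gc_thread);	/* apply the new setting now */
    return ret;

The unconditional wake_up_process() matters because, with periodic GC now optional, the GC thread may be parked in schedule() with no timeout; waking it lets a toggle of btree_gc_periodic take effect immediately.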
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index d7511aeb..927aa3a9 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -5,9 +5,9 @@
 #include <linux/blkdev.h>
 #include <linux/closure.h>
 #include <linux/errno.h>
-#include <linux/blkdev.h>
 #include <linux/freezer.h>
 #include <linux/kernel.h>
+#include <linux/sched/clock.h>
 #include <linux/llist.h>
 #include <linux/log2.h>
 #include <linux/ratelimit.h>