From 565b4a74d6c25c78b0d2b82d9529595fc6269308 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Fri, 12 May 2017 18:45:15 -0800
Subject: Update bcachefs sources to 14e9ac5016 bcachefs: btree_iter fastpath

---
 .bcachefs_revision            |   2 +-
 cmd_migrate.c                 |   9 +-
 include/linux/compiler.h      |   4 +
 include/linux/mempool.h       |  28 ++--
 include/linux/slab.h          |   5 -
 include/linux/vmalloc.h       |  37 ++++-
 libbcachefs/alloc.c           |   6 +-
 libbcachefs/bcachefs.h        |   4 +
 libbcachefs/bcachefs_format.h |   4 +-
 libbcachefs/bkey.c            |  29 +++-
 libbcachefs/btree_cache.c     |  51 ++++---
 libbcachefs/btree_cache.h     |  10 ++
 libbcachefs/btree_gc.c        |  10 +-
 libbcachefs/btree_io.c        | 142 ++++++++++++-------
 libbcachefs/btree_io.h        |   3 +-
 libbcachefs/btree_iter.c      |  39 +++++-
 libbcachefs/btree_iter.h      |  29 ++--
 libbcachefs/btree_types.h     |   4 +-
 libbcachefs/btree_update.c    | 316 ++++++++++++++++++++++++++++++++++--------
 libbcachefs/btree_update.h    |  13 +-
 libbcachefs/extents.c         |  31 +++++
 libbcachefs/extents.h         |  21 +--
 libbcachefs/fs-io.c           |  37 +++--
 libbcachefs/fs-io.h           |  10 +-
 libbcachefs/fs.c              |   5 +-
 libbcachefs/io.c              | 142 ++++++++----------
 libbcachefs/io.h              |   9 +-
 libbcachefs/io_types.h        |  32 ++---
 libbcachefs/journal.c         |  45 +++---
 libbcachefs/journal.h         |  55 ++++----
 libbcachefs/migrate.c         |  18 ++-
 libbcachefs/move.c            |  13 +-
 libbcachefs/move.h            |   1 -
 libbcachefs/super-io.c        | 118 ++++++++++++--
 libbcachefs/super-io.h        |  90 +-----------
 libbcachefs/super.c           |  20 ++-
 libbcachefs/util.c            |  14 ++
 libbcachefs/util.h            |  36 +++--
 linux/sched.c                 |   3 +
 39 files changed, 928 insertions(+), 517 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision index 8bc4e35..d2d0c51 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -43e3159567958ea70c8a95d98fdb6e881153a656 +14e9ac5016803fc63c1216608c866bef16b4053e diff --git a/cmd_migrate.c b/cmd_migrate.c index bf8f0be..82fa0f1 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -250,7 +250,6 @@ static void write_data(struct bch_fs *c, { struct disk_reservation res; struct bch_write_op op; - struct bch_write_bio bio; struct bio_vec bv; struct closure cl; @@ -259,15 +258,15 @@ static void write_data(struct bch_fs *c, closure_init_stack(&cl); - bio_init(&bio.bio, &bv, 1); - bio.bio.bi_iter.bi_size = len; - bch2_bio_map(&bio.bio, buf); + bio_init(&op.wbio.bio, &bv, 1); + op.wbio.bio.bi_iter.bi_size = len; + bch2_bio_map(&op.wbio.bio, buf); int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0); if (ret) die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_write_op_init(&op, c, &bio, res, c->write_points, + bch2_write_op_init(&op, c, res, c->write_points, POS(dst_inode->inum, dst_offset >> 9), NULL, 0); closure_call(&op.cl, bch2_write, NULL, &cl); closure_sync(&cl); diff --git a/include/linux/compiler.h b/include/linux/compiler.h index e5c31a6..915a6f8 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -166,4 +166,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) +#ifdef __x86_64 +#define CONFIG_X86_64 y +#endif + #endif /* _TOOLS_LINUX_COMPILER_H */ diff --git a/include/linux/mempool.h b/include/linux/mempool.h index ddf6f94..37d8149 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -10,8 +10,14 @@ struct kmem_cache; +typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data); +typedef void (mempool_free_t)(void *element, void *pool_data); + typedef struct mempool_s { - size_t elem_size; + 
size_t elem_size; + void *pool_data; + mempool_alloc_t *alloc; + mempool_free_t *free; } mempool_t; static inline bool mempool_initialized(mempool_t *pool) @@ -60,24 +66,22 @@ static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t return 0; } -static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) -{ - mempool_t *pool = malloc(sizeof(*pool)); - pool->elem_size = size; - return pool; -} - static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order) { pool->elem_size = PAGE_SIZE << order; return 0; } -static inline mempool_t *mempool_create_page_pool(int min_nr, int order) +static inline int mempool_init(mempool_t *pool, int min_nr, + mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, + void *pool_data) { - mempool_t *pool = malloc(sizeof(*pool)); - pool->elem_size = PAGE_SIZE << order; - return pool; + pool->elem_size = (size_t) pool_data; + pool->pool_data = pool_data; + pool->alloc = alloc_fn; + pool->free = free_fn; + return 0; } #endif /* _LINUX_MEMPOOL_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 58fb73e..d0d8790 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -43,9 +43,6 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) #define kcalloc(n, size, flags) calloc(n, size) #define kmalloc_array(n, size, flags) calloc(n, size) -#define vmalloc(size) malloc(size) -#define vzalloc(size) calloc(1, size) - #define kfree(p) free(p) #define kvfree(p) free(p) #define kzfree(p) free(p) @@ -89,8 +86,6 @@ do { \ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ -#define PAGE_KERNEL 0 - static inline void vunmap(const void *addr) {} static inline void *vmap(struct page **pages, unsigned int count, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb6284d..debdced 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -1,8 +1,41 @@ #ifndef __TOOLS_LINUX_VMALLOC_H #define __TOOLS_LINUX_VMALLOC_H -#define vmalloc(size) malloc(size) -#define __vmalloc(size, flags, prot) malloc(size) +#include <stdlib.h> +#include <sys/mman.h> + +#include "tools-util.h" + +#define PAGE_KERNEL 0 +#define PAGE_KERNEL_EXEC 1 + #define vfree(p) free(p) +static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask, unsigned prot) { + void *p = aligned_alloc(PAGE_SIZE, size); + + if (p && prot == PAGE_KERNEL_EXEC) { + if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) { + vfree(p); + p = NULL; + } + } + + if (p && (gfp_mask & __GFP_ZERO)) + memset(p, 0, size); + + return p; +} + +static inline void *vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); +} + +static inline void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL|__GFP_ZERO, PAGE_KERNEL); +} + #endif /* __TOOLS_LINUX_VMALLOC_H */ diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index a12c5d3..36dc947 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -361,7 +361,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, struct bucket *g, struct btree_iter *iter, u64 *journal_seq) { - struct bucket_mark m = READ_ONCE(g->mark); + struct bucket_mark m; __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; struct bkey_i_alloc *a; u8 *d; @@ -374,6 +374,8 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, if (ret) break; + /* read mark under btree node lock: */ + m = READ_ONCE(g->mark); a = bkey_alloc_init(&alloc_key.k); a->k.p = 
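/*
 * [Editorial aside -- not part of the patch] A minimal sketch of what the
 * new userspace __vmalloc() shim above is for: the bkey unpack JIT in
 * bkey.c needs executable memory, which it can now request with
 * PAGE_KERNEL_EXEC. The helper name below is illustrative only.
 */
static inline void *alloc_exec_buf_sketch(unsigned long size)
{
	/* RWX mapping; __vmalloc() returns NULL if mprotect() fails */
	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL_EXEC);
}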
iter->pos; a->v.fields = 0; @@ -407,8 +409,6 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) struct btree_iter iter; int ret; - lockdep_assert_held(&c->state_lock); - if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) return 0; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 9d04e89..4d0fc62 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -725,6 +725,10 @@ struct bch_fs { struct work_struct read_retry_work; spinlock_t read_retry_lock; + struct bio_list btree_write_error_list; + struct work_struct btree_write_error_work; + spinlock_t btree_write_error_lock; + /* ERRORS */ struct list_head fsck_errors; struct mutex fsck_error_lock; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 3f6d51a..125b6fa 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1082,7 +1082,8 @@ struct jset_entry { __le16 u64s; __u8 btree_id; __u8 level; - __le32 flags; /* designates what this jset holds */ + __u8 type; /* designates what this jset holds */ + __u8 pad[3]; union { struct bkey_i start[0]; @@ -1092,7 +1093,6 @@ struct jset_entry { #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) -LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8); enum { JOURNAL_ENTRY_BTREE_KEYS = 0, JOURNAL_ENTRY_BTREE_ROOT = 1, diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index b9ceb6e..cc76257 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -791,11 +791,9 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, unsigned dst_offset, unsigned dst_size, bool *eax_zeroed) { - unsigned byte = format->key_u64s * sizeof(u64); unsigned bits = format->bits_per_field[field]; u64 offset = format->field_offset[field]; - unsigned i, bit_offset = 0; - unsigned shl, shr; + unsigned i, byte, bit_offset, align, shl, shr; if (!bits && !offset) { if (!*eax_zeroed) { @@ -842,11 +840,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, return out; } + bit_offset = format->key_u64s * 64; for (i = 0; i <= field; i++) - bit_offset += format->bits_per_field[i]; + bit_offset -= format->bits_per_field[i]; - byte -= DIV_ROUND_UP(bit_offset, 8); - bit_offset = round_up(bit_offset, 8) - bit_offset; + byte = bit_offset / 8; + bit_offset -= byte * 8; *eax_zeroed = false; @@ -857,6 +856,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, /* movzx eax, WORD PTR [rsi + imm8] */ I4(0x0f, 0xb7, 0x46, byte); } else if (bit_offset + bits <= 32) { + align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 32); + /* mov eax, [rsi + imm8] */ I3(0x8b, 0x46, byte); @@ -874,6 +879,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, out += 4; } } else if (bit_offset + bits <= 64) { + align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 64); + /* mov rax, [rsi + imm8] */ I4(0x48, 0x8b, 0x46, byte); @@ -890,6 +901,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, I4(0x48, 0xc1, 0xe8, shr); } } else { + align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 96); + /* mov rax, [rsi + byte] */ I4(0x48, 0x8b, 0x46, byte); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index bdbe21a..d619f37 100644 --- 
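/*
 * [Editorial aside -- not part of the patch] The new offset arithmetic in
 * compile_bkey_field() above, spelled out: fields are packed from the high
 * bits of the key downward, so the patch now computes each field's bit
 * offset from the end of the packed key, then derives the byte address and
 * residual shift; the added 'align' step backs the load address up so the
 * 32/64-bit load stays within the key. A plain-C restatement:
 */
static unsigned bkey_field_offset_sketch(const struct bkey_format *f,
					 unsigned field, unsigned *bit_offset)
{
	unsigned i, bits = f->key_u64s * 64;

	for (i = 0; i <= field; i++)
		bits -= f->bits_per_field[i];

	*bit_offset = bits & 7;	/* shift within the starting byte */
	return bits >> 3;	/* byte offset to load from */
}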
a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -41,7 +41,7 @@ static void __mca_data_free(struct bch_fs *c, struct btree *b) { EBUG_ON(btree_node_write_in_flight(b)); - free_pages((unsigned long) b->data, btree_page_order(c)); + kvpfree(b->data, btree_bytes(c)); b->data = NULL; bch2_btree_keys_free(b); } @@ -53,8 +53,6 @@ static void mca_data_free(struct bch_fs *c, struct btree *b) list_move(&b->list, &c->btree_cache_freed); } -#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0]) - static const struct rhashtable_params bch_btree_cache_params = { .head_offset = offsetof(struct btree, hash), .key_offset = offsetof(struct btree, key.v), @@ -63,20 +61,18 @@ static const struct rhashtable_params bch_btree_cache_params = { static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { - unsigned order = ilog2(btree_pages(c)); - - b->data = (void *) __get_free_pages(gfp, order); + b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) goto err; - if (bch2_btree_keys_alloc(b, order, gfp)) + if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) goto err; c->btree_cache_used++; list_move(&b->list, &c->btree_cache_freeable); return; err: - free_pages((unsigned long) b->data, order); + kvpfree(b->data, btree_bytes(c)); b->data = NULL; list_move(&b->list, &c->btree_cache_freed); } @@ -91,7 +87,6 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp) six_lock_init(&b->lock); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); - INIT_LIST_HEAD(&b->reachable); mca_data_alloc(c, b, gfp); return b->data ? b : NULL; @@ -101,10 +96,6 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp) void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b) { - BUG_ON(btree_node_dirty(b)); - - b->nsets = 0; - rhashtable_remove_fast(&c->btree_cache_table, &b->hash, bch_btree_cache_params); @@ -112,23 +103,27 @@ void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b) bkey_i_to_extent(&b->key)->v._data[0] = 0; } +int __bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b) +{ + return rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash, + bch_btree_cache_params); +} + int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b, unsigned level, enum btree_id id) { int ret; + b->level = level; b->btree_id = id; - ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash, - bch_btree_cache_params); - if (ret) - return ret; - mutex_lock(&c->btree_cache_lock); - list_add(&b->list, &c->btree_cache); + ret = __bch2_btree_node_hash_insert(c, b); + if (!ret) + list_add(&b->list, &c->btree_cache); mutex_unlock(&c->btree_cache_lock); - return 0; + return ret; } __flatten @@ -155,8 +150,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (!six_trylock_write(&b->lock)) goto out_unlock_intent; - if (btree_node_write_error(b) || - btree_node_noevict(b)) + if (btree_node_noevict(b)) goto out_unlock; if (!btree_node_may_write(b)) @@ -328,7 +322,7 @@ void bch2_fs_btree_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); - free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c))); + kvpfree(c->verify_ondisk, btree_bytes(c)); #endif for (i = 0; i < BTREE_ID_NR; i++) @@ -384,8 +378,7 @@ int bch2_fs_btree_init(struct bch_fs *c) #ifdef CONFIG_BCACHEFS_DEBUG mutex_init(&c->verify_lock); - c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL, ilog2(btree_pages(c))); + c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); if 
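/*
 * [Editorial aside -- not part of the patch] kvpmalloc()/kvpfree(), which
 * the btree cache switches to above, are the kmalloc-or-vmalloc helpers
 * this commit adds to util.h; roughly along these lines (a simplified
 * sketch, not the exact implementation):
 */
static inline void *kvpmalloc_sketch(size_t size, gfp_t gfp_mask)
{
	return size <= PAGE_SIZE
		? kmalloc(size, gfp_mask)
		: __vmalloc(size, gfp_mask, PAGE_KERNEL);
}

static inline void kvpfree_sketch(void *p, size_t size)
{
	if (size <= PAGE_SIZE)
		kfree(p);
	else
		vfree(p);
}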
(!c->verify_ondisk) return -ENOMEM; @@ -510,7 +503,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) BUG_ON(!six_trylock_intent(&b->lock)); BUG_ON(!six_trylock_write(&b->lock)); out_unlock: - BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key)); + BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_write_in_flight(b)); list_del_init(&b->list); @@ -554,6 +547,12 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter, struct bch_fs *c = iter->c; struct btree *b; + /* + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ + BUG_ON(!btree_node_locked(iter, level + 1)); + b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) return b; diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index ca8e319..ea53d2b 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_types.h" +#include "extents.h" struct btree_iter; @@ -11,6 +12,7 @@ extern const char * const bch2_btree_ids[]; void bch2_recalc_btree_reserve(struct bch_fs *); void bch2_btree_node_hash_remove(struct bch_fs *, struct btree *); +int __bch2_btree_node_hash_insert(struct bch_fs *, struct btree *); int bch2_btree_node_hash_insert(struct bch_fs *, struct btree *, unsigned, enum btree_id); @@ -28,6 +30,14 @@ void bch2_btree_node_prefetch(struct btree_iter *, const struct bkey_i *, void bch2_fs_btree_exit(struct bch_fs *); int bch2_fs_btree_init(struct bch_fs *); +#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0]) + +/* is btree node in hash table? */ +static inline bool btree_node_hashed(struct btree *b) +{ + return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key); +} + #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ &(_c)->btree_cache_table), \ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 815260b..376edaf 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -621,12 +621,10 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]); /* Repack everything with @new_format and sort down to one bset */ - for (i = 0; i < nr_old_nodes; i++) { + for (i = 0; i < nr_old_nodes; i++) new_nodes[i] = __bch2_btree_node_alloc_replacement(c, old_nodes[i], - new_format, res); - list_add(&new_nodes[i]->reachable, &as->reachable_list); - } + new_format, as, res); /* * Conceptually we concatenate the nodes together and slice them @@ -663,7 +661,6 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], set_btree_bset_end(n1, n1->set); - list_del_init(&n2->reachable); six_unlock_write(&n2->lock); bch2_btree_node_free_never_inserted(c, n2); six_unlock_intent(&n2->lock); @@ -796,7 +793,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) memset(merge, 0, sizeof(merge)); __for_each_btree_node(&iter, c, btree_id, POS_MIN, - U8_MAX, 0, BTREE_ITER_PREFETCH, b) { + BTREE_MAX_DEPTH, 0, + BTREE_ITER_PREFETCH, b) { memmove(merge + 1, merge, sizeof(merge) - sizeof(merge[0])); memmove(lock_seq + 1, lock_seq, diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 571a814..eeb546e 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -56,9 +56,9 @@ static void btree_bounce_free(struct bch_fs *c, unsigned order, bool used_mempool, void *p) { if (used_mempool) - mempool_free(virt_to_page(p), &c->btree_bounce_pool); + mempool_free(p, 
&c->btree_bounce_pool); else - free_pages((unsigned long) p, order); + vpfree(p, PAGE_SIZE << order); } static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, @@ -66,7 +66,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, { void *p; - BUG_ON(1 << order > btree_pages(c)); + BUG_ON(order > btree_page_order(c)); *used_mempool = false; p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); @@ -74,7 +74,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, return p; *used_mempool = true; - return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO)); + return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); } typedef int (*sort_cmp_fn)(struct btree *, @@ -1183,7 +1183,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, if (bne->keys.seq == b->data->keys.seq) goto err; - sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool); + sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool); sorted->keys.u64s = 0; b->nr = btree_node_is_extents(b) @@ -1199,7 +1199,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, BUG_ON(b->nr.live_u64s != u64s); - btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted); + btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); bch2_bset_build_aux_tree(b, b->set, false); @@ -1344,50 +1344,100 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); - /* - * Before calling bch2_btree_complete_write() - if the write errored, we - * have to halt new journal writes before they see this btree node - * write as completed: - */ - if (btree_node_write_error(b)) - bch2_journal_halt(&c->journal); - bch2_btree_complete_write(c, b, w); btree_node_io_unlock(b); } -static void btree_node_write_endio(struct bio *bio) +static void bch2_btree_node_write_error(struct bch_fs *c, + struct bch_write_bio *wbio) { - struct btree *b = bio->bi_private; - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_fs *c = wbio->c; - struct bio *orig = wbio->split ? wbio->orig : NULL; - struct closure *cl = !wbio->split ? wbio->cl : NULL; - struct bch_dev *ca = wbio->ca; + struct btree *b = wbio->bio.bi_private; + struct closure *cl = wbio->cl; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct bkey_i_extent *new_key; + + bkey_copy(&tmp.k, &b->key); + new_key = bkey_i_to_extent(&tmp.k); + + while (wbio->replicas_failed) { + unsigned idx = __fls(wbio->replicas_failed); + + bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx); + wbio->replicas_failed ^= 1 << idx; + } + + if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) || + bch2_btree_node_update_key(c, b, new_key)) { + set_btree_node_noevict(b); + bch2_fatal_error(c); + } + + bio_put(&wbio->bio); + btree_node_write_done(c, b); + if (cl) + closure_put(cl); +} + +void bch2_btree_write_error_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + btree_write_error_work); + struct bio *bio; + + while (1) { + spin_lock_irq(&c->btree_write_error_lock); + bio = bio_list_pop(&c->btree_write_error_list); + spin_unlock_irq(&c->btree_write_error_lock); - if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") || + if (!bio) + break; + + bch2_btree_node_write_error(c, to_wbio(bio)); + } +} + +static void btree_node_write_endio(struct bio *bio) +{ + struct btree *b = bio->bi_private; + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; + struct bch_write_bio *orig = parent ?: wbio; + struct closure *cl = !wbio->split ? wbio->cl : NULL; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = wbio->ca; + + if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, "btree write") || bch2_meta_write_fault("btree")) - set_btree_node_write_error(b); + set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed); if (wbio->have_io_ref) percpu_ref_put(&ca->io_ref); - if (wbio->bounce) - btree_bounce_free(c, - wbio->order, - wbio->used_mempool, - page_address(bio->bi_io_vec[0].bv_page)); - - if (wbio->put_bio) + if (parent) { bio_put(bio); + bio_endio(&parent->bio); + return; + } - if (orig) { - bio_endio(orig); - } else { - btree_node_write_done(c, b); - if (cl) - closure_put(cl); + btree_bounce_free(c, + wbio->order, + wbio->used_mempool, + wbio->data); + + if (wbio->replicas_failed) { + unsigned long flags; + + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bio_list_add(&c->btree_write_error_list, &wbio->bio); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + queue_work(c->wq, &c->btree_write_error_work); + return; } + + bio_put(bio); + btree_node_write_done(c, b); + if (cl) + closure_put(cl); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, @@ -1411,7 +1461,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct closure *parent, enum six_lock_type lock_type_held) { - struct bio *bio; struct bch_write_bio *wbio; struct bset_tree *t; struct bset *i; @@ -1458,7 +1507,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, } while (cmpxchg_acquire(&b->flags, old, new) != old); BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(!list_empty_careful(&b->reachable) != !b->written); + BUG_ON((b->will_make_reachable != NULL) != !b->written); BUG_ON(b->written >= c->sb.btree_node_size); BUG_ON(bset_written(b, btree_bset_last(b))); @@ -1601,23 +1650,20 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, trace_btree_write(b, bytes_to_write, sectors_to_write); - bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write); - - wbio = to_wbio(bio); + wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write)); wbio->cl = parent; - wbio->bounce = true; - wbio->put_bio = true; wbio->order = order; wbio->used_mempool = used_mempool; - bio->bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; - bio->bi_iter.bi_size = sectors_to_write << 9; - bio->bi_end_io = btree_node_write_endio; - bio->bi_private = b; + wbio->data = data; + wbio->bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; + wbio->bio.bi_iter.bi_size = sectors_to_write << 9; + wbio->bio.bi_end_io = btree_node_write_endio; + wbio->bio.bi_private = b; if (parent) closure_get(parent); - bch2_bio_map(bio, data); + bch2_bio_map(&wbio->bio, data); /* * If we're appending to a leaf node, we don't technically need FUA - diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 7333f30..91263ee 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -37,7 +37,7 @@ static inline void btree_node_wait_on_io(struct btree *b) static inline bool btree_node_may_write(struct btree *b) { return list_empty_careful(&b->write_blocked) && - list_empty_careful(&b->reachable); + !b->will_make_reachable; } enum compact_mode { @@ -79,6 +79,7 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id, void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); +void bch2_btree_write_error_work(struct work_struct *); void __bch2_btree_node_write(struct bch_fs *, struct btree *, 
struct closure *, enum six_lock_type); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index ecad24f..46df99f 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -252,6 +252,8 @@ static int __bch2_btree_iter_unlock(struct btree_iter *iter) while (iter->nodes_locked) btree_node_unlock(iter, __ffs(iter->nodes_locked)); + iter->flags &= ~BTREE_ITER_UPTODATE; + return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; } @@ -1006,16 +1008,30 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ iter->flags |= BTREE_ITER_AT_END_OF_LEAF; iter->pos = new_pos; + iter->flags &= ~BTREE_ITER_UPTODATE; } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */ iter->pos = new_pos; + iter->flags &= ~BTREE_ITER_UPTODATE; } void bch2_btree_iter_advance_pos(struct btree_iter *iter) { + if (iter->flags & BTREE_ITER_UPTODATE && + !(iter->flags & BTREE_ITER_WITH_HOLES)) { + struct bkey_s_c k; + + __btree_iter_advance(iter); + k = __btree_iter_peek(iter); + if (likely(k.k)) { + iter->pos = bkey_start_pos(k.k); + return; + } + } + /* * We use iter->k instead of iter->pos for extents: iter->pos will be * equal to the start of the extent we returned, but we need to advance @@ -1032,6 +1048,7 @@ void bch2_btree_iter_rewind(struct btree_iter *iter, struct bpos pos) BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0); iter->pos = pos; + iter->flags &= ~BTREE_ITER_UPTODATE; __btree_iter_init(iter, iter->nodes[iter->level]); } @@ -1043,6 +1060,17 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS)); + if (iter->flags & BTREE_ITER_UPTODATE) { + struct btree *b = iter->nodes[0]; + struct bkey_packed *k = + __bch2_btree_node_iter_peek_all(&iter->node_iters[0], b); + + return (struct bkey_s_c) { + .k = &iter->k, + .v = bkeyp_val(&b->format, k) + }; + } + while (1) { ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) { @@ -1058,7 +1086,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) */ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + iter->pos = bkey_start_pos(k.k); + + iter->flags |= BTREE_ITER_UPTODATE; return k; } @@ -1083,6 +1113,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *iter) EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS)); + iter->flags &= ~BTREE_ITER_UPTODATE; + while (1) { ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) { @@ -1131,12 +1163,15 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, unsigned locks_want, unsigned depth, unsigned flags) { + EBUG_ON(depth >= BTREE_MAX_DEPTH); + EBUG_ON(locks_want > BTREE_MAX_DEPTH); + iter->c = c; iter->pos = pos; iter->flags = flags; iter->btree_id = btree_id; iter->level = depth; - iter->locks_want = min(locks_want, BTREE_MAX_DEPTH); + iter->locks_want = locks_want; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; memset(iter->nodes, 0, sizeof(iter->nodes)); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 57f3876..34e5035 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -4,19 +4,20 @@ #include "btree_types.h" -#define BTREE_ITER_INTENT (1 << 0) +#define BTREE_ITER_UPTODATE (1 << 0) #define BTREE_ITER_WITH_HOLES (1 << 1) -#define 
BTREE_ITER_PREFETCH (1 << 2) +#define BTREE_ITER_INTENT (1 << 2) +#define BTREE_ITER_PREFETCH (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 3) +#define BTREE_ITER_IS_EXTENTS (1 << 4) /* * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: */ -#define BTREE_ITER_AT_END_OF_LEAF (1 << 4) -#define BTREE_ITER_ERROR (1 << 5) +#define BTREE_ITER_AT_END_OF_LEAF (1 << 5) +#define BTREE_ITER_ERROR (1 << 6) /* * @pos - iterator's current position @@ -223,17 +224,23 @@ static inline int btree_iter_cmp(const struct btree_iter *l, #define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \ __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b) +static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + unsigned flags) +{ + return flags & BTREE_ITER_WITH_HOLES + ? bch2_btree_iter_peek_with_holes(iter) + : bch2_btree_iter_peek(iter); +} + #define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \ - for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \ - (_start), (_flags)); \ - !IS_ERR_OR_NULL(((_k) = (((_flags) & BTREE_ITER_WITH_HOLES)\ - ? bch2_btree_iter_peek_with_holes(_iter)\ - : bch2_btree_iter_peek(_iter))).k); \ + for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \ + (_start), (_flags)); \ + !IS_ERR_OR_NULL(((_k) = __bch2_btree_iter_peek(_iter, _flags)).k);\ bch2_btree_iter_advance_pos(_iter)) static inline int btree_iter_err(struct bkey_s_c k) { - return IS_ERR(k.k) ? PTR_ERR(k.k) : 0; + return PTR_ERR_OR_ZERO(k.k); } /* diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index c613a7b..7085feb 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -116,7 +116,7 @@ struct btree { * another write - because that write also won't yet be reachable and * marking it as completed before it's reachable would be incorrect: */ - struct list_head reachable; + struct btree_interior_update *will_make_reachable; struct open_bucket *ob; @@ -143,7 +143,6 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \ enum btree_flags { BTREE_NODE_read_in_flight, BTREE_NODE_read_error, - BTREE_NODE_write_error, BTREE_NODE_dirty, BTREE_NODE_need_write, BTREE_NODE_noevict, @@ -155,7 +154,6 @@ enum btree_flags { BTREE_FLAG(read_in_flight); BTREE_FLAG(read_error); -BTREE_FLAG(write_error); BTREE_FLAG(dirty); BTREE_FLAG(need_write); BTREE_FLAG(noevict); diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 9794ac3..c7b2018 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -21,6 +21,11 @@ static void btree_interior_update_updated_root(struct bch_fs *, struct btree_interior_update *, enum btree_id); +static void btree_interior_update_will_make_reachable(struct bch_fs *, + struct btree_interior_update *, + struct btree *); +static void btree_interior_update_drop_new_node(struct bch_fs *, + struct btree *); /* Calculate ideal packed bkey format for new btree nodes: */ @@ -166,7 +171,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, BUG_ON(b == btree_node_root(c, b)); BUG_ON(b->ob); BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(!list_empty(&b->reachable)); + BUG_ON(b->will_make_reachable); clear_btree_node_noevict(b); @@ -191,6 +196,8 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) { struct open_bucket *ob = b->ob; + btree_interior_update_drop_new_node(c, b); + 
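/*
 * [Editorial aside -- not part of the patch] The "btree_iter fastpath" this
 * commit is named for, in outline: when BTREE_ITER_UPTODATE is set, the
 * iterator's cached node and node_iter are known to still be valid, so peek
 * and advance can return the next key without retraversing from the root.
 * Anything that moves or unlocks the iterator clears the flag. Condensed
 * from the btree_iter.c changes above:
 */
struct bkey_s_c peek_fastpath_sketch(struct btree_iter *iter)
{
	if (iter->flags & BTREE_ITER_UPTODATE) {
		/* fast path: position still valid from the last traverse */
		struct btree *b = iter->nodes[0];
		struct bkey_packed *k =
			__bch2_btree_node_iter_peek_all(&iter->node_iters[0], b);

		return (struct bkey_s_c) {
			.k = &iter->k,
			.v = bkeyp_val(&b->format, k),
		};
	}

	/* slow path: bch2_btree_iter_peek() traverses, then sets the flag */
	return bch2_btree_iter_peek(iter);
}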
b->ob = NULL; clear_btree_node_dirty(b); @@ -299,6 +306,7 @@ mem_alloc: static struct btree *bch2_btree_node_alloc(struct bch_fs *c, unsigned level, enum btree_id id, + struct btree_interior_update *as, struct btree_reserve *reserve) { struct btree *b; @@ -322,7 +330,7 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c, bch2_btree_build_aux_trees(b); - bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE); + btree_interior_update_will_make_reachable(c, as, b); trace_btree_node_alloc(c, b); return b; @@ -331,11 +339,12 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c, struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c, struct btree *b, struct bkey_format format, + struct btree_interior_update *as, struct btree_reserve *reserve) { struct btree *n; - n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve); + n = bch2_btree_node_alloc(c, b->level, b->btree_id, as, reserve); n->data->min_key = b->data->min_key; n->data->max_key = b->data->max_key; @@ -353,6 +362,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c, static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c, struct btree *b, + struct btree_interior_update *as, struct btree_reserve *reserve) { struct bkey_format new_f = bch2_btree_calc_format(b); @@ -364,7 +374,7 @@ static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c, if (!bch2_btree_node_format_fits(c, b, &new_f)) new_f = b->format; - return __bch2_btree_node_alloc_replacement(c, b, new_f, reserve); + return __bch2_btree_node_alloc_replacement(c, b, new_f, as, reserve); } static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b, @@ -478,9 +488,10 @@ static void bch2_btree_set_root(struct btree_iter *iter, struct btree *b, static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level, enum btree_id id, + struct btree_interior_update *as, struct btree_reserve *reserve) { - struct btree *b = bch2_btree_node_alloc(c, level, id, reserve); + struct btree *b = bch2_btree_node_alloc(c, level, id, as, reserve); b->data->min_key = POS_MIN; b->data->max_key = POS_MAX; @@ -581,6 +592,11 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c, goto err_free; } + ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), + BCH_DATA_BTREE); + if (ret) + goto err_free; + reserve->b[reserve->nr++] = b; } @@ -608,11 +624,12 @@ struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, struct closure *writes) { - struct closure cl; + struct btree_interior_update as; struct btree_reserve *reserve; + struct closure cl; struct btree *b; - LIST_HEAD(reachable_list); + memset(&as, 0, sizeof(as)); closure_init_stack(&cl); while (1) { @@ -627,15 +644,14 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, closure_sync(&cl); } - b = __btree_root_alloc(c, 0, id, reserve); - list_add(&b->reachable, &reachable_list); + b = __btree_root_alloc(c, 0, id, &as, reserve); bch2_btree_node_write(c, b, writes, SIX_LOCK_intent); bch2_btree_set_root_initial(c, b, reserve); - bch2_btree_open_bucket_put(c, b); - list_del_init(&b->reachable); + btree_interior_update_drop_new_node(c, b); + bch2_btree_open_bucket_put(c, b); six_unlock_intent(&b->lock); bch2_btree_reserve_put(c, reserve); @@ -819,9 +835,12 @@ void bch2_btree_journal_key(struct btree_insert *trans, /* ick */ insert->k.needs_whiteout = false; bch2_journal_add_keys(j, &trans->journal_res, - b->btree_id, insert); + 
b->btree_id, insert); insert->k.needs_whiteout = needs_whiteout; + bch2_journal_set_has_inode(j, &trans->journal_res, + insert->k.p.inode); + if (trans->journal_seq) *trans->journal_seq = seq; btree_bset_last(b)->journal_seq = cpu_to_le64(seq); @@ -891,7 +910,6 @@ bch2_btree_interior_update_alloc(struct bch_fs *c) as->c = c; as->mode = BTREE_INTERIOR_NO_UPDATE; INIT_LIST_HEAD(&as->write_blocked_list); - INIT_LIST_HEAD(&as->reachable_list); bch2_keylist_init(&as->parent_keys, as->inline_keys, ARRAY_SIZE(as->inline_keys)); @@ -916,16 +934,16 @@ static void btree_interior_update_nodes_reachable(struct closure *cl) struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl); struct bch_fs *c = as->c; - unsigned i; bch2_journal_pin_drop(&c->journal, &as->journal); mutex_lock(&c->btree_interior_update_lock); - while (!list_empty(&as->reachable_list)) { - struct btree *b = list_first_entry(&as->reachable_list, - struct btree, reachable); - list_del_init(&b->reachable); + while (as->nr_new_nodes) { + struct btree *b = as->new_nodes[--as->nr_new_nodes]; + + BUG_ON(b->will_make_reachable != as); + b->will_make_reachable = NULL; mutex_unlock(&c->btree_interior_update_lock); six_lock_read(&b->lock); @@ -934,9 +952,8 @@ static void btree_interior_update_nodes_reachable(struct closure *cl) mutex_lock(&c->btree_interior_update_lock); } - for (i = 0; i < as->nr_pending; i++) - bch2_btree_node_free_ondisk(c, &as->pending[i]); - as->nr_pending = 0; + while (as->nr_pending) + bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]); list_del(&as->list); mutex_unlock(&c->btree_interior_update_lock); @@ -1185,6 +1202,68 @@ static void btree_interior_update_updated_root(struct bch_fs *c, system_freezable_wq); } +static void btree_interior_update_will_make_reachable(struct bch_fs *c, + struct btree_interior_update *as, + struct btree *b) +{ + mutex_lock(&c->btree_interior_update_lock); + BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); + BUG_ON(b->will_make_reachable); + + as->new_nodes[as->nr_new_nodes++] = b; + b->will_make_reachable = as; + mutex_unlock(&c->btree_interior_update_lock); +} + +static void __btree_interior_update_drop_new_node(struct btree *b) +{ + struct btree_interior_update *as = b->will_make_reachable; + unsigned i; + + BUG_ON(!as); + + for (i = 0; i < as->nr_new_nodes; i++) + if (as->new_nodes[i] == b) + goto found; + + BUG(); +found: + as->nr_new_nodes--; + memmove(&as->new_nodes[i], + &as->new_nodes[i + 1], + sizeof(struct btree *) * (as->nr_new_nodes - i)); + b->will_make_reachable = NULL; +} + +static void btree_interior_update_drop_new_node(struct bch_fs *c, + struct btree *b) +{ + mutex_lock(&c->btree_interior_update_lock); + __btree_interior_update_drop_new_node(b); + mutex_unlock(&c->btree_interior_update_lock); +} + +static void bch2_btree_interior_update_add_node_reference(struct bch_fs *c, + struct btree_interior_update *as, + struct btree *b) +{ + struct pending_btree_node_free *d; + + mutex_lock(&c->btree_interior_update_lock); + + /* Add this node to the list of nodes being freed: */ + BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending)); + + d = &as->pending[as->nr_pending++]; + d->index_update_done = false; + d->seq = b->data->keys.seq; + d->btree_id = b->btree_id; + d->level = b->level; + bkey_copy(&d->key, &b->key); + + mutex_unlock(&c->btree_interior_update_lock); +} + /* * @b is being split/rewritten: it may have pointers to not-yet-written btree * nodes and thus outstanding btree_interior_updates - redirect @b's @@ -1196,10 +1275,11 @@ 
void bch2_btree_interior_update_will_free_node(struct bch_fs *c, { struct closure *cl, *cl_n; struct btree_interior_update *p, *n; - struct pending_btree_node_free *d; struct btree_write *w; struct bset_tree *t; + bch2_btree_interior_update_add_node_reference(c, as, b); + /* * Does this node have data that hasn't been written in the journal? * @@ -1213,16 +1293,6 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c, mutex_lock(&c->btree_interior_update_lock); - /* Add this node to the list of nodes being freed: */ - BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending)); - - d = &as->pending[as->nr_pending++]; - d->index_update_done = false; - d->seq = b->data->keys.seq; - d->btree_id = b->btree_id; - d->level = b->level; - bkey_copy(&d->key, &b->key); - /* * Does this node have any btree_interior_update operations preventing * it from being written? @@ -1255,8 +1325,13 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c, &as->journal, interior_update_flush); bch2_journal_pin_drop(&c->journal, &w->journal); - if (!list_empty(&b->reachable)) - list_del_init(&b->reachable); + w = btree_prev_write(b); + bch2_journal_pin_add_if_older(&c->journal, &w->journal, + &as->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &w->journal); + + if (b->will_make_reachable) + __btree_interior_update_drop_new_node(b); mutex_unlock(&c->btree_interior_update_lock); } @@ -1301,7 +1376,7 @@ err: #endif } -static enum btree_insert_ret +static int bch2_btree_insert_keys_interior(struct btree *b, struct btree_iter *iter, struct keylist *insert_keys, @@ -1324,7 +1399,7 @@ bch2_btree_insert_keys_interior(struct btree *b, if (bch_keylist_u64s(insert_keys) > bch_btree_keys_u64s_remaining(c, b)) { bch2_btree_node_unlock_write(b, iter); - return BTREE_INSERT_BTREE_NODE_FULL; + return -1; } /* Don't screw up @iter's position: */ @@ -1362,7 +1437,7 @@ bch2_btree_insert_keys_interior(struct btree *b, bch2_btree_node_unlock_write(b, iter); btree_node_interior_verify(b); - return BTREE_INSERT_OK; + return 0; } /* @@ -1373,13 +1448,13 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n struct btree_reserve *reserve, struct btree_interior_update *as) { + struct bch_fs *c = iter->c; size_t nr_packed = 0, nr_unpacked = 0; struct btree *n2; struct bset *set1, *set2; struct bkey_packed *k, *prev = NULL; - n2 = bch2_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve); - list_add(&n2->reachable, &as->reachable_list); + n2 = bch2_btree_node_alloc(c, n1->level, iter->btree_id, as, reserve); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; @@ -1528,8 +1603,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter, bch2_btree_interior_update_will_free_node(c, as, b); - n1 = bch2_btree_node_alloc_replacement(c, b, reserve); - list_add(&n1->reachable, &as->reachable_list); + n1 = bch2_btree_node_alloc_replacement(c, b, as, reserve); if (b->level) btree_split_insert_keys(iter, n1, insert_keys, reserve); @@ -1558,8 +1632,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter, /* Depth increases, make a new root */ n3 = __btree_root_alloc(c, b->level + 1, iter->btree_id, - reserve); - list_add(&n3->reachable, &as->reachable_list); + as, reserve); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1641,16 +1714,10 @@ void bch2_btree_insert_node(struct btree *b, BUG_ON(!b->level); BUG_ON(!reserve || !as); - switch (bch2_btree_insert_keys_interior(b, iter, insert_keys, - as, reserve)) { - case 
BTREE_INSERT_OK: - break; - case BTREE_INSERT_BTREE_NODE_FULL: + if ((as->flags & BTREE_INTERIOR_UPDATE_MUST_REWRITE) || + bch2_btree_insert_keys_interior(b, iter, insert_keys, + as, reserve)) btree_split(b, iter, insert_keys, reserve, as); - break; - default: - BUG(); - } } static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags) @@ -1859,8 +1926,7 @@ retry: bch2_btree_interior_update_will_free_node(c, as, b); bch2_btree_interior_update_will_free_node(c, as, m); - n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve); - list_add(&n->reachable, &as->reachable_list); + n = bch2_btree_node_alloc(c, b->level, b->btree_id, as, reserve); n->data->min_key = prev->data->min_key; n->data->max_key = next->data->max_key; @@ -1945,6 +2011,8 @@ btree_insert_key(struct btree_insert *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + iter->flags &= ~BTREE_ITER_UPTODATE; + ret = !btree_node_is_extents(b) ? bch2_insert_fixup_key(trans, insert) : bch2_insert_fixup_extent(trans, insert); @@ -2383,8 +2451,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_interior_update_will_free_node(c, as, b); - n = bch2_btree_node_alloc_replacement(c, b, reserve); - list_add(&n->reachable, &as->reachable_list); + n = bch2_btree_node_alloc_replacement(c, b, as, reserve); bch2_btree_build_aux_trees(n); six_unlock_write(&n->lock); @@ -2464,3 +2531,140 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, closure_sync(&cl); return ret; } + +int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b, + struct bkey_i_extent *new_key) +{ + struct btree_interior_update *as; + struct btree_reserve *reserve = NULL; + struct btree *parent, *new_hash = NULL; + struct btree_iter iter; + struct closure cl; + bool must_rewrite_parent = false; + int ret; + + __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p, + BTREE_MAX_DEPTH, + b->level, 0); + closure_init_stack(&cl); + + if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { + /* bch2_btree_reserve_get will unlock */ + do { + ret = bch2_btree_node_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret == -EAGAIN); + + BUG_ON(ret); + + new_hash = bch2_btree_node_mem_alloc(c); + } +retry: + reserve = bch2_btree_reserve_get(c, b, 0, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE, + &cl); + closure_sync(&cl); + if (IS_ERR(reserve)) { + ret = PTR_ERR(reserve); + if (ret == -EAGAIN || ret == -EINTR) + goto retry; + goto err; + } + + down_read(&c->gc_lock); + + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto err; + + mutex_lock(&c->btree_interior_update_lock); + + /* + * Two corner cases that need to be thought about here: + * + * @b may not be reachable yet - there might be another interior update + * operation waiting on @b to be written, and we're gonna deliver the + * write completion to that interior update operation _before_ + * persisting the new_key update + * + * That ends up working without us having to do anything special here: + * the reason is, we do kick off (and do the in memory updates) for the + * update for @new_key before we return, creating a new interior_update + * operation here. + * + * The new interior update operation here will in effect override the + * previous one. The previous one was going to terminate - make @b + * reachable - in one of two ways: + * - updating the btree root pointer + * In that case, + * no, this doesn't work. argh. 
+ */ + + if (b->will_make_reachable) + must_rewrite_parent = true; + + /* other case: btree node being freed */ + if (iter.nodes[b->level] != b) { + /* node has been freed: */ + BUG_ON(btree_node_hashed(b)); + mutex_unlock(&c->btree_interior_update_lock); + goto err; + } + + mutex_unlock(&c->btree_interior_update_lock); + + ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE); + if (ret) + goto err; + + as = bch2_btree_interior_update_alloc(c); + + if (must_rewrite_parent) + as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE; + + bch2_btree_interior_update_add_node_reference(c, as, b); + + if (new_hash) { + bkey_copy(&new_hash->key, &new_key->k_i); + BUG_ON(bch2_btree_node_hash_insert(c, new_hash, + b->level, b->btree_id)); + } + + parent = iter.nodes[b->level + 1]; + if (parent) { + bch2_btree_insert_node(parent, &iter, + &keylist_single(&b->key), + reserve, as); + } else { + bch2_btree_set_root(&iter, b, as, reserve); + } + + if (new_hash) { + mutex_lock(&c->btree_cache_lock); + bch2_btree_node_hash_remove(c, b); + + bkey_copy(&b->key, &new_key->k_i); + __bch2_btree_node_hash_insert(c, b); + + bch2_btree_node_hash_remove(c, new_hash); + mutex_unlock(&c->btree_cache_lock); + } else { + bkey_copy(&b->key, &new_key->k_i); + } +err: + if (!IS_ERR_OR_NULL(reserve)) + bch2_btree_reserve_put(c, reserve); + if (new_hash) { + mutex_lock(&c->btree_cache_lock); + list_move(&b->list, &c->btree_cache_freeable); + mutex_unlock(&c->btree_cache_lock); + + six_unlock_write(&new_hash->lock); + six_unlock_intent(&new_hash->lock); + } + bch2_btree_iter_unlock(&iter); + up_read(&c->gc_lock); + return ret; +} diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index b5cfa89..086077f 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -76,6 +76,9 @@ struct btree_interior_update { BTREE_INTERIOR_UPDATING_AS, } mode; + unsigned flags; + struct btree_reserve *reserve; + /* * BTREE_INTERIOR_UPDATING_NODE: * The update that made the new nodes visible was a regular update to an @@ -86,7 +89,6 @@ struct btree_interior_update { */ struct btree *b; struct list_head write_blocked_list; - struct list_head reachable_list; /* * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now @@ -117,6 +119,10 @@ struct btree_interior_update { struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; unsigned nr_pending; + /* New nodes, that will be made reachable by this update: */ + struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; + unsigned nr_new_nodes; + /* Only here to reduce stack usage on recursive splits: */ struct keylist parent_keys; /* @@ -127,6 +133,8 @@ struct btree_interior_update { u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; }; +#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0) + #define for_each_pending_btree_node_free(c, as, p) \ list_for_each_entry(as, &c->btree_interior_update_list, list) \ for (p = as->pending; p < as->pending + as->nr_pending; p++) @@ -138,6 +146,7 @@ void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *); struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *, struct btree *, struct bkey_format, + struct btree_interior_update *, struct btree_reserve *); struct btree_interior_update * @@ -426,6 +435,8 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, __le64, unsigned); +int bch2_btree_node_update_key(struct bch_fs *, struct btree *, + struct bkey_i_extent *); #endif /* _BCACHE_BTREE_INSERT_H 
*/ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 74d54ab..1b0e3da 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -153,6 +153,37 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) return nr_ptrs; } +/* Doesn't cleanup redundant crcs */ +void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +{ + EBUG_ON(ptr < &e.v->start->ptr || + ptr >= &extent_entry_last(e)->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + memmove_u64s_down(ptr, ptr + 1, + (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); + e.k->u64s -= sizeof(*ptr) / sizeof(u64); +} + +void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +{ + __bch2_extent_drop_ptr(e, ptr); + bch2_extent_drop_redundant_crcs(e); +} + +void bch2_extent_drop_ptr_idx(struct bkey_s_extent e, unsigned idx) +{ + struct bch_extent_ptr *ptr; + unsigned i = 0; + + extent_for_each_ptr(e, ptr) + if (i++ == idx) + goto found; + + BUG(); +found: + bch2_extent_drop_ptr(e, ptr); +} + /* returns true if equal */ static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r) { diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 3a95248..3dc06cb 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -552,24 +552,9 @@ static inline unsigned extent_current_nonce(struct bkey_s_c_extent e) void bch2_extent_narrow_crcs(struct bkey_s_extent); void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); -/* Doesn't cleanup redundant crcs */ -static inline void __bch2_extent_drop_ptr(struct bkey_s_extent e, - struct bch_extent_ptr *ptr) -{ - EBUG_ON(ptr < &e.v->start->ptr || - ptr >= &extent_entry_last(e)->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - memmove_u64s_down(ptr, ptr + 1, - (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); - e.k->u64s -= sizeof(*ptr) / sizeof(u64); -} - -static inline void bch2_extent_drop_ptr(struct bkey_s_extent e, - struct bch_extent_ptr *ptr) -{ - __bch2_extent_drop_ptr(e, ptr); - bch2_extent_drop_redundant_crcs(e); -} +void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); +void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); +void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 803611d..079f958 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -871,9 +871,8 @@ static void bch2_writepage_io_free(struct closure *cl) { struct bch_writepage_io *io = container_of(cl, struct bch_writepage_io, cl); - struct bio *bio = &io->bio.bio; - bio_put(bio); + bio_put(&io->op.op.wbio.bio); } static void bch2_writepage_io_done(struct closure *cl) @@ -881,7 +880,7 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io *io = container_of(cl, struct bch_writepage_io, cl); struct bch_fs *c = io->op.op.c; - struct bio *bio = &io->bio.bio; + struct bio *bio = &io->op.op.wbio.bio; struct bio_vec *bvec; unsigned i; @@ -940,11 +939,12 @@ static void bch2_writepage_io_done(struct closure *cl) static void bch2_writepage_do_io(struct bch_writepage_state *w) { struct bch_writepage_io *io = w->io; + struct bio *bio = &io->op.op.wbio.bio; w->io = NULL; - atomic_add(io->bio.bio.bi_vcnt, &io->op.op.c->writeback_pages); + atomic_add(bio->bi_vcnt, &io->op.op.c->writeback_pages); - io->op.op.pos.offset = io->bio.bio.bi_iter.bi_sector; + io->op.op.pos.offset = bio->bi_iter.bi_sector; 
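/*
 * [Editorial aside -- not part of the patch] The fs-io.c churn above and
 * below follows from one structural change: struct bch_write_op now embeds
 * its bch_write_bio (op.wbio) instead of pointing at a separately-embedded
 * bio, so containers recover their state from the bio like this:
 */
static struct dio_write *dio_from_bio_sketch(struct bio *bio)
{
	/* bioset created with offsetof(struct dio_write, iop.op.wbio.bio) */
	return container_of(bio, struct dio_write, iop.op.wbio.bio);
}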
closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl); continue_at(&io->cl, bch2_writepage_io_done, NULL); @@ -970,13 +970,13 @@ alloc_io: w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, bch2_writepage_bioset), - struct bch_writepage_io, bio.bio); + struct bch_writepage_io, op.op.wbio.bio); closure_init(&w->io->cl, NULL); w->io->op.ei = ei; w->io->op.sectors_added = 0; w->io->op.is_dio = false; - bch2_write_op_init(&w->io->op.op, c, &w->io->bio, + bch2_write_op_init(&w->io->op.op, c, (struct disk_reservation) { .nr_replicas = c->opts.data_replicas, }, @@ -987,7 +987,7 @@ alloc_io: } if (w->io->op.op.res.nr_replicas != nr_replicas || - bio_add_page_contig(&w->io->bio.bio, page)) { + bio_add_page_contig(&w->io->op.op.wbio.bio, page)) { bch2_writepage_do_io(w); goto alloc_io; } @@ -1038,7 +1038,7 @@ do_io: w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) - w->io->bio.bio.bi_opf |= REQ_SYNC; + w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; /* Before unlocking the page, transfer reservation to w->io: */ old = page_state_cmpxchg(page_state(page), new, { @@ -1110,7 +1110,7 @@ get_pages: done_index = page->index; if (w.io && - !bio_can_add_page_contig(&w.io->bio.bio, page)) + !bio_can_add_page_contig(&w.io->op.op.wbio.bio, page)) bch2_writepage_do_io(&w); if (!w.io && @@ -1495,7 +1495,7 @@ static long __bch2_dio_write_complete(struct dio_write *dio) if (dio->iovec && dio->iovec != dio->inline_vecs) kfree(dio->iovec); - bio_put(&dio->bio.bio); + bio_put(&dio->iop.op.wbio.bio); return ret; } @@ -1517,11 +1517,11 @@ static void bch2_dio_write_done(struct dio_write *dio) if (dio->iop.op.error) dio->error = dio->iop.op.error; - bio_for_each_segment_all(bv, &dio->bio.bio, i) + bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i) put_page(bv->bv_page); if (dio->iter.count) - bio_reset(&dio->bio.bio); + bio_reset(&dio->iop.op.wbio.bio); } static void bch2_do_direct_IO_write(struct dio_write *dio) @@ -1529,7 +1529,7 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) struct file *file = dio->req->ki_filp; struct inode *inode = file->f_inode; struct bch_inode_info *ei = to_bch_ei(inode); - struct bio *bio = &dio->bio.bio; + struct bio *bio = &dio->iop.op.wbio.bio; unsigned flags = 0; int ret; @@ -1537,8 +1537,6 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) !dio->c->opts.journal_flush_disabled) flags |= BCH_WRITE_FLUSH; - bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9; - ret = bio_iov_iter_get_pages(bio, &dio->iter); if (ret < 0) { /* @@ -1555,10 +1553,9 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) dio->iop.sectors_added = 0; dio->iop.is_dio = true; dio->iop.new_i_size = U64_MAX; - bch2_write_op_init(&dio->iop.op, dio->c, &dio->bio, - dio->res, + bch2_write_op_init(&dio->iop.op, dio->c, dio->res, foreground_write_point(dio->c, inode->i_ino), - POS(inode->i_ino, bio->bi_iter.bi_sector), + POS(inode->i_ino, (dio->offset + dio->written) >> 9), &ei->journal_seq, flags); dio->iop.op.index_update_fn = bchfs_write_index_update; @@ -1619,7 +1616,7 @@ static int bch2_direct_IO_write(struct bch_fs *c, struct kiocb *req, bio = bio_alloc_bioset(GFP_KERNEL, iov_iter_npages(iter, BIO_MAX_PAGES), bch2_dio_write_bioset); - dio = container_of(bio, struct dio_write, bio.bio); + dio = container_of(bio, struct dio_write, iop.op.wbio.bio); dio->req = req; dio->c = c; dio->written = 0; diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index 3fcc1e7..252a403 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -46,16 +46,16 @@ struct 
bchfs_write_op { s64 sectors_added; bool is_dio; u64 new_i_size; + + /* must be last: */ struct bch_write_op op; }; struct bch_writepage_io { struct closure cl; + /* must be last: */ struct bchfs_write_op op; - - /* must come last: */ - struct bch_write_bio bio; }; extern struct bio_set *bch2_writepage_bioset; @@ -76,10 +76,8 @@ struct dio_write { struct mm_struct *mm; - struct bchfs_write_op iop; - /* must be last: */ - struct bch_write_bio bio; + struct bchfs_write_op iop; }; extern struct bio_set *bch2_dio_write_bioset; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 201cdfc..6c9792e 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1458,7 +1458,7 @@ int __init bch2_vfs_init(void) goto err; bch2_writepage_bioset = - bioset_create(4, offsetof(struct bch_writepage_io, bio.bio)); + bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio)); if (!bch2_writepage_bioset) goto err; @@ -1466,7 +1466,8 @@ int __init bch2_vfs_init(void) if (!bch2_dio_read_bioset) goto err; - bch2_dio_write_bioset = bioset_create(4, offsetof(struct dio_write, bio.bio)); + bch2_dio_write_bioset = + bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio)); if (!bch2_dio_write_bioset) goto err; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 54b523d..78cdaa3 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -92,12 +92,10 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, const struct bch_extent_ptr *ptr; struct bch_write_bio *n; struct bch_dev *ca; + unsigned ptr_idx = 0; BUG_ON(c->opts.nochanges); - wbio->split = false; - wbio->c = c; - extent_for_each_ptr(e, ptr) { ca = c->devs[ptr->dev]; @@ -107,24 +105,26 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; - n->c = c; - n->orig = &wbio->bio; - n->bounce = false; + n->parent = wbio; n->split = true; + n->bounce = false; n->put_bio = true; n->bio.bi_opf = wbio->bio.bi_opf; - __bio_inc_remaining(n->orig); + __bio_inc_remaining(&wbio->bio); } else { n = wbio; + n->split = false; } - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - + n->c = c; n->ca = ca; + n->ptr_idx = ptr_idx++; n->submit_time_us = local_clock_us(); n->bio.bi_iter.bi_sector = ptr->offset; + if (!journal_flushes_device(ca)) + n->bio.bi_opf |= REQ_FUA; + if (likely(percpu_ref_tryget(&ca->io_ref))) { n->have_io_ref = true; n->bio.bi_bdev = ca->disk_sb.bdev; @@ -250,10 +250,9 @@ static void bch2_write_index(struct closure *cl) static void bch2_write_discard(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->bio->bio; struct bpos end = op->pos; - end.offset += bio_sectors(bio); + end.offset += bio_sectors(&op->wbio.bio); op->error = bch2_discard(op->c, op->pos, end, op->version, &op->res, NULL, NULL); @@ -308,31 +307,28 @@ static void bch2_write_io_error(struct closure *cl) static void bch2_write_endio(struct bio *bio) { - struct closure *cl = bio->bi_private; - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_fs *c = wbio->c; - struct bio *orig = wbio->orig; - struct bch_dev *ca = wbio->ca; + struct closure *cl = bio->bi_private; + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = wbio->ca; if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, - "data write")) + "data write")) set_closure_fn(cl, bch2_write_io_error, index_update_wq(op)); if (wbio->have_io_ref) percpu_ref_put(&ca->io_ref); - if (bio->bi_error && orig) - orig->bi_error = bio->bi_error; - if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); if (wbio->put_bio) bio_put(bio); - if (orig) - bio_endio(orig); + if (parent) + bio_endio(&parent->bio); else closure_put(cl); } @@ -380,11 +376,10 @@ static void init_append_extent(struct bch_write_op *op, bch2_keylist_push(&op->insert_keys); } -static int bch2_write_extent(struct bch_write_op *op, - struct open_bucket *ob, - struct bio *orig) +static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob) { struct bch_fs *c = op->c; + struct bio *orig = &op->wbio.bio; struct bio *bio; struct bch_write_bio *wbio; unsigned key_to_write_offset = op->insert_keys.top_p - @@ -392,11 +387,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct bkey_i *key_to_write; unsigned csum_type = op->csum_type; unsigned compression_type = op->compression_type; - int ret; + int ret, more; /* don't refetch csum type/compression type */ barrier(); + BUG_ON(!bio_sectors(orig)); + /* Need to decompress data? */ if ((op->flags & BCH_WRITE_DATA_COMPRESSED) && (crc_uncompressed_size(NULL, &op->crc) != op->size || @@ -421,11 +418,8 @@ static int bch2_write_extent(struct bch_write_op *op, ob); bio = orig; - wbio = to_wbio(bio); - wbio->orig = NULL; - wbio->bounce = false; - wbio->put_bio = false; - ret = 0; + wbio = wbio_init(bio); + more = 0; } else if (csum_type != BCH_CSUM_NONE || compression_type != BCH_COMPRESSION_NONE) { /* all units here in bytes */ @@ -439,19 +433,18 @@ static int bch2_write_extent(struct bch_write_op *op, bio = bio_alloc_bioset(GFP_NOIO, DIV_ROUND_UP(output_available, PAGE_SIZE), &c->bio_write); + wbio = wbio_init(bio); + wbio->bounce = true; + wbio->put_bio = true; + /* copy WRITE_SYNC flag */ + wbio->bio.bi_opf = orig->bi_opf; + /* * XXX: can't use mempool for more than * BCH_COMPRESSED_EXTENT_MAX worth of pages */ bch2_bio_alloc_pages_pool(c, bio, output_available); - /* copy WRITE_SYNC flag */ - bio->bi_opf = orig->bi_opf; - wbio = to_wbio(bio); - wbio->orig = NULL; - wbio->bounce = true; - wbio->put_bio = true; - do { unsigned fragment_compression_type = compression_type; size_t dst_len, src_len; @@ -504,45 +497,43 @@ static int bch2_write_extent(struct bch_write_op *op, mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page, &c->bio_bounce_pages); - ret = orig->bi_iter.bi_size != 0; + more = orig->bi_iter.bi_size != 0; } else { bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO, &c->bio_write); - - wbio = to_wbio(bio); - wbio->orig = NULL; - wbio->bounce = false; + wbio = wbio_init(bio); wbio->put_bio = bio != orig; init_append_extent(op, bio_sectors(bio), bio_sectors(bio), compression_type, 0, (struct bch_csum) { 0 }, csum_type, ob); - ret = bio != orig; + more = bio != orig; } + /* might have done a realloc... */ + + key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); + + ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write), + BCH_DATA_USER); + if (ret) + return ret; + bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); closure_get(bio->bi_private); - /* might have done a realloc... 
*/ - - key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - - bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write), - BCH_DATA_USER); - bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write); - return ret; + return more; } static void __bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - struct bio *bio = &op->bio->bio; unsigned open_bucket_nr = 0; struct open_bucket *b; int ret; @@ -550,22 +541,12 @@ static void __bch2_write(struct closure *cl) memset(op->open_buckets, 0, sizeof(op->open_buckets)); if (op->flags & BCH_WRITE_DISCARD) { - op->flags |= BCH_WRITE_DONE; bch2_write_discard(cl); - bio_put(bio); + op->flags |= BCH_WRITE_DONE; continue_at(cl, bch2_write_done, index_update_wq(op)); } - /* - * Journal writes are marked REQ_PREFLUSH; if the original write was a - * flush, it'll wait on the journal write. - */ - bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA); - do { - EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset); - EBUG_ON(!bio_sectors(bio)); - if (open_bucket_nr == ARRAY_SIZE(op->open_buckets)) continue_at(cl, bch2_write_index, index_update_wq(op)); @@ -622,7 +603,7 @@ static void __bch2_write(struct closure *cl) b - c->open_buckets > U8_MAX); op->open_buckets[open_bucket_nr++] = b - c->open_buckets; - ret = bch2_write_extent(op, b, bio); + ret = bch2_write_extent(op, b); bch2_alloc_sectors_done(c, op->wp, b); @@ -703,16 +684,13 @@ void bch2_wake_delayed_writes(unsigned long data) * after the data is written it calls bch_journal, and after the keys have been * added to the next journal write they're inserted into the btree. * - * It inserts the data in op->bio; bi_sector is used for the key offset, and - * op->inode is used for the key inode. - * * If op->discard is true, instead of inserting the data it invalidates the * region of the cache represented by op->bio and op->inode. 
*/ void bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->bio->bio; + struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; u64 inode = op->pos.inode; @@ -742,7 +720,7 @@ void bch2_write(struct closure *cl) spin_lock_irqsave(&c->foreground_write_pd_lock, flags); bch2_ratelimit_increment(&c->foreground_write_pd.rate, - bio->bi_iter.bi_size); + bio->bi_iter.bi_size); delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate); @@ -776,15 +754,14 @@ void bch2_write(struct closure *cl) } void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_write_bio *bio, struct disk_reservation res, - struct write_point *wp, struct bpos pos, - u64 *journal_seq, unsigned flags) + struct disk_reservation res, + struct write_point *wp, struct bpos pos, + u64 *journal_seq, unsigned flags) { EBUG_ON(res.sectors && !res.nr_replicas); op->c = c; op->io_wq = index_update_wq(op); - op->bio = bio; op->written = 0; op->error = 0; op->flags = flags; @@ -983,7 +960,7 @@ static void cache_promote_done(struct closure *cl) struct cache_promote_op *op = container_of(cl, struct cache_promote_op, cl); - bch2_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio); + bch2_bio_free_pages_pool(op->write.op.c, &op->write.op.wbio.bio); kfree(op); } @@ -1020,7 +997,7 @@ static void __bch2_read_endio(struct work_struct *work) trace_promote(&rbio->bio); /* we now own pages: */ - swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt); + swap(promote->write.op.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt); rbio->promote = NULL; bch2_rbio_done(rbio); @@ -1112,7 +1089,7 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, promote_op = kmalloc(sizeof(*promote_op) + sizeof(struct bio_vec) * pages, GFP_NOIO); if (promote_op) { - struct bio *promote_bio = &promote_op->write.wbio.bio; + struct bio *promote_bio = &promote_op->write.op.wbio.bio; bio_init(promote_bio, promote_bio->bi_inline_vecs, @@ -1204,7 +1181,7 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, rbio->bio.bi_end_io = bch2_read_endio; if (promote_op) { - struct bio *promote_bio = &promote_op->write.wbio.bio; + struct bio *promote_bio = &promote_op->write.op.wbio.bio; promote_bio->bi_iter = rbio->bio.bi_iter; memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec, @@ -1367,12 +1344,11 @@ void bch2_read_retry_work(struct work_struct *work) read_retry_work); struct bch_read_bio *rbio; struct bio *bio; - unsigned long flags; while (1) { - spin_lock_irqsave(&c->read_retry_lock, flags); + spin_lock_irq(&c->read_retry_lock); bio = bio_list_pop(&c->read_retry_list); - spin_unlock_irqrestore(&c->read_retry_lock, flags); + spin_unlock_irq(&c->read_retry_lock); if (!bio) break; diff --git a/libbcachefs/io.h b/libbcachefs/io.h index fb6f300..619bf56 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -41,11 +41,18 @@ static inline struct write_point *foreground_write_point(struct bch_fs *c, } void bch2_write_op_init(struct bch_write_op *, struct bch_fs *, - struct bch_write_bio *, struct disk_reservation, struct write_point *, struct bpos, u64 *, unsigned); void bch2_write(struct closure *); +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + + memset(wbio, 0, offsetof(struct bch_write_bio, bio)); + return wbio; +} + struct cache_promote_op; struct extent_pick_ptr; diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index d104cb7..3b73bcf 100644 --- a/libbcachefs/io_types.h 
+++ b/libbcachefs/io_types.h @@ -66,37 +66,30 @@ struct bch_write_bio { struct bch_fs *c; struct bch_dev *ca; union { - struct bio *orig; - struct closure *cl; + struct bch_write_bio *parent; + struct closure *cl; }; - unsigned submit_time_us; + u8 ptr_idx; + u8 replicas_failed; + u8 order; + unsigned split:1, bounce:1, put_bio:1, - have_io_ref:1; + have_io_ref:1, + used_mempool:1; - /* Only for btree writes: */ - unsigned used_mempool:1; - u8 order; + unsigned submit_time_us; + void *data; struct bio bio; }; -struct bch_replace_info { - struct extent_insert_hook hook; - /* How many insertions succeeded */ - unsigned successes; - /* How many insertions failed */ - unsigned failures; - BKEY_PADDED(key); -}; - struct bch_write_op { struct closure cl; - struct bch_fs *c; + struct bch_fs *c; struct workqueue_struct *io_wq; - struct bch_write_bio *bio; unsigned written; /* sectors */ @@ -141,6 +134,9 @@ struct bch_write_op { struct keylist insert_keys; u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; + + /* Must be last: */ + struct bch_write_bio wbio; }; #endif /* _BCACHE_IO_TYPES_H */ diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index b0011b4..bf8c152 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -53,15 +53,15 @@ static inline u64 journal_pin_seq(struct journal *j, return last_seq(j) + fifo_entry_idx(&j->pin, pin_list); } -static inline void bch2_journal_add_entry(struct journal_buf *buf, - const void *data, size_t u64s, - unsigned type, enum btree_id id, - unsigned level) +static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf, + unsigned type, enum btree_id id, + unsigned level, + const void *data, size_t u64s) { struct jset *jset = buf->data; - bch2_journal_add_entry_at(buf, data, u64s, type, id, level, - le32_to_cpu(jset->u64s)); + bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s), + type, id, level, data, u64s); le32_add_cpu(&jset->u64s, jset_u64s(u64s)); } @@ -97,8 +97,9 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf, enum btree_id id, struct bkey_i *k, unsigned level) { - bch2_journal_add_entry(buf, k, k->k.u64s, - JOURNAL_ENTRY_BTREE_ROOT, id, level); + bch2_journal_add_entry_noreservation(buf, + JOURNAL_ENTRY_BTREE_ROOT, id, level, + k, k->k.u64s); } static void journal_seq_blacklist_flush(struct journal *j, @@ -416,13 +417,8 @@ static void journal_entry_null_range(void *start, void *end) { struct jset_entry *entry; - for (entry = start; entry != end; entry = vstruct_next(entry)) { - entry->u64s = 0; - entry->btree_id = 0; - entry->level = 0; - entry->flags = 0; - SET_JOURNAL_ENTRY_TYPE(entry, 0); - } + for (entry = start; entry != end; entry = vstruct_next(entry)) + memset(entry, 0, sizeof(*entry)); } static int journal_validate_key(struct bch_fs *c, struct jset *j, @@ -514,7 +510,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j, break; } - switch (JOURNAL_ENTRY_TYPE(entry)) { + switch (entry->type) { case JOURNAL_ENTRY_BTREE_KEYS: vstruct_for_each(entry, k) { ret = journal_validate_key(c, j, entry, k, @@ -555,8 +551,8 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j, break; default: - journal_entry_err(c, "invalid journal entry type %llu", - JOURNAL_ENTRY_TYPE(entry)); + journal_entry_err(c, "invalid journal entry type %u", + entry->type); journal_entry_null_range(entry, vstruct_next(entry)); break; } @@ -1426,9 +1422,9 @@ void bch2_journal_start(struct bch_fs *c) */ list_for_each_entry(bl, &j->seq_blacklist, list) if (!bl->written) { - 
bch2_journal_add_entry(journal_cur_buf(j), &bl->seq, 1, + bch2_journal_add_entry_noreservation(journal_cur_buf(j), JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED, - 0, 0); + 0, 0, &bl->seq, 1); journal_pin_add_entry(j, &fifo_peek_back(&j->pin), @@ -2083,8 +2079,8 @@ static void journal_write_compact(struct jset *jset) if (prev && i->btree_id == prev->btree_id && i->level == prev->level && - JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) && - JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS && + i->type == prev->type && + i->type == JOURNAL_ENTRY_BTREE_KEYS && le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { memmove_u64s_down(vstruct_next(prev), i->_data, @@ -2238,8 +2234,9 @@ static void journal_write(struct closure *cl) closure_return_with_destructor(cl, journal_write_done); } - bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key), - BCH_DATA_JOURNAL); + if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key), + BCH_DATA_JOURNAL)) + goto err; /* * XXX: we really should just disable the entire journal in nochanges diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 88a9bd1..d785a0c 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -125,7 +125,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, struct jset_entry *entry, unsigned type) { while (entry < vstruct_last(jset)) { - if (JOURNAL_ENTRY_TYPE(entry) == type) + if (entry->type == type) return entry; entry = vstruct_next(entry); @@ -187,8 +187,12 @@ static inline void journal_state_inc(union journal_res_state *s) s->buf1_count += s->idx == 1; } -static inline void bch2_journal_set_has_inode(struct journal_buf *buf, u64 inum) +static inline void bch2_journal_set_has_inode(struct journal *j, + struct journal_res *res, + u64 inum) { + struct journal_buf *buf = &j->buf[res->idx]; + set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode); } @@ -202,40 +206,46 @@ static inline unsigned jset_u64s(unsigned u64s) } static inline void bch2_journal_add_entry_at(struct journal_buf *buf, - const void *data, size_t u64s, + unsigned offset, unsigned type, enum btree_id id, - unsigned level, unsigned offset) + unsigned level, + const void *data, size_t u64s) { struct jset_entry *entry = vstruct_idx(buf->data, offset); - entry->u64s = cpu_to_le16(u64s); + memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(u64s); entry->btree_id = id; - entry->level = level; - entry->flags = 0; - SET_JOURNAL_ENTRY_TYPE(entry, type); + entry->level = level; + entry->type = type; memcpy_u64s(entry->_data, data, u64s); } -static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, - enum btree_id id, const struct bkey_i *k) +static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, + unsigned type, enum btree_id id, + unsigned level, + const void *data, unsigned u64s) { struct journal_buf *buf = &j->buf[res->idx]; - unsigned actual = jset_u64s(k->k.u64s); + unsigned actual = jset_u64s(u64s); EBUG_ON(!res->ref); BUG_ON(actual > res->u64s); - bch2_journal_set_has_inode(buf, k->k.p.inode); - - bch2_journal_add_entry_at(buf, k, k->k.u64s, - JOURNAL_ENTRY_BTREE_KEYS, id, - 0, res->offset); - + bch2_journal_add_entry_at(buf, res->offset, type, + id, level, data, u64s); res->offset += actual; res->u64s -= actual; } +static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, + enum btree_id id, const struct bkey_i *k) +{ + bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS, + id, 0, k, k->k.u64s); +} + void 
bch2_journal_buf_put_slowpath(struct journal *, bool); static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, @@ -272,13 +282,10 @@ static inline void bch2_journal_res_put(struct journal *j, lock_release(&j->res_map, 0, _RET_IP_); - while (res->u64s) { - bch2_journal_add_entry_at(&j->buf[res->idx], NULL, 0, - JOURNAL_ENTRY_BTREE_KEYS, - 0, 0, res->offset); - res->offset += jset_u64s(0); - res->u64s -= jset_u64s(0); - } + while (res->u64s) + bch2_journal_add_entry(j, res, + JOURNAL_ENTRY_BTREE_KEYS, + 0, 0, NULL, 0); bch2_journal_buf_put(j, res->idx, false); diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index ba0cc0e..78f6d3c 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -128,9 +128,12 @@ int bch2_move_data_off_device(struct bch_dev *ca) seen_key_count++; continue; next: - if (bkey_extent_is_data(k.k)) - bch2_check_mark_super(c, bkey_s_c_to_extent(k), - BCH_DATA_USER); + if (bkey_extent_is_data(k.k)) { + ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), + BCH_DATA_USER); + if (ret) + break; + } bch2_btree_iter_advance_pos(&iter); bch2_btree_iter_cond_resched(&iter); @@ -386,9 +389,12 @@ int bch2_flag_data_bad(struct bch_dev *ca) */ continue; advance: - if (bkey_extent_is_data(k.k)) - bch2_check_mark_super(c, bkey_s_c_to_extent(k), - BCH_DATA_USER); + if (bkey_extent_is_data(k.k)) { + ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), + BCH_DATA_USER); + if (ret) + break; + } bch2_btree_iter_advance_pos(&iter); } diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 8c9395d..8ef1a0b 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -155,11 +155,8 @@ void bch2_migrate_write_init(struct bch_fs *c, (move_ptr && move_ptr->cached)) flags |= BCH_WRITE_CACHED; - bch2_write_op_init(&m->op, c, &m->wbio, - (struct disk_reservation) { 0 }, - wp, - bkey_start_pos(k.k), - NULL, flags); + bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, wp, + bkey_start_pos(k.k), NULL, flags); if (m->move) m->op.alloc_reserve = RESERVE_MOVINGGC; @@ -194,7 +191,7 @@ static void moving_io_destructor(struct closure *cl) atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight); wake_up(&ctxt->wait); - bio_for_each_segment_all(bv, &io->write.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) if (bv->bv_page) __free_page(bv->bv_page); @@ -307,9 +304,7 @@ int bch2_data_move(struct bch_fs *c, return -ENOMEM; } - migrate_bio_init(io, &io->write.wbio.bio, k.k->size); - bio_get(&io->write.wbio.bio); - io->write.wbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size); bch2_migrate_write_init(c, &io->write, wp, k, move_ptr, 0); diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 548f0f0..094eac8 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -19,7 +19,6 @@ struct migrate_write { bool move; struct bch_extent_ptr move_ptr; struct bch_write_op op; - struct bch_write_bio wbio; }; void bch2_migrate_write_init(struct bch_fs *, diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 1eae0fc..0ddfad3 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -783,6 +783,12 @@ out: /* replica information: */ +static inline struct bch_replicas_cpu_entry * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} + static inline struct bch_replicas_entry * replicas_entry_next(struct bch_replicas_entry *i) { @@ -794,6 +800,24 @@ replicas_entry_next(struct bch_replicas_entry *i) (void *) 
(_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ (_i) = replicas_entry_next(_i)) +static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, + unsigned dev) +{ + return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; +} + +static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, + unsigned dev) +{ + e->devs[dev >> 3] |= 1 << (dev & 7); +} + +static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r) +{ + return (r->entry_size - + offsetof(struct bch_replicas_cpu_entry, devs)) * 8; +} + static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, unsigned *nr, unsigned *bytes, @@ -879,6 +903,29 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) return 0; } +static void bkey_to_replicas(struct bkey_s_c_extent e, + enum bch_data_types data_type, + struct bch_replicas_cpu_entry *r, + unsigned *max_dev) +{ + const struct bch_extent_ptr *ptr; + + BUG_ON(!data_type || + data_type == BCH_DATA_SB || + data_type >= BCH_DATA_NR); + + memset(r, 0, sizeof(*r)); + r->data_type = data_type; + + *max_dev = 0; + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) { + *max_dev = max_t(unsigned, *max_dev, ptr->dev); + replicas_set_dev(r, ptr->dev); + } +} + /* * for when gc of replica information is in progress: */ @@ -887,14 +934,11 @@ static int bch2_update_gc_replicas(struct bch_fs *c, struct bkey_s_c_extent e, enum bch_data_types data_type) { - const struct bch_extent_ptr *ptr; - struct bch_replicas_cpu_entry *new_e; + struct bch_replicas_cpu_entry new_e; struct bch_replicas_cpu *new; - unsigned i, nr, entry_size, max_dev = 0; + unsigned i, nr, entry_size, max_dev; - extent_for_each_ptr(e, ptr) - if (!ptr->cached) - max_dev = max_t(unsigned, max_dev, ptr->dev); + bkey_to_replicas(e, data_type, &new_e, &max_dev); entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + DIV_ROUND_UP(max_dev + 1, 8); @@ -914,12 +958,9 @@ static int bch2_update_gc_replicas(struct bch_fs *c, cpu_replicas_entry(gc_r, i), gc_r->entry_size); - new_e = cpu_replicas_entry(new, nr - 1); - new_e->data_type = data_type; - - extent_for_each_ptr(e, ptr) - if (!ptr->cached) - replicas_set_dev(new_e, ptr->dev); + memcpy(cpu_replicas_entry(new, nr - 1), + &new_e, + new->entry_size); eytzinger0_sort(new->entries, new->nr, @@ -931,8 +972,38 @@ static int bch2_update_gc_replicas(struct bch_fs *c, return 0; } -int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e, - enum bch_data_types data_type) +static bool replicas_has_extent(struct bch_replicas_cpu *r, + struct bkey_s_c_extent e, + enum bch_data_types data_type) +{ + struct bch_replicas_cpu_entry search; + unsigned max_dev; + + bkey_to_replicas(e, data_type, &search, &max_dev); + + return max_dev < replicas_dev_slots(r) && + eytzinger0_find(r->entries, r->nr, + r->entry_size, + memcmp, &search) < r->nr; +} + +bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e, + enum bch_data_types data_type) +{ + bool ret; + + rcu_read_lock(); + ret = replicas_has_extent(rcu_dereference(c->replicas), + e, data_type); + rcu_read_unlock(); + + return ret; +} + +noinline +static int bch2_check_mark_super_slowpath(struct bch_fs *c, + struct bkey_s_c_extent e, + enum bch_data_types data_type) { struct bch_replicas_cpu *gc_r; const struct bch_extent_ptr *ptr; @@ -996,6 +1067,25 @@ err: return ret; } +int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e, + enum bch_data_types data_type) +{ + struct bch_replicas_cpu *gc_r; + bool marked; + + rcu_read_lock(); + marked = 
replicas_has_extent(rcu_dereference(c->replicas), + e, data_type) && + (!(gc_r = rcu_dereference(c->replicas_gc)) || + replicas_has_extent(gc_r, e, data_type)); + rcu_read_unlock(); + + if (marked) + return 0; + + return bch2_check_mark_super_slowpath(c, e, data_type); +} + struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct bch_dev *dev_to_offline) { diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 879fdda..65dd9fb 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -121,92 +121,10 @@ const char *bch2_read_super(struct bcache_superblock *, struct bch_opts, const char *); void bch2_write_super(struct bch_fs *); -static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) -{ - return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; -} - -static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) -{ - e->devs[dev >> 3] |= 1 << (dev & 7); -} - -static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r) -{ - return (r->entry_size - - offsetof(struct bch_replicas_cpu_entry, devs)) * 8; -} - -static inline struct bch_replicas_cpu_entry * -cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -{ - return (void *) r->entries + r->entry_size * i; -} - -int bch2_check_mark_super_slowpath(struct bch_fs *, struct bkey_s_c_extent, - enum bch_data_types); - -static inline bool replicas_has_extent(struct bch_replicas_cpu *r, - struct bkey_s_c_extent e, - enum bch_data_types data_type) -{ - const struct bch_extent_ptr *ptr; - struct bch_replicas_cpu_entry search = { - .data_type = data_type, - }; - unsigned max_dev = 0; - - BUG_ON(!data_type || - data_type == BCH_DATA_SB || - data_type >= BCH_DATA_NR); - - extent_for_each_ptr(e, ptr) - if (!ptr->cached) { - max_dev = max_t(unsigned, max_dev, ptr->dev); - replicas_set_dev(&search, ptr->dev); - } - - return max_dev < replicas_dev_slots(r) && - eytzinger0_find(r->entries, r->nr, - r->entry_size, - memcmp, &search) < r->nr; -} - -static inline bool bch2_sb_has_replicas(struct bch_fs *c, - struct bkey_s_c_extent e, - enum bch_data_types data_type) -{ - bool ret; - - rcu_read_lock(); - ret = replicas_has_extent(rcu_dereference(c->replicas), - e, data_type); - rcu_read_unlock(); - - return ret; -} - -static inline int bch2_check_mark_super(struct bch_fs *c, - struct bkey_s_c_extent e, - enum bch_data_types data_type) -{ - struct bch_replicas_cpu *gc_r; - bool marked; - - rcu_read_lock(); - marked = replicas_has_extent(rcu_dereference(c->replicas), - e, data_type) && - (!(gc_r = rcu_dereference(c->replicas_gc)) || - replicas_has_extent(gc_r, e, data_type)); - rcu_read_unlock(); - - if (marked) - return 0; - - return bch2_check_mark_super_slowpath(c, e, data_type); -} +bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent, + enum bch_data_types); +int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent, + enum bch_data_types); struct replicas_status { struct { diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 692eb41..c4cb0b2 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -517,10 +517,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->btree_interior_update_lock); mutex_init(&c->bio_bounce_pages_lock); + mutex_init(&c->zlib_workspace_lock); + bio_list_init(&c->read_retry_list); spin_lock_init(&c->read_retry_lock); INIT_WORK(&c->read_retry_work, bch2_read_retry_work); - mutex_init(&c->zlib_workspace_lock); + + bio_list_init(&c->btree_write_error_list); + 
spin_lock_init(&c->btree_write_error_lock); + INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); @@ -593,8 +598,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) PAGE_SECTORS, 0) || !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || lg_lock_init(&c->usage_lock) || - mempool_init_page_pool(&c->btree_bounce_pool, 1, - ilog2(btree_pages(c))) || + mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || bdi_setup_and_register(&c->bdi, "bcachefs") || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || @@ -1345,11 +1349,13 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, } } -static bool bch2_fs_may_start(struct bch_fs *c, int flags) +static bool bch2_fs_may_start(struct bch_fs *c) { struct replicas_status s; struct bch_sb_field_members *mi; - unsigned i; + unsigned i, flags = c->opts.degraded + ? BCH_FORCE_IF_DEGRADED + : 0; if (!c->opts.degraded) { mutex_lock(&c->sb_lock); @@ -1773,7 +1779,7 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices, mutex_unlock(&c->sb_lock); err = "insufficient devices"; - if (!bch2_fs_may_start(c, 0)) + if (!bch2_fs_may_start(c)) goto err; if (!c->opts.nostart) { @@ -1844,7 +1850,7 @@ static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb, } mutex_unlock(&c->sb_lock); - if (!c->opts.nostart && bch2_fs_may_start(c, 0)) { + if (!c->opts.nostart && bch2_fs_may_start(c)) { err = __bch2_fs_start(c); if (err) goto err; diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 906e7a6..9a95854 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -577,3 +577,17 @@ void sort_cmp_size(void *base, size_t num, size_t size, } } } + +void mempool_free_vp(void *element, void *pool_data) +{ + size_t size = (size_t) pool_data; + + vpfree(element, size); +} + +void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t) pool_data; + + return vpmalloc(size, gfp_mask); +} diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 68d9a86..a9a17d9 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -79,23 +79,43 @@ do { \ (__builtin_types_compatible_p(typeof(_val), _type) || \ __builtin_types_compatible_p(typeof(_val), const _type)) -static inline void kvpfree(void *p, size_t size) +static inline void vpfree(void *p, size_t size) { - if (size < PAGE_SIZE) - kfree(p); - else if (is_vmalloc_addr(p)) + if (is_vmalloc_addr(p)) vfree(p); else free_pages((unsigned long) p, get_order(size)); +} +static inline void *vpmalloc(size_t size, gfp_t gfp_mask) +{ + return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, + get_order(size)) ?: + __vmalloc(size, gfp_mask, PAGE_KERNEL); +} + +static inline void kvpfree(void *p, size_t size) +{ + if (size < PAGE_SIZE) + kfree(p); + else + vpfree(p, size); } static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) { - return size < PAGE_SIZE ? kmalloc(size, gfp_mask) - : (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, - get_order(size)) - ?: __vmalloc(size, gfp_mask, PAGE_KERNEL); + return size < PAGE_SIZE + ? 
kmalloc(size, gfp_mask) + : vpmalloc(size, gfp_mask); +} + +void mempool_free_vp(void *element, void *pool_data); +void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data); + +static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size) +{ + return mempool_init(pool, min_nr, mempool_alloc_vp, + mempool_free_vp, (void *) size); } #define HEAP(type) \ diff --git a/linux/sched.c b/linux/sched.c index 11480f3..898ccb1 100644 --- a/linux/sched.c +++ b/linux/sched.c @@ -1,5 +1,6 @@ #include +#include #include #include @@ -163,6 +164,8 @@ static void sched_init(void) { struct task_struct *p = malloc(sizeof(*p)); + mlockall(MCL_CURRENT|MCL_FUTURE); + memset(p, 0, sizeof(*p)); p->state = TASK_RUNNING; -- cgit v1.2.3
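
A note on the central refactor in this series, for readers following the bioset arithmetic: struct bch_write_bio is no longer a separate member that callers pass to bch2_write_op_init(); it is now embedded as the final member of struct bch_write_op ("/* Must be last: */"), and the containing structures (bch_writepage_io, dio_write, migrate_write) embed the op last in turn. The biosets are therefore created with offsets like offsetof(struct dio_write, iop.op.wbio.bio), so a bio allocated from the bioset sits at a known offset inside the outer structure, and container_of() on the completed bio recovers the whole op. The following is a minimal, self-contained sketch of that idiom; every name in it (bio_sketch, write_op_sketch, bio_alloc_with_front_pad, write_endio) is invented for illustration and merely stands in for the real bcachefs types:

	/*
	 * Sketch of the embedded-bio idiom used throughout this patch.
	 * Hypothetical simplified types: the real structs carry far more
	 * state, and the real allocator is bioset_create()/bio_alloc_bioset().
	 */
	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct bio_sketch {		/* stand-in for struct bio */
		int error;
	};

	struct write_op_sketch {	/* stand-in for struct bch_write_op */
		unsigned written;
		/* must be last: the bioset allocates through this member */
		struct bio_sketch bio;
	};

	/*
	 * Stand-in for a bioset created with front_pad =
	 * offsetof(struct write_op_sketch, bio): the allocation includes
	 * room for the containing op ahead of the bio itself.
	 */
	static struct bio_sketch *bio_alloc_with_front_pad(size_t front_pad)
	{
		void *mem = calloc(1, front_pad + sizeof(struct bio_sketch));

		return mem ? (struct bio_sketch *) ((char *) mem + front_pad) : NULL;
	}

	static void write_endio(struct bio_sketch *bio)
	{
		/*
		 * Completion recovers the op the same way the patch does with
		 * container_of(bio, struct dio_write, iop.op.wbio.bio):
		 */
		struct write_op_sketch *op =
			(struct write_op_sketch *)
			((char *) bio - offsetof(struct write_op_sketch, bio));

		printf("op %p done, error %d\n", (void *) op, bio->error);
		free(op);
	}

	int main(void)
	{
		struct bio_sketch *bio =
			bio_alloc_with_front_pad(offsetof(struct write_op_sketch, bio));

		if (!bio)
			return 1;
		bio->error = 0;
		write_endio(bio);
		return 0;
	}

The same "derive everything from the embedded member" pattern shows up twice more in the diff: wbio_init() zeroes the struct up to offsetof(struct bch_write_bio, bio) so the split/bounce/put_bio flags always start from a known state, and mempool_init_vp_pool() passes the element size through the opaque pool_data pointer so mempool_alloc_vp()/mempool_free_vp() can pick between page and vmalloc allocations per element.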