author    Kent Overstreet <kent.overstreet@gmail.com>    2017-05-12 18:45:15 -0800
committer Kent Overstreet <kent.overstreet@gmail.com>    2017-05-12 23:14:24 -0800
commit    565b4a74d6c25c78b0d2b82d9529595fc6269308
tree      3e4440a60c5f8519352ce5b6c587a7d1a79c4655
parent    a588eb0d9e30dffa4b319a4715c1454ee1d911f1
Update bcachefs sources to 14e9ac5016 bcachefs: btree_iter fastpath
 .bcachefs_revision            |   2
 cmd_migrate.c                 |   9
 include/linux/compiler.h      |   4
 include/linux/mempool.h       |  28
 include/linux/slab.h          |   5
 include/linux/vmalloc.h       |  37
 libbcachefs/alloc.c           |   6
 libbcachefs/bcachefs.h        |   4
 libbcachefs/bcachefs_format.h |   4
 libbcachefs/bkey.c            |  29
 libbcachefs/btree_cache.c     |  51
 libbcachefs/btree_cache.h     |  10
 libbcachefs/btree_gc.c        |  10
 libbcachefs/btree_io.c        | 142
 libbcachefs/btree_io.h        |   3
 libbcachefs/btree_iter.c      |  39
 libbcachefs/btree_iter.h      |  29
 libbcachefs/btree_types.h     |   4
 libbcachefs/btree_update.c    | 316
 libbcachefs/btree_update.h    |  13
 libbcachefs/extents.c         |  31
 libbcachefs/extents.h         |  21
 libbcachefs/fs-io.c           |  37
 libbcachefs/fs-io.h           |  10
 libbcachefs/fs.c              |   5
 libbcachefs/io.c              | 142
 libbcachefs/io.h              |   9
 libbcachefs/io_types.h        |  32
 libbcachefs/journal.c         |  45
 libbcachefs/journal.h         |  55
 libbcachefs/migrate.c         |  18
 libbcachefs/move.c            |  13
 libbcachefs/move.h            |   1
 libbcachefs/super-io.c        | 118
 libbcachefs/super-io.h        |  90
 libbcachefs/super.c           |  20
 libbcachefs/util.c            |  14
 libbcachefs/util.h            |  36
 linux/sched.c                 |   3
 39 files changed, 928 insertions(+), 517 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8bc4e35..d2d0c51 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-43e3159567958ea70c8a95d98fdb6e881153a656
+14e9ac5016803fc63c1216608c866bef16b4053e
diff --git a/cmd_migrate.c b/cmd_migrate.c
index bf8f0be..82fa0f1 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -250,7 +250,6 @@ static void write_data(struct bch_fs *c,
{
struct disk_reservation res;
struct bch_write_op op;
- struct bch_write_bio bio;
struct bio_vec bv;
struct closure cl;
@@ -259,15 +258,15 @@ static void write_data(struct bch_fs *c,
closure_init_stack(&cl);
- bio_init(&bio.bio, &bv, 1);
- bio.bio.bi_iter.bi_size = len;
- bch2_bio_map(&bio.bio, buf);
+ bio_init(&op.wbio.bio, &bv, 1);
+ op.wbio.bio.bi_iter.bi_size = len;
+ bch2_bio_map(&op.wbio.bio, buf);
int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0);
if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret));
- bch2_write_op_init(&op, c, &bio, res, c->write_points,
+ bch2_write_op_init(&op, c, res, c->write_points,
POS(dst_inode->inum, dst_offset >> 9), NULL, 0);
closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl);
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index e5c31a6..915a6f8 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -166,4 +166,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
#define flush_cache_vmap(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) do { } while (0)
+#ifdef __x86_64
+#define CONFIG_X86_64 y
+#endif
+
#endif /* _TOOLS_LINUX_COMPILER_H */
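A note on the hunk above: presumably this define exists so the userspace build advertises CONFIG_X86_64 and compiles the x86-64 bkey-unpack code generator touched in bkey.c below. A minimal sketch of the pattern it gates (compile_unpack_fn and unpack_fn are hypothetical names, not code from this commit):

#ifdef CONFIG_X86_64
	unpack_fn = compile_unpack_fn(format);	/* emit specialized x86-64 code */
#else
	unpack_fn = NULL;			/* use the generic C unpack path */
#endif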
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index ddf6f94..37d8149 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -10,8 +10,14 @@
struct kmem_cache;
+typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
+typedef void (mempool_free_t)(void *element, void *pool_data);
+
typedef struct mempool_s {
- size_t elem_size;
+ size_t elem_size;
+ void *pool_data;
+ mempool_alloc_t *alloc;
+ mempool_free_t *free;
} mempool_t;
static inline bool mempool_initialized(mempool_t *pool)
@@ -60,24 +66,22 @@ static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t
return 0;
}
-static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
-{
- mempool_t *pool = malloc(sizeof(*pool));
- pool->elem_size = size;
- return pool;
-}
-
static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
{
pool->elem_size = PAGE_SIZE << order;
return 0;
}
-static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
+static inline int mempool_init(mempool_t *pool, int min_nr,
+ mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn,
+ void *pool_data)
{
- mempool_t *pool = malloc(sizeof(*pool));
- pool->elem_size = PAGE_SIZE << order;
- return pool;
+ pool->elem_size = (size_t) pool_data;
+ pool->pool_data = pool_data;
+ pool->alloc = alloc_fn;
+ pool->free = free_fn;
+ return 0;
}
#endif /* _LINUX_MEMPOOL_H */
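A usage sketch, not part of this commit: with the shim above, a kernel-style caller passes alloc/free callbacks, and pool_data doubles as the element size (hence the elem_size assignment in mempool_init()). elem_alloc, elem_free and example_pool_init are hypothetical:

static void *elem_alloc(gfp_t gfp_mask, void *pool_data)
{
	return malloc((size_t) pool_data);	/* pool_data carries the size */
}

static void elem_free(void *element, void *pool_data)
{
	free(element);
}

static int example_pool_init(mempool_t *pool)
{
	/* element size is smuggled through pool_data, matching the shim: */
	return mempool_init(pool, 1, elem_alloc, elem_free,
			    (void *) (size_t) 4096);
}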
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 58fb73e..d0d8790 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -43,9 +43,6 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
#define kcalloc(n, size, flags) calloc(n, size)
#define kmalloc_array(n, size, flags) calloc(n, size)
-#define vmalloc(size) malloc(size)
-#define vzalloc(size) calloc(1, size)
-
#define kfree(p) free(p)
#define kvfree(p) free(p)
#define kzfree(p) free(p)
@@ -89,8 +86,6 @@ do { \
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
-#define PAGE_KERNEL 0
-
static inline void vunmap(const void *addr) {}
static inline void *vmap(struct page **pages, unsigned int count,
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index eb6284d..debdced 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -1,8 +1,41 @@
#ifndef __TOOLS_LINUX_VMALLOC_H
#define __TOOLS_LINUX_VMALLOC_H
-#define vmalloc(size) malloc(size)
-#define __vmalloc(size, flags, prot) malloc(size)
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "tools-util.h"
+
+#define PAGE_KERNEL 0
+#define PAGE_KERNEL_EXEC 1
+
#define vfree(p) free(p)
+static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask, unsigned prot)
+{
+ void *p = aligned_alloc(PAGE_SIZE, size);
+
+ if (p && prot == PAGE_KERNEL_EXEC) {
+ if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
+ vfree(p);
+ p = NULL;
+ }
+ }
+
+ if (p && (gfp_mask & __GFP_ZERO))
+ memset(p, 0, size);
+
+ return p;
+}
+
+static inline void *vmalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+}
+
+static inline void *vzalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL|__GFP_ZERO, PAGE_KERNEL);
+}
+
#endif /* __TOOLS_LINUX_VMALLOC_H */
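A sketch, not from this commit: the PAGE_KERNEL_EXEC case presumably serves callers that need executable memory (such as the dynamically compiled bkey-unpack functions), and vfree() pairs with it because free() accepts pointers from aligned_alloc():

static void *alloc_exec_buf(unsigned long size)
{
	/* readable/writable/executable on success, NULL on failure: */
	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL_EXEC);
}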
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index a12c5d3..36dc947 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -361,7 +361,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g, struct btree_iter *iter,
u64 *journal_seq)
{
- struct bucket_mark m = READ_ONCE(g->mark);
+ struct bucket_mark m;
__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
struct bkey_i_alloc *a;
u8 *d;
@@ -374,6 +374,8 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
if (ret)
break;
+ /* read mark under btree node lock: */
+ m = READ_ONCE(g->mark);
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
a->v.fields = 0;
@@ -407,8 +409,6 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
struct btree_iter iter;
int ret;
- lockdep_assert_held(&c->state_lock);
-
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
return 0;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 9d04e89..4d0fc62 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -725,6 +725,10 @@ struct bch_fs {
struct work_struct read_retry_work;
spinlock_t read_retry_lock;
+ struct bio_list btree_write_error_list;
+ struct work_struct btree_write_error_work;
+ spinlock_t btree_write_error_lock;
+
/* ERRORS */
struct list_head fsck_errors;
struct mutex fsck_error_lock;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 3f6d51a..125b6fa 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1082,7 +1082,8 @@ struct jset_entry {
__le16 u64s;
__u8 btree_id;
__u8 level;
- __le32 flags; /* designates what this jset holds */
+ __u8 type; /* designates what this jset holds */
+ __u8 pad[3];
union {
struct bkey_i start[0];
@@ -1092,7 +1093,6 @@ struct jset_entry {
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
-LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8);
enum {
JOURNAL_ENTRY_BTREE_KEYS = 0,
JOURNAL_ENTRY_BTREE_ROOT = 1,
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index b9ceb6e..cc76257 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -791,11 +791,9 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
unsigned dst_offset, unsigned dst_size,
bool *eax_zeroed)
{
- unsigned byte = format->key_u64s * sizeof(u64);
unsigned bits = format->bits_per_field[field];
u64 offset = format->field_offset[field];
- unsigned i, bit_offset = 0;
- unsigned shl, shr;
+ unsigned i, byte, bit_offset, align, shl, shr;
if (!bits && !offset) {
if (!*eax_zeroed) {
@@ -842,11 +840,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
return out;
}
+ bit_offset = format->key_u64s * 64;
for (i = 0; i <= field; i++)
- bit_offset += format->bits_per_field[i];
+ bit_offset -= format->bits_per_field[i];
- byte -= DIV_ROUND_UP(bit_offset, 8);
- bit_offset = round_up(bit_offset, 8) - bit_offset;
+ byte = bit_offset / 8;
+ bit_offset -= byte * 8;
*eax_zeroed = false;
@@ -857,6 +856,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
/* movzx eax, WORD PTR [rsi + imm8] */
I4(0x0f, 0xb7, 0x46, byte);
} else if (bit_offset + bits <= 32) {
+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 32);
+
/* mov eax, [rsi + imm8] */
I3(0x8b, 0x46, byte);
@@ -874,6 +879,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
out += 4;
}
} else if (bit_offset + bits <= 64) {
+ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 64);
+
/* mov rax, [rsi + imm8] */
I4(0x48, 0x8b, 0x46, byte);
@@ -890,6 +901,12 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
I4(0x48, 0xc1, 0xe8, shr);
}
} else {
+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 96);
+
/* mov rax, [rsi + byte] */
I4(0x48, 0x8b, 0x46, byte);
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index bdbe21a..d619f37 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -41,7 +41,7 @@ static void __mca_data_free(struct bch_fs *c, struct btree *b)
{
EBUG_ON(btree_node_write_in_flight(b));
- free_pages((unsigned long) b->data, btree_page_order(c));
+ kvpfree(b->data, btree_bytes(c));
b->data = NULL;
bch2_btree_keys_free(b);
}
@@ -53,8 +53,6 @@ static void mca_data_free(struct bch_fs *c, struct btree *b)
list_move(&b->list, &c->btree_cache_freed);
}
-#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
-
static const struct rhashtable_params bch_btree_cache_params = {
.head_offset = offsetof(struct btree, hash),
.key_offset = offsetof(struct btree, key.v),
@@ -63,20 +61,18 @@ static const struct rhashtable_params bch_btree_cache_params = {
static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
- unsigned order = ilog2(btree_pages(c));
-
- b->data = (void *) __get_free_pages(gfp, order);
+ b->data = kvpmalloc(btree_bytes(c), gfp);
if (!b->data)
goto err;
- if (bch2_btree_keys_alloc(b, order, gfp))
+ if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
goto err;
c->btree_cache_used++;
list_move(&b->list, &c->btree_cache_freeable);
return;
err:
- free_pages((unsigned long) b->data, order);
+ kvpfree(b->data, btree_bytes(c));
b->data = NULL;
list_move(&b->list, &c->btree_cache_freed);
}
@@ -91,7 +87,6 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
six_lock_init(&b->lock);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
- INIT_LIST_HEAD(&b->reachable);
mca_data_alloc(c, b, gfp);
return b->data ? b : NULL;
@@ -101,10 +96,6 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b)
{
- BUG_ON(btree_node_dirty(b));
-
- b->nsets = 0;
-
rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
bch_btree_cache_params);
@@ -112,23 +103,27 @@ void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b)
bkey_i_to_extent(&b->key)->v._data[0] = 0;
}
+int __bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b)
+{
+ return rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
+ bch_btree_cache_params);
+}
+
int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b,
unsigned level, enum btree_id id)
{
int ret;
+
b->level = level;
b->btree_id = id;
- ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
- bch_btree_cache_params);
- if (ret)
- return ret;
-
mutex_lock(&c->btree_cache_lock);
- list_add(&b->list, &c->btree_cache);
+ ret = __bch2_btree_node_hash_insert(c, b);
+ if (!ret)
+ list_add(&b->list, &c->btree_cache);
mutex_unlock(&c->btree_cache_lock);
- return 0;
+ return ret;
}
__flatten
@@ -155,8 +150,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
if (!six_trylock_write(&b->lock))
goto out_unlock_intent;
- if (btree_node_write_error(b) ||
- btree_node_noevict(b))
+ if (btree_node_noevict(b))
goto out_unlock;
if (!btree_node_may_write(b))
@@ -328,7 +322,7 @@ void bch2_fs_btree_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &c->btree_cache);
- free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c)));
+ kvpfree(c->verify_ondisk, btree_bytes(c));
#endif
for (i = 0; i < BTREE_ID_NR; i++)
@@ -384,8 +378,7 @@ int bch2_fs_btree_init(struct bch_fs *c)
#ifdef CONFIG_BCACHEFS_DEBUG
mutex_init(&c->verify_lock);
- c->verify_ondisk = (void *)
- __get_free_pages(GFP_KERNEL, ilog2(btree_pages(c)));
+ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
if (!c->verify_ondisk)
return -ENOMEM;
@@ -510,7 +503,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
BUG_ON(!six_trylock_intent(&b->lock));
BUG_ON(!six_trylock_write(&b->lock));
out_unlock:
- BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key));
+ BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_write_in_flight(b));
list_del_init(&b->list);
@@ -554,6 +547,12 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter,
struct bch_fs *c = iter->c;
struct btree *b;
+ /*
+ * Parent node must be locked, else we could read in a btree node that's
+ * been freed:
+ */
+ BUG_ON(!btree_node_locked(iter, level + 1));
+
b = bch2_btree_node_mem_alloc(c);
if (IS_ERR(b))
return b;
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index ca8e319..ea53d2b 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "btree_types.h"
+#include "extents.h"
struct btree_iter;
@@ -11,6 +12,7 @@ extern const char * const bch2_btree_ids[];
void bch2_recalc_btree_reserve(struct bch_fs *);
void bch2_btree_node_hash_remove(struct bch_fs *, struct btree *);
+int __bch2_btree_node_hash_insert(struct bch_fs *, struct btree *);
int bch2_btree_node_hash_insert(struct bch_fs *, struct btree *,
unsigned, enum btree_id);
@@ -28,6 +30,14 @@ void bch2_btree_node_prefetch(struct btree_iter *, const struct bkey_i *,
void bch2_fs_btree_exit(struct bch_fs *);
int bch2_fs_btree_init(struct bch_fs *);
+#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
+
+/* is btree node in hash table? */
+static inline bool btree_node_hashed(struct btree *b)
+{
+ return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
+}
+
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \
&(_c)->btree_cache_table), \
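An illustration, not from this commit: PTR_HASH() keys the cache hash table on the first pointer word of the node's key, and bch2_btree_node_hash_remove() zeroes that word (see btree_cache.c above), so btree_node_hashed() reduces to "is that word still nonzero". A hypothetical caller:

static void example_unhash(struct bch_fs *c, struct btree *b)
{
	mutex_lock(&c->btree_cache_lock);
	if (btree_node_hashed(b))	/* PTR_HASH(&b->key) != 0 */
		bch2_btree_node_hash_remove(c, b);
	mutex_unlock(&c->btree_cache_lock);
}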
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 815260b..376edaf 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -621,12 +621,10 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]);
/* Repack everything with @new_format and sort down to one bset */
- for (i = 0; i < nr_old_nodes; i++) {
+ for (i = 0; i < nr_old_nodes; i++)
new_nodes[i] =
__bch2_btree_node_alloc_replacement(c, old_nodes[i],
- new_format, res);
- list_add(&new_nodes[i]->reachable, &as->reachable_list);
- }
+ new_format, as, res);
/*
* Conceptually we concatenate the nodes together and slice them
@@ -663,7 +661,6 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
set_btree_bset_end(n1, n1->set);
- list_del_init(&n2->reachable);
six_unlock_write(&n2->lock);
bch2_btree_node_free_never_inserted(c, n2);
six_unlock_intent(&n2->lock);
@@ -796,7 +793,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
memset(merge, 0, sizeof(merge));
__for_each_btree_node(&iter, c, btree_id, POS_MIN,
- U8_MAX, 0, BTREE_ITER_PREFETCH, b) {
+ BTREE_MAX_DEPTH, 0,
+ BTREE_ITER_PREFETCH, b) {
memmove(merge + 1, merge,
sizeof(merge) - sizeof(merge[0]));
memmove(lock_seq + 1, lock_seq,
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 571a814..eeb546e 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -56,9 +56,9 @@ static void btree_bounce_free(struct bch_fs *c, unsigned order,
bool used_mempool, void *p)
{
if (used_mempool)
- mempool_free(virt_to_page(p), &c->btree_bounce_pool);
+ mempool_free(p, &c->btree_bounce_pool);
else
- free_pages((unsigned long) p, order);
+ vpfree(p, PAGE_SIZE << order);
}
static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
@@ -66,7 +66,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
{
void *p;
- BUG_ON(1 << order > btree_pages(c));
+ BUG_ON(order > btree_page_order(c));
*used_mempool = false;
p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
@@ -74,7 +74,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
return p;
*used_mempool = true;
- return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
+ return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
}
typedef int (*sort_cmp_fn)(struct btree *,
@@ -1183,7 +1183,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
if (bne->keys.seq == b->data->keys.seq)
goto err;
- sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
sorted->keys.u64s = 0;
b->nr = btree_node_is_extents(b)
@@ -1199,7 +1199,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
+ btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
bch2_bset_build_aux_tree(b, b->set, false);
@@ -1344,50 +1344,100 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
- /*
- * Before calling bch2_btree_complete_write() - if the write errored, we
- * have to halt new journal writes before they see this btree node
- * write as completed:
- */
- if (btree_node_write_error(b))
- bch2_journal_halt(&c->journal);
-
bch2_btree_complete_write(c, b, w);
btree_node_io_unlock(b);
}
-static void btree_node_write_endio(struct bio *bio)
+static void bch2_btree_node_write_error(struct bch_fs *c,
+ struct bch_write_bio *wbio)
{
- struct btree *b = bio->bi_private;
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_fs *c = wbio->c;
- struct bio *orig = wbio->split ? wbio->orig : NULL;
- struct closure *cl = !wbio->split ? wbio->cl : NULL;
- struct bch_dev *ca = wbio->ca;
+ struct btree *b = wbio->bio.bi_private;
+ struct closure *cl = wbio->cl;
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct bkey_i_extent *new_key;
+
+ bkey_copy(&tmp.k, &b->key);
+ new_key = bkey_i_to_extent(&tmp.k);
+
+ while (wbio->replicas_failed) {
+ unsigned idx = __fls(wbio->replicas_failed);
+
+ bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
+ wbio->replicas_failed ^= 1 << idx;
+ }
+
+ if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
+ bch2_btree_node_update_key(c, b, new_key)) {
+ set_btree_node_noevict(b);
+ bch2_fatal_error(c);
+ }
+
+ bio_put(&wbio->bio);
+ btree_node_write_done(c, b);
+ if (cl)
+ closure_put(cl);
+}
+
+void bch2_btree_write_error_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ btree_write_error_work);
+ struct bio *bio;
+
+ while (1) {
+		spin_lock_irq(&c->btree_write_error_lock);
+		bio = bio_list_pop(&c->btree_write_error_list);
+		spin_unlock_irq(&c->btree_write_error_lock);
- if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
+ if (!bio)
+ break;
+
+ bch2_btree_node_write_error(c, to_wbio(bio));
+ }
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+ struct btree *b = bio->bi_private;
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_write_bio *orig = parent ?: wbio;
+ struct closure *cl = !wbio->split ? wbio->cl : NULL;
+ struct bch_fs *c = wbio->c;
+ struct bch_dev *ca = wbio->ca;
+
+ if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, "btree write") ||
bch2_meta_write_fault("btree"))
- set_btree_node_write_error(b);
+ set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
- if (wbio->bounce)
- btree_bounce_free(c,
- wbio->order,
- wbio->used_mempool,
- page_address(bio->bi_io_vec[0].bv_page));
-
- if (wbio->put_bio)
+ if (parent) {
bio_put(bio);
+ bio_endio(&parent->bio);
+ return;
+ }
- if (orig) {
- bio_endio(orig);
- } else {
- btree_node_write_done(c, b);
- if (cl)
- closure_put(cl);
+ btree_bounce_free(c,
+ wbio->order,
+ wbio->used_mempool,
+ wbio->data);
+
+ if (wbio->replicas_failed) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+		bio_list_add(&c->btree_write_error_list, &wbio->bio);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ queue_work(c->wq, &c->btree_write_error_work);
+ return;
}
+
+ bio_put(bio);
+ btree_node_write_done(c, b);
+ if (cl)
+ closure_put(cl);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
@@ -1411,7 +1461,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct closure *parent,
enum six_lock_type lock_type_held)
{
- struct bio *bio;
struct bch_write_bio *wbio;
struct bset_tree *t;
struct bset *i;
@@ -1458,7 +1507,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
} while (cmpxchg_acquire(&b->flags, old, new) != old);
BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON(!list_empty_careful(&b->reachable) != !b->written);
+ BUG_ON((b->will_make_reachable != NULL) != !b->written);
BUG_ON(b->written >= c->sb.btree_node_size);
BUG_ON(bset_written(b, btree_bset_last(b)));
@@ -1601,23 +1650,20 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
trace_btree_write(b, bytes_to_write, sectors_to_write);
- bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
-
- wbio = to_wbio(bio);
+ wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent;
- wbio->bounce = true;
- wbio->put_bio = true;
wbio->order = order;
wbio->used_mempool = used_mempool;
- bio->bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
- bio->bi_iter.bi_size = sectors_to_write << 9;
- bio->bi_end_io = btree_node_write_endio;
- bio->bi_private = b;
+ wbio->data = data;
+ wbio->bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
+ wbio->bio.bi_iter.bi_size = sectors_to_write << 9;
+ wbio->bio.bi_end_io = btree_node_write_endio;
+ wbio->bio.bi_private = b;
if (parent)
closure_get(parent);
- bch2_bio_map(bio, data);
+ bch2_bio_map(&wbio->bio, data);
/*
* If we're appending to a leaf node, we don't technically need FUA -
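A sketch, not from this commit, of the failure bookkeeping above: replicas_failed is a per-replica bitmap — each split bio sets bit ptr_idx on IO error, and the error work then drops the corresponding extent pointers highest-index-first, so the indices of the pointers that remain stay valid as entries shift down:

static void drop_failed_ptrs(struct bkey_i_extent *e, unsigned replicas_failed)
{
	while (replicas_failed) {
		unsigned idx = __fls(replicas_failed);	/* highest set bit */

		bch2_extent_drop_ptr_idx(extent_i_to_s(e), idx);
		replicas_failed ^= 1 << idx;		/* clear and continue */
	}
}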
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 7333f30..91263ee 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -37,7 +37,7 @@ static inline void btree_node_wait_on_io(struct btree *b)
static inline bool btree_node_may_write(struct btree *b)
{
return list_empty_careful(&b->write_blocked) &&
- list_empty_careful(&b->reachable);
+ !b->will_make_reachable;
}
enum compact_mode {
@@ -79,6 +79,7 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
struct btree_write *);
+void bch2_btree_write_error_work(struct work_struct *);
void __bch2_btree_node_write(struct bch_fs *, struct btree *,
struct closure *, enum six_lock_type);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index ecad24f..46df99f 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -252,6 +252,8 @@ static int __bch2_btree_iter_unlock(struct btree_iter *iter)
while (iter->nodes_locked)
btree_node_unlock(iter, __ffs(iter->nodes_locked));
+ iter->flags &= ~BTREE_ITER_UPTODATE;
+
return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
}
@@ -1006,16 +1008,30 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_
iter->flags |= BTREE_ITER_AT_END_OF_LEAF;
iter->pos = new_pos;
+ iter->flags &= ~BTREE_ITER_UPTODATE;
}
void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */
iter->pos = new_pos;
+ iter->flags &= ~BTREE_ITER_UPTODATE;
}
void bch2_btree_iter_advance_pos(struct btree_iter *iter)
{
+ if (iter->flags & BTREE_ITER_UPTODATE &&
+ !(iter->flags & BTREE_ITER_WITH_HOLES)) {
+ struct bkey_s_c k;
+
+ __btree_iter_advance(iter);
+ k = __btree_iter_peek(iter);
+ if (likely(k.k)) {
+ iter->pos = bkey_start_pos(k.k);
+ return;
+ }
+ }
+
/*
* We use iter->k instead of iter->pos for extents: iter->pos will be
* equal to the start of the extent we returned, but we need to advance
@@ -1032,6 +1048,7 @@ void bch2_btree_iter_rewind(struct btree_iter *iter, struct bpos pos)
BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0);
iter->pos = pos;
+ iter->flags &= ~BTREE_ITER_UPTODATE;
__btree_iter_init(iter, iter->nodes[iter->level]);
}
@@ -1043,6 +1060,17 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
+ if (iter->flags & BTREE_ITER_UPTODATE) {
+ struct btree *b = iter->nodes[0];
+ struct bkey_packed *k =
+ __bch2_btree_node_iter_peek_all(&iter->node_iters[0], b);
+
+ return (struct bkey_s_c) {
+ .k = &iter->k,
+ .v = bkeyp_val(&b->format, k)
+ };
+ }
+
while (1) {
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret)) {
@@ -1058,7 +1086,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
*/
if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
- bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+ iter->pos = bkey_start_pos(k.k);
+
+ iter->flags |= BTREE_ITER_UPTODATE;
return k;
}
@@ -1083,6 +1113,8 @@ struct bkey_s_c bch2_btree_iter_peek_with_holes(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
+ iter->flags &= ~BTREE_ITER_UPTODATE;
+
while (1) {
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret)) {
@@ -1131,12 +1163,15 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
unsigned locks_want, unsigned depth,
unsigned flags)
{
+ EBUG_ON(depth >= BTREE_MAX_DEPTH);
+ EBUG_ON(locks_want > BTREE_MAX_DEPTH);
+
iter->c = c;
iter->pos = pos;
iter->flags = flags;
iter->btree_id = btree_id;
iter->level = depth;
- iter->locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ iter->locks_want = locks_want;
iter->nodes_locked = 0;
iter->nodes_intent_locked = 0;
memset(iter->nodes, 0, sizeof(iter->nodes));
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 57f3876..34e5035 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -4,19 +4,20 @@
#include "btree_types.h"
-#define BTREE_ITER_INTENT (1 << 0)
+#define BTREE_ITER_UPTODATE (1 << 0)
#define BTREE_ITER_WITH_HOLES (1 << 1)
-#define BTREE_ITER_PREFETCH (1 << 2)
+#define BTREE_ITER_INTENT (1 << 2)
+#define BTREE_ITER_PREFETCH (1 << 3)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-#define BTREE_ITER_IS_EXTENTS (1 << 3)
+#define BTREE_ITER_IS_EXTENTS (1 << 4)
/*
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
*/
-#define BTREE_ITER_AT_END_OF_LEAF (1 << 4)
-#define BTREE_ITER_ERROR (1 << 5)
+#define BTREE_ITER_AT_END_OF_LEAF (1 << 5)
+#define BTREE_ITER_ERROR (1 << 6)
/*
* @pos - iterator's current position
@@ -223,17 +224,23 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \
__for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b)
+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+ unsigned flags)
+{
+ return flags & BTREE_ITER_WITH_HOLES
+ ? bch2_btree_iter_peek_with_holes(iter)
+ : bch2_btree_iter_peek(iter);
+}
+
#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \
- for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \
- (_start), (_flags)); \
- !IS_ERR_OR_NULL(((_k) = (((_flags) & BTREE_ITER_WITH_HOLES)\
- ? bch2_btree_iter_peek_with_holes(_iter)\
- : bch2_btree_iter_peek(_iter))).k); \
+ for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \
+ (_start), (_flags)); \
+ !IS_ERR_OR_NULL(((_k) = __bch2_btree_iter_peek(_iter, _flags)).k);\
bch2_btree_iter_advance_pos(_iter))
static inline int btree_iter_err(struct bkey_s_c k)
{
- return IS_ERR(k.k) ? PTR_ERR(k.k) : 0;
+ return PTR_ERR_OR_ZERO(k.k);
}
/*
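A usage sketch, not from this commit — this is the "btree_iter fastpath" named in the subject line: the first successful peek sets BTREE_ITER_UPTODATE, so advancing within the same leaf lets the next peek return the cached key without re-traversing (unlocking or repositioning the iterator clears the flag):

static int example_scan(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) {
		/* first peek traverses; in-leaf advances hit the
		 * BTREE_ITER_UPTODATE path in bch2_btree_iter_peek() */
	}
	return bch2_btree_iter_unlock(&iter);	/* reports any deferred -EIO */
}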
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index c613a7b..7085feb 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -116,7 +116,7 @@ struct btree {
* another write - because that write also won't yet be reachable and
* marking it as completed before it's reachable would be incorrect:
*/
- struct list_head reachable;
+ struct btree_interior_update *will_make_reachable;
struct open_bucket *ob;
@@ -143,7 +143,6 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \
enum btree_flags {
BTREE_NODE_read_in_flight,
BTREE_NODE_read_error,
- BTREE_NODE_write_error,
BTREE_NODE_dirty,
BTREE_NODE_need_write,
BTREE_NODE_noevict,
@@ -155,7 +154,6 @@ enum btree_flags {
BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
-BTREE_FLAG(write_error);
BTREE_FLAG(dirty);
BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 9794ac3..c7b2018 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -21,6 +21,11 @@
static void btree_interior_update_updated_root(struct bch_fs *,
struct btree_interior_update *,
enum btree_id);
+static void btree_interior_update_will_make_reachable(struct bch_fs *,
+ struct btree_interior_update *,
+ struct btree *);
+static void btree_interior_update_drop_new_node(struct bch_fs *,
+ struct btree *);
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -166,7 +171,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
BUG_ON(b == btree_node_root(c, b));
BUG_ON(b->ob);
BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON(!list_empty(&b->reachable));
+ BUG_ON(b->will_make_reachable);
clear_btree_node_noevict(b);
@@ -191,6 +196,8 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
{
struct open_bucket *ob = b->ob;
+ btree_interior_update_drop_new_node(c, b);
+
b->ob = NULL;
clear_btree_node_dirty(b);
@@ -299,6 +306,7 @@ mem_alloc:
static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
unsigned level, enum btree_id id,
+ struct btree_interior_update *as,
struct btree_reserve *reserve)
{
struct btree *b;
@@ -322,7 +330,7 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
bch2_btree_build_aux_trees(b);
- bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE);
+ btree_interior_update_will_make_reachable(c, as, b);
trace_btree_node_alloc(c, b);
return b;
@@ -331,11 +339,12 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c,
struct btree *b,
struct bkey_format format,
+ struct btree_interior_update *as,
struct btree_reserve *reserve)
{
struct btree *n;
- n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
+ n = bch2_btree_node_alloc(c, b->level, b->btree_id, as, reserve);
n->data->min_key = b->data->min_key;
n->data->max_key = b->data->max_key;
@@ -353,6 +362,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c,
static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c,
struct btree *b,
+ struct btree_interior_update *as,
struct btree_reserve *reserve)
{
struct bkey_format new_f = bch2_btree_calc_format(b);
@@ -364,7 +374,7 @@ static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c,
if (!bch2_btree_node_format_fits(c, b, &new_f))
new_f = b->format;
- return __bch2_btree_node_alloc_replacement(c, b, new_f, reserve);
+ return __bch2_btree_node_alloc_replacement(c, b, new_f, as, reserve);
}
static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b,
@@ -478,9 +488,10 @@ static void bch2_btree_set_root(struct btree_iter *iter, struct btree *b,
static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level,
enum btree_id id,
+ struct btree_interior_update *as,
struct btree_reserve *reserve)
{
- struct btree *b = bch2_btree_node_alloc(c, level, id, reserve);
+ struct btree *b = bch2_btree_node_alloc(c, level, id, as, reserve);
b->data->min_key = POS_MIN;
b->data->max_key = POS_MAX;
@@ -581,6 +592,11 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
goto err_free;
}
+ ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+ BCH_DATA_BTREE);
+ if (ret)
+ goto err_free;
+
reserve->b[reserve->nr++] = b;
}
@@ -608,11 +624,12 @@ struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
struct closure *writes)
{
- struct closure cl;
+ struct btree_interior_update as;
struct btree_reserve *reserve;
+ struct closure cl;
struct btree *b;
- LIST_HEAD(reachable_list);
+ memset(&as, 0, sizeof(as));
closure_init_stack(&cl);
while (1) {
@@ -627,15 +644,14 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
closure_sync(&cl);
}
- b = __btree_root_alloc(c, 0, id, reserve);
- list_add(&b->reachable, &reachable_list);
+ b = __btree_root_alloc(c, 0, id, &as, reserve);
bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
bch2_btree_set_root_initial(c, b, reserve);
- bch2_btree_open_bucket_put(c, b);
- list_del_init(&b->reachable);
+ btree_interior_update_drop_new_node(c, b);
+ bch2_btree_open_bucket_put(c, b);
six_unlock_intent(&b->lock);
bch2_btree_reserve_put(c, reserve);
@@ -819,9 +835,12 @@ void bch2_btree_journal_key(struct btree_insert *trans,
/* ick */
insert->k.needs_whiteout = false;
bch2_journal_add_keys(j, &trans->journal_res,
- b->btree_id, insert);
+ b->btree_id, insert);
insert->k.needs_whiteout = needs_whiteout;
+ bch2_journal_set_has_inode(j, &trans->journal_res,
+ insert->k.p.inode);
+
if (trans->journal_seq)
*trans->journal_seq = seq;
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
@@ -891,7 +910,6 @@ bch2_btree_interior_update_alloc(struct bch_fs *c)
as->c = c;
as->mode = BTREE_INTERIOR_NO_UPDATE;
INIT_LIST_HEAD(&as->write_blocked_list);
- INIT_LIST_HEAD(&as->reachable_list);
bch2_keylist_init(&as->parent_keys, as->inline_keys,
ARRAY_SIZE(as->inline_keys));
@@ -916,16 +934,16 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
struct btree_interior_update *as =
container_of(cl, struct btree_interior_update, cl);
struct bch_fs *c = as->c;
- unsigned i;
bch2_journal_pin_drop(&c->journal, &as->journal);
mutex_lock(&c->btree_interior_update_lock);
- while (!list_empty(&as->reachable_list)) {
- struct btree *b = list_first_entry(&as->reachable_list,
- struct btree, reachable);
- list_del_init(&b->reachable);
+ while (as->nr_new_nodes) {
+ struct btree *b = as->new_nodes[--as->nr_new_nodes];
+
+ BUG_ON(b->will_make_reachable != as);
+ b->will_make_reachable = NULL;
mutex_unlock(&c->btree_interior_update_lock);
six_lock_read(&b->lock);
@@ -934,9 +952,8 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
mutex_lock(&c->btree_interior_update_lock);
}
- for (i = 0; i < as->nr_pending; i++)
- bch2_btree_node_free_ondisk(c, &as->pending[i]);
- as->nr_pending = 0;
+ while (as->nr_pending)
+ bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
list_del(&as->list);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1185,6 +1202,68 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
system_freezable_wq);
}
+static void btree_interior_update_will_make_reachable(struct bch_fs *c,
+ struct btree_interior_update *as,
+ struct btree *b)
+{
+ mutex_lock(&c->btree_interior_update_lock);
+ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
+ BUG_ON(b->will_make_reachable);
+
+ as->new_nodes[as->nr_new_nodes++] = b;
+ b->will_make_reachable = as;
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void __btree_interior_update_drop_new_node(struct btree *b)
+{
+ struct btree_interior_update *as = b->will_make_reachable;
+ unsigned i;
+
+ BUG_ON(!as);
+
+ for (i = 0; i < as->nr_new_nodes; i++)
+ if (as->new_nodes[i] == b)
+ goto found;
+
+ BUG();
+found:
+ as->nr_new_nodes--;
+ memmove(&as->new_nodes[i],
+ &as->new_nodes[i + 1],
+ sizeof(struct btree *) * (as->nr_new_nodes - i));
+ b->will_make_reachable = NULL;
+}
+
+static void btree_interior_update_drop_new_node(struct bch_fs *c,
+ struct btree *b)
+{
+ mutex_lock(&c->btree_interior_update_lock);
+ __btree_interior_update_drop_new_node(b);
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void bch2_btree_interior_update_add_node_reference(struct bch_fs *c,
+ struct btree_interior_update *as,
+ struct btree *b)
+{
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /* Add this node to the list of nodes being freed: */
+ BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
+
+ d = &as->pending[as->nr_pending++];
+ d->index_update_done = false;
+ d->seq = b->data->keys.seq;
+ d->btree_id = b->btree_id;
+ d->level = b->level;
+ bkey_copy(&d->key, &b->key);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_interior_updates - redirect @b's
@@ -1196,10 +1275,11 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
{
struct closure *cl, *cl_n;
struct btree_interior_update *p, *n;
- struct pending_btree_node_free *d;
struct btree_write *w;
struct bset_tree *t;
+ bch2_btree_interior_update_add_node_reference(c, as, b);
+
/*
* Does this node have data that hasn't been written in the journal?
*
@@ -1213,16 +1293,6 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
mutex_lock(&c->btree_interior_update_lock);
- /* Add this node to the list of nodes being freed: */
- BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
-
- d = &as->pending[as->nr_pending++];
- d->index_update_done = false;
- d->seq = b->data->keys.seq;
- d->btree_id = b->btree_id;
- d->level = b->level;
- bkey_copy(&d->key, &b->key);
-
/*
* Does this node have any btree_interior_update operations preventing
* it from being written?
@@ -1255,8 +1325,13 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
&as->journal, interior_update_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
- if (!list_empty(&b->reachable))
- list_del_init(&b->reachable);
+ w = btree_prev_write(b);
+ bch2_journal_pin_add_if_older(&c->journal, &w->journal,
+ &as->journal, interior_update_flush);
+ bch2_journal_pin_drop(&c->journal, &w->journal);
+
+ if (b->will_make_reachable)
+ __btree_interior_update_drop_new_node(b);
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1301,7 +1376,7 @@ err:
#endif
}
-static enum btree_insert_ret
+static int
bch2_btree_insert_keys_interior(struct btree *b,
struct btree_iter *iter,
struct keylist *insert_keys,
@@ -1324,7 +1399,7 @@ bch2_btree_insert_keys_interior(struct btree *b,
if (bch_keylist_u64s(insert_keys) >
bch_btree_keys_u64s_remaining(c, b)) {
bch2_btree_node_unlock_write(b, iter);
- return BTREE_INSERT_BTREE_NODE_FULL;
+ return -1;
}
/* Don't screw up @iter's position: */
@@ -1362,7 +1437,7 @@ bch2_btree_insert_keys_interior(struct btree *b,
bch2_btree_node_unlock_write(b, iter);
btree_node_interior_verify(b);
- return BTREE_INSERT_OK;
+ return 0;
}
/*
@@ -1373,13 +1448,13 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
struct btree_reserve *reserve,
struct btree_interior_update *as)
{
+ struct bch_fs *c = iter->c;
size_t nr_packed = 0, nr_unpacked = 0;
struct btree *n2;
struct bset *set1, *set2;
struct bkey_packed *k, *prev = NULL;
- n2 = bch2_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
- list_add(&n2->reachable, &as->reachable_list);
+ n2 = bch2_btree_node_alloc(c, n1->level, iter->btree_id, as, reserve);
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
@@ -1528,8 +1603,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
bch2_btree_interior_update_will_free_node(c, as, b);
- n1 = bch2_btree_node_alloc_replacement(c, b, reserve);
- list_add(&n1->reachable, &as->reachable_list);
+ n1 = bch2_btree_node_alloc_replacement(c, b, as, reserve);
if (b->level)
btree_split_insert_keys(iter, n1, insert_keys, reserve);
@@ -1558,8 +1632,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
/* Depth increases, make a new root */
n3 = __btree_root_alloc(c, b->level + 1,
iter->btree_id,
- reserve);
- list_add(&n3->reachable, &as->reachable_list);
+ as, reserve);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
@@ -1641,16 +1714,10 @@ void bch2_btree_insert_node(struct btree *b,
BUG_ON(!b->level);
BUG_ON(!reserve || !as);
- switch (bch2_btree_insert_keys_interior(b, iter, insert_keys,
- as, reserve)) {
- case BTREE_INSERT_OK:
- break;
- case BTREE_INSERT_BTREE_NODE_FULL:
+ if ((as->flags & BTREE_INTERIOR_UPDATE_MUST_REWRITE) ||
+ bch2_btree_insert_keys_interior(b, iter, insert_keys,
+ as, reserve))
btree_split(b, iter, insert_keys, reserve, as);
- break;
- default:
- BUG();
- }
}
static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags)
@@ -1859,8 +1926,7 @@ retry:
bch2_btree_interior_update_will_free_node(c, as, b);
bch2_btree_interior_update_will_free_node(c, as, m);
- n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
- list_add(&n->reachable, &as->reachable_list);
+ n = bch2_btree_node_alloc(c, b->level, b->btree_id, as, reserve);
n->data->min_key = prev->data->min_key;
n->data->max_key = next->data->max_key;
@@ -1945,6 +2011,8 @@ btree_insert_key(struct btree_insert *trans,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ iter->flags &= ~BTREE_ITER_UPTODATE;
+
ret = !btree_node_is_extents(b)
? bch2_insert_fixup_key(trans, insert)
: bch2_insert_fixup_extent(trans, insert);
@@ -2383,8 +2451,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_interior_update_will_free_node(c, as, b);
- n = bch2_btree_node_alloc_replacement(c, b, reserve);
- list_add(&n->reachable, &as->reachable_list);
+ n = bch2_btree_node_alloc_replacement(c, b, as, reserve);
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->lock);
@@ -2464,3 +2531,140 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
closure_sync(&cl);
return ret;
}
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
+ struct bkey_i_extent *new_key)
+{
+ struct btree_interior_update *as;
+ struct btree_reserve *reserve = NULL;
+ struct btree *parent, *new_hash = NULL;
+ struct btree_iter iter;
+ struct closure cl;
+ bool must_rewrite_parent = false;
+ int ret;
+
+ __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH,
+ b->level, 0);
+ closure_init_stack(&cl);
+
+ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ /* bch2_btree_reserve_get will unlock */
+ do {
+ ret = bch2_btree_node_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret == -EAGAIN);
+
+ BUG_ON(ret);
+
+ new_hash = bch2_btree_node_mem_alloc(c);
+ }
+retry:
+ reserve = bch2_btree_reserve_get(c, b, 0,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE,
+ &cl);
+ closure_sync(&cl);
+ if (IS_ERR(reserve)) {
+ ret = PTR_ERR(reserve);
+ if (ret == -EAGAIN || ret == -EINTR)
+ goto retry;
+ goto err;
+ }
+
+ down_read(&c->gc_lock);
+
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto err;
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /*
+ * Two corner cases that need to be thought about here:
+ *
+ * @b may not be reachable yet - there might be another interior update
+ * operation waiting on @b to be written, and we're gonna deliver the
+ * write completion to that interior update operation _before_
+ * persisting the new_key update
+ *
+ * That ends up working without us having to do anything special here:
+ * the reason is, we do kick off (and do the in memory updates) for the
+ * update for @new_key before we return, creating a new interior_update
+ * operation here.
+ *
+ * The new interior update operation here will in effect override the
+ * previous one. The previous one was going to terminate - make @b
+ * reachable - in one of two ways:
+ * - updating the btree root pointer
+ * In that case,
+ * no, this doesn't work. argh.
+ */
+
+ if (b->will_make_reachable)
+ must_rewrite_parent = true;
+
+ /* other case: btree node being freed */
+ if (iter.nodes[b->level] != b) {
+ /* node has been freed: */
+ BUG_ON(btree_node_hashed(b));
+ mutex_unlock(&c->btree_interior_update_lock);
+ goto err;
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+ if (ret)
+ goto err;
+
+ as = bch2_btree_interior_update_alloc(c);
+
+ if (must_rewrite_parent)
+ as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
+
+ bch2_btree_interior_update_add_node_reference(c, as, b);
+
+ if (new_hash) {
+ bkey_copy(&new_hash->key, &new_key->k_i);
+ BUG_ON(bch2_btree_node_hash_insert(c, new_hash,
+ b->level, b->btree_id));
+ }
+
+ parent = iter.nodes[b->level + 1];
+ if (parent) {
+ bch2_btree_insert_node(parent, &iter,
+ &keylist_single(&b->key),
+ reserve, as);
+ } else {
+ bch2_btree_set_root(&iter, b, as, reserve);
+ }
+
+ if (new_hash) {
+ mutex_lock(&c->btree_cache_lock);
+ bch2_btree_node_hash_remove(c, b);
+
+ bkey_copy(&b->key, &new_key->k_i);
+ __bch2_btree_node_hash_insert(c, b);
+
+ bch2_btree_node_hash_remove(c, new_hash);
+ mutex_unlock(&c->btree_cache_lock);
+ } else {
+ bkey_copy(&b->key, &new_key->k_i);
+ }
+err:
+ if (!IS_ERR_OR_NULL(reserve))
+ bch2_btree_reserve_put(c, reserve);
+ if (new_hash) {
+ mutex_lock(&c->btree_cache_lock);
+ list_move(&b->list, &c->btree_cache_freeable);
+ mutex_unlock(&c->btree_cache_lock);
+
+ six_unlock_write(&new_hash->lock);
+ six_unlock_intent(&new_hash->lock);
+ }
+ bch2_btree_iter_unlock(&iter);
+ up_read(&c->gc_lock);
+ return ret;
+}
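A condensed view, not code from this commit, of the bookkeeping that replaces the old reachable lists: an interior update records the nodes it will make reachable, and each such node points back at its update — the relationship the will_make_reachable checks above rely on:

struct btree_interior_update {
	/* ... */
	struct btree	*new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
	unsigned	nr_new_nodes;
};

struct btree {
	/* ... */
	struct btree_interior_update *will_make_reachable;	/* back-pointer */
};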
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index b5cfa89..086077f 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -76,6 +76,9 @@ struct btree_interior_update {
BTREE_INTERIOR_UPDATING_AS,
} mode;
+ unsigned flags;
+ struct btree_reserve *reserve;
+
/*
* BTREE_INTERIOR_UPDATING_NODE:
* The update that made the new nodes visible was a regular update to an
@@ -86,7 +89,6 @@ struct btree_interior_update {
*/
struct btree *b;
struct list_head write_blocked_list;
- struct list_head reachable_list;
/*
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
@@ -117,6 +119,10 @@ struct btree_interior_update {
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
unsigned nr_pending;
+ /* New nodes, that will be made reachable by this update: */
+ struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
+ unsigned nr_new_nodes;
+
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*
@@ -127,6 +133,8 @@ struct btree_interior_update {
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
+#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0)
+
#define for_each_pending_btree_node_free(c, as, p) \
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
@@ -138,6 +146,7 @@ void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *);
struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *,
struct btree *,
struct bkey_format,
+ struct btree_interior_update *,
struct btree_reserve *);
struct btree_interior_update *
@@ -426,6 +435,8 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
+ struct bkey_i_extent *);
#endif /* _BCACHE_BTREE_INSERT_H */
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 74d54ab..1b0e3da 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -153,6 +153,37 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
+/* Doesn't cleanup redundant crcs */
+void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+ EBUG_ON(ptr < &e.v->start->ptr ||
+ ptr >= &extent_entry_last(e)->ptr);
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+ memmove_u64s_down(ptr, ptr + 1,
+ (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
+ e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+ __bch2_extent_drop_ptr(e, ptr);
+ bch2_extent_drop_redundant_crcs(e);
+}
+
+void bch2_extent_drop_ptr_idx(struct bkey_s_extent e, unsigned idx)
+{
+ struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ extent_for_each_ptr(e, ptr)
+ if (i++ == idx)
+ goto found;
+
+ BUG();
+found:
+ bch2_extent_drop_ptr(e, ptr);
+}
+
/* returns true if equal */
static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
{
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 3a95248..3dc06cb 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -552,24 +552,9 @@ static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
void bch2_extent_narrow_crcs(struct bkey_s_extent);
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
-/* Doesn't cleanup redundant crcs */
-static inline void __bch2_extent_drop_ptr(struct bkey_s_extent e,
- struct bch_extent_ptr *ptr)
-{
- EBUG_ON(ptr < &e.v->start->ptr ||
- ptr >= &extent_entry_last(e)->ptr);
- EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
- memmove_u64s_down(ptr, ptr + 1,
- (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
- e.k->u64s -= sizeof(*ptr) / sizeof(u64);
-}
-
-static inline void bch2_extent_drop_ptr(struct bkey_s_extent e,
- struct bch_extent_ptr *ptr)
-{
- __bch2_extent_drop_ptr(e, ptr);
- bch2_extent_drop_redundant_crcs(e);
-}
+void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
+void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
+void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 803611d..079f958 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -871,9 +871,8 @@ static void bch2_writepage_io_free(struct closure *cl)
{
struct bch_writepage_io *io = container_of(cl,
struct bch_writepage_io, cl);
- struct bio *bio = &io->bio.bio;
- bio_put(bio);
+ bio_put(&io->op.op.wbio.bio);
}
static void bch2_writepage_io_done(struct closure *cl)
@@ -881,7 +880,7 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io *io = container_of(cl,
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.op.c;
- struct bio *bio = &io->bio.bio;
+ struct bio *bio = &io->op.op.wbio.bio;
struct bio_vec *bvec;
unsigned i;
@@ -940,11 +939,12 @@ static void bch2_writepage_io_done(struct closure *cl)
static void bch2_writepage_do_io(struct bch_writepage_state *w)
{
struct bch_writepage_io *io = w->io;
+ struct bio *bio = &io->op.op.wbio.bio;
w->io = NULL;
- atomic_add(io->bio.bio.bi_vcnt, &io->op.op.c->writeback_pages);
+ atomic_add(bio->bi_vcnt, &io->op.op.c->writeback_pages);
- io->op.op.pos.offset = io->bio.bio.bi_iter.bi_sector;
+ io->op.op.pos.offset = bio->bi_iter.bi_sector;
closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
continue_at(&io->cl, bch2_writepage_io_done, NULL);
@@ -970,13 +970,13 @@ alloc_io:
w->io = container_of(bio_alloc_bioset(GFP_NOFS,
BIO_MAX_PAGES,
bch2_writepage_bioset),
- struct bch_writepage_io, bio.bio);
+ struct bch_writepage_io, op.op.wbio.bio);
closure_init(&w->io->cl, NULL);
w->io->op.ei = ei;
w->io->op.sectors_added = 0;
w->io->op.is_dio = false;
- bch2_write_op_init(&w->io->op.op, c, &w->io->bio,
+ bch2_write_op_init(&w->io->op.op, c,
(struct disk_reservation) {
.nr_replicas = c->opts.data_replicas,
},
@@ -987,7 +987,7 @@ alloc_io:
}
if (w->io->op.op.res.nr_replicas != nr_replicas ||
- bio_add_page_contig(&w->io->bio.bio, page)) {
+ bio_add_page_contig(&w->io->op.op.wbio.bio, page)) {
bch2_writepage_do_io(w);
goto alloc_io;
}
@@ -1038,7 +1038,7 @@ do_io:
w->io->op.new_i_size = i_size;
if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->bio.bio.bi_opf |= REQ_SYNC;
+ w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
/* Before unlocking the page, transfer reservation to w->io: */
old = page_state_cmpxchg(page_state(page), new, {
@@ -1110,7 +1110,7 @@ get_pages:
done_index = page->index;
if (w.io &&
- !bio_can_add_page_contig(&w.io->bio.bio, page))
+ !bio_can_add_page_contig(&w.io->op.op.wbio.bio, page))
bch2_writepage_do_io(&w);
if (!w.io &&
@@ -1495,7 +1495,7 @@ static long __bch2_dio_write_complete(struct dio_write *dio)
if (dio->iovec && dio->iovec != dio->inline_vecs)
kfree(dio->iovec);
- bio_put(&dio->bio.bio);
+ bio_put(&dio->iop.op.wbio.bio);
return ret;
}
@@ -1517,11 +1517,11 @@ static void bch2_dio_write_done(struct dio_write *dio)
if (dio->iop.op.error)
dio->error = dio->iop.op.error;
- bio_for_each_segment_all(bv, &dio->bio.bio, i)
+ bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i)
put_page(bv->bv_page);
if (dio->iter.count)
- bio_reset(&dio->bio.bio);
+ bio_reset(&dio->iop.op.wbio.bio);
}
static void bch2_do_direct_IO_write(struct dio_write *dio)
@@ -1529,7 +1529,7 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
struct file *file = dio->req->ki_filp;
struct inode *inode = file->f_inode;
struct bch_inode_info *ei = to_bch_ei(inode);
- struct bio *bio = &dio->bio.bio;
+ struct bio *bio = &dio->iop.op.wbio.bio;
unsigned flags = 0;
int ret;
@@ -1537,8 +1537,6 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
!dio->c->opts.journal_flush_disabled)
flags |= BCH_WRITE_FLUSH;
- bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9;
-
ret = bio_iov_iter_get_pages(bio, &dio->iter);
if (ret < 0) {
/*
@@ -1555,10 +1553,9 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
dio->iop.sectors_added = 0;
dio->iop.is_dio = true;
dio->iop.new_i_size = U64_MAX;
- bch2_write_op_init(&dio->iop.op, dio->c, &dio->bio,
- dio->res,
+ bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
foreground_write_point(dio->c, inode->i_ino),
- POS(inode->i_ino, bio->bi_iter.bi_sector),
+ POS(inode->i_ino, (dio->offset + dio->written) >> 9),
&ei->journal_seq, flags);
dio->iop.op.index_update_fn = bchfs_write_index_update;
@@ -1619,7 +1616,7 @@ static int bch2_direct_IO_write(struct bch_fs *c, struct kiocb *req,
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
bch2_dio_write_bioset);
- dio = container_of(bio, struct dio_write, bio.bio);
+ dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
dio->req = req;
dio->c = c;
dio->written = 0;
diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h
index 3fcc1e7..252a403 100644
--- a/libbcachefs/fs-io.h
+++ b/libbcachefs/fs-io.h
@@ -46,16 +46,16 @@ struct bchfs_write_op {
s64 sectors_added;
bool is_dio;
u64 new_i_size;
+
+ /* must be last: */
struct bch_write_op op;
};
struct bch_writepage_io {
struct closure cl;
+ /* must be last: */
struct bchfs_write_op op;
-
- /* must come last: */
- struct bch_write_bio bio;
};
extern struct bio_set *bch2_writepage_bioset;
@@ -76,10 +76,8 @@ struct dio_write {
struct mm_struct *mm;
- struct bchfs_write_op iop;
-
/* must be last: */
- struct bch_write_bio bio;
+ struct bchfs_write_op iop;
};
extern struct bio_set *bch2_dio_write_bioset;
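With the bch_write_bio now embedded at the tail of bch_write_op (see the io_types.h hunk below) rather than carried as a separate member, a single container_of() walk recovers the outer structure from the bio the block layer completes. A minimal sketch, assuming the dio_write layout above; the helper name is hypothetical:

static inline struct dio_write *dio_write_from_bio(struct bio *bio)
{
	/* bio is the one embedded via dio_write -> bchfs_write_op ->
	 * bch_write_op -> bch_write_bio, so one container_of() with
	 * the full member path suffices: */
	return container_of(bio, struct dio_write, iop.op.wbio.bio);
}
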
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 201cdfc..6c9792e 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1458,7 +1458,7 @@ int __init bch2_vfs_init(void)
goto err;
bch2_writepage_bioset =
- bioset_create(4, offsetof(struct bch_writepage_io, bio.bio));
+ bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio));
if (!bch2_writepage_bioset)
goto err;
@@ -1466,7 +1466,8 @@ int __init bch2_vfs_init(void)
if (!bch2_dio_read_bioset)
goto err;
- bch2_dio_write_bioset = bioset_create(4, offsetof(struct dio_write, bio.bio));
+ bch2_dio_write_bioset =
+ bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio));
if (!bch2_dio_write_bioset)
goto err;
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 54b523d..78cdaa3 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -92,12 +92,10 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
struct bch_dev *ca;
+ unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges);
- wbio->split = false;
- wbio->c = c;
-
extent_for_each_ptr(e, ptr) {
ca = c->devs[ptr->dev];
@@ -107,24 +105,26 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
- n->c = c;
- n->orig = &wbio->bio;
- n->bounce = false;
+ n->parent = wbio;
n->split = true;
+ n->bounce = false;
n->put_bio = true;
n->bio.bi_opf = wbio->bio.bi_opf;
- __bio_inc_remaining(n->orig);
+ __bio_inc_remaining(&wbio->bio);
} else {
n = wbio;
+ n->split = false;
}
- if (!journal_flushes_device(ca))
- n->bio.bi_opf |= REQ_FUA;
-
+ n->c = c;
n->ca = ca;
+ n->ptr_idx = ptr_idx++;
n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset;
+ if (!journal_flushes_device(ca))
+ n->bio.bi_opf |= REQ_FUA;
+
if (likely(percpu_ref_tryget(&ca->io_ref))) {
n->have_io_ref = true;
n->bio.bi_bdev = ca->disk_sb.bdev;
@@ -250,10 +250,9 @@ static void bch2_write_index(struct closure *cl)
static void bch2_write_discard(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->bio->bio;
struct bpos end = op->pos;
- end.offset += bio_sectors(bio);
+ end.offset += bio_sectors(&op->wbio.bio);
op->error = bch2_discard(op->c, op->pos, end, op->version,
&op->res, NULL, NULL);
@@ -308,31 +307,28 @@ static void bch2_write_io_error(struct closure *cl)
static void bch2_write_endio(struct bio *bio)
{
- struct closure *cl = bio->bi_private;
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_fs *c = wbio->c;
- struct bio *orig = wbio->orig;
- struct bch_dev *ca = wbio->ca;
+ struct closure *cl = bio->bi_private;
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_fs *c = wbio->c;
+ struct bch_dev *ca = wbio->ca;
if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca,
- "data write"))
+ "data write"))
set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
- if (bio->bi_error && orig)
- orig->bi_error = bio->bi_error;
-
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
if (wbio->put_bio)
bio_put(bio);
- if (orig)
- bio_endio(orig);
+ if (parent)
+ bio_endio(&parent->bio);
else
closure_put(cl);
}
@@ -380,11 +376,10 @@ static void init_append_extent(struct bch_write_op *op,
bch2_keylist_push(&op->insert_keys);
}
-static int bch2_write_extent(struct bch_write_op *op,
- struct open_bucket *ob,
- struct bio *orig)
+static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
{
struct bch_fs *c = op->c;
+ struct bio *orig = &op->wbio.bio;
struct bio *bio;
struct bch_write_bio *wbio;
unsigned key_to_write_offset = op->insert_keys.top_p -
@@ -392,11 +387,13 @@ static int bch2_write_extent(struct bch_write_op *op,
struct bkey_i *key_to_write;
unsigned csum_type = op->csum_type;
unsigned compression_type = op->compression_type;
- int ret;
+ int ret, more;
/* don't refetch csum type/compression type */
barrier();
+ BUG_ON(!bio_sectors(orig));
+
/* Need to decompress data? */
if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
(crc_uncompressed_size(NULL, &op->crc) != op->size ||
@@ -421,11 +418,8 @@ static int bch2_write_extent(struct bch_write_op *op,
ob);
bio = orig;
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = false;
- wbio->put_bio = false;
- ret = 0;
+ wbio = wbio_init(bio);
+ more = 0;
} else if (csum_type != BCH_CSUM_NONE ||
compression_type != BCH_COMPRESSION_NONE) {
/* all units here in bytes */
@@ -439,19 +433,18 @@ static int bch2_write_extent(struct bch_write_op *op,
bio = bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(output_available, PAGE_SIZE),
&c->bio_write);
+ wbio = wbio_init(bio);
+ wbio->bounce = true;
+ wbio->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ wbio->bio.bi_opf = orig->bi_opf;
+
/*
* XXX: can't use mempool for more than
* BCH_COMPRESSED_EXTENT_MAX worth of pages
*/
bch2_bio_alloc_pages_pool(c, bio, output_available);
- /* copy WRITE_SYNC flag */
- bio->bi_opf = orig->bi_opf;
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = true;
- wbio->put_bio = true;
-
do {
unsigned fragment_compression_type = compression_type;
size_t dst_len, src_len;
@@ -504,45 +497,43 @@ static int bch2_write_extent(struct bch_write_op *op,
mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
&c->bio_bounce_pages);
- ret = orig->bi_iter.bi_size != 0;
+ more = orig->bi_iter.bi_size != 0;
} else {
bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
&c->bio_write);
-
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = false;
+ wbio = wbio_init(bio);
wbio->put_bio = bio != orig;
init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
compression_type, 0,
(struct bch_csum) { 0 }, csum_type, ob);
- ret = bio != orig;
+ more = bio != orig;
}
+ /* might have done a realloc... */
+
+ key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
+
+ ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
+ BCH_DATA_USER);
+ if (ret)
+ return ret;
+
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
closure_get(bio->bi_private);
- /* might have done a realloc... */
-
- key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
- bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
- BCH_DATA_USER);
-
bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
- return ret;
+ return more;
}
static void __bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
- struct bio *bio = &op->bio->bio;
unsigned open_bucket_nr = 0;
struct open_bucket *b;
int ret;
@@ -550,22 +541,12 @@ static void __bch2_write(struct closure *cl)
memset(op->open_buckets, 0, sizeof(op->open_buckets));
if (op->flags & BCH_WRITE_DISCARD) {
- op->flags |= BCH_WRITE_DONE;
bch2_write_discard(cl);
- bio_put(bio);
+ op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_done, index_update_wq(op));
}
- /*
- * Journal writes are marked REQ_PREFLUSH; if the original write was a
- * flush, it'll wait on the journal write.
- */
- bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
do {
- EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset);
- EBUG_ON(!bio_sectors(bio));
-
if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
continue_at(cl, bch2_write_index, index_update_wq(op));
@@ -622,7 +603,7 @@ static void __bch2_write(struct closure *cl)
b - c->open_buckets > U8_MAX);
op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
- ret = bch2_write_extent(op, b, bio);
+ ret = bch2_write_extent(op, b);
bch2_alloc_sectors_done(c, op->wp, b);
@@ -703,16 +684,13 @@ void bch2_wake_delayed_writes(unsigned long data)
* after the data is written it calls bch_journal, and after the keys have been
* added to the next journal write they're inserted into the btree.
*
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->wbio and op->pos.
*/
void bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->bio->bio;
+ struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
u64 inode = op->pos.inode;
@@ -742,7 +720,7 @@ void bch2_write(struct closure *cl)
spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
bch2_ratelimit_increment(&c->foreground_write_pd.rate,
- bio->bi_iter.bi_size);
+ bio->bi_iter.bi_size);
delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate);
@@ -776,15 +754,14 @@ void bch2_write(struct closure *cl)
}
void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_write_bio *bio, struct disk_reservation res,
- struct write_point *wp, struct bpos pos,
- u64 *journal_seq, unsigned flags)
+ struct disk_reservation res,
+ struct write_point *wp, struct bpos pos,
+ u64 *journal_seq, unsigned flags)
{
EBUG_ON(res.sectors && !res.nr_replicas);
op->c = c;
op->io_wq = index_update_wq(op);
- op->bio = bio;
op->written = 0;
op->error = 0;
op->flags = flags;
@@ -983,7 +960,7 @@ static void cache_promote_done(struct closure *cl)
struct cache_promote_op *op =
container_of(cl, struct cache_promote_op, cl);
- bch2_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio);
+ bch2_bio_free_pages_pool(op->write.op.c, &op->write.op.wbio.bio);
kfree(op);
}
@@ -1020,7 +997,7 @@ static void __bch2_read_endio(struct work_struct *work)
trace_promote(&rbio->bio);
/* we now own pages: */
- swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
+ swap(promote->write.op.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL;
bch2_rbio_done(rbio);
@@ -1112,7 +1089,7 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
promote_op = kmalloc(sizeof(*promote_op) +
sizeof(struct bio_vec) * pages, GFP_NOIO);
if (promote_op) {
- struct bio *promote_bio = &promote_op->write.wbio.bio;
+ struct bio *promote_bio = &promote_op->write.op.wbio.bio;
bio_init(promote_bio,
promote_bio->bi_inline_vecs,
@@ -1204,7 +1181,7 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
rbio->bio.bi_end_io = bch2_read_endio;
if (promote_op) {
- struct bio *promote_bio = &promote_op->write.wbio.bio;
+ struct bio *promote_bio = &promote_op->write.op.wbio.bio;
promote_bio->bi_iter = rbio->bio.bi_iter;
memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
@@ -1367,12 +1344,11 @@ void bch2_read_retry_work(struct work_struct *work)
read_retry_work);
struct bch_read_bio *rbio;
struct bio *bio;
- unsigned long flags;
while (1) {
- spin_lock_irqsave(&c->read_retry_lock, flags);
+ spin_lock_irq(&c->read_retry_lock);
bio = bio_list_pop(&c->read_retry_list);
- spin_unlock_irqrestore(&c->read_retry_lock, flags);
+ spin_unlock_irq(&c->read_retry_lock);
if (!bio)
break;
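The submit/endio hunks above replace the old orig-bio pointer with an explicit parent chain: each per-replica clone gets n->parent = wbio plus n->split = true, and __bio_inc_remaining() on the parent keeps the parent's completion from running until every clone has finished. A sketch of the resulting completion walk, with a hypothetical helper name; this mirrors the tail of bch2_write_endio() above:

static void wbio_complete(struct bch_write_bio *wbio, struct closure *cl)
{
	if (wbio->split)
		/* drops one remaining-count on the parent; the parent's
		 * own endio (this same path) runs once all clones did */
		bio_endio(&wbio->parent->bio);
	else
		/* top of the chain: release the write op's closure ref */
		closure_put(cl);
}
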
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index fb6f300..619bf56 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -41,11 +41,18 @@ static inline struct write_point *foreground_write_point(struct bch_fs *c,
}
void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
- struct bch_write_bio *,
struct disk_reservation, struct write_point *,
struct bpos, u64 *, unsigned);
void bch2_write(struct closure *);
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+ struct bch_write_bio *wbio = to_wbio(bio);
+
+ memset(wbio, 0, offsetof(struct bch_write_bio, bio));
+ return wbio;
+}
+
struct cache_promote_op;
struct extent_pick_ptr;
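wbio_init() zeroes only the bookkeeping fields in front of the embedded bio (c, ca, parent/cl, the flag bits), leaving struct bio itself alone since bio_alloc_bioset()/bio_init() already set it up. A usage sketch mirroring the bounce path of bch2_write_extent() above; c and nr_pages are assumed in scope:

	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_pages, &c->bio_write);
	struct bch_write_bio *wbio = wbio_init(bio);

	/* every field before the bio starts out zero; set what you need */
	wbio->bounce	= true;
	wbio->put_bio	= true;
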
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index d104cb7..3b73bcf 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -66,37 +66,30 @@ struct bch_write_bio {
struct bch_fs *c;
struct bch_dev *ca;
union {
- struct bio *orig;
- struct closure *cl;
+ struct bch_write_bio *parent;
+ struct closure *cl;
};
- unsigned submit_time_us;
+ u8 ptr_idx;
+ u8 replicas_failed;
+ u8 order;
+
unsigned split:1,
bounce:1,
put_bio:1,
- have_io_ref:1;
+ have_io_ref:1,
+ used_mempool:1;
- /* Only for btree writes: */
- unsigned used_mempool:1;
- u8 order;
+ unsigned submit_time_us;
+ void *data;
struct bio bio;
};
-struct bch_replace_info {
- struct extent_insert_hook hook;
- /* How many insertions succeeded */
- unsigned successes;
- /* How many insertions failed */
- unsigned failures;
- BKEY_PADDED(key);
-};
-
struct bch_write_op {
struct closure cl;
- struct bch_fs *c;
+ struct bch_fs *c;
struct workqueue_struct *io_wq;
- struct bch_write_bio *bio;
unsigned written; /* sectors */
@@ -141,6 +134,9 @@ struct bch_write_op {
struct keylist insert_keys;
u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+
+ /* Must be last: */
+ struct bch_write_bio wbio;
};
#endif /* _BCACHE_IO_TYPES_H */
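The "Must be last" comment is load-bearing twice over. First, the biosets are created with front_pad == offsetof(..., wbio.bio), so bio_alloc_bioset() hands back a bio with a fully usable bch_write_op sitting in front of it, and container_of() on completion must subtract that same offset. Second, a bio allocated from a bioset carries its inline bvecs immediately after struct bio, i.e. past the end of the containing struct, so no member may follow wbio. A sketch under those assumptions, with a hypothetical bioset and error handling elided:

	struct bio_set *ws = bioset_create(4, offsetof(struct bch_write_op, wbio.bio));
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, ws);
	struct bch_write_op *op = container_of(bio, struct bch_write_op, wbio.bio);
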
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index b0011b4..bf8c152 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -53,15 +53,15 @@ static inline u64 journal_pin_seq(struct journal *j,
return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
}
-static inline void bch2_journal_add_entry(struct journal_buf *buf,
- const void *data, size_t u64s,
- unsigned type, enum btree_id id,
- unsigned level)
+static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf,
+ unsigned type, enum btree_id id,
+ unsigned level,
+ const void *data, size_t u64s)
{
struct jset *jset = buf->data;
- bch2_journal_add_entry_at(buf, data, u64s, type, id, level,
- le32_to_cpu(jset->u64s));
+ bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s),
+ type, id, level, data, u64s);
le32_add_cpu(&jset->u64s, jset_u64s(u64s));
}
@@ -97,8 +97,9 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
enum btree_id id, struct bkey_i *k,
unsigned level)
{
- bch2_journal_add_entry(buf, k, k->k.u64s,
- JOURNAL_ENTRY_BTREE_ROOT, id, level);
+ bch2_journal_add_entry_noreservation(buf,
+ JOURNAL_ENTRY_BTREE_ROOT, id, level,
+ k, k->k.u64s);
}
static void journal_seq_blacklist_flush(struct journal *j,
@@ -416,13 +417,8 @@ static void journal_entry_null_range(void *start, void *end)
{
struct jset_entry *entry;
- for (entry = start; entry != end; entry = vstruct_next(entry)) {
- entry->u64s = 0;
- entry->btree_id = 0;
- entry->level = 0;
- entry->flags = 0;
- SET_JOURNAL_ENTRY_TYPE(entry, 0);
- }
+ for (entry = start; entry != end; entry = vstruct_next(entry))
+ memset(entry, 0, sizeof(*entry));
}
static int journal_validate_key(struct bch_fs *c, struct jset *j,
@@ -514,7 +510,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
break;
}
- switch (JOURNAL_ENTRY_TYPE(entry)) {
+ switch (entry->type) {
case JOURNAL_ENTRY_BTREE_KEYS:
vstruct_for_each(entry, k) {
ret = journal_validate_key(c, j, entry, k,
@@ -555,8 +551,8 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
break;
default:
- journal_entry_err(c, "invalid journal entry type %llu",
- JOURNAL_ENTRY_TYPE(entry));
+ journal_entry_err(c, "invalid journal entry type %u",
+ entry->type);
journal_entry_null_range(entry, vstruct_next(entry));
break;
}
@@ -1426,9 +1422,9 @@ void bch2_journal_start(struct bch_fs *c)
*/
list_for_each_entry(bl, &j->seq_blacklist, list)
if (!bl->written) {
- bch2_journal_add_entry(journal_cur_buf(j), &bl->seq, 1,
+ bch2_journal_add_entry_noreservation(journal_cur_buf(j),
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
- 0, 0);
+ 0, 0, &bl->seq, 1);
journal_pin_add_entry(j,
&fifo_peek_back(&j->pin),
@@ -2083,8 +2079,8 @@ static void journal_write_compact(struct jset *jset)
if (prev &&
i->btree_id == prev->btree_id &&
i->level == prev->level &&
- JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
- JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
+ i->type == prev->type &&
+ i->type == JOURNAL_ENTRY_BTREE_KEYS &&
le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
memmove_u64s_down(vstruct_next(prev),
i->_data,
@@ -2238,8 +2234,9 @@ static void journal_write(struct closure *cl)
closure_return_with_destructor(cl, journal_write_done);
}
- bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
- BCH_DATA_JOURNAL);
+ if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
+ BCH_DATA_JOURNAL))
+ goto err;
/*
* XXX: we really should just disable the entire journal in nochanges
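The switch from JOURNAL_ENTRY_TYPE(entry) to entry->type reads on the jset_entry header having become a plain struct with a one-byte type field. The layout sketched below is an assumption taken from bcachefs_format.h in this tree, not part of this diff; it is what makes the single memset() in journal_entry_null_range() equivalent to the old field-by-field nulling:

struct jset_entry {
	__le16		u64s;
	__u8		btree_id;
	__u8		level;
	__u8		type;	/* was a packed bitfield, JOURNAL_ENTRY_TYPE() */
	__u8		pad[3];
	__le64		_data[0];
};
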
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 88a9bd1..d785a0c 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -125,7 +125,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
- if (JOURNAL_ENTRY_TYPE(entry) == type)
+ if (entry->type == type)
return entry;
entry = vstruct_next(entry);
@@ -187,8 +187,12 @@ static inline void journal_state_inc(union journal_res_state *s)
s->buf1_count += s->idx == 1;
}
-static inline void bch2_journal_set_has_inode(struct journal_buf *buf, u64 inum)
+static inline void bch2_journal_set_has_inode(struct journal *j,
+ struct journal_res *res,
+ u64 inum)
{
+ struct journal_buf *buf = &j->buf[res->idx];
+
set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode);
}
@@ -202,40 +206,46 @@ static inline unsigned jset_u64s(unsigned u64s)
}
static inline void bch2_journal_add_entry_at(struct journal_buf *buf,
- const void *data, size_t u64s,
+ unsigned offset,
unsigned type, enum btree_id id,
- unsigned level, unsigned offset)
+ unsigned level,
+ const void *data, size_t u64s)
{
struct jset_entry *entry = vstruct_idx(buf->data, offset);
- entry->u64s = cpu_to_le16(u64s);
+ memset(entry, 0, sizeof(*entry));
+ entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
- entry->level = level;
- entry->flags = 0;
- SET_JOURNAL_ENTRY_TYPE(entry, type);
+ entry->level = level;
+ entry->type = type;
memcpy_u64s(entry->_data, data, u64s);
}
-static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
- enum btree_id id, const struct bkey_i *k)
+static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+ unsigned type, enum btree_id id,
+ unsigned level,
+ const void *data, unsigned u64s)
{
struct journal_buf *buf = &j->buf[res->idx];
- unsigned actual = jset_u64s(k->k.u64s);
+ unsigned actual = jset_u64s(u64s);
EBUG_ON(!res->ref);
BUG_ON(actual > res->u64s);
- bch2_journal_set_has_inode(buf, k->k.p.inode);
-
- bch2_journal_add_entry_at(buf, k, k->k.u64s,
- JOURNAL_ENTRY_BTREE_KEYS, id,
- 0, res->offset);
-
+ bch2_journal_add_entry_at(buf, res->offset, type,
+ id, level, data, u64s);
res->offset += actual;
res->u64s -= actual;
}
+static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
+ enum btree_id id, const struct bkey_i *k)
+{
+ bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS,
+ id, 0, k, k->k.u64s);
+}
+
void bch2_journal_buf_put_slowpath(struct journal *, bool);
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
@@ -272,13 +282,10 @@ static inline void bch2_journal_res_put(struct journal *j,
lock_release(&j->res_map, 0, _RET_IP_);
- while (res->u64s) {
- bch2_journal_add_entry_at(&j->buf[res->idx], NULL, 0,
- JOURNAL_ENTRY_BTREE_KEYS,
- 0, 0, res->offset);
- res->offset += jset_u64s(0);
- res->u64s -= jset_u64s(0);
- }
+ while (res->u64s)
+ bch2_journal_add_entry(j, res,
+ JOURNAL_ENTRY_BTREE_KEYS,
+ 0, 0, NULL, 0);
bch2_journal_buf_put(j, res->idx, false);
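For the reservation bookkeeping above, a worked example may help. It assumes jset_u64s(n) == n + 1, i.e. one u64 of jset_entry header per entry (the definition is elided from this hunk): adding a key with k.u64s == 3 advances res->offset by 4 and takes 4 u64s out of res->u64s, and bch2_journal_res_put() pads whatever is left with zero-length BTREE_KEYS entries, jset_u64s(0) == 1 u64 at a time. A hypothetical helper making the per-key cost explicit:

static inline unsigned journal_key_entry_u64s(const struct bkey_i *k)
{
	/* payload u64s plus the one-u64 jset_entry header (assumed) */
	return jset_u64s(k->k.u64s);
}
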
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index ba0cc0e..78f6d3c 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -128,9 +128,12 @@ int bch2_move_data_off_device(struct bch_dev *ca)
seen_key_count++;
continue;
next:
- if (bkey_extent_is_data(k.k))
- bch2_check_mark_super(c, bkey_s_c_to_extent(k),
- BCH_DATA_USER);
+ if (bkey_extent_is_data(k.k)) {
+ ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+ BCH_DATA_USER);
+ if (ret)
+ break;
+ }
bch2_btree_iter_advance_pos(&iter);
bch2_btree_iter_cond_resched(&iter);
@@ -386,9 +389,12 @@ int bch2_flag_data_bad(struct bch_dev *ca)
*/
continue;
advance:
- if (bkey_extent_is_data(k.k))
- bch2_check_mark_super(c, bkey_s_c_to_extent(k),
- BCH_DATA_USER);
+ if (bkey_extent_is_data(k.k)) {
+ ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+ BCH_DATA_USER);
+ if (ret)
+ break;
+ }
bch2_btree_iter_advance_pos(&iter);
}
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 8c9395d..8ef1a0b 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -155,11 +155,8 @@ void bch2_migrate_write_init(struct bch_fs *c,
(move_ptr && move_ptr->cached))
flags |= BCH_WRITE_CACHED;
- bch2_write_op_init(&m->op, c, &m->wbio,
- (struct disk_reservation) { 0 },
- wp,
- bkey_start_pos(k.k),
- NULL, flags);
+ bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, wp,
+ bkey_start_pos(k.k), NULL, flags);
if (m->move)
m->op.alloc_reserve = RESERVE_MOVINGGC;
@@ -194,7 +191,7 @@ static void moving_io_destructor(struct closure *cl)
atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
wake_up(&ctxt->wait);
- bio_for_each_segment_all(bv, &io->write.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
if (bv->bv_page)
__free_page(bv->bv_page);
@@ -307,9 +304,7 @@ int bch2_data_move(struct bch_fs *c,
return -ENOMEM;
}
- migrate_bio_init(io, &io->write.wbio.bio, k.k->size);
- bio_get(&io->write.wbio.bio);
- io->write.wbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
+ migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
bch2_migrate_write_init(c, &io->write, wp, k, move_ptr, 0);
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 548f0f0..094eac8 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -19,7 +19,6 @@ struct migrate_write {
bool move;
struct bch_extent_ptr move_ptr;
struct bch_write_op op;
- struct bch_write_bio wbio;
};
void bch2_migrate_write_init(struct bch_fs *,
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 1eae0fc..0ddfad3 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -783,6 +783,12 @@ out:
/* replica information: */
+static inline struct bch_replicas_cpu_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+ return (void *) r->entries + r->entry_size * i;
+}
+
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
@@ -794,6 +800,24 @@ replicas_entry_next(struct bch_replicas_entry *i)
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
+static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
+ unsigned dev)
+{
+ return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
+}
+
+static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
+ unsigned dev)
+{
+ e->devs[dev >> 3] |= 1 << (dev & 7);
+}
+
+static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
+{
+ return (r->entry_size -
+ offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
+}
+
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
@@ -879,6 +903,29 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
return 0;
}
+static void bkey_to_replicas(struct bkey_s_c_extent e,
+ enum bch_data_types data_type,
+ struct bch_replicas_cpu_entry *r,
+ unsigned *max_dev)
+{
+ const struct bch_extent_ptr *ptr;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_SB ||
+ data_type >= BCH_DATA_NR);
+
+ memset(r, 0, sizeof(*r));
+ r->data_type = data_type;
+
+ *max_dev = 0;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached) {
+ *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+ replicas_set_dev(r, ptr->dev);
+ }
+}
+
/*
* for when gc of replica information is in progress:
*/
@@ -887,14 +934,11 @@ static int bch2_update_gc_replicas(struct bch_fs *c,
struct bkey_s_c_extent e,
enum bch_data_types data_type)
{
- const struct bch_extent_ptr *ptr;
- struct bch_replicas_cpu_entry *new_e;
+ struct bch_replicas_cpu_entry new_e;
struct bch_replicas_cpu *new;
- unsigned i, nr, entry_size, max_dev = 0;
+ unsigned i, nr, entry_size, max_dev;
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached)
- max_dev = max_t(unsigned, max_dev, ptr->dev);
+ bkey_to_replicas(e, data_type, &new_e, &max_dev);
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
@@ -914,12 +958,9 @@ static int bch2_update_gc_replicas(struct bch_fs *c,
cpu_replicas_entry(gc_r, i),
gc_r->entry_size);
- new_e = cpu_replicas_entry(new, nr - 1);
- new_e->data_type = data_type;
-
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached)
- replicas_set_dev(new_e, ptr->dev);
+ memcpy(cpu_replicas_entry(new, nr - 1),
+ &new_e,
+ new->entry_size);
eytzinger0_sort(new->entries,
new->nr,
@@ -931,8 +972,38 @@ static int bch2_update_gc_replicas(struct bch_fs *c,
return 0;
}
-int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e,
- enum bch_data_types data_type)
+static bool replicas_has_extent(struct bch_replicas_cpu *r,
+ struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
+{
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+
+ bkey_to_replicas(e, data_type, &search, &max_dev);
+
+ return max_dev < replicas_dev_slots(r) &&
+ eytzinger0_find(r->entries, r->nr,
+ r->entry_size,
+ memcmp, &search) < r->nr;
+}
+
+bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = replicas_has_extent(rcu_dereference(c->replicas),
+ e, data_type);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+noinline
+static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+ struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
{
struct bch_replicas_cpu *gc_r;
const struct bch_extent_ptr *ptr;
@@ -996,6 +1067,25 @@ err:
return ret;
}
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+ enum bch_data_types data_type)
+{
+ struct bch_replicas_cpu *gc_r;
+ bool marked;
+
+ rcu_read_lock();
+ marked = replicas_has_extent(rcu_dereference(c->replicas),
+ e, data_type) &&
+ (!(gc_r = rcu_dereference(c->replicas_gc)) ||
+ replicas_has_extent(gc_r, e, data_type));
+ rcu_read_unlock();
+
+ if (marked)
+ return 0;
+
+ return bch2_check_mark_super_slowpath(c, e, data_type);
+}
+
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_dev *dev_to_offline)
{
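A worked example of the devs[] bitmap helpers moved in above: device 10 lands in byte 10 >> 3 == 1 at bit 10 & 7 == 2, so setting it yields e.devs[1] == 0x04. Because every entry is a fixed-size, zero-padded record, plain memcmp() remains a valid comparator for the eytzinger0_find() lookup in replicas_has_extent(). A sketch, assuming bch_replicas_cpu_entry has an addressable devs[] array as its stack use above implies:

	struct bch_replicas_cpu_entry e;

	memset(&e, 0, sizeof(e));
	replicas_set_dev(&e, 10);
	BUG_ON(e.devs[1] != 0x04 || !replicas_test_dev(&e, 10));
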
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 879fdda..65dd9fb 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -121,92 +121,10 @@ const char *bch2_read_super(struct bcache_superblock *,
struct bch_opts, const char *);
void bch2_write_super(struct bch_fs *);
-static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
- unsigned dev)
-{
- return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
-}
-
-static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
- unsigned dev)
-{
- e->devs[dev >> 3] |= 1 << (dev & 7);
-}
-
-static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
-{
- return (r->entry_size -
- offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
-}
-
-static inline struct bch_replicas_cpu_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
- return (void *) r->entries + r->entry_size * i;
-}
-
-int bch2_check_mark_super_slowpath(struct bch_fs *, struct bkey_s_c_extent,
- enum bch_data_types);
-
-static inline bool replicas_has_extent(struct bch_replicas_cpu *r,
- struct bkey_s_c_extent e,
- enum bch_data_types data_type)
-{
- const struct bch_extent_ptr *ptr;
- struct bch_replicas_cpu_entry search = {
- .data_type = data_type,
- };
- unsigned max_dev = 0;
-
- BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
- data_type >= BCH_DATA_NR);
-
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached) {
- max_dev = max_t(unsigned, max_dev, ptr->dev);
- replicas_set_dev(&search, ptr->dev);
- }
-
- return max_dev < replicas_dev_slots(r) &&
- eytzinger0_find(r->entries, r->nr,
- r->entry_size,
- memcmp, &search) < r->nr;
-}
-
-static inline bool bch2_sb_has_replicas(struct bch_fs *c,
- struct bkey_s_c_extent e,
- enum bch_data_types data_type)
-{
- bool ret;
-
- rcu_read_lock();
- ret = replicas_has_extent(rcu_dereference(c->replicas),
- e, data_type);
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline int bch2_check_mark_super(struct bch_fs *c,
- struct bkey_s_c_extent e,
- enum bch_data_types data_type)
-{
- struct bch_replicas_cpu *gc_r;
- bool marked;
-
- rcu_read_lock();
- marked = replicas_has_extent(rcu_dereference(c->replicas),
- e, data_type) &&
- (!(gc_r = rcu_dereference(c->replicas_gc)) ||
- replicas_has_extent(gc_r, e, data_type));
- rcu_read_unlock();
-
- if (marked)
- return 0;
-
- return bch2_check_mark_super_slowpath(c, e, data_type);
-}
+bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
+ enum bch_data_types);
+int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
+ enum bch_data_types);
struct replicas_status {
struct {
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 692eb41..c4cb0b2 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -517,10 +517,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->btree_interior_update_lock);
mutex_init(&c->bio_bounce_pages_lock);
+ mutex_init(&c->zlib_workspace_lock);
+
bio_list_init(&c->read_retry_list);
spin_lock_init(&c->read_retry_lock);
INIT_WORK(&c->read_retry_work, bch2_read_retry_work);
- mutex_init(&c->zlib_workspace_lock);
+
+ bio_list_init(&c->btree_write_error_list);
+ spin_lock_init(&c->btree_write_error_lock);
+ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
@@ -593,8 +598,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
PAGE_SECTORS, 0) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
- mempool_init_page_pool(&c->btree_bounce_pool, 1,
- ilog2(btree_pages(c))) ||
+ mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
@@ -1345,11 +1349,13 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
}
}
-static bool bch2_fs_may_start(struct bch_fs *c, int flags)
+static bool bch2_fs_may_start(struct bch_fs *c)
{
struct replicas_status s;
struct bch_sb_field_members *mi;
- unsigned i;
+ unsigned i, flags = c->opts.degraded
+ ? BCH_FORCE_IF_DEGRADED
+ : 0;
if (!c->opts.degraded) {
mutex_lock(&c->sb_lock);
@@ -1773,7 +1779,7 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
mutex_unlock(&c->sb_lock);
err = "insufficient devices";
- if (!bch2_fs_may_start(c, 0))
+ if (!bch2_fs_may_start(c))
goto err;
if (!c->opts.nostart) {
@@ -1844,7 +1850,7 @@ static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb,
}
mutex_unlock(&c->sb_lock);
- if (!c->opts.nostart && bch2_fs_may_start(c, 0)) {
+ if (!c->opts.nostart && bch2_fs_may_start(c)) {
err = __bch2_fs_start(c);
if (err)
goto err;
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 906e7a6..9a95854 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -577,3 +577,17 @@ void sort_cmp_size(void *base, size_t num, size_t size,
}
}
}
+
+void mempool_free_vp(void *element, void *pool_data)
+{
+ size_t size = (size_t) pool_data;
+
+ vpfree(element, size);
+}
+
+void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t) pool_data;
+
+ return vpmalloc(size, gfp_mask);
+}
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 68d9a86..a9a17d9 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -79,23 +79,43 @@ do { \
(__builtin_types_compatible_p(typeof(_val), _type) || \
__builtin_types_compatible_p(typeof(_val), const _type))
-static inline void kvpfree(void *p, size_t size)
+static inline void vpfree(void *p, size_t size)
{
- if (size < PAGE_SIZE)
- kfree(p);
- else if (is_vmalloc_addr(p))
+ if (is_vmalloc_addr(p))
vfree(p);
else
free_pages((unsigned long) p, get_order(size));
+}
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+{
+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+ get_order(size)) ?:
+ __vmalloc(size, gfp_mask, PAGE_KERNEL);
+}
+
+static inline void kvpfree(void *p, size_t size)
+{
+ if (size < PAGE_SIZE)
+ kfree(p);
+ else
+ vpfree(p, size);
}
static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
{
- return size < PAGE_SIZE ? kmalloc(size, gfp_mask)
- : (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
- get_order(size))
- ?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ return size < PAGE_SIZE
+ ? kmalloc(size, gfp_mask)
+ : vpmalloc(size, gfp_mask);
+}
+
+void mempool_free_vp(void *element, void *pool_data);
+void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data);
+
+static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size)
+{
+ return mempool_init(pool, min_nr, mempool_alloc_vp,
+ mempool_free_vp, (void *) size);
}
#define HEAP(type) \
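super.c above switches btree_bounce_pool over to this vp allocator; the element size rides along as the mempool's pool_data pointer and is cast back in mempool_alloc_vp()/mempool_free_vp(). A usage sketch with a hypothetical function; pool teardown is omitted since only part of this tree's mempool shim is visible here:

static int bounce_pool_example(struct bch_fs *c)
{
	mempool_t pool;
	void *buf;

	if (mempool_init_vp_pool(&pool, 1, btree_bytes(c)))
		return -ENOMEM;

	/* whole pages when the order is available, vmalloc otherwise */
	buf = mempool_alloc(&pool, GFP_NOIO);
	memset(buf, 0, btree_bytes(c));
	mempool_free(buf, &pool);
	return 0;
}
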
diff --git a/linux/sched.c b/linux/sched.c
index 11480f3..898ccb1 100644
--- a/linux/sched.c
+++ b/linux/sched.c
@@ -1,5 +1,6 @@
#include <string.h>
+#include <sys/mman.h>
#include <linux/math64.h>
#include <linux/printk.h>
@@ -163,6 +164,8 @@ static void sched_init(void)
{
struct task_struct *p = malloc(sizeof(*p));
+ mlockall(MCL_CURRENT|MCL_FUTURE);
+
memset(p, 0, sizeof(*p));
p->state = TASK_RUNNING;