author      Kent Overstreet <kent.overstreet@gmail.com>    2019-01-23 15:49:44 -0500
committer   Kent Overstreet <kent.overstreet@gmail.com>    2019-01-23 15:53:32 -0500
commit      35fca2f044d375b1590f499cfd34bef38ca0f8f1 (patch)
tree        c3906079fe4de190de30690bd2725e4bb0a28b0a
parent      1c50d258e3462cd0e0f76570685092910fc11873 (diff)
Update bcachefs sources to 99750eab4d bcachefs: Persist stripe blocks_used
-rw-r--r--  .bcachefs_revision                    2
-rw-r--r--  include/asm/page.h                    0
-rw-r--r--  include/linux/atomic.h               11
-rw-r--r--  include/linux/generic-radix-tree.h  127
-rw-r--r--  libbcachefs/alloc_background.c       69
-rw-r--r--  libbcachefs/alloc_background.h        1
-rw-r--r--  libbcachefs/alloc_foreground.c       18
-rw-r--r--  libbcachefs/alloc_foreground.h        1
-rw-r--r--  libbcachefs/alloc_types.h             5
-rw-r--r--  libbcachefs/bcachefs.h                9
-rw-r--r--  libbcachefs/bkey.c                    9
-rw-r--r--  libbcachefs/btree_gc.c               48
-rw-r--r--  libbcachefs/btree_gc.h                2
-rw-r--r--  libbcachefs/btree_update.h            3
-rw-r--r--  libbcachefs/btree_update_leaf.c       6
-rw-r--r--  libbcachefs/buckets.c               128
-rw-r--r--  libbcachefs/buckets.h                 6
-rw-r--r--  libbcachefs/chardev.c                 2
-rw-r--r--  libbcachefs/ec.c                    179
-rw-r--r--  libbcachefs/ec.h                     52
-rw-r--r--  libbcachefs/ec_types.h                7
-rw-r--r--  libbcachefs/extents.c                32
-rw-r--r--  libbcachefs/extents.h                 3
-rw-r--r--  libbcachefs/eytzinger.h              26
-rw-r--r--  libbcachefs/fs-io.c                  40
-rw-r--r--  libbcachefs/journal_io.c             20
-rw-r--r--  libbcachefs/journal_reclaim.c        10
-rw-r--r--  libbcachefs/migrate.c                11
-rw-r--r--  libbcachefs/move.c                   11
-rw-r--r--  libbcachefs/recovery.c                8
-rw-r--r--  libbcachefs/replicas.c              184
-rw-r--r--  libbcachefs/replicas.h               32
-rw-r--r--  libbcachefs/super.c                  53
-rw-r--r--  libbcachefs/sysfs.c                  42
-rw-r--r--  libbcachefs/util.c                   31
-rw-r--r--  libbcachefs/util.h                    2
-rw-r--r--  linux/generic-radix-tree.c          127
37 files changed, 916 insertions, 401 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 088f645c..8eca0593 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-bcca1c557b1897ecc3aeb1f89ab91865487d91ab
+99750eab4d583132cf61f071082c7cf21f5295c0
diff --git a/include/asm/page.h b/include/asm/page.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/include/asm/page.h
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 7471bd97..38a364c0 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -37,6 +37,7 @@ typedef struct {
#define xchg_acquire(p, v) uatomic_xchg(p, v)
#define cmpxchg(p, old, new) uatomic_cmpxchg(p, old, new)
#define cmpxchg_acquire(p, old, new) uatomic_cmpxchg(p, old, new)
+#define cmpxchg_release(p, old, new) uatomic_cmpxchg(p, old, new)
#define smp_mb__before_atomic() cmm_smp_mb__before_uatomic_add()
#define smp_mb__after_atomic() cmm_smp_mb__after_uatomic_add()
@@ -77,6 +78,16 @@ typedef struct {
__old; \
})
+#define cmpxchg_release(p, old, new) \
+({ \
+ typeof(*(p)) __old = (old); \
+ \
+ __atomic_compare_exchange_n((p), &__old, new, false, \
+ __ATOMIC_RELEASE, \
+ __ATOMIC_RELEASE); \
+ __old; \
+})
+
#define smp_mb__before_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define smp_mb__after_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define smp_wmb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 7f637e17..3a91130a 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -1,34 +1,60 @@
#ifndef _LINUX_GENERIC_RADIX_TREE_H
#define _LINUX_GENERIC_RADIX_TREE_H
-/*
- * Generic radix trees/sparse arrays:
+/**
+ * DOC: Generic radix trees/sparse arrays:
+ *
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * PAGE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ *
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ *
+ * - genradix_init(radix) - initialize an empty genradix
+ *
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ * reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ * NULL if that entry does not exist
*
- * A generic radix tree has all nodes of size PAGE_SIZE - both leaves and
- * interior nodes.
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ * allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one page of entries at a time, so entries may exist
+ * that were never explicitly allocated - they will be initialized to all
+ * zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
*/
+#include <asm/page.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>
-struct genradix_node;
+struct genradix_root;
struct __genradix {
- struct genradix_node *root;
- size_t depth;
+ struct genradix_root __rcu *root;
};
/*
- * NOTE: currently, sizeof(_type) must be a power of two and not larger than
- * PAGE_SIZE:
+ * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
*/
#define __GENRADIX_INITIALIZER \
{ \
.tree = { \
.root = NULL, \
- .depth = 0, \
} \
}
@@ -49,6 +75,12 @@ struct { \
#define DEFINE_GENRADIX(_name, _type) \
GENRADIX(_type) _name = __GENRADIX_INITIALIZER
+/**
+ * genradix_init - initialize a genradix
+ * @_radix: genradix to initialize
+ *
+ * Does not fail
+ */
#define genradix_init(_radix) \
do { \
*(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER; \
@@ -56,11 +88,20 @@ do { \
void __genradix_free(struct __genradix *);
+/**
+ * genradix_free: free all memory owned by a genradix
+ * @_radix: the genradix to free
+ *
+ * After freeing, @_radix will be reinitialized and empty
+ */
#define genradix_free(_radix) __genradix_free(&(_radix)->tree)
static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
{
- BUILD_BUG_ON(obj_size > PAGE_SIZE);
+ if (__builtin_constant_p(obj_size))
+ BUILD_BUG_ON(obj_size > PAGE_SIZE);
+ else
+ BUG_ON(obj_size > PAGE_SIZE);
if (!is_power_of_2(obj_size)) {
size_t objs_per_page = PAGE_SIZE / obj_size;
@@ -79,7 +120,13 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
void *__genradix_ptr(struct __genradix *, size_t);
-/* Returns a pointer to element at @_idx */
+/**
+ * genradix_ptr - get a pointer to a genradix entry
+ * @_radix: genradix to access
+ * @_idx: index to fetch
+ *
+ * Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
+ */
#define genradix_ptr(_radix, _idx) \
(__genradix_cast(_radix) \
__genradix_ptr(&(_radix)->tree, \
@@ -87,7 +134,15 @@ void *__genradix_ptr(struct __genradix *, size_t);
void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
-/* Returns a pointer to element at @_idx, allocating it if necessary */
+/**
+ * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
+ * if necessary
+ * @_radix: genradix to access
+ * @_idx: index to fetch
+ * @_gfp: gfp mask
+ *
+ * Returns a pointer to entry at @_idx, or NULL on allocation failure
+ */
#define genradix_ptr_alloc(_radix, _idx, _gfp) \
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
@@ -99,6 +154,11 @@ struct genradix_iter {
size_t pos;
};
+/**
+ * genradix_iter_init - initialize a genradix_iter
+ * @_radix: genradix that will be iterated over
+ * @_idx: index to start iterating from
+ */
#define genradix_iter_init(_radix, _idx) \
((struct genradix_iter) { \
.pos = (_idx), \
@@ -107,6 +167,14 @@ struct genradix_iter {
void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
+/**
+ * genradix_iter_peek - get first entry at or above iterator's current
+ * position
+ * @_iter: a genradix_iter
+ * @_radix: genradix being iterated over
+ *
+ * If no more entries exist at or above @_iter's current position, returns NULL
+ */
#define genradix_iter_peek(_iter, _radix) \
(__genradix_cast(_radix) \
__genradix_iter_peek(_iter, &(_radix)->tree, \
@@ -127,4 +195,37 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
#define genradix_iter_advance(_iter, _radix) \
__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
+#define genradix_for_each_from(_radix, _iter, _p, _start) \
+ for (_iter = genradix_iter_init(_radix, _start); \
+ (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
+ genradix_iter_advance(&_iter, _radix))
+
+/**
+ * genradix_for_each - iterate over entry in a genradix
+ * @_radix: genradix to iterate over
+ * @_iter: a genradix_iter to track current position
+ * @_p: pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each(_radix, _iter, _p) \
+ genradix_for_each_from(_radix, _iter, _p, 0)
+
+int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
+
+/**
+ * genradix_prealloc - preallocate entries in a generic radix tree
+ * @_radix: genradix to preallocate
+ * @_nr: number of entries to preallocate
+ * @_gfp: gfp mask
+ *
+ * Returns 0 on success, -ENOMEM on failure
+ */
+#define genradix_prealloc(_radix, _nr, _gfp) \
+ __genradix_prealloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _nr + 1),\
+ _gfp)
+
+
#endif /* _LINUX_GENERIC_RADIX_TREE_H */
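(For reference, a minimal usage sketch of the genradix API documented in the new DOC comment above; this is illustrative only and not part of the patch. struct foo, its field a, the index values, and foo_example() are made-up names; GENRADIX(), genradix_init(), genradix_ptr_alloc(), genradix_ptr(), genradix_for_each() and genradix_free() are the operations the header documents.)

#include <linux/generic-radix-tree.h>

struct foo {
	u64	a;
};

static GENRADIX(struct foo) foo_genradix;

static int foo_example(void)
{
	struct genradix_iter iter;
	struct foo *f;

	genradix_init(&foo_genradix);

	/* Allocate the entry at index 10 (and the page backing it): */
	f = genradix_ptr_alloc(&foo_genradix, 10, GFP_KERNEL);
	if (!f)
		return -ENOMEM;
	f->a = 42;

	/* Look up an entry: NULL if its page was never allocated,
	 * otherwise never-written entries read back as all zeroes: */
	f = genradix_ptr(&foo_genradix, 3);

	/* Iterate over all allocated entries, in index order: */
	genradix_for_each(&foo_genradix, iter, f)
		pr_info("entry %zu: a=%llu\n", iter.pos,
			(unsigned long long) f->a);

	/* Free all memory owned by the genradix and reset it to empty: */
	genradix_free(&foo_genradix);
	return 0;
}
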
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 6de6e263..2552d457 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -249,6 +249,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
+ for_each_member_device(ca, c, i)
+ bch2_dev_usage_from_buckets(c, ca);
+
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
@@ -280,35 +283,51 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
#endif
struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
struct bucket *g;
- struct bucket_mark m;
+ struct bucket_mark m, new;
int ret;
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
a->k.p = POS(ca->dev_idx, b);
+ bch2_btree_iter_set_pos(iter, a->k.p);
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
percpu_down_read_preempt_disable(&c->mark_lock);
g = bucket(ca, b);
- m = bucket_cmpxchg(g, m, m.dirty = false);
+ m = READ_ONCE(g->mark);
+
+ if (!m.dirty) {
+ percpu_up_read_preempt_enable(&c->mark_lock);
+ return 0;
+ }
__alloc_write_key(a, g, m);
percpu_up_read_preempt_enable(&c->mark_lock);
bch2_btree_iter_cond_resched(iter);
- bch2_btree_iter_set_pos(iter, a->k.p);
-
ret = bch2_btree_insert_at(c, NULL, journal_seq,
+ BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
+ if (ret)
+ return ret;
+
+ new = m;
+ new.dirty = false;
+ atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
- if (!ret && ca->buckets_written)
+ if (ca->buckets_written)
set_bit(b, ca->buckets_written);
- return ret;
+ return 0;
}
int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
@@ -898,10 +917,19 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
+
closure_wake_up(&c->freelist_wait);
+ ca->allocator_blocked_full = false;
+
spin_unlock(&c->freelist_lock);
goto out;
}
+
+ if (!ca->allocator_blocked_full) {
+ ca->allocator_blocked_full = true;
+ closure_wake_up(&c->freelist_wait);
+ }
+
spin_unlock(&c->freelist_lock);
if ((current->flags & PF_KTHREAD) &&
@@ -1226,6 +1254,11 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
+{
+ closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
+}
+
/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
@@ -1333,6 +1366,24 @@ static void allocator_start_issue_discards(struct bch_fs *c)
ca->mi.bucket_size, GFP_NOIO, 0);
}
+static int resize_free_inc(struct bch_dev *ca)
+{
+ alloc_fifo free_inc;
+
+ if (!fifo_full(&ca->free_inc))
+ return 0;
+
+ if (!init_fifo(&free_inc,
+ ca->free_inc.size * 2,
+ GFP_KERNEL))
+ return -ENOMEM;
+
+ fifo_move(&free_inc, &ca->free_inc);
+ swap(free_inc, ca->free_inc);
+ free_fifo(&free_inc);
+ return 0;
+}
+
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -1408,6 +1459,12 @@ not_enough:
while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
(bu = next_alloc_bucket(ca)) >= 0) {
+ ret = resize_free_inc(ca);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+
bch2_invalidate_one_bucket(c, ca, bu,
&journal_seq);
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index a0c08e34..26561b3b 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -51,6 +51,7 @@ void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 14e6453b..f2f9015d 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -106,6 +106,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
false, gc_pos_alloc(c, ob), 0);
ob->valid = false;
+ ob->type = 0;
spin_unlock(&ob->lock);
percpu_up_read_preempt_enable(&c->mark_lock);
@@ -141,6 +142,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
ob = c->open_buckets + c->open_buckets_freelist;
c->open_buckets_freelist = ob->freelist;
atomic_set(&ob->pin, 1);
+ ob->type = 0;
c->open_buckets_nr_free--;
return ob;
@@ -209,9 +211,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
case RESERVE_ALLOC:
return 0;
case RESERVE_BTREE:
- return BTREE_NODE_RESERVE / 2;
+ return BTREE_NODE_OPEN_BUCKET_RESERVE;
default:
- return BTREE_NODE_RESERVE;
+ return BTREE_NODE_OPEN_BUCKET_RESERVE * 2;
}
}
@@ -837,15 +839,17 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
{
struct write_point *wp;
struct open_bucket *ob;
- unsigned nr_effective = 0;
- struct open_buckets ptrs = { .nr = 0 };
- bool have_cache = false;
- unsigned write_points_nr;
- int ret = 0, i;
+ struct open_buckets ptrs;
+ unsigned nr_effective, write_points_nr;
+ bool have_cache;
+ int ret, i;
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
+ ptrs.nr = 0;
+ nr_effective = 0;
write_points_nr = c->write_points_nr;
+ have_cache = false;
wp = writepoint_find(c, write_point.v);
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index b0e44f75..5224a52f 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -85,6 +85,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ ob->type = wp->type;
atomic_inc(&ob->pin);
ob_push(c, ptrs, ob);
}
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 6f17f094..66457fc7 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -55,9 +55,10 @@ struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
- bool valid;
- bool on_partial_list;
u8 ec_idx;
+ u8 type;
+ unsigned valid:1;
+ unsigned on_partial_list:1;
unsigned sectors_free;
struct bch_extent_ptr ptr;
struct ec_stripe_new *ec;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 449eb0c1..f42b2f90 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -330,6 +330,8 @@ enum bch_time_stats {
/* Size of the freelist we allocate btree nodes from: */
#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
struct btree;
enum gc_phase {
@@ -426,7 +428,13 @@ struct bch_dev {
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
+
+ /*
+ * XXX: this should be an enum for allocator state, so as to include
+ * error state
+ */
bool allocator_blocked;
+ bool allocator_blocked_full;
alloc_heap alloc_heap;
@@ -597,6 +605,7 @@ struct bch_fs {
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
+ struct workqueue_struct *journal_reclaim_wq;
/* ALLOCATION */
struct delayed_work pd_controllers_update;
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 25725e42..40ce33a4 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -1010,11 +1010,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
nr_key_bits -= 64;
}
- if (l_v != r_v)
- return l_v < r_v ? -1 : 1;
-
- if (!nr_key_bits)
- return 0;
+ if (!nr_key_bits || l_v != r_v)
+ break;
l = next_word(l);
r = next_word(r);
@@ -1022,6 +1019,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
l_v = *l;
r_v = *r;
}
+
+ return (l_v > r_v) - (l_v < r_v);
}
#endif
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 23013fbb..433e8f22 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -483,31 +483,6 @@ static void bch2_gc_free(struct bch_fs *c)
percpu_up_write(&c->mark_lock);
}
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when access
- * against any percpu counter is guarded against
- */
-static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
- u64 *ret;
- int cpu;
-
- preempt_disable();
- ret = this_cpu_ptr(p);
- preempt_enable();
-
- for_each_possible_cpu(cpu) {
- u64 *i = per_cpu_ptr(p, cpu);
-
- if (i != ret) {
- acc_u64s(ret, i, nr);
- memset(i, 0, nr * sizeof(u64));
- }
- }
-
- return ret;
-}
-
static void bch2_gc_done_nocheck(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -543,9 +518,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
for_each_member_device(ca, c, i) {
unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
struct bch_dev_usage *dst = (void *)
- acc_percpu_u64s((void *) ca->usage[0], nr);
+ bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
struct bch_dev_usage *src = (void *)
- acc_percpu_u64s((void *) ca->usage[1], nr);
+ bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
*dst = *src;
}
@@ -554,9 +529,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
c->replicas.nr;
struct bch_fs_usage *dst = (void *)
- acc_percpu_u64s((void *) c->usage[0], nr);
+ bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
- acc_percpu_u64s((void *) c->usage[1], nr);
+ bch2_acc_percpu_u64s((void *) c->usage[1], nr);
memcpy(&dst->s.gc_start[0],
&src->s.gc_start[0],
@@ -582,6 +557,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
dst_iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
+ dst->dirty = true; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
@@ -612,16 +588,18 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
(src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
+ BUG_ON(src_iter.pos != dst_iter.pos);
+
copy_stripe_field(alive, "alive");
copy_stripe_field(sectors, "sectors");
copy_stripe_field(algorithm, "algorithm");
copy_stripe_field(nr_blocks, "nr_blocks");
copy_stripe_field(nr_redundant, "nr_redundant");
- copy_stripe_field(blocks_nonempty.counter,
+ copy_stripe_field(blocks_nonempty,
"blocks_nonempty");
for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
- copy_stripe_field(block_sectors[i].counter,
+ copy_stripe_field(block_sectors[i],
"block_sectors[%u]", i);
if (dst->alive)
@@ -656,9 +634,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
for_each_member_device(ca, c, i) {
unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
struct bch_dev_usage *dst = (void *)
- acc_percpu_u64s((void *) ca->usage[0], nr);
+ bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
struct bch_dev_usage *src = (void *)
- acc_percpu_u64s((void *) ca->usage[1], nr);
+ bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
@@ -678,9 +656,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
c->replicas.nr;
struct bch_fs_usage *dst = (void *)
- acc_percpu_u64s((void *) c->usage[0], nr);
+ bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
- acc_percpu_u64s((void *) c->usage[1], nr);
+ bch2_acc_percpu_u64s((void *) c->usage[1], nr);
copy_fs_field(s.hidden, "hidden");
copy_fs_field(s.data, "data");
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 8af5f841..1905acfa 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -109,7 +109,7 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
- ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
+ ret = gc_pos_cmp(pos, c->gc_pos) < 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index dd9d2559..4bd07258 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -77,6 +77,7 @@ enum {
__BTREE_INSERT_ATOMIC,
__BTREE_INSERT_NOUNLOCK,
__BTREE_INSERT_NOFAIL,
+ __BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
@@ -100,6 +101,8 @@ enum {
/* Don't check for -ENOSPC: */
#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
+#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW)
+
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 7eca9203..0df894fc 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -628,7 +628,8 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
trans_for_each_entry(trans, i)
btree_insert_entry_checks(c, i);
- if (unlikely(!percpu_ref_tryget(&c->writes)))
+ if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
+ !percpu_ref_tryget(&c->writes)))
return -EROFS;
retry:
trans_for_each_iter(trans, i) {
@@ -658,7 +659,8 @@ retry:
trans_for_each_iter(trans, i)
bch2_btree_iter_downgrade(i->iter);
out:
- percpu_ref_put(&c->writes);
+ if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
+ percpu_ref_put(&c->writes);
/* make sure we didn't drop or screw up locks: */
trans_for_each_iter(trans, i) {
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index d33d0bf0..ea71acb5 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -151,7 +151,6 @@ retry:
acc_u64s_percpu((u64 *) ret,
(u64 __percpu *) c->usage[0],
sizeof(*ret) / sizeof(u64) + nr);
- percpu_up_read_preempt_enable(&c->mark_lock);
return ret;
}
@@ -223,13 +222,14 @@ static bool bucket_became_unavailable(struct bucket_mark old,
!is_available_bucket(new);
}
-void bch2_fs_usage_apply(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct disk_reservation *disk_res,
- struct gc_pos gc_pos)
+int bch2_fs_usage_apply(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct disk_reservation *disk_res,
+ struct gc_pos gc_pos)
{
s64 added = fs_usage->s.data + fs_usage->s.reserved;
s64 should_not_have_added;
+ int ret = 0;
percpu_rwsem_assert_held(&c->mark_lock);
@@ -242,6 +242,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
"disk usage increased without a reservation")) {
atomic64_sub(should_not_have_added, &c->sectors_available);
added -= should_not_have_added;
+ ret = -1;
}
if (added > 0) {
@@ -259,6 +260,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
(u64 *) fs_usage,
sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
}
+
+ return ret;
}
static inline void account_bucket(struct bch_fs_usage *fs_usage,
@@ -363,10 +366,7 @@ static inline void update_cached_sectors(struct bch_fs *c,
{
struct bch_replicas_padded r;
- r.e.data_type = BCH_DATA_CACHED;
- r.e.nr_devs = 1;
- r.e.nr_required = 1;
- r.e.devs[0] = dev;
+ bch2_replicas_entry_cached(&r.e, dev);
update_replicas(c, fs_usage, &r.e, sectors);
}
@@ -382,7 +382,8 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
*old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
BUG_ON(!is_available_bucket(new));
- new.owned_by_allocator = 1;
+ new.owned_by_allocator = true;
+ new.dirty = true;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
@@ -455,6 +456,7 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
type != BCH_DATA_JOURNAL);
bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
@@ -480,13 +482,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
true);
} else {
struct bucket *g;
- struct bucket_mark old, new;
+ struct bucket_mark new;
rcu_read_lock();
g = bucket(ca, b);
- old = bucket_cmpxchg(g, new, ({
- new.data_type = type;
+ bucket_cmpxchg(g, new, ({
+ new.dirty = true;
+ new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
@@ -537,6 +540,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
do {
new.v.counter = old.v.counter = v;
+ new.dirty = true;
+
/*
* Check this after reading bucket mark to guard against
* the allocator invalidating a bucket after we've already
@@ -591,9 +596,14 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
int blocks_nonempty_delta;
s64 parity_sectors;
+ BUG_ON(!sectors);
+
m = genradix_ptr(&c->stripes[gc], p.idx);
+ spin_lock(&c->ec_stripes_heap_lock);
+
if (!m || !m->alive) {
+ spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
(u64) p.idx);
return -1;
@@ -609,19 +619,21 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
parity_sectors = -parity_sectors;
sectors += parity_sectors;
- new = atomic_add_return(sectors, &m->block_sectors[p.block]);
- old = new - sectors;
+ old = m->block_sectors[p.block];
+ m->block_sectors[p.block] += sectors;
+ new = m->block_sectors[p.block];
blocks_nonempty_delta = (int) !!new - (int) !!old;
- if (!blocks_nonempty_delta)
- return 0;
+ if (blocks_nonempty_delta) {
+ m->blocks_nonempty += blocks_nonempty_delta;
- atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
+ if (!gc)
+ bch2_stripes_heap_update(c, m, p.idx);
+ }
- BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
+ m->dirty = true;
- if (!gc)
- bch2_stripes_heap_update(c, m, p.idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
update_replicas(c, fs_usage, &m->r.e, sectors);
@@ -629,8 +641,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
}
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors,
- enum bch_data_type data_type,
+ s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
unsigned journal_seq, unsigned flags,
bool gc)
@@ -701,14 +712,13 @@ static void bucket_set_stripe(struct bch_fs *c,
BUG_ON(ptr_stale(ca, ptr));
old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ new.dirty = true;
new.stripe = enabled;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
}));
-
- BUG_ON(old.stripe == enabled);
}
}
@@ -723,22 +733,19 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
unsigned i;
+ spin_lock(&c->ec_stripes_heap_lock);
+
if (!m || (!inserting && !m->alive)) {
+ spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
return -1;
}
- if (inserting && m->alive) {
- bch_err_ratelimited(c, "error marking stripe %zu: already exists",
- idx);
- return -1;
- }
-
- BUG_ON(atomic_read(&m->blocks_nonempty));
+ if (m->alive)
+ bch2_stripes_heap_del(c, m, idx);
- for (i = 0; i < EC_STRIPE_MAX; i++)
- BUG_ON(atomic_read(&m->block_sectors[i]));
+ memset(m, 0, sizeof(*m));
if (inserting) {
m->sectors = le16_to_cpu(s.v->sectors);
@@ -754,7 +761,6 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
for (i = 0; i < s.v->nr_blocks; i++)
m->r.e.devs[i] = s.v->ptrs[i].dev;
- }
/*
* XXX: account for stripes somehow here
@@ -763,15 +769,23 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
#endif
- if (!gc) {
- if (inserting)
+ /* gc recalculates these fields: */
+ if (!(flags & BCH_BUCKET_MARK_GC)) {
+ for (i = 0; i < s.v->nr_blocks; i++) {
+ m->block_sectors[i] =
+ stripe_blockcount_get(s.v, i);
+ m->blocks_nonempty += !!m->block_sectors[i];
+ }
+ }
+
+ if (!gc)
bch2_stripes_heap_insert(c, m, idx);
else
- bch2_stripes_heap_del(c, m, idx);
- } else {
- m->alive = inserting;
+ m->alive = true;
}
+ spin_unlock(&c->ec_stripes_heap_lock);
+
bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
return 0;
}
@@ -879,6 +893,8 @@ void bch2_mark_update(struct btree_insert *trans,
struct bch_fs_usage *fs_usage;
struct gc_pos pos = gc_pos_btree_node(b);
struct bkey_packed *_k;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ static int warned_disk_usage = 0;
if (!btree_node_type_needs_gc(iter->btree_id))
return;
@@ -939,7 +955,37 @@ void bch2_mark_update(struct btree_insert *trans,
bch2_btree_node_iter_advance(&node_iter, b);
}
- bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos);
+ if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+ !warned_disk_usage &&
+ !xchg(&warned_disk_usage, 1)) {
+ char buf[200];
+
+ pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+
+ pr_err("while inserting");
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
+ pr_err("%s", buf);
+ pr_err("overlapping with");
+
+ node_iter = iter->l[0].iter;
+ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+ KEY_TYPE_discard))) {
+ struct bkey unpacked;
+ struct bkey_s_c k;
+
+ k = bkey_disassemble(b, _k, &unpacked);
+
+ if (btree_node_is_extents(b)
+ ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+ : bkey_cmp(insert->k->k.p, k.k->p))
+ break;
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ pr_err("%s", buf);
+
+ bch2_btree_node_iter_advance(&node_iter, b);
+ }
+ }
percpu_up_read_preempt_enable(&c->mark_lock);
}
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index ebd39e85..6f368172 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -181,6 +181,8 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
+void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *);
+
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@@ -264,8 +266,8 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
-void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *, struct gc_pos);
+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+ struct disk_reservation *, struct gc_pos);
/* disk reservations: */
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 56ceb260..b84ae5c9 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -402,6 +402,8 @@ static long bch2_ioctl_usage(struct bch_fs *c,
if (!src)
return -ENOMEM;
+ percpu_up_read_preempt_enable(&c->mark_lock);
+
dst.used = bch2_fs_sectors_used(c, *src);
dst.online_reserved = src->s.online_reserved;
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 755a2603..8018c2bc 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -11,6 +11,7 @@
#include "ec.h"
#include "error.h"
#include "io.h"
+#include "journal_io.h"
#include "keylist.h"
#include "super-io.h"
#include "util.h"
@@ -98,40 +99,6 @@ struct ec_bio {
/* Stripes btree keys: */
-static unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(le16_to_cpu(s->sectors),
- 1 << s->csum_granularity_bits);
-}
-
-static unsigned stripe_csum_offset(const struct bch_stripe *s,
- unsigned dev, unsigned csum_idx)
-{
- unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
- return sizeof(struct bch_stripe) +
- sizeof(struct bch_extent_ptr) * s->nr_blocks +
- (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static unsigned stripe_blockcount_offset(const struct bch_stripe *s,
- unsigned idx)
-{
- return stripe_csum_offset(s, s->nr_blocks, 0) +
- sizeof(16) * idx;
-}
-
-static unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
- sizeof(u64));
-}
-
-static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx)
-{
- return (void *) s + stripe_csum_offset(s, dev, csum_idx);
-}
-
const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
@@ -164,8 +131,9 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
1U << s->csum_granularity_bits);
for (i = 0; i < s->nr_blocks; i++)
- pr_buf(out, " %u:%llu", s->ptrs[i].dev,
- (u64) s->ptrs[i].offset);
+ pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
+ (u64) s->ptrs[i].offset,
+ stripe_blockcount_get(s, i));
}
static int ptr_matches_stripe(struct bch_fs *c,
@@ -609,29 +577,15 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
BUG_ON(h->data[m->heap_idx].idx != idx);
}
-static inline unsigned stripe_entry_blocks(struct stripe *m)
-{
- return atomic_read(&m->blocks_nonempty);
-}
-
void bch2_stripes_heap_update(struct bch_fs *c,
struct stripe *m, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
- bool queue_delete;
size_t i;
- spin_lock(&c->ec_stripes_heap_lock);
-
- if (!m->alive) {
- spin_unlock(&c->ec_stripes_heap_lock);
- return;
- }
-
heap_verify_backpointer(c, idx);
- h->data[m->heap_idx].blocks_nonempty =
- stripe_entry_blocks(m);
+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
i = m->heap_idx;
heap_sift_up(h, i, ec_stripes_heap_cmp,
@@ -641,44 +595,35 @@ void bch2_stripes_heap_update(struct bch_fs *c,
heap_verify_backpointer(c, idx);
- queue_delete = stripe_idx_to_delete(c) >= 0;
- spin_unlock(&c->ec_stripes_heap_lock);
-
- if (queue_delete)
+ if (stripe_idx_to_delete(c) >= 0)
schedule_work(&c->ec_stripe_delete_work);
}
void bch2_stripes_heap_del(struct bch_fs *c,
struct stripe *m, size_t idx)
{
- spin_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
m->alive = false;
heap_del(&c->ec_stripes_heap, m->heap_idx,
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
- spin_unlock(&c->ec_stripes_heap_lock);
}
void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
- spin_lock(&c->ec_stripes_heap_lock);
-
BUG_ON(heap_full(&c->ec_stripes_heap));
heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
.idx = idx,
- .blocks_nonempty = stripe_entry_blocks(m),
+ .blocks_nonempty = m->blocks_nonempty,
}),
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
m->alive = true;
heap_verify_backpointer(c, idx);
-
- spin_unlock(&c->ec_stripes_heap_lock);
}
/* stripe deletion */
@@ -1217,6 +1162,116 @@ unlock:
mutex_unlock(&c->ec_new_stripe_lock);
}
+static int __bch2_stripe_write_key(struct bch_fs *c,
+ struct btree_iter *iter,
+ struct stripe *m,
+ size_t idx,
+ struct bkey_i_stripe *new_key,
+ unsigned flags)
+{
+ struct bkey_s_c k;
+ unsigned i;
+ int ret;
+
+ bch2_btree_iter_set_pos(iter, POS(0, idx));
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = btree_iter_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_stripe)
+ return -EIO;
+
+ bkey_reassemble(&new_key->k_i, k);
+
+ spin_lock(&c->ec_stripes_heap_lock);
+
+ for (i = 0; i < new_key->v.nr_blocks; i++)
+ stripe_blockcount_set(&new_key->v, i,
+ m->block_sectors[i]);
+ m->dirty = false;
+
+ spin_unlock(&c->ec_stripes_heap_lock);
+
+ return bch2_btree_insert_at(c, NULL, NULL,
+ BTREE_INSERT_NOFAIL|flags,
+ BTREE_INSERT_ENTRY(iter, &new_key->k_i));
+}
+
+int bch2_stripes_write(struct bch_fs *c, bool *wrote)
+{
+ struct btree_iter iter;
+ struct genradix_iter giter;
+ struct bkey_i_stripe *new_key;
+ struct stripe *m;
+ int ret = 0;
+
+ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
+ BUG_ON(!new_key);
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ genradix_for_each(&c->stripes[0], giter, m) {
+ if (!m->dirty)
+ continue;
+
+ ret = __bch2_stripe_write_key(c, &iter, m, giter.pos,
+ new_key, BTREE_INSERT_NOCHECK_RW);
+ if (ret)
+ break;
+
+ *wrote = true;
+ }
+
+ bch2_btree_iter_unlock(&iter);
+
+ kfree(new_key);
+
+ return ret;
+}
+
+static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
+{
+
+ struct gc_pos pos = { 0 };
+
+ bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0);
+}
+
+int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
+{
+ struct journal_replay *r;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_fs_ec_start(c);
+ if (ret)
+ return ret;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, 0, k) {
+ bch2_stripe_read_key(c, k);
+ bch2_btree_iter_cond_resched(&iter);
+ }
+
+ ret = bch2_btree_iter_unlock(&iter);
+ if (ret)
+ return ret;
+
+ list_for_each_entry(r, journal_replay_list, list) {
+ struct bkey_i *k, *n;
+ struct jset_entry *entry;
+
+ for_each_jset_key(k, n, entry, &r->j)
+ if (entry->btree_id == BTREE_ID_EC)
+ bch2_stripe_read_key(c, bkey_i_to_s_c(k));
+ }
+
+ return 0;
+}
+
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
struct btree_iter iter;
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index c728c52c..28178330 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -13,6 +13,55 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
.val_to_text = bch2_stripe_to_text, \
}
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+ return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+ 1 << s->csum_granularity_bits);
+}
+
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+ unsigned dev, unsigned csum_idx)
+{
+ unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+
+ return sizeof(struct bch_stripe) +
+ sizeof(struct bch_extent_ptr) * s->nr_blocks +
+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+ unsigned idx)
+{
+ return stripe_csum_offset(s, s->nr_blocks, 0) +
+ sizeof(u16) * idx;
+}
+
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+ unsigned idx)
+{
+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
+}
+
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+ unsigned idx, unsigned v)
+{
+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
+
+ *p = cpu_to_le16(v);
+}
+
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
+ sizeof(u64));
+}
+
+static inline void *stripe_csum(struct bch_stripe *s,
+ unsigned dev, unsigned csum_idx)
+{
+ return (void *) s + stripe_csum_offset(s, dev, csum_idx);
+}
+
struct bch_read_bio;
struct ec_stripe_buf {
@@ -100,6 +149,9 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_ec_flush_new_stripes(struct bch_fs *);
+int bch2_stripes_read(struct bch_fs *, struct list_head *);
+int bch2_stripes_write(struct bch_fs *, bool *);
+
int bch2_ec_mem_alloc(struct bch_fs *, bool);
int bch2_fs_ec_start(struct bch_fs *);
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
index 44c5d382..b4d37705 100644
--- a/libbcachefs/ec_types.h
+++ b/libbcachefs/ec_types.h
@@ -19,9 +19,10 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
- u8 alive;
- atomic_t blocks_nonempty;
- atomic_t block_sectors[EC_STRIPE_MAX];
+ unsigned alive:1;
+ unsigned dirty:1;
+ u8 blocks_nonempty;
+ u16 block_sectors[EC_STRIPE_MAX];
struct bch_replicas_padded r;
};
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 29804168..0f075fa1 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -1664,12 +1664,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
return ret == BCH_MERGE_MERGE;
}
-int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+ unsigned nr_replicas)
{
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
- int ret = 0;
+ bool ret = true;
end.offset += size;
@@ -1678,8 +1679,8 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
- if (!bch2_extent_is_fully_allocated(k)) {
- ret = -ENOSPC;
+ if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
+ ret = false;
break;
}
}
@@ -1688,6 +1689,29 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
return ret;
}
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+ unsigned ret = 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ extent_for_each_ptr_decode(e, p, entry)
+ ret += !p.ptr.cached &&
+ p.crc.compression_type == BCH_COMPRESSION_NONE;
+ break;
+ }
+ case KEY_TYPE_reservation:
+ ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+ break;
+ }
+
+ return ret;
+}
+
/* KEY_TYPE_reservation: */
const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 0e6f4a0b..698b2581 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -571,6 +571,7 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
BUG_ON(!bch2_bkey_pack_key(dst, src, f));
}
-int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h
index 66fa227c..d19d809c 100644
--- a/libbcachefs/eytzinger.h
+++ b/libbcachefs/eytzinger.h
@@ -262,18 +262,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
}
}
-static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
-{
- size_t i = 0;
- int res;
-
- while (i < nr &&
- (res = cmp(search, base + i * size, size)))
- i = eytzinger0_child(i, res > 0);
-
- return i;
-}
+#define eytzinger0_find(base, nr, size, _cmp, search) \
+({ \
+ void *_base = (base); \
+ void *_search = (search); \
+ size_t _nr = (nr); \
+ size_t _size = (size); \
+ size_t _i = 0; \
+ int _res; \
+ \
+ while (_i < _nr && \
+ (_res = _cmp(_search, _base + _i * _size, _size))) \
+ _i = eytzinger0_child(_i, _res > 0); \
+ _i; \
+})
void eytzinger0_sort(void *, size_t, size_t,
int (*cmp_func)(const void *, const void *, size_t),
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index c1739f53..2cfc2d9e 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -253,7 +253,9 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
BUG_ON(btree_iter_err(old));
if (allocating &&
- !bch2_extent_is_fully_allocated(old))
+ !*allocating &&
+ bch2_bkey_nr_ptrs_allocated(old) <
+ bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
*allocating = true;
delta += (min(new->k.p.offset,
@@ -858,9 +860,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
{
struct bvec_iter iter;
struct bio_vec bv;
- unsigned nr_ptrs = !bch2_extent_is_compressed(k)
- ? bch2_bkey_nr_dirty_ptrs(k)
- : 0;
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
bio_for_each_segment(bv, bio, iter) {
/* brand new pages, don't need to be locked: */
@@ -1759,6 +1759,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = dio->iop.inode;
struct bio *bio = &dio->iop.op.wbio.bio;
struct bio_vec *bv;
+ loff_t offset;
bool sync;
long ret;
int i;
@@ -1770,12 +1771,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
__pagecache_block_get(&mapping->add_lock);
/* Write and invalidate pagecache range that we're writing to: */
- ret = write_invalidate_inode_pages_range(mapping, req->ki_pos,
- req->ki_pos + iov_iter_count(&dio->iter) - 1);
+ offset = req->ki_pos + (dio->iop.op.written << 9);
+ ret = write_invalidate_inode_pages_range(mapping,
+ offset,
+ offset + iov_iter_count(&dio->iter) - 1);
if (unlikely(ret))
goto err;
while (1) {
+ offset = req->ki_pos + (dio->iop.op.written << 9);
+
BUG_ON(current->pagecache_lock);
current->pagecache_lock = &mapping->add_lock;
if (kthread)
@@ -1792,13 +1797,12 @@ static long bch2_dio_write_loop(struct dio_write *dio)
/* gup might have faulted pages back in: */
ret = write_invalidate_inode_pages_range(mapping,
- req->ki_pos + (dio->iop.op.written << 9),
- req->ki_pos + iov_iter_count(&dio->iter) - 1);
+ offset,
+ offset + bio->bi_iter.bi_size - 1);
if (unlikely(ret))
goto err;
- dio->iop.op.pos = POS(inode->v.i_ino,
- (req->ki_pos >> 9) + dio->iop.op.written);
+ dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9);
task_io_account_write(bio->bi_iter.bi_size);
@@ -1878,7 +1882,6 @@ static int bch2_direct_IO_write(struct kiocb *req,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct dio_write *dio;
struct bio *bio;
- loff_t offset = req->ki_pos;
ssize_t ret;
lockdep_assert_held(&inode->v.i_rwsem);
@@ -1886,7 +1889,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
if (unlikely(!iter->count))
return 0;
- if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
return -EINVAL;
bio = bio_alloc_bioset(GFP_KERNEL,
@@ -1898,7 +1901,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
dio->mm = current->mm;
dio->loop = false;
dio->sync = is_sync_kiocb(req) ||
- offset + iter->count > inode->v.i_size;
+ req->ki_pos + iter->count > inode->v.i_size;
dio->free_iov = false;
dio->quota_res.sectors = 0;
dio->iter = *iter;
@@ -1915,19 +1918,20 @@ static int bch2_direct_IO_write(struct kiocb *req,
if (unlikely(ret))
goto err;
+ dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas;
+
ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
dio->iop.op.opts.data_replicas, 0);
if (unlikely(ret)) {
- if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
- offset >> 9),
- iter->count >> 9))
+ if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
+ req->ki_pos >> 9),
+ iter->count >> 9,
+ dio->iop.op.opts.data_replicas))
goto err;
dio->iop.unalloc = true;
}
- dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
-
return bch2_dio_write_loop(dio);
err:
bch2_disk_reservation_put(c, &dio->iop.op.res);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 52498627..5cc0651c 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -694,6 +694,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
}
list_for_each_entry(i, list, list) {
+ struct bch_replicas_padded replicas;
+ char buf[80];
+
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@@ -705,11 +710,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
- i->devs, false), c,
- "superblock not marked as containing replicas (type %u)",
- BCH_DATA_JOURNAL))) {
- ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
+ "superblock not marked as containing replicas %s",
+ (bch2_replicas_entry_to_text(&PBUF(buf),
+ &replicas.e), buf)))) {
+ ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
return ret;
}
@@ -1108,6 +1113,7 @@ static void journal_write_done(struct closure *cl)
struct journal_buf *w = journal_prev_buf(j);
struct bch_devs_list devs =
bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+ struct bch_replicas_padded replicas;
u64 seq = le64_to_cpu(w->data->seq);
u64 last_seq = le64_to_cpu(w->data->last_seq);
@@ -1118,7 +1124,9 @@ static void journal_write_done(struct closure *cl)
goto err;
}
- if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+
+ if (bch2_mark_replicas(c, &replicas.e))
goto err;
spin_lock(&j->lock);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 4a997366..a795e888 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -335,7 +335,7 @@ void bch2_journal_reclaim_work(struct work_struct *work)
mutex_unlock(&j->reclaim_lock);
if (!test_bit(BCH_FS_RO, &c->flags))
- queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
msecs_to_jiffies(j->reclaim_delay_ms));
}
@@ -387,7 +387,6 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
- struct bch_devs_list devs;
u64 iter, seq = 0;
int ret = 0;
@@ -412,12 +411,15 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
spin_lock(&j->lock);
while (!ret && seq < j->pin.back) {
+ struct bch_replicas_padded replicas;
+
seq = max(seq, journal_last_seq(j));
- devs = journal_seq_pin(j, seq)->devs;
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+ journal_seq_pin(j, seq)->devs);
seq++;
spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+ ret = bch2_mark_replicas(c, &replicas.e);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index b2198651..bb425d88 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -4,6 +4,7 @@
#include "bcachefs.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "extents.h"
#include "io.h"
@@ -152,6 +153,16 @@ retry:
bch2_btree_iter_unlock(&iter);
}
+ /* flush relevant btree updates */
+ while (1) {
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c) ||
+ c->btree_roots_dirty);
+ if (!bch2_btree_interior_updates_nr_pending(c))
+ break;
+ bch2_journal_meta(&c->journal);
+ }
+
ret = 0;
out:
ret = bch2_replicas_gc_end(c, ret);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 80909ae4..98cfcefd 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -3,6 +3,7 @@
#include "alloc_foreground.h"
#include "btree_gc.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "disk_groups.h"
#include "inode.h"
@@ -763,6 +764,16 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+
+ while (1) {
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c) ||
+ c->btree_roots_dirty);
+ if (!bch2_btree_interior_updates_nr_pending(c))
+ break;
+ bch2_journal_meta(&c->journal);
+ }
+
ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL,
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index eae38ea7..f5f3f94e 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -214,12 +214,12 @@ int bch2_fs_recovery(struct bch_fs *c)
if (ret)
goto err;
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
- err = "cannot allocate memory";
- ret = bch2_fs_ec_start(c);
+ ret = bch2_stripes_read(c, &journal);
if (ret)
goto err;
+ pr_info("stripes_read done");
+
+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 66ca13aa..230f807b 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -13,6 +13,16 @@ static inline int u8_cmp(u8 l, u8 r)
return (l > r) - (l < r);
}
+static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHES_DEBUG
+ unsigned i;
+
+ for (i = 0; i + 1 < e->nr_devs; i++)
+ BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
+
static void replicas_entry_sort(struct bch_replicas_entry *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
@@ -23,19 +33,13 @@ static void replicas_entry_sort(struct bch_replicas_entry *e)
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
-static inline struct bch_replicas_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
- return (void *) r->entries + r->entry_size * i;
-}
-
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
-static void replicas_entry_to_text(struct printbuf *out,
- struct bch_replicas_entry *e)
+void bch2_replicas_entry_to_text(struct printbuf *out,
+ struct bch_replicas_entry *e)
{
unsigned i;
@@ -60,7 +64,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
pr_buf(out, " ");
first = false;
- replicas_entry_to_text(out, e);
+ bch2_replicas_entry_to_text(out, e);
}
}
@@ -100,8 +104,8 @@ static void stripe_to_replicas(struct bkey_s_c k,
r->devs[r->nr_devs++] = ptr->dev;
}
-static void bkey_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *e)
+static void bkey_to_replicas(struct bch_replicas_entry *e,
+ struct bkey_s_c k)
{
e->nr_devs = 0;
@@ -119,11 +123,13 @@ static void bkey_to_replicas(struct bkey_s_c k,
stripe_to_replicas(k, e);
break;
}
+
+ replicas_entry_sort(e);
}
-static inline void devlist_to_replicas(struct bch_devs_list devs,
- enum bch_data_type data_type,
- struct bch_replicas_entry *e)
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
{
unsigned i;
@@ -137,6 +143,8 @@ static inline void devlist_to_replicas(struct bch_devs_list devs,
for (i = 0; i < devs.nr; i++)
e->devs[e->nr_devs++] = devs.devs[i];
+
+ replicas_entry_sort(e);
}
static struct bch_replicas_cpu
@@ -150,6 +158,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
replicas_entry_bytes(new_entry)),
};
+ BUG_ON(!new_entry->data_type);
+ verify_replicas_entry_sorted(new_entry);
+
new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
if (!new.entries)
return new;
@@ -175,13 +186,12 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
if (unlikely(entry_size > r->entry_size))
return -1;
- replicas_entry_sort(search);
-
- while (entry_size < r->entry_size)
- ((char *) search)[entry_size++] = 0;
+ verify_replicas_entry_sorted(search);
+#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
- memcmp, search);
+ entry_cmp, search);
+#undef entry_cmp
return idx < r->nr ? idx : -1;
}
@@ -189,6 +199,8 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
{
+ replicas_entry_sort(search);
+
return __replicas_entry_idx(&c->replicas, search);
}
@@ -198,12 +210,17 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
return __replicas_entry_idx(r, search) >= 0;
}
-static bool replicas_has_entry(struct bch_fs *c,
- struct bch_replicas_entry *search,
- bool check_gc_replicas)
+bool bch2_replicas_marked(struct bch_fs *c,
+ struct bch_replicas_entry *search,
+ bool check_gc_replicas)
{
bool marked;
+ if (!search->nr_devs)
+ return true;
+
+ verify_replicas_entry_sorted(search);
+
percpu_down_read_preempt_disable(&c->mark_lock);
marked = __replicas_has_entry(&c->replicas, search) &&
(!check_gc_replicas ||
@@ -214,35 +231,31 @@ static bool replicas_has_entry(struct bch_fs *c,
return marked;
}
-static void __replicas_table_update(struct bch_fs_usage __percpu *dst,
+static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
struct bch_replicas_cpu *dst_r,
- struct bch_fs_usage __percpu *src,
+ struct bch_fs_usage __percpu *src_p,
struct bch_replicas_cpu *src_r)
{
- int src_idx, dst_idx, cpu;
+ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+ struct bch_fs_usage *dst, *src = (void *)
+ bch2_acc_percpu_u64s((void *) src_p, src_nr);
+ int src_idx, dst_idx;
- for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
- u64 *dst_v, src_v = 0;
+ preempt_disable();
+ dst = this_cpu_ptr(dst_p);
+ preempt_enable();
- for_each_possible_cpu(cpu)
- src_v += *per_cpu_ptr(&src->data[src_idx], cpu);
+ *dst = *src;
- dst_idx = __replicas_entry_idx(dst_r,
- cpu_replicas_entry(src_r, src_idx));
-
- if (dst_idx < 0) {
- BUG_ON(src_v);
+ for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+ if (!src->data[src_idx])
continue;
- }
-
- preempt_disable();
- dst_v = this_cpu_ptr(&dst->data[dst_idx]);
- BUG_ON(*dst_v);
-
- *dst_v = src_v;
+ dst_idx = __replicas_entry_idx(dst_r,
+ cpu_replicas_entry(src_r, src_idx));
+ BUG_ON(dst_idx < 0);
- preempt_enable();
+ dst->data[dst_idx] = src->data[src_idx];
}
}
@@ -344,30 +357,32 @@ err:
return ret;
}
-static int __bch2_mark_replicas(struct bch_fs *c,
- struct bch_replicas_entry *devs)
+int bch2_mark_replicas(struct bch_fs *c,
+ struct bch_replicas_entry *r)
{
- return likely(replicas_has_entry(c, devs, true))
+ return likely(bch2_replicas_marked(c, r, true))
? 0
- : bch2_mark_replicas_slowpath(c, devs);
+ : bch2_mark_replicas_slowpath(c, r);
}
-int bch2_mark_replicas(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+ struct bkey_s_c k,
+ bool check_gc_replicas)
{
struct bch_replicas_padded search;
+ struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+ unsigned i;
- if (!devs.nr)
- return 0;
-
- memset(&search, 0, sizeof(search));
+ for (i = 0; i < cached.nr; i++) {
+ bch2_replicas_entry_cached(&search.e, cached.devs[i]);
- BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+ if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
+ return false;
+ }
- devlist_to_replicas(devs, data_type, &search.e);
+ bkey_to_replicas(&search.e, k);
- return __bch2_mark_replicas(c, &search.e);
+ return bch2_replicas_marked(c, &search.e, check_gc_replicas);
}
int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
@@ -377,18 +392,17 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
unsigned i;
int ret;
- memset(&search, 0, sizeof(search));
+ for (i = 0; i < cached.nr; i++) {
+ bch2_replicas_entry_cached(&search.e, cached.devs[i]);
- for (i = 0; i < cached.nr; i++)
- if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
- bch2_dev_list_single(cached.devs[i]))))
+ ret = bch2_mark_replicas(c, &search.e);
+ if (ret)
return ret;
+ }
- bkey_to_replicas(k, &search.e);
+ bkey_to_replicas(&search.e, k);
- return search.e.nr_devs
- ? __bch2_mark_replicas(c, &search.e)
- : 0;
+ return bch2_mark_replicas(c, &search.e);
}
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
@@ -749,7 +763,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
pr_buf(out, " ");
first = false;
- replicas_entry_to_text(out, e);
+ bch2_replicas_entry_to_text(out, e);
}
}
@@ -798,46 +812,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
/* Query replicas: */
-bool bch2_replicas_marked(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_list devs,
- bool check_gc_replicas)
-{
- struct bch_replicas_padded search;
-
- if (!devs.nr)
- return true;
-
- memset(&search, 0, sizeof(search));
-
- devlist_to_replicas(devs, data_type, &search.e);
-
- return replicas_has_entry(c, &search.e, check_gc_replicas);
-}
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
- struct bkey_s_c k,
- bool check_gc_replicas)
-{
- struct bch_replicas_padded search;
- struct bch_devs_list cached = bch2_bkey_cached_devs(k);
- unsigned i;
-
- memset(&search, 0, sizeof(search));
-
- for (i = 0; i < cached.nr; i++)
- if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
- bch2_dev_list_single(cached.devs[i]),
- check_gc_replicas))
- return false;
-
- bkey_to_replicas(k, &search.e);
-
- return search.e.nr_devs
- ? replicas_has_entry(c, &search.e, check_gc_replicas)
- : true;
-}
-
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
{
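
For reference, a minimal sketch (not part of the patch) of how a caller is expected to drive the reworked marking API: build a padded entry from a device list, which also sorts it, then mark it. The function name and the BCH_DATA_USER choice are illustrative only.

static int example_mark_user_devs(struct bch_fs *c, struct bch_devs_list devs)
{
	struct bch_replicas_padded r;

	/* fills and sorts r.e; data type + device list become one entry */
	bch2_devlist_to_replicas(&r.e, BCH_DATA_USER, devs);

	/* fast path: entry already in c->replicas; slow path persists it */
	return bch2_mark_replicas(c, &r.e);
}
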
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index fc833653..0ac2b8e0 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -4,17 +4,39 @@
#include "eytzinger.h"
#include "replicas_types.h"
+void bch2_replicas_entry_to_text(struct printbuf *,
+ struct bch_replicas_entry *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+ return (void *) r->entries + r->entry_size * i;
+}
+
int bch2_replicas_entry_idx(struct bch_fs *,
struct bch_replicas_entry *);
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
- struct bch_devs_list, bool);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+ enum bch_data_type,
+ struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *,
+ struct bch_replicas_entry *, bool);
+int bch2_mark_replicas(struct bch_fs *,
+ struct bch_replicas_entry *);
+
bool bch2_bkey_replicas_marked(struct bch_fs *,
struct bkey_s_c, bool);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
- struct bch_devs_list);
int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+ unsigned dev)
+{
+ e->data_type = BCH_DATA_CACHED;
+ e->nr_devs = 1;
+ e->nr_required = 1;
+ e->devs[0] = dev;
+}
struct replicas_status {
struct {
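
The new bch2_replicas_entry_cached() helper builds the one-device BCH_DATA_CACHED entry that the marking paths above loop over. A minimal sketch of checking a single cached pointer (the function name is illustrative):

static bool example_cached_ptr_marked(struct bch_fs *c, unsigned dev)
{
	struct bch_replicas_padded r;

	/* data_type = BCH_DATA_CACHED, nr_devs = nr_required = 1 */
	bch2_replicas_entry_cached(&r.e, dev);

	return bch2_replicas_marked(c, &r.e, true);
}
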
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index a539f2a8..1835b535 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -205,7 +205,9 @@ int bch2_congested(void *data, int bdi_bits)
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
+ bool wrote;
unsigned i;
+ int ret;
bch2_rebalance_stop(c);
@@ -220,23 +222,42 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
bch2_journal_flush_all_pins(&c->journal);
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_stop(ca);
+ do {
+ ret = bch2_alloc_write(c, false, &wrote);
+ if (ret) {
+ bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
+ break;
+ }
- bch2_journal_flush_all_pins(&c->journal);
+ ret = bch2_stripes_write(c, &wrote);
+ if (ret) {
+ bch2_fs_inconsistent(c, "error writing out stripes");
+ break;
+ }
- /*
- * We need to explicitly wait on btree interior updates to complete
- * before stopping the journal, flushing all journal pins isn't
- * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
- * interior updates have to drop their journal pin before they're
- * fully complete:
- */
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ for_each_member_device(ca, c, i)
+ bch2_dev_allocator_quiesce(c, ca);
+
+ bch2_journal_flush_all_pins(&c->journal);
+
+ /*
+ * We need to explicitly wait on btree interior updates to complete
+ * before stopping the journal, flushing all journal pins isn't
+ * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
+ * interior updates have to drop their journal pin before they're
+ * fully complete:
+ */
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
+ } while (wrote);
+
+ for_each_member_device(ca, c, i)
+ bch2_dev_allocator_stop(ca);
bch2_fs_journal_stop(&c->journal);
+ /* XXX: mark super that alloc info is persistent */
+
/*
* the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal:
@@ -420,6 +441,8 @@ static void bch2_fs_free(struct bch_fs *c)
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
+ if (c->journal_reclaim_wq)
+ destroy_workqueue(c->journal_reclaim_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->wq)
@@ -638,6 +661,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcache_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
+ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
sizeof(struct btree_reserve)) ||
@@ -1297,8 +1322,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (data) {
char data_has_str[100];
- bch2_string_opt_to_text(&PBUF(data_has_str),
- bch2_data_types, data);
+ bch2_flags_to_text(&PBUF(data_has_str),
+ bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
ret = -EBUSY;
goto err;
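
The switch from bch2_string_opt_to_text() to bch2_flags_to_text() reflects that `data` here is a bitmask of BCH_DATA_* types rather than a single value. A sketch of the intended use (illustrative function name; exact strings depend on bch2_data_types):

static void example_print_data_types(void)
{
	char buf[100];
	u64 data = (1U << BCH_DATA_BTREE) | (1U << BCH_DATA_USER);

	bch2_flags_to_text(&PBUF(buf), bch2_data_types, data);
	/* buf now holds a comma separated list of the set types, e.g. "btree,user" */
}
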
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 2e6e9bd5..40384e7e 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -234,17 +234,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
{
struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
- unsigned replicas;
+ unsigned i;
if (!fs_usage)
return -ENOMEM;
pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
- for (replicas = 0;
- replicas < ARRAY_SIZE(fs_usage->persistent_reserved);
- replicas++) {
- pr_buf(&out, "%u replicas:\n", replicas + 1);
+ for (i = 0;
+ i < ARRAY_SIZE(fs_usage->persistent_reserved);
+ i++) {
+ pr_buf(&out, "%u replicas:\n", i + 1);
#if 0
for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
pr_buf(&out, "\t%s:\t\t%llu\n",
@@ -254,12 +254,23 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
stats.replicas[replicas].ec_data);
#endif
pr_buf(&out, "\treserved:\t%llu\n",
- fs_usage->persistent_reserved[replicas]);
+ fs_usage->persistent_reserved[i]);
}
pr_buf(&out, "online reserved:\t%llu\n",
fs_usage->s.online_reserved);
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ pr_buf(&out, "\t");
+ bch2_replicas_entry_to_text(&out, e);
+ pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+ }
+
+ percpu_up_read_preempt_enable(&c->mark_lock);
+
kfree(fs_usage);
return out.pos - buf;
@@ -797,6 +808,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+ unsigned i, nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].type]++;
return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
@@ -823,7 +840,10 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
" copygc threshold: %llu\n"
"freelist_wait: %s\n"
"open buckets: %u/%u (reserved %u)\n"
- "open_buckets_wait: %s\n",
+ "open_buckets_wait: %s\n"
+ "open_buckets_btree: %u\n"
+ "open_buckets_user: %u\n"
+ "btree reserve cache: %u\n",
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
@@ -845,8 +865,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
stats.sectors_fragmented,
ca->copygc_threshold,
c->freelist_wait.list.first ? "waiting" : "empty",
- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
- c->open_buckets_wait.list.first ? "waiting" : "empty");
+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
+ BTREE_NODE_OPEN_BUCKET_RESERVE,
+ c->open_buckets_wait.list.first ? "waiting" : "empty",
+ nr[BCH_DATA_BTREE],
+ nr[BCH_DATA_USER],
+ c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 5c060e77..fea80e24 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -133,6 +133,7 @@ void bch2_flags_to_text(struct printbuf *out,
const char * const list[], u64 flags)
{
unsigned bit, nr = 0;
+ bool first = true;
if (out->pos != out->end)
*out->pos = '\0';
@@ -141,7 +142,10 @@ void bch2_flags_to_text(struct printbuf *out,
nr++;
while (flags && (bit = __ffs(flags)) < nr) {
- pr_buf(out, "%s,", list[bit]);
+ pr_buf(out, "%s", list[bit]);
+ if (!first)
+ pr_buf(out, ",");
+ first = false;
flags ^= 1 << bit;
}
}
@@ -894,3 +898,28 @@ void eytzinger0_find_test(void)
kfree(test_array);
}
#endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when access
+ * against any percpu counter is guarded against
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+ u64 *ret;
+ int cpu;
+
+ preempt_disable();
+ ret = this_cpu_ptr(p);
+ preempt_enable();
+
+ for_each_possible_cpu(cpu) {
+ u64 *i = per_cpu_ptr(p, cpu);
+
+ if (i != ret) {
+ acc_u64s(ret, i, nr);
+ memset(i, 0, nr * sizeof(u64));
+ }
+ }
+
+ return ret;
+}
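
A minimal usage sketch for the new helper (the percpu pointer `p` and count `nr` are assumed to come from an existing allocation; as the comment above notes, this is only safe while concurrent access to the counters is excluded):

static u64 example_first_counter_total(u64 __percpu *p, unsigned nr)
{
	/* all cpus' contributions are summed onto one copy, others zeroed */
	u64 *flat = bch2_acc_percpu_u64s(p, nr);

	return flat[0];
}
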
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 25d67509..fbfb2085 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -715,4 +715,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
}
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c
index 5c4a275e..4f43d0bb 100644
--- a/linux/generic-radix-tree.c
+++ b/linux/generic-radix-tree.c
@@ -1,4 +1,5 @@
+#include <linux/atomic.h>
#include <linux/export.h>
#include <linux/generic-radix-tree.h>
#include <linux/gfp.h>
@@ -16,7 +17,7 @@ struct genradix_node {
};
};
-static inline unsigned genradix_depth_shift(unsigned depth)
+static inline int genradix_depth_shift(unsigned depth)
{
return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}
@@ -29,16 +30,34 @@ static inline size_t genradix_depth_size(unsigned depth)
return 1UL << genradix_depth_shift(depth);
}
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH \
+ DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK \
+ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+ return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+ return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
/*
* Returns pointer to the specified byte @offset within @radix, or NULL if not
* allocated
*/
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
- size_t level = radix->depth;
- struct genradix_node *n = radix->root;
+ struct genradix_root *r = READ_ONCE(radix->root);
+ struct genradix_node *n = genradix_root_to_node(r);
+ unsigned level = genradix_root_to_depth(r);
- if (offset >= genradix_depth_size(radix->depth))
+ if (ilog2(offset) >= genradix_depth_shift(level))
return NULL;
while (1) {
@@ -64,43 +83,60 @@ EXPORT_SYMBOL(__genradix_ptr);
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
gfp_t gfp_mask)
{
- struct genradix_node **n;
- size_t level;
+ struct genradix_root *v = READ_ONCE(radix->root);
+ struct genradix_node *n, *new_node = NULL;
+ unsigned level;
/* Increase tree depth if necessary: */
+ while (1) {
+ struct genradix_root *r = v, *new_root;
- while (offset >= genradix_depth_size(radix->depth)) {
- struct genradix_node *new_root =
- (void *) __get_free_page(gfp_mask|__GFP_ZERO);
-
- if (!new_root)
- return NULL;
-
- new_root->children[0] = radix->root;
- radix->root = new_root;
- radix->depth++;
- }
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
- n = &radix->root;
- level = radix->depth;
+ if (n && ilog2(offset) < genradix_depth_shift(level))
+ break;
- while (1) {
- if (!*n) {
- *n = (void *) __get_free_page(gfp_mask|__GFP_ZERO);
- if (!*n)
+ if (!new_node) {
+ new_node = (void *)
+ __get_free_page(gfp_mask|__GFP_ZERO);
+ if (!new_node)
return NULL;
}
- if (!level)
- break;
+ new_node->children[0] = n;
+ new_root = ((struct genradix_root *)
+ ((unsigned long) new_node | (n ? level + 1 : 0)));
- level--;
+ if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
+ v = new_root;
+ new_node = NULL;
+ }
+ }
- n = &(*n)->children[offset >> genradix_depth_shift(level)];
+ while (level--) {
+ struct genradix_node **p =
+ &n->children[offset >> genradix_depth_shift(level)];
offset &= genradix_depth_size(level) - 1;
+
+ n = READ_ONCE(*p);
+ if (!n) {
+ if (!new_node) {
+ new_node = (void *)
+ __get_free_page(gfp_mask|__GFP_ZERO);
+ if (!new_node)
+ return NULL;
+ }
+
+ if (!(n = cmpxchg_release(p, NULL, new_node)))
+ swap(n, new_node);
+ }
}
- return &(*n)->data[offset];
+ if (new_node)
+ free_page((unsigned long) new_node);
+
+ return &n->data[offset];
}
EXPORT_SYMBOL(__genradix_ptr_alloc);
@@ -108,17 +144,19 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
struct __genradix *radix,
size_t objs_per_page)
{
+ struct genradix_root *r;
struct genradix_node *n;
- size_t level, i;
-
- if (!radix->root)
- return NULL;
+ unsigned level, i;
restart:
- if (iter->offset >= genradix_depth_size(radix->depth))
+ r = READ_ONCE(radix->root);
+ if (!r)
return NULL;
- n = radix->root;
- level = radix->depth;
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
+
+ if (ilog2(iter->offset) >= genradix_depth_shift(level))
+ return NULL;
while (level) {
level--;
@@ -157,11 +195,24 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level)
free_page((unsigned long) n);
}
+int __genradix_prealloc(struct __genradix *radix, size_t size,
+ gfp_t gfp_mask)
+{
+ size_t offset;
+
+ for (offset = 0; offset < size; offset += PAGE_SIZE)
+ if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(__genradix_prealloc);
+
void __genradix_free(struct __genradix *radix)
{
- genradix_free_recurse(radix->root, radix->depth);
+ struct genradix_root *r = xchg(&radix->root, NULL);
- radix->root = NULL;
- radix->depth = 0;
+ genradix_free_recurse(genradix_root_to_node(r),
+ genradix_root_to_depth(r));
}
EXPORT_SYMBOL(__genradix_free);
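
The lockless rework hinges on packing the tree depth into the low bits of the page-aligned root node pointer, so readers can READ_ONCE() a single word and growers can cmpxchg_release() it. A sketch of the encoding, mirroring what __genradix_ptr_alloc() builds for new_root (the function name is illustrative):

static struct genradix_root *example_pack_root(struct genradix_node *node,
					       unsigned depth)
{
	/* node comes from __get_free_page(), so its low
	 * GENRADIX_DEPTH_MASK bits are zero and can carry the depth */
	return (struct genradix_root *)
		((unsigned long) node | (depth & GENRADIX_DEPTH_MASK));
}

/* genradix_root_to_node() and genradix_root_to_depth() undo the packing. */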