author     Kent Overstreet <kent.overstreet@linux.dev>  2024-08-23 15:38:12 -0400
committer  Kent Overstreet <kent.overstreet@linux.dev>  2024-08-23 19:34:30 -0400
commit     b422b19f636329e1a082bcacd3ac6b485d7a5b3e (patch)
tree       ee9979640d17b982ed92581557634a6df57b2fc8
parent     6f938e0399ce04699682459408b090387c73adb3 (diff)
Update bcachefs sources to 22fa8fc32e6a bcachefs: rcu_pending now works in userspace
-rw-r--r--  .bcachefs_revision | 2
-rw-r--r--  Cargo.lock | 2
-rw-r--r--  Cargo.toml | 2
-rw-r--r--  Makefile | 2
-rw-r--r--  include/linux/generic-radix-tree.h | 105
-rw-r--r--  include/linux/mm.h | 1
-rw-r--r--  include/linux/slab.h | 3
-rw-r--r--  include/linux/srcu.h | 31
-rw-r--r--  libbcachefs/alloc_background.c | 65
-rw-r--r--  libbcachefs/alloc_background_format.h | 1
-rw-r--r--  libbcachefs/bcachefs_format.h | 3
-rw-r--r--  libbcachefs/bset.c | 128
-rw-r--r--  libbcachefs/btree_cache.c | 24
-rw-r--r--  libbcachefs/btree_cache.h | 2
-rw-r--r--  libbcachefs/btree_iter.h | 9
-rw-r--r--  libbcachefs/btree_key_cache.c | 426
-rw-r--r--  libbcachefs/btree_key_cache_types.h | 18
-rw-r--r--  libbcachefs/btree_locking.h | 7
-rw-r--r--  libbcachefs/btree_types.h | 3
-rw-r--r--  libbcachefs/btree_update_interior.c | 70
-rw-r--r--  libbcachefs/btree_update_interior.h | 2
-rw-r--r--  libbcachefs/buckets.c | 74
-rw-r--r--  libbcachefs/buckets_types.h | 2
-rw-r--r--  libbcachefs/darray.c | 4
-rw-r--r--  libbcachefs/darray.h | 26
-rw-r--r--  libbcachefs/data_update.c | 2
-rw-r--r--  libbcachefs/errcode.h | 1
-rw-r--r--  libbcachefs/extents.c | 39
-rw-r--r--  libbcachefs/extents.h | 1
-rw-r--r--  libbcachefs/fs-io-pagecache.h | 4
-rw-r--r--  libbcachefs/fs-ioctl.c | 3
-rw-r--r--  libbcachefs/fs.c | 48
-rw-r--r--  libbcachefs/fsck.c | 6
-rw-r--r--  libbcachefs/journal.c | 2
-rw-r--r--  libbcachefs/journal_io.c | 3
-rw-r--r--  libbcachefs/journal_sb.c | 15
-rw-r--r--  libbcachefs/movinggc.c | 2
-rw-r--r--  libbcachefs/rcu_pending.c | 650
-rw-r--r--  libbcachefs/rcu_pending.h | 27
-rw-r--r--  libbcachefs/recovery.c | 9
-rw-r--r--  libbcachefs/sb-counters_format.h | 3
-rw-r--r--  libbcachefs/sb-downgrade.c | 8
-rw-r--r--  libbcachefs/sb-errors_format.h | 2
-rw-r--r--  libbcachefs/sysfs.c | 9
-rw-r--r--  libbcachefs/thread_with_file.c | 2
-rw-r--r--  libbcachefs/time_stats.c | 14
-rw-r--r--  libbcachefs/time_stats.h | 3
-rw-r--r--  libbcachefs/trace.h | 6
-rw-r--r--  libbcachefs/util.c | 1
-rw-r--r--  linux/generic-radix-tree.c | 80
50 files changed, 1317 insertions, 635 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 5c595274..485f92e1 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-62439c6f1a6dba3fca1e57f352745d6e36dd1e31
+22fa8fc32e6aafb8bd76c6b746868dbdbc6a934d
diff --git a/Cargo.lock b/Cargo.lock
index 8f911bf2..4995ce52 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -73,7 +73,7 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "bcachefs-tools"
-version = "1.11.1"
+version = "1.12.0"
dependencies = [
"anyhow",
"bch_bindgen",
diff --git a/Cargo.toml b/Cargo.toml
index dab3ef64..f2b93756 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "bcachefs-tools"
-version = "1.11.1"
+version = "1.12.0"
authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>", "Kent Overstreet <kent.overstreet@linux.dev>" ]
edition = "2021"
rust-version = "1.70"
diff --git a/Makefile b/Makefile
index 6f4122da..23122fd6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-VERSION=1.11.1
+VERSION=1.12.0
PREFIX?=/usr/local
LIBEXECDIR?=$(PREFIX)/libexec
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index f3512fdd..5b51c3d5 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -41,6 +41,7 @@
#include <linux/limits.h>
#include <linux/log2.h>
#include <linux/math.h>
+#include <linux/slab.h>
#include <linux/types.h>
struct genradix_root;
@@ -48,10 +49,63 @@ struct genradix_root;
#define GENRADIX_NODE_SHIFT 9
#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
+#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
+
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH \
+ DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK \
+ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+static inline int genradix_depth_shift(unsigned depth)
+{
+ return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+}
+
+/*
+ * Returns size (of data, in bytes) that a tree of a given depth holds:
+ */
+static inline size_t genradix_depth_size(unsigned depth)
+{
+ return 1UL << genradix_depth_shift(depth);
+}
+
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+ return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+ return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
struct __genradix {
struct genradix_root *root;
};
+struct genradix_node {
+ union {
+ /* Interior node: */
+ struct genradix_node *children[GENRADIX_ARY];
+
+ /* Leaf: */
+ u8 data[GENRADIX_NODE_SIZE];
+ };
+};
+
+static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
+{
+ return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
+}
+
+static inline void genradix_free_node(struct genradix_node *node)
+{
+ kfree(node);
+}
+
/*
* NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
*/
@@ -128,6 +182,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
+static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
+{
+ struct genradix_root *r = READ_ONCE(radix->root);
+ struct genradix_node *n = genradix_root_to_node(r);
+ unsigned level = genradix_root_to_depth(r);
+ unsigned shift = genradix_depth_shift(level);
+
+ if (unlikely(ilog2(offset) >= genradix_depth_shift(level)))
+ return NULL;
+
+ while (n && shift > GENRADIX_NODE_SHIFT) {
+ shift -= GENRADIX_ARY_SHIFT;
+ n = n->children[offset >> shift];
+ offset &= (1UL << shift) - 1;
+ }
+
+ return n ? &n->data[offset] : NULL;
+}
+
+#define genradix_ptr_inlined(_radix, _idx) \
+ (__genradix_cast(_radix) \
+ __genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)))
+
void *__genradix_ptr(struct __genradix *, size_t);
/**
@@ -142,7 +220,24 @@ void *__genradix_ptr(struct __genradix *, size_t);
__genradix_ptr(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)))
-void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
+void *__genradix_ptr_alloc(struct __genradix *, size_t,
+ struct genradix_node **, gfp_t);
+
+#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \
+ (__genradix_cast(_radix) \
+ (__genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)) ?: \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ NULL, _gfp)))
+
+#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
+ (__genradix_cast(_radix) \
+ (__genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)) ?: \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _new_node, _gfp)))
/**
* genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
@@ -157,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
- _gfp))
+ NULL, _gfp))
+
+#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\
+ (__genradix_cast(_radix) \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _new_node, _gfp))
struct genradix_iter {
size_t offset;
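The depth arithmetic added above is easier to see with concrete numbers. Below is a minimal standalone sketch (illustrative only, not part of this patch; it assumes 64-bit pointers, so GENRADIX_ARY = 512 / 8 = 64 and GENRADIX_ARY_SHIFT = 6):

/* Illustrative sketch of the genradix depth/size arithmetic above.
 * Not part of the patch; assumes 64-bit pointers.
 */
#include <stdio.h>

#define GENRADIX_NODE_SHIFT	9
#define GENRADIX_NODE_SIZE	(1U << GENRADIX_NODE_SHIFT)		/* 512-byte nodes */
#define GENRADIX_ARY		(GENRADIX_NODE_SIZE / sizeof(void *))	/* 64 children per interior node */
#define GENRADIX_ARY_SHIFT	6					/* ilog2(64) */

/* Bytes of data addressable by a tree of the given depth: */
static unsigned long genradix_depth_size(unsigned depth)
{
	return 1UL << (GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth);
}

int main(void)
{
	/* depth 0: 512 B (a single leaf), depth 1: 32 KiB, depth 2: 2 MiB, ... */
	for (unsigned depth = 0; depth < 4; depth++)
		printf("depth %u: %lu bytes\n", depth, genradix_depth_size(depth));
	return 0;
}

In other words, __genradix_ptr_inlined() only ever walks a handful of levels: each level consumes GENRADIX_ARY_SHIFT bits of the byte offset until it reaches a 512-byte leaf.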
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 744a14ce..b7e83af0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3,6 +3,7 @@
#define _TOOLS_LINUX_MM_H
#include <sys/syscall.h>
+#include <unistd.h>
#include <linux/types.h>
struct sysinfo {
diff --git a/include/linux/slab.h b/include/linux/slab.h
index c17fb5d5..db6a5d55 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -18,6 +18,7 @@
#define alloc_hooks(_do, ...) _do
#define ARCH_KMALLOC_MINALIGN 16
+#define ARCH_SLAB_MINALIGN 16
#define KMALLOC_MAX_SIZE SIZE_MAX
#define MAX_PAGE_ORDER 10
@@ -102,6 +103,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
((size) != 0 && (n) > SIZE_MAX / (size) \
? NULL : kmalloc((n) * (size), flags))
+#define kvmalloc_array_noprof(...) kvmalloc_array(__VA_ARGS__)
+
#define kcalloc(n, size, flags) kmalloc_array(n, size, flags|__GFP_ZERO)
#define kfree(p) free((void *) p)
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 1c816804..12719eb5 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -1,6 +1,12 @@
#ifndef __TOOLS_LINUX_SRCU_H
#define __TOOLS_LINUX_SRCU_H
+#include <linux/rcupdate.h>
+
+#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2
+
+typedef void (*rcu_callback_t)(struct rcu_head *head);
+
struct srcu_struct {
};
@@ -26,10 +32,35 @@ static inline unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
return 0;
}
+#undef poll_state_synchronize_rcu
+static inline bool poll_state_synchronize_rcu(unsigned long cookie)
+{
+ return false;
+}
+
+#undef start_poll_synchronize_rcu
+static inline unsigned long start_poll_synchronize_rcu()
+{
+ return 0;
+}
+
+static inline unsigned long get_state_synchronize_rcu()
+{
+ return 0;
+}
+
static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) {}
+static inline void srcu_barrier(struct srcu_struct *ssp) {}
+
static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {}
+static inline void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ func(rhp);
+}
+
static inline int init_srcu_struct(struct srcu_struct *ssp)
{
return 0;
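These userspace shims are what the commit subject refers to: with no concurrent RCU read-side critical sections to wait out, a grace period is a no-op, so call_srcu() can run the callback immediately. A small standalone illustration of that degenerate behaviour follows (simplified stand-in types, not the real bcachefs or rcu_pending code):

/* Illustrative only -- simplified stand-ins, not the bcachefs headers.
 * Shows why the userspace call_srcu() can just invoke the callback: with no
 * RCU readers to wait for, "after a grace period" becomes "right now".
 */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct rcu_head { int unused; };
struct srcu_struct { int unused; };

typedef void (*rcu_callback_t)(struct rcu_head *head);

static inline void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			     rcu_callback_t func)
{
	(void) ssp;
	func(rhp);		/* no grace period to wait for in userspace */
}

struct obj {
	int		val;
	struct rcu_head	rcu;	/* embedded rcu_head, as added to struct bkey_cached below */
};

static void obj_free_rcu(struct rcu_head *rcu)
{
	struct obj *o = (struct obj *) ((char *) rcu - offsetof(struct obj, rcu));

	printf("freeing obj %d\n", o->val);
	free(o);
}

int main(void)
{
	struct srcu_struct barrier = { 0 };
	struct obj *o = malloc(sizeof(*o));

	o->val = 42;
	call_srcu(&barrier, &o->rcu, obj_free_rcu);	/* callback runs (and frees) immediately */
	return 0;
}

In a kernel build the same call defers the free until all SRCU readers are done; the userspace shim collapses that to a direct call, which is presumably what lets the new rcu_pending enqueue/dequeue path (used by the key cache rework below) run unmodified in bcachefs-tools.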
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index ac933142..b54ce7f8 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -30,6 +30,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
+#include <linux/jiffies.h>
static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
@@ -240,71 +241,73 @@ fsck_err:
int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
{
- struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+ struct bch_alloc_v4 a;
int ret = 0;
- bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k),
+ bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
+
+ bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
c, alloc_v4_val_size_bad,
"bad val size (%u > %zu)",
- alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
+ alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
- bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(a.v),
+ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
c, alloc_v4_backpointers_start_bad,
"invalid backpointers_start");
- bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type,
+ bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
c, alloc_key_data_type_bad,
"invalid data type (got %u should be %u)",
- a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+ a.data_type, alloc_data_type(a, a.data_type));
for (unsigned i = 0; i < 2; i++)
- bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
+ bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
c, alloc_key_io_time_bad,
"invalid io_time[%s]: %llu, max %llu",
i == READ ? "read" : "write",
- a.v->io_time[i], LRU_TIME_MAX);
+ a.io_time[i], LRU_TIME_MAX);
- unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
+ unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
offsetof(struct bch_alloc_v4, stripe_sectors)
- ? a.v->stripe_sectors
+ ? a.stripe_sectors
: 0;
- switch (a.v->data_type) {
+ switch (a.data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
bkey_fsck_err_on(stripe_sectors ||
- a.v->dirty_sectors ||
- a.v->cached_sectors ||
- a.v->stripe,
+ a.dirty_sectors ||
+ a.cached_sectors ||
+ a.stripe,
c, alloc_key_empty_but_have_data,
"empty data type free but have data %u.%u.%u %u",
stripe_sectors,
- a.v->dirty_sectors,
- a.v->cached_sectors,
- a.v->stripe);
+ a.dirty_sectors,
+ a.cached_sectors,
+ a.stripe);
break;
case BCH_DATA_sb:
case BCH_DATA_journal:
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
- bkey_fsck_err_on(!a.v->dirty_sectors &&
+ bkey_fsck_err_on(!a.dirty_sectors &&
!stripe_sectors,
c, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
- bch2_data_type_str(a.v->data_type));
+ bch2_data_type_str(a.data_type));
break;
case BCH_DATA_cached:
- bkey_fsck_err_on(!a.v->cached_sectors ||
- a.v->dirty_sectors ||
+ bkey_fsck_err_on(!a.cached_sectors ||
+ a.dirty_sectors ||
stripe_sectors ||
- a.v->stripe,
+ a.stripe,
c, alloc_key_cached_inconsistency,
"data type inconsistency");
- bkey_fsck_err_on(!a.v->io_time[READ] &&
+ bkey_fsck_err_on(!a.io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
c, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
@@ -1872,26 +1875,26 @@ static void bch2_do_discards_work(struct work_struct *work)
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_dev_do_discards(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
return;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
- goto put_ioref;
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_write_ref;
if (queue_work(c->write_ref_wq, &ca->discard_work))
return;
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
-put_ioref:
percpu_ref_put(&ca->io_ref);
+put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
@@ -2181,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
* freespace/need_discard/need_gc_gens btrees as needed:
*/
while (1) {
- if (last_updated + HZ * 10 < jiffies) {
+ if (time_after(jiffies, last_updated + HZ * 10)) {
bch_info(ca, "%s: currently at %llu/%llu",
__func__, iter.pos.offset, ca->mi.nbuckets);
last_updated = jiffies;
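The last hunk above replaces a raw jiffies comparison with time_after(), the wraparound-safe way to compare kernel timestamps. A tiny standalone demonstration of the difference (32-bit stand-ins chosen so the wrap is easy to trigger; the real jiffies and time_after() use unsigned long and long):

/* Illustrative only: why "last_updated + HZ * 10 < jiffies" breaks once the
 * counter wraps, while time_after() (signed subtraction) stays correct.
 * 32-bit stand-ins, not the kernel's jiffies.h.
 */
#include <stdio.h>

typedef unsigned int u32;

#define time_after(a, b)	((int) ((b) - (a)) < 0)	/* true if a is later than b */

int main(void)
{
	u32 HZ           = 100;
	u32 jiffies      = 100;		/* counter recently wrapped past 0xffffffff */
	u32 last_updated = 0xfffff000u;	/* ~4200 ticks ago, well over HZ * 10 */

	/* old check: prints 0, so the periodic progress message never fires */
	printf("naive:      %d\n", last_updated + HZ * 10 < jiffies);

	/* new check: prints 1, as expected */
	printf("time_after: %d\n", time_after(jiffies, last_updated + HZ * 10));
	return 0;
}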
diff --git a/libbcachefs/alloc_background_format.h b/libbcachefs/alloc_background_format.h
index 47d9d006..f754a295 100644
--- a/libbcachefs/alloc_background_format.h
+++ b/libbcachefs/alloc_background_format.h
@@ -69,6 +69,7 @@ struct bch_alloc_v4 {
__u64 io_time[2];
__u32 stripe;
__u32 nr_external_backpointers;
+ /* end of fields in original version of alloc_v4 */
__u64 fragmentation_lru;
__u32 stripe_sectors;
__u32 pad;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 00d1ca52..8c4adddd 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -677,7 +677,8 @@ struct bch_sb_field_ext {
x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
x(disk_accounting_v2, BCH_VERSION(1, 9)) \
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
- x(disk_accounting_inum, BCH_VERSION(1, 11))
+ x(disk_accounting_inum, BCH_VERSION(1, 11)) \
+ x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 1b66b2c7..d1f60926 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -885,6 +885,38 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
/* Insert */
+static void rw_aux_tree_insert_entry(struct btree *b,
+ struct bset_tree *t,
+ unsigned idx)
+{
+ EBUG_ON(!idx || idx > t->size);
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
+ struct bkey_packed *end = idx < t->size
+ ? rw_aux_to_bkey(b, t, idx)
+ : btree_bkey_last(b, t);
+
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (void *) end - (void *) start > L1_CACHE_BYTES) {
+ struct bkey_packed *k = start;
+
+ while (1) {
+ k = bkey_p_next(k);
+ if (k == end)
+ break;
+
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[idx + 1],
+ &rw_aux_tree(b, t)[idx],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx]);
+ t->size++;
+ rw_aux_tree_set(b, t, idx, k);
+ break;
+ }
+ }
+ }
+}
+
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
@@ -892,78 +924,54 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
unsigned new_u64s)
{
int shift = new_u64s - clobber_u64s;
- unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+ unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
+ if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
+ rw_aux_tree_insert_entry(b, t, t->size);
+ goto verify;
+ }
+
/* returns first entry >= where */
- l = rw_aux_tree_bsearch(b, t, where);
-
- if (!l) /* never delete first entry */
- l++;
- else if (l < t->size &&
- where < t->end_offset &&
- rw_aux_tree(b, t)[l].offset == where)
- rw_aux_tree_set(b, t, l++, _where);
-
- /* l now > where */
-
- for (j = l;
- j < t->size &&
- rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
- j++)
- ;
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset + shift ==
- rw_aux_tree(b, t)[l - 1].offset)
- j++;
-
- memmove(&rw_aux_tree(b, t)[l],
- &rw_aux_tree(b, t)[j],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[j]);
- t->size -= j - l;
-
- for (j = l; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
+ idx = rw_aux_tree_bsearch(b, t, where);
+
+ if (rw_aux_tree(b, t)[idx].offset == where) {
+ if (!idx) { /* never delete first entry */
+ idx++;
+ } else if (where < t->end_offset) {
+ rw_aux_tree_set(b, t, idx++, _where);
+ } else {
+ EBUG_ON(where != t->end_offset);
+ rw_aux_tree_insert_entry(b, t, --t->size);
+ goto verify;
+ }
+ }
- EBUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset ==
- rw_aux_tree(b, t)[l - 1].offset);
+ EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
+ if (idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset + shift ==
+ rw_aux_tree(b, t)[idx - 1].offset) {
+ memmove(&rw_aux_tree(b, t)[idx],
+ &rw_aux_tree(b, t)[idx + 1],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx + 1]);
+ t->size -= 1;
+ }
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (l < t->size
- ? rw_aux_tree(b, t)[l].offset
- : t->end_offset) -
- rw_aux_tree(b, t)[l - 1].offset >
- L1_CACHE_BYTES / sizeof(u64)) {
- struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
- struct bkey_packed *end = l < t->size
- ? rw_aux_to_bkey(b, t, l)
- : btree_bkey_last(b, t);
- struct bkey_packed *k = start;
+ for (j = idx; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
- while (1) {
- k = bkey_p_next(k);
- if (k == end)
- break;
+ EBUG_ON(idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset ==
+ rw_aux_tree(b, t)[idx - 1].offset);
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[l + 1],
- &rw_aux_tree(b, t)[l],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[l]);
- t->size++;
- rw_aux_tree_set(b, t, l, k);
- break;
- }
- }
- }
+ rw_aux_tree_insert_entry(b, t, idx);
+verify:
bch2_bset_verify_rw_aux_tree(b, t);
bset_aux_tree_verify(b);
}
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index cc778d7e..662f0f79 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return b;
}
+void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
+{
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+}
+
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@@ -661,9 +671,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
: &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
- unsigned flags;
- flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
/*
@@ -735,7 +743,12 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
- memalloc_nofs_restore(flags);
+ int ret = bch2_trans_relock(trans);
+ if (unlikely(ret)) {
+ bch2_btree_node_to_freelist(c, b);
+ return ERR_PTR(ret);
+ }
+
return b;
err:
mutex_lock(&bc->lock);
@@ -764,7 +777,6 @@ err:
}
mutex_unlock(&bc->lock);
- memalloc_nofs_restore(flags);
return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
@@ -856,6 +868,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
bch2_btree_node_read(trans, b, sync);
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+
if (!sync)
return NULL;
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index c0eb87a0..f8206400 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -12,6 +12,8 @@ struct btree_iter;
void bch2_recalc_btree_reserve(struct bch_fs *);
+void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
+
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 523a2867..6d87e577 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -587,6 +587,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
_btree_id, _pos, _flags, KEY_TYPE_##_type))
+#define bkey_val_copy(_dst_v, _src_k) \
+do { \
+ unsigned b = min_t(unsigned, sizeof(*_dst_v), \
+ bkey_val_bytes(_src_k.k)); \
+ memcpy(_dst_v, _src_k.v, b); \
+ if (b < sizeof(*_dst_v)) \
+ memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \
+} while (0)
+
static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
unsigned flags, unsigned type,
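bkey_val_copy() above copies min(sizeof destination, on-disk value size) and zero-fills the remainder, presumably so code like the bch2_alloc_v4_validate() change earlier in this diff can read fields that older on-disk versions don't carry without running past the end of the value. A simplified standalone sketch of the same pattern (illustrative types only, not the bcachefs bkey machinery):

/* Illustrative only -- simplified types, not the bcachefs bkey machinery.
 * Same pattern as bkey_val_copy(): copy what the on-disk value provides and
 * zero the rest, so newer fields read as 0 for keys written by old versions.
 */
#include <stdio.h>
#include <string.h>

struct alloc_old {			/* what an older key might carry */
	unsigned dirty_sectors;
};

struct alloc_cur {			/* current in-memory layout */
	unsigned dirty_sectors;
	unsigned stripe_sectors;	/* field added later */
};

static void val_copy(void *dst, size_t dst_size, const void *src, size_t src_size)
{
	size_t b = src_size < dst_size ? src_size : dst_size;

	memcpy(dst, src, b);
	if (b < dst_size)
		memset((char *) dst + b, 0, dst_size - b);
}

int main(void)
{
	struct alloc_old old = { .dirty_sectors = 128 };
	struct alloc_cur cur;

	val_copy(&cur, sizeof(cur), &old, sizeof(old));
	printf("dirty %u stripe %u\n", cur.dirty_sectors, cur.stripe_sectors);
	/* prints: dirty 128 stripe 0 */
	return 0;
}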
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 79954490..2e49ca71 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -79,130 +79,41 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
return true;
}
-static void bkey_cached_evict(struct btree_key_cache *c,
+static bool bkey_cached_evict(struct btree_key_cache *c,
struct bkey_cached *ck)
{
- BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
- bch2_btree_key_cache_params));
- memset(&ck->key, ~0, sizeof(ck->key));
-
- atomic_long_dec(&c->nr_keys);
-}
-
-static void bkey_cached_free(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
- BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
- ck->btree_trans_barrier_seq =
- start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
- if (ck->c.lock.readers) {
- list_move_tail(&ck->list, &bc->freed_pcpu);
- bc->nr_freed_pcpu++;
- } else {
- list_move_tail(&ck->list, &bc->freed_nonpcpu);
- bc->nr_freed_nonpcpu++;
- }
- atomic_long_inc(&bc->nr_freed);
-
- kfree(ck->k);
- ck->k = NULL;
- ck->u64s = 0;
-
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
-}
-
-#ifdef __KERNEL__
-static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- struct bkey_cached *pos;
-
- bc->nr_freed_nonpcpu++;
-
- list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
- if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
- pos->btree_trans_barrier_seq)) {
- list_move(&ck->list, &pos->list);
- return;
- }
+ bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
+ bch2_btree_key_cache_params);
+ if (ret) {
+ memset(&ck->key, ~0, sizeof(ck->key));
+ atomic_long_dec(&c->nr_keys);
}
- list_move(&ck->list, &bc->freed_nonpcpu);
+ return ret;
}
-#endif
-static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
{
- BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
- if (!ck->c.lock.readers) {
-#ifdef __KERNEL__
- struct btree_key_cache_freelist *f;
- bool freed = false;
-
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
+ struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
+ struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
- if (f->nr < ARRAY_SIZE(f->objs)) {
- f->objs[f->nr++] = ck;
- freed = true;
- }
- preempt_enable();
-
- if (!freed) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (f->nr > ARRAY_SIZE(f->objs) / 2) {
- struct bkey_cached *ck2 = f->objs[--f->nr];
-
- __bkey_cached_move_to_freelist_ordered(bc, ck2);
- }
- preempt_enable();
-
- __bkey_cached_move_to_freelist_ordered(bc, ck);
- mutex_unlock(&bc->lock);
- }
-#else
- mutex_lock(&bc->lock);
- list_move_tail(&ck->list, &bc->freed_nonpcpu);
- bc->nr_freed_nonpcpu++;
- mutex_unlock(&bc->lock);
-#endif
- } else {
- mutex_lock(&bc->lock);
- list_move_tail(&ck->list, &bc->freed_pcpu);
- bc->nr_freed_pcpu++;
- mutex_unlock(&bc->lock);
- }
+ this_cpu_dec(*c->btree_key_cache.nr_pending);
+ kmem_cache_free(bch2_key_cache, ck);
}
-static void bkey_cached_free_fast(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static void bkey_cached_free(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
- ck->btree_trans_barrier_seq =
- start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
- list_del_init(&ck->list);
- atomic_long_inc(&bc->nr_freed);
-
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
- bkey_cached_move_to_freelist(bc, ck);
-
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
+
+ bool pcpu_readers = ck->c.lock.readers != NULL;
+ rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
+ this_cpu_inc(*bc->nr_pending);
}
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
@@ -224,74 +135,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck = NULL;
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
int ret;
- if (!pcpu_readers) {
-#ifdef __KERNEL__
- struct btree_key_cache_freelist *f;
-
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
- if (f->nr)
- ck = f->objs[--f->nr];
- preempt_enable();
-
- if (!ck) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (!list_empty(&bc->freed_nonpcpu) &&
- f->nr < ARRAY_SIZE(f->objs) / 2) {
- ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_nonpcpu--;
- f->objs[f->nr++] = ck;
- }
-
- ck = f->nr ? f->objs[--f->nr] : NULL;
- preempt_enable();
- mutex_unlock(&bc->lock);
- }
-#else
- mutex_lock(&bc->lock);
- if (!list_empty(&bc->freed_nonpcpu)) {
- ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_nonpcpu--;
- }
- mutex_unlock(&bc->lock);
-#endif
- } else {
- mutex_lock(&bc->lock);
- if (!list_empty(&bc->freed_pcpu)) {
- ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_pcpu--;
- }
- mutex_unlock(&bc->lock);
- }
-
- if (ck) {
- ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
- if (unlikely(ret)) {
- bkey_cached_move_to_freelist(bc, ck);
- return ERR_PTR(ret);
- }
-
- btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
-
- ret = bch2_btree_node_lock_write(trans, path, &ck->c);
- if (unlikely(ret)) {
- btree_node_unlock(trans, path, 0);
- bkey_cached_move_to_freelist(bc, ck);
- return ERR_PTR(ret);
- }
-
- return ck;
- }
+ struct bkey_cached *ck = container_of_or_null(
+ rcu_pending_dequeue(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
ck = allocate_dropping_locks(trans, ret,
__bkey_cached_alloc(key_u64s, _gfp));
@@ -302,15 +153,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
return ERR_PTR(ret);
}
- if (!ck)
- return NULL;
-
- INIT_LIST_HEAD(&ck->list);
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ if (ck) {
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ ck->c.cached = true;
+ goto lock;
+ }
- ck->c.cached = true;
- BUG_ON(!six_trylock_intent(&ck->c.lock));
- BUG_ON(!six_trylock_write(&ck->c.lock));
+ ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
+lock:
+ six_lock_intent(&ck->c.lock, NULL, NULL);
+ six_lock_write(&ck->c.lock, NULL, NULL);
return ck;
}
@@ -322,21 +177,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
struct bkey_cached *ck;
unsigned i;
- mutex_lock(&c->lock);
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(c, ck);
- goto out;
+ if (bkey_cached_evict(c, ck))
+ goto out;
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
}
}
ck = NULL;
out:
rcu_read_unlock();
- mutex_unlock(&c->lock);
return ck;
}
@@ -415,7 +270,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
- bkey_cached_free_fast(bc, ck);
+ bkey_cached_free(bc, ck);
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
return ret;
@@ -611,8 +466,12 @@ evict:
}
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
- bkey_cached_evict(&c->btree_key_cache, ck);
- bkey_cached_free_fast(&c->btree_key_cache, ck);
+ if (bkey_cached_evict(&c->btree_key_cache, ck)) {
+ bkey_cached_free(&c->btree_key_cache, ck);
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ }
}
out:
bch2_trans_iter_exit(trans, &b_iter);
@@ -722,10 +581,11 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
}
bkey_cached_evict(bc, ck);
- bkey_cached_free_fast(bc, ck);
+ bkey_cached_free(bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ path->should_be_locked = false;
}
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
@@ -734,60 +594,41 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
- struct bkey_cached *ck, *t;
+ struct bkey_cached *ck;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned start, flags;
+ unsigned iter, start;
int srcu_idx;
- mutex_lock(&bc->lock);
- bc->requested_to_free += sc->nr_to_scan;
-
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- flags = memalloc_nofs_save();
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
/*
- * Newest freed entries are at the end of the list - once we hit one
- * that's too new to be freed, we can bail out:
+ * Scanning is expensive while a rehash is in progress - most elements
+ * will be on the new hashtable, if it's in progress
+ *
+ * A rehash could still start while we're scanning - that's ok, we'll
+ * still see most elements.
*/
- list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
- if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq))
- break;
-
- list_del(&ck->list);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- atomic_long_dec(&bc->nr_freed);
- bc->nr_freed_nonpcpu--;
- bc->freed++;
- }
-
- list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
- if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq))
- break;
-
- list_del(&ck->list);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- atomic_long_dec(&bc->nr_freed);
- bc->nr_freed_pcpu--;
- bc->freed++;
+ if (unlikely(tbl->nest)) {
+ rcu_read_unlock();
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ return SHRINK_STOP;
}
- rcu_read_lock();
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (bc->shrink_iter >= tbl->size)
- bc->shrink_iter = 0;
- start = bc->shrink_iter;
+ iter = bc->shrink_iter;
+ if (iter >= tbl->size)
+ iter = 0;
+ start = iter;
do {
struct rhash_head *pos, *next;
- pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+ pos = rht_ptr_rcu(&tbl->buckets[iter]);
while (!rht_is_a_nulls(pos)) {
- next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+ next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
ck = container_of(pos, struct bkey_cached, hash);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -797,29 +638,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->skipped_accessed++;
} else if (!bkey_cached_lock_for_evict(ck)) {
bc->skipped_lock_fail++;
- } else {
- bkey_cached_evict(bc, ck);
+ } else if (bkey_cached_evict(bc, ck)) {
bkey_cached_free(bc, ck);
- bc->moved_to_freelist++;
+ bc->freed++;
freed++;
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
}
scanned++;
if (scanned >= nr)
- break;
+ goto out;
pos = next;
}
- bc->shrink_iter++;
- if (bc->shrink_iter >= tbl->size)
- bc->shrink_iter = 0;
- } while (scanned < nr && bc->shrink_iter != start);
+ iter++;
+ if (iter >= tbl->size)
+ iter = 0;
+ } while (scanned < nr && iter != start);
+out:
+ bc->shrink_iter = iter;
rcu_read_unlock();
- memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
- mutex_unlock(&bc->lock);
return freed;
}
@@ -847,62 +690,37 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bucket_table *tbl;
- struct bkey_cached *ck, *n;
+ struct bkey_cached *ck;
struct rhash_head *pos;
LIST_HEAD(items);
unsigned i;
-#ifdef __KERNEL__
- int cpu;
-#endif
shrinker_free(bc->shrink);
- mutex_lock(&bc->lock);
-
/*
* The loop is needed to guard against racing with rehash:
*/
while (atomic_long_read(&bc->nr_keys)) {
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (tbl)
+ if (tbl) {
+ if (tbl->nest) {
+ /* wait for in progress rehash */
+ rcu_read_unlock();
+ mutex_lock(&bc->table.mutex);
+ mutex_unlock(&bc->table.mutex);
+ rcu_read_lock();
+ continue;
+ }
for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- bkey_cached_evict(bc, ck);
- list_add(&ck->list, &items);
+ while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
+ ck = container_of(pos, struct bkey_cached, hash);
+ BUG_ON(!bkey_cached_evict(bc, ck));
+ kfree(ck->k);
+ kmem_cache_free(bch2_key_cache, ck);
}
- rcu_read_unlock();
- }
-
-#ifdef __KERNEL__
- if (bc->pcpu_freed) {
- for_each_possible_cpu(cpu) {
- struct btree_key_cache_freelist *f =
- per_cpu_ptr(bc->pcpu_freed, cpu);
-
- for (i = 0; i < f->nr; i++) {
- ck = f->objs[i];
- list_add(&ck->list, &items);
- }
}
- }
-#endif
-
- BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
- BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
-
- list_splice(&bc->freed_pcpu, &items);
- list_splice(&bc->freed_nonpcpu, &items);
-
- mutex_unlock(&bc->lock);
-
- list_for_each_entry_safe(ck, n, &items, list) {
- cond_resched();
-
- list_del(&ck->list);
- kfree(ck->k);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
+ rcu_read_unlock();
}
if (atomic_long_read(&bc->nr_dirty) &&
@@ -918,14 +736,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
- free_percpu(bc->pcpu_freed);
+ rcu_pending_exit(&bc->pending[0]);
+ rcu_pending_exit(&bc->pending[1]);
+
+ free_percpu(bc->nr_pending);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
- mutex_init(&c->lock);
- INIT_LIST_HEAD(&c->freed_pcpu);
- INIT_LIST_HEAD(&c->freed_nonpcpu);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@@ -933,11 +751,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
-#ifdef __KERNEL__
- bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
- if (!bc->pcpu_freed)
+ bc->nr_pending = alloc_percpu(size_t);
+ if (!bc->nr_pending)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
+ rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-#endif
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
@@ -959,45 +779,21 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
printbuf_tabstop_push(out, 24);
printbuf_tabstop_push(out, 12);
- unsigned flags = memalloc_nofs_save();
- mutex_lock(&bc->lock);
prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
- prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
- prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
-
- prt_printf(out, "\nshrinker:\n");
+ prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
+ prt_newline(out);
+ prt_printf(out, "shrinker:\n");
prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
prt_printf(out, "freed:\t%lu\r\n", bc->freed);
- prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
-
- prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));
-
- struct bkey_cached *ck;
- unsigned iter = 0;
- list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
- prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
- if (++iter > 10)
- break;
- }
-
- iter = 0;
- list_for_each_entry(ck, &bc->freed_pcpu, list) {
- prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
- if (++iter > 10)
- break;
- }
- mutex_unlock(&bc->lock);
- memalloc_flags_restore(flags);
+ prt_newline(out);
+ prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
}
void bch2_btree_key_cache_exit(void)
diff --git a/libbcachefs/btree_key_cache_types.h b/libbcachefs/btree_key_cache_types.h
index 237e8bb3..722f1ed1 100644
--- a/libbcachefs/btree_key_cache_types.h
+++ b/libbcachefs/btree_key_cache_types.h
@@ -2,33 +2,25 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-struct btree_key_cache_freelist {
- struct bkey_cached *objs[16];
- unsigned nr;
-};
+#include "rcu_pending.h"
struct btree_key_cache {
- struct mutex lock;
struct rhashtable table;
bool table_init_done;
- struct list_head freed_pcpu;
- size_t nr_freed_pcpu;
- struct list_head freed_nonpcpu;
- size_t nr_freed_nonpcpu;
-
struct shrinker *shrink;
unsigned shrink_iter;
- struct btree_key_cache_freelist __percpu *pcpu_freed;
- atomic_long_t nr_freed;
+ /* 0: non pcpu reader locks, 1: pcpu reader locks */
+ struct rcu_pending pending[2];
+ size_t __percpu *nr_pending;
+
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
/* shrinker stats */
unsigned long requested_to_free;
unsigned long freed;
- unsigned long moved_to_freelist;
unsigned long skipped_dirty;
unsigned long skipped_accessed;
unsigned long skipped_lock_fail;
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index dd0a2a1a..7c07f9fa 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -218,14 +218,12 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
bool lock_may_not_fail,
unsigned long ip)
{
- int ret;
-
trans->lock_may_not_fail = lock_may_not_fail;
trans->lock_must_abort = false;
trans->locking = b;
- ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
- bch2_six_check_for_deadlock, trans, ip);
+ int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
WRITE_ONCE(trans->locking, NULL);
WRITE_ONCE(trans->locking_wait.start_time, 0);
@@ -284,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
+ bch2_trans_verify_not_unlocked(trans);
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index b256b2a2..0df07929 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -386,17 +386,16 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
- unsigned long btree_trans_barrier_seq;
u16 u64s;
struct bkey_cached_key key;
struct rhash_head hash;
- struct list_head list;
struct journal_entry_pin journal;
u64 seq;
struct bkey_i *k;
+ struct rcu_head rcu;
};
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 2b48cd14..1433aefb 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
: 0;
int ret;
+ b = bch2_btree_node_mem_alloc(trans, interior_node);
+ if (IS_ERR(b))
+ return b;
+
+ BUG_ON(b->ob.nr);
+
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a =
@@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
obs = a->ob;
bkey_copy(&tmp.k, &a->k);
mutex_unlock(&c->btree_reserve_cache_lock);
- goto mem_alloc;
+ goto out;
}
mutex_unlock(&c->btree_reserve_cache_lock);
-
retry:
ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
@@ -341,7 +346,7 @@ retry:
c->opts.metadata_replicas_required),
watermark, 0, cl, &wp);
if (unlikely(ret))
- return ERR_PTR(ret);
+ goto err;
if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
@@ -360,19 +365,16 @@ retry:
bch2_open_bucket_get(c, wp, &obs);
bch2_alloc_sectors_done(c, wp);
-mem_alloc:
- b = bch2_btree_node_mem_alloc(trans, interior_node);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- /* we hold cannibalize_lock: */
- BUG_ON(IS_ERR(b));
- BUG_ON(b->ob.nr);
-
+out:
bkey_copy(&b->key, &tmp.k);
b->ob = obs;
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
return b;
+err:
+ bch2_btree_node_to_freelist(c, b);
+ return ERR_PTR(ret);
}
static struct btree *bch2_btree_node_alloc(struct btree_update *as,
@@ -730,6 +732,18 @@ static void btree_update_nodes_written(struct btree_update *as)
"%s", bch2_err_str(ret));
err:
/*
+ * Ensure transaction is unlocked before using btree_node_lock_nopath()
+ * (the use of which is always suspect, we need to work on removing this
+ * in the future)
+ *
+ * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
+ * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
+ * rarely end up with a locked path besides the one we have here:
+ */
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
+
+ /*
* We have to be careful because another thread might be getting ready
* to free as->b and calling btree_update_reparent() on us - we'll
* recheck under btree_update_lock below:
@@ -748,18 +762,6 @@ err:
* we're in journal error state:
*/
- /*
- * Ensure transaction is unlocked before using
- * btree_node_lock_nopath() (the use of which is always suspect,
- * we need to work on removing this in the future)
- *
- * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
- * calls bch2_path_upgrade(), before we call path_make_mut(), so
- * we may rarely end up with a locked path besides the one we
- * have here:
- */
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
}
new_hash = bch2_btree_node_mem_alloc(trans, false);
+ ret = PTR_ERR_OR_ZERO(new_hash);
+ if (ret)
+ goto err;
}
path->intent_ref++;
@@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
commit_flags, skip_triggers);
--path->intent_ref;
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- list_move(&new_hash->list, &c->btree_cache.freeable);
- mutex_unlock(&c->btree_cache.lock);
-
- six_unlock_write(&new_hash->c.lock);
- six_unlock_intent(&new_hash->c.lock);
- }
+ if (new_hash)
+ bch2_btree_node_to_freelist(c, new_hash);
+err:
closure_sync(&cl);
bch2_btree_cache_cannibalize_unlock(trans);
return ret;
@@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
b = bch2_btree_node_mem_alloc(trans, false);
bch2_btree_cache_cannibalize_unlock(trans);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret;
+
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
b->c.level = level;
@@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
- bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level));
+ bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
}
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 02c6ecad..10f40095 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
unsigned level,
unsigned flags)
{
+ bch2_trans_verify_not_unlocked(trans);
+
return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index be2bbd24..a2274429 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -699,7 +699,8 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
+ enum btree_iter_update_trigger_flags flags,
+ s64 *replicas_sectors)
{
bool gc = flags & BTREE_TRIGGER_gc;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -708,7 +709,6 @@ static int __trigger_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
- s64 replicas_sectors = 0;
int ret = 0;
struct disk_accounting_pos acc_replicas_key = {
@@ -739,7 +739,7 @@ static int __trigger_extent(struct btree_trans *trans,
if (ret)
return ret;
} else if (!p.has_ec) {
- replicas_sectors += disk_sectors;
+ *replicas_sectors += disk_sectors;
acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -777,7 +777,7 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (acc_replicas_key.replicas.nr_devs) {
- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
if (ret)
return ret;
}
@@ -787,7 +787,7 @@ static int __trigger_extent(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_snapshot,
.snapshot.id = k.k->p.snapshot,
};
- ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
if (ret)
return ret;
}
@@ -807,7 +807,7 @@ static int __trigger_extent(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_btree,
.btree.id = btree_id,
};
- ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
if (ret)
return ret;
} else {
@@ -819,22 +819,13 @@ static int __trigger_extent(struct btree_trans *trans,
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
- replicas_sectors,
+ *replicas_sectors,
};
ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
if (ret)
return ret;
}
- if (bch2_bkey_rebalance_opts(k)) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_rebalance_work,
- };
- ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc);
- if (ret)
- return ret;
- }
-
return 0;
}
@@ -843,6 +834,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
@@ -858,21 +850,53 @@ int bch2_trigger_extent(struct btree_trans *trans,
new_ptrs_bytes))
return 0;
- if (flags & BTREE_TRIGGER_transactional) {
- struct bch_fs *c = trans->c;
- int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
- (int) bch2_bkey_needs_rebalance(c, old);
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+ s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
+
+ if (old.k->type) {
+ int ret = __trigger_extent(trans, btree, level, old,
+ flags & ~BTREE_TRIGGER_insert,
+ &old_replicas_sectors);
+ if (ret)
+ return ret;
+ }
+
+ if (new.k->type) {
+ int ret = __trigger_extent(trans, btree, level, new.s_c,
+ flags & ~BTREE_TRIGGER_overwrite,
+ &new_replicas_sectors);
+ if (ret)
+ return ret;
+ }
+
+ int need_rebalance_delta = 0;
+ s64 need_rebalance_sectors_delta = 0;
+
+ s64 s = bch2_bkey_sectors_need_rebalance(c, old);
+ need_rebalance_delta -= s != 0;
+ need_rebalance_sectors_delta -= s;
- if (mod) {
+ s = bch2_bkey_sectors_need_rebalance(c, old);
+ need_rebalance_delta += s != 0;
+ need_rebalance_sectors_delta += s;
+
+ if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, mod > 0);
+ new.k->p, need_rebalance_delta > 0);
if (ret)
return ret;
}
- }
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
- return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
+ if (need_rebalance_sectors_delta) {
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_rebalance_work,
+ };
+ int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
+ flags & BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
+ }
return 0;
}
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index c9698cdf..a19460a1 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -24,7 +24,7 @@ struct bucket_array {
u16 first_bucket;
size_t nbuckets;
size_t nbuckets_minus_first;
- struct bucket b[];
+ struct bucket b[] __counted_by(nbuckets);
};
struct bucket_gens {
diff --git a/libbcachefs/darray.c b/libbcachefs/darray.c
index b7d223f8..4f06cd8b 100644
--- a/libbcachefs/darray.c
+++ b/libbcachefs/darray.c
@@ -4,12 +4,12 @@
#include <linux/slab.h>
#include "darray.h"
-int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
- void *data = kvmalloc_array(new_size, element_size, gfp);
+ void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
if (!data)
return -ENOMEM;
diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h
index 4b340d13..8f4c3f06 100644
--- a/libbcachefs/darray.h
+++ b/libbcachefs/darray.h
@@ -22,29 +22,23 @@ struct { \
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
-int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
-
-static inline int __darray_resize(darray_char *d, size_t element_size,
- size_t new_size, gfp_t gfp)
-{
- return unlikely(new_size > d->size)
- ? __bch2_darray_resize(d, element_size, new_size, gfp)
- : 0;
-}
+int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
+
+#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
+
+#define __darray_resize(_d, _element_size, _new_size, _gfp) \
+ (unlikely((_new_size) > (_d)->size) \
+ ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
+ : 0)
#define darray_resize_gfp(_d, _new_size, _gfp) \
- unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
-{
- return __darray_resize(d, t_size, d->nr + more, gfp);
-}
-
#define darray_make_room_gfp(_d, _more, _gfp) \
- __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
+ darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index c219ee08..65176d51 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -79,6 +79,8 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
bkey_for_each_ptr(ptrs, ptr2) {
if (ptr2 == ptr)
break;
+
+ bucket = PTR_BUCKET_POS(ca, ptr2);
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
}
return false;
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 34805d14..ab5a7ade 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -145,7 +145,6 @@
x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
- x(BCH_ERR_transaction_restart, transaction_restart_freeing_inode) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
x(0, no_btree_node) \
x(BCH_ERR_no_btree_node, no_btree_node_relock) \
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 9406f82f..e317df36 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -1379,6 +1379,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
return r != NULL;
}
+static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ u64 sectors = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten) {
+ sectors = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ sectors += p.crc.compressed_size;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target))
+ sectors += p.crc.compressed_size;
+ }
+
+ return sectors;
+}
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0;
+}
+
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
struct bch_io_opts *opts)
{
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 8be05a6a..8e1ba46f 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -692,6 +692,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
unsigned, unsigned);
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
struct bch_io_opts *);
diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h
index fd7d692c..fad911cf 100644
--- a/libbcachefs/fs-io-pagecache.h
+++ b/libbcachefs/fs-io-pagecache.h
@@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio)
static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
- return folio_has_private(folio)
- ? (struct bch_folio *) folio_get_private(folio)
- : NULL;
+ return folio_get_private(folio);
}
static inline struct bch_folio *bch2_folio(struct folio *folio)
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index f5dfb7e4..405cf08b 100644
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c,
mutex_lock(&c->sb_lock);
strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
- mutex_unlock(&c->sb_lock);
-
ret = bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
mnt_drop_write_file(file);
return ret;
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 429ea110..7a9c164c 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -193,13 +193,19 @@ repeat:
inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
if (inode) {
spin_lock(&inode->v.i_lock);
+ if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
+ spin_unlock(&inode->v.i_lock);
+ return NULL;
+ }
if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
- if (trans)
+ if (!trans) {
+ __wait_on_freeing_inode(&inode->v);
+ } else {
bch2_trans_unlock(trans);
- __wait_on_freeing_inode(&inode->v);
- if (trans) {
- trace_and_count(c, trans_restart_freeing_inode, trans, _THIS_IP_);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_freeing_inode));
+ __wait_on_freeing_inode(&inode->v);
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
}
goto repeat;
}
@@ -212,11 +218,14 @@ repeat:
static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
- if (test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
+ spin_lock(&inode->v.i_lock);
+ bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
+ spin_unlock(&inode->v.i_lock);
+
+ if (remove) {
int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
&inode->hash, bch2_vfs_inodes_params);
BUG_ON(ret);
- clear_bit(EI_INODE_HASHED, &inode->ei_flags);
inode->v.i_hash.pprev = NULL;
}
}
@@ -226,6 +235,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
struct bch_inode_info *inode)
{
struct bch_inode_info *old = inode;
+
+ set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
&inode->hash,
@@ -233,8 +244,8 @@ retry:
old = bch2_inode_hash_find(c, trans, inode->ei_inum);
if (!old)
goto retry;
- if (IS_ERR(old))
- return old;
+
+ clear_bit(EI_INODE_HASHED, &inode->ei_flags);
/*
* bcachefs doesn't use I_NEW; we have no use for it since we
@@ -249,9 +260,8 @@ retry:
*/
set_nlink(&inode->v, 1);
discard_new_inode(&inode->v);
- inode = old;
+ return old;
} else {
- set_bit(EI_INODE_HASHED, &inode->ei_flags);
inode_fake_hash(&inode->v);
inode_sb_list_add(&inode->v);
@@ -259,9 +269,8 @@ retry:
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
+ return inode;
}
-
- return inode;
}
#define memalloc_flags_do(_flags, _do) \
@@ -333,14 +342,7 @@ static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *tr
bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
- struct bch_inode_info *ret = bch2_inode_hash_insert(trans->c, trans, inode);
- if (IS_ERR(ret)) {
- inode->v.i_state |= I_NEW;
- set_nlink(&inode->v, 1);
- discard_new_inode(&inode->v);
- }
-
- return ret;
+ return bch2_inode_hash_insert(trans->c, trans, inode);
}
@@ -1656,6 +1658,10 @@ static void bch2_evict_inode(struct inode *vinode)
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode);
+ /*
+ * evict() has waited for outstanding writeback, we'll do no more IO
+ * through this inode: it's safe to remove from VFS inode hashtable here
+ */
bch2_inode_hash_remove(c, inode);
truncate_inode_pages_final(&inode->v.i_data);
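
The fs.c hunks above close a race between lookup and eviction by making EI_INODE_HASHED authoritative under i_lock: lookup bails out if the bit is already clear, and removal claims the bit with test_and_clear_bit() so only one path performs the rhashtable removal. A minimal kernel-style sketch of that bit-under-spinlock handshake, with toy names standing in for bch_inode_info and its flag:

#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/types.h>

#define OBJ_HASHED	0	/* stand-in for EI_INODE_HASHED */

struct obj {
	spinlock_t	lock;	/* stand-in for inode->v.i_lock */
	unsigned long	flags;
};

/* teardown side: claim the bit under the lock; only one caller sees true
 * and goes on to do the actual hash table removal */
static bool obj_try_unhash(struct obj *o)
{
	spin_lock(&o->lock);
	bool remove = test_and_clear_bit(OBJ_HASHED, &o->flags);
	spin_unlock(&o->lock);
	return remove;
}

/* lookup side: an object whose bit is already clear is being torn down
 * and must be treated as not found */
static bool obj_still_hashed(struct obj *o)
{
	spin_lock(&o->lock);
	bool hashed = test_bit(OBJ_HASHED, &o->flags);
	spin_unlock(&o->lock);
	return hashed;
}
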
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 9138944c..83bd31b4 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -2006,7 +2006,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret) {
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
ret = -BCH_ERR_fsck_repair_unimplemented;
- ret = 0;
goto err;
}
@@ -2216,6 +2215,8 @@ int bch2_check_xattrs(struct bch_fs *c)
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
+
+ inode_walker_exit(&inode);
bch_err_fn(c, ret);
return ret;
}
@@ -2469,8 +2470,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
: bch2_inode_unpack(inode_k, &inode);
if (ret) {
/* Should have been caught in dirents pass */
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(c, "error looking up parent directory: %i", ret);
+ bch_err_msg(c, ret, "error looking up parent directory");
break;
}
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 649e3a01..f5f7db50 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
}
if (!had_entries)
- j->last_empty_seq = cur_seq;
+ j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 7664b68e..32b886fe 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1950,7 +1950,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
if (error ||
w->noflush ||
(!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ time_before(jiffies, j->last_flush_write +
+ msecs_to_jiffies(c->opts.journal_flush_delay)) &&
test_bit(JOURNAL_may_skip_flush, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
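
The flush-delay test above is rewritten with time_before(), the kernel's wrap-safe way of expressing "the deadline has not been reached yet". The userspace demo below shows why the deadline-style comparison needs the signed-difference form that time_before() expands to; my_time_before() is only a stand-in for the real macro:

#include <stdio.h>
#include <limits.h>

typedef unsigned long jiffies_t;

/* same signed-difference trick as the kernel's time_before() */
static int my_time_before(jiffies_t a, jiffies_t b)
{
	return (long)(a - b) < 0;
}

int main(void)
{
	jiffies_t last  = ULONG_MAX - 10;	/* last flush, shortly before wrap */
	jiffies_t delay = 100;			/* flush delay in jiffies */
	jiffies_t now   = ULONG_MAX - 5;	/* 5 ticks later, still within delay */

	/* a direct comparison breaks once last + delay wraps past zero */
	printf("naive  now < last + delay : %d (wrong)\n", now < last + delay);
	printf("wrap-safe time_before     : %d (right)\n",
	       my_time_before(now, last + delay));
	return 0;
}
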
diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c
index db80e506..62b910f2 100644
--- a/libbcachefs/journal_sb.c
+++ b/libbcachefs/journal_sb.c
@@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
int ret = -BCH_ERR_invalid_sb_journal;
+ u64 sum = 0;
unsigned nr;
unsigned i;
struct u64_range *b;
@@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
for (i = 0; i < nr; i++) {
b[i].start = le64_to_cpu(journal->d[i].start);
b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+
+ if (b[i].end <= b[i].start) {
+ prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].nr));
+ goto err;
+ }
+
+ sum += le64_to_cpu(journal->d[i].nr);
}
sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
@@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
}
}
+ if (sum > UINT_MAX) {
+ prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
+ goto err;
+ }
+
ret = 0;
err:
kfree(b);
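
The added validation above rejects per-entry bucket counts that are zero or that wrap past the start, and caps the total at UINT_MAX, presumably because the runtime bucket count is held in 32 bits. A self-contained sketch of the same validation shape, using a made-up range type rather than the superblock structures:

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

struct range { uint64_t start, nr; };

/* returns 0 if the set of ranges is acceptable, -1 otherwise */
static int validate_ranges(const struct range *r, unsigned nr_ranges)
{
	uint64_t sum = 0;

	for (unsigned i = 0; i < nr_ranges; i++) {
		/* nr == 0, or start + nr wrapping, both make end <= start */
		if (r[i].start + r[i].nr <= r[i].start)
			return -1;
		sum += r[i].nr;
	}

	/* total must fit the 32-bit counter used at runtime */
	if (sum > UINT_MAX)
		return -1;

	return 0;
}

int main(void)
{
	struct range bad[] = { { 10, 0 } };		/* empty entry: rejected */
	struct range ok[]  = { { 10, 16 }, { 100, 16 } };

	printf("%d %d\n", validate_ranges(bad, 1), validate_ranges(ok, 2));
	return 0;
}
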
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index deef4f02..d86565bf 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
- bch2_trans_unlock_long(ctxt.trans);
+ move_buckets_wait(&ctxt, buckets, true);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}
diff --git a/libbcachefs/rcu_pending.c b/libbcachefs/rcu_pending.c
new file mode 100644
index 00000000..19a64660
--- /dev/null
+++ b/libbcachefs/rcu_pending.c
@@ -0,0 +1,650 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+
+#include <linux/generic-radix-tree.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/vmalloc.h>
+
+#include "rcu_pending.h"
+#include "darray.h"
+#include "util.h"
+
+#define static_array_for_each(_a, _i) \
+ for (typeof(&(_a)[0]) _i = _a; \
+ _i < (_a) + ARRAY_SIZE(_a); \
+ _i++)
+
+enum rcu_pending_special {
+ RCU_PENDING_KVFREE = 1,
+ RCU_PENDING_CALL_RCU = 2,
+};
+
+#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
+#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
+
+static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? get_state_synchronize_srcu(ssp)
+ : get_state_synchronize_rcu();
+}
+
+static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? start_poll_synchronize_srcu(ssp)
+ : start_poll_synchronize_rcu();
+}
+
+static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ return ssp
+ ? poll_state_synchronize_srcu(ssp, cookie)
+ : poll_state_synchronize_rcu(cookie);
+}
+
+static inline void __rcu_barrier(struct srcu_struct *ssp)
+{
+ return ssp
+ ? srcu_barrier(ssp)
+ : rcu_barrier();
+}
+
+static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ if (ssp)
+ call_srcu(ssp, rhp, func);
+ else
+ call_rcu(rhp, func);
+}
+
+struct rcu_pending_seq {
+ /*
+ * We're using a radix tree like a vector - we're just pushing elements
+ * onto the end; we're using a radix tree instead of an actual vector to
+ * avoid reallocation overhead
+ */
+ GENRADIX(struct rcu_head *) objs;
+ size_t nr;
+ struct rcu_head **cursor;
+ unsigned long seq;
+};
+
+struct rcu_pending_list {
+ struct rcu_head *head;
+ struct rcu_head *tail;
+ unsigned long seq;
+};
+
+struct rcu_pending_pcpu {
+ struct rcu_pending *parent;
+ spinlock_t lock;
+ int cpu;
+
+ /*
+ * We can't bound the number of unprocessed gp sequence numbers, and we
+ * can't efficiently merge radix trees for expired grace periods, so we
+ * need darray/vector:
+ */
+ DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
+
+ /* Third entry is for expired objects: */
+ struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
+
+ struct rcu_head cb;
+ bool cb_armed;
+ struct work_struct work;
+};
+
+static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
+{
+ if (p->objs.nr)
+ return true;
+
+ static_array_for_each(p->lists, i)
+ if (i->head)
+ return true;
+
+ return false;
+}
+
+static void rcu_pending_list_merge(struct rcu_pending_list *l1,
+ struct rcu_pending_list *l2)
+{
+#ifdef __KERNEL__
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next = l2->head;
+#else
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next.next = (void *) l2->head;
+#endif
+
+ l1->tail = l2->tail;
+ l2->head = l2->tail = NULL;
+}
+
+static void rcu_pending_list_add(struct rcu_pending_list *l,
+ struct rcu_head *n)
+{
+#ifdef __KERNEL__
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next = n;
+ l->tail = n;
+ n->next = NULL;
+#else
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next.next = (void *) n;
+ l->tail = n;
+ n->next.next = NULL;
+#endif
+}
+
+static void merge_expired_lists(struct rcu_pending_pcpu *p)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+
+ for (struct rcu_pending_list *i = p->lists; i < expired; i++)
+ if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
+ rcu_pending_list_merge(expired, i);
+}
+
+#ifndef __KERNEL__
+static inline void kfree_bulk(size_t nr, void ** p)
+{
+ while (nr--)
+ kfree(*p);
+}
+
+#define local_irq_save(flags) \
+do { \
+ flags = 0; \
+} while (0)
+#endif
+
+static noinline void __process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ struct rcu_pending_seq objs = {};
+ struct rcu_head *list = NULL;
+
+ if (p->objs.nr &&
+ __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
+ objs = p->objs.data[0];
+ darray_remove_item(&p->objs, p->objs.data);
+ }
+
+ merge_expired_lists(p);
+
+ list = expired->head;
+ expired->head = expired->tail = NULL;
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ for (size_t i = 0; i < objs.nr; ) {
+ size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
+
+ kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
+ i += nr_this_node;
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+
+ /*
+ * low bit of pointer indicates whether rcu_head needs
+ * to be freed - kvfree_rcu_mightsleep()
+ */
+ BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
+
+ void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
+ kvfree(ptr);
+
+ bool free_head = ((unsigned long) obj->func) & 1UL;
+ if (free_head)
+ kfree(obj);
+ }
+
+ break;
+
+ case RCU_PENDING_CALL_RCU:
+ for (size_t i = 0; i < objs.nr; i++) {
+ struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
+ obj->func(obj);
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ obj->func(obj);
+ }
+ break;
+
+ default:
+ for (size_t i = 0; i < objs.nr; i++)
+ pending->process(pending, *genradix_ptr(&objs.objs, i));
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ pending->process(pending, obj);
+ }
+ break;
+ }
+}
+
+static bool process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ /*
+ * XXX: we should grab the gp seq once and avoid multiple function
+ * calls, this is called from __rcu_pending_enqueue() fastpath in
+ * may_sleep==true mode
+ */
+ if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
+ (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
+ (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
+ p->lists[2].head) {
+ __process_finished_items(pending, p, flags);
+ return true;
+ }
+
+ return false;
+}
+
+static void rcu_pending_work(struct work_struct *work)
+{
+ struct rcu_pending_pcpu *p =
+ container_of(work, struct rcu_pending_pcpu, work);
+ struct rcu_pending *pending = p->parent;
+ unsigned long flags;
+
+ do {
+ spin_lock_irqsave(&p->lock, flags);
+ } while (process_finished_items(pending, p, flags));
+
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void rcu_pending_rcu_cb(struct rcu_head *rcu)
+{
+ struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
+
+ schedule_work_on(p->cpu, &p->work);
+
+ unsigned long flags;
+ spin_lock_irqsave(&p->lock, flags);
+ if (__rcu_pending_has_pending(p)) {
+ spin_unlock_irqrestore(&p->lock, flags);
+ __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ p->cb_armed = false;
+ spin_unlock_irqrestore(&p->lock, flags);
+ }
+}
+
+static __always_inline struct rcu_pending_seq *
+get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
+{
+ darray_for_each_reverse(p->objs, objs)
+ if (objs->seq == seq)
+ return objs;
+
+ if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
+ return NULL;
+
+ return &darray_last(p->objs);
+}
+
+static noinline bool
+rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
+ struct rcu_head *head, void *ptr,
+ unsigned long *flags)
+{
+ if (ptr) {
+ if (!head) {
+ /*
+ * kvfree_rcu_mightsleep(): we weren't passed an
+ * rcu_head, but we need one: use the low bit of the
+			 * pointer to free to flag that the head needs to be
+ * freed as well:
+ */
+ ptr = (void *)(((unsigned long) ptr)|1UL);
+ head = kmalloc(sizeof(*head), __GFP_NOWARN);
+ if (!head) {
+ spin_unlock_irqrestore(&p->lock, *flags);
+ head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
+ /*
+ * dropped lock, did GFP_KERNEL allocation,
+ * check for gp expiration
+ */
+ if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
+ kvfree(--ptr);
+ kfree(head);
+ spin_lock_irqsave(&p->lock, *flags);
+ return false;
+ }
+ }
+ }
+
+ head->func = ptr;
+ }
+again:
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (i->seq == seq) {
+ rcu_pending_list_add(i, head);
+ return false;
+ }
+ }
+
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (!i->head) {
+ i->seq = seq;
+ rcu_pending_list_add(i, head);
+ return true;
+ }
+ }
+
+ merge_expired_lists(p);
+ goto again;
+}
+
+/*
+ * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
+ * pending->process) once a grace period elapses.
+ *
+ * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
+ * back to a linked list.
+ *
+ * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
+ * process callback
+ *
+ * - If @ptr and @head are both not NULL, we're kvfree_rcu()
+ *
+ * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
+ *
+ * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
+ * expired items.
+ */
+static __always_inline void
+__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
+ void *ptr, bool may_sleep)
+{
+
+ struct rcu_pending_pcpu *p;
+ struct rcu_pending_seq *objs;
+ struct genradix_node *new_node = NULL;
+ unsigned long seq, flags;
+ bool start_gp = false;
+
+ BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ seq = __get_state_synchronize_rcu(pending->srcu);
+restart:
+ if (may_sleep &&
+ unlikely(process_finished_items(pending, p, flags)))
+ goto check_expired;
+
+ /*
+ * In kvfree_rcu() mode, the radix tree is only for slab pointers so
+ * that we can do kfree_bulk() - vmalloc pointers always use the linked
+ * list:
+ */
+ if (ptr && unlikely(is_vmalloc_addr(ptr)))
+ goto list_add;
+
+ objs = get_object_radix(p, seq);
+ if (unlikely(!objs))
+ goto list_add;
+
+ if (unlikely(!objs->cursor)) {
+ /*
+ * New radix tree nodes must be added under @p->lock because the
+ * tree root is in a darray that can be resized (typically,
+ * genradix supports concurrent unlocked allocation of new
+ * nodes) - hence preallocation and the retry loop:
+ */
+ objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
+ objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
+ if (unlikely(!objs->cursor)) {
+ if (may_sleep) {
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ gfp_t gfp = GFP_KERNEL;
+ if (!head)
+ gfp |= __GFP_NOFAIL;
+
+ new_node = genradix_alloc_node(gfp);
+ if (!new_node)
+ may_sleep = false;
+ goto check_expired;
+ }
+list_add:
+ start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
+ goto start_gp;
+ }
+ }
+
+ *objs->cursor++ = ptr ?: head;
+ /* zero cursor if we hit the end of a radix tree node: */
+ if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
+ objs->cursor = NULL;
+ start_gp = !objs->nr;
+ objs->nr++;
+start_gp:
+ if (unlikely(start_gp)) {
+ /*
+ * We only have one callback (ideally, we would have one for
+		 * every outstanding grace period) - so if our callback is
+ * already in flight, we may still have to start a grace period
+ * (since we used get_state() above, not start_poll())
+ */
+ if (!p->cb_armed) {
+ p->cb_armed = true;
+ __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ __start_poll_synchronize_rcu(pending->srcu);
+ }
+ }
+ spin_unlock_irqrestore(&p->lock, flags);
+free_node:
+ if (new_node)
+ genradix_free_node(new_node);
+ return;
+check_expired:
+ if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ kvfree(ptr);
+ break;
+ case RCU_PENDING_CALL_RCU:
+ head->func(head);
+ break;
+ default:
+ pending->process(pending, head);
+ break;
+ }
+ goto free_node;
+ }
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ goto restart;
+}
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
+{
+ __rcu_pending_enqueue(pending, obj, NULL, true);
+}
+
+static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
+{
+ struct rcu_head *ret = NULL;
+
+ spin_lock_irq(&p->lock);
+ darray_for_each(p->objs, objs)
+ if (objs->nr) {
+ ret = *genradix_ptr(&objs->objs, --objs->nr);
+ objs->cursor = NULL;
+ if (!objs->nr)
+ genradix_free(&objs->objs);
+ goto out;
+ }
+
+ static_array_for_each(p->lists, i)
+ if (i->head) {
+ ret = i->head;
+#ifdef __KERNEL__
+ i->head = ret->next;
+#else
+ i->head = (void *) ret->next.next;
+#endif
+ if (!i->head)
+ i->tail = NULL;
+ goto out;
+ }
+out:
+ spin_unlock_irq(&p->lock);
+
+ return ret;
+}
+
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
+{
+ return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
+}
+
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
+{
+ struct rcu_head *ret = rcu_pending_dequeue(pending);
+
+ if (ret)
+ return ret;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
+{
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ spin_lock_irq(&p->lock);
+ if (__rcu_pending_has_pending(p) || p->cb_armed) {
+ spin_unlock_irq(&p->lock);
+ return true;
+ }
+ spin_unlock_irq(&p->lock);
+ }
+
+ return false;
+}
+
+void rcu_pending_exit(struct rcu_pending *pending)
+{
+ int cpu;
+
+ if (!pending->p)
+ return;
+
+ while (rcu_pending_has_pending_or_armed(pending)) {
+ __rcu_barrier(pending->srcu);
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+
+ static_array_for_each(p->lists, i)
+ WARN_ON(i->head);
+ WARN_ON(p->objs.nr);
+ darray_exit(&p->objs);
+ }
+ free_percpu(pending->p);
+}
+
+/**
+ * rcu_pending_init: - initialize a rcu_pending
+ *
+ * @pending: Object to init
+ * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
+ * RCU flavor
+ * @process:	Callback function invoked on objects once their RCU grace
+ *		periods have completed; if NULL, kvfree() is used.
+ */
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process)
+{
+ pending->p = alloc_percpu(struct rcu_pending_pcpu);
+ if (!pending->p)
+ return -ENOMEM;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ p->parent = pending;
+ p->cpu = cpu;
+ spin_lock_init(&p->lock);
+ darray_init(&p->objs);
+ INIT_WORK(&p->work, rcu_pending_work);
+ }
+
+ pending->srcu = srcu;
+ pending->process = process;
+
+ return 0;
+}
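
One detail worth calling out in rcu_pending.c above: for kvfree_rcu_mightsleep() the caller supplies no rcu_head, so one is kmalloc'd on the fly and the low bit of the pointer stored in head->func records that the head itself must be freed after the grace period; slab alignment keeps that bit otherwise clear, which is what the BUILD_BUG_ON on ARCH_SLAB_MINALIGN guards. A userspace sketch of just the tagging scheme, not the kernel data structures:

#include <stdlib.h>
#include <stdint.h>

/* tag: remember "head was allocated too" in the low bit of the payload ptr */
static void *tag_ptr(void *ptr, int head_needs_free)
{
	return (void *)((uintptr_t)ptr | (head_needs_free ? 1UL : 0));
}

static void untag_and_free(void *tagged, void *head)
{
	void *ptr     = (void *)((uintptr_t)tagged & ~1UL);
	int free_head = (uintptr_t)tagged & 1UL;

	free(ptr);		/* kvfree() in the kernel version */
	if (free_head)
		free(head);	/* the separately-allocated rcu_head */
}

int main(void)
{
	/* malloc alignment leaves the low bit clear, like ARCH_SLAB_MINALIGN */
	void *payload = malloc(64);
	void *head    = malloc(16);

	untag_and_free(tag_ptr(payload, 1), head);
	return 0;
}
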
diff --git a/libbcachefs/rcu_pending.h b/libbcachefs/rcu_pending.h
new file mode 100644
index 00000000..71a2f4dd
--- /dev/null
+++ b/libbcachefs/rcu_pending.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RCU_PENDING_H
+#define _LINUX_RCU_PENDING_H
+
+#include <linux/rcupdate.h>
+
+struct rcu_pending;
+typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
+
+struct rcu_pending_pcpu;
+
+struct rcu_pending {
+ struct rcu_pending_pcpu __percpu *p;
+ struct srcu_struct *srcu;
+ rcu_pending_process_fn process;
+};
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
+
+void rcu_pending_exit(struct rcu_pending *pending);
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process);
+
+#endif /* _LINUX_RCU_PENDING_H */
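
A minimal usage sketch against the interface above: embed an rcu_head in the object, register a process callback with rcu_pending_init() (a NULL srcu_struct selects plain RCU), enqueue objects once they are unreachable to readers, and tear down with rcu_pending_exit(). The object type and call sites here are invented for illustration:

#include <linux/slab.h>
#include "rcu_pending.h"

struct cached_thing {
	struct rcu_head	rcu;
	int		payload;
};

static struct rcu_pending thing_pending;

/* invoked once the grace period for this object has elapsed */
static void thing_process(struct rcu_pending *pending, struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct cached_thing, rcu));
}

static int things_init(void)
{
	/* NULL srcu_struct: use regular RCU grace periods */
	return rcu_pending_init(&thing_pending, NULL, thing_process);
}

static void thing_retire(struct cached_thing *t)
{
	/* object must already be unreachable to readers */
	rcu_pending_enqueue(&thing_pending, &t->rcu);
}

static void things_exit(void)
{
	/* waits for and processes everything still pending */
	rcu_pending_exit(&thing_pending);
}
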
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index d89eb43c..36de1c6f 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = *((const struct journal_key **)_l);
const struct journal_key *r = *((const struct journal_key **)_r);
- return cmp_int(l->journal_seq, r->journal_seq);
+ /*
+	 * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
+ *
+ * journal_seq == 0 means that the key comes from early repair, and
+ * should be inserted last so as to avoid overflowing the journal
+ */
+ return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
}
int bch2_journal_replay(struct bch_fs *c)
@@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c)
}
}
+ bch2_trans_unlock_long(trans);
/*
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
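
The comparator above subtracts 1 in unsigned arithmetic, so journal_seq == 0 wraps to U64_MAX and those early-repair keys sort after every real sequence number. A standalone demonstration of the wrap trick:

#include <stdio.h>
#include <stdint.h>

static int cmp_u64(uint64_t l, uint64_t r)
{
	return (l > r) - (l < r);	/* cmp_int() equivalent */
}

int main(void)
{
	uint64_t seqs[] = { 0, 7, 3 };

	/* 0 - 1 wraps to UINT64_MAX, so seq 0 compares greater than any real seq */
	printf("%d\n", cmp_u64(seqs[0] - 1, seqs[1] - 1));	/*  1: 0 sorts last */
	printf("%d\n", cmp_u64(seqs[2] - 1, seqs[1] - 1));	/* -1: 3 before 7   */
	return 0;
}
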
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
index bb2214ad..62ea4782 100644
--- a/libbcachefs/sb-counters_format.h
+++ b/libbcachefs/sb-counters_format.h
@@ -81,8 +81,7 @@
x(trans_restart_write_buffer_flush, 75) \
x(trans_restart_split_race, 76) \
x(write_buffer_flush_slowpath, 77) \
- x(write_buffer_flush_sync, 78) \
- x(trans_restart_freeing_inode, 79)
+ x(write_buffer_flush_sync, 78)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index 650a1f77..c7e4cdd3 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -75,6 +75,9 @@
BCH_FSCK_ERR_accounting_key_junk_at_end) \
x(disk_accounting_inum, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(rebalance_work_acct_fix, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch)
#define DOWNGRADE_TABLE() \
@@ -108,7 +111,10 @@
BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
BCH_FSCK_ERR_accounting_replicas_not_marked, \
- BCH_FSCK_ERR_bkey_version_in_future)
+ BCH_FSCK_ERR_bkey_version_in_future) \
+ x(rebalance_work_acct_fix, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch)
struct upgrade_downgrade_entry {
u64 recovery_passes;
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index d3a49861..31760201 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -23,7 +23,7 @@ enum bch_fsck_flags {
x(jset_past_bucket_end, 9, 0) \
x(jset_seq_blacklisted, 10, 0) \
x(journal_entries_missing, 11, 0) \
- x(journal_entry_replicas_not_marked, 12, 0) \
+ x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
x(journal_entry_past_jset_end, 13, 0) \
x(journal_entry_replicas_data_mismatch, 14, 0) \
x(journal_entry_bkey_u64s_0, 15, 0) \
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 9f8ca6a5..4a373581 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -233,7 +233,7 @@ write_attribute(perf_test);
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = 0444 };
+ { .name = #_name, .mode = 0644 };
BCH_TIME_STATS()
#undef x
@@ -722,6 +722,13 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name) \
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
+ BCH_TIME_STATS()
+#undef x
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
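
The new STORE hook above walks the BCH_TIME_STATS() x-macro to find which time-stat attribute was written and resets that slot. A self-contained illustration of the x-macro dispatch pattern it relies on, using a made-up counter list rather than the bcachefs one:

#include <stdio.h>
#include <string.h>

/* x-macro list: one entry per counter, in the style of BCH_TIME_STATS() */
#define MY_STATS()	x(read) x(write) x(flush)

enum {
#define x(n)	STAT_##n,
	MY_STATS()
#undef x
	STAT_NR
};

static unsigned long counters[STAT_NR];

/* reset whichever counter matches by name, same shape as the STORE hook */
static void reset_by_name(const char *name)
{
#define x(n)	if (!strcmp(name, #n)) counters[STAT_##n] = 0;
	MY_STATS()
#undef x
}

int main(void)
{
	counters[STAT_write] = 7;
	reset_by_name("write");
	printf("%lu\n", counters[STAT_write]);	/* 0 */
	return 0;
}
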
diff --git a/libbcachefs/thread_with_file.c b/libbcachefs/thread_with_file.c
index 0807ce9b..fb3442a7 100644
--- a/libbcachefs/thread_with_file.c
+++ b/libbcachefs/thread_with_file.c
@@ -387,7 +387,7 @@ again:
seen = buf->buf.nr;
char *n = memchr(buf->buf.data, '\n', seen);
- if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
+ if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
spin_unlock(&buf->lock);
return -ETIME;
}
diff --git a/libbcachefs/time_stats.c b/libbcachefs/time_stats.c
index 4508e9dc..3fe82757 100644
--- a/libbcachefs/time_stats.c
+++ b/libbcachefs/time_stats.c
@@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
}
}
+void bch2_time_stats_reset(struct bch2_time_stats *stats)
+{
+ spin_lock_irq(&stats->lock);
+ unsigned offset = offsetof(struct bch2_time_stats, min_duration);
+ memset((void *) stats + offset, 0, sizeof(*stats) - offset);
+
+ if (stats->buffer) {
+ int cpu;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(stats->buffer, cpu)->nr = 0;
+ }
+ spin_unlock_irq(&stats->lock);
+}
+
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);
diff --git a/libbcachefs/time_stats.h b/libbcachefs/time_stats.h
index 5df61403..dc6493f7 100644
--- a/libbcachefs/time_stats.h
+++ b/libbcachefs/time_stats.h
@@ -70,6 +70,7 @@ struct time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
bool have_quantiles;
+ struct time_stat_buffer __percpu *buffer;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
@@ -87,7 +88,6 @@ struct bch2_time_stats {
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance_weighted freq_stats_weighted;
- struct time_stat_buffer __percpu *buffer;
};
struct bch2_time_stats_quantiles {
@@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
return false;
}
+void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
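
bch2_time_stats_reset() above zeroes everything from min_duration to the end of the struct in a single memset, which is why the time_stats.h hunk moves the percpu buffer pointer above min_duration: the reset clears the counters but must not wipe the allocation. A userspace sketch of the offsetof-based partial reset, with a hypothetical struct:

#include <stdio.h>
#include <string.h>
#include <stddef.h>

struct stats {
	/* fields that must survive a reset go first */
	void		*buffer;
	/* everything from here down is zeroed on reset */
	unsigned long	min_duration;
	unsigned long	max_duration;
	unsigned long	count;
};

static void stats_reset(struct stats *s)
{
	size_t off = offsetof(struct stats, min_duration);

	memset((char *)s + off, 0, sizeof(*s) - off);
}

int main(void)
{
	struct stats s = { .buffer = &s, .min_duration = 5, .count = 42 };

	stats_reset(&s);
	printf("buffer kept: %d, count: %lu\n", s.buffer == &s, s.count);
	return 0;
}
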
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index ba458db3..5597b9d6 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -1316,12 +1316,6 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
-DEFINE_EVENT(transaction_event, trans_restart_freeing_inode,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 332bc17f..2acdfa78 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
prt_printf(out, "\tsince mount\r\trecent\r\n");
- prt_printf(out, "recent");
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, out->indent + 20);
diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c
index fa692c86..79e067b5 100644
--- a/linux/generic-radix-tree.c
+++ b/linux/generic-radix-tree.c
@@ -5,99 +5,31 @@
#include <linux/gfp.h>
#include <linux/kmemleak.h>
-#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
-#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
-
-struct genradix_node {
- union {
- /* Interior node: */
- struct genradix_node *children[GENRADIX_ARY];
-
- /* Leaf: */
- u8 data[GENRADIX_NODE_SIZE];
- };
-};
-
-static inline int genradix_depth_shift(unsigned depth)
-{
- return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
-}
-
-/*
- * Returns size (of data, in bytes) that a tree of a given depth holds:
- */
-static inline size_t genradix_depth_size(unsigned depth)
-{
- return 1UL << genradix_depth_shift(depth);
-}
-
-/* depth that's needed for a genradix that can address up to ULONG_MAX: */
-#define GENRADIX_MAX_DEPTH \
- DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
-
-#define GENRADIX_DEPTH_MASK \
- ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
-
-static inline unsigned genradix_root_to_depth(struct genradix_root *r)
-{
- return (unsigned long) r & GENRADIX_DEPTH_MASK;
-}
-
-static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
-{
- return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
-}
-
/*
* Returns pointer to the specified byte @offset within @radix, or NULL if not
* allocated
*/
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
- struct genradix_root *r = READ_ONCE(radix->root);
- struct genradix_node *n = genradix_root_to_node(r);
- unsigned level = genradix_root_to_depth(r);
-
- if (ilog2(offset) >= genradix_depth_shift(level))
- return NULL;
-
- while (1) {
- if (!n)
- return NULL;
- if (!level)
- break;
-
- level--;
-
- n = n->children[offset >> genradix_depth_shift(level)];
- offset &= genradix_depth_size(level) - 1;
- }
-
- return &n->data[offset];
+ return __genradix_ptr_inlined(radix, offset);
}
EXPORT_SYMBOL(__genradix_ptr);
-static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
-{
- return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
-}
-
-static inline void genradix_free_node(struct genradix_node *node)
-{
- kfree(node);
-}
-
/*
* Returns pointer to the specified byte @offset within @radix, allocating it if
* necessary - newly allocated slots are always zeroed out:
*/
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
+ struct genradix_node **preallocated,
gfp_t gfp_mask)
{
struct genradix_root *v = READ_ONCE(radix->root);
struct genradix_node *n, *new_node = NULL;
unsigned level;
+ if (preallocated)
+ swap(new_node, *preallocated);
+
/* Increase tree depth if necessary: */
while (1) {
struct genradix_root *r = v, *new_root;
@@ -281,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
size_t offset;
for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
- if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
+ if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask))
return -ENOMEM;
return 0;
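
The new preallocated argument to __genradix_ptr_alloc() exists for callers like __rcu_pending_enqueue() above: try the insert with GFP_ATOMIC under a spinlock, and if that fails, drop the lock, allocate a node with GFP_KERNEL via genradix_alloc_node(), then retry and hand the node over through the new parameter (the callee takes ownership via the swap). A hedged kernel-style sketch of that shape; the lock, radix, and offset here are assumptions, not bcachefs code:

#include <linux/generic-radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static void *slot_alloc_locked(spinlock_t *lock, struct __genradix *radix,
			       size_t offset)
{
	struct genradix_node *new_node = NULL;
	void *slot;
retry:
	spin_lock(lock);
	/* hand over any node preallocated below; the callee takes ownership */
	slot = __genradix_ptr_alloc(radix, offset, &new_node,
				    GFP_ATOMIC|__GFP_NOWARN);
	spin_unlock(lock);

	if (!slot) {
		/* atomic allocation failed: preallocate sleepably, then retry */
		new_node = genradix_alloc_node(GFP_KERNEL);
		if (!new_node)
			return NULL;
		goto retry;
	}

	return slot;
}
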