-rw-r--r--  .bcachefs_revision                       2
-rw-r--r--  include/linux/percpu-rwsem.h            72
-rw-r--r--  include/linux/string.h                   1
-rw-r--r--  include/trace/events/bcachefs.h          5
-rw-r--r--  libbcachefs/acl.c                       59
-rw-r--r--  libbcachefs/acl.h                        6
-rw-r--r--  libbcachefs/alloc.c                     92
-rw-r--r--  libbcachefs/bcachefs.h                  15
-rw-r--r--  libbcachefs/btree_gc.c                  24
-rw-r--r--  libbcachefs/btree_iter.c                14
-rw-r--r--  libbcachefs/btree_update_interior.c     32
-rw-r--r--  libbcachefs/btree_update_interior.h      9
-rw-r--r--  libbcachefs/buckets.c                   62
-rw-r--r--  libbcachefs/buckets.h                    2
-rw-r--r--  libbcachefs/disk_groups.c                2
-rw-r--r--  libbcachefs/fs-io.c                     37
-rw-r--r--  libbcachefs/fs.c                         4
-rw-r--r--  libbcachefs/fsck.c                       9
-rw-r--r--  libbcachefs/io.c                         4
-rw-r--r--  libbcachefs/journal.c                   15
-rw-r--r--  libbcachefs/opts.c                       2
-rw-r--r--  libbcachefs/six.h                       70
-rw-r--r--  libbcachefs/super.c                      6
-rw-r--r--  libbcachefs/sysfs.c                      2
-rw-r--r--  libbcachefs/util.c                      20
-rw-r--r--  libbcachefs/util.h                      53
-rw-r--r--  linux/string.c                          16
-rw-r--r--  tools-util.c                             2
28 files changed, 383 insertions, 254 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index e40e21f8..51df9f0e 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-f65603966f7474213e6bf22b046e374d01fd6639
+9abf628c701ad92670d697624f674cc01d42705e
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
new file mode 100644
index 00000000..c233e3ce
--- /dev/null
+++ b/include/linux/percpu-rwsem.h
@@ -0,0 +1,72 @@
+
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <pthread.h>
+#include <linux/preempt.h>
+
+struct percpu_rw_semaphore {
+ pthread_rwlock_t lock;
+};
+
+#define DEFINE_STATIC_PERCPU_RWSEM(name) \
+static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \
+static struct percpu_rw_semaphore name = { \
+ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
+ .read_count = &__percpu_rwsem_rc_##name, \
+ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
+ .writer = __RCUWAIT_INITIALIZER(name.writer), \
+}
+
+extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
+{
+ pthread_rwlock_rdlock(&sem->lock);
+ preempt_disable();
+}
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+ pthread_rwlock_rdlock(&sem->lock);
+}
+
+static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+ return !pthread_rwlock_tryrdlock(&sem->lock);
+}
+
+static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
+{
+ preempt_enable();
+ pthread_rwlock_unlock(&sem->lock);
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+ pthread_rwlock_unlock(&sem->lock);
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *sem)
+{
+ pthread_rwlock_wrlock(&sem->lock);
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *sem)
+{
+ pthread_rwlock_unlock(&sem->lock);
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *sem) {}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *sem)
+{
+ pthread_rwlock_init(&sem->lock, NULL);
+ return 0;
+}
+
+#define percpu_rwsem_assert_held(sem) do {} while (0)
+
+#endif
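
A minimal sketch of how the read and write sides of this shim pair up. The struct and functions below are hypothetical, for illustration only; in the rest of this patch the semaphore in question is c->usage_lock, and the read side wraps per-cpu usage updates while the write side takes a consistent view across CPUs.

#include <linux/percpu.h>
#include <linux/percpu-rwsem.h>

struct example_stats {
	struct percpu_rw_semaphore	lock;
	u64 __percpu			*sectors;	/* per-cpu counter, cf. c->usage_percpu */
};

static void example_account(struct example_stats *s, u64 v)
{
	/* read side: many updaters may run concurrently, each on its own cpu's counter */
	percpu_down_read_preempt_disable(&s->lock);
	this_cpu_add(*s->sectors, v);
	percpu_up_read_preempt_enable(&s->lock);
}

static u64 example_total(struct example_stats *s)
{
	u64 v = 0;
	int cpu;

	/* write side: excludes all updaters while the per-cpu counters are summed */
	percpu_down_write(&s->lock);
	for_each_possible_cpu(cpu)
		v += *per_cpu_ptr(s->sectors, cpu);
	percpu_up_write(&s->lock);

	return v;
}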
diff --git a/include/linux/string.h b/include/linux/string.h
index abc191e7..ec35b8df 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -9,6 +9,7 @@ extern size_t strlcpy(char *dest, const char *src, size_t size);
extern char *skip_spaces(const char *);
extern char *strim(char *);
extern void memzero_explicit(void *, size_t);
+int match_string(const char * const *, size_t, const char *);
#define kstrndup(s, n, gfp) strndup(s, n)
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index a34574ca..13264b82 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -296,6 +296,11 @@ DEFINE_EVENT(btree_node, btree_compact,
TP_ARGS(c, b)
);
+DEFINE_EVENT(btree_node, btree_merge,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
DEFINE_EVENT(btree_node, btree_set_root,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index d29bdafa..29774e5d 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -176,34 +176,19 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
return acl;
}
-int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+int __bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- umode_t mode = inode->v.i_mode;
int name_index;
void *value = NULL;
size_t size = 0;
int ret;
- if (type == ACL_TYPE_ACCESS && acl) {
- ret = posix_acl_update_mode(&inode->v, &mode, &acl);
- if (ret)
- return ret;
- }
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl) {
- ret = posix_acl_equiv_mode(acl, &inode->v.i_mode);
- if (ret < 0)
- return ret;
- if (ret == 0)
- acl = NULL;
- }
break;
-
case ACL_TYPE_DEFAULT:
name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->v.i_mode))
@@ -220,20 +205,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
return (int)PTR_ERR(value);
}
- if (mode != inode->v.i_mode) {
- mutex_lock(&inode->ei_update_lock);
- inode->v.i_mode = mode;
- inode->v.i_ctime = current_time(&inode->v);
-
- ret = bch2_write_inode(c, inode);
- mutex_unlock(&inode->ei_update_lock);
-
- if (ret)
- goto err;
- }
-
ret = bch2_xattr_set(c, inode, "", value, size, 0, name_index);
-err:
kfree(value);
if (ret == -ERANGE)
@@ -245,4 +217,33 @@ err:
return ret;
}
+int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ umode_t mode = inode->v.i_mode;
+ int ret;
+
+ if (type == ACL_TYPE_ACCESS && acl) {
+ ret = posix_acl_update_mode(&inode->v, &mode, &acl);
+ if (ret)
+ return ret;
+ }
+
+ ret = __bch2_set_acl(vinode, acl, type);
+ if (ret)
+ return ret;
+
+ if (mode != inode->v.i_mode) {
+ mutex_lock(&inode->ei_update_lock);
+ inode->v.i_mode = mode;
+ inode->v.i_ctime = current_time(&inode->v);
+
+ ret = bch2_write_inode(c, inode);
+ mutex_unlock(&inode->ei_update_lock);
+ }
+
+ return ret;
+}
+
#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index b721330e..a66338d4 100644
--- a/libbcachefs/acl.h
+++ b/libbcachefs/acl.h
@@ -52,10 +52,16 @@ static inline int bch2_acl_count(size_t size)
struct posix_acl;
extern struct posix_acl *bch2_get_acl(struct inode *, int);
+extern int __bch2_set_acl(struct inode *, struct posix_acl *, int);
extern int bch2_set_acl(struct inode *, struct posix_acl *, int);
#else
+static inline int __bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ return 0;
+}
+
static inline int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
return 0;
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 44f9479e..ac2c7d1f 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -223,7 +223,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
if (a.k->p.offset >= ca->mi.nbuckets)
return;
- lg_local_lock(&c->usage_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
g = bucket(ca, a.k->p.offset);
bucket_cmpxchg(g, new, ({
@@ -237,7 +237,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
g->io_time[WRITE] = get_alloc_field(&d, 2);
- lg_local_unlock(&c->usage_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
}
int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
@@ -288,7 +288,7 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct btree_iter *iter,
- u64 *journal_seq)
+ u64 *journal_seq, bool nowait)
{
struct bucket_mark m;
__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
@@ -296,6 +296,13 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
struct bkey_i_alloc *a;
u8 *d;
int ret;
+ unsigned flags = BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE;
+
+ if (nowait)
+ flags |= BTREE_INSERT_NOWAIT;
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
@@ -304,7 +311,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
if (ret)
break;
- lg_local_lock(&c->usage_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
g = bucket(ca, b);
/* read mark under btree node lock: */
@@ -320,14 +327,9 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
put_alloc_field(&d, 2, g->io_time[WRITE]);
- lg_local_unlock(&c->usage_lock);
-
- ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
- BTREE_INSERT_NOWAIT,
+ percpu_up_read_preempt_enable(&c->usage_lock);
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
bch2_btree_iter_cond_resched(iter);
} while (ret == -EINTR);
@@ -352,7 +354,8 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL);
+ ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter,
+ NULL, false);
bch2_btree_iter_unlock(&iter);
return ret;
}
@@ -372,7 +375,8 @@ int bch2_alloc_write(struct bch_fs *c)
down_read(&ca->bucket_lock);
for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
- ret = __bch2_alloc_write_key(c, ca, bucket, &iter, NULL);
+ ret = __bch2_alloc_write_key(c, ca, bucket, &iter,
+ NULL, false);
if (ret)
break;
@@ -583,15 +587,20 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
{
struct bucket_mark m;
+ percpu_down_read_preempt_disable(&c->usage_lock);
spin_lock(&c->freelist_lock);
+
if (!bch2_invalidate_bucket(c, ca, bucket, &m)) {
spin_unlock(&c->freelist_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
return;
}
verify_not_on_freelist(c, ca, bucket);
BUG_ON(!fifo_push(&ca->free_inc, bucket));
+
spin_unlock(&c->freelist_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
/* gc lock held: */
bucket_io_clock_reset(c, ca, bucket, READ);
@@ -812,7 +821,8 @@ static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca)
}
static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, size_t nr)
+ u64 *journal_seq, size_t nr,
+ bool nowait)
{
struct btree_iter iter;
int ret = 0;
@@ -820,14 +830,12 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- /*
- * XXX: if ca->nr_invalidated != 0, just return if we'd block doing the
- * btree update or journal_res_get
- */
+ /* Only use nowait if we've already invalidated at least one bucket: */
while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) {
size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated);
- ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq);
+ ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq,
+ nowait && ca->nr_invalidated);
if (ret)
break;
@@ -835,7 +843,9 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
}
bch2_btree_iter_unlock(&iter);
- return ret;
+
+ /* If we used NOWAIT, don't return the error: */
+ return ca->nr_invalidated ? 0 : ret;
}
static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
@@ -943,7 +953,8 @@ static int bch2_allocator_thread(void *arg)
fifo_used(&ca->free_inc));
journal_seq = 0;
- ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX);
+ ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
+ SIZE_MAX, true);
if (ret) {
bch_err(ca, "error invalidating buckets: %i", ret);
goto stop;
@@ -1077,11 +1088,15 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ percpu_down_read_preempt_disable(&c->usage_lock);
spin_lock(&ob->lock);
+
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
false, gc_pos_alloc(c, ob), 0);
ob->valid = false;
+
spin_unlock(&ob->lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
spin_lock(&c->freelist_lock);
ob->freelist = c->open_buckets_freelist;
@@ -1151,6 +1166,7 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
long bucket;
spin_lock(&c->freelist_lock);
+
if (may_alloc_partial &&
ca->open_buckets_partial_nr) {
int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr];
@@ -1202,7 +1218,6 @@ out:
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
- lg_local_lock(&c->usage_lock);
buckets = bucket_array(ca);
ob->valid = true;
@@ -1215,8 +1230,6 @@ out:
bucket_io_clock_reset(c, ca, bucket, READ);
bucket_io_clock_reset(c, ca, bucket, WRITE);
-
- lg_local_unlock(&c->usage_lock);
spin_unlock(&ob->lock);
spin_unlock(&c->freelist_lock);
@@ -1296,7 +1309,6 @@ static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
if (nr_ptrs_effective >= nr_replicas)
return ALLOC_SUCCESS;
- rcu_read_lock();
devs_sorted = bch2_wp_alloc_list(c, wp, devs);
for (i = 0; i < devs_sorted.nr; i++) {
@@ -1337,7 +1349,6 @@ static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
break;
}
}
- rcu_read_unlock();
EBUG_ON(reserve == RESERVE_MOVINGGC &&
ret != ALLOC_SUCCESS &&
@@ -1422,8 +1433,13 @@ static int open_bucket_add_buckets(struct bch_fs *c,
struct closure *cl)
{
struct bch_devs_mask devs = c->rw_devs[wp->type];
+ const struct bch_devs_mask *t;
struct open_bucket *ob;
unsigned i;
+ int ret;
+
+ percpu_down_read_preempt_disable(&c->usage_lock);
+ rcu_read_lock();
/* Don't allocate from devices we already have pointers to: */
for (i = 0; i < devs_have->nr; i++)
@@ -1432,17 +1448,16 @@ static int open_bucket_add_buckets(struct bch_fs *c,
writepoint_for_each_ptr_all(wp, ob, i)
__clear_bit(ob->ptr.dev, devs.d);
- if (target) {
- const struct bch_devs_mask *t;
+ t = bch2_target_to_mask(c, target);
+ if (t)
+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
- rcu_read_lock();
- t = bch2_target_to_mask(c, target);
- if (t)
- bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
- rcu_read_unlock();
- }
+ ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
- return bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
+ rcu_read_unlock();
+ percpu_up_read_preempt_enable(&c->usage_lock);
+
+ return ret;
}
static struct write_point *__writepoint_find(struct hlist_head *head,
@@ -1980,10 +1995,12 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
if (!is_available_bucket(m) || m.cached_sectors)
continue;
+ percpu_down_read_preempt_disable(&c->usage_lock);
bch2_mark_alloc_bucket(c, ca, bu, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
+ percpu_up_read_preempt_enable(&c->usage_lock);
fifo_push(&ca->free_inc, bu);
ca->nr_invalidated++;
@@ -2051,7 +2068,8 @@ not_enough:
for_each_rw_member(ca, c, dev_iter) {
ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
- ca->free[RESERVE_BTREE].size);
+ ca->free[RESERVE_BTREE].size,
+ false);
if (ret) {
percpu_ref_put(&ca->io_ref);
return ret;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 4219c46c..4702b016 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -182,10 +182,10 @@
#include <linux/bio.h>
#include <linux/closure.h>
#include <linux/kobject.h>
-#include <linux/lglock.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
#include <linux/rhashtable.h>
#include <linux/rwsem.h>
#include <linux/seqlock.h>
@@ -302,21 +302,14 @@ enum bch_time_stats {
#include "rebalance_types.h"
#include "super_types.h"
-/*
- * Number of nodes we might have to allocate in a worst case btree split
- * operation - we split all the way up to the root, then allocate a new root.
- */
-#define btree_reserve_required_nodes(depth) (((depth) + 1) * 2 + 1)
-
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
/* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX \
- (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
+#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
struct btree;
@@ -591,7 +584,7 @@ struct bch_fs {
struct bch_fs_usage __percpu *usage_percpu;
struct bch_fs_usage usage_cached;
- struct lglock usage_lock;
+ struct percpu_rw_semaphore usage_lock;
struct closure_waitlist freelist_wait;
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index cd5ebfbe..02b14e38 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -118,20 +118,17 @@ static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k, unsigned flags)
{
struct gc_pos pos = { 0 };
- struct bch_fs_usage *stats;
u8 ret = 0;
- preempt_disable();
- stats = this_cpu_ptr(c->usage_percpu);
switch (type) {
case BKEY_TYPE_BTREE:
- bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
+ bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL,
0, flags|
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
break;
case BKEY_TYPE_EXTENTS:
- bch2_mark_key(c, k, k.k->size, false, pos, stats,
+ bch2_mark_key(c, k, k.k->size, false, pos, NULL,
0, flags|
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
@@ -140,7 +137,6 @@ static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
default:
BUG();
}
- preempt_enable();
return ret;
}
@@ -320,8 +316,10 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
unsigned i;
u64 b;
- if (c)
+ if (c) {
lockdep_assert_held(&c->sb_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
+ }
for (i = 0; i < layout->nr_superblocks; i++) {
u64 offset = le64_to_cpu(layout->sb_offset[i]);
@@ -345,8 +343,10 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
gc_phase(GC_PHASE_SB), flags);
}
- if (c)
+ if (c) {
+ percpu_up_read_preempt_enable(&c->usage_lock);
spin_unlock(&c->journal.lock);
+ }
}
static void bch2_mark_superblocks(struct bch_fs *c)
@@ -397,6 +397,8 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
size_t i, j, iter;
unsigned ci;
+ percpu_down_read_preempt_disable(&c->usage_lock);
+
spin_lock(&c->freelist_lock);
gc_pos_set(c, gc_pos_alloc(c, NULL));
@@ -433,6 +435,8 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
}
spin_unlock(&ob->lock);
}
+
+ percpu_up_read_preempt_enable(&c->usage_lock);
}
static void bch2_gc_start(struct bch_fs *c)
@@ -444,7 +448,7 @@ static void bch2_gc_start(struct bch_fs *c)
size_t b;
int cpu;
- lg_global_lock(&c->usage_lock);
+ percpu_down_write(&c->usage_lock);
/*
* Indicates to buckets code that gc is now in progress - done under
@@ -470,7 +474,7 @@ static void bch2_gc_start(struct bch_fs *c)
memset(p->s, 0, sizeof(p->s));
}
- lg_global_unlock(&c->usage_lock);
+ percpu_up_write(&c->usage_lock);
/* Clear bucket marks: */
for_each_member_device(ca, c, i) {
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 70c3132e..95ee9f61 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -152,7 +152,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
* the prev sibling in btree node merging:
*/
if (iter->nodes_locked &&
- __ffs(iter->nodes_locked) == level &&
+ __ffs(iter->nodes_locked) <= level &&
__btree_iter_cmp(iter->btree_id, pos, iter))
return false;
@@ -592,6 +592,8 @@ static inline void __btree_iter_init(struct btree_iter *iter,
/* Skip to first non whiteout: */
if (b->level)
bch2_btree_node_iter_peek(&l->iter, b);
+
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
static inline void btree_iter_node_set(struct btree_iter *iter,
@@ -1084,6 +1086,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
+ EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
+ !btree_node_locked(iter, 0));
if (iter->uptodate == BTREE_ITER_UPTODATE) {
struct bkey_packed *k =
@@ -1093,8 +1097,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
.v = bkeyp_val(&l->b->format, k)
};
- EBUG_ON(!btree_node_locked(iter, 0));
-
if (debug_check_bkeys(iter->c))
bch2_bkey_debugcheck(iter->c, l->b, ret);
return ret;
@@ -1257,16 +1259,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(iter->btree_id == BTREE_ID_EXTENTS));
EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
+ EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
+ !btree_node_locked(iter, 0));
if (iter->uptodate == BTREE_ITER_UPTODATE) {
- struct bkey_s_c ret = { .k = &iter->k };;
+ struct bkey_s_c ret = { .k = &iter->k };
if (!bkey_deleted(&iter->k))
ret.v = bkeyp_val(&l->b->format,
__bch2_btree_node_iter_peek_all(&l->iter, l->b));
- EBUG_ON(!btree_node_locked(iter, 0));
-
if (debug_check_bkeys(iter->c))
bch2_bkey_debugcheck(iter->c, l->b, ret);
return ret;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index c3ecc1e9..92e19c4e 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -1564,11 +1564,15 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
struct btree_update *as;
struct closure cl;
int ret = 0;
+ struct btree_iter *linked;
/*
* We already have a disk reservation and open buckets pinned; this
* allocation must not block:
*/
+ for_each_linked_btree_iter(iter, linked)
+ if (linked->btree_id == BTREE_ID_EXTENTS)
+ btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
if (iter->btree_id == BTREE_ID_EXTENTS)
btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
@@ -1704,15 +1708,17 @@ retry:
}
as = bch2_btree_update_start(c, iter->btree_id,
- btree_update_reserve_required(c, b),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- &cl);
+ btree_update_reserve_required(c, parent) + 1,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE,
+ &cl);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
goto out_unlock;
}
+ trace_btree_merge(c, b);
+
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
@@ -1778,8 +1784,10 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
struct btree_update *as;
as = bch2_btree_update_start(c, iter->btree_id,
- btree_update_reserve_required(c, b),
- flags, cl);
+ (parent
+ ? btree_update_reserve_required(c, parent)
+ : 0) + 1,
+ flags, cl);
if (IS_ERR(as)) {
trace_btree_gc_rewrite_node_fail(c, b);
return PTR_ERR(as);
@@ -1966,6 +1974,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
struct btree *b, struct bkey_i_extent *new_key)
{
+ struct btree *parent = btree_node_parent(iter, b);
struct btree_update *as = NULL;
struct btree *new_hash = NULL;
struct closure cl;
@@ -2003,11 +2012,12 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
}
as = bch2_btree_update_start(c, iter->btree_id,
- btree_update_reserve_required(c, b),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
+ parent ? btree_update_reserve_required(c, parent) : 0,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE,
+ &cl);
+
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN)
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 25bfc7ab..abf14e4c 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -183,9 +183,14 @@ void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
{
- unsigned depth = btree_node_root(c, b)->level - b->level;
+ unsigned depth = btree_node_root(c, b)->level - b->level + 1;
- return btree_reserve_required_nodes(depth);
+ /*
+ * Number of nodes we might have to allocate in a worst case btree
+ * split operation - we split all the way up to the root, then allocate
+ * a new root.
+ */
+ return depth * 2 + 1;
}
static inline void btree_node_reset_sib_u64s(struct btree *b)
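
As a worked example of the reserve sizing above: splitting a leaf (level 0) under a root at level 2 gives depth = 2 - 0 + 1 = 3, so the reserve is 3 * 2 + 1 = 7 nodes - two for each level that may split, plus one for a possible new root.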
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 5dda22c7..b17189ee 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -331,7 +331,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
stats->online_reserved -= added;
}
- lg_local_lock(&c->usage_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
/* online_reserved not subject to gc: */
this_cpu_ptr(c->usage_percpu)->online_reserved +=
stats->online_reserved;
@@ -341,7 +341,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
bch2_fs_stats_verify(c);
- lg_local_unlock(&c->usage_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
memset(stats, 0, sizeof(*stats));
}
@@ -352,7 +352,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_dev_usage *dev_usage;
if (c)
- lockdep_assert_held(&c->usage_lock);
+ percpu_rwsem_assert_held(&c->usage_lock);
if (old.data_type && new.data_type &&
old.data_type != new.data_type) {
@@ -399,12 +399,13 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g;
struct bucket_mark new;
- lg_local_lock(&c->usage_lock);
+ percpu_rwsem_assert_held(&c->usage_lock);
+
g = bucket(ca, b);
*old = bucket_data_cmpxchg(c, ca, g, new, ({
if (!is_available_bucket(new)) {
- lg_local_unlock(&c->usage_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
return false;
}
@@ -414,7 +415,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
new.dirty_sectors = 0;
new.gen++;
}));
- lg_local_unlock(&c->usage_lock);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
@@ -429,19 +429,16 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket *g;
struct bucket_mark old, new;
- lg_local_lock(&c->usage_lock);
+ percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos)) {
- lg_local_unlock(&c->usage_lock);
+ gc_will_visit(c, pos))
return;
- }
old = bucket_data_cmpxchg(c, ca, g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
- lg_local_unlock(&c->usage_lock);
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
c->gc_pos.phase == GC_PHASE_DONE);
@@ -471,16 +468,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(!type);
if (likely(c)) {
- lg_local_lock(&c->usage_lock);
+ percpu_rwsem_assert_held(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos)) {
- lg_local_unlock(&c->usage_lock);
+ gc_will_visit(c, pos))
return;
- }
}
- preempt_disable();
+ rcu_read_lock();
g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, g, new, ({
@@ -489,10 +484,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
new.data_type = type;
}));
- preempt_enable();
-
- if (likely(c))
- lg_local_unlock(&c->usage_lock);
+ rcu_read_unlock();
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
@@ -654,11 +646,14 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
* (e.g. the btree node lock, or the relevant allocator lock).
*/
- lg_local_lock(&c->usage_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
+ if (!stats)
+ stats = this_cpu_ptr(c->usage_percpu);
+
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED: {
@@ -693,7 +688,7 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
break;
}
}
- lg_local_unlock(&c->usage_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
}
/* Disk reservations: */
@@ -711,19 +706,19 @@ static u64 __recalc_sectors_available(struct bch_fs *c)
/* Used by gc when it's starting: */
void bch2_recalc_sectors_available(struct bch_fs *c)
{
- lg_global_lock(&c->usage_lock);
+ percpu_down_write(&c->usage_lock);
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
- lg_global_unlock(&c->usage_lock);
+ percpu_up_write(&c->usage_lock);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
- lg_local_lock(&c->usage_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
this_cpu_sub(c->usage_percpu->online_reserved,
res->sectors);
bch2_fs_stats_verify(c);
- lg_local_unlock(&c->usage_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
res->sectors = 0;
}
@@ -738,7 +733,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
s64 sectors_available;
int ret;
- lg_local_lock(&c->usage_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
stats = this_cpu_ptr(c->usage_percpu);
if (sectors <= stats->available_cache)
@@ -750,7 +745,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
get = min((u64) sectors + SECTORS_CACHE, old);
if (get < sectors) {
- lg_local_unlock(&c->usage_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
goto recalculate;
}
} while ((v = atomic64_cmpxchg(&c->sectors_available,
@@ -765,7 +760,7 @@ out:
bch2_disk_reservations_verify(c, flags);
bch2_fs_stats_verify(c);
- lg_local_unlock(&c->usage_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
return 0;
recalculate:
@@ -785,8 +780,8 @@ recalculate:
else if (!down_read_trylock(&c->gc_lock))
return -EINTR;
}
- lg_global_lock(&c->usage_lock);
+ percpu_down_write(&c->usage_lock);
sectors_available = __recalc_sectors_available(c);
if (sectors <= sectors_available ||
@@ -804,7 +799,8 @@ recalculate:
}
bch2_fs_stats_verify(c);
- lg_global_unlock(&c->usage_lock);
+ percpu_up_write(&c->usage_lock);
+
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
up_read(&c->gc_lock);
@@ -874,7 +870,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (resize) {
down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
- lg_global_lock(&c->usage_lock);
+ percpu_down_write(&c->usage_lock);
}
old_buckets = bucket_array(ca);
@@ -900,7 +896,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
swap(ca->buckets_dirty, buckets_dirty);
if (resize)
- lg_global_unlock(&c->usage_lock);
+ percpu_up_write(&c->usage_lock);
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++) {
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index aefe6027..4deb6c37 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -32,7 +32,7 @@ static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
return rcu_dereference_check(ca->buckets,
!ca->fs ||
- lockdep_is_held(&ca->fs->usage_lock) ||
+ percpu_rwsem_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index cd200cbe..87f3940e 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -176,6 +176,8 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
struct target t = target_decode(target);
switch (t.type) {
+ case TARGET_NULL:
+ return NULL;
case TARGET_DEV: {
struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1d9464af..d7b17195 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -1702,6 +1702,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct bio *bio;
loff_t offset = req->ki_pos;
bool sync = is_sync_kiocb(req);
+ size_t shorten;
ssize_t ret;
if ((offset|iter->count) & (block_bytes(c) - 1))
@@ -1709,11 +1710,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
ret = min_t(loff_t, iter->count,
max_t(loff_t, 0, i_size_read(&inode->v) - offset));
- iov_iter_truncate(iter, round_up(ret, block_bytes(c)));
if (!ret)
return ret;
+ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+ iter->count -= shorten;
+
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
&c->dio_read_bioset);
@@ -1769,6 +1772,8 @@ start:
bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
}
+ iter->count += shorten;
+
if (sync) {
closure_sync(&dio->cl);
closure_debug_destroy(&dio->cl);
@@ -1822,6 +1827,13 @@ static long bch2_dio_write_loop(struct dio_write *dio)
if (unlikely(ret < 0))
goto err;
+ /* gup might have faulted pages back in: */
+ ret = write_invalidate_inode_pages_range(mapping,
+ req->ki_pos + (dio->iop.op.written << 9),
+ req->ki_pos + iov_iter_count(&dio->iter) - 1);
+ if (unlikely(ret))
+ goto err;
+
dio->iop.op.pos = POS(inode->v.i_ino,
(req->ki_pos >> 9) + dio->iop.op.written);
@@ -2280,7 +2292,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
loff_t new_size;
int ret;
- if ((offset | len) & (PAGE_SIZE - 1))
+ if ((offset | len) & (block_bytes(c) - 1))
return -EINVAL;
bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS,
@@ -2354,8 +2366,11 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
btree_iter_err:
if (ret == -EINTR)
ret = 0;
- if (ret)
+ if (ret) {
+ bch2_btree_iter_unlock(&src);
+ bch2_btree_iter_unlock(&dst);
goto err_put_sectors_dirty;
+ }
/*
* XXX: if we error here we've left data with multiple
* pointers... which isn't a _super_ serious problem...
@@ -2368,7 +2383,7 @@ btree_iter_err:
bch2_btree_iter_unlock(&dst);
ret = bch2_inode_truncate(c, inode->v.i_ino,
- round_up(new_size, PAGE_SIZE) >> 9,
+ round_up(new_size, block_bytes(c)) >> 9,
&i_sectors_hook.hook,
&inode->ei_journal_seq);
if (ret)
@@ -2381,9 +2396,6 @@ err_put_sectors_dirty:
err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
-
- bch2_btree_iter_unlock(&src);
- bch2_btree_iter_unlock(&dst);
return ret;
}
@@ -2483,7 +2495,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
&i_sectors_hook.quota_res,
sectors, true);
if (unlikely(ret))
- goto err_put_sectors_dirty;
+ goto btree_iter_err;
}
if (reservation.v.nr_replicas < replicas ||
@@ -2491,7 +2503,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
ret = bch2_disk_reservation_get(c, &disk_res, sectors,
replicas, 0);
if (unlikely(ret))
- goto err_put_sectors_dirty;
+ goto btree_iter_err;
reservation.v.nr_replicas = disk_res.nr_replicas;
}
@@ -2503,8 +2515,12 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
BTREE_INSERT_ENTRY(&iter, &reservation.k_i));
bch2_disk_reservation_put(c, &disk_res);
btree_iter_err:
- if (ret < 0 && ret != -EINTR)
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret) {
+ bch2_btree_iter_unlock(&iter);
goto err_put_sectors_dirty;
+ }
}
bch2_btree_iter_unlock(&iter);
@@ -2544,7 +2560,6 @@ btree_iter_err:
err_put_sectors_dirty:
ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err:
- bch2_btree_iter_unlock(&iter);
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
return ret;
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index fb30f0d9..dc6c651d 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -243,13 +243,13 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
atomic_long_inc(&c->nr_inodes);
if (default_acl) {
- ret = bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
+ ret = __bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
if (unlikely(ret))
goto err;
}
if (acl) {
- ret = bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
+ ret = __bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
if (unlikely(ret))
goto err;
}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index c554a987..048b5c10 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -747,8 +747,13 @@ up:
}
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
- if (k.k->type != BCH_INODE_FS ||
- !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
+ if (k.k->type != BCH_INODE_FS)
+ continue;
+
+ if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
+ continue;
+
+ if (!bch2_empty_dir(c, k.k->p.inode))
continue;
if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 3762fb92..f26d4041 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1698,9 +1698,9 @@ noclone:
if (!rbio->have_ioref)
goto no_device_postclone;
- lg_local_lock(&c->usage_lock);
+ percpu_down_read_preempt_disable(&c->usage_lock);
bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
- lg_local_unlock(&c->usage_lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index ea67af3d..addd51f0 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -725,7 +725,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
long bucket;
if (new_fs) {
+ percpu_down_read_preempt_disable(&c->usage_lock);
bucket = bch2_bucket_alloc_new_fs(ca);
+ percpu_up_read_preempt_enable(&c->usage_lock);
+
if (bucket < 0) {
ret = -ENOSPC;
goto err;
@@ -741,8 +744,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bucket = sector_to_bucket(ca, ob->ptr.offset);
}
- if (c)
+ if (c) {
+ percpu_down_read_preempt_disable(&c->usage_lock);
spin_lock(&c->journal.lock);
+ }
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
@@ -759,9 +764,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
}
ja->nr++;
- if (c)
- spin_unlock(&c->journal.lock);
-
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
@@ -769,6 +771,11 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
: 0);
+ if (c) {
+ spin_unlock(&c->journal.lock);
+ percpu_up_read_preempt_enable(&c->usage_lock);
+ }
+
if (!new_fs)
bch2_open_bucket_put(c, ob);
}
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 8db8096e..8e655bc1 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -218,7 +218,7 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
return -ERANGE;
break;
case BCH_OPT_STR:
- ret = bch2_read_string_list(val, opt->choices);
+ ret = match_string(opt->choices, -1, val);
if (ret < 0)
return ret;
diff --git a/libbcachefs/six.h b/libbcachefs/six.h
index f518c64c..999c49db 100644
--- a/libbcachefs/six.h
+++ b/libbcachefs/six.h
@@ -1,6 +1,61 @@
#ifndef _BCACHEFS_SIX_H
#define _BCACHEFS_SIX_H
+/*
+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
+ * semaphores, except with a third intermediate state, intent. Basic operations
+ * are:
+ *
+ * six_lock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * six_lock_intent(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ *
+ * Intent locks block other intent locks, but do not block read locks, and you
+ * must have an intent lock held before taking a write lock, like so:
+ *
+ * six_lock_intent(&foo->lock);
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ *
+ * six_trylock_read()
+ * six_trylock_intent()
+ * six_trylock_write()
+ *
+ * six_lock_downgrade(): convert from intent to read
+ * six_lock_tryupgrade(): attempt to convert from read to intent
+ *
+ * Locks also embed a sequence number, which is incremented when the lock is
+ * locked or unlocked for write. The current sequence number can be grabbed
+ * while a lock is held from lock->state.seq; then, if you drop the lock you can
+ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock
+ * iff it hasn't been locked for write in the meantime.
+ *
+ * There are also operations that take the lock type as a parameter, where the
+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
+ *
+ * six_lock_type(lock, type)
+ * six_unlock_type(lock, type)
+ * six_relock(lock, type, seq)
+ * six_trylock_type(lock, type)
+ * six_trylock_convert(lock, from, to)
+ *
+ * A lock may be held multiple types by the same thread (for read or intent,
+ * not write) - up to SIX_LOCK_MAX_RECURSE. However, the six locks code does
+ * _not_ implement the actual recursive checks itself though - rather, if your
+ * code (e.g. btree iterator code) knows that the current thread already has a
+ * lock held, and for the correct type, six_lock_increment() may be used to
+ * bump up the counter for that type - the only effect is that one more call to
+ * unlock will be required before the lock is unlocked.
+ */
+
#include <linux/lockdep.h>
#include <linux/osq_lock.h>
#include <linux/sched.h>
@@ -10,21 +65,6 @@
#define SIX_LOCK_SEPARATE_LOCKFNS
-/*
- * LOCK STATES:
- *
- * read, intent, write (i.e. shared/intent/exclusive, hence the name)
- *
- * read and write work as with normal read/write locks - a lock can have
- * multiple readers, but write excludes reads and other write locks.
- *
- * Intent does not block read, but it does block other intent locks. The idea is
- * by taking an intent lock, you can then later upgrade to a write lock without
- * dropping your read lock and without deadlocking - because no other thread has
- * the intent lock and thus no other thread could be trying to take the write
- * lock.
- */
-
union six_lock_state {
struct {
atomic64_t counter;
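
A minimal usage sketch of the locking pattern the new comment documents. The struct here is hypothetical, and six_relock() is assumed to return true when the lock is successfully retaken; the six_* calls themselves are the ones listed in the comment above.

struct foo {
	struct six_lock	lock;
	/* fields protected by the lock */
};

static void foo_modify(struct foo *f)
{
	u32 seq;

	six_lock_intent(&f->lock);	/* blocks other intent holders, not readers */
	six_lock_write(&f->lock);	/* exclude readers for the actual modification */
	/* ... modify f ... */
	six_unlock_write(&f->lock);

	seq = f->lock.state.seq;	/* remember the sequence number */
	six_unlock_intent(&f->lock);

	/* later: retake the lock iff it hasn't been write locked in the meantime */
	if (six_relock(&f->lock, SIX_LOCK_intent, seq)) {
		/* ... */
		six_unlock_intent(&f->lock);
	}
}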
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 55da242c..1eab7c77 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -412,7 +412,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
- lg_lock_free(&c->usage_lock);
+ percpu_free_rwsem(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
@@ -643,7 +643,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
- lg_lock_init(&c->usage_lock) ||
+ percpu_init_rwsem(&c->usage_lock) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
@@ -1215,6 +1215,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
+ bch2_dev_sysfs_online(c, ca);
+
if (c->sb.nr_devices == 1)
bdevname(ca->disk_sb.bdev, c->name);
bdevname(ca->disk_sb.bdev, ca->name);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 5e341a71..66b5b9f9 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -921,7 +921,7 @@ STORE(bch2_dev)
}
if (attr == &sysfs_cache_replacement_policy) {
- ssize_t v = bch2_read_string_list(buf, bch2_cache_replacement_policies);
+ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
if (v < 0)
return v;
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 60e1f1ff..e263dd20 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -126,24 +126,6 @@ ssize_t bch2_scnprint_string_list(char *buf, size_t size,
return out - buf;
}
-ssize_t bch2_read_string_list(const char *buf, const char * const list[])
-{
- size_t i, len;
-
- buf = skip_spaces(buf);
-
- len = strlen(buf);
- while (len && isspace(buf[len - 1]))
- --len;
-
- for (i = 0; list[i]; i++)
- if (strlen(list[i]) == len &&
- !memcmp(buf, list[i], len))
- break;
-
- return list[i] ? i : -EINVAL;
-}
-
ssize_t bch2_scnprint_flag_list(char *buf, size_t size,
const char * const list[], u64 flags)
{
@@ -178,7 +160,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[])
s = strim(d);
while ((p = strsep(&s, ","))) {
- int flag = bch2_read_string_list(p, list);
+ int flag = match_string(list, -1, p);
if (flag < 0) {
ret = -1;
break;
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 18491559..487591c4 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -227,57 +227,6 @@ do { \
heap_sift_down(heap, _i, cmp); \
} while (0)
-/*
- * Simple array based allocator - preallocates a number of elements and you can
- * never allocate more than that, also has no locking.
- *
- * Handy because if you know you only need a fixed number of elements you don't
- * have to worry about memory allocation failure, and sometimes a mempool isn't
- * what you want.
- *
- * We treat the free elements as entries in a singly linked list, and the
- * freelist as a stack - allocating and freeing push and pop off the freelist.
- */
-
-#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
- struct { \
- type *freelist; \
- type data[size]; \
- } name
-
-#define array_alloc(array) \
-({ \
- typeof((array)->freelist) _ret = (array)->freelist; \
- \
- if (_ret) \
- (array)->freelist = *((typeof((array)->freelist) *) _ret);\
- \
- _ret; \
-})
-
-#define array_free(array, ptr) \
-do { \
- typeof((array)->freelist) _ptr = ptr; \
- \
- *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
- (array)->freelist = _ptr; \
-} while (0)
-
-#define array_allocator_init(array) \
-do { \
- typeof((array)->freelist) _i; \
- \
- BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
- (array)->freelist = NULL; \
- \
- for (_i = (array)->data; \
- _i < (array)->data + ARRAY_SIZE((array)->data); \
- _i++) \
- array_free(array, _i); \
-} while (0)
-
-#define array_freelist_empty(array) ((array)->freelist == NULL)
-
#define ANYSINT_MAX(t) \
((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
@@ -359,8 +308,6 @@ bool bch2_is_zero(const void *, size_t);
ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[], size_t);
-ssize_t bch2_read_string_list(const char *, const char * const[]);
-
ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64);
u64 bch2_read_flag_list(char *, const char * const[]);
diff --git a/linux/string.c b/linux/string.c
index 0f23f074..4fa3f64b 100644
--- a/linux/string.c
+++ b/linux/string.c
@@ -95,3 +95,19 @@ void memzero_explicit(void *s, size_t count)
memset(s, 0, count);
barrier_data(s);
}
+
+int match_string(const char * const *array, size_t n, const char *string)
+{
+ int index;
+ const char *item;
+
+ for (index = 0; index < n; index++) {
+ item = array[index];
+ if (!item)
+ break;
+ if (!strcmp(item, string))
+ return index;
+ }
+
+ return -EINVAL;
+}
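
The callers converted by this patch pass -1 for n, so the walk stops at the NULL sentinel rather than at a fixed count; a small sketch of that usage follows (example_modes and example() are made up for illustration):

static int example(void)
{
	static const char * const example_modes[] = {
		"none", "lz4", "gzip", NULL
	};

	/* returns 1; an unrecognized string returns -EINVAL */
	return match_string(example_modes, -1, "lz4");
}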
diff --git a/tools-util.c b/tools-util.c
index 8474ab06..ca6d89a5 100644
--- a/tools-util.c
+++ b/tools-util.c
@@ -218,7 +218,7 @@ u64 read_file_u64(int dirfd, const char *path)
ssize_t read_string_list_or_die(const char *opt, const char * const list[],
const char *msg)
{
- ssize_t v = bch2_read_string_list(opt, list);
+ ssize_t v = match_string(list, -1, opt);
if (v < 0)
die("Bad %s %s", msg, opt);