author     Kent Overstreet <kent.overstreet@gmail.com>   2017-11-22 00:42:55 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>   2017-11-22 00:50:47 -0500
commit     22291ae84a029d65334d1a90b67b5031f45cd540 (patch)
tree       ab9fefe205577324915545b21535fcccbff89f48 /libbcachefs
parent     74cb92203293a8d5b16b078389f6b3dba5300e89 (diff)
Update bcachefs sources to 9e7ae5219c bcachefs: Make write points more dynamic
Diffstat (limited to 'libbcachefs')
-rw-r--r--  libbcachefs/alloc.c                  | 600
-rw-r--r--  libbcachefs/alloc.h                  |  30
-rw-r--r--  libbcachefs/alloc_types.h            |  38
-rw-r--r--  libbcachefs/bcachefs.h               |  53
-rw-r--r--  libbcachefs/btree_cache.c            | 253
-rw-r--r--  libbcachefs/btree_cache.h            |  15
-rw-r--r--  libbcachefs/btree_gc.c               |  20
-rw-r--r--  libbcachefs/btree_io.c               |  10
-rw-r--r--  libbcachefs/btree_iter.c             |   4
-rw-r--r--  libbcachefs/btree_types.h            |  36
-rw-r--r--  libbcachefs/btree_update_interior.c  |  53
-rw-r--r--  libbcachefs/buckets.c                |   7
-rw-r--r--  libbcachefs/compress.c               |  47
-rw-r--r--  libbcachefs/extents.c                |  10
-rw-r--r--  libbcachefs/fs-io.c                  |  18
-rw-r--r--  libbcachefs/fs-io.h                  |   2
-rw-r--r--  libbcachefs/io.c                     |  53
-rw-r--r--  libbcachefs/io.h                     |  18
-rw-r--r--  libbcachefs/io_types.h               |   5
-rw-r--r--  libbcachefs/migrate.c                |   5
-rw-r--r--  libbcachefs/move.c                   |  12
-rw-r--r--  libbcachefs/move.h                   |  17
-rw-r--r--  libbcachefs/movinggc.c               |   3
-rw-r--r--  libbcachefs/super.c                  |  14
-rw-r--r--  libbcachefs/sysfs.c                  |   8
-rw-r--r--  libbcachefs/tier.c                   |   3
26 files changed, 766 insertions, 568 deletions
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index a1086576..dc7348fc 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -70,6 +70,7 @@
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
+#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
@@ -1118,6 +1119,7 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
{
enum bucket_alloc_ret ret = NO_DEVICES;
struct dev_alloc_list devs_sorted;
+ u64 buckets_free;
unsigned i;
BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
@@ -1127,46 +1129,55 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
rcu_read_lock();
devs_sorted = bch2_wp_alloc_list(c, wp, devs);
+ spin_lock(&ob->lock);
for (i = 0; i < devs_sorted.nr; i++) {
struct bch_dev *ca =
rcu_dereference(c->devs[devs_sorted.devs[i]]);
- long bucket;
+ struct open_bucket_ptr ptr;
if (!ca)
continue;
- bucket = bch2_bucket_alloc(c, ca, reserve);
- if (bucket < 0) {
- ret = FREELIST_EMPTY;
- continue;
- }
-
- wp->next_alloc[ca->dev_idx] +=
- div64_u64(U64_MAX, dev_buckets_free(ca) *
- ca->mi.bucket_size);
- bch2_wp_rescale(c, ca, wp);
+ if (wp->type == BCH_DATA_USER &&
+ ca->open_buckets_partial_nr) {
+ ptr = ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+ } else {
+ long bucket = bch2_bucket_alloc(c, ca, reserve);
+ if (bucket < 0) {
+ ret = FREELIST_EMPTY;
+ continue;
+ }
- __clear_bit(ca->dev_idx, devs->d);
+ ptr = (struct open_bucket_ptr) {
+ .ptr.gen = ca->buckets[bucket].mark.gen,
+ .ptr.offset = bucket_to_sector(ca, bucket),
+ .ptr.dev = ca->dev_idx,
+ .sectors_free = ca->mi.bucket_size,
+ };
+ }
/*
* open_bucket_add_buckets expects new pointers at the head of
* the list:
*/
- BUG_ON(ob->nr_ptrs >= BCH_REPLICAS_MAX);
+ BUG_ON(ob->nr_ptrs >= ARRAY_SIZE(ob->ptrs));
memmove(&ob->ptrs[1],
&ob->ptrs[0],
ob->nr_ptrs * sizeof(ob->ptrs[0]));
- memmove(&ob->ptr_offset[1],
- &ob->ptr_offset[0],
- ob->nr_ptrs * sizeof(ob->ptr_offset[0]));
ob->nr_ptrs++;
- ob->ptrs[0] = (struct bch_extent_ptr) {
- .gen = ca->buckets[bucket].mark.gen,
- .offset = bucket_to_sector(ca, bucket),
- .dev = ca->dev_idx,
- };
- ob->ptr_offset[0] = 0;
+ ob->ptrs[0] = ptr;
+
+ buckets_free = dev_buckets_free(ca);
+ if (buckets_free)
+ wp->next_alloc[ca->dev_idx] +=
+ div64_u64(U64_MAX, buckets_free *
+ ca->mi.bucket_size);
+ else
+ wp->next_alloc[ca->dev_idx] = U64_MAX;
+ bch2_wp_rescale(c, ca, wp);
+
+ __clear_bit(ca->dev_idx, devs->d);
if (ob->nr_ptrs == nr_replicas) {
ret = ALLOC_SUCCESS;
@@ -1175,6 +1186,7 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
}
EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
+ spin_unlock(&ob->lock);
rcu_read_unlock();
return ret;
}
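
Aside on the hunk above: the next_alloc bookkeeping is a stride-scheduler style weighting — each time a device is picked, its counter advances by U64_MAX divided by its free space (free buckets times bucket size), so devices with more free space are picked proportionally more often, and a device with no free buckets is pushed to the back by setting its counter to U64_MAX. The following is a minimal standalone model of that idea, not bcachefs code: device sizes are made up, and it assumes the sorted device list effectively prefers the smallest counter, which is how bch2_wp_alloc_list() appears to be used here.

	/* Standalone sketch of stride-style weighting by free space (illustrative only). */
	#include <stdint.h>
	#include <stdio.h>

	#define NR_DEVS 3

	int main(void)
	{
		uint64_t free_space[NR_DEVS] = { 1000, 4000, 8000 };
		uint64_t next_alloc[NR_DEVS] = { 0 };
		unsigned picks[NR_DEVS] = { 0 };

		for (int i = 0; i < 13000; i++) {
			int best = 0;

			/* pick the device whose counter is furthest behind */
			for (int d = 1; d < NR_DEVS; d++)
				if (next_alloc[d] < next_alloc[best])
					best = d;

			picks[best]++;
			/* the less free space a device has, the bigger its stride */
			next_alloc[best] += UINT64_MAX / free_space[best];
		}

		for (int d = 0; d < NR_DEVS; d++)
			printf("dev %d picked %u times\n", d, picks[d]);
		return 0;
	}

With the sizes above the pick counts come out roughly 1000/4000/8000, i.e. proportional to each device's free space.
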
@@ -1242,24 +1254,45 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- const struct bch_extent_ptr *ptr;
+ const struct open_bucket_ptr *ptr;
u8 new_ob;
if (!atomic_dec_and_test(&ob->pin))
return;
- spin_lock(&c->open_buckets_lock);
+ down_read(&c->alloc_gc_lock);
+ spin_lock(&ob->lock);
+
open_bucket_for_each_ptr(ob, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = c->devs[ptr->ptr.dev];
- bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false);
+ if (ptr->sectors_free) {
+ /*
+ * This is a ptr to a bucket that still has free space,
+ * but we don't want to use it
+ */
+ BUG_ON(ca->open_buckets_partial_nr >=
+ ARRAY_SIZE(ca->open_buckets_partial));
+
+ spin_lock(&ca->freelist_lock);
+ ca->open_buckets_partial[ca->open_buckets_partial_nr++]
+ = *ptr;
+ spin_unlock(&ca->freelist_lock);
+ } else {
+ bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), false);
+ }
}
-
ob->nr_ptrs = 0;
+
+ spin_unlock(&ob->lock);
+ up_read(&c->alloc_gc_lock);
+
new_ob = ob->new_ob;
ob->new_ob = 0;
- list_move(&ob->list, &c->open_buckets_free);
+ spin_lock(&c->open_buckets_lock);
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
c->open_buckets_nr_free++;
spin_unlock(&c->open_buckets_lock);
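
The new branch in bch2_open_bucket_put() above stashes pointers that still have sectors_free on a per-device open_buckets_partial array, so a later allocation can reuse the partially filled bucket instead of marking it free. A simplified, self-contained model of that reuse path follows; everything in it is illustrative, not bcachefs API.

	/* Sketch of a per-device stash of partially used buckets (illustrative only). */
	#include <stdio.h>

	#define MAX_PARTIAL 4

	struct bucket_ptr {
		unsigned bucket;
		unsigned sectors_free;
	};

	static struct bucket_ptr partial[MAX_PARTIAL];
	static unsigned partial_nr;

	static void release_ptr(struct bucket_ptr p)
	{
		if (p.sectors_free && partial_nr < MAX_PARTIAL)
			partial[partial_nr++] = p;	/* keep for the next writer */
		else
			printf("bucket %u returned to the allocator\n", p.bucket);
	}

	static int reuse_partial(struct bucket_ptr *p)
	{
		if (!partial_nr)
			return 0;
		*p = partial[--partial_nr];
		return 1;
	}

	int main(void)
	{
		release_ptr((struct bucket_ptr) { .bucket = 7, .sectors_free = 96 });
		release_ptr((struct bucket_ptr) { .bucket = 9, .sectors_free = 0 });

		struct bucket_ptr p;
		if (reuse_partial(&p))
			printf("reusing bucket %u, %u sectors free\n",
			       p.bucket, p.sectors_free);
		return 0;
	}
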
@@ -1270,22 +1303,19 @@ void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
}
static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c,
- unsigned nr_reserved,
- struct closure *cl)
+ unsigned nr_reserved,
+ struct closure *cl)
{
struct open_bucket *ret;
spin_lock(&c->open_buckets_lock);
if (c->open_buckets_nr_free > nr_reserved) {
- BUG_ON(list_empty(&c->open_buckets_free));
- ret = list_first_entry(&c->open_buckets_free,
- struct open_bucket, list);
- list_move(&ret->list, &c->open_buckets_open);
- BUG_ON(ret->nr_ptrs);
+ BUG_ON(!c->open_buckets_freelist);
+ ret = c->open_buckets + c->open_buckets_freelist;
+ c->open_buckets_freelist = ret->freelist;
atomic_set(&ret->pin, 1); /* XXX */
- ret->has_full_ptrs = false;
BUG_ON(ret->new_ob);
BUG_ON(ret->nr_ptrs);
@@ -1307,148 +1337,259 @@ static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c,
return ret;
}
-static unsigned ob_ptr_sectors_free(struct bch_fs *c,
- struct open_bucket *ob,
- struct bch_extent_ptr *ptr)
-{
- struct bch_dev *ca = c->devs[ptr->dev];
- unsigned i = ptr - ob->ptrs;
- unsigned used = bucket_remainder(ca, ptr->offset) +
- ob->ptr_offset[i];
-
- BUG_ON(used > ca->mi.bucket_size);
-
- return ca->mi.bucket_size - used;
-}
-
static unsigned open_bucket_sectors_free(struct bch_fs *c,
struct open_bucket *ob,
unsigned nr_replicas)
{
- unsigned i, sectors_free = UINT_MAX;
+ unsigned sectors_free = UINT_MAX;
+ struct open_bucket_ptr *ptr;
- for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++)
- sectors_free = min(sectors_free,
- ob_ptr_sectors_free(c, ob, &ob->ptrs[i]));
+ open_bucket_for_each_ptr(ob, ptr)
+ sectors_free = min(sectors_free, ptr->sectors_free);
return sectors_free != UINT_MAX ? sectors_free : 0;
}
-static void open_bucket_copy_unused_ptrs(struct bch_fs *c,
- struct open_bucket *new,
- struct open_bucket *old)
+static void open_bucket_move_ptrs(struct bch_fs *c,
+ struct open_bucket *dst,
+ struct open_bucket *src,
+ struct bch_devs_mask *devs,
+ unsigned nr_ptrs_dislike)
{
bool moved_ptr = false;
int i;
- for (i = old->nr_ptrs - 1; i >= 0; --i)
- if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) {
- BUG_ON(new->nr_ptrs >= BCH_REPLICAS_MAX);
+ down_read(&c->alloc_gc_lock);
- new->ptrs[new->nr_ptrs] = old->ptrs[i];
- new->ptr_offset[new->nr_ptrs] = old->ptr_offset[i];
- new->nr_ptrs++;
+ if (dst < src) {
+ spin_lock(&dst->lock);
+ spin_lock_nested(&src->lock, 1);
+ } else {
+ spin_lock(&src->lock);
+ spin_lock_nested(&dst->lock, 1);
+ }
- old->nr_ptrs--;
- memmove(&old->ptrs[i],
- &old->ptrs[i + 1],
- (old->nr_ptrs - i) * sizeof(old->ptrs[0]));
- memmove(&old->ptr_offset[i],
- &old->ptr_offset[i + 1],
- (old->nr_ptrs - i) * sizeof(old->ptr_offset[0]));
+ for (i = src->nr_ptrs - 1; i >= 0; --i) {
+ if (!src->ptrs[i].sectors_free) {
+ /*
+ * Don't do anything: leave the ptr on the old
+ * open_bucket for gc to find
+ */
+ } else if (nr_ptrs_dislike &&
+ !test_bit(src->ptrs[i].ptr.dev, devs->d)) {
+ /*
+ * We don't want this pointer; bch2_open_bucket_put()
+ * will stick it on ca->open_buckets_partial to be
+ * reused
+ */
+ --nr_ptrs_dislike;
+ } else {
+ BUG_ON(dst->nr_ptrs >= ARRAY_SIZE(dst->ptrs));
+
+ dst->ptrs[dst->nr_ptrs++] = src->ptrs[i];
+
+ src->nr_ptrs--;
+ memmove(&src->ptrs[i],
+ &src->ptrs[i + 1],
+ (src->nr_ptrs - i) * sizeof(src->ptrs[0]));
moved_ptr = true;
}
+ }
if (moved_ptr) {
- BUG_ON(old->new_ob);
+ BUG_ON(src->new_ob);
- atomic_inc(&new->pin);
- old->new_ob = new - c->open_buckets;
+ atomic_inc(&dst->pin);
+ src->new_ob = dst - c->open_buckets;
}
+
+ spin_unlock(&dst->lock);
+ spin_unlock(&src->lock);
+ up_read(&c->alloc_gc_lock);
}
static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- const struct bch_extent_ptr *ptr;
+ const struct open_bucket_ptr *ptr;
open_bucket_for_each_ptr(ob, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = c->devs[ptr->ptr.dev];
- BUG_ON(ptr_stale(ca, ptr));
+ BUG_ON(ptr_stale(ca, &ptr->ptr));
}
#endif
}
/* Sector allocator */
-static struct open_bucket *lock_writepoint(struct bch_fs *c,
- struct write_point *wp)
-{
- struct open_bucket *ob;
-
- while ((ob = ACCESS_ONCE(wp->b))) {
- mutex_lock(&ob->lock);
- if (wp->b == ob)
- break;
-
- mutex_unlock(&ob->lock);
- }
-
- return ob;
-}
-
static int open_bucket_add_buckets(struct bch_fs *c,
struct write_point *wp,
+ struct bch_devs_mask *_devs,
struct open_bucket *ob,
unsigned nr_replicas,
- unsigned nr_replicas_required,
enum alloc_reserve reserve,
struct closure *cl)
{
struct bch_devs_mask devs = c->rw_devs[wp->type];
- unsigned i;
- int ret;
+ struct open_bucket_ptr *ptr;
if (ob->nr_ptrs >= nr_replicas)
return 0;
+ if (_devs)
+ bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX);
+
/* Don't allocate from devices we already have pointers to: */
- for (i = 0; i < ob->nr_ptrs; i++)
- __clear_bit(ob->ptrs[i].dev, devs.d);
+ open_bucket_for_each_ptr(ob, ptr)
+ if (ptr->sectors_free)
+ __clear_bit(ptr->ptr.dev, devs.d);
- if (wp->group)
- bitmap_and(devs.d, devs.d, wp->group->d, BCH_SB_MEMBERS_MAX);
+ return bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
+ reserve, &devs, cl);
+}
- ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
- reserve, &devs, cl);
+static struct write_point *__writepoint_find(struct hlist_head *head,
+ unsigned long write_point)
+{
+ struct write_point *wp;
- if (ret == -EROFS &&
- ob->nr_ptrs >= nr_replicas_required)
- ret = 0;
+ hlist_for_each_entry_rcu(wp, head, node) {
+ if (wp->write_point != write_point)
+ continue;
- return ret;
+ mutex_lock(&wp->lock);
+ if (wp->write_point == write_point)
+ return wp;
+ mutex_unlock(&wp->lock);
+ }
+
+ return NULL;
+}
+
+static struct hlist_head *writepoint_hash(struct bch_fs *c,
+ unsigned long write_point)
+{
+ unsigned hash =
+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
+
+ return &c->write_points_hash[hash];
+}
+
+static struct write_point *writepoint_find(struct bch_fs *c,
+ enum bch_data_type data_type,
+ unsigned long write_point)
+{
+ struct write_point *wp, *oldest = NULL;
+ struct hlist_head *head;
+
+ switch (data_type) {
+ case BCH_DATA_BTREE:
+ wp = &c->btree_write_point;
+ mutex_lock(&wp->lock);
+ return wp;
+ case BCH_DATA_USER:
+ break;
+ default:
+ BUG();
+ }
+
+ head = writepoint_hash(c, write_point);
+ wp = __writepoint_find(head, write_point);
+ if (wp)
+ goto out;
+
+ mutex_lock(&c->write_points_hash_lock);
+ wp = __writepoint_find(head, write_point);
+ if (wp)
+ goto out_unlock;
+
+ for (wp = c->write_points;
+ wp < c->write_points + ARRAY_SIZE(c->write_points);
+ wp++)
+ if (!oldest || time_before64(wp->last_used, oldest->last_used))
+ oldest = wp;
+
+ wp = oldest;
+ BUG_ON(!wp);
+
+ mutex_lock(&wp->lock);
+ hlist_del_rcu(&wp->node);
+ wp->write_point = write_point;
+ hlist_add_head_rcu(&wp->node, head);
+out_unlock:
+ mutex_unlock(&c->write_points_hash_lock);
+out:
+ wp->last_used = sched_clock();
+ return wp;
}
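
For orientation: writepoint_find() above keys write points by an opaque unsigned long, looks them up through write_points_hash, and on a miss steals and re-keys the least recently used entry. Below is a simplified standalone model of that recycling scheme only — the hash table, RCU and locking are omitted (it scans a small array instead), and every name in it is illustrative rather than bcachefs API.

	/* Sketch of LRU-recycled write points keyed by an opaque id (illustrative only). */
	#include <stdint.h>
	#include <stdio.h>

	#define NR_WP 4

	struct wp {
		unsigned long	key;		/* models write_point::write_point */
		uint64_t	last_used;	/* models write_point::last_used */
	};

	static struct wp wps[NR_WP];
	static uint64_t clock_now;

	static struct wp *wp_find(unsigned long key)
	{
		struct wp *oldest = &wps[0];

		for (int i = 0; i < NR_WP; i++) {
			if (wps[i].key == key) {
				wps[i].last_used = ++clock_now;
				return &wps[i];
			}
			if (wps[i].last_used < oldest->last_used)
				oldest = &wps[i];
		}

		/* miss: steal the least recently used write point and re-key it */
		oldest->key = key;
		oldest->last_used = ++clock_now;
		return oldest;
	}

	int main(void)
	{
		for (unsigned long k = 1; k <= 6; k++)
			printf("key %lu -> write point %td\n", k, wp_find(k) - wps);
		return 0;
	}
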
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
-struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c,
- struct write_point *wp,
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bch_devs_mask *devs,
+ unsigned long write_point,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
+ unsigned flags,
struct closure *cl)
{
struct open_bucket *ob;
- unsigned open_buckets_reserved = wp == &c->btree_write_point
+ struct write_point *wp;
+ struct open_bucket_ptr *ptr;
+ unsigned open_buckets_reserved = data_type == BCH_DATA_BTREE
? 0 : BTREE_NODE_RESERVE;
+ unsigned nr_ptrs_empty = 0, nr_ptrs_dislike = 0;
int ret;
BUG_ON(!nr_replicas);
-retry:
- ob = lock_writepoint(c, wp);
+
+ wp = writepoint_find(c, data_type, write_point);
+ BUG_ON(wp->type != data_type);
+
+ wp->last_used = sched_clock();
+
+ ob = wp->ob;
+
+ /* does ob have ptrs we don't need? */
+ open_bucket_for_each_ptr(ob, ptr) {
+ if (!ptr->sectors_free)
+ nr_ptrs_empty++;
+ else if (devs && !test_bit(ptr->ptr.dev, devs->d))
+ nr_ptrs_dislike++;
+ }
+
+ ret = open_bucket_add_buckets(c, wp, devs, ob,
+ nr_replicas + nr_ptrs_empty + nr_ptrs_dislike,
+ reserve, cl);
+ if (ret && ret != -EROFS)
+ goto err;
+
+ if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ goto alloc_done;
+
+ /*
+ * XXX:
+ * Should this allocation be _forced_ to use the specified device (e.g.
+ * internal migration), or should we fall back to allocating from all
+ * devices?
+ */
+ ret = open_bucket_add_buckets(c, wp, NULL, ob,
+ nr_replicas + nr_ptrs_empty,
+ reserve, cl);
+ if (ret && ret != -EROFS)
+ goto err;
+alloc_done:
+ if (ob->nr_ptrs - nr_ptrs_empty -
+ ((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0)
+ < nr_replicas_required) {
+ ret = -EROFS;
+ goto err;
+ }
/*
* If ob->sectors_free == 0, one or more of the buckets ob points to is
@@ -1456,53 +1597,34 @@ retry:
* still needs to find them; instead, we must allocate a new open bucket
* and copy any pointers to non-full buckets into the new open bucket.
*/
- if (!ob || ob->has_full_ptrs) {
- struct open_bucket *new_ob;
-
- new_ob = bch2_open_bucket_get(c, open_buckets_reserved, cl);
- if (IS_ERR(new_ob))
- return new_ob;
-
- mutex_lock(&new_ob->lock);
-
- /*
- * We point the write point at the open_bucket before doing the
- * allocation to avoid a race with shutdown:
- */
- if (race_fault() ||
- cmpxchg(&wp->b, ob, new_ob) != ob) {
- /* We raced: */
- mutex_unlock(&new_ob->lock);
- bch2_open_bucket_put(c, new_ob);
-
- if (ob)
- mutex_unlock(&ob->lock);
- goto retry;
+ BUG_ON(ob->nr_ptrs - nr_ptrs_empty - nr_replicas > nr_ptrs_dislike);
+ nr_ptrs_dislike = ob->nr_ptrs - nr_ptrs_empty - nr_replicas;
+
+ if (nr_ptrs_empty || nr_ptrs_dislike) {
+ ob = bch2_open_bucket_get(c, open_buckets_reserved, cl);
+ if (IS_ERR(ob)) {
+ ret = PTR_ERR(ob);
+ goto err;
}
- if (ob) {
- open_bucket_copy_unused_ptrs(c, new_ob, ob);
- mutex_unlock(&ob->lock);
- bch2_open_bucket_put(c, ob);
- }
+ /* Remove pointers we don't want to use: */
- ob = new_ob;
+ open_bucket_move_ptrs(c, ob, wp->ob, devs, nr_ptrs_dislike);
+ bch2_open_bucket_put(c, wp->ob);
+ wp->ob = ob;
}
- ret = open_bucket_add_buckets(c, wp, ob, nr_replicas,
- nr_replicas_required,
- reserve, cl);
- if (ret) {
- mutex_unlock(&ob->lock);
- return ERR_PTR(ret);
- }
+ BUG_ON(ob->nr_ptrs < nr_replicas_required);
- ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
+ wp->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
- BUG_ON(!ob->sectors_free);
+ BUG_ON(!wp->sectors_free);
verify_not_stale(c, ob);
- return ob;
+ return wp;
+err:
+ mutex_unlock(&wp->lock);
+ return ERR_PTR(ret);
}
/*
@@ -1514,29 +1636,26 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
unsigned sectors)
{
struct bch_extent_ptr tmp;
- bool has_data = false;
- unsigned i;
+ struct open_bucket_ptr *ptr;
/*
* We're keeping any existing pointer k has, and appending new pointers:
* __bch2_write() will only write to the pointers we add here:
*/
- BUG_ON(sectors > ob->sectors_free);
-
- /* didn't use all the ptrs: */
- if (nr_replicas < ob->nr_ptrs)
- has_data = true;
+ for (ptr = ob->ptrs;
+ ptr < ob->ptrs + min_t(u8, ob->nr_ptrs, nr_replicas); ptr++) {
+ struct bch_dev *ca = c->devs[ptr->ptr.dev];
- for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) {
- EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
+ EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ptr->ptr.dev));
- tmp = ob->ptrs[i];
+ tmp = ptr->ptr;
tmp.cached = bkey_extent_is_cached(&e->k);
- tmp.offset += ob->ptr_offset[i];
+ tmp.offset += ca->mi.bucket_size - ptr->sectors_free;
extent_ptr_append(e, tmp);
- ob->ptr_offset[i] += sectors;
+ BUG_ON(sectors > ptr->sectors_free);
+ ptr->sectors_free -= sectors;
}
}
@@ -1544,25 +1663,27 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
* Append pointers to the space we just allocated to @k, and mark @sectors space
* as allocated out of @ob
*/
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp,
- struct open_bucket *ob)
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
- bool has_data = false;
- unsigned i;
+ struct open_bucket *ob = wp->ob, *new_ob = NULL;
+ struct open_bucket_ptr *ptr;
+ bool empty = false;
- for (i = 0; i < ob->nr_ptrs; i++) {
- if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i]))
- ob->has_full_ptrs = true;
- else
- has_data = true;
- }
+ open_bucket_for_each_ptr(ob, ptr)
+ empty |= !ptr->sectors_free;
+
+ if (empty)
+ new_ob = bch2_open_bucket_get(c, 0, NULL);
- if (likely(has_data))
+ if (!IS_ERR_OR_NULL(new_ob)) {
+ /* writepoint's ref becomes our ref: */
+ wp->ob = new_ob;
+ open_bucket_move_ptrs(c, new_ob, ob, 0, 0);
+ } else {
atomic_inc(&ob->pin);
- else
- BUG_ON(xchg(&wp->b, NULL) != ob);
+ }
- mutex_unlock(&ob->lock);
+ mutex_unlock(&wp->lock);
}
/*
@@ -1583,27 +1704,33 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp,
* @cl - closure to wait for a bucket
*/
struct open_bucket *bch2_alloc_sectors(struct bch_fs *c,
- struct write_point *wp,
+ enum bch_data_type data_type,
+ struct bch_devs_mask *devs,
+ unsigned long write_point,
struct bkey_i_extent *e,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
+ unsigned flags,
struct closure *cl)
{
+ struct write_point *wp;
struct open_bucket *ob;
- ob = bch2_alloc_sectors_start(c, wp, nr_replicas,
- nr_replicas_required,
- reserve, cl);
- if (IS_ERR_OR_NULL(ob))
- return ob;
+ wp = bch2_alloc_sectors_start(c, data_type, devs, write_point,
+ nr_replicas, nr_replicas_required,
+ reserve, flags, cl);
+ if (IS_ERR_OR_NULL(wp))
+ return ERR_CAST(wp);
- if (e->k.size > ob->sectors_free)
- bch2_key_resize(&e->k, ob->sectors_free);
+ ob = wp->ob;
+
+ if (e->k.size > wp->sectors_free)
+ bch2_key_resize(&e->k, wp->sectors_free);
bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size);
- bch2_alloc_sectors_done(c, wp, ob);
+ bch2_alloc_sectors_done(c, wp);
return ob;
}
@@ -1640,8 +1767,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
}
c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
-
- c->promote_write_point.group = &fastest_tier->devs;
+ c->fastest_devs = fastest_tier != slowest_tier ? &fastest_tier->devs : NULL;
if (!fastest_tier)
goto set_capacity;
@@ -1713,49 +1839,61 @@ set_capacity:
closure_wake_up(&c->freelist_wait);
}
+static bool open_bucket_has_device(struct open_bucket *ob,
+ struct bch_dev *ca)
+{
+ struct open_bucket_ptr *ptr;
+ bool ret = false;
+
+ spin_lock(&ob->lock);
+ open_bucket_for_each_ptr(ob, ptr)
+ ret |= ptr->ptr.dev == ca->dev_idx;
+ spin_unlock(&ob->lock);
+
+ return ret;
+}
+
static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
{
struct open_bucket *ob;
- struct bch_extent_ptr *ptr;
+ struct closure cl;
- ob = lock_writepoint(c, wp);
- if (!ob)
+ closure_init_stack(&cl);
+retry:
+ mutex_lock(&wp->lock);
+ if (!open_bucket_has_device(wp->ob, ca)) {
+ mutex_unlock(&wp->lock);
return;
+ }
- for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
- if (ptr->dev == ca->dev_idx)
- goto found;
+ ob = bch2_open_bucket_get(c, 0, &cl);
+ if (IS_ERR(ob)) {
+ mutex_unlock(&wp->lock);
+ closure_sync(&cl);
+ goto retry;
+
+ }
- mutex_unlock(&ob->lock);
- return;
-found:
- BUG_ON(xchg(&wp->b, NULL) != ob);
- mutex_unlock(&ob->lock);
+ open_bucket_move_ptrs(c, ob, wp->ob, &ca->self, ob->nr_ptrs);
+ bch2_open_bucket_put(c, wp->ob);
+ wp->ob = ob;
- /* Drop writepoint's ref: */
- bch2_open_bucket_put(c, ob);
+ mutex_unlock(&wp->lock);
}
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_extent_ptr *ptr;
struct open_bucket *ob;
+ bool ret = false;
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++)
- if (atomic_read(&ob->pin)) {
- mutex_lock(&ob->lock);
- for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
- if (ptr->dev == ca->dev_idx) {
- mutex_unlock(&ob->lock);
- return true;
- }
- mutex_unlock(&ob->lock);
- }
+ if (atomic_read(&ob->pin))
+ ret |= open_bucket_has_device(ob, ca);
- return false;
+ return ret;
}
/* device goes ro: */
@@ -1782,11 +1920,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* Next, close write points that point to this device... */
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
bch2_stop_write_point(c, ca, &c->write_points[i]);
-
- bch2_stop_write_point(c, ca, &ca->copygc_write_point);
- bch2_stop_write_point(c, ca, &c->promote_write_point);
- bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp);
- bch2_stop_write_point(c, ca, &c->migration_write_point);
bch2_stop_write_point(c, ca, &c->btree_write_point);
mutex_lock(&c->btree_reserve_cache_lock);
@@ -1880,35 +2013,44 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_init(struct bch_fs *c)
{
- unsigned i;
+ struct open_bucket *ob;
+ struct write_point *wp;
- INIT_LIST_HEAD(&c->open_buckets_open);
- INIT_LIST_HEAD(&c->open_buckets_free);
+ mutex_init(&c->write_points_hash_lock);
+ init_rwsem(&c->alloc_gc_lock);
spin_lock_init(&c->open_buckets_lock);
bch2_prio_timer_init(c, READ);
bch2_prio_timer_init(c, WRITE);
/* open bucket 0 is a sentinel NULL: */
- mutex_init(&c->open_buckets[0].lock);
- INIT_LIST_HEAD(&c->open_buckets[0].list);
+ spin_lock_init(&c->open_buckets[0].lock);
- for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) {
- mutex_init(&c->open_buckets[i].lock);
+ for (ob = c->open_buckets + 1;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+ spin_lock_init(&ob->lock);
c->open_buckets_nr_free++;
- list_add(&c->open_buckets[i].list, &c->open_buckets_free);
+
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
}
- c->journal.wp.type = BCH_DATA_JOURNAL;
+ mutex_init(&c->btree_write_point.lock);
c->btree_write_point.type = BCH_DATA_BTREE;
+ c->btree_write_point.ob = bch2_open_bucket_get(c, 0, NULL);
+ BUG_ON(IS_ERR(c->btree_write_point.ob));
- for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
- c->tiers[i].wp.type = BCH_DATA_USER;
+ for (wp = c->write_points;
+ wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
+ mutex_init(&wp->lock);
+ wp->type = BCH_DATA_USER;
+ wp->ob = bch2_open_bucket_get(c, 0, NULL);
+ wp->last_used = sched_clock();
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- c->write_points[i].type = BCH_DATA_USER;
+ wp->write_point = (unsigned long) wp;
+ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
- c->promote_write_point.type = BCH_DATA_USER;
- c->migration_write_point.type = BCH_DATA_USER;
+ BUG_ON(IS_ERR(wp->ob));
+ }
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index f07f1bfc..1ea747d2 100644
--- a/libbcachefs/alloc.h
+++ b/libbcachefs/alloc.h
@@ -28,20 +28,28 @@ long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
-struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *,
- struct write_point *,
- unsigned, unsigned,
- enum alloc_reserve,
- struct closure *);
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
+ enum bch_data_type,
+ struct bch_devs_mask *,
+ unsigned long,
+ unsigned, unsigned,
+ enum alloc_reserve,
+ unsigned,
+ struct closure *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *,
unsigned, struct open_bucket *, unsigned);
-void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *,
- struct open_bucket *);
-
-struct open_bucket *bch2_alloc_sectors(struct bch_fs *, struct write_point *,
- struct bkey_i_extent *, unsigned, unsigned,
- enum alloc_reserve, struct closure *);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+
+struct open_bucket *bch2_alloc_sectors(struct bch_fs *,
+ enum bch_data_type,
+ struct bch_devs_mask *,
+ unsigned long,
+ struct bkey_i_extent *,
+ unsigned, unsigned,
+ enum alloc_reserve,
+ unsigned,
+ struct closure *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index bee6d28d..c48d0aaa 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -2,6 +2,7 @@
#define _BCACHEFS_ALLOC_TYPES_H
#include <linux/mutex.h>
+#include <linux/spinlock.h>
#include "clock_types.h"
@@ -44,39 +45,34 @@ enum alloc_reserve {
/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT 256
+#define WRITE_POINT_COUNT 32
-#define WRITE_POINT_COUNT 16
+struct open_bucket_ptr {
+ struct bch_extent_ptr ptr;
+ unsigned sectors_free;
+};
struct open_bucket {
- struct list_head list;
- struct mutex lock;
+ spinlock_t lock;
atomic_t pin;
- bool has_full_ptrs;
+ u8 freelist;
u8 new_ob;
+ u8 nr_ptrs;
- /*
- * recalculated every time we allocate from this open_bucket based on
- * how many pointers we're actually going to use:
- */
- unsigned sectors_free;
- unsigned nr_ptrs;
- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
- unsigned ptr_offset[BCH_REPLICAS_MAX];
+ struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2];
};
struct write_point {
- struct open_bucket *b;
+ struct hlist_node node;
+ struct mutex lock;
+ u64 last_used;
+ unsigned long write_point;
enum bch_data_type type;
- /*
- * If not NULL, cache group for tiering, promotion and moving GC -
- * always allocates a single replica
- *
- * Otherwise do a normal replicated bucket allocation that could come
- * from any device in tier 0 (foreground write)
- */
- struct bch_devs_mask *group;
+ /* calculated based on how many pointers we're actually going to use: */
+ unsigned sectors_free;
+ struct open_bucket *ob;
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
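
Note on the alloc_types.h changes above: the list_head-based open bucket free list is replaced by a u8 index chain — c->open_buckets_freelist holds the index of the first free open_bucket, each free entry's freelist field holds the index of the next, and index 0 is reserved as a sentinel. A self-contained sketch of that index-chained free list (illustrative only, not bcachefs code):

	/* Sketch of a free list chained by array index, with slot 0 as sentinel. */
	#include <stdint.h>
	#include <stdio.h>

	#define NR_OB 8

	struct ob {
		uint8_t freelist;	/* index of next free entry, 0 == end of list */
	};

	static struct ob obs[NR_OB];
	static uint8_t freelist;	/* index of first free entry, 0 == empty */

	static void ob_init(void)
	{
		/* entry 0 is the sentinel and is never handed out */
		for (unsigned i = 1; i < NR_OB; i++) {
			obs[i].freelist = freelist;
			freelist = i;
		}
	}

	static struct ob *ob_get(void)
	{
		if (!freelist)
			return NULL;

		struct ob *ob = &obs[freelist];
		freelist = ob->freelist;
		return ob;
	}

	static void ob_put(struct ob *ob)
	{
		ob->freelist = freelist;
		freelist = ob - obs;
	}

	int main(void)
	{
		ob_init();

		struct ob *a = ob_get(), *b = ob_get();
		printf("got %td and %td\n", a - obs, b - obs);

		ob_put(a);
		printf("after put, next get is %td\n", ob_get() - obs);
		return 0;
	}
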
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 1828bfdf..58d4723e 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -392,6 +392,9 @@ struct bch_dev {
unsigned nr_invalidated;
bool alloc_thread_started;
+ struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT];
+ unsigned open_buckets_partial_nr;
+
size_t fifo_last_bucket;
/* Allocation stuff: */
@@ -426,8 +429,6 @@ struct bch_dev {
struct bch_pd_controller moving_gc_pd;
- struct write_point copygc_write_point;
-
struct journal_device journal;
struct work_struct io_error_work;
@@ -472,7 +473,6 @@ struct bch_tier {
struct bch_pd_controller pd;
struct bch_devs_mask devs;
- struct write_point wp;
};
enum bch_fs_state {
@@ -546,40 +546,7 @@ struct bch_fs {
struct btree_root btree_roots[BTREE_ID_NR];
struct mutex btree_root_lock;
- bool btree_cache_table_init_done;
- struct rhashtable btree_cache_table;
-
- /*
- * We never free a struct btree, except on shutdown - we just put it on
- * the btree_cache_freed list and reuse it later. This simplifies the
- * code, and it doesn't cost us much memory as the memory usage is
- * dominated by buffers that hold the actual btree node data and those
- * can be freed - and the number of struct btrees allocated is
- * effectively bounded.
- *
- * btree_cache_freeable effectively is a small cache - we use it because
- * high order page allocations can be rather expensive, and it's quite
- * common to delete and allocate btree nodes in quick succession. It
- * should never grow past ~2-3 nodes in practice.
- */
- struct mutex btree_cache_lock;
- struct list_head btree_cache;
- struct list_head btree_cache_freeable;
- struct list_head btree_cache_freed;
-
- /* Number of elements in btree_cache + btree_cache_freeable lists */
- unsigned btree_cache_used;
- unsigned btree_cache_reserve;
- struct shrinker btree_cache_shrink;
-
- /*
- * If we need to allocate memory for a new btree node and that
- * allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation - lock to guarantee only one thread does
- * this at a time:
- */
- struct closure_waitlist mca_wait;
- struct task_struct *btree_cache_alloc_lock;
+ struct btree_cache btree_cache;
mempool_t btree_reserve_pool;
@@ -606,6 +573,7 @@ struct bch_fs {
struct workqueue_struct *copygc_wq;
/* ALLOCATION */
+ struct rw_semaphore alloc_gc_lock;
struct bch_pd_controller foreground_write_pd;
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
@@ -622,6 +590,7 @@ struct bch_fs {
struct bch_devs_mask rw_devs[BCH_DATA_NR];
struct bch_tier tiers[BCH_TIER_MAX];
/* NULL if we only have devices in one tier: */
+ struct bch_devs_mask *fastest_devs;
struct bch_tier *fastest_tier;
u64 capacity; /* sectors */
@@ -654,17 +623,17 @@ struct bch_fs {
struct io_clock io_clock[2];
/* SECTOR ALLOCATOR */
- struct list_head open_buckets_open;
- struct list_head open_buckets_free;
- unsigned open_buckets_nr_free;
- struct closure_waitlist open_buckets_wait;
spinlock_t open_buckets_lock;
+ u8 open_buckets_freelist;
+ u8 open_buckets_nr_free;
+ struct closure_waitlist open_buckets_wait;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
struct write_point btree_write_point;
struct write_point write_points[WRITE_POINT_COUNT];
- struct write_point promote_write_point;
+ struct hlist_head write_points_hash[WRITE_POINT_COUNT];
+ struct mutex write_points_hash_lock;
/*
* This write point is used for migrating data off a device
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 4147545d..22846d8a 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -31,13 +31,15 @@ void bch2_recalc_btree_reserve(struct bch_fs *c)
reserve += min_t(unsigned, 1,
c->btree_roots[i].b->level) * 8;
- c->btree_cache_reserve = reserve;
+ c->btree_cache.reserve = reserve;
}
-#define mca_can_free(c) \
- max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve)
+static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+{
+ return max_t(int, 0, bc->used - bc->reserve);
+}
-static void __mca_data_free(struct bch_fs *c, struct btree *b)
+static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
{
EBUG_ON(btree_node_write_in_flight(b));
@@ -46,11 +48,13 @@ static void __mca_data_free(struct bch_fs *c, struct btree *b)
bch2_btree_keys_free(b);
}
-static void mca_data_free(struct bch_fs *c, struct btree *b)
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
- __mca_data_free(c, b);
- c->btree_cache_used--;
- list_move(&b->list, &c->btree_cache_freed);
+ struct btree_cache *bc = &c->btree_cache;
+
+ __btree_node_data_free(c, b);
+ bc->used--;
+ list_move(&b->list, &bc->freed);
}
static const struct rhashtable_params bch_btree_cache_params = {
@@ -59,8 +63,10 @@ static const struct rhashtable_params bch_btree_cache_params = {
.key_len = sizeof(struct bch_extent_ptr),
};
-static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
+ struct btree_cache *bc = &c->btree_cache;
+
b->data = kvpmalloc(btree_bytes(c), gfp);
if (!b->data)
goto err;
@@ -68,16 +74,16 @@ static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
goto err;
- c->btree_cache_used++;
- list_move(&b->list, &c->btree_cache_freeable);
+ bc->used++;
+ list_move(&b->list, &bc->freeable);
return;
err:
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
- list_move(&b->list, &c->btree_cache_freed);
+ list_move(&b->list, &bc->freed);
}
-static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
+static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
struct btree *b = kzalloc(sizeof(struct btree), gfp);
if (!b)
@@ -88,49 +94,48 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
- mca_data_alloc(c, b, gfp);
+ btree_node_data_alloc(c, b, gfp);
return b->data ? b : NULL;
}
/* Btree in memory cache - hash table */
-void bch2_btree_node_hash_remove(struct bch_fs *c, struct btree *b)
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
- rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
- bch_btree_cache_params);
+ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
/* Cause future lookups for this node to fail: */
bkey_i_to_extent(&b->key)->v._data[0] = 0;
}
-int __bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b)
+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
- return rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
+ return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
}
-int bch2_btree_node_hash_insert(struct bch_fs *c, struct btree *b,
- unsigned level, enum btree_id id)
+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
+ unsigned level, enum btree_id id)
{
int ret;
b->level = level;
b->btree_id = id;
- mutex_lock(&c->btree_cache_lock);
- ret = __bch2_btree_node_hash_insert(c, b);
+ mutex_lock(&bc->lock);
+ ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
- list_add(&b->list, &c->btree_cache);
- mutex_unlock(&c->btree_cache_lock);
+ list_add(&b->list, &bc->live);
+ mutex_unlock(&bc->lock);
return ret;
}
__flatten
-static inline struct btree *mca_find(struct bch_fs *c,
+static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
{
- return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k),
+ return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
bch_btree_cache_params);
}
@@ -140,9 +145,10 @@ static inline struct btree *mca_find(struct bch_fs *c,
*/
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
+ struct btree_cache *bc = &c->btree_cache;
int ret = 0;
- lockdep_assert_held(&c->btree_cache_lock);
+ lockdep_assert_held(&bc->lock);
if (!six_trylock_intent(&b->lock))
return -ENOMEM;
@@ -201,11 +207,12 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
return __btree_node_reclaim(c, b, true);
}
-static unsigned long bch2_mca_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache_shrink);
+ btree_cache.shrink);
+ struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free;
@@ -218,8 +225,8 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink,
/* Return -1 if we can't do anything right now */
if (sc->gfp_mask & __GFP_IO)
- mutex_lock(&c->btree_cache_lock);
- else if (!mutex_trylock(&c->btree_cache_lock))
+ mutex_lock(&bc->lock);
+ else if (!mutex_trylock(&bc->lock))
return -1;
/*
@@ -230,11 +237,11 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink,
* IO can always make forward progress:
*/
nr /= btree_pages(c);
- can_free = mca_can_free(c);
+ can_free = btree_cache_can_free(bc);
nr = min_t(unsigned long, nr, can_free);
i = 0;
- list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
touched++;
if (freed >= nr)
@@ -242,34 +249,34 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink,
if (++i > 3 &&
!btree_node_reclaim(c, b)) {
- mca_data_free(c, b);
+ btree_node_data_free(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
freed++;
}
}
restart:
- list_for_each_entry_safe(b, t, &c->btree_cache, list) {
+ list_for_each_entry_safe(b, t, &bc->live, list) {
touched++;
if (freed >= nr) {
/* Save position */
- if (&t->list != &c->btree_cache)
- list_move_tail(&c->btree_cache, &t->list);
+ if (&t->list != &bc->live)
+ list_move_tail(&bc->live, &t->list);
break;
}
if (!btree_node_accessed(b) &&
!btree_node_reclaim(c, b)) {
- /* can't call bch2_btree_node_hash_remove under btree_cache_lock */
+ /* can't call bch2_btree_node_hash_remove under lock */
freed++;
- if (&t->list != &c->btree_cache)
- list_move_tail(&c->btree_cache, &t->list);
+ if (&t->list != &bc->live)
+ list_move_tail(&bc->live, &t->list);
- mca_data_free(c, b);
- mutex_unlock(&c->btree_cache_lock);
+ btree_node_data_free(c, b);
+ mutex_unlock(&bc->lock);
- bch2_btree_node_hash_remove(c, b);
+ bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
@@ -277,97 +284,97 @@ restart:
goto out;
if (sc->gfp_mask & __GFP_IO)
- mutex_lock(&c->btree_cache_lock);
- else if (!mutex_trylock(&c->btree_cache_lock))
+ mutex_lock(&bc->lock);
+ else if (!mutex_trylock(&bc->lock))
goto out;
goto restart;
} else
clear_btree_node_accessed(b);
}
- mutex_unlock(&c->btree_cache_lock);
+ mutex_unlock(&bc->lock);
out:
return (unsigned long) freed * btree_pages(c);
}
-static unsigned long bch2_mca_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache_shrink);
+ btree_cache.shrink);
+ struct btree_cache *bc = &c->btree_cache;
if (btree_shrinker_disabled(c))
return 0;
- return mca_can_free(c) * btree_pages(c);
+ return btree_cache_can_free(bc) * btree_pages(c);
}
-void bch2_fs_btree_exit(struct bch_fs *c)
+void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
struct btree *b;
unsigned i;
- if (c->btree_cache_shrink.list.next)
- unregister_shrinker(&c->btree_cache_shrink);
+ if (bc->shrink.list.next)
+ unregister_shrinker(&bc->shrink);
- mutex_lock(&c->btree_cache_lock);
+ mutex_lock(&bc->lock);
#ifdef CONFIG_BCACHEFS_DEBUG
if (c->verify_data)
- list_move(&c->verify_data->list, &c->btree_cache);
+ list_move(&c->verify_data->list, &bc->live);
kvpfree(c->verify_ondisk, btree_bytes(c));
#endif
for (i = 0; i < BTREE_ID_NR; i++)
if (c->btree_roots[i].b)
- list_add(&c->btree_roots[i].b->list, &c->btree_cache);
+ list_add(&c->btree_roots[i].b->list, &bc->live);
- list_splice(&c->btree_cache_freeable,
- &c->btree_cache);
+ list_splice(&bc->freeable, &bc->live);
- while (!list_empty(&c->btree_cache)) {
- b = list_first_entry(&c->btree_cache, struct btree, list);
+ while (!list_empty(&bc->live)) {
+ b = list_first_entry(&bc->live, struct btree, list);
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
clear_btree_node_dirty(b);
- mca_data_free(c, b);
+ btree_node_data_free(c, b);
}
- while (!list_empty(&c->btree_cache_freed)) {
- b = list_first_entry(&c->btree_cache_freed,
- struct btree, list);
+ while (!list_empty(&bc->freed)) {
+ b = list_first_entry(&bc->freed, struct btree, list);
list_del(&b->list);
kfree(b);
}
- mutex_unlock(&c->btree_cache_lock);
+ mutex_unlock(&bc->lock);
- if (c->btree_cache_table_init_done)
- rhashtable_destroy(&c->btree_cache_table);
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
}
-int bch2_fs_btree_init(struct bch_fs *c)
+int bch2_fs_btree_cache_init(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
unsigned i;
int ret;
- ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params);
+ ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
if (ret)
return ret;
- c->btree_cache_table_init_done = true;
+ bc->table_init_done = true;
bch2_recalc_btree_reserve(c);
- for (i = 0; i < c->btree_cache_reserve; i++)
- if (!mca_bucket_alloc(c, GFP_KERNEL))
+ for (i = 0; i < bc->reserve; i++)
+ if (!btree_node_mem_alloc(c, GFP_KERNEL))
return -ENOMEM;
- list_splice_init(&c->btree_cache,
- &c->btree_cache_freeable);
+ list_splice_init(&bc->live, &bc->freeable);
#ifdef CONFIG_BCACHEFS_DEBUG
mutex_init(&c->verify_lock);
@@ -376,42 +383,53 @@ int bch2_fs_btree_init(struct bch_fs *c)
if (!c->verify_ondisk)
return -ENOMEM;
- c->verify_data = mca_bucket_alloc(c, GFP_KERNEL);
+ c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
if (!c->verify_data)
return -ENOMEM;
list_del_init(&c->verify_data->list);
#endif
- c->btree_cache_shrink.count_objects = bch2_mca_count;
- c->btree_cache_shrink.scan_objects = bch2_mca_scan;
- c->btree_cache_shrink.seeks = 4;
- c->btree_cache_shrink.batch = btree_pages(c) * 2;
- register_shrinker(&c->btree_cache_shrink);
+ bc->shrink.count_objects = bch2_btree_cache_count;
+ bc->shrink.scan_objects = bch2_btree_cache_scan;
+ bc->shrink.seeks = 4;
+ bc->shrink.batch = btree_pages(c) * 2;
+ register_shrinker(&bc->shrink);
return 0;
}
+void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
+{
+ mutex_init(&bc->lock);
+ INIT_LIST_HEAD(&bc->live);
+ INIT_LIST_HEAD(&bc->freeable);
+ INIT_LIST_HEAD(&bc->freed);
+}
+
/*
* We can only have one thread cannibalizing other cached btree nodes at a time,
* or we'll deadlock. We use an open coded mutex to ensure that, which a
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-void bch2_btree_node_cannibalize_unlock(struct bch_fs *c)
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
- if (c->btree_cache_alloc_lock == current) {
+ struct btree_cache *bc = &c->btree_cache;
+
+ if (bc->alloc_lock == current) {
trace_btree_node_cannibalize_unlock(c);
- c->btree_cache_alloc_lock = NULL;
- closure_wake_up(&c->mca_wait);
+ bc->alloc_lock = NULL;
+ closure_wake_up(&bc->alloc_wait);
}
}
-int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
+ struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+ old = cmpxchg(&bc->alloc_lock, NULL, current);
if (old == NULL || old == current)
goto success;
@@ -420,13 +438,13 @@ int bch2_btree_node_cannibalize_lock(struct bch_fs *c, struct closure *cl)
return -ENOMEM;
}
- closure_wait(&c->mca_wait, cl);
+ closure_wait(&bc->alloc_wait, cl);
/* Try again, after adding ourselves to waitlist */
- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+ old = cmpxchg(&bc->alloc_lock, NULL, current);
if (old == NULL || old == current) {
/* We raced */
- closure_wake_up(&c->mca_wait);
+ closure_wake_up(&bc->alloc_wait);
goto success;
}
@@ -438,16 +456,17 @@ success:
return 0;
}
-static struct btree *mca_cannibalize(struct bch_fs *c)
+static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- list_for_each_entry_reverse(b, &c->btree_cache, list)
+ list_for_each_entry_reverse(b, &bc->live, list)
if (!btree_node_reclaim(c, b))
return b;
while (1) {
- list_for_each_entry_reverse(b, &c->btree_cache, list)
+ list_for_each_entry_reverse(b, &bc->live, list)
if (!btree_node_write_and_reclaim(c, b))
return b;
@@ -462,16 +481,17 @@ static struct btree *mca_cannibalize(struct bch_fs *c)
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
struct btree *b;
u64 start_time = local_clock();
- mutex_lock(&c->btree_cache_lock);
+ mutex_lock(&bc->lock);
/*
* btree_free() doesn't free memory; it sticks the node on the end of
* the list. Check if there's any freed nodes there:
*/
- list_for_each_entry(b, &c->btree_cache_freeable, list)
+ list_for_each_entry(b, &bc->freeable, list)
if (!btree_node_reclaim(c, b))
goto out_unlock;
@@ -479,9 +499,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
- list_for_each_entry(b, &c->btree_cache_freed, list)
+ list_for_each_entry(b, &bc->freed, list)
if (!btree_node_reclaim(c, b)) {
- mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
+ btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
if (b->data)
goto out_unlock;
@@ -490,7 +510,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
goto err;
}
- b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO);
+ b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
if (!b)
goto err;
@@ -501,7 +521,7 @@ out_unlock:
BUG_ON(btree_node_write_in_flight(b));
list_del_init(&b->list);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_unlock(&bc->lock);
out:
b->flags = 0;
b->written = 0;
@@ -517,18 +537,18 @@ out:
return b;
err:
/* Try to cannibalize another cached btree node: */
- if (c->btree_cache_alloc_lock == current) {
- b = mca_cannibalize(c);
+ if (bc->alloc_lock == current) {
+ b = btree_node_cannibalize(c);
list_del_init(&b->list);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_unlock(&bc->lock);
- bch2_btree_node_hash_remove(c, b);
+ bch2_btree_node_hash_remove(bc, b);
trace_btree_node_cannibalize(c);
goto out;
}
- mutex_unlock(&c->btree_cache_lock);
+ mutex_unlock(&bc->lock);
return ERR_PTR(-ENOMEM);
}
@@ -539,6 +559,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
unsigned level,
enum six_lock_type lock_type)
{
+ struct btree_cache *bc = &c->btree_cache;
struct btree *b;
/*
@@ -552,15 +573,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
return b;
bkey_copy(&b->key, k);
- if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) {
+ if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
/* raced with another fill: */
/* mark as unhashed... */
bkey_i_to_extent(&b->key)->v._data[0] = 0;
- mutex_lock(&c->btree_cache_lock);
- list_add(&b->list, &c->btree_cache_freeable);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_lock(&bc->lock);
+ list_add(&b->list, &bc->freeable);
+ mutex_unlock(&bc->lock);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
@@ -601,13 +622,14 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k, unsigned level,
enum six_lock_type lock_type)
{
+ struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
BUG_ON(level >= BTREE_MAX_DEPTH);
retry:
rcu_read_lock();
- b = mca_find(c, k);
+ b = btree_cache_find(bc, k);
rcu_read_unlock();
if (unlikely(!b)) {
@@ -755,12 +777,13 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
unsigned level, enum btree_id btree_id)
{
+ struct btree_cache *bc = &c->btree_cache;
struct btree *b;
BUG_ON(level >= BTREE_MAX_DEPTH);
rcu_read_lock();
- b = mca_find(c, k);
+ b = btree_cache_find(bc, k);
rcu_read_unlock();
if (b)
@@ -771,15 +794,15 @@ void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
return;
bkey_copy(&b->key, k);
- if (bch2_btree_node_hash_insert(c, b, level, btree_id)) {
+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
/* mark as unhashed... */
bkey_i_to_extent(&b->key)->v._data[0] = 0;
- mutex_lock(&c->btree_cache_lock);
- list_add(&b->list, &c->btree_cache_freeable);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_lock(&bc->lock);
+ list_add(&b->list, &bc->freeable);
+ mutex_unlock(&bc->lock);
goto out;
}
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 5e836acd..46d536eb 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -11,13 +11,13 @@ extern const char * const bch2_btree_ids[];
void bch2_recalc_btree_reserve(struct bch_fs *);
-void bch2_btree_node_hash_remove(struct bch_fs *, struct btree *);
-int __bch2_btree_node_hash_insert(struct bch_fs *, struct btree *);
-int bch2_btree_node_hash_insert(struct bch_fs *, struct btree *,
+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
-void bch2_btree_node_cannibalize_unlock(struct bch_fs *);
-int bch2_btree_node_cannibalize_lock(struct bch_fs *, struct closure *);
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
@@ -32,8 +32,9 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *,
unsigned, enum btree_id);
-void bch2_fs_btree_exit(struct bch_fs *);
-int bch2_fs_btree_init(struct bch_fs *);
+void bch2_fs_btree_cache_exit(struct bch_fs *);
+int bch2_fs_btree_cache_init(struct bch_fs *);
+void bch2_fs_btree_cache_init_early(struct btree_cache *);
#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index e5cc00cc..b0901965 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -278,9 +278,12 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct open_bucket *ob;
+ const struct open_bucket_ptr *ptr;
size_t i, j, iter;
unsigned ci;
+ down_write(&c->alloc_gc_lock);
+
for_each_member_device(ca, c, ci) {
spin_lock(&ca->freelist_lock);
@@ -291,21 +294,26 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
+ for (ptr = ca->open_buckets_partial;
+ ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr;
+ ptr++)
+ bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
+
spin_unlock(&ca->freelist_lock);
}
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
- const struct bch_extent_ptr *ptr;
-
- mutex_lock(&ob->lock);
+ spin_lock(&ob->lock);
open_bucket_for_each_ptr(ob, ptr) {
- ca = c->devs[ptr->dev];
- bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true);
+ ca = c->devs[ptr->ptr.dev];
+ bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
}
- mutex_unlock(&ob->lock);
+ spin_unlock(&ob->lock);
}
+
+ up_write(&c->alloc_gc_lock);
}
static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 507a6a9d..d50e9e8e 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1364,17 +1364,17 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
closure_init_stack(&cl);
do {
- ret = bch2_btree_node_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(c);
- bch2_btree_node_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(c);
BUG_ON(IS_ERR(b));
bkey_copy(&b->key, k);
- BUG_ON(bch2_btree_node_hash_insert(c, b, level, id));
+ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
bch2_btree_node_read(c, b, true);
six_unlock_write(&b->lock);
@@ -1844,8 +1844,8 @@ void bch2_btree_verify_flushed(struct bch_fs *c)
unsigned i;
rcu_read_lock();
- tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
- &c->btree_cache_table);
+ tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(b, pos, tbl, i, hash)
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index b1b62339..b0e64957 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -769,7 +769,7 @@ retry_all:
closure_init_stack(&cl);
do {
- ret = bch2_btree_node_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
closure_sync(&cl);
} while (ret);
}
@@ -817,7 +817,7 @@ retry:
ret = btree_iter_linked(iter) ? -EINTR : 0;
out:
- bch2_btree_node_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(c);
return ret;
io_error:
BUG_ON(ret != -EIO);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index c0c16205..8b4df034 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -130,6 +130,42 @@ struct btree {
#endif
};
+struct btree_cache {
+ struct rhashtable table;
+ bool table_init_done;
+ /*
+ * We never free a struct btree, except on shutdown - we just put it on
+ * the btree_cache_freed list and reuse it later. This simplifies the
+ * code, and it doesn't cost us much memory as the memory usage is
+ * dominated by buffers that hold the actual btree node data and those
+ * can be freed - and the number of struct btrees allocated is
+ * effectively bounded.
+ *
+ * btree_cache_freeable effectively is a small cache - we use it because
+ * high order page allocations can be rather expensive, and it's quite
+ * common to delete and allocate btree nodes in quick succession. It
+ * should never grow past ~2-3 nodes in practice.
+ */
+ struct mutex lock;
+ struct list_head live;
+ struct list_head freeable;
+ struct list_head freed;
+
+ /* Number of elements in live + freeable lists */
+ unsigned used;
+ unsigned reserve;
+ struct shrinker shrink;
+
+ /*
+ * If we need to allocate memory for a new btree node and that
+ * allocation fails, we can cannibalize another node in the btree cache
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
+ */
+ struct task_struct *alloc_lock;
+ struct closure_waitlist alloc_wait;
+};
+
#define BTREE_FLAG(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 922a4863..2efb01c1 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -237,11 +237,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
six_lock_write(&b->lock);
- bch2_btree_node_hash_remove(c, b);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_lock(&c->btree_cache_lock);
- list_move(&b->list, &c->btree_cache_freeable);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
/*
* By using six_unlock_write() directly instead of
@@ -339,11 +339,11 @@ retry:
bkey_extent_init(&tmp.k);
tmp.k.k.size = c->opts.btree_node_size,
- ob = bch2_alloc_sectors(c, &c->btree_write_point,
- bkey_i_to_extent(&tmp.k),
- res->nr_replicas,
- c->opts.metadata_replicas_required,
- alloc_reserve, cl);
+ ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0,
+ bkey_i_to_extent(&tmp.k),
+ res->nr_replicas,
+ c->opts.metadata_replicas_required,
+ alloc_reserve, 0, cl);
if (IS_ERR(ob))
return ERR_CAST(ob);
@@ -374,7 +374,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
b = as->reserve->b[--as->reserve->nr];
- BUG_ON(bch2_btree_node_hash_insert(c, b, level, as->btree_id));
+ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
set_btree_node_accessed(b);
set_btree_node_dirty(b);
@@ -515,7 +515,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
*/
- ret = bch2_btree_node_cannibalize_lock(c, cl);
+ ret = bch2_btree_cache_cannibalize_lock(c, cl);
if (ret) {
bch2_disk_reservation_put(c, &disk_res);
return ERR_PTR(ret);
@@ -543,11 +543,11 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
reserve->b[reserve->nr++] = b;
}
- bch2_btree_node_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(c);
return reserve;
err_free:
bch2_btree_reserve_put(c, reserve);
- bch2_btree_node_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(c);
trace_btree_reserve_get_fail(c, nr_nodes, cl);
return ERR_PTR(ret);
}
@@ -1015,9 +1015,9 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
/* Root nodes cannot be reaped */
- mutex_lock(&c->btree_cache_lock);
+ mutex_lock(&c->btree_cache.lock);
list_del_init(&b->list);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
btree_node_root(c, b) = b;
@@ -1802,7 +1802,7 @@ retry:
PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
/* bch2_btree_reserve_get will unlock */
do {
- ret = bch2_btree_node_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
closure_sync(&cl);
} while (ret == -EAGAIN);
@@ -1873,23 +1873,24 @@ retry:
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i);
- BUG_ON(bch2_btree_node_hash_insert(c, new_hash,
- b->level, b->btree_id));
+ ret = bch2_btree_node_hash_insert(&c->btree_cache,
+ new_hash, b->level, b->btree_id);
+ BUG_ON(ret);
}
bch2_btree_insert_node(as, parent, &iter,
&keylist_single(&new_key->k_i));
if (new_hash) {
- mutex_lock(&c->btree_cache_lock);
- bch2_btree_node_hash_remove(c, new_hash);
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
- bch2_btree_node_hash_remove(c, b);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, &new_key->k_i);
- ret = __bch2_btree_node_hash_insert(c, b);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_unlock(&c->btree_cache.lock);
} else {
bkey_copy(&b->key, &new_key->k_i);
}
@@ -1918,9 +1919,9 @@ retry:
bch2_btree_update_done(as);
out:
if (new_hash) {
- mutex_lock(&c->btree_cache_lock);
- list_move(&new_hash->list, &c->btree_cache_freeable);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&new_hash->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
six_unlock_write(&new_hash->lock);
six_unlock_intent(&new_hash->lock);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index fbc31012..6fdbb464 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -407,8 +407,11 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
- return sectors * crc_compressed_size(NULL, crc) /
- crc_uncompressed_size(NULL, crc);
+ if (!sectors)
+ return 0;
+
+ return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc),
+ crc_uncompressed_size(NULL, crc)));
}
/*
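
The __disk_sectors() hunk above changes the accounting for partially-overwritten compressed extents: the old expression could round a small but nonzero remainder down to zero sectors, while the new one rounds up and clamps to at least one. A standalone sketch of just that arithmetic, with plain integers standing in for the crc fields:

#include <assert.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define max(a, b)		((a) > (b) ? (a) : (b))

/* sectors of live data scaled by the extent's compression ratio */
static unsigned disk_sectors(unsigned compressed, unsigned uncompressed,
			     unsigned sectors)
{
	if (!sectors)
		return 0;

	return max(1U, DIV_ROUND_UP(sectors * compressed, uncompressed));
}

int main(void)
{
	/* old formula: 1 * 32 / 128 == 0 - a live extent accounted as nothing */
	assert(disk_sectors(32, 128, 1) == 1);
	/* still 0 when there is genuinely no data */
	assert(disk_sectors(32, 128, 0) == 0);
	return 0;
}
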
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index c8a03c7f..7b45bb78 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -25,7 +25,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
{
void *b;
- BUG_ON(size > c->sb.encoded_extent_max);
+ BUG_ON(size > c->sb.encoded_extent_max << 9);
b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
if (b)
@@ -164,8 +164,8 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
}
break;
case BCH_COMPRESSION_LZ4:
- ret = LZ4_decompress_safe(src_data.b, dst_data,
- src_len, dst_len);
+ ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
+ src_len, dst_len, dst_len);
if (ret != dst_len) {
ret = -EIO;
goto err;
@@ -269,7 +269,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret = -ENOMEM;
- if (crc_uncompressed_size(NULL, &crc) < c->sb.encoded_extent_max)
+ if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
+ crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
return -EIO;
dst_data = dst_len == dst_iter.bi_size
@@ -294,7 +295,7 @@ static int __bio_compress(struct bch_fs *c,
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
unsigned pad;
- int ret;
+ int ret = 0;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
@@ -307,23 +308,28 @@ static int __bio_compress(struct bch_fs *c,
void *workspace;
int len = src->bi_iter.bi_size;
- ret = 0;
-
workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
- while (len > block_bytes(c) &&
- (!(ret = LZ4_compress_destSize(
+ while (1) {
+ if (len <= block_bytes(c)) {
+ ret = 0;
+ break;
+ }
+
+ ret = LZ4_compress_destSize(
src_data.b, dst_data.b,
&len, dst->bi_iter.bi_size,
- workspace)) ||
- (len & (block_bytes(c) - 1)))) {
- /*
- * On error, the compressed data was bigger than
- * dst_len - round down to nearest block and try again:
- */
+ workspace);
+ if (ret >= len) {
+ /* uncompressible: */
+ ret = 0;
+ break;
+ }
+
+ if (!(len & (block_bytes(c) - 1)))
+ break;
len = round_down(len, block_bytes(c));
}
-
mempool_free(workspace, &c->lz4_workspace_pool);
if (!ret)
@@ -331,6 +337,7 @@ static int __bio_compress(struct bch_fs *c,
*src_len = len;
*dst_len = ret;
+ ret = 0;
break;
}
case BCH_COMPRESSION_GZIP: {
@@ -446,20 +453,22 @@ int bch2_check_set_has_compressed_data(struct bch_fs *c,
unsigned compression_type)
{
switch (compression_type) {
- case BCH_COMPRESSION_NONE:
+ case BCH_COMPRESSION_OPT_NONE:
return 0;
- case BCH_COMPRESSION_LZ4:
+ case BCH_COMPRESSION_OPT_LZ4:
if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
return 0;
bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
break;
- case BCH_COMPRESSION_GZIP:
+ case BCH_COMPRESSION_OPT_GZIP:
if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
return 0;
bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
break;
+ default:
+ BUG();
}
return bch2_fs_compress_init(c);
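
The reworked LZ4 branch above retries compression on progressively smaller, block-aligned input lengths and treats "output at least as large as the input consumed" as incompressible, returning 0 so the write falls back to an uncompressed extent. A rough user-space model of that control flow; compress() here is a toy 2:1 stand-in for LZ4_compress_destSize(), and BLOCK_BYTES replaces block_bytes(c):

#include <stdio.h>

#define BLOCK_BYTES	4096U

static unsigned round_down(unsigned n, unsigned align)
{
	return n - (n % align);
}

/* Stand-in for LZ4_compress_destSize(): pretends the data compresses 2:1.
 * Returns bytes of output produced; *len is updated to bytes consumed. */
static int compress(unsigned *len, unsigned dst_capacity)
{
	unsigned consumed = *len;
	unsigned produced = consumed / 2;

	if (produced > dst_capacity) {
		consumed = dst_capacity * 2;
		produced = dst_capacity;
	}
	*len = consumed;
	return produced;
}

/* Mirrors the loop structure: stop when only a single block remains, when the
 * data is incompressible, or when a whole number of blocks was consumed;
 * otherwise round down to a block boundary and retry. */
static int try_compress(unsigned *src_len, unsigned dst_capacity)
{
	unsigned len = *src_len;
	int ret;

	while (1) {
		if (len <= BLOCK_BYTES) {
			ret = 0;		/* give up, write uncompressed */
			break;
		}

		ret = compress(&len, dst_capacity);
		if ((unsigned) ret >= len) {
			ret = 0;		/* incompressible */
			break;
		}

		if (!(len & (BLOCK_BYTES - 1)))
			break;			/* block aligned: done */

		len = round_down(len, BLOCK_BYTES);
	}

	*src_len = len;
	return ret;				/* compressed bytes, or 0 */
}

int main(void)
{
	unsigned src_len = 9000;
	int dst_len = try_compress(&src_len, 65536);

	printf("consumed %u bytes -> %d compressed bytes\n", src_len, dst_len);
	return 0;
}
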
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 1937f4cb..7d2f5ccb 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -511,19 +511,19 @@ static void extent_pick_read_device(struct bch_fs *c,
struct bch_dev *ca = c->devs[ptr->dev];
if (ptr->cached && ptr_stale(ca, ptr))
- return;
+ continue;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
- return;
+ continue;
if (avoid && test_bit(ca->dev_idx, avoid->d))
- return;
+ continue;
if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
- return;
+ continue;
if (!percpu_ref_tryget(&ca->io_ref))
- return;
+ continue;
if (pick->ca)
percpu_ref_put(&pick->ca->io_ref);
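
The extent_pick_read_device() fix above is entirely the return -> continue change: a stale, failed, avoided or slower-tier pointer used to abort the scan, so later usable replicas were never considered. A schematic version of the selection loop, with an invented struct dev in place of bch_dev and the io_ref handling left out:

#include <stdbool.h>
#include <stdio.h>

struct dev {
	bool	stale;
	bool	failed;
	int	tier;		/* lower tier == faster */
};

/* Return the index of the best readable replica, or -1.
 * The point of the fix: unsuitable replicas are skipped with continue,
 * not treated as "no replica at all". */
static int pick_read_device(const struct dev *devs, int nr)
{
	int pick = -1;

	for (int i = 0; i < nr; i++) {
		if (devs[i].stale)
			continue;
		if (devs[i].failed)
			continue;
		if (pick >= 0 && devs[pick].tier < devs[i].tier)
			continue;	/* already have a faster tier */
		pick = i;
	}

	return pick;
}

int main(void)
{
	struct dev devs[] = {
		{ .stale = true,  .tier = 0 },	/* old code bailed out here */
		{ .stale = false, .tier = 1 },	/* new code still finds this */
	};

	printf("picked device %d\n", pick_read_device(devs, 2));
	return 0;
}
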
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 5eb62f9d..8b41be87 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -974,7 +974,8 @@ alloc_io:
(struct disk_reservation) {
.nr_replicas = c->opts.data_replicas,
},
- foreground_write_point(c, inode->ei_last_dirtied),
+ c->fastest_devs,
+ inode->ei_last_dirtied,
POS(inum, 0),
&inode->ei_journal_seq,
BCH_WRITE_THROTTLE);
@@ -1545,10 +1546,11 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
dio->iop.is_dio = true;
dio->iop.new_i_size = U64_MAX;
bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
- foreground_write_point(dio->c, (unsigned long) current),
- POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
- &inode->ei_journal_seq,
- flags|BCH_WRITE_THROTTLE);
+ dio->c->fastest_devs,
+ (unsigned long) dio->task,
+ POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
+ &inode->ei_journal_seq,
+ flags|BCH_WRITE_THROTTLE);
dio->iop.op.index_update_fn = bchfs_write_index_update;
dio->res.sectors -= bio_sectors(bio);
@@ -1568,13 +1570,13 @@ static void bch2_dio_write_loop_async(struct closure *cl)
bch2_dio_write_done(dio);
if (dio->iter.count && !dio->error) {
- use_mm(dio->mm);
+ use_mm(dio->task->mm);
pagecache_block_get(&mapping->add_lock);
bch2_do_direct_IO_write(dio);
pagecache_block_put(&mapping->add_lock);
- unuse_mm(dio->mm);
+ unuse_mm(dio->task->mm);
continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
} else {
@@ -1617,7 +1619,7 @@ static int bch2_direct_IO_write(struct bch_fs *c,
dio->offset = offset;
dio->iovec = NULL;
dio->iter = *iter;
- dio->mm = current->mm;
+ dio->task = current;
closure_init(&dio->cl, NULL);
if (offset + iter->count > inode->v.i_size)
diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h
index dfdc9b52..505cea73 100644
--- a/libbcachefs/fs-io.h
+++ b/libbcachefs/fs-io.h
@@ -74,7 +74,7 @@ struct dio_write {
struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
- struct mm_struct *mm;
+ struct task_struct *task;
/* must be last: */
struct bchfs_write_op iop;
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 946c75bb..e5fc72da 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -350,7 +350,7 @@ static void init_append_extent(struct bch_write_op *op,
bch2_keylist_push(&op->insert_keys);
}
-static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
{
struct bch_fs *c = op->c;
struct bio *orig = &op->wbio.bio;
@@ -371,7 +371,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
/* Need to decompress data? */
if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
(crc_uncompressed_size(NULL, &op->crc) != op->size ||
- crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) {
+ crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) {
int ret;
ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc);
@@ -389,7 +389,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
op->crc.nonce,
op->crc.csum,
op->crc.csum_type,
- ob);
+ wp->ob);
bio = orig;
wbio = wbio_init(bio);
@@ -398,7 +398,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
compression_type != BCH_COMPRESSION_NONE) {
/* all units here in bytes */
unsigned total_output = 0, output_available =
- min(ob->sectors_free << 9, orig->bi_iter.bi_size);
+ min(wp->sectors_free << 9, orig->bi_iter.bi_size);
unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type)
? op->nonce : 0;
struct bch_csum csum;
@@ -441,7 +441,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
init_append_extent(op,
dst_len >> 9, src_len >> 9,
fragment_compression_type,
- crc_nonce, csum, csum_type, ob);
+ crc_nonce, csum, csum_type, wp->ob);
total_output += dst_len;
bio_advance(bio, dst_len);
@@ -468,14 +468,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct open_bucket *ob)
more = orig->bi_iter.bi_size != 0;
} else {
- bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
+ bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO,
&c->bio_write);
wbio = wbio_init(bio);
wbio->put_bio = bio != orig;
init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
compression_type, 0,
- (struct bch_csum) { 0 }, csum_type, ob);
+ (struct bch_csum) { 0 }, csum_type, wp->ob);
more = bio != orig;
}
@@ -505,7 +505,8 @@ static void __bch2_write(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
unsigned open_bucket_nr = 0;
- struct open_bucket *b;
+ struct write_point *wp;
+ struct open_bucket *ob;
int ret;
do {
@@ -519,16 +520,19 @@ static void __bch2_write(struct closure *cl)
BKEY_EXTENT_U64s_MAX))
continue_at(cl, bch2_write_index, index_update_wq(op));
- b = bch2_alloc_sectors_start(c, op->wp,
+ wp = bch2_alloc_sectors_start(c, BCH_DATA_USER,
+ op->devs,
+ op->write_point,
op->nr_replicas,
c->opts.data_replicas_required,
op->alloc_reserve,
+ op->flags,
(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
- EBUG_ON(!b);
+ EBUG_ON(!wp);
- if (unlikely(IS_ERR(b))) {
- if (unlikely(PTR_ERR(b) != -EAGAIN)) {
- ret = PTR_ERR(b);
+ if (unlikely(IS_ERR(wp))) {
+ if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
+ ret = PTR_ERR(wp);
goto err;
}
@@ -561,13 +565,15 @@ static void __bch2_write(struct closure *cl)
continue;
}
- BUG_ON(b - c->open_buckets == 0 ||
- b - c->open_buckets > U8_MAX);
- op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
+ ob = wp->ob;
- ret = bch2_write_extent(op, b);
+ BUG_ON(ob - c->open_buckets == 0 ||
+ ob - c->open_buckets > U8_MAX);
+ op->open_buckets[open_bucket_nr++] = ob - c->open_buckets;
- bch2_alloc_sectors_done(c, op->wp, b);
+ ret = bch2_write_extent(op, wp);
+
+ bch2_alloc_sectors_done(c, wp);
if (ret < 0)
goto err;
@@ -704,7 +710,9 @@ void bch2_write(struct closure *cl)
void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct disk_reservation res,
- struct write_point *wp, struct bpos pos,
+ struct bch_devs_mask *devs,
+ unsigned long write_point,
+ struct bpos pos,
u64 *journal_seq, unsigned flags)
{
EBUG_ON(res.sectors && !res.nr_replicas);
@@ -723,7 +731,8 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->pos = pos;
op->version = ZERO_VERSION;
op->res = res;
- op->wp = wp;
+ op->devs = devs;
+ op->write_point = write_point;
if (journal_seq) {
op->journal_seq_p = journal_seq;
@@ -826,6 +835,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
* Adjust bio to correspond to _live_ portion of @k -
* which might be less than what we're actually reading:
*/
+ bio->bi_iter.bi_size = sectors << 9;
bio_advance(bio, pick->crc.offset << 9);
BUG_ON(bio_sectors(bio) < k.k->size);
bio->bi_iter.bi_size = k.k->size << 9;
@@ -836,7 +846,8 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
*/
op->write.op.pos.offset = iter.bi_sector;
}
- bch2_migrate_write_init(c, &op->write, &c->promote_write_point,
+ bch2_migrate_write_init(c, &op->write,
+ c->fastest_devs,
k, NULL,
BCH_WRITE_ALLOC_NOWAIT|
BCH_WRITE_CACHED);
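
One detail in the __bch2_write() hunk above: the open bucket obtained from the write point is remembered as an index into c->open_buckets (checked to be nonzero and to fit in a u8) rather than as a pointer, which is what lets bch_write_op keep a small u8 open_buckets[16] array (see the io_types.h hunk below). A hedged sketch of that indices-instead-of-pointers pattern, with invented names and sizes:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define NR_OPEN_BUCKETS	256	/* illustrative pool size */

struct open_bucket {
	uint64_t	offset;
	uint8_t		gen;
};

/* Slot 0 is reserved as "no bucket", matching the check that the computed
 * index is never 0 and fits in a u8. */
static struct open_bucket open_buckets[NR_OPEN_BUCKETS];

struct write_op {
	uint8_t		open_bucket_idx[16];	/* indices, not pointers */
	unsigned	nr_open_buckets;
};

static void write_op_add_bucket(struct write_op *op, struct open_bucket *ob)
{
	ptrdiff_t idx = ob - open_buckets;

	assert(idx > 0 && idx <= UINT8_MAX);	/* must encode in a u8 */
	op->open_bucket_idx[op->nr_open_buckets++] = (uint8_t) idx;
}

static struct open_bucket *write_op_bucket(struct write_op *op, unsigned i)
{
	return &open_buckets[op->open_bucket_idx[i]];
}

int main(void)
{
	struct write_op op = { .nr_open_buckets = 0 };

	write_op_add_bucket(&op, &open_buckets[3]);
	assert(write_op_bucket(&op, 0) == &open_buckets[3]);
	return 0;
}
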
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 674cdf7a..658c15a5 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -22,11 +22,12 @@ enum bch_write_flags {
BCH_WRITE_FLUSH = (1 << 2),
BCH_WRITE_DATA_COMPRESSED = (1 << 3),
BCH_WRITE_THROTTLE = (1 << 4),
+ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5),
/* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 5),
- BCH_WRITE_DONE = (1 << 6),
- BCH_WRITE_LOOPED = (1 << 7),
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
+ BCH_WRITE_DONE = (1 << 7),
+ BCH_WRITE_LOOPED = (1 << 8),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -35,15 +36,10 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
-static inline struct write_point *foreground_write_point(struct bch_fs *c,
- unsigned long v)
-{
- return c->write_points +
- hash_long(v, ilog2(ARRAY_SIZE(c->write_points)));
-}
-
void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
- struct disk_reservation, struct write_point *,
+ struct disk_reservation,
+ struct bch_devs_mask *,
+ unsigned long,
struct bpos, u64 *, unsigned);
void bch2_write(struct closure *);
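
The deleted foreground_write_point() helper above shows what the write-point argument used to mean: an arbitrary unsigned long (an inode's last dirtier, a task pointer) hashed into a fixed array of write points, so unrelated writers do not interleave their data on disk. Callers now hand that raw unsigned long to bch2_write_op_init() and the allocator presumably performs an equivalent lookup internally. A user-space sketch of the hashing idea, with a crude stand-in for hash_long() and an invented array size:

#include <stdio.h>

#define NR_WRITE_POINTS	32	/* must be a power of two for the mask below */

struct write_point {
	unsigned	sectors_free;
};

static struct write_point write_points[NR_WRITE_POINTS];

/* Crude stand-in for hash_long(): any reasonable mixing of the key works. */
static unsigned long mix(unsigned long v)
{
	return v ^ (v >> 7) ^ (v >> 17);
}

/* Writers that pass the same key (e.g. the same task pointer) keep getting
 * the same write point; different keys usually get different ones. */
static struct write_point *writepoint_hashed(unsigned long v)
{
	return &write_points[mix(v) & (NR_WRITE_POINTS - 1)];
}

int main(void)
{
	int a_key = 0, b_key = 0;

	printf("writer A -> wp %td\n",
	       writepoint_hashed((unsigned long) &a_key) - write_points);
	printf("writer B -> wp %td\n",
	       writepoint_hashed((unsigned long) &b_key) - write_points);
	return 0;
}
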
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index ae4f8f3c..f77106be 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -116,9 +116,10 @@ struct bch_write_op {
struct bch_extent_crc128 crc;
unsigned size;
- struct disk_reservation res;
+ struct bch_devs_mask *devs;
+ unsigned long write_point;
- struct write_point *wp;
+ struct disk_reservation res;
union {
u8 open_buckets[16];
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index c6659259..d7f27a3d 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -15,6 +15,7 @@
static int issue_migration_move(struct bch_dev *ca,
struct moving_context *ctxt,
+ struct bch_devs_mask *devs,
struct bkey_s_c k)
{
struct bch_fs *c = ca->fs;
@@ -33,7 +34,7 @@ static int issue_migration_move(struct bch_dev *ca,
found:
/* XXX: we need to be doing something with the disk reservation */
- ret = bch2_data_move(c, ctxt, &c->migration_write_point, k, ptr);
+ ret = bch2_data_move(c, ctxt, devs, k, ptr);
if (ret)
bch2_disk_reservation_put(c, &res);
return ret;
@@ -110,7 +111,7 @@ int bch2_move_data_off_device(struct bch_dev *ca)
ca->dev_idx))
goto next;
- ret = issue_migration_move(ca, &ctxt, k);
+ ret = issue_migration_move(ca, &ctxt, NULL, k);
if (ret == -ENOMEM) {
bch2_btree_iter_unlock(&iter);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index f78cd72f..0c5b924c 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -139,7 +139,7 @@ out:
void bch2_migrate_write_init(struct bch_fs *c,
struct migrate_write *m,
- struct write_point *wp,
+ struct bch_devs_mask *devs,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr,
unsigned flags)
@@ -155,8 +155,10 @@ void bch2_migrate_write_init(struct bch_fs *c,
(move_ptr && move_ptr->cached))
flags |= BCH_WRITE_CACHED;
- bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, wp,
- bkey_start_pos(k.k), NULL, flags);
+ bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 },
+ devs, (unsigned long) current,
+ bkey_start_pos(k.k), NULL,
+ flags|BCH_WRITE_ONLY_SPECIFIED_DEVS);
if (m->move)
m->op.alloc_reserve = RESERVE_MOVINGGC;
@@ -249,7 +251,7 @@ static void read_moving_endio(struct bio *bio)
int bch2_data_move(struct bch_fs *c,
struct moving_context *ctxt,
- struct write_point *wp,
+ struct bch_devs_mask *devs,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr)
{
@@ -280,7 +282,7 @@ int bch2_data_move(struct bch_fs *c,
migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
- bch2_migrate_write_init(c, &io->write, wp, k, move_ptr, 0);
+ bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0);
trace_move_read(&io->write.key.k);
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 71edcf13..a756a462 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -20,12 +20,9 @@ struct migrate_write {
struct bch_write_op op;
};
-void bch2_migrate_write_init(struct bch_fs *,
- struct migrate_write *,
- struct write_point *,
- struct bkey_s_c,
- const struct bch_extent_ptr *,
- unsigned);
+void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
+ struct bch_devs_mask *, struct bkey_s_c,
+ const struct bch_extent_ptr *, unsigned);
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
@@ -69,11 +66,9 @@ struct moving_io {
struct bio_vec bi_inline_vecs[0];
};
-int bch2_data_move(struct bch_fs *,
- struct moving_context *,
- struct write_point *,
- struct bkey_s_c,
- const struct bch_extent_ptr *);
+int bch2_data_move(struct bch_fs *, struct moving_context *,
+ struct bch_devs_mask *, struct bkey_s_c,
+ const struct bch_extent_ptr *);
int bch2_move_ctxt_wait(struct moving_context *);
void bch2_move_ctxt_wait_for_io(struct moving_context *);
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 72cbb9d5..125159ee 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -14,6 +14,7 @@
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
+#include "super-io.h"
#include <trace/events/bcachefs.h>
#include <linux/freezer.h>
@@ -72,7 +73,7 @@ static int issue_moving_gc_move(struct bch_dev *ca,
if (!ptr) /* We raced - bucket's been reused */
return 0;
- ret = bch2_data_move(c, ctxt, &ca->copygc_write_point, k, ptr);
+ ret = bch2_data_move(c, ctxt, &ca->self, k, ptr);
if (!ret)
trace_gc_copy(k.k);
else
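
Copygc no longer owns a per-device write point; it simply constrains allocation with &ca->self, a device mask in which only that device's bit is set (super.c below keeps the __set_bit(ca->dev_idx, ca->self.d) setup). A small sketch of such a single-device mask, with invented sizes and helper names:

#include <assert.h>
#include <string.h>

#define MAX_DEVS	64
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

struct devs_mask {
	unsigned long d[(MAX_DEVS + BITS_PER_LONG - 1) / BITS_PER_LONG];
};

static void devs_set(struct devs_mask *m, unsigned dev)
{
	m->d[dev / BITS_PER_LONG] |= 1UL << (dev % BITS_PER_LONG);
}

static int devs_test(const struct devs_mask *m, unsigned dev)
{
	return (m->d[dev / BITS_PER_LONG] >> (dev % BITS_PER_LONG)) & 1;
}

int main(void)
{
	struct devs_mask self;
	unsigned dev_idx = 3;

	/* per-device "self" mask: exactly one bit set, so any allocation
	 * filtered through it can only land on that device */
	memset(&self, 0, sizeof(self));
	devs_set(&self, dev_idx);

	assert(devs_test(&self, dev_idx));
	assert(!devs_test(&self, 0));
	return 0;
}
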
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index dfb95d0d..0342778d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -376,7 +376,7 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
bch2_fs_encryption_exit(c);
- bch2_fs_btree_exit(c);
+ bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
@@ -491,7 +491,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
mutex_init(&c->replicas_gc_lock);
- mutex_init(&c->btree_cache_lock);
mutex_init(&c->bucket_lock);
mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
@@ -507,9 +506,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_tiering_init(c);
INIT_LIST_HEAD(&c->list);
- INIT_LIST_HEAD(&c->btree_cache);
- INIT_LIST_HEAD(&c->btree_cache_freeable);
- INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->btree_interior_update_list);
mutex_init(&c->btree_reserve_cache_lock);
@@ -546,6 +542,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.blocked_time = &c->journal_blocked_time;
c->journal.flush_seq_time = &c->journal_flush_seq_time;
+ bch2_fs_btree_cache_init_early(&c->btree_cache);
+
mutex_lock(&c->sb_lock);
if (bch2_sb_to_fs(c, sb)) {
@@ -599,7 +597,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
- bch2_fs_btree_init(c) ||
+ bch2_fs_btree_cache_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
bch2_check_set_has_compressed_data(c, c->opts.compression))
@@ -1107,8 +1105,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
- ca->copygc_write_point.type = BCH_DATA_USER;
-
spin_lock_init(&ca->freelist_lock);
bch2_dev_moving_gc_init(ca);
@@ -1169,8 +1165,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
for (i = 0; i < RESERVE_NR; i++)
total_reserve += ca->free[i].size;
- ca->copygc_write_point.group = &ca->self;
-
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 07d9be75..c20769b7 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -209,11 +209,11 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
size_t ret = 0;
struct btree *b;
- mutex_lock(&c->btree_cache_lock);
- list_for_each_entry(b, &c->btree_cache, list)
+ mutex_lock(&c->btree_cache.lock);
+ list_for_each_entry(b, &c->btree_cache.live, list)
ret += btree_bytes(c);
- mutex_unlock(&c->btree_cache_lock);
+ mutex_unlock(&c->btree_cache.lock);
return ret;
}
@@ -436,7 +436,7 @@ STORE(__bch2_fs)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache_shrink.scan_objects(&c->btree_cache_shrink, &sc);
+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
}
return size;
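
The sysfs store hook above drives the btree cache shrinker by hand: it fills a shrink_control with a user-supplied nr_to_scan and invokes the cache's own scan_objects callback. The shape is roughly as follows; the types here are user-space stand-ins, since the kernel's struct shrinker and shrink_control carry considerably more state:

#include <stdio.h>
#include <stdlib.h>

struct shrink_control {
	unsigned long	nr_to_scan;
};

struct shrinker {
	/* returns how many objects were actually freed */
	unsigned long	(*scan_objects)(struct shrinker *,
					struct shrink_control *);
	void		*private;
};

struct btree_cache {
	unsigned	used;		/* live + freeable nodes */
	unsigned	reserve;	/* never shrink below this */
	struct shrinker	shrink;
};

static unsigned long btree_cache_scan(struct shrinker *s,
				      struct shrink_control *sc)
{
	struct btree_cache *bc = s->private;
	unsigned long can_free = bc->used > bc->reserve
		? bc->used - bc->reserve : 0;
	unsigned long freed = sc->nr_to_scan < can_free
		? sc->nr_to_scan : can_free;

	bc->used -= freed;
	return freed;
}

int main(int argc, char **argv)
{
	struct btree_cache bc = { .used = 100, .reserve = 8 };
	struct shrink_control sc = {
		.nr_to_scan = argc > 1 ? strtoul(argv[1], NULL, 10) : 10,
	};
	unsigned long freed;

	bc.shrink.scan_objects	= btree_cache_scan;
	bc.shrink.private	= &bc;

	/* same shape as the sysfs store hook: call the callback directly */
	freed = bc.shrink.scan_objects(&bc.shrink, &sc);
	printf("freed %lu nodes, %u left\n", freed, bc.used);
	return 0;
}
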
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index b68cae75..cbfcfccc 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -54,7 +54,7 @@ static int issue_tiering_move(struct bch_fs *c,
{
int ret;
- ret = bch2_data_move(c, ctxt, &tier->wp, k, NULL);
+ ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL);
if (!ret)
trace_tiering_copy(k.k);
else
@@ -241,6 +241,5 @@ void bch2_fs_tiering_init(struct bch_fs *c)
for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
c->tiers[i].idx = i;
bch2_pd_controller_init(&c->tiers[i].pd);
- c->tiers[i].wp.group = &c->tiers[i].devs;
}
}