author     Kent Overstreet <kent.overstreet@gmail.com>  2018-04-10 19:19:09 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>  2018-04-10 19:23:58 -0400
commit     c598d91dcb0c7e95abdacb2711898ae14ab52ca1
tree       645b7838f62826547ea0c830738a88061827c698
parent     ff5e165532a2eed87700649d03f91a612a58e92a
Update bcachefs sources to edf5f38218 bcachefs: Refactor superblock code
Diffstat (limited to 'libbcachefs')
-rw-r--r--  libbcachefs/alloc.c                   397
-rw-r--r--  libbcachefs/alloc.h                    12
-rw-r--r--  libbcachefs/alloc_types.h               5
-rw-r--r--  libbcachefs/bcachefs.h                 11
-rw-r--r--  libbcachefs/bcachefs_format.h           5
-rw-r--r--  libbcachefs/bkey_methods.c             26
-rw-r--r--  libbcachefs/bkey_methods.h              2
-rw-r--r--  libbcachefs/btree_gc.c                 14
-rw-r--r--  libbcachefs/btree_iter.c               21
-rw-r--r--  libbcachefs/btree_iter.h               30
-rw-r--r--  libbcachefs/btree_types.h               2
-rw-r--r--  libbcachefs/btree_update_interior.c    14
-rw-r--r--  libbcachefs/btree_update_interior.h     2
-rw-r--r--  libbcachefs/btree_update_leaf.c        12
-rw-r--r--  libbcachefs/buckets.c                  51
-rw-r--r--  libbcachefs/buckets.h                   8
-rw-r--r--  libbcachefs/buckets_types.h             7
-rw-r--r--  libbcachefs/chardev.c                  20
-rw-r--r--  libbcachefs/checksum.c                 16
-rw-r--r--  libbcachefs/checksum.h                  1
-rw-r--r--  libbcachefs/clock_types.h               2
-rw-r--r--  libbcachefs/compress.c                  2
-rw-r--r--  libbcachefs/debug.c                    13
-rw-r--r--  libbcachefs/dirent.c                   12
-rw-r--r--  libbcachefs/dirent.h                    9
-rw-r--r--  libbcachefs/disk_groups.c             462
-rw-r--r--  libbcachefs/disk_groups.h              99
-rw-r--r--  libbcachefs/extents.c                 230
-rw-r--r--  libbcachefs/extents.h                  33
-rw-r--r--  libbcachefs/fs-io.c                     5
-rw-r--r--  libbcachefs/inode.c                    12
-rw-r--r--  libbcachefs/inode.h                     8
-rw-r--r--  libbcachefs/io.c                       91
-rw-r--r--  libbcachefs/io.h                        2
-rw-r--r--  libbcachefs/journal.c                 143
-rw-r--r--  libbcachefs/journal.h                   2
-rw-r--r--  libbcachefs/migrate.c                   1
-rw-r--r--  libbcachefs/move.c                      1
-rw-r--r--  libbcachefs/movinggc.c                 19
-rw-r--r--  libbcachefs/opts.c                      1
-rw-r--r--  libbcachefs/quota.c                    45
-rw-r--r--  libbcachefs/quota.h                    11
-rw-r--r--  libbcachefs/replicas.c                698
-rw-r--r--  libbcachefs/replicas.h                 51
-rw-r--r--  libbcachefs/super-io.c               1195
-rw-r--r--  libbcachefs/super-io.h                148
-rw-r--r--  libbcachefs/super.c                   487
-rw-r--r--  libbcachefs/super.h                     1
-rw-r--r--  libbcachefs/super_types.h               6
-rw-r--r--  libbcachefs/sysfs.c                   111
-rw-r--r--  libbcachefs/tier.c                      1
-rw-r--r--  libbcachefs/xattr.c                    12
-rw-r--r--  libbcachefs/xattr.h                     9
53 files changed, 2460 insertions, 2118 deletions
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index ede44f73..16bdc48c 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -58,11 +58,13 @@
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "debug.h"
+#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "io.h"
@@ -79,7 +81,7 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int);
+static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
/* Ratelimiting/PD controllers */
@@ -130,8 +132,7 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-static const char *bch2_alloc_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
@@ -152,8 +153,8 @@ static const char *bch2_alloc_invalid(const struct bch_fs *c,
return NULL;
}
-static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_alloc_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
buf[0] = '\0';
@@ -163,11 +164,6 @@ static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
}
}
-const struct bkey_ops bch2_bkey_alloc_ops = {
- .key_invalid = bch2_alloc_invalid,
- .val_to_text = bch2_alloc_to_text,
-};
-
static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
{
unsigned v;
@@ -236,9 +232,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
d = a.v->data;
if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
- g->prio[READ] = get_alloc_field(&d, 2);
+ g->io_time[READ] = get_alloc_field(&d, 2);
if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
- g->prio[WRITE] = get_alloc_field(&d, 2);
+ g->io_time[WRITE] = get_alloc_field(&d, 2);
lg_local_unlock(&c->usage_lock);
}
@@ -270,21 +266,21 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
- mutex_lock(&c->prio_clock[READ].lock);
+ mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
- bch2_recalc_min_prio(c, ca, READ);
+ bch2_recalc_oldest_io(c, ca, READ);
up_read(&ca->bucket_lock);
}
- mutex_unlock(&c->prio_clock[READ].lock);
+ mutex_unlock(&c->bucket_clock[READ].lock);
- mutex_lock(&c->prio_clock[WRITE].lock);
+ mutex_lock(&c->bucket_clock[WRITE].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
- bch2_recalc_min_prio(c, ca, WRITE);
+ bch2_recalc_oldest_io(c, ca, WRITE);
up_read(&ca->bucket_lock);
}
- mutex_unlock(&c->prio_clock[WRITE].lock);
+ mutex_unlock(&c->bucket_clock[WRITE].lock);
return 0;
}
@@ -320,9 +316,9 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
d = a->v.data;
if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
- put_alloc_field(&d, 2, g->prio[READ]);
+ put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
- put_alloc_field(&d, 2, g->prio[WRITE]);
+ put_alloc_field(&d, 2, g->io_time[WRITE]);
lg_local_unlock(&c->usage_lock);
ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
@@ -395,38 +391,34 @@ int bch2_alloc_write(struct bch_fs *c)
/* Bucket IO clocks: */
-static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
+static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
- struct prio_clock *clock = &c->prio_clock[rw];
+ struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
- u16 max_delta = 1;
+ u16 max_last_io = 0;
unsigned i;
- lockdep_assert_held(&c->prio_clock[rw].lock);
+ lockdep_assert_held(&c->bucket_clock[rw].lock);
- /* Determine min prio for this particular device */
+ /* Recalculate max_last_io for this device: */
for_each_bucket(g, buckets)
- max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
+ max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
- ca->min_prio[rw] = clock->hand - max_delta;
+ ca->max_last_bucket_io[rw] = max_last_io;
- /*
- * This may possibly increase the min prio for the whole device, check
- * that as well.
- */
- max_delta = 1;
+ /* Recalculate global max_last_io: */
+ max_last_io = 0;
for_each_member_device(ca, c, i)
- max_delta = max(max_delta,
- (u16) (clock->hand - ca->min_prio[rw]));
+ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
- clock->min_prio = clock->hand - max_delta;
+ clock->max_last_io = max_last_io;
}
-static void bch2_rescale_prios(struct bch_fs *c, int rw)
+static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
- struct prio_clock *clock = &c->prio_clock[rw];
+ struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets;
struct bch_dev *ca;
struct bucket *g;
@@ -439,10 +431,10 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
- g->prio[rw] = clock->hand -
- (clock->hand - g->prio[rw]) / 2;
+ g->io_time[rw] = clock->hand -
+ bucket_last_io(c, g, rw) / 2;
- bch2_recalc_min_prio(c, ca, rw);
+ bch2_recalc_oldest_io(c, ca, rw);
up_read(&ca->bucket_lock);
}
@@ -450,19 +442,26 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
static void bch2_inc_clock_hand(struct io_timer *timer)
{
- struct prio_clock *clock = container_of(timer,
- struct prio_clock, rescale);
+ struct bucket_clock *clock = container_of(timer,
+ struct bucket_clock, rescale);
struct bch_fs *c = container_of(clock,
- struct bch_fs, prio_clock[clock->rw]);
+ struct bch_fs, bucket_clock[clock->rw]);
+ struct bch_dev *ca;
u64 capacity;
+ unsigned i;
mutex_lock(&clock->lock);
- clock->hand++;
-
/* if clock cannot be advanced more, rescale prio */
- if (clock->hand == (u16) (clock->min_prio - 1))
- bch2_rescale_prios(c, clock->rw);
+ if (clock->max_last_io >= U16_MAX - 2)
+ bch2_rescale_bucket_io_times(c, clock->rw);
+
+ BUG_ON(clock->max_last_io >= U16_MAX - 2);
+
+ for_each_member_device(ca, c, i)
+ ca->max_last_bucket_io[clock->rw]++;
+ clock->max_last_io++;
+ clock->hand++;
mutex_unlock(&clock->lock);
@@ -484,9 +483,9 @@ static void bch2_inc_clock_hand(struct io_timer *timer)
bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}
-static void bch2_prio_timer_init(struct bch_fs *c, int rw)
+static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
- struct prio_clock *clock = &c->prio_clock[rw];
+ struct bucket_clock *clock = &c->bucket_clock[rw];
clock->hand = 1;
clock->rw = rw;
@@ -536,7 +535,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
- ret = -1;
+ ret = 1;
break;
}
@@ -635,13 +634,14 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark m)
{
+ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
+ unsigned max_last_io = ca->max_last_bucket_io[READ];
+
/*
* Time since last read, scaled to [0, 8) where larger value indicates
* more recently read data:
*/
- unsigned long hotness =
- (bucket(ca, b)->prio[READ] - ca->min_prio[READ]) * 7 /
- (c->prio_clock[READ].hand - ca->min_prio[READ]);
+ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
/* How much we want to keep the data in this bucket: */
unsigned long data_wantness =
@@ -659,23 +659,25 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
struct alloc_heap_entry l,
struct alloc_heap_entry r)
{
- return (l.key > r.key) - (l.key < r.key);
+ return (l.key > r.key) - (l.key < r.key) ?:
+ (l.nr < r.nr) - (l.nr > r.nr) ?:
+ (l.bucket > r.bucket) - (l.bucket < r.bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
- struct alloc_heap_entry e;
+ struct alloc_heap_entry e = { 0 };
size_t b;
ca->alloc_heap.used = 0;
- mutex_lock(&c->prio_clock[READ].lock);
+ mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
- bch2_recalc_min_prio(c, ca, READ);
+ bch2_recalc_oldest_io(c, ca, READ);
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@@ -684,30 +686,45 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+ unsigned long key = bucket_sort_key(c, ca, b, m);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .key = bucket_sort_key(c, ca, b, m)
- };
+ if (e.nr && e.bucket + e.nr == b && e.key == key) {
+ e.nr++;
+ } else {
+ if (e.nr)
+ heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+
+ e = (struct alloc_heap_entry) {
+ .bucket = b,
+ .nr = 1,
+ .key = key,
+ };
+ }
- heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+ cond_resched();
}
+ if (e.nr)
+ heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+
up_read(&ca->bucket_lock);
- mutex_unlock(&c->prio_clock[READ].lock);
+ mutex_unlock(&c->bucket_clock[READ].lock);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
- /*
- * If we run out of buckets to invalidate, bch2_allocator_thread() will
- * kick stuff and retry us
- */
- while (!fifo_full(&ca->free_inc) &&
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
- bch2_invalidate_one_bucket(c, ca, e.bucket);
+ while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
+ for (b = e.bucket;
+ b < e.bucket + e.nr;
+ b++) {
+ if (fifo_full(&ca->free_inc))
+ return;
+
+ bch2_invalidate_one_bucket(c, ca, b);
+ }
+ }
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -729,6 +746,8 @@ static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
+
+ cond_resched();
}
}
@@ -749,6 +768,8 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
+
+ cond_resched();
}
}
@@ -850,7 +871,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
if ((current->flags & PF_KTHREAD) &&
kthread_should_stop()) {
- ret = -1;
+ ret = 1;
break;
}
@@ -880,7 +901,7 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
ca->mi.bucket_size, GFP_NOIO, 0);
if (push_invalidated_bucket(c, ca, bucket))
- return -1;
+ return 1;
}
return 0;
@@ -905,17 +926,32 @@ static int bch2_allocator_thread(void *arg)
while (1) {
while (1) {
+ cond_resched();
+
+ pr_debug("discarding %zu invalidated buckets",
+ ca->nr_invalidated);
+
ret = discard_invalidated_buckets(c, ca);
if (ret)
- return 0;
+ goto stop;
if (fifo_empty(&ca->free_inc))
break;
+ pr_debug("invalidating %zu buckets",
+ fifo_used(&ca->free_inc));
+
journal_seq = 0;
ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX);
- if (ret)
- return 0;
+ if (ret) {
+ bch_err(ca, "error invalidating buckets: %i", ret);
+ goto stop;
+ }
+
+ if (!ca->nr_invalidated) {
+ bch_err(ca, "allocator thread unable to make forward progress!");
+ goto stop;
+ }
if (ca->allocator_invalidating_data)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
@@ -927,22 +963,29 @@ static int bch2_allocator_thread(void *arg)
* journal error - buckets haven't actually been
* invalidated, can't discard them:
*/
- if (ret)
- return 0;
+ if (ret) {
+ bch_err(ca, "journal error: %i", ret);
+ goto stop;
+ }
}
+ pr_debug("free_inc now empty");
+
/* Reset front/back so we can easily sort fifo entries later: */
ca->free_inc.front = ca->free_inc.back = 0;
ca->allocator_journal_seq_flush = 0;
ca->allocator_invalidating_data = false;
down_read(&c->gc_lock);
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
- up_read(&c->gc_lock);
- return 0;
- }
-
while (1) {
+ size_t prev = fifo_used(&ca->free_inc);
+
+ if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
+ up_read(&c->gc_lock);
+ bch_err(ca, "gc failure");
+ goto stop;
+ }
+
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@@ -950,7 +993,14 @@ static int bch2_allocator_thread(void *arg)
* another cache tier
*/
+ pr_debug("scanning for reclaimable buckets");
+
find_reclaimable_buckets(c, ca);
+
+ pr_debug("found %zu buckets (free_inc %zu/%zu)",
+ fifo_used(&ca->free_inc) - prev,
+ fifo_used(&ca->free_inc), ca->free_inc.size);
+
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
@@ -977,15 +1027,20 @@ static int bch2_allocator_thread(void *arg)
ca->allocator_blocked = true;
closure_wake_up(&c->freelist_wait);
- if (wait_buckets_available(c, ca)) {
+ ret = wait_buckets_available(c, ca);
+ if (ret) {
up_read(&c->gc_lock);
- return 0;
+ goto stop;
}
}
ca->allocator_blocked = false;
up_read(&c->gc_lock);
+ pr_debug("free_inc now %zu/%zu",
+ fifo_used(&ca->free_inc),
+ ca->free_inc.size);
+
sort_free_inc(c, ca);
/*
@@ -993,6 +1048,10 @@ static int bch2_allocator_thread(void *arg)
* write out the new bucket gens:
*/
}
+
+stop:
+ pr_debug("alloc thread stopping (ret %i)", ret);
+ return 0;
}
/* Allocation */
@@ -1046,8 +1105,8 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
return ob;
}
-/* _only_ for allocating the journal and btree roots on a brand new fs: */
-int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
+/* _only_ for allocating the journal on a new device: */
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
struct bucket_array *buckets;
ssize_t b;
@@ -1056,14 +1115,8 @@ int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
buckets = bucket_array(ca);
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
- if (is_available_bucket(buckets->b[b].mark)) {
- bch2_mark_alloc_bucket(c, ca, b, true,
- gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
- BCH_BUCKET_MARK_GC_LOCK_HELD);
- set_bit(b, ca->buckets_dirty);
+ if (is_available_bucket(buckets->b[b].mark))
goto success;
- }
b = -1;
success:
rcu_read_unlock();
@@ -1135,9 +1188,8 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
break;
}
- if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) &&
- (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0)
- goto out;
+ if (cl)
+ closure_wait(&c->freelist_wait, cl);
spin_unlock(&c->freelist_lock);
@@ -1218,7 +1270,7 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
*v = *v < scale ? 0 : *v - scale;
}
-static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
+static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
struct write_point *wp,
unsigned nr_replicas,
enum alloc_reserve reserve,
@@ -1284,52 +1336,22 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
break;
}
}
+ rcu_read_unlock();
EBUG_ON(reserve == RESERVE_MOVINGGC &&
ret != ALLOC_SUCCESS &&
ret != OPEN_BUCKETS_EMPTY);
- rcu_read_unlock();
- return ret;
-}
-
-static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
- unsigned nr_replicas,
- enum alloc_reserve reserve,
- struct bch_devs_mask *devs,
- struct closure *cl)
-{
- bool waiting = false;
-
- while (1) {
- switch (__bch2_bucket_alloc_set(c, wp, nr_replicas,
- reserve, devs, cl)) {
- case ALLOC_SUCCESS:
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-
- return 0;
-
- case NO_DEVICES:
- if (waiting)
- closure_wake_up(&c->freelist_wait);
- return -EROFS;
-
- case FREELIST_EMPTY:
- if (!cl)
- return -ENOSPC;
- if (waiting)
- return -EAGAIN;
-
- /* Retry allocation after adding ourself to waitlist: */
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- break;
- case OPEN_BUCKETS_EMPTY:
- return cl ? -EAGAIN : -ENOSPC;
- default:
- BUG();
- }
+ switch (ret) {
+ case ALLOC_SUCCESS:
+ return 0;
+ case NO_DEVICES:
+ return -EROFS;
+ case FREELIST_EMPTY:
+ case OPEN_BUCKETS_EMPTY:
+ return cl ? -EAGAIN : -ENOSPC;
+ default:
+ BUG();
}
}
@@ -1530,11 +1552,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
nr_ptrs_have = wp->first_ptr;
/* does writepoint have ptrs we don't want to use? */
- writepoint_for_each_ptr(wp, ob, i)
- if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
- swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
- wp->first_ptr++;
- }
+ if (target)
+ writepoint_for_each_ptr(wp, ob, i)
+ if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
+ swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
+ wp->first_ptr++;
+ }
if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) {
ret = open_bucket_add_buckets(c, target, wp, devs_have,
@@ -1551,7 +1574,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
nr_replicas, reserve, cl);
}
- if (ret)
+ if (ret && ret != -EROFS)
goto err;
alloc_done:
/* check for more than one cache: */
@@ -1584,6 +1607,13 @@ alloc_done:
nr_ptrs_effective += ca->mi.durability;
}
+ if (ret == -EROFS &&
+ nr_ptrs_effective >= nr_replicas_required)
+ ret = 0;
+
+ if (ret)
+ goto err;
+
if (nr_ptrs_effective > nr_replicas) {
writepoint_for_each_ptr(wp, ob, i) {
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
@@ -1749,14 +1779,14 @@ void bch2_recalc_capacity(struct bch_fs *c)
if (c->capacity) {
bch2_io_timer_add(&c->io_clock[READ],
- &c->prio_clock[READ].rescale);
+ &c->bucket_clock[READ].rescale);
bch2_io_timer_add(&c->io_clock[WRITE],
- &c->prio_clock[WRITE].rescale);
+ &c->bucket_clock[WRITE].rescale);
} else {
bch2_io_timer_del(&c->io_clock[READ],
- &c->prio_clock[READ].rescale);
+ &c->bucket_clock[READ].rescale);
bch2_io_timer_del(&c->io_clock[WRITE],
- &c->prio_clock[WRITE].rescale);
+ &c->bucket_clock[WRITE].rescale);
}
/* Wake up in case someone was waiting for buckets */
@@ -1889,7 +1919,8 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
if (ca->alloc_thread)
return 0;
- p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
+ p = kthread_create(bch2_allocator_thread, ca,
+ "bch_alloc[%s]", ca->name);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -1923,7 +1954,7 @@ static void allocator_start_issue_discards(struct bch_fs *c)
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
- size_t bu, i, devs_have_enough = 0;
+ size_t bu, i;
unsigned dev_iter;
u64 journal_seq = 0;
bool invalidating_data = false;
@@ -1964,16 +1995,21 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
/* did we find enough buckets? */
for_each_rw_member(ca, c, dev_iter)
- devs_have_enough += (fifo_used(&ca->free_inc) >=
- ca->free[RESERVE_BTREE].size);
+ if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
+ percpu_ref_put(&ca->io_ref);
+ goto not_enough;
+ }
- if (devs_have_enough >= c->opts.metadata_replicas)
- return 0;
+ return 0;
+not_enough:
+ pr_debug("did not find enough empty buckets; issuing discards");
/* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
for_each_rw_member(ca, c, dev_iter)
discard_invalidated_buckets(c, ca);
+ pr_debug("scanning for reclaimable buckets");
+
for_each_rw_member(ca, c, dev_iter) {
BUG_ON(!fifo_empty(&ca->free_inc));
ca->free_inc.front = ca->free_inc.back = 0;
@@ -1988,6 +2024,8 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
break;
}
+ pr_debug("done scanning for reclaimable buckets");
+
/*
* We're moving buckets to freelists _before_ they've been marked as
* invalidated on disk - we have to so that we can allocate new btree
@@ -1997,10 +2035,13 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
* have cached data in them, which is live until they're marked as
* invalidated on disk:
*/
- if (invalidating_data)
+ if (invalidating_data) {
+ pr_debug("invalidating existing data");
set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
- else
+ } else {
+ pr_debug("issuing discards");
allocator_start_issue_discards(c);
+ }
/*
* XXX: it's possible for this to deadlock waiting on journal reclaim,
@@ -2017,13 +2058,15 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
}
if (invalidating_data) {
+ pr_debug("flushing journal");
+
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
if (ret)
return ret;
- }
- if (invalidating_data)
+ pr_debug("issuing discards");
allocator_start_issue_discards(c);
+ }
for_each_rw_member(ca, c, dev_iter)
while (ca->nr_invalidated) {
@@ -2038,19 +2081,43 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
+ bool flush_updates;
+ size_t nr_pending_updates;
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
+ pr_debug("flushing dirty btree nodes");
+ cond_resched();
+
+ flush_updates = false;
+ nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
+
+
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_dirty(b) && (!b->written || b->level)) {
- rcu_read_unlock();
- six_lock_read(&b->lock);
- bch2_btree_node_write(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- goto again;
+ if (btree_node_may_write(b)) {
+ rcu_read_unlock();
+ six_lock_read(&b->lock);
+ bch2_btree_node_write(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->lock);
+ goto again;
+ } else {
+ flush_updates = true;
+ }
}
rcu_read_unlock();
+
+ /*
+ * This is ugly, but it's needed to flush btree node writes
+ * without spinning...
+ */
+ if (flush_updates) {
+ closure_wait_event(&c->btree_interior_update_wait,
+ bch2_btree_interior_updates_nr_pending(c) <
+ nr_pending_updates);
+ goto again;
+ }
}
return 0;
@@ -2087,8 +2154,8 @@ void bch2_fs_allocator_init(struct bch_fs *c)
mutex_init(&c->write_points_hash_lock);
spin_lock_init(&c->freelist_lock);
- bch2_prio_timer_init(c, READ);
- bch2_prio_timer_init(c, WRITE);
+ bch2_bucket_clock_init(c, READ);
+ bch2_bucket_clock_init(c, WRITE);
/* open bucket 0 is a sentinel NULL: */
spin_lock_init(&c->open_buckets[0].lock);
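
The reworked find_reclaimable_buckets_lru() above no longer pushes one heap entry per bucket: runs of adjacent buckets that share a sort key are coalesced into a single entry carrying a start bucket and a length (the new nr field of struct alloc_heap_entry), and whole runs are popped until free_inc fills up. A minimal userspace sketch of that coalescing loop, using a plain array of keys and a hypothetical emit_run() in place of heap_add_or_replace():

    #include <stdio.h>
    #include <stddef.h>

    struct run { size_t bucket, nr; unsigned long key; };

    /* emit_run() stands in for heap_add_or_replace() in the real code */
    static void emit_run(struct run r)
    {
        printf("run: start %zu len %zu key %lu\n", r.bucket, r.nr, r.key);
    }

    int main(void)
    {
        unsigned long keys[] = { 3, 3, 3, 7, 7, 1 }; /* per-bucket sort keys */
        struct run e = { 0 };
        size_t b;

        for (b = 0; b < sizeof(keys) / sizeof(keys[0]); b++) {
            if (e.nr && e.bucket + e.nr == b && e.key == keys[b]) {
                e.nr++;                  /* extend the current run */
            } else {
                if (e.nr)
                    emit_run(e);         /* flush the finished run */
                e = (struct run) { .bucket = b, .nr = 1, .key = keys[b] };
            }
        }
        if (e.nr)
            emit_run(e);                 /* don't lose the final run */
        return 0;
    }

On the sample keys this produces three runs (start 0 len 3, start 3 len 2, start 5 len 1), which is why the commit can invalidate many equally-cold buckets from a single heap pop.
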
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index f914dbd5..372cc047 100644
--- a/libbcachefs/alloc.h
+++ b/libbcachefs/alloc.h
@@ -9,6 +9,14 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_list;
+const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_alloc_ops (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+}
+
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@@ -30,6 +38,8 @@ enum bucket_alloc_ret {
NO_DEVICES = -3, /* -EROFS */
};
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
struct closure *);
@@ -127,6 +137,4 @@ int bch2_alloc_write(struct bch_fs *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);
-extern const struct bkey_ops bch2_bkey_alloc_ops;
-
#endif /* _BCACHEFS_ALLOC_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index f3bd4701..8a71a376 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -8,7 +8,7 @@
#include "fifo.h"
/* There's two of these clocks, one for reads and one for writes: */
-struct prio_clock {
+struct bucket_clock {
/*
* "now" in (read/write) IO time - incremented whenever we do X amount
* of reads or writes.
@@ -23,7 +23,7 @@ struct prio_clock {
* consistent.
*/
u16 hand;
- u16 min_prio;
+ u16 max_last_io;
int rw;
@@ -80,6 +80,7 @@ struct write_point_specifier {
struct alloc_heap_entry {
size_t bucket;
+ size_t nr;
unsigned long key;
};
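
Bucket IO times and the clock hand are both u16, so the hand can only advance until the oldest bucket's age (the max_last_io tracked above) approaches U16_MAX; at that point bch2_inc_clock_hand() in the alloc.c hunk calls bch2_rescale_bucket_io_times(), which halves every age while preserving relative order. A freestanding sketch of the overflow check and rescale, with made-up sample values:

    #include <stdio.h>
    #include <stdint.h>

    #define NBUCKETS 4

    int main(void)
    {
        uint16_t hand = 100;                             /* current clock hand */
        uint16_t io_time[NBUCKETS] = { 90, 103, 100, 40 };
        uint16_t max_last_io = 0;
        int i;

        /* a bucket's age is (hand - io_time) in wrapping u16 arithmetic;
         * io_time 103 is "ahead" of the hand, i.e. nearly U16_MAX old */
        for (i = 0; i < NBUCKETS; i++) {
            uint16_t age = hand - io_time[i];
            if (age > max_last_io)
                max_last_io = age;
        }

        if (max_last_io >= UINT16_MAX - 2) {
            /* rescale: halve every age, preserving relative order */
            for (i = 0; i < NBUCKETS; i++) {
                uint16_t age = hand - io_time[i];
                io_time[i] = hand - age / 2;
            }
        }

        for (i = 0; i < NBUCKETS; i++)
            printf("bucket %d age %u\n", i,
                   (unsigned) (uint16_t) (hand - io_time[i]));
        return 0;
    }

Here the nearly-wrapped bucket triggers the rescale, and the printed ages come out halved (5, 32766, 0, 30) with their ordering intact.
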
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 369d078c..bc10324f 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -384,7 +384,7 @@ struct bch_dev {
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
spinlock_t freelist_lock;
- unsigned nr_invalidated;
+ size_t nr_invalidated;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
@@ -392,7 +392,7 @@ struct bch_dev {
size_t fifo_last_bucket;
/* last calculated minimum prio */
- u16 min_prio[2];
+ u16 max_last_bucket_io[2];
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
@@ -431,11 +431,11 @@ struct bch_dev {
*/
enum {
/* startup: */
- BCH_FS_BRAND_NEW_FS,
BCH_FS_ALLOC_READ_DONE,
BCH_FS_ALLOCATOR_STARTED,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_FSCK_DONE,
+ BCH_FS_STARTED,
/* shutdown: */
BCH_FS_EMERGENCY_RO,
@@ -519,8 +519,7 @@ struct bch_fs {
u64 features;
} sb;
- struct bch_sb *disk_sb;
- unsigned disk_sb_order;
+ struct bch_sb_handle disk_sb;
unsigned short block_bits; /* ilog2(block_size) */
@@ -595,7 +594,7 @@ struct bch_fs {
* those together consistently we keep track of the smallest nonzero
* priority of any bucket.
*/
- struct prio_clock prio_clock[2];
+ struct bucket_clock bucket_clock[2];
struct io_clock io_clock[2];
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index d89f7781..eed6fb85 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -955,8 +955,9 @@ struct bch_disk_group {
__le64 flags[2];
};
-LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
struct bch_sb_field_disk_groups {
struct bch_sb_field field;
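
The new BCH_GROUP_PARENT field stores the parent group index plus one (zero meaning no parent) in bits 6..23 of the group's first flags word, next to DELETED (bit 0) and DATA_ALLOWED (bits 1..5). A sketch of the shift-and-mask arithmetic that LE64_BITMASK() generates for such a field, ignoring the little-endian conversion for brevity:

    #include <stdio.h>
    #include <stdint.h>

    /* field occupying bits [lo, hi) of a u64, as in LE64_BITMASK() */
    static uint64_t get_field(uint64_t flags, unsigned lo, unsigned hi)
    {
        return (flags >> lo) & (~0ULL >> (64 - (hi - lo)));
    }

    static uint64_t set_field(uint64_t flags, unsigned lo, unsigned hi,
                              uint64_t v)
    {
        uint64_t mask = (~0ULL >> (64 - (hi - lo))) << lo;

        return (flags & ~mask) | ((v << lo) & mask);
    }

    int main(void)
    {
        uint64_t flags = 0;

        flags = set_field(flags, 6, 24, 5);   /* BCH_GROUP_PARENT = 5 */
        flags = set_field(flags, 0, 1, 0);    /* BCH_GROUP_DELETED = 0 */

        printf("parent %llu\n",
               (unsigned long long) get_field(flags, 6, 24));
        return 0;
    }
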
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 84cdf662..e4f62f90 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -10,20 +10,20 @@
#include "quota.h"
#include "xattr.h"
-const struct bkey_ops *bch2_bkey_ops[] = {
- [BKEY_TYPE_EXTENTS] = &bch2_bkey_extent_ops,
- [BKEY_TYPE_INODES] = &bch2_bkey_inode_ops,
- [BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops,
- [BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops,
- [BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops,
- [BKEY_TYPE_QUOTAS] = &bch2_bkey_quota_ops,
- [BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
+const struct bkey_ops bch2_bkey_ops[] = {
+ [BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops,
+ [BKEY_TYPE_INODES] = bch2_bkey_inode_ops,
+ [BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops,
+ [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
+ [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
+ [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
+ [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
};
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
- const struct bkey_ops *ops = bch2_bkey_ops[type];
+ const struct bkey_ops *ops = &bch2_bkey_ops[type];
switch (k.k->type) {
case KEY_TYPE_DELETED:
@@ -51,7 +51,7 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
- const struct bkey_ops *ops = bch2_bkey_ops[type];
+ const struct bkey_ops *ops = &bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
@@ -100,7 +100,7 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
enum bkey_type type = btree_node_type(b);
- const struct bkey_ops *ops = bch2_bkey_ops[type];
+ const struct bkey_ops *ops = &bch2_bkey_ops[type];
const char *invalid;
BUG_ON(!k.k->u64s);
@@ -141,7 +141,7 @@ int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
- const struct bkey_ops *ops = bch2_bkey_ops[type];
+ const struct bkey_ops *ops = &bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
switch (k.k->type) {
@@ -182,7 +182,7 @@ void bch2_bkey_swab(enum bkey_type type,
const struct bkey_format *f,
struct bkey_packed *k)
{
- const struct bkey_ops *ops = bch2_bkey_ops[type];
+ const struct bkey_ops *ops = &bch2_bkey_ops[type];
bch2_bkey_swab_key(f, k);
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 59db3037..9e2c90d5 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -81,6 +81,6 @@ int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);
-extern const struct bkey_ops *bch2_bkey_ops[];
+extern const struct bkey_ops bch2_bkey_ops[];
#endif /* _BCACHEFS_BKEY_METHODS_H */
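
bch2_bkey_ops is now an array of struct bkey_ops values instead of an array of pointers, and each btree type supplies its ops as a compound-literal macro (see the #define bch2_bkey_alloc_ops hunk in alloc.h above), so lookups lose one pointer indirection and the per-type definitions need no exported symbol. The pattern in miniature, with hypothetical names throughout:

    #include <stdio.h>

    struct ops {
        const char *(*key_invalid)(int k);
    };

    static const char *demo_invalid(int k)
    {
        return k < 0 ? "negative key" : NULL;   /* NULL means valid */
    }

    /* compound literal: usable in a header without an extern symbol */
    #define demo_ops (struct ops) { .key_invalid = demo_invalid }

    /* array of struct values, one pointer-chase fewer than pointers */
    static const struct ops ops_table[] = {
        [0] = demo_ops,
    };

    int main(void)
    {
        const struct ops *o = &ops_table[0];
        const char *err = o->key_invalid(-1);

        printf("%s\n", err ? err : "valid");
        return 0;
    }
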
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index f2e9c10e..ad51f29c 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -18,6 +18,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "replicas.h"
#include "super-io.h"
#include <linux/slab.h>
@@ -317,7 +318,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
unsigned i;
u64 b;
- lockdep_assert_held(&c->sb_lock);
+ if (c)
+ lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
u64 offset = le64_to_cpu(layout->sb_offset[i]);
@@ -331,7 +333,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
BCH_DATA_SB, flags);
}
- spin_lock(&c->journal.lock);
+ if (c)
+ spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
@@ -340,7 +343,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
gc_phase(GC_PHASE_SB), flags);
}
- spin_unlock(&c->journal.lock);
+ if (c)
+ spin_unlock(&c->journal.lock);
}
static void bch2_mark_superblocks(struct bch_fs *c)
@@ -1034,8 +1038,8 @@ static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
int ret;
mutex_lock(&c->sb_lock);
- if (!bch2_sb_get_replicas(c->disk_sb)) {
- if (BCH_SB_INITIALIZED(c->disk_sb))
+ if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
+ if (BCH_SB_INITIALIZED(c->disk_sb.sb))
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index cc5bcbb2..465aadba 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1290,16 +1290,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
+ iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
+
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
- struct bkey_s_c k;
+ /*
+ * XXX: when we just need to relock we should be able to avoid
+ * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
+ * for that to work
+ */
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- k = bch2_btree_iter_peek_slot(iter);
- if (btree_iter_err(k))
- return k;
+ return bch2_btree_iter_peek_slot(iter);
}
- iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
-
if (!bkey_deleted(&iter->k))
__btree_iter_advance(&iter->l[0]);
@@ -1318,6 +1321,8 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
iter->c = c;
iter->pos = pos;
+ bkey_init(&iter->k);
+ iter->k.p = pos;
iter->flags = flags;
iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
iter->btree_id = btree_id;
@@ -1330,6 +1335,10 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
iter->l[iter->level].b = BTREE_ITER_NOT_END;
iter->next = iter;
+ if (unlikely((flags & BTREE_ITER_IS_EXTENTS) &&
+ !bkey_cmp(pos, POS_MAX)))
+ iter->uptodate = BTREE_ITER_END;
+
prefetch(c->btree_roots[btree_id].b);
}
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 318b0424..95191ba2 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -231,6 +231,20 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
return __btree_iter_cmp(l->btree_id, l->pos, r);
}
+/*
+ * Unlocks before scheduling
+ * Note: does not revalidate iterator
+ */
+static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
+{
+ if (need_resched()) {
+ bch2_btree_iter_unlock(iter);
+ schedule();
+ } else if (race_fault()) {
+ bch2_btree_iter_unlock(iter);
+ }
+}
+
#define __for_each_btree_node(_iter, _c, _btree_id, _start, \
_locks_want, _depth, _flags, _b) \
for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
@@ -253,6 +267,8 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
unsigned flags)
{
+ bch2_btree_iter_cond_resched(iter);
+
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_next_slot(iter)
: bch2_btree_iter_next(iter);
@@ -275,18 +291,4 @@ static inline int btree_iter_err(struct bkey_s_c k)
return PTR_ERR_OR_ZERO(k.k);
}
-/*
- * Unlocks before scheduling
- * Note: does not revalidate iterator
- */
-static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
-{
- if (need_resched()) {
- bch2_btree_iter_unlock(iter);
- schedule();
- } else if (race_fault()) {
- bch2_btree_iter_unlock(iter);
- }
-}
-
#endif /* _BCACHEFS_BTREE_ITER_H */
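
Hoisting bch2_btree_iter_cond_resched() above __bch2_btree_iter_next() lets every next() call drop the iterator's locks and yield when the scheduler asks, so long btree scans no longer depend on each caller remembering to do it. The shape of that pattern, sketched with stand-ins for the unlock and schedule calls:

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-ins for bch2_btree_iter_unlock() and schedule() */
    static void iter_unlock(void) { puts("unlock"); }
    static void yield_cpu(void)   { puts("yield");  }

    static bool need_resched_stub(int i)
    {
        return i % 3 == 2;   /* pretend the scheduler asks every 3rd key */
    }

    static int iter_next(int i)
    {
        /* yield point baked into next(): long scans can't hog the CPU */
        if (need_resched_stub(i)) {
            iter_unlock();   /* locks must not be held while sleeping */
            yield_cpu();
        }
        return i + 1;
    }

    int main(void)
    {
        for (int i = 0; i < 7; i = iter_next(i))
            printf("key %d\n", i);
        return 0;
    }

As the comment in the moved helper notes, the iterator is not revalidated afterwards: the caller resumes via the normal traverse path.
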
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index e86c6bce..8854305d 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -299,7 +299,7 @@ static inline enum bkey_type btree_node_type(struct btree *b)
static inline const struct bkey_ops *btree_node_ops(struct btree *b)
{
- return bch2_bkey_ops[btree_node_type(b)];
+ return &bch2_bkey_ops[btree_node_type(b)];
}
static inline bool btree_node_has_ptrs(struct btree *b)
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index f42239da..63696920 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -13,6 +13,7 @@
#include "extents.h"
#include "journal.h"
#include "keylist.h"
+#include "replicas.h"
#include "super-io.h"
#include <linux/random.h>
@@ -2116,3 +2117,16 @@ ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
return out - buf;
}
+
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+{
+ size_t ret = 0;
+ struct list_head *i;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_for_each(i, &c->btree_interior_update_list)
+ ret++;
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ return ret;
+}
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 0b58ccc9..3e66d69e 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -343,4 +343,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
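
bch2_btree_interior_updates_nr_pending() exists so the allocator startup path (the closure_wait_event() loop in the alloc.c hunk above) can snapshot the count and sleep until it actually drops, instead of spinning on dirty nodes it isn't yet allowed to write. The same snapshot-then-wait shape in C11 <threads.h> (glibc 2.28+), purely as an illustrative stand-in for the kernel's closures:

    #include <stdio.h>
    #include <threads.h>

    static mtx_t lock;
    static cnd_t cond;
    static int nr_pending = 3;

    static int worker(void *arg)
    {
        (void) arg;
        mtx_lock(&lock);
        nr_pending--;                /* one interior update completed */
        cnd_broadcast(&cond);
        mtx_unlock(&lock);
        return 0;
    }

    int main(void)
    {
        thrd_t t;

        mtx_init(&lock, mtx_plain);
        cnd_init(&cond);

        mtx_lock(&lock);
        int before = nr_pending;     /* snapshot, as the alloc code does */

        thrd_create(&t, worker, NULL);

        /* sleep until the pending count drops below the snapshot */
        while (nr_pending >= before)
            cnd_wait(&cond, &lock);
        mtx_unlock(&lock);

        thrd_join(t, NULL);
        printf("pending now %d\n", nr_pending);
        return 0;
    }
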
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 007aa5ef..53b39de5 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -443,8 +443,20 @@ split:
* potentially blocks the allocator:
*/
ret = bch2_btree_split_leaf(c, split, trans->flags);
+
+ /*
+ * This can happen when we insert part of an extent - with an update
+ * with multiple keys, we don't want to redo the entire update - that's
+ * just too confusing:
+ */
+ if (!ret &&
+ (trans->flags & BTREE_INSERT_ATOMIC) &&
+ trans->did_work)
+ ret = -EINTR;
+
if (ret)
goto err;
+
/*
* if the split didn't have to drop locks the insert will still be
* atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 864de940..1f944cb8 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -309,7 +309,7 @@ static bool bucket_became_unavailable(struct bch_fs *c,
{
return is_available_bucket(old) &&
!is_available_bucket(new) &&
- c && c->gc_pos.phase == GC_PHASE_DONE;
+ (!c || c->gc_pos.phase == GC_PHASE_DONE);
}
void bch2_fs_usage_apply(struct bch_fs *c,
@@ -351,12 +351,16 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
{
struct bch_dev_usage *dev_usage;
- lockdep_assert_held(&c->usage_lock);
+ if (c)
+ lockdep_assert_held(&c->usage_lock);
- bch2_fs_inconsistent_on(old.data_type && new.data_type &&
- old.data_type != new.data_type, c,
+ if (old.data_type && new.data_type &&
+ old.data_type != new.data_type) {
+ BUG_ON(!c);
+ bch2_fs_inconsistent(c,
"different types of data in same bucket: %u, %u",
old.data_type, new.data_type);
+ }
dev_usage = this_cpu_ptr(ca->usage_percpu);
@@ -466,21 +470,29 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(!type);
- lg_local_lock(&c->usage_lock);
- g = bucket(ca, b);
+ if (likely(c)) {
+ lg_local_lock(&c->usage_lock);
- if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
- gc_will_visit(c, pos)) {
- lg_local_unlock(&c->usage_lock);
- return;
+ if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+ gc_will_visit(c, pos)) {
+ lg_local_unlock(&c->usage_lock);
+ return;
+ }
}
+ preempt_disable();
+
+ g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, g, new, ({
saturated_add(ca, new.dirty_sectors, sectors,
GC_MAX_SECTORS_USED);
new.data_type = type;
}));
- lg_local_unlock(&c->usage_lock);
+
+ preempt_enable();
+
+ if (likely(c))
+ lg_local_unlock(&c->usage_lock);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
@@ -859,9 +871,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bch2_copygc_stop(ca);
- down_write(&c->gc_lock);
- down_write(&ca->bucket_lock);
- lg_global_lock(&c->usage_lock);
+ if (resize) {
+ down_write(&c->gc_lock);
+ down_write(&ca->bucket_lock);
+ lg_global_lock(&c->usage_lock);
+ }
old_buckets = bucket_array(ca);
@@ -885,7 +899,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_dirty, buckets_dirty);
- lg_global_unlock(&c->usage_lock);
+ if (resize)
+ lg_global_unlock(&c->usage_lock);
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++) {
@@ -904,8 +919,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
nbuckets = ca->mi.nbuckets;
- up_write(&ca->bucket_lock);
- up_write(&c->gc_lock);
+ if (resize) {
+ up_write(&ca->bucket_lock);
+ up_write(&c->gc_lock);
+ }
if (start_copygc &&
bch2_copygc_start(c, ca))
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index fda7fd70..399a853c 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -31,6 +31,7 @@
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
return rcu_dereference_check(ca->buckets,
+ !ca->fs ||
lockdep_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
@@ -47,7 +48,12 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
- bucket(ca, b)->prio[rw] = c->prio_clock[rw].hand;
+ bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
+}
+
+static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
+{
+ return c->bucket_clock[rw].hand - g->io_time[rw];
}
/*
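
The new bucket_last_io() above leans on unsigned wraparound: hand - io_time is the bucket's age in IO-clock ticks even after the u16 hand wraps past zero, provided no age is ever allowed to reach U16_MAX (which the rescaling shown earlier guarantees). A two-case demonstration:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t last_io(uint16_t hand, uint16_t io_time)
    {
        return hand - io_time;   /* wrapping u16 subtraction */
    }

    int main(void)
    {
        /* normal case: hand ahead of io_time */
        printf("%u\n", last_io(1000, 900));   /* 100 */

        /* hand has wrapped past zero since the IO happened */
        printf("%u\n", last_io(50, 65530));   /* 56, not negative */
        return 0;
    }
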
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index a0256e13..28bd2c59 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -31,12 +31,12 @@ struct bucket_mark {
};
struct bucket {
- u16 prio[2];
-
union {
struct bucket_mark _mark;
const struct bucket_mark mark;
};
+
+ u16 io_time[2];
};
struct bucket_array {
@@ -85,8 +85,9 @@ struct disk_reservation {
};
struct copygc_heap_entry {
+ u8 gen;
+ u32 sectors;
u64 offset;
- struct bucket_mark mark;
};
typedef HEAP(struct copygc_heap_entry) copygc_heap;
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index ab6dc665..8403bae6 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -372,6 +372,9 @@ static long bch2_ioctl_usage(struct bch_fs *c,
unsigned i, j;
int ret;
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
+
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
@@ -460,7 +463,7 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
sb = ca->disk_sb.sb;
} else {
- sb = c->disk_sb;
+ sb = c->disk_sb.sb;
}
if (vstruct_bytes(sb) > arg.size) {
@@ -535,13 +538,22 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- /* ioctls that do require admin cap: */
switch (cmd) {
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
case BCH_IOCTL_STOP:
return bch2_ioctl_stop(c);
+ case BCH_IOCTL_READ_SUPER:
+ BCH_IOCTL(read_super, struct bch_ioctl_read_super);
+ case BCH_IOCTL_DISK_GET_IDX:
+ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
+ }
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
+
+ /* ioctls that do require admin cap: */
+ switch (cmd) {
case BCH_IOCTL_DISK_ADD:
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
case BCH_IOCTL_DISK_REMOVE:
@@ -554,10 +566,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
case BCH_IOCTL_DATA:
BCH_IOCTL(data, struct bch_ioctl_data);
- case BCH_IOCTL_READ_SUPER:
- BCH_IOCTL(read_super, struct bch_ioctl_read_super);
- case BCH_IOCTL_DISK_GET_IDX:
- BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 56bd99fd..6d8543eb 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -569,7 +569,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
if (!bch2_key_is_encrypted(&sb_key))
goto out;
- ret = bch2_request_key(c->disk_sb, &user_key);
+ ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
bch_err(c, "error requesting encryption key: %i", ret);
goto err;
@@ -623,7 +623,7 @@ int bch2_disable_encryption(struct bch_fs *c)
mutex_lock(&c->sb_lock);
- crypt = bch2_sb_get_crypt(c->disk_sb);
+ crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
goto out;
@@ -639,7 +639,7 @@ int bch2_disable_encryption(struct bch_fs *c)
crypt->key.magic = BCH_KEY_MAGIC;
crypt->key.key = key;
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0);
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
@@ -657,7 +657,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
mutex_lock(&c->sb_lock);
/* Do we already have an encryption key? */
- if (bch2_sb_get_crypt(c->disk_sb))
+ if (bch2_sb_get_crypt(c->disk_sb.sb))
goto err;
ret = bch2_alloc_ciphers(c);
@@ -668,7 +668,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
get_random_bytes(&key.key, sizeof(key.key));
if (keyed) {
- ret = bch2_request_key(c->disk_sb, &user_key);
+ ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
bch_err(c, "error requesting encryption key: %i", ret);
goto err;
@@ -685,7 +685,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
if (ret)
goto err;
- crypt = bch2_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
+ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
if (!crypt) {
ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
goto err;
@@ -694,7 +694,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
crypt->key = key;
/* write superblock */
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1);
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
@@ -728,7 +728,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
goto out;
}
- crypt = bch2_sb_get_crypt(c->disk_sb);
+ crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
goto out;
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 7862294b..2690cc4b 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -117,6 +117,7 @@ static const unsigned bch_crc_bytes[] = {
[BCH_CSUM_CHACHA20_POLY1305_128] = 16,
};
+/* returns true if not equal */
static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
{
/*
diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h
index bfd4b303..df404b6d 100644
--- a/libbcachefs/clock_types.h
+++ b/libbcachefs/clock_types.h
@@ -3,7 +3,7 @@
#include "util.h"
-#define NR_IO_TIMERS 8
+#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
/*
* Clocks/timers in units of sectors of IO:
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 18c94598..1af62621 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -500,7 +500,7 @@ int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
return ret;
}
- c->disk_sb->features[0] |= cpu_to_le64(f);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(f);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 00e0de16..7190990d 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -212,17 +212,20 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
return i->ret;
- for_each_btree_key(&iter, i->c, i->id, i->from,
- BTREE_ITER_PREFETCH, k) {
- i->from = iter.pos;
+ bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
+ k = bch2_btree_iter_peek(&iter);
+ while (k.k && !(err = btree_iter_err(k))) {
bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
- i->buf, sizeof(i->buf), k);
+ i->buf, sizeof(i->buf), k);
i->bytes = strlen(i->buf);
BUG_ON(i->bytes >= PAGE_SIZE);
i->buf[i->bytes] = '\n';
i->bytes++;
+ k = bch2_btree_iter_next(&iter);
+ i->from = iter.pos;
+
err = flush_buf(i);
if (err)
break;
@@ -230,7 +233,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
break;
}
- err = bch2_btree_iter_unlock(&iter) ?: err;
+ bch2_btree_iter_unlock(&iter);
return err < 0 ? err : i->ret;
}
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 6bdece3a..df9913f8 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -79,8 +79,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.cmp_bkey = dirent_cmp_bkey,
};
-static const char *bch2_dirent_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
unsigned len;
@@ -116,8 +115,8 @@ static const char *bch2_dirent_invalid(const struct bch_fs *c,
}
}
-static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_dirent_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
size_t n = 0;
@@ -136,11 +135,6 @@ static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
}
}
-const struct bkey_ops bch2_bkey_dirent_ops = {
- .key_invalid = bch2_dirent_invalid,
- .val_to_text = bch2_dirent_to_text,
-};
-
static struct bkey_i_dirent *dirent_create_key(u8 type,
const struct qstr *name, u64 dst)
{
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 98405b5b..5d066af1 100644
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -4,7 +4,14 @@
#include "str_hash.h"
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-extern const struct bkey_ops bch2_bkey_dirent_ops;
+
+const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_dirent_ops (struct bkey_ops) { \
+ .key_invalid = bch2_dirent_invalid, \
+ .val_to_text = bch2_dirent_to_text, \
+}
struct qstr;
struct file;
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
new file mode 100644
index 00000000..c129a33e
--- /dev/null
+++ b/libbcachefs/disk_groups.c
@@ -0,0 +1,462 @@
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+
+static int group_cmp(const void *_l, const void *_r)
+{
+ const struct bch_disk_group *l = _l;
+ const struct bch_disk_group *r = _r;
+
+ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
+ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
+ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
+ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
+ strncmp(l->label, r->label, sizeof(l->label));
+}
+
+const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_disk_groups *groups =
+ field_to_type(f, disk_groups);
+ struct bch_disk_group *g, *sorted = NULL;
+ struct bch_sb_field_members *mi;
+ struct bch_member *m;
+ unsigned i, nr_groups, len;
+ const char *err = NULL;
+
+ mi = bch2_sb_get_members(sb);
+ groups = bch2_sb_get_disk_groups(sb);
+ nr_groups = disk_groups_nr(groups);
+
+ for (m = mi->members;
+ m < mi->members + sb->nr_devices;
+ m++) {
+ unsigned g;
+
+ if (!BCH_MEMBER_GROUP(m))
+ continue;
+
+ g = BCH_MEMBER_GROUP(m) - 1;
+
+ if (g >= nr_groups ||
+ BCH_GROUP_DELETED(&groups->entries[g]))
+ return "disk has invalid group";
+ }
+
+ if (!nr_groups)
+ return NULL;
+
+ for (g = groups->entries;
+ g < groups->entries + nr_groups;
+ g++) {
+ if (BCH_GROUP_DELETED(g))
+ continue;
+
+ len = strnlen(g->label, sizeof(g->label));
+ if (!len) {
+ err = "group with empty label";
+ goto err;
+ }
+ }
+
+ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
+ if (!sorted)
+ return "cannot allocate memory";
+
+ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
+ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
+
+ for (i = 0; i + 1 < nr_groups; i++)
+ if (!BCH_GROUP_DELETED(sorted + i) &&
+ !group_cmp(sorted + i, sorted + i + 1)) {
+ err = "duplicate groups";
+ goto err;
+ }
+
+ err = NULL;
+err:
+ kfree(sorted);
+ return err;
+}
+
+static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ char *out = buf, *end = buf + size;
+ struct bch_sb_field_disk_groups *groups =
+ field_to_type(f, disk_groups);
+ struct bch_disk_group *g;
+ unsigned nr_groups = disk_groups_nr(groups);
+
+ for (g = groups->entries;
+ g < groups->entries + nr_groups;
+ g++) {
+ if (g != groups->entries)
+ out += scnprintf(out, end - out, " ");
+
+ if (BCH_GROUP_DELETED(g))
+ out += scnprintf(out, end - out, "[deleted]");
+ else
+ out += scnprintf(out, end - out,
+ "[parent %llu name %s]",
+ BCH_GROUP_PARENT(g),
+ g->label);
+ }
+
+ return out - buf;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
+ .validate = bch2_sb_disk_groups_validate,
+ .to_text = bch2_sb_disk_groups_to_text
+};
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_members *mi;
+ struct bch_sb_field_disk_groups *groups;
+ struct bch_disk_groups_cpu *cpu_g, *old_g;
+ unsigned i, g, nr_groups;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ mi = bch2_sb_get_members(c->disk_sb.sb);
+ groups = bch2_sb_get_disk_groups(c->disk_sb.sb);
+ nr_groups = disk_groups_nr(groups);
+
+ if (!groups)
+ return 0;
+
+ cpu_g = kzalloc(sizeof(*cpu_g) +
+ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+ if (!cpu_g)
+ return -ENOMEM;
+
+ cpu_g->nr = nr_groups;
+
+ for (i = 0; i < nr_groups; i++) {
+ struct bch_disk_group *src = &groups->entries[i];
+ struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
+
+ dst->deleted = BCH_GROUP_DELETED(src);
+ dst->parent = BCH_GROUP_PARENT(src);
+ }
+
+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ struct bch_member *m = mi->members + i;
+ struct bch_disk_group_cpu *dst =
+ &cpu_g->entries[BCH_MEMBER_GROUP(m)];
+
+ if (!bch2_member_exists(m))
+ continue;
+
+ g = BCH_MEMBER_GROUP(m);
+ while (g) {
+ dst = &cpu_g->entries[g - 1];
+ __set_bit(i, dst->devs.d);
+ g = dst->parent;
+ }
+ }
+
+ old_g = c->disk_groups;
+ rcu_assign_pointer(c->disk_groups, cpu_g);
+ if (old_g)
+ kfree_rcu(old_g, rcu);
+
+ return 0;
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+ struct target t = target_decode(target);
+
+ switch (t.type) {
+ case TARGET_DEV: {
+ struct bch_dev *ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+ return ca ? &ca->self : NULL;
+ }
+ case TARGET_GROUP: {
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+ return t.group < g->nr && !g->entries[t.group].deleted
+ ? &g->entries[t.group].devs
+ : NULL;
+ }
+ default:
+ BUG();
+ }
+}
+
+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+ unsigned parent,
+ const char *name, unsigned namelen)
+{
+ unsigned i, nr_groups = disk_groups_nr(groups);
+
+ if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+ return -EINVAL;
+
+ for (i = 0; i < nr_groups; i++) {
+ struct bch_disk_group *g = groups->entries + i;
+
+ if (BCH_GROUP_DELETED(g))
+ continue;
+
+ if (!BCH_GROUP_DELETED(g) &&
+ BCH_GROUP_PARENT(g) == parent &&
+ strnlen(g->label, sizeof(g->label)) == namelen &&
+ !memcmp(name, g->label, namelen))
+ return i;
+ }
+
+ return -1;
+}
+
+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
+ const char *name, unsigned namelen)
+{
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_get_disk_groups(sb->sb);
+ unsigned i, nr_groups = disk_groups_nr(groups);
+ struct bch_disk_group *g;
+
+ if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+ return -EINVAL;
+
+ for (i = 0;
+ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
+ i++)
+ ;
+
+ if (i == nr_groups) {
+ unsigned u64s =
+ (sizeof(struct bch_sb_field_disk_groups) +
+ sizeof(struct bch_disk_group) * (nr_groups + 1)) /
+ sizeof(u64);
+
+ groups = bch2_sb_resize_disk_groups(sb, u64s);
+ if (!groups)
+ return -ENOSPC;
+
+ nr_groups = disk_groups_nr(groups);
+ }
+
+ BUG_ON(i >= nr_groups);
+
+ g = &groups->entries[i];
+
+ memcpy(g->label, name, namelen);
+ if (namelen < sizeof(g->label))
+ g->label[namelen] = '\0';
+ SET_BCH_GROUP_DELETED(g, 0);
+ SET_BCH_GROUP_PARENT(g, parent);
+ SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+
+ return i;
+}
+
+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
+{
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_get_disk_groups(sb->sb);
+ int v = -1;
+
+ do {
+ const char *next = strchrnul(name, '.');
+ unsigned len = next - name;
+
+ if (*next == '.')
+ next++;
+
+ v = __bch2_disk_group_find(groups, v + 1, name, len);
+ name = next;
+ } while (*name && v >= 0);
+
+ return v;
+}
+
+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
+{
+ struct bch_sb_field_disk_groups *groups;
+ unsigned parent = 0;
+ int v = -1;
+
+ do {
+ const char *next = strchrnul(name, '.');
+ unsigned len = next - name;
+
+ if (*next == '.')
+ next++;
+
+ groups = bch2_sb_get_disk_groups(sb->sb);
+
+ v = __bch2_disk_group_find(groups, parent, name, len);
+ if (v < 0)
+ v = __bch2_disk_group_add(sb, parent, name, len);
+ if (v < 0)
+ return v;
+
+ parent = v + 1;
+ name = next;
+ } while (*name && v >= 0);
+
+ return v;
+}
+
+int bch2_disk_path_print(struct bch_sb_handle *sb,
+ char *buf, size_t len, unsigned v)
+{
+ char *out = buf, *end = out + len;
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_get_disk_groups(sb->sb);
+ struct bch_disk_group *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto inval;
+
+ if (v >= disk_groups_nr(groups))
+ goto inval;
+
+ g = groups->entries + v;
+
+ if (BCH_GROUP_DELETED(g))
+ goto inval;
+
+ path[nr++] = v;
+
+ if (!BCH_GROUP_PARENT(g))
+ break;
+
+ v = BCH_GROUP_PARENT(g) - 1;
+ }
+
+ while (nr) {
+ unsigned b = 0;
+
+ v = path[--nr];
+ g = groups->entries + v;
+
+ if (end != out)
+ b = min_t(size_t, end - out,
+ strnlen(g->label, sizeof(g->label)));
+ memcpy(out, g->label, b);
+ if (b < end - out)
+ out[b] = '\0';
+ out += b;
+
+ if (nr)
+ out += scnprintf(out, end - out, ".");
+ }
+
+ return out - buf;
+inval:
+ return scnprintf(buf, len, "invalid group %u", v);
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ struct bch_member *mi;
+ int v = -1;
+
+ mutex_lock(&c->sb_lock);
+
+ if (!strlen(name) || !strcmp(name, "none"))
+ goto write_sb;
+
+ v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+ if (v < 0) {
+ mutex_unlock(&c->sb_lock);
+ return v;
+ }
+
+write_sb:
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+ SET_BCH_MEMBER_GROUP(mi, v + 1);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+{
+ struct bch_dev *ca;
+ int g;
+
+ if (!strlen(buf) || !strcmp(buf, "none")) {
+ *v = 0;
+ return 0;
+ }
+
+ /* Is it a device? */
+ ca = bch2_dev_lookup(c, buf);
+ if (!IS_ERR(ca)) {
+ *v = dev_to_target(ca->dev_idx);
+ percpu_ref_put(&ca->ref);
+ return 0;
+ }
+
+ mutex_lock(&c->sb_lock);
+ g = bch2_disk_path_find(&c->disk_sb, buf);
+ mutex_unlock(&c->sb_lock);
+
+ if (g >= 0) {
+ *v = group_to_target(g);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
+{
+ struct target t = target_decode(v);
+ int ret;
+
+ switch (t.type) {
+ case TARGET_NULL:
+ return scnprintf(buf, len, "none");
+ case TARGET_DEV: {
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ char b[BDEVNAME_SIZE];
+
+ ret = scnprintf(buf, len, "/dev/%s",
+ bdevname(ca->disk_sb.bdev, b));
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ ret = scnprintf(buf, len, "offline device %u", t.dev);
+ } else {
+ ret = scnprintf(buf, len, "invalid device %u", t.dev);
+ }
+
+ rcu_read_unlock();
+ break;
+ }
+ case TARGET_GROUP:
+ mutex_lock(&c->sb_lock);
+ ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
+ mutex_unlock(&c->sb_lock);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
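
A note on the lookup added above: bch2_disk_path_find() splits a label such as "ssd.fast" on '.', resolving each component against the parent found so far; the root level is parent 0, which is why the loop passes v + 1. Below is a minimal standalone userspace sketch of that walk -- find_component() is a hypothetical stand-in for __bch2_disk_group_find(), and strchrnul() assumes a GNU libc.

#define _GNU_SOURCE	/* strchrnul() */
#include <stdio.h>
#include <string.h>

/* toy stand-in for __bch2_disk_group_find(): resolves any component */
static int find_component(unsigned parent, const char *name, unsigned len)
{
	return parent + len;
}

static int path_find(const char *name)
{
	int v = -1;

	do {
		const char *next = strchrnul(name, '.');
		unsigned len = next - name;

		if (*next == '.')
			next++;

		/* parent of a top-level group is encoded as 0 */
		v = find_component(v + 1, name, len);
		name = next;
	} while (*name && v >= 0);

	return v;
}

int main(void)
{
	printf("ssd.fast -> %d\n", path_find("ssd.fast"));
	return 0;
}
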
diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h
new file mode 100644
index 00000000..9da9805a
--- /dev/null
+++ b/libbcachefs/disk_groups.h
@@ -0,0 +1,99 @@
+#ifndef _BCACHEFS_DISK_GROUPS_H
+#define _BCACHEFS_DISK_GROUPS_H
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
+
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+ return groups
+ ? (vstruct_end(&groups->field) -
+ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
+ : 0;
+}
+
+struct target {
+ enum {
+ TARGET_NULL,
+ TARGET_DEV,
+ TARGET_GROUP,
+ } type;
+ union {
+ unsigned dev;
+ unsigned group;
+ };
+};
+
+#define TARGET_DEV_START 1
+#define TARGET_GROUP_START (256 + TARGET_DEV_START)
+
+static inline u16 dev_to_target(unsigned dev)
+{
+ return TARGET_DEV_START + dev;
+}
+
+static inline u16 group_to_target(unsigned group)
+{
+ return TARGET_GROUP_START + group;
+}
+
+static inline struct target target_decode(unsigned target)
+{
+ if (target >= TARGET_GROUP_START)
+ return (struct target) {
+ .type = TARGET_GROUP,
+ .group = target - TARGET_GROUP_START
+ };
+
+ if (target >= TARGET_DEV_START)
+ return (struct target) {
+ .type = TARGET_DEV,
+			.dev = target - TARGET_DEV_START
+ };
+
+ return (struct target) { .type = TARGET_NULL };
+}
+
+static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
+{
+ struct target t = target_decode(target);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ return false;
+ case TARGET_DEV:
+ return ca->dev_idx == t.dev;
+ case TARGET_GROUP:
+ return ca->mi.group && ca->mi.group - 1 == t.group;
+ default:
+ BUG();
+ }
+}
+
+static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
+int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+
+const char *bch2_sb_validate_disk_groups(struct bch_sb *,
+ struct bch_sb_field *);
+
+#endif /* _BCACHEFS_DISK_GROUPS_H */
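
For orientation: the target word used throughout packs "none", devices and groups into disjoint integer ranges -- 0 is none, TARGET_DEV_START (1) through 256 are device indices, and TARGET_GROUP_START (257) upward are group indices. A quick userspace round-trip check of the same arithmetic (constants copied from the header above; illustration only, not part of the patch):

#include <assert.h>

#define TARGET_DEV_START	1
#define TARGET_GROUP_START	(256 + TARGET_DEV_START)

int main(void)
{
	unsigned dev = 3, group = 7;

	/* dev_to_target() / group_to_target() equivalents: */
	unsigned t_dev	 = TARGET_DEV_START + dev;
	unsigned t_group = TARGET_GROUP_START + group;

	/* target_decode() equivalents: */
	assert(t_dev >= TARGET_DEV_START && t_dev < TARGET_GROUP_START);
	assert(t_dev - TARGET_DEV_START == dev);
	assert(t_group >= TARGET_GROUP_START);
	assert(t_group - TARGET_GROUP_START == group);
	return 0;
}
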
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index f73e7562..c5d1e7cb 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -14,10 +14,12 @@
#include "checksum.h"
#include "debug.h"
#include "dirent.h"
+#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
+#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "util.h"
@@ -25,9 +27,6 @@
#include <trace/events/bcachefs.h>
-static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
- struct bkey_i *, struct bkey_i *);
-
static void sort_key_next(struct btree_node_iter_large *iter,
struct btree *b,
struct btree_node_iter_set *i)
@@ -160,9 +159,13 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
{
const struct bch_extent_ptr *ptr;
- extent_for_each_ptr(e, ptr)
- if (dev_in_target(c->devs[ptr->dev], target))
+ extent_for_each_ptr(e, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (dev_in_target(ca, target) &&
+ (!ptr->cached || !ptr_stale(ca, ptr)))
return ptr;
+ }
return NULL;
}
@@ -356,11 +359,25 @@ restart_narrow_pointers:
return true;
}
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+ struct bch_extent_crc_unpacked r)
+{
+ return (l.csum_type != r.csum_type ||
+ l.compression_type != r.compression_type ||
+ l.compressed_size != r.compressed_size ||
+ l.uncompressed_size != r.uncompressed_size ||
+ l.offset != r.offset ||
+ l.live_size != r.live_size ||
+ l.nonce != r.nonce ||
+ bch2_crc_cmp(l.csum, r.csum));
+}
+
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
{
union bch_extent_entry *entry = e.v->start;
union bch_extent_crc *crc, *prev = NULL;
- struct bch_extent_crc_unpacked u, prev_u;
+ struct bch_extent_crc_unpacked u, prev_u = { 0 };
while (entry != extent_entry_last(e)) {
union bch_extent_entry *next = extent_entry_next(entry);
@@ -382,7 +399,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto drop;
}
- if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
+ if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) {
/* identical to previous crc entry: */
goto drop;
}
@@ -439,13 +456,12 @@ static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
bch2_extent_drop_redundant_crcs(e);
}
-static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
- struct bkey_s k)
+bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k)
{
return bch2_extent_normalize(c, k);
}
-static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
{
switch (k->type) {
case BCH_EXTENT:
@@ -628,8 +644,7 @@ use:
/* Btree ptrs */
-static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (bkey_extent_is_cached(k.k))
return "cached";
@@ -671,8 +686,8 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
}
}
-static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k)
{
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
@@ -727,8 +742,8 @@ err:
mark.gen, (unsigned) mark.counter);
}
-static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
const char *invalid;
@@ -756,13 +771,6 @@ bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
return pick;
}
-const struct bkey_ops bch2_bkey_btree_ops = {
- .key_invalid = bch2_btree_ptr_invalid,
- .key_debugcheck = btree_ptr_debugcheck,
- .val_to_text = bch2_btree_ptr_to_text,
- .swab = bch2_ptr_swab,
-};
-
/* Extents */
static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
@@ -1436,7 +1444,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
}
static enum btree_insert_ret
-bch2_delete_fixup_extent(struct extent_insert_state *s)
+__bch2_delete_fixup_extent(struct extent_insert_state *s)
{
struct bch_fs *c = s->trans->c;
struct btree_iter *iter = s->insert->iter;
@@ -1450,8 +1458,7 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
- s->whiteout = *insert;
- s->do_journal = false;
+ s->whiteout = *insert;
while (bkey_cmp(s->committed, insert->k.p) < 0 &&
(ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
@@ -1474,12 +1481,12 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
overlap = bch2_extent_overlap(&insert->k, k.k);
ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
- if (ret != BTREE_INSERT_OK)
- goto stop;
+ if (ret)
+ break;
ret = extent_insert_advance_pos(s, k.s_c);
if (ret)
- goto stop;
+ break;
s->do_journal = true;
@@ -1520,25 +1527,65 @@ next:
bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
}
- if (ret == BTREE_INSERT_OK &&
- bkey_cmp(s->committed, insert->k.p) < 0)
- ret = extent_insert_advance_pos(s, bkey_s_c_null);
-stop:
- extent_insert_committed(s);
+ return ret;
+}
- bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res,
- gc_pos_btree_node(b));
+static enum btree_insert_ret
+__bch2_insert_fixup_extent(struct extent_insert_state *s)
+{
+ struct btree_iter *iter = s->insert->iter;
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree *b = l->b;
+ struct btree_node_iter *node_iter = &l->iter;
+ struct bkey_packed *_k;
+ struct bkey unpacked;
+ struct bkey_i *insert = s->insert->k;
+ enum btree_insert_ret ret = BTREE_INSERT_OK;
- EBUG_ON(bkey_cmp(iter->pos, s->committed));
- EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
- !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
+ while (bkey_cmp(s->committed, insert->k.p) < 0 &&
+ (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
+ (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
+ struct bset_tree *t = bch2_bkey_to_bset(b, _k);
+ struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
+ enum bch_extent_overlap overlap;
- bch2_cut_front(iter->pos, insert);
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+ EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
- if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+ break;
+
+ overlap = bch2_extent_overlap(&insert->k, k.k);
+
+ ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
+ if (ret)
+ break;
+
+ if (!k.k->size)
+ goto squash;
+
+ /*
+ * Only call advance pos & call hook for nonzero size extents:
+ */
+ ret = extent_insert_advance_pos(s, k.s_c);
+ if (ret)
+ break;
- EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
+ if (k.k->size &&
+ (k.k->needs_whiteout || bset_written(b, bset(b, t))))
+ insert->k.needs_whiteout = true;
+
+ if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+ bkey_whiteout(k.k) &&
+ k.k->needs_whiteout) {
+ unreserve_whiteout(b, t, _k);
+ _k->needs_whiteout = false;
+ }
+squash:
+ ret = extent_squash(s, insert, t, _k, k, overlap);
+ if (ret != BTREE_INSERT_OK)
+ break;
+ }
return ret;
}
@@ -1590,9 +1637,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
struct btree_iter *iter = insert->iter;
struct btree_iter_level *l = &iter->l[0];
struct btree *b = l->b;
- struct btree_node_iter *node_iter = &l->iter;
- struct bkey_packed *_k;
- struct bkey unpacked;
enum btree_insert_ret ret = BTREE_INSERT_OK;
struct extent_insert_state s = {
@@ -1605,9 +1649,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
EBUG_ON(iter->level);
EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
- if (s.deleting)
- return bch2_delete_fixup_extent(&s);
-
/*
* As we process overlapping extents, we advance @iter->pos both to
* signal to our caller (btree_insert_key()) how much of @insert->k has
@@ -1616,67 +1657,32 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
*/
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
- if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ if (!s.deleting &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
bkey_start_offset(&insert->k->k),
insert->k->k.size);
- while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
- (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
- (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
- struct bset_tree *t = bch2_bkey_to_bset(b, _k);
- struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
- enum bch_extent_overlap overlap;
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
- EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
- break;
-
- overlap = bch2_extent_overlap(&insert->k->k, k.k);
-
- ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
- if (ret != BTREE_INSERT_OK)
- goto stop;
-
- if (!k.k->size)
- goto squash;
-
- /*
- * Only call advance pos & call hook for nonzero size extents:
- */
- ret = extent_insert_advance_pos(&s, k.s_c);
- if (ret != BTREE_INSERT_OK)
- goto stop;
-
- if (k.k->size &&
- (k.k->needs_whiteout || bset_written(b, bset(b, t))))
- insert->k->k.needs_whiteout = true;
-
- if (overlap == BCH_EXTENT_OVERLAP_ALL &&
- bkey_whiteout(k.k) &&
- k.k->needs_whiteout) {
- unreserve_whiteout(b, t, _k);
- _k->needs_whiteout = false;
- }
-squash:
- ret = extent_squash(&s, insert->k, t, _k, k, overlap);
- if (ret != BTREE_INSERT_OK)
- goto stop;
- }
+ ret = !s.deleting
+ ? __bch2_insert_fixup_extent(&s)
+ : __bch2_delete_fixup_extent(&s);
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s.committed, insert->k->k.p) < 0)
ret = extent_insert_advance_pos(&s, bkey_s_c_null);
-stop:
+
extent_insert_committed(&s);
+
+ if (s.deleting)
+ bch2_cut_front(iter->pos, insert->k);
+
/*
* Subtract any remaining sectors from @insert, if we bailed out early
* and didn't fully insert @insert:
*/
- if (insert->k->k.size &&
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ if (!s.deleting &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
+ insert->k->k.size)
bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
bkey_start_offset(&insert->k->k),
insert->k->k.size);
@@ -1692,13 +1698,13 @@ stop:
if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
- EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
+ WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0),
+ "ret %u insert->k.size %u", ret, insert->k->k.size);
return ret;
}
-static const char *bch2_extent_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
return "value too big";
@@ -1865,8 +1871,7 @@ bad_ptr:
return;
}
-static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
+void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
switch (k.k->type) {
case BCH_EXTENT:
@@ -1880,8 +1885,8 @@ static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
}
}
-static void bch2_extent_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_extent_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
const char *invalid;
@@ -1963,7 +1968,7 @@ void bch2_extent_crc_append(struct bkey_i_extent *e,
extent_for_each_crc(extent_i_to_s(e), crc, i)
;
- if (!memcmp(&crc, &new, sizeof(crc)))
+ if (!bch2_crc_unpacked_cmp(crc, new))
return;
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
@@ -2089,9 +2094,8 @@ void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
}
}
-static enum merge_result bch2_extent_merge(struct bch_fs *c,
- struct btree *bk,
- struct bkey_i *l, struct bkey_i *r)
+enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
+ struct bkey_i *l, struct bkey_i *r)
{
struct bkey_s_extent el, er;
union bch_extent_entry *en_l, *en_r;
@@ -2410,13 +2414,3 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
return ret;
}
-
-const struct bkey_ops bch2_bkey_extent_ops = {
- .key_invalid = bch2_extent_invalid,
- .key_debugcheck = bch2_extent_debugcheck,
- .val_to_text = bch2_extent_to_text,
- .swab = bch2_ptr_swab,
- .key_normalize = bch2_ptr_normalize,
- .key_merge = bch2_extent_merge,
- .is_extents = true,
-};
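
Why bch2_crc_unpacked_cmp() replaces memcmp() in the hunks above: struct bch_extent_crc_unpacked contains compiler-inserted padding, so whole-struct memcmp() can compare uninitialized padding bytes and report logically identical CRCs as different (prev_u likewise gains a zero initializer so it is never compared while uninitialized). A standalone demonstration of the pitfall, using a made-up struct:

#include <stdio.h>
#include <string.h>

struct crc_like {
	unsigned char	type;	/* padding typically follows here */
	unsigned	size;
};

int main(void)
{
	struct crc_like a, b;

	memset(&a, 0x00, sizeof(a));
	memset(&b, 0xff, sizeof(b));	/* different garbage in padding */

	a.type = b.type = 1;
	a.size = b.size = 42;

	printf("fieldwise equal: %d, memcmp equal: %d\n",
	       a.type == b.type && a.size == b.size,
	       !memcmp(&a, &b, sizeof(a)));
	return 0;
}
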
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 376e51c9..8dc15484 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -15,6 +15,36 @@ struct extent_insert_hook;
struct bch_devs_mask;
union bch_extent_crc;
+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
+ struct bkey_s_c);
+void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+
+#define bch2_bkey_btree_ops (struct bkey_ops) { \
+ .key_invalid = bch2_btree_ptr_invalid, \
+ .key_debugcheck = bch2_btree_ptr_debugcheck, \
+ .val_to_text = bch2_btree_ptr_to_text, \
+ .swab = bch2_ptr_swab, \
+}
+
+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
+enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
+ struct bkey_i *, struct bkey_i *);
+
+#define bch2_bkey_extent_ops (struct bkey_ops) { \
+ .key_invalid = bch2_extent_invalid, \
+ .key_debugcheck = bch2_extent_debugcheck, \
+ .val_to_text = bch2_extent_to_text, \
+ .swab = bch2_ptr_swab, \
+ .key_normalize = bch2_ptr_normalize, \
+ .key_merge = bch2_extent_merge, \
+ .is_extents = true, \
+}
+
struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
struct btree *,
struct btree_node_iter_large *);
@@ -23,9 +53,6 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct btree *,
struct btree_node_iter_large *);
-extern const struct bkey_ops bch2_bkey_btree_ops;
-extern const struct bkey_ops bch2_bkey_extent_ops;
-
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
struct bch_devs_mask *avoid);
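
The bch2_bkey_*_ops definitions above change from extern const structs to macros expanding to C99 compound literals, so each key type's ops stay next to the functions they reference and can still be gathered into one dispatch table (presumably in bkey_methods.c, per the diffstat). A minimal sketch of the pattern, with invented names:

#include <stdio.h>

struct ops { int (*validate)(int); };

static int v1(int x) { return x > 0; }
static int v2(int x) { return x != 0; }

/* compound-literal macros, same shape as bch2_bkey_extent_ops */
#define TYPE_A_OPS (struct ops) { .validate = v1 }
#define TYPE_B_OPS (struct ops) { .validate = v2 }

int main(void)
{
	struct ops table[] = { TYPE_A_OPS, TYPE_B_OPS };

	printf("%d %d\n", table[0].validate(5), table[1].validate(-1));
	return 0;
}
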
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index cb90738c..d1473f2a 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -468,7 +468,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
}
BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
- BUG_ON(!ret != !k->k.size);
+
+ if (WARN_ONCE(!ret != !k->k.size,
+ "ret %i k->size %u", ret, k->k.size))
+ ret = k->k.size ? -EINTR : 0;
err:
if (ret == -EINTR)
continue;
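
Note the idiom introduced above: WARN_ONCE() evaluates to its condition, so a single expression both logs the inconsistency (once) and selects the recovery path, where the old BUG_ON() simply killed the machine. A userspace-flavoured sketch of that shape -- the macro below is a simplified stand-in for the kernel's, and assumes GNU C statement expressions:

#include <stdio.h>

#define WARN_ONCE(cond, fmt, ...)					\
	({								\
		static int warned;					\
		int _c = !!(cond);					\
		if (_c && !warned) {					\
			warned = 1;					\
			fprintf(stderr, fmt "\n", ##__VA_ARGS__);	\
		}							\
		_c;							\
	})

int main(void)
{
	for (int i = 0; i < 3; i++)
		if (WARN_ONCE(i == 1, "unexpected i=%d", i))
			continue;	/* recover instead of aborting */
	return 0;
}
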
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 797aa2a9..3ae5ac97 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -175,8 +175,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
return 0;
}
-static const char *bch2_inode_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (k.k->p.offset)
return "nonzero offset";
@@ -224,8 +223,8 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
}
}
-static void bch2_inode_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_inode_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
@@ -247,11 +246,6 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
}
}
-const struct bkey_ops bch2_bkey_inode_ops = {
- .key_invalid = bch2_inode_invalid,
- .val_to_text = bch2_inode_to_text,
-};
-
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 5c7aeadc..26461063 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -5,7 +5,13 @@
#include <linux/math64.h>
-extern const struct bkey_ops bch2_bkey_inode_ops;
+const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_inode_ops (struct bkey_ops) { \
+ .key_invalid = bch2_inode_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+}
struct bch_inode_unpacked {
u64 bi_inum;
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 7ee9c392..27e45081 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -20,6 +20,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "tier.h"
@@ -196,8 +197,6 @@ static void bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- BUG_ON(!(op->flags & BCH_WRITE_DONE));
-
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal);
@@ -205,7 +204,6 @@ static void bch2_write_done(struct closure *cl)
bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
- op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
closure_return(cl);
}
@@ -232,9 +230,8 @@ int bch2_write_index_default(struct bch_write_op *op)
/**
* bch_write_index - after a write, update index to point to new data
*/
-static void bch2_write_index(struct closure *cl)
+static void __bch2_write_index(struct bch_write_op *op)
{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_s_extent e;
@@ -242,8 +239,6 @@ static void bch2_write_index(struct closure *cl)
struct bkey_i *src, *dst = keys->keys, *n, *k;
int ret;
- op->flags |= BCH_WRITE_LOOPED;
-
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
bkey_copy(dst, src);
@@ -292,9 +287,19 @@ static void bch2_write_index(struct closure *cl)
}
out:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
+ return;
+err:
+ keys->top = keys->keys;
+ op->error = ret;
+ goto out;
+}
- if (!(op->flags & BCH_WRITE_DONE))
- continue_at(cl, __bch2_write, op->io_wq);
+static void bch2_write_index(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_fs *c = op->c;
+
+ __bch2_write_index(op);
if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
bch2_journal_flush_seq_async(&c->journal,
@@ -304,12 +309,6 @@ out:
} else {
continue_at_nobarrier(cl, bch2_write_done, NULL);
}
- return;
-err:
- keys->top = keys->keys;
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
- goto out;
}
static void bch2_write_endio(struct bio *bio)
@@ -730,18 +729,18 @@ static void __bch2_write(struct closure *cl)
struct bch_fs *c = op->c;
struct write_point *wp;
int ret;
-
+again:
do {
/* +1 for possible cache device: */
if (op->open_buckets_nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets))
- continue_at(cl, bch2_write_index, index_update_wq(op));
+ goto flush_io;
if (bch2_keylist_realloc(&op->insert_keys,
op->inline_keys,
ARRAY_SIZE(op->inline_keys),
BKEY_EXTENT_U64s_MAX))
- continue_at(cl, bch2_write_index, index_update_wq(op));
+ goto flush_io;
wp = bch2_alloc_sectors_start(c,
op->target,
@@ -760,33 +759,7 @@ static void __bch2_write(struct closure *cl)
goto err;
}
- /*
- * If we already have some keys, must insert them first
- * before allocating another open bucket. We only hit
- * this case if open_bucket_nr > 1.
- */
- if (!bch2_keylist_empty(&op->insert_keys))
- continue_at(cl, bch2_write_index,
- index_update_wq(op));
-
- /*
- * If we've looped, we're running out of a workqueue -
- * not the bch2_write() caller's context - and we don't
- * want to block the workqueue:
- */
- if (op->flags & BCH_WRITE_LOOPED)
- continue_at(cl, __bch2_write, op->io_wq);
-
- /*
- * Otherwise, we do want to block the caller on alloc
- * failure instead of letting it queue up more and more
- * writes:
- * XXX: this technically needs a try_to_freeze() -
- * except that that's not safe because caller may have
- * issued other IO... hmm..
- */
- closure_sync(cl);
- continue;
+ goto flush_io;
}
ret = bch2_write_extent(op, wp);
@@ -802,28 +775,24 @@ static void __bch2_write(struct closure *cl)
goto err;
} while (ret);
- op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
err:
- /*
- * Right now we can only error here if we went RO - the
- * allocation failed, but we already checked for -ENOSPC when we
- * got our reservation.
- *
- * XXX capacity might have changed, but we don't check for that
- * yet:
- */
op->error = ret;
- op->flags |= BCH_WRITE_DONE;
- /*
- * No reason not to insert keys for whatever data was successfully
- * written (especially for a cmpxchg operation that's moving data
- * around)
- */
continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
? bch2_write_index
: bch2_write_done, index_update_wq(op));
+flush_io:
+ closure_sync(cl);
+
+ if (!bch2_keylist_empty(&op->insert_keys)) {
+ __bch2_write_index(op);
+
+ if (op->error)
+ continue_at_nobarrier(cl, bch2_write_done, NULL);
+ }
+
+ goto again;
}
/**
@@ -969,7 +938,7 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
if (percpu_ref_is_dying(&c->writes))
return false;
- return bch2_extent_has_target(c, e, target);
+ return bch2_extent_has_target(c, e, target) == NULL;
}
/* Read */
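
The io.c changes above retire the BCH_WRITE_DONE/BCH_WRITE_LOOPED state machine: when allocation would block, __bch2_write() no longer re-queues itself on a workqueue with flags recording progress; it waits (closure_sync()), flushes the keys it already has via __bch2_write_index(), and loops back. The control flow, reduced to a runnable toy where alloc_bucket() and flush_pending() are inventions:

#include <stdio.h>

static int budget = 3;

static int alloc_bucket(void)
{
	if (!budget)
		return -1;		/* would block: flush, then retry */
	budget--;
	return 0;
}

static void flush_pending(void)
{
	printf("flushing pending keys\n");
	budget = 3;			/* flushing frees resources */
}

static void write_loop(int work)
{
again:
	while (work) {
		if (alloc_bucket() < 0)
			goto flush_io;
		work--;
	}
	printf("done\n");		/* continue_at(bch2_write_index) */
	return;
flush_io:
	/* closure_sync() in the real code */
	flush_pending();		/* __bch2_write_index() */
	goto again;
}

int main(void)
{
	write_loop(7);
	return 0;
}
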
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index bf0b17e1..a0c795ab 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -36,8 +36,6 @@ enum bch_write_flags {
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
- BCH_WRITE_DONE = (1 << 10),
- BCH_WRITE_LOOPED = (1 << 11),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index e5000767..b525a85c 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -19,6 +19,7 @@
#include "io.h"
#include "keylist.h"
#include "journal.h"
+#include "replicas.h"
#include "super-io.h"
#include "vstructs.h"
@@ -1582,40 +1583,19 @@ err:
return ret;
}
-/*
- * Allocate more journal space at runtime - not currently making use if it, but
- * the code works:
- */
-static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
+ bool new_fs, struct closure *cl)
{
- struct journal *j = &c->journal;
+ struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets;
- struct disk_reservation disk_res = { 0, 0 };
- struct closure cl;
u64 *new_bucket_seq = NULL, *new_buckets = NULL;
int ret = 0;
- closure_init_stack(&cl);
-
/* don't handle reducing nr of buckets yet: */
if (nr <= ja->nr)
return 0;
- /*
- * note: journal buckets aren't really counted as _sectors_ used yet, so
- * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
- * when space used goes up without a reservation - but we do need the
- * reservation to ensure we'll actually be able to allocate:
- */
-
- if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0))
- return -ENOSPC;
-
- mutex_lock(&c->sb_lock);
-
ret = -ENOMEM;
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
@@ -1627,29 +1607,41 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
if (!journal_buckets)
goto err;
- spin_lock(&j->lock);
+ if (c)
+ spin_lock(&c->journal.lock);
+
memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
- spin_unlock(&j->lock);
+
+ if (c)
+ spin_unlock(&c->journal.lock);
while (ja->nr < nr) {
- struct open_bucket *ob;
- size_t bucket;
- int ob_idx;
+ struct open_bucket *ob = NULL;
+ long bucket;
- ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
- if (ob_idx < 0) {
- if (!closure_wait(&c->freelist_wait, &cl))
- closure_sync(&cl);
- continue;
+ if (new_fs) {
+ bucket = bch2_bucket_alloc_new_fs(ca);
+ if (bucket < 0) {
+ ret = -ENOSPC;
+ goto err;
+ }
+ } else {
+ int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl);
+ if (ob_idx < 0) {
+ ret = cl ? -EAGAIN : -ENOSPC;
+ goto err;
+ }
+
+ ob = c->open_buckets + ob_idx;
+ bucket = sector_to_bucket(ca, ob->ptr.offset);
}
- ob = c->open_buckets + ob_idx;
- bucket = sector_to_bucket(ca, ob->ptr.offset);
+ if (c)
+ spin_lock(&c->journal.lock);
- spin_lock(&j->lock);
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
__array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
@@ -1664,34 +1656,77 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
ja->last_idx++;
}
ja->nr++;
- spin_unlock(&j->lock);
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB), 0);
+ if (c)
+ spin_unlock(&c->journal.lock);
- bch2_open_bucket_put(c, ob);
+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB),
+ new_fs
+ ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
+ : 0);
+
+ if (!new_fs)
+ bch2_open_bucket_put(c, ob);
}
- bch2_write_super(c);
-
ret = 0;
err:
- mutex_unlock(&c->sb_lock);
-
kfree(new_bucket_seq);
kfree(new_buckets);
- bch2_disk_reservation_put(c, &disk_res);
- if (!ret)
- bch2_dev_allocator_add(c, ca);
+ return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr)
+{
+ struct journal_device *ja = &ca->journal;
+ struct closure cl;
+ unsigned current_nr;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ do {
+ struct disk_reservation disk_res = { 0, 0 };
+
+ closure_sync(&cl);
+
+ mutex_lock(&c->sb_lock);
+ current_nr = ja->nr;
+
+ /*
+ * note: journal buckets aren't really counted as _sectors_ used yet, so
+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+ * when space used goes up without a reservation - but we do need the
+ * reservation to ensure we'll actually be able to allocate:
+ */
+
+ if (bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+ mutex_unlock(&c->sb_lock);
+ return -ENOSPC;
+ }
+
+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+
+ bch2_disk_reservation_put(c, &disk_res);
- closure_sync(&cl);
+ if (ja->nr != current_nr)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ } while (ret == -EAGAIN);
return ret;
}
-int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
+int bch2_dev_journal_alloc(struct bch_dev *ca)
{
unsigned nr;
@@ -1707,7 +1742,7 @@ int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
min(1 << 10,
(1 << 20) / ca->mi.bucket_size));
- return bch2_set_nr_journal_buckets(c, ca, nr);
+ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
}
/* Journalling */
@@ -2320,8 +2355,8 @@ static void journal_write(struct closure *cl)
journal_write_compact(jset);
- jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
+ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
+ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
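
The journal resize rework above splits the bucket allocation loop (__bch2_set_nr_journal_buckets(), which with new_fs can run at format time when there is no filesystem to lock or allocator to wait on) from the runtime wrapper, which now retries: a pass can return -EAGAIN when the allocator would block, so the wrapper syncs the closure and goes around again, rewriting the superblock only when ja->nr actually advanced. The retry shape as a toy sketch (grow() is an invention):

#include <errno.h>
#include <stdio.h>

static int blocked = 2;	/* passes on which the "allocator" blocks */

static int grow(unsigned target, unsigned *nr)
{
	while (*nr < target) {
		if (blocked) {
			blocked--;
			return -EAGAIN;
		}
		(*nr)++;
	}
	return 0;
}

int main(void)
{
	unsigned nr = 0;
	int ret;

	do {
		unsigned old_nr = nr;	/* closure_sync() would go here */

		ret = grow(3, &nr);
		if (nr != old_nr)
			printf("write_super, nr now %u\n", nr);
	} while (ret == -EAGAIN);

	printf("ret %d, nr %u\n", ret, nr);
	return 0;
}
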
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 46ae8f0d..cf5cc9ba 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -400,7 +400,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
-int bch2_dev_journal_alloc(struct bch_fs *, struct bch_dev *);
+int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 1bc0e714..ea519102 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -11,6 +11,7 @@
#include "keylist.h"
#include "migrate.h"
#include "move.h"
+#include "replicas.h"
#include "super-io.h"
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 07d2e2c8..87e6e80c 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -6,6 +6,7 @@
#include "inode.h"
#include "io.h"
#include "move.h"
+#include "replicas.h"
#include "super-io.h"
#include "keylist.h"
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 3b4a5292..28dabca7 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -9,6 +9,7 @@
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
+#include "disk_groups.h"
#include "extents.h"
#include "eytzinger.h"
#include "io.h"
@@ -51,7 +52,7 @@ static inline int sectors_used_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
{
- return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
+ return (l.sectors > r.sectors) - (l.sectors < r.sectors);
}
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
@@ -78,7 +79,7 @@ static bool __copygc_pred(struct bch_dev *ca,
return (i >= 0 &&
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
- ptr->gen == h->data[i].mark.gen);
+ ptr->gen == h->data[i].gen);
}
return false;
@@ -154,8 +155,9 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
continue;
e = (struct copygc_heap_entry) {
- .offset = bucket_to_sector(ca, b),
- .mark = m
+ .gen = m.gen,
+ .sectors = bucket_sectors_used(m),
+ .offset = bucket_to_sector(ca, b),
};
heap_add_or_replace(h, e, -sectors_used_cmp);
}
@@ -163,11 +165,11 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
up_read(&c->gc_lock);
for (i = h->data; i < h->data + h->used; i++)
- sectors_to_move += bucket_sectors_used(i->mark);
+ sectors_to_move += i->sectors;
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
- sectors_to_move -= bucket_sectors_used(e.mark);
+ sectors_to_move -= e.sectors;
}
buckets_to_move = h->used;
@@ -191,7 +193,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
size_t b = sector_to_bucket(ca, i->offset);
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
+ if (i->gen == m.gen && bucket_sectors_used(m)) {
sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
@@ -284,7 +286,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
- t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
+ t = kthread_create(bch2_copygc_thread, ca,
+ "bch_copygc[%s]", ca->name);
if (IS_ERR(t))
return PTR_ERR(t);
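
The sectors_used_cmp() change above is the classic overflow fix for comparators: returning l - r on unsigned sector counts truncates to int and can yield 0 or the wrong sign for widely separated values, while (l > r) - (l < r) always yields exactly -1, 0 or 1. A standalone demonstration with toy values:

#include <stdio.h>

static int cmp_sub(unsigned long long l, unsigned long long r)
{
	return l - r;			/* truncation can hide the sign */
}

static int cmp_safe(unsigned long long l, unsigned long long r)
{
	return (l > r) - (l < r);	/* always -1, 0 or 1 */
}

int main(void)
{
	unsigned long long l = 1, r = 0x100000001ULL;	/* r > l */

	/* cmp_sub() wrongly reports "equal" here: */
	printf("sub: %d  safe: %d\n", cmp_sub(l, r), cmp_safe(l, r));
	return 0;
}
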
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 326b8ad9..8db8096e 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include "bcachefs.h"
+#include "disk_groups.h"
#include "opts.h"
#include "super-io.h"
#include "util.h"
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index d28f1333..bb03d83a 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -4,7 +4,22 @@
#include "quota.h"
#include "super-io.h"
-static const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static const char *bch2_sb_validate_quota(struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+
+ if (vstruct_bytes(&q->field) != sizeof(*q))
+ return "invalid field quota: wrong size";
+
+ return NULL;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
+ .validate = bch2_sb_validate_quota,
+};
+
+const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_quota dq;
@@ -30,8 +45,8 @@ static const char * const bch2_quota_counters[] = {
"inodes",
};
-static void bch2_quota_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_quota_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
	char *out = buf, *end = buf + size;
struct bkey_s_c_quota dq;
@@ -50,11 +65,6 @@ static void bch2_quota_to_text(struct bch_fs *c, char *buf,
}
}
-const struct bkey_ops bch2_bkey_quota_ops = {
- .key_invalid = bch2_quota_invalid,
- .val_to_text = bch2_quota_to_text,
-};
-
#ifdef CONFIG_BCACHEFS_QUOTA
#include <linux/cred.h>
@@ -399,7 +409,7 @@ static void bch2_sb_quota_read(struct bch_fs *c)
struct bch_sb_field_quota *sb_quota;
unsigned i, j;
- sb_quota = bch2_sb_get_quota(c->disk_sb);
+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
if (!sb_quota)
return;
@@ -476,13 +486,13 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_UDQ_ENFD)
- SET_BCH_SB_USRQUOTA(c->disk_sb, true);
+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
if (uflags & FS_QUOTA_GDQ_ENFD)
- SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
if (uflags & FS_QUOTA_PDQ_ENFD)
- SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -499,13 +509,13 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_UDQ_ENFD)
- SET_BCH_SB_USRQUOTA(c->disk_sb, false);
+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
if (uflags & FS_QUOTA_GDQ_ENFD)
- SET_BCH_SB_GRPQUOTA(c->disk_sb, false);
+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
if (uflags & FS_QUOTA_PDQ_ENFD)
- SET_BCH_SB_PRJQUOTA(c->disk_sb, false);
+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -616,9 +626,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
q = &c->quotas[type];
mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_quota(c->disk_sb);
+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
if (!sb_quota) {
- sb_quota = bch2_fs_sb_resize_quota(c, sizeof(*sb_quota) / sizeof(u64));
+ sb_quota = bch2_sb_resize_quota(&c->disk_sb,
+ sizeof(*sb_quota) / sizeof(u64));
if (!sb_quota)
return -ENOSPC;
}
diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h
index 509b7f0e..0b24f22c 100644
--- a/libbcachefs/quota.h
+++ b/libbcachefs/quota.h
@@ -1,9 +1,18 @@
#ifndef _BCACHEFS_QUOTA_H
#define _BCACHEFS_QUOTA_H
+#include "inode.h"
#include "quota_types.h"
-extern const struct bkey_ops bch2_bkey_quota_ops;
+extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
+
+const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_quota_ops (struct bkey_ops) { \
+ .key_invalid = bch2_quota_invalid, \
+ .val_to_text = bch2_quota_to_text, \
+}
enum quota_acct_mode {
BCH_QUOTA_PREALLOC,
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
new file mode 100644
index 00000000..6c52d1d4
--- /dev/null
+++ b/libbcachefs/replicas.c
@@ -0,0 +1,698 @@
+
+#include "bcachefs.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+ struct bch_replicas_cpu *);
+
+/* Replicas tracking - in memory: */
+
+#define for_each_cpu_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+ _i = (void *) (_i) + (_r)->entry_size)
+
+static inline struct bch_replicas_cpu_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+ return (void *) r->entries + r->entry_size * i;
+}
+
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
+static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
+ unsigned dev)
+{
+ return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
+}
+
+static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
+ unsigned dev)
+{
+ e->devs[dev >> 3] |= 1 << (dev & 7);
+}
+
+static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
+{
+ return (r->entry_size -
+ offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
+}
+
+int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
+ char *buf, size_t size)
+{
+ char *out = buf, *end = out + size;
+ struct bch_replicas_cpu_entry *e;
+ bool first = true;
+ unsigned i;
+
+ for_each_cpu_replicas_entry(r, e) {
+ bool first_e = true;
+
+ if (!first)
+ out += scnprintf(out, end - out, " ");
+ first = false;
+
+ out += scnprintf(out, end - out, "%u: [", e->data_type);
+
+ for (i = 0; i < replicas_dev_slots(r); i++)
+ if (replicas_test_dev(e, i)) {
+ if (!first_e)
+ out += scnprintf(out, end - out, " ");
+ first_e = false;
+ out += scnprintf(out, end - out, "%u", i);
+ }
+ out += scnprintf(out, end - out, "]");
+ }
+
+ return out - buf;
+}
+
+static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
+ enum bch_data_type data_type,
+ struct bch_replicas_cpu_entry *r,
+ unsigned *max_dev)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr = 0;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_SB ||
+ data_type >= BCH_DATA_NR);
+
+ memset(r, 0, sizeof(*r));
+ r->data_type = data_type;
+
+ *max_dev = 0;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached) {
+ *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+ replicas_set_dev(r, ptr->dev);
+ nr++;
+ }
+ return nr;
+}
+
+static inline void devlist_to_replicas(struct bch_devs_list devs,
+ enum bch_data_type data_type,
+ struct bch_replicas_cpu_entry *r,
+ unsigned *max_dev)
+{
+ unsigned i;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_SB ||
+ data_type >= BCH_DATA_NR);
+
+ memset(r, 0, sizeof(*r));
+ r->data_type = data_type;
+
+ *max_dev = 0;
+
+ for (i = 0; i < devs.nr; i++) {
+ *max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
+ replicas_set_dev(r, devs.devs[i]);
+ }
+}
+
+static struct bch_replicas_cpu *
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+ struct bch_replicas_cpu_entry new_entry,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *new;
+ unsigned i, nr, entry_size;
+
+ entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+ DIV_ROUND_UP(max_dev + 1, 8);
+ entry_size = max(entry_size, old->entry_size);
+ nr = old->nr + 1;
+
+ new = kzalloc(sizeof(struct bch_replicas_cpu) +
+ nr * entry_size, GFP_NOIO);
+ if (!new)
+ return NULL;
+
+ new->nr = nr;
+ new->entry_size = entry_size;
+
+ for (i = 0; i < old->nr; i++)
+ memcpy(cpu_replicas_entry(new, i),
+ cpu_replicas_entry(old, i),
+ min(new->entry_size, old->entry_size));
+
+ memcpy(cpu_replicas_entry(new, old->nr),
+ &new_entry,
+ new->entry_size);
+
+ bch2_cpu_replicas_sort(new);
+ return new;
+}
+
+static bool replicas_has_entry(struct bch_replicas_cpu *r,
+ struct bch_replicas_cpu_entry search,
+ unsigned max_dev)
+{
+ return max_dev < replicas_dev_slots(r) &&
+ eytzinger0_find(r->entries, r->nr,
+ r->entry_size,
+ memcmp, &search) < r->nr;
+}
+
+noinline
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
+ struct bch_replicas_cpu_entry new_entry,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
+ int ret = -ENOMEM;
+
+ mutex_lock(&c->sb_lock);
+
+ old_gc = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+ if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
+ new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
+ if (!new_gc)
+ goto err;
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+ if (!replicas_has_entry(old_r, new_entry, max_dev)) {
+ new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
+ if (!new_r)
+ goto err;
+
+ ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+ if (ret)
+ goto err;
+ }
+
+ /* allocations done, now commit: */
+
+ if (new_r)
+ bch2_write_super(c);
+
+ /* don't update in memory replicas until changes are persistent */
+
+ if (new_gc) {
+ rcu_assign_pointer(c->replicas_gc, new_gc);
+ kfree_rcu(old_gc, rcu);
+ }
+
+ if (new_r) {
+ rcu_assign_pointer(c->replicas, new_r);
+ kfree_rcu(old_r, rcu);
+ }
+
+ mutex_unlock(&c->sb_lock);
+ return 0;
+err:
+ mutex_unlock(&c->sb_lock);
+ if (new_gc)
+ kfree(new_gc);
+ if (new_r)
+ kfree(new_r);
+ return ret;
+}
+
+int bch2_mark_replicas(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
+{
+ struct bch_replicas_cpu_entry search;
+ struct bch_replicas_cpu *r, *gc_r;
+ unsigned max_dev;
+ bool marked;
+
+ if (!devs.nr)
+ return 0;
+
+ BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+
+ devlist_to_replicas(devs, data_type, &search, &max_dev);
+
+ rcu_read_lock();
+ r = rcu_dereference(c->replicas);
+ gc_r = rcu_dereference(c->replicas_gc);
+ marked = replicas_has_entry(r, search, max_dev) &&
+ (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
+ rcu_read_unlock();
+
+ return likely(marked) ? 0
+ : bch2_mark_replicas_slowpath(c, search, max_dev);
+}
+
+int bch2_mark_bkey_replicas(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bkey_s_c k)
+{
+ struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < cached.nr; i++)
+ if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
+ bch2_dev_list_single(cached.devs[i]))))
+ return ret;
+
+ return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+ struct bch_replicas_cpu *new_r, *old_r;
+ int ret = 0;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+
+ new_r = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+
+ if (err) {
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(new_r, rcu);
+ goto err;
+ }
+
+ if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ rcu_assign_pointer(c->replicas, new_r);
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(old_r, rcu);
+
+ bch2_write_super(c);
+err:
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+ struct bch_replicas_cpu *dst, *src;
+ struct bch_replicas_cpu_entry *e;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ BUG_ON(c->replicas_gc);
+
+ src = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ dst = kzalloc(sizeof(struct bch_replicas_cpu) +
+ src->nr * src->entry_size, GFP_NOIO);
+ if (!dst) {
+ mutex_unlock(&c->sb_lock);
+ return -ENOMEM;
+ }
+
+ dst->nr = 0;
+ dst->entry_size = src->entry_size;
+
+ for_each_cpu_replicas_entry(src, e)
+ if (!((1 << e->data_type) & typemask))
+ memcpy(cpu_replicas_entry(dst, dst->nr++),
+ e, dst->entry_size);
+
+ bch2_cpu_replicas_sort(dst);
+
+ rcu_assign_pointer(c->replicas_gc, dst);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+/* Replicas tracking - superblock: */
+
+static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
+ unsigned *nr,
+ unsigned *bytes,
+ unsigned *max_dev)
+{
+ struct bch_replicas_entry *i;
+ unsigned j;
+
+ *nr = 0;
+ *bytes = sizeof(*r);
+ *max_dev = 0;
+
+ if (!r)
+ return;
+
+ for_each_replicas_entry(r, i) {
+ for (j = 0; j < i->nr; j++)
+ *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
+ (*nr)++;
+ }
+
+ *bytes = (void *) i - (void *) r;
+}
+
+static struct bch_replicas_cpu *
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
+{
+ struct bch_replicas_cpu *cpu_r;
+ unsigned i, nr, bytes, max_dev, entry_size;
+
+ bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+
+ entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+ DIV_ROUND_UP(max_dev + 1, 8);
+
+ cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
+ nr * entry_size, GFP_NOIO);
+ if (!cpu_r)
+ return NULL;
+
+ cpu_r->nr = nr;
+ cpu_r->entry_size = entry_size;
+
+ if (nr) {
+ struct bch_replicas_cpu_entry *dst =
+ cpu_replicas_entry(cpu_r, 0);
+ struct bch_replicas_entry *src = sb_r->entries;
+
+ while (dst < cpu_replicas_entry(cpu_r, nr)) {
+ dst->data_type = src->data_type;
+ for (i = 0; i < src->nr; i++)
+ replicas_set_dev(dst, src->devs[i]);
+
+ src = replicas_entry_next(src);
+ dst = (void *) dst + entry_size;
+ }
+ }
+
+ bch2_cpu_replicas_sort(cpu_r);
+ return cpu_r;
+}
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_cpu *cpu_r, *old_r;
+
+ sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
+ cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+ if (!cpu_r)
+ return -ENOMEM;
+
+ old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
+ rcu_assign_pointer(c->replicas, cpu_r);
+ if (old_r)
+ kfree_rcu(old_r, rcu);
+
+ return 0;
+}
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_entry *sb_e;
+ struct bch_replicas_cpu_entry *e;
+ size_t i, bytes;
+
+ bytes = sizeof(struct bch_sb_field_replicas);
+
+ for_each_cpu_replicas_entry(r, e) {
+ bytes += sizeof(struct bch_replicas_entry);
+ for (i = 0; i < r->entry_size - 1; i++)
+ bytes += hweight8(e->devs[i]);
+ }
+
+ sb_r = bch2_sb_resize_replicas(&c->disk_sb,
+ DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+ if (!sb_r)
+ return -ENOSPC;
+
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
+
+ sb_e = sb_r->entries;
+ for_each_cpu_replicas_entry(r, e) {
+ sb_e->data_type = e->data_type;
+
+ for (i = 0; i < replicas_dev_slots(r); i++)
+ if (replicas_test_dev(e, i))
+ sb_e->devs[sb_e->nr++] = i;
+
+ sb_e = replicas_entry_next(sb_e);
+
+ BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+ }
+
+ return 0;
+}
+
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
+{
+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ struct bch_replicas_cpu *cpu_r = NULL;
+ struct bch_replicas_entry *e;
+ const char *err;
+ unsigned i;
+
+ for_each_replicas_entry(sb_r, e) {
+ err = "invalid replicas entry: invalid data type";
+ if (e->data_type >= BCH_DATA_NR)
+ goto err;
+
+ err = "invalid replicas entry: no devices";
+ if (!e->nr)
+ goto err;
+
+ err = "invalid replicas entry: too many devices";
+ if (e->nr >= BCH_REPLICAS_MAX)
+ goto err;
+
+ err = "invalid replicas entry: invalid device";
+ for (i = 0; i < e->nr; i++)
+ if (!bch2_dev_exists(sb, mi, e->devs[i]))
+ goto err;
+ }
+
+ err = "cannot allocate memory";
+ cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+ if (!cpu_r)
+ goto err;
+
+ sort_cmp_size(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ memcmp, NULL);
+
+ for (i = 0; i + 1 < cpu_r->nr; i++) {
+ struct bch_replicas_cpu_entry *l =
+ cpu_replicas_entry(cpu_r, i);
+ struct bch_replicas_cpu_entry *r =
+ cpu_replicas_entry(cpu_r, i + 1);
+
+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
+
+ err = "duplicate replicas entry";
+ if (!memcmp(l, r, cpu_r->entry_size))
+ goto err;
+ }
+
+ err = NULL;
+err:
+ kfree(cpu_r);
+ return err;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
+ .validate = bch2_sb_validate_replicas,
+};
+
+int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
+{
+ char *out = buf, *end = out + size;
+ struct bch_replicas_entry *e;
+ bool first = true;
+ unsigned i;
+
+ if (!r) {
+ out += scnprintf(out, end - out, "(no replicas section found)");
+ return out - buf;
+ }
+
+ for_each_replicas_entry(r, e) {
+ if (!first)
+ out += scnprintf(out, end - out, " ");
+ first = false;
+
+ out += scnprintf(out, end - out, "%u: [", e->data_type);
+
+ for (i = 0; i < e->nr; i++)
+ out += scnprintf(out, end - out,
+ i ? " %u" : "%u", e->devs[i]);
+ out += scnprintf(out, end - out, "]");
+ }
+
+ return out - buf;
+}
+
+/* Query replicas: */
+
+bool bch2_replicas_marked(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
+{
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+ bool ret;
+
+ if (!devs.nr)
+ return true;
+
+ devlist_to_replicas(devs, data_type, &search, &max_dev);
+
+ rcu_read_lock();
+ ret = replicas_has_entry(rcu_dereference(c->replicas),
+ search, max_dev);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bkey_s_c k)
+{
+ struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+ unsigned i;
+
+ for (i = 0; i < cached.nr; i++)
+ if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
+ bch2_dev_list_single(cached.devs[i])))
+ return false;
+
+ return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *c,
+ struct bch_devs_mask online_devs)
+{
+ struct bch_sb_field_members *mi;
+ struct bch_replicas_cpu_entry *e;
+ struct bch_replicas_cpu *r;
+ unsigned i, dev, dev_slots, nr_online, nr_offline;
+ struct replicas_status ret;
+
+ memset(&ret, 0, sizeof(ret));
+
+ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
+ ret.replicas[i].nr_online = UINT_MAX;
+
+ mi = bch2_sb_get_members(c->disk_sb.sb);
+ rcu_read_lock();
+
+ r = rcu_dereference(c->replicas);
+ dev_slots = replicas_dev_slots(r);
+
+ for_each_cpu_replicas_entry(r, e) {
+ if (e->data_type >= ARRAY_SIZE(ret.replicas))
+ panic("e %p data_type %u\n", e, e->data_type);
+
+ nr_online = nr_offline = 0;
+
+ for (dev = 0; dev < dev_slots; dev++) {
+ if (!replicas_test_dev(e, dev))
+ continue;
+
+ BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
+
+ if (test_bit(dev, online_devs.d))
+ nr_online++;
+ else
+ nr_offline++;
+ }
+
+ ret.replicas[e->data_type].nr_online =
+ min(ret.replicas[e->data_type].nr_online,
+ nr_online);
+
+ ret.replicas[e->data_type].nr_offline =
+ max(ret.replicas[e->data_type].nr_offline,
+ nr_offline);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+struct replicas_status bch2_replicas_status(struct bch_fs *c)
+{
+ return __bch2_replicas_status(c, bch2_online_devs(c));
+}
+
+static bool have_enough_devs(struct replicas_status s,
+ enum bch_data_type type,
+ bool force_if_degraded,
+ bool force_if_lost)
+{
+ return (!s.replicas[type].nr_offline || force_if_degraded) &&
+ (s.replicas[type].nr_online || force_if_lost);
+}
+
+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
+{
+ return (have_enough_devs(s, BCH_DATA_JOURNAL,
+ flags & BCH_FORCE_IF_METADATA_DEGRADED,
+ flags & BCH_FORCE_IF_METADATA_LOST) &&
+ have_enough_devs(s, BCH_DATA_BTREE,
+ flags & BCH_FORCE_IF_METADATA_DEGRADED,
+ flags & BCH_FORCE_IF_METADATA_LOST) &&
+ have_enough_devs(s, BCH_DATA_USER,
+ flags & BCH_FORCE_IF_DATA_DEGRADED,
+ flags & BCH_FORCE_IF_DATA_LOST));
+}
+
+unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
+{
+ struct replicas_status s = bch2_replicas_status(c);
+
+ return meta
+ ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
+ s.replicas[BCH_DATA_BTREE].nr_online)
+ : s.replicas[BCH_DATA_USER].nr_online;
+}
+
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bch_replicas_cpu_entry *e;
+ struct bch_replicas_cpu *r;
+ unsigned ret = 0;
+
+ rcu_read_lock();
+ r = rcu_dereference(c->replicas);
+
+ if (ca->dev_idx >= replicas_dev_slots(r))
+ goto out;
+
+ for_each_cpu_replicas_entry(r, e)
+ if (replicas_test_dev(e, ca->dev_idx))
+ ret |= 1 << e->data_type;
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
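
Note: bch2_dev_has_data() above returns a bitmask indexed by bch_data_type, so a caller can test which kinds of data a device still holds (e.g. before removal). A minimal sketch, assuming the usual BCH_DATA_* enum ordering; the helper name and label strings below are illustrative, not part of this commit:

/* Illustrative caller: decode the bch2_dev_has_data() bitmask.
 * report_dev_data() and the label strings are hypothetical;
 * ordering assumes BCH_DATA_NONE/SB/JOURNAL/BTREE/USER/CACHED. */
static void report_dev_data(struct bch_fs *c, struct bch_dev *ca)
{
	static const char * const labels[] = {
		"none", "sb", "journal", "btree", "user", "cached",
	};
	unsigned mask = bch2_dev_has_data(c, ca);
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(labels); i++)
		if (mask & (1U << i))
			pr_info("dev %u has %s data", ca->dev_idx, labels[i]);
}
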
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
new file mode 100644
index 00000000..49f114b0
--- /dev/null
+++ b/libbcachefs/replicas.h
@@ -0,0 +1,51 @@
+#ifndef _BCACHEFS_REPLICAS_H
+#define _BCACHEFS_REPLICAS_H
+
+bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
+ struct bch_devs_list);
+bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
+ struct bkey_s_c);
+int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
+ struct bch_devs_list);
+int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
+ struct bkey_s_c);
+
+int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
+int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
+
+struct replicas_status {
+ struct {
+ unsigned nr_online;
+ unsigned nr_offline;
+ } replicas[BCH_DATA_NR];
+};
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *,
+ struct bch_devs_mask);
+struct replicas_status bch2_replicas_status(struct bch_fs *);
+bool bch2_have_enough_devs(struct replicas_status, unsigned);
+
+unsigned bch2_replicas_online(struct bch_fs *, bool);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+
+/* iterate over superblock replicas - used by userspace tools: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+ return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
+
+#endif /* _BCACHEFS_REPLICAS_H */
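
Each bch_replicas_entry is variable length — a data_type/nr header followed by nr device indices — which is why replicas_entry_next() advances by offsetof(struct bch_replicas_entry, devs) + i->nr bytes rather than a fixed stride. A hedged, userspace-style sketch of walking the section with the iterator exported above (dump_replicas() is a hypothetical name):

/* Hypothetical tool-side walk over the superblock replicas section,
 * using the for_each_replicas_entry() iterator from replicas.h: */
static void dump_replicas(struct bch_sb_field_replicas *r)
{
	struct bch_replicas_entry *i;
	unsigned j;

	for_each_replicas_entry(r, i) {
		printf("type %u:", i->data_type);
		for (j = 0; j < i->nr; j++)
			printf(" %u", i->devs[j]);
		printf("\n");
	}
}
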
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 69101f3a..a2b981a3 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -1,8 +1,11 @@
#include "bcachefs.h"
#include "checksum.h"
+#include "disk_groups.h"
#include "error.h"
#include "io.h"
+#include "replicas.h"
+#include "quota.h"
#include "super-io.h"
#include "super.h"
#include "vstructs.h"
@@ -10,13 +13,6 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
-static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
- struct bch_replicas_cpu *);
-static int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
-
-/* superblock fields (optional/variable size sections: */
-
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
@@ -24,34 +20,8 @@ const char * const bch2_sb_fields[] = {
NULL
};
-#define x(f, nr) \
-static const char *bch2_sb_validate_##f(struct bch_sb *, struct bch_sb_field *);
- BCH_SB_FIELDS()
-#undef x
-
-struct bch_sb_field_ops {
- const char * (*validate)(struct bch_sb *, struct bch_sb_field *);
-};
-
-static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
-#define x(f, nr) \
- [BCH_SB_FIELD_##f] = { \
- .validate = bch2_sb_validate_##f, \
- },
- BCH_SB_FIELDS()
-#undef x
-};
-
-static const char *bch2_sb_field_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
-
-{
- unsigned type = le32_to_cpu(f->type);
-
- return type < BCH_SB_FIELD_NR
- ? bch2_sb_field_ops[type].validate(sb, f)
- : NULL;
-}
+static const char *bch2_sb_field_validate(struct bch_sb *,
+ struct bch_sb_field *);
struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
enum bch_sb_field_type type)
@@ -66,14 +36,18 @@ struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
return NULL;
}
-static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
- struct bch_sb_field *f,
- unsigned u64s)
+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
+ struct bch_sb_field *f,
+ unsigned u64s)
{
unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
+
+ BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) >
+ sb->page_order);
if (!f) {
- f = vstruct_last(sb);
+ f = vstruct_last(sb->sb);
memset(f, 0, sizeof(u64) * u64s);
f->u64s = cpu_to_le32(u64s);
f->type = 0;
@@ -84,13 +58,13 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
f->u64s = cpu_to_le32(u64s);
dst = vstruct_end(f);
- memmove(dst, src, vstruct_end(sb) - src);
+ memmove(dst, src, vstruct_end(sb->sb) - src);
if (dst > src)
memset(src, 0, dst - src);
}
- le32_add_cpu(&sb->u64s, u64s - old_u64s);
+ sb->sb->u64s = cpu_to_le32(sb_u64s);
return f;
}
@@ -108,26 +82,42 @@ void bch2_free_super(struct bch_sb_handle *sb)
memset(sb, 0, sizeof(*sb));
}
-static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order)
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
{
+ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+ unsigned order = get_order(new_bytes);
struct bch_sb *new_sb;
struct bio *bio;
+ if (sb->have_layout) {
+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
+
+ if (new_bytes > max_bytes) {
+ char buf[BDEVNAME_SIZE];
+
+ pr_err("%s: superblock too big: want %zu but have %llu",
+ bdevname(sb->bdev, buf), new_bytes, max_bytes);
+ return -ENOSPC;
+ }
+ }
+
if (sb->page_order >= order && sb->sb)
return 0;
if (dynamic_fault("bcachefs:add:super_realloc"))
return -ENOMEM;
- bio = bio_kmalloc(GFP_KERNEL, 1 << order);
- if (!bio)
- return -ENOMEM;
+ if (sb->have_bio) {
+ bio = bio_kmalloc(GFP_KERNEL, 1 << order);
+ if (!bio)
+ return -ENOMEM;
- if (sb->bio)
- bio_put(sb->bio);
- sb->bio = bio;
+ if (sb->bio)
+ bio_put(sb->bio);
+ sb->bio = bio;
+ }
- new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
+ new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
if (!new_sb)
return -ENOMEM;
@@ -142,45 +132,6 @@ static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order)
return 0;
}
-static int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
-{
- u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
- u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
-
- if (new_bytes > max_bytes) {
- char buf[BDEVNAME_SIZE];
-
- pr_err("%s: superblock too big: want %llu but have %llu",
- bdevname(sb->bdev, buf), new_bytes, max_bytes);
- return -ENOSPC;
- }
-
- return __bch2_super_realloc(sb, get_order(new_bytes));
-}
-
-static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s)
-{
- u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
- struct bch_sb *sb;
- unsigned order = get_order(bytes);
-
- if (c->disk_sb && order <= c->disk_sb_order)
- return 0;
-
- sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
- if (!sb)
- return -ENOMEM;
-
- if (c->disk_sb)
- memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);
-
- free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
-
- c->disk_sb = sb;
- c->disk_sb_order = order;
- return 0;
-}
-
struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
enum bch_sb_field_type type,
unsigned u64s)
@@ -192,38 +143,26 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
return NULL;
- f = __bch2_sb_field_resize(sb->sb, f, u64s);
- f->type = cpu_to_le32(type);
- return f;
-}
-
-struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type);
- ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- ssize_t d = -old_u64s + u64s;
- struct bch_dev *ca;
- unsigned i;
-
- lockdep_assert_held(&c->sb_lock);
+ if (sb->fs_sb) {
+ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
+ struct bch_dev *ca;
+ unsigned i;
- if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
- return NULL;
+ lockdep_assert_held(&c->sb_lock);
- /* XXX: we're not checking that offline device have enough space */
+	/* XXX: we're not checking that offline devices have enough space */
- for_each_online_member(ca, c, i) {
- struct bch_sb_handle *sb = &ca->disk_sb;
+ for_each_online_member(ca, c, i) {
+ struct bch_sb_handle *sb = &ca->disk_sb;
- if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->ref);
- return NULL;
+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+ percpu_ref_put(&ca->ref);
+ return NULL;
+ }
}
}
- f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
+ f = __bch2_sb_field_resize(sb, f, u64s);
f->type = cpu_to_le32(type);
return f;
}
@@ -384,7 +323,7 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
static void bch2_sb_update(struct bch_fs *c)
{
- struct bch_sb *src = c->disk_sb;
+ struct bch_sb *src = c->disk_sb.sb;
struct bch_sb_field_members *mi = bch2_sb_get_members(src);
struct bch_dev *ca;
unsigned i;
@@ -407,9 +346,10 @@ static void bch2_sb_update(struct bch_fs *c)
}
/* doesn't copy member info */
-static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
+static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
{
struct bch_sb_field *src_f, *dst_f;
+ struct bch_sb *dst = dst_handle->sb;
dst->version = src->version;
dst->seq = src->seq;
@@ -433,8 +373,8 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
continue;
dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
- dst_f = __bch2_sb_field_resize(dst, dst_f,
- le32_to_cpu(src_f->u64s));
+ dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
+ le32_to_cpu(src_f->u64s));
memcpy(dst_f, src_f, vstruct_bytes(src_f));
}
@@ -451,11 +391,12 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
lockdep_assert_held(&c->sb_lock);
- ret = bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s);
+ ret = bch2_sb_realloc(&c->disk_sb,
+ le32_to_cpu(src->u64s) - journal_u64s);
if (ret)
return ret;
- __copy_super(c->disk_sb, src);
+ __copy_super(&c->disk_sb, src);
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret)
@@ -471,7 +412,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
+ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_get_journal(dst);
unsigned journal_u64s = journal_buckets
@@ -484,7 +425,7 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
if (ret)
return ret;
- __copy_super(dst, src);
+ __copy_super(&ca->disk_sb, src);
return 0;
}
@@ -494,7 +435,6 @@ static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
{
struct bch_csum csum;
size_t bytes;
- unsigned order;
reread:
bio_reset(sb->bio);
bio_set_dev(sb->bio, sb->bdev);
@@ -518,9 +458,8 @@ reread:
if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
return "Bad superblock: too big";
- order = get_order(bytes);
- if (order > sb->page_order) {
- if (__bch2_super_realloc(sb, order))
+ if (get_order(bytes) > sb->page_order) {
+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
return "cannot allocate memory";
goto reread;
}
@@ -550,7 +489,8 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
pr_verbose_init(*opts, "");
memset(sb, 0, sizeof(*sb));
- sb->mode = FMODE_READ;
+ sb->mode = FMODE_READ;
+ sb->have_bio = true;
if (!opt_get(*opts, noexcl))
sb->mode |= FMODE_EXCL;
@@ -575,7 +515,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
}
err = "cannot allocate memory";
- ret = __bch2_super_realloc(sb, 0);
+ ret = bch2_sb_realloc(sb, 0);
if (ret)
goto err;
@@ -644,6 +584,7 @@ got_super:
bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
|= BDI_CAP_STABLE_WRITES;
ret = 0;
+ sb->have_layout = true;
out:
pr_verbose_init(*opts, "ret %i", ret);
return ret;
@@ -711,7 +652,7 @@ void bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
- le64_add_cpu(&c->disk_sb->seq, 1);
+ le64_add_cpu(&c->disk_sb.sb->seq, 1);
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
@@ -837,6 +778,10 @@ err:
return err;
}
+static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+ .validate = bch2_sb_validate_journal,
+};
+
/* BCH_SB_FIELD_members: */
static const char *bch2_sb_validate_members(struct bch_sb *sb,
@@ -880,6 +825,10 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb,
return NULL;
}
+static const struct bch_sb_field_ops bch_sb_field_ops_members = {
+ .validate = bch2_sb_validate_members,
+};
+
/* BCH_SB_FIELD_crypt: */
static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
@@ -896,980 +845,42 @@ static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
return NULL;
}
-/* BCH_SB_FIELD_replicas: */
-
-/* Replicas tracking - in memory: */
-
-#define for_each_cpu_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
- _i = (void *) (_i) + (_r)->entry_size)
-
-static inline struct bch_replicas_cpu_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
- return (void *) r->entries + r->entry_size * i;
-}
-
-static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
-{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
-}
-
-static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
- unsigned dev)
-{
- return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
-}
-
-static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
- unsigned dev)
-{
- e->devs[dev >> 3] |= 1 << (dev & 7);
-}
-
-static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
-{
- return (r->entry_size -
- offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
-}
-
-int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
- char *buf, size_t size)
-{
- char *out = buf, *end = out + size;
- struct bch_replicas_cpu_entry *e;
- bool first = true;
- unsigned i;
-
- for_each_cpu_replicas_entry(r, e) {
- bool first_e = true;
-
- if (!first)
- out += scnprintf(out, end - out, " ");
- first = false;
-
- out += scnprintf(out, end - out, "%u: [", e->data_type);
-
- for (i = 0; i < replicas_dev_slots(r); i++)
- if (replicas_test_dev(e, i)) {
- if (!first_e)
- out += scnprintf(out, end - out, " ");
- first_e = false;
- out += scnprintf(out, end - out, "%u", i);
- }
- out += scnprintf(out, end - out, "]");
- }
-
- return out - buf;
-}
-
-static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
- enum bch_data_type data_type,
- struct bch_replicas_cpu_entry *r,
- unsigned *max_dev)
-{
- const struct bch_extent_ptr *ptr;
- unsigned nr = 0;
-
- BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
- data_type >= BCH_DATA_NR);
-
- memset(r, 0, sizeof(*r));
- r->data_type = data_type;
-
- *max_dev = 0;
-
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached) {
- *max_dev = max_t(unsigned, *max_dev, ptr->dev);
- replicas_set_dev(r, ptr->dev);
- nr++;
- }
- return nr;
-}
-
-static inline void devlist_to_replicas(struct bch_devs_list devs,
- enum bch_data_type data_type,
- struct bch_replicas_cpu_entry *r,
- unsigned *max_dev)
-{
- unsigned i;
-
- BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
- data_type >= BCH_DATA_NR);
-
- memset(r, 0, sizeof(*r));
- r->data_type = data_type;
-
- *max_dev = 0;
-
- for (i = 0; i < devs.nr; i++) {
- *max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
- replicas_set_dev(r, devs.devs[i]);
- }
-}
-
-static struct bch_replicas_cpu *
-cpu_replicas_add_entry(struct bch_replicas_cpu *old,
- struct bch_replicas_cpu_entry new_entry,
- unsigned max_dev)
-{
- struct bch_replicas_cpu *new;
- unsigned i, nr, entry_size;
-
- entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
- DIV_ROUND_UP(max_dev + 1, 8);
- entry_size = max(entry_size, old->entry_size);
- nr = old->nr + 1;
-
- new = kzalloc(sizeof(struct bch_replicas_cpu) +
- nr * entry_size, GFP_NOIO);
- if (!new)
- return NULL;
-
- new->nr = nr;
- new->entry_size = entry_size;
-
- for (i = 0; i < old->nr; i++)
- memcpy(cpu_replicas_entry(new, i),
- cpu_replicas_entry(old, i),
- min(new->entry_size, old->entry_size));
-
- memcpy(cpu_replicas_entry(new, old->nr),
- &new_entry,
- new->entry_size);
-
- bch2_cpu_replicas_sort(new);
- return new;
-}
-
-static bool replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_cpu_entry search,
- unsigned max_dev)
-{
- return max_dev < replicas_dev_slots(r) &&
- eytzinger0_find(r->entries, r->nr,
- r->entry_size,
- memcmp, &search) < r->nr;
-}
-
-noinline
-static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_cpu_entry new_entry,
- unsigned max_dev)
-{
- struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
- int ret = -ENOMEM;
-
- mutex_lock(&c->sb_lock);
-
- old_gc = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
- if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
- new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
- if (!new_gc)
- goto err;
- }
-
- old_r = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
- if (!replicas_has_entry(old_r, new_entry, max_dev)) {
- new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
- if (!new_r)
- goto err;
-
- ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
- if (ret)
- goto err;
- }
-
- /* allocations done, now commit: */
-
- if (new_r)
- bch2_write_super(c);
-
- /* don't update in memory replicas until changes are persistent */
-
- if (new_gc) {
- rcu_assign_pointer(c->replicas_gc, new_gc);
- kfree_rcu(old_gc, rcu);
- }
-
- if (new_r) {
- rcu_assign_pointer(c->replicas, new_r);
- kfree_rcu(old_r, rcu);
- }
-
- mutex_unlock(&c->sb_lock);
- return 0;
-err:
- mutex_unlock(&c->sb_lock);
- if (new_gc)
- kfree(new_gc);
- if (new_r)
- kfree(new_r);
- return ret;
-}
-
-int bch2_mark_replicas(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
-{
- struct bch_replicas_cpu_entry search;
- struct bch_replicas_cpu *r, *gc_r;
- unsigned max_dev;
- bool marked;
-
- if (!devs.nr)
- return 0;
-
- BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
-
- devlist_to_replicas(devs, data_type, &search, &max_dev);
-
- rcu_read_lock();
- r = rcu_dereference(c->replicas);
- gc_r = rcu_dereference(c->replicas_gc);
- marked = replicas_has_entry(r, search, max_dev) &&
- (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
- rcu_read_unlock();
-
- return likely(marked) ? 0
- : bch2_mark_replicas_slowpath(c, search, max_dev);
-}
-
-int bch2_mark_bkey_replicas(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bkey_s_c k)
-{
- struct bch_devs_list cached = bch2_bkey_cached_devs(k);
- unsigned i;
- int ret;
-
- for (i = 0; i < cached.nr; i++)
- if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
- bch2_dev_list_single(cached.devs[i]))))
- return ret;
-
- return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
-}
-
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
-{
- struct bch_replicas_cpu *new_r, *old_r;
- int ret = 0;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
-
- new_r = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
-
- if (err) {
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(new_r, rcu);
- goto err;
- }
-
- if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
- ret = -ENOSPC;
- goto err;
- }
-
- old_r = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
-
- rcu_assign_pointer(c->replicas, new_r);
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(old_r, rcu);
-
- bch2_write_super(c);
-err:
- mutex_unlock(&c->sb_lock);
- return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
- struct bch_replicas_cpu *dst, *src;
- struct bch_replicas_cpu_entry *e;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- BUG_ON(c->replicas_gc);
-
- src = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
-
- dst = kzalloc(sizeof(struct bch_replicas_cpu) +
- src->nr * src->entry_size, GFP_NOIO);
- if (!dst) {
- mutex_unlock(&c->sb_lock);
- return -ENOMEM;
- }
-
- dst->nr = 0;
- dst->entry_size = src->entry_size;
-
- for_each_cpu_replicas_entry(src, e)
- if (!((1 << e->data_type) & typemask))
- memcpy(cpu_replicas_entry(dst, dst->nr++),
- e, dst->entry_size);
-
- bch2_cpu_replicas_sort(dst);
-
- rcu_assign_pointer(c->replicas_gc, dst);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-/* Replicas tracking - superblock: */
-
-static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
- unsigned *nr,
- unsigned *bytes,
- unsigned *max_dev)
-{
- struct bch_replicas_entry *i;
- unsigned j;
-
- *nr = 0;
- *bytes = sizeof(*r);
- *max_dev = 0;
-
- if (!r)
- return;
-
- for_each_replicas_entry(r, i) {
- for (j = 0; j < i->nr; j++)
- *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
- (*nr)++;
- }
-
- *bytes = (void *) i - (void *) r;
-}
-
-static struct bch_replicas_cpu *
-__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
-{
- struct bch_replicas_cpu *cpu_r;
- unsigned i, nr, bytes, max_dev, entry_size;
-
- bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
-
- entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
- DIV_ROUND_UP(max_dev + 1, 8);
-
- cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
- nr * entry_size, GFP_NOIO);
- if (!cpu_r)
- return NULL;
-
- cpu_r->nr = nr;
- cpu_r->entry_size = entry_size;
-
- if (nr) {
- struct bch_replicas_cpu_entry *dst =
- cpu_replicas_entry(cpu_r, 0);
- struct bch_replicas_entry *src = sb_r->entries;
-
- while (dst < cpu_replicas_entry(cpu_r, nr)) {
- dst->data_type = src->data_type;
- for (i = 0; i < src->nr; i++)
- replicas_set_dev(dst, src->devs[i]);
-
- src = replicas_entry_next(src);
- dst = (void *) dst + entry_size;
- }
- }
-
- bch2_cpu_replicas_sort(cpu_r);
- return cpu_r;
-}
-
-static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
-{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_cpu *cpu_r, *old_r;
-
- sb_r = bch2_sb_get_replicas(c->disk_sb);
- cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
- if (!cpu_r)
- return -ENOMEM;
-
- old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->replicas, cpu_r);
- if (old_r)
- kfree_rcu(old_r, rcu);
-
- return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *sb_e;
- struct bch_replicas_cpu_entry *e;
- size_t i, bytes;
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for_each_cpu_replicas_entry(r, e) {
- bytes += sizeof(struct bch_replicas_entry);
- for (i = 0; i < r->entry_size - 1; i++)
- bytes += hweight8(e->devs[i]);
- }
-
- sb_r = bch2_fs_sb_resize_replicas(c,
- DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
- if (!sb_r)
- return -ENOSPC;
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- sb_e = sb_r->entries;
- for_each_cpu_replicas_entry(r, e) {
- sb_e->data_type = e->data_type;
-
- for (i = 0; i < replicas_dev_slots(r); i++)
- if (replicas_test_dev(e, i))
- sb_e->devs[sb_e->nr++] = i;
-
- sb_e = replicas_entry_next(sb_e);
-
- BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
- }
-
- return 0;
-}
-
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- struct bch_replicas_cpu *cpu_r = NULL;
- struct bch_replicas_entry *e;
- const char *err;
- unsigned i;
-
- for_each_replicas_entry(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
-
- err = "invalid replicas entry: no devices";
- if (!e->nr)
- goto err;
-
- err = "invalid replicas entry: too many devices";
- if (e->nr >= BCH_REPLICAS_MAX)
- goto err;
-
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
- }
-
- err = "cannot allocate memory";
- cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
- if (!cpu_r)
- goto err;
-
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- memcmp, NULL);
-
- for (i = 0; i + 1 < cpu_r->nr; i++) {
- struct bch_replicas_cpu_entry *l =
- cpu_replicas_entry(cpu_r, i);
- struct bch_replicas_cpu_entry *r =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
-
- err = "duplicate replicas entry";
- if (!memcmp(l, r, cpu_r->entry_size))
- goto err;
- }
-
- err = NULL;
-err:
- kfree(cpu_r);
- return err;
-}
-
-int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
-{
- char *out = buf, *end = out + size;
- struct bch_replicas_entry *e;
- bool first = true;
- unsigned i;
-
- if (!r) {
- out += scnprintf(out, end - out, "(no replicas section found)");
- return out - buf;
- }
-
- for_each_replicas_entry(r, e) {
- if (!first)
- out += scnprintf(out, end - out, " ");
- first = false;
-
- out += scnprintf(out, end - out, "%u: [", e->data_type);
-
- for (i = 0; i < e->nr; i++)
- out += scnprintf(out, end - out,
- i ? " %u" : "%u", e->devs[i]);
- out += scnprintf(out, end - out, "]");
- }
-
- return out - buf;
-}
-
-/* Query replicas: */
-
-bool bch2_replicas_marked(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
-{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
- bool ret;
-
- if (!devs.nr)
- return true;
-
- devlist_to_replicas(devs, data_type, &search, &max_dev);
-
- rcu_read_lock();
- ret = replicas_has_entry(rcu_dereference(c->replicas),
- search, max_dev);
- rcu_read_unlock();
-
- return ret;
-}
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bkey_s_c k)
-{
- struct bch_devs_list cached = bch2_bkey_cached_devs(k);
- unsigned i;
-
- for (i = 0; i < cached.nr; i++)
- if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
- bch2_dev_list_single(cached.devs[i])))
- return false;
-
- return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
-}
-
-struct replicas_status __bch2_replicas_status(struct bch_fs *c,
- struct bch_devs_mask online_devs)
-{
- struct bch_sb_field_members *mi;
- struct bch_replicas_cpu_entry *e;
- struct bch_replicas_cpu *r;
- unsigned i, dev, dev_slots, nr_online, nr_offline;
- struct replicas_status ret;
-
- memset(&ret, 0, sizeof(ret));
-
- for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
- ret.replicas[i].nr_online = UINT_MAX;
-
- mi = bch2_sb_get_members(c->disk_sb);
- rcu_read_lock();
-
- r = rcu_dereference(c->replicas);
- dev_slots = replicas_dev_slots(r);
-
- for_each_cpu_replicas_entry(r, e) {
- if (e->data_type >= ARRAY_SIZE(ret.replicas))
- panic("e %p data_type %u\n", e, e->data_type);
-
- nr_online = nr_offline = 0;
-
- for (dev = 0; dev < dev_slots; dev++) {
- if (!replicas_test_dev(e, dev))
- continue;
-
- BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
-
- if (test_bit(dev, online_devs.d))
- nr_online++;
- else
- nr_offline++;
- }
-
- ret.replicas[e->data_type].nr_online =
- min(ret.replicas[e->data_type].nr_online,
- nr_online);
-
- ret.replicas[e->data_type].nr_offline =
- max(ret.replicas[e->data_type].nr_offline,
- nr_offline);
- }
-
- rcu_read_unlock();
-
- return ret;
-}
-
-struct replicas_status bch2_replicas_status(struct bch_fs *c)
-{
- return __bch2_replicas_status(c, bch2_online_devs(c));
-}
-
-static bool have_enough_devs(struct replicas_status s,
- enum bch_data_type type,
- bool force_if_degraded,
- bool force_if_lost)
-{
- return (!s.replicas[type].nr_offline || force_if_degraded) &&
- (s.replicas[type].nr_online || force_if_lost);
-}
-
-bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
-{
- return (have_enough_devs(s, BCH_DATA_JOURNAL,
- flags & BCH_FORCE_IF_METADATA_DEGRADED,
- flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_BTREE,
- flags & BCH_FORCE_IF_METADATA_DEGRADED,
- flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_USER,
- flags & BCH_FORCE_IF_DATA_DEGRADED,
- flags & BCH_FORCE_IF_DATA_LOST));
-}
-
-unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
-{
- struct replicas_status s = bch2_replicas_status(c);
-
- return meta
- ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
- s.replicas[BCH_DATA_BTREE].nr_online)
- : s.replicas[BCH_DATA_USER].nr_online;
-}
-
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bch_replicas_cpu_entry *e;
- struct bch_replicas_cpu *r;
- unsigned ret = 0;
-
- rcu_read_lock();
- r = rcu_dereference(c->replicas);
-
- if (ca->dev_idx >= replicas_dev_slots(r))
- goto out;
-
- for_each_cpu_replicas_entry(r, e)
- if (replicas_test_dev(e, ca->dev_idx))
- ret |= 1 << e->data_type;
-out:
- rcu_read_unlock();
-
- return ret;
-}
+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+ .validate = bch2_sb_validate_crypt,
+};
-/* Quotas: */
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr) \
+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+ BCH_SB_FIELDS()
+#undef x
+};
-static const char *bch2_sb_validate_quota(struct bch_sb *sb,
+static const char *bch2_sb_field_validate(struct bch_sb *sb,
struct bch_sb_field *f)
{
- struct bch_sb_field_quota *q = field_to_type(f, quota);
-
- if (vstruct_bytes(&q->field) != sizeof(*q))
- return "invalid field quota: wrong size";
-
- return NULL;
-}
-
-/* Disk groups: */
-
-static int strcmp_void(const void *l, const void *r)
-{
- return strcmp(l, r);
-}
-
-static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_disk_groups *groups =
- field_to_type(f, disk_groups);
- struct bch_disk_group *g;
- struct bch_sb_field_members *mi;
- struct bch_member *m;
- unsigned i, nr_groups, nr_live = 0, len;
- char **labels, *l;
- const char *err = NULL;
-
- mi = bch2_sb_get_members(sb);
- groups = bch2_sb_get_disk_groups(sb);
- nr_groups = disk_groups_nr(groups);
-
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++) {
- unsigned g;
-
- if (!BCH_MEMBER_GROUP(m))
- continue;
-
- g = BCH_MEMBER_GROUP(m) - 1;
-
- if (g >= nr_groups ||
- BCH_GROUP_DELETED(&groups->entries[g]))
- return "disk has invalid group";
- }
-
- if (!nr_groups)
- return NULL;
-
- labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL);
- if (!labels)
- return "cannot allocate memory";
-
- for (g = groups->entries;
- g < groups->entries + nr_groups;
- g++) {
- if (BCH_GROUP_DELETED(g))
- continue;
-
- len = strnlen(g->label, sizeof(g->label));
-
- labels[nr_live++] = l = kmalloc(len + 1, GFP_KERNEL);
- if (!l) {
- err = "cannot allocate memory";
- goto err;
- }
-
- memcpy(l, g->label, len);
- l[len] = '\0';
- }
-
- sort(labels, nr_live, sizeof(labels[0]), strcmp_void, NULL);
-
- for (i = 0; i + 1 < nr_live; i++)
- if (!strcmp(labels[i], labels[i + 1])) {
- err = "duplicate group labels";
- goto err;
- }
-
- err = NULL;
-err:
- for (i = 0; i < nr_live; i++)
- kfree(labels[i]);
- kfree(labels);
- return err;
-}
-
-static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_members *mi;
- struct bch_sb_field_disk_groups *groups;
- struct bch_disk_groups_cpu *cpu_g, *old_g;
- unsigned i, nr_groups;
-
- lockdep_assert_held(&c->sb_lock);
-
- mi = bch2_sb_get_members(c->disk_sb);
- groups = bch2_sb_get_disk_groups(c->disk_sb);
- nr_groups = disk_groups_nr(groups);
-
- if (!groups)
- return 0;
-
- cpu_g = kzalloc(sizeof(*cpu_g) +
- sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
- if (!cpu_g)
- return -ENOMEM;
-
- cpu_g->nr = nr_groups;
-
- for (i = 0; i < nr_groups; i++) {
- struct bch_disk_group *src = &groups->entries[i];
- struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
-
- dst->deleted = BCH_GROUP_DELETED(src);
- }
-
- for (i = 0; i < c->disk_sb->nr_devices; i++) {
- struct bch_member *m = mi->members + i;
- struct bch_disk_group_cpu *dst =
- &cpu_g->entries[BCH_MEMBER_GROUP(m)];
-
- if (!bch2_member_exists(m))
- continue;
-
- dst = BCH_MEMBER_GROUP(m)
- ? &cpu_g->entries[BCH_MEMBER_GROUP(m) - 1]
- : NULL;
- if (dst)
- __set_bit(i, dst->devs.d);
- }
-
- old_g = c->disk_groups;
- rcu_assign_pointer(c->disk_groups, cpu_g);
- if (old_g)
- kfree_rcu(old_g, rcu);
-
- return 0;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
-{
- struct target t = target_decode(target);
-
- switch (t.type) {
- case TARGET_DEV: {
- struct bch_dev *ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
- return ca ? &ca->self : NULL;
- }
- case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
- return t.group < g->nr && !g->entries[t.group].deleted
- ? &g->entries[t.group].devs
- : NULL;
- }
- default:
- BUG();
- }
-}
-
-int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
- const char *name)
-{
- unsigned i, nr_groups = disk_groups_nr(groups);
- unsigned len = strlen(name);
-
- for (i = 0; i < nr_groups; i++) {
- struct bch_disk_group *g = groups->entries + i;
-
- if (BCH_GROUP_DELETED(g))
- continue;
-
- if (strnlen(g->label, sizeof(g->label)) == len &&
- !memcmp(name, g->label, len))
- return i;
- }
-
- return -1;
-}
-
-static int bch2_disk_group_find(struct bch_fs *c, const char *name)
-{
- int ret;
-
- mutex_lock(&c->sb_lock);
- ret = __bch2_disk_group_find(bch2_sb_get_disk_groups(c->disk_sb), name);
- mutex_unlock(&c->sb_lock);
+ unsigned type = le32_to_cpu(f->type);
- return ret;
+ return type < BCH_SB_FIELD_NR
+ ? bch2_sb_field_ops[type]->validate(sb, f)
+ : NULL;
}
-int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+size_t bch2_sb_field_to_text(char *buf, size_t size,
+ struct bch_sb *sb, struct bch_sb_field *f)
{
- struct bch_dev *ca;
- int g;
-
- if (!strlen(buf) || !strcmp(buf, "none")) {
- *v = 0;
- return 0;
- }
-
- /* Is it a device? */
- ca = bch2_dev_lookup(c, buf);
- if (!IS_ERR(ca)) {
- *v = dev_to_target(ca->dev_idx);
- percpu_ref_put(&ca->ref);
- return 0;
- }
+ unsigned type = le32_to_cpu(f->type);
+ size_t (*to_text)(char *, size_t, struct bch_sb *,
+ struct bch_sb_field *) =
+ type < BCH_SB_FIELD_NR
+ ? bch2_sb_field_ops[type]->to_text
+ : NULL;
- g = bch2_disk_group_find(c, buf);
- if (g >= 0) {
- *v = group_to_target(g);
+ if (!to_text) {
+ if (size)
+ buf[0] = '\0';
return 0;
}
- return -EINVAL;
-}
-
-int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
-{
- struct target t = target_decode(v);
- int ret;
-
- switch (t.type) {
- case TARGET_NULL:
- return scnprintf(buf, len, "none");
- case TARGET_DEV: {
- struct bch_dev *ca;
-
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
- char b[BDEVNAME_SIZE];
-
- ret = scnprintf(buf, len, "/dev/%s",
- bdevname(ca->disk_sb.bdev, b));
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
- ret = scnprintf(buf, len, "offline device %u", t.dev);
- } else {
- ret = scnprintf(buf, len, "invalid device %u", t.dev);
- }
-
- rcu_read_unlock();
- break;
- }
- case TARGET_GROUP: {
- struct bch_sb_field_disk_groups *groups;
- struct bch_disk_group *g;
-
- mutex_lock(&c->sb_lock);
- groups = bch2_sb_get_disk_groups(c->disk_sb);
-
- g = t.group < disk_groups_nr(groups)
- ? groups->entries + t.group
- : NULL;
-
- if (g && !BCH_GROUP_DELETED(g)) {
- ret = len ? min(len - 1, strnlen(g->label, sizeof(g->label))) : 0;
-
- memcpy(buf, g->label, ret);
- if (len)
- buf[ret] = '\0';
- } else {
- ret = scnprintf(buf, len, "invalid group %u", t.group);
- }
-
- mutex_unlock(&c->sb_lock);
- break;
- }
- default:
- BUG();
- }
-
- return ret;
+ return to_text(buf, size, sb, f);
}
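
With validation now dispatched through the bch_sb_field_ops table built by the BCH_SB_FIELDS() x-macro, wiring up a new optional section reduces to defining an ops struct that follows the bch_sb_field_ops_<name> naming convention. A sketch under that assumption — the `example` field type below is purely hypothetical:

/* Illustrative only: a hypothetical `example` section. Once `example`
 * is added to BCH_SB_FIELDS(), the x-macro table in super-io.c picks
 * these ops up by name automatically: */
static const char *bch2_sb_validate_example(struct bch_sb *sb,
					    struct bch_sb_field *f)
{
	/* reject a field too small to carry its own header: */
	if (vstruct_bytes(f) < sizeof(struct bch_sb_field))
		return "invalid field example: wrong size";

	return NULL;
}

static const struct bch_sb_field_ops bch_sb_field_ops_example = {
	.validate	= bch2_sb_validate_example,
};
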
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 2514ac8a..f407c205 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -11,8 +11,6 @@
struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
enum bch_sb_field_type, unsigned);
-struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *,
- enum bch_sb_field_type, unsigned);
#define field_to_type(_f, _name) \
container_of_or_null(_f, struct bch_sb_field_##_name, field)
@@ -30,13 +28,6 @@ bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \
{ \
return field_to_type(bch2_sb_field_resize(sb, \
BCH_SB_FIELD_##_name, u64s), _name); \
-} \
- \
-static inline struct bch_sb_field_##_name * \
-bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \
-{ \
- return field_to_type(bch2_fs_sb_field_resize(c, \
- BCH_SB_FIELD_##_name, u64s), _name); \
}
BCH_SB_FIELDS()
@@ -44,6 +35,12 @@ BCH_SB_FIELDS()
extern const char * const bch2_sb_fields[];
+struct bch_sb_field_ops {
+ const char * (*validate)(struct bch_sb *, struct bch_sb_field *);
+ size_t (*to_text)(char *, size_t, struct bch_sb *,
+ struct bch_sb_field *);
+};
+
static inline bool bch2_sb_test_feature(struct bch_sb *sb,
enum bch_sb_features f)
{
@@ -90,7 +87,7 @@ int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
-int bch2_super_realloc(struct bch_sb_handle *, unsigned);
+int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
const char *bch2_sb_validate(struct bch_sb_handle *);
@@ -139,135 +136,4 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
};
}
-/* BCH_SB_FIELD_replicas: */
-
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
- struct bch_devs_list);
-bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
- struct bkey_s_c);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
- struct bch_devs_list);
-int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
- struct bkey_s_c);
-
-int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
-int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
-
-struct replicas_status {
- struct {
- unsigned nr_online;
- unsigned nr_offline;
- } replicas[BCH_DATA_NR];
-};
-
-struct replicas_status __bch2_replicas_status(struct bch_fs *,
- struct bch_devs_mask);
-struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct replicas_status, unsigned);
-
-unsigned bch2_replicas_online(struct bch_fs *, bool);
-unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
-
-int bch2_replicas_gc_end(struct bch_fs *, int);
-int bch2_replicas_gc_start(struct bch_fs *, unsigned);
-
-/* iterate over superblock replicas - used by userspace tools: */
-
-static inline struct bch_replicas_entry *
-replicas_entry_next(struct bch_replicas_entry *i)
-{
- return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
-}
-
-#define for_each_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
-/* disk groups: */
-
-static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
-{
- return groups
- ? (vstruct_end(&groups->field) -
- (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
- : 0;
-}
-
-struct target {
- enum {
- TARGET_NULL,
- TARGET_DEV,
- TARGET_GROUP,
- } type;
- union {
- unsigned dev;
- unsigned group;
- };
-};
-
-#define TARGET_DEV_START 1
-#define TARGET_GROUP_START (256 + TARGET_DEV_START)
-
-static inline u16 dev_to_target(unsigned dev)
-{
- return TARGET_DEV_START + dev;
-}
-
-static inline u16 group_to_target(unsigned group)
-{
- return TARGET_GROUP_START + group;
-}
-
-static inline struct target target_decode(unsigned target)
-{
- if (target >= TARGET_GROUP_START)
- return (struct target) {
- .type = TARGET_GROUP,
- .group = target - TARGET_GROUP_START
- };
-
- if (target >= TARGET_DEV_START)
- return (struct target) {
- .type = TARGET_DEV,
- .group = target - TARGET_DEV_START
- };
-
- return (struct target) { .type = TARGET_NULL };
-}
-
-static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
-{
- struct target t = target_decode(target);
-
- switch (t.type) {
- case TARGET_NULL:
- return false;
- case TARGET_DEV:
- return ca->dev_idx == t.dev;
- case TARGET_GROUP:
- return ca->mi.group && ca->mi.group - 1 == t.group;
- default:
- BUG();
- }
-}
-
-static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
- bool ret;
-
- rcu_read_lock();
- ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
- rcu_read_unlock();
-
- return ret;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
-
-int __bch2_disk_group_find(struct bch_sb_field_disk_groups *, const char *);
-
-int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
-int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
-
#endif /* _BCACHEFS_SUPER_IO_H */
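
The target helpers deleted above move to disk_groups.h unchanged; the encoding packs devices and groups into a single u16 namespace (0 = none, devices from TARGET_DEV_START == 1, groups from TARGET_GROUP_START == 257). A short worked sketch of the round trip, with values that follow arithmetically from those constants:

/* Worked example of the target encoding; the values are direct
 * consequences of TARGET_DEV_START == 1 and TARGET_GROUP_START == 257: */
u16 t_dev   = dev_to_target(3);		/* == 4 */
u16 t_group = group_to_target(0);	/* == 257 */

struct target t = target_decode(t_group);
/* t.type == TARGET_GROUP, t.group == 0 */
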
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 77670ea6..05910c40 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -18,6 +18,7 @@
#include "clock.h"
#include "compress.h"
#include "debug.h"
+#include "disk_groups.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
@@ -30,6 +31,7 @@
#include "migrate.h"
#include "movinggc.h"
#include "quota.h"
+#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
@@ -122,7 +124,7 @@ static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid)
lockdep_assert_held(&bch_fs_list_lock);
list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
+ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le)))
return c;
return NULL;
@@ -203,23 +205,12 @@ static void bch_fs_mark_clean(struct bch_fs *c)
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb, true);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
}
-static bool btree_interior_updates_done(struct bch_fs *c)
-{
- bool ret;
-
- mutex_lock(&c->btree_interior_update_lock);
- ret = list_empty(&c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- return ret;
-}
-
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -251,7 +242,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* fully complete:
*/
closure_wait_event(&c->btree_interior_update_wait,
- btree_interior_updates_done(c));
+ !bch2_btree_interior_updates_nr_pending(c));
if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_btree_verify_flushed(c);
@@ -433,7 +424,8 @@ static void bch2_fs_free(struct bch_fs *c)
if (c->wq)
destroy_workqueue(c->wq);
- free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
+ free_pages((unsigned long) c->disk_sb.sb,
+ c->disk_sb.page_order);
kvpfree(c, sizeof(*c));
module_put(THIS_MODULE);
}
@@ -501,11 +493,54 @@ void bch2_fs_stop(struct bch_fs *c)
kobject_put(&c->kobj);
}
+static const char *bch2_fs_online(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ const char *err = NULL;
+ unsigned i;
+ int ret;
+
+ lockdep_assert_held(&bch_fs_list_lock);
+
+ if (!list_empty(&c->list))
+ return NULL;
+
+ if (__bch2_uuid_to_fs(c->sb.uuid))
+ return "filesystem UUID already open";
+
+ ret = bch2_fs_chardev_init(c);
+ if (ret)
+ return "error creating character device";
+
+ bch2_fs_debug_init(c);
+
+ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
+ kobject_add(&c->internal, &c->kobj, "internal") ||
+ kobject_add(&c->opts_dir, &c->kobj, "options") ||
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
+ bch2_opts_create_sysfs_files(&c->opts_dir))
+ return "error creating sysfs objects";
+
+ mutex_lock(&c->state_lock);
+
+ err = "error creating sysfs objects";
+ __for_each_member_device(ca, c, i, NULL)
+ if (bch2_dev_sysfs_online(c, ca))
+ goto err;
+
+ list_add(&c->list, &bch_fs_list);
+ err = NULL;
+err:
+ mutex_unlock(&c->state_lock);
+ return err;
+}
+
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
unsigned i, iter_size;
+ const char *err;
pr_verbose_init(opts, "");
@@ -516,6 +551,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
__module_get(THIS_MODULE);
c->minor = -1;
+ c->disk_sb.fs_sb = true;
mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
@@ -627,9 +663,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
- mi = bch2_sb_get_members(c->disk_sb);
+ mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb, mi, i) &&
+ if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
bch2_dev_alloc(c, i))
goto err;
@@ -644,6 +680,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
kobject_init(&c->internal, &bch2_fs_internal_ktype);
kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+
+ mutex_lock(&bch_fs_list_lock);
+ err = bch2_fs_online(c);
+ mutex_unlock(&bch_fs_list_lock);
+ if (err) {
+ bch_err(c, "bch2_fs_online() error: %s", err);
+ goto err;
+ }
out:
pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
return c;
@@ -653,60 +697,7 @@ err:
goto out;
}
-static const char *__bch2_fs_online(struct bch_fs *c)
-{
- struct bch_dev *ca;
- const char *err = NULL;
- unsigned i;
- int ret;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- if (!list_empty(&c->list))
- return NULL;
-
- if (__bch2_uuid_to_fs(c->sb.uuid))
- return "filesystem UUID already open";
-
- ret = bch2_fs_chardev_init(c);
- if (ret)
- return "error creating character device";
-
- bch2_fs_debug_init(c);
-
- if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
- kobject_add(&c->internal, &c->kobj, "internal") ||
- kobject_add(&c->opts_dir, &c->kobj, "options") ||
- kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
- bch2_opts_create_sysfs_files(&c->opts_dir))
- return "error creating sysfs objects";
-
- mutex_lock(&c->state_lock);
-
- err = "error creating sysfs objects";
- __for_each_member_device(ca, c, i, NULL)
- if (bch2_dev_sysfs_online(c, ca))
- goto err;
-
- list_add(&c->list, &bch_fs_list);
- err = NULL;
-err:
- mutex_unlock(&c->state_lock);
- return err;
-}
-
-static const char *bch2_fs_online(struct bch_fs *c)
-{
- const char *err;
-
- mutex_lock(&bch_fs_list_lock);
- err = __bch2_fs_online(c);
- mutex_unlock(&bch_fs_list_lock);
-
- return err;
-}
-
-static const char *__bch2_fs_start(struct bch_fs *c)
+const char *bch2_fs_start(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
@@ -730,15 +721,15 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- if (BCH_SB_INITIALIZED(c->disk_sb)) {
+ if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
ret = bch2_journal_read(c, &journal);
if (ret)
goto err;
j = &list_entry(journal.prev, struct journal_replay, list)->j;
- c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
- c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+ c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
+ c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) {
unsigned level;
@@ -824,21 +815,18 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch_notice(c, "initializing new filesystem");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
- set_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
err = "unable to allocate journal buckets";
- for_each_rw_member(ca, c, i)
- if (bch2_dev_journal_alloc(c, ca)) {
+ for_each_online_member(ca, c, i)
+ if (bch2_dev_journal_alloc(ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
- clear_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
-
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
@@ -889,18 +877,20 @@ recovery_done:
}
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb);
+ mi = bch2_sb_get_members(c->disk_sb.sb);
now = ktime_get_seconds();
for_each_member_device(ca, c, i)
mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
- SET_BCH_SB_INITIALIZED(c->disk_sb, true);
- SET_BCH_SB_CLEAN(c->disk_sb, false);
+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ set_bit(BCH_FS_STARTED, &c->flags);
+
err = NULL;
out:
mutex_unlock(&c->state_lock);
@@ -939,11 +929,6 @@ fsck_err:
goto out;
}
-const char *bch2_fs_start(struct bch_fs *c)
-{
- return __bch2_fs_start(c) ?: bch2_fs_online(c);
-}
-
static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
struct bch_sb_field_members *sb_mi;
@@ -956,7 +941,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
return "mismatched block size";
if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
+ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
return "new cache bucket size is too small";
return NULL;
@@ -1082,28 +1067,19 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
return 0;
}
-static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
+ struct bch_member *member)
{
- struct bch_member *member;
- struct bch_dev *ca = NULL;
- int ret = 0;
-
- pr_verbose_init(c->opts, "");
-
- if (bch2_fs_init_fault("dev_alloc"))
- goto err;
+ struct bch_dev *ca;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
- goto err;
+ return NULL;
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
- ca->dev_idx = dev_idx;
- __set_bit(ca->dev_idx, ca->self.d);
-
init_rwsem(&ca->bucket_lock);
writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
@@ -1113,14 +1089,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
- if (bch2_fs_init_fault("dev_alloc"))
- goto err;
-
- member = bch2_sb_get_members(c->disk_sb)->members + dev_idx;
-
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
@@ -1132,11 +1102,43 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
+ return ca;
+err:
+ bch2_dev_free(ca);
+ return NULL;
+}
+
+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
+ unsigned dev_idx)
+{
+ ca->dev_idx = dev_idx;
+ __set_bit(ca->dev_idx, ca->self.d);
+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
if (bch2_dev_sysfs_online(c, ca))
pr_warn("error creating sysfs objects");
+}
+
+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+{
+ struct bch_member *member =
+ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
+ struct bch_dev *ca = NULL;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
+
+ if (bch2_fs_init_fault("dev_alloc"))
+ goto err;
+
+ ca = __bch2_dev_alloc(c, member);
+ if (!ca)
+ goto err;
+
+ bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
@@ -1147,21 +1149,9 @@ err:
goto out;
}
-static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
{
- struct bch_dev *ca;
- int ret;
-
- lockdep_assert_held(&c->state_lock);
-
- if (le64_to_cpu(sb->sb->seq) >
- le64_to_cpu(c->disk_sb->seq))
- bch2_sb_to_fs(c, sb->sb);
-
- BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
- !c->devs[sb->sb->dev_idx]);
-
- ca = bch_dev_locked(c, sb->sb->dev_idx);
+ unsigned ret;
if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u",
@@ -1179,7 +1169,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
- bch_err(c, "device too small");
+ bch_err(ca, "device too small");
return -EINVAL;
}
@@ -1187,35 +1177,50 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
- /*
- * Increase journal write timeout if flushes to this device are
- * expensive:
- */
- if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
- journal_flushes_device(ca))
- c->journal.write_delay_ms =
- max(c->journal.write_delay_ms, 1000U);
-
/* Commit: */
ca->disk_sb = *sb;
if (sb->mode & FMODE_EXCL)
ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
+ if (ca->fs)
+ mutex_lock(&ca->fs->sb_lock);
+
+ bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+
+ if (ca->fs)
+ mutex_unlock(&ca->fs->sb_lock);
+
+ percpu_ref_reinit(&ca->io_ref);
+
+ return 0;
+}
+
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ lockdep_assert_held(&c->state_lock);
+
+ if (le64_to_cpu(sb->sb->seq) >
+ le64_to_cpu(c->disk_sb.sb->seq))
+ bch2_sb_to_fs(c, sb->sb);
+
+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
+ !c->devs[sb->sb->dev_idx]);
+
+ ca = bch_dev_locked(c, sb->sb->dev_idx);
+
+ ret = __bch2_dev_attach_bdev(ca, sb);
+ if (ret)
+ return ret;
+
if (c->sb.nr_devices == 1)
bdevname(ca->disk_sb.bdev, c->name);
bdevname(ca->disk_sb.bdev, ca->name);
- mutex_lock(&c->sb_lock);
- bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
- mutex_unlock(&c->sb_lock);
-
- if (ca->mi.state == BCH_MEMBER_STATE_RW)
- bch2_dev_allocator_add(c, ca);
-
rebalance_wakeup(c);
-
- percpu_ref_reinit(&ca->io_ref);
return 0;
}
@@ -1289,10 +1294,10 @@ static bool bch2_fs_may_start(struct bch_fs *c)
if (!c->opts.degraded) {
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb);
+ mi = bch2_sb_get_members(c->disk_sb.sb);
- for (i = 0; i < c->disk_sb->nr_devices; i++) {
- if (!bch2_dev_exists(c->disk_sb, mi, i))
+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
continue;
ca = bch_dev_locked(c, i);
@@ -1360,7 +1365,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch_notice(ca, "%s", bch2_dev_state[new_state]);
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb);
+ mi = bch2_sb_get_members(c->disk_sb.sb);
SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -1470,7 +1475,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* this device must be gone:
*/
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb);
+ mi = bch2_sb_get_members(c->disk_sb.sb);
memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
bch2_write_super(c);
@@ -1492,8 +1497,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_sb_handle sb;
const char *err;
struct bch_dev *ca = NULL;
- struct bch_sb_field_members *mi, *dev_mi;
- struct bch_member saved_mi;
+ struct bch_sb_field_members *mi;
+ struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
int ret;
@@ -1505,24 +1510,52 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (err)
return -EINVAL;
+ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+
err = bch2_dev_may_add(sb.sb, c);
if (err)
return -EINVAL;
+ ca = __bch2_dev_alloc(c, &dev_mi);
+ if (!ca) {
+ bch2_free_super(&sb);
+ return -ENOMEM;
+ }
+
+ ret = __bch2_dev_attach_bdev(ca, &sb);
+ if (ret) {
+ bch2_dev_free(ca);
+ return ret;
+ }
+
+ err = "journal alloc failed";
+ ret = bch2_dev_journal_alloc(ca);
+ if (ret)
+ goto err;
+
mutex_lock(&c->state_lock);
mutex_lock(&c->sb_lock);
- /* Grab member info for new disk: */
- dev_mi = bch2_sb_get_members(sb.sb);
- saved_mi = dev_mi->members[sb.sb->dev_idx];
- saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
+ err = "insufficient space in new superblock";
+ ret = bch2_sb_from_fs(c, ca);
+ if (ret)
+ goto err_unlock;
+
+ mi = bch2_sb_get_members(ca->disk_sb.sb);
+
+ if (!bch2_sb_resize_members(&ca->disk_sb,
+ le32_to_cpu(mi->field.u64s) +
+ sizeof(dev_mi) / sizeof(u64))) {
+ ret = -ENOSPC;
+ goto err_unlock;
+ }
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
- mi = bch2_sb_get_members(c->disk_sb);
+ mi = bch2_sb_get_members(c->disk_sb.sb);
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
- if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
+ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
goto have_slot;
no_slot:
err = "no slots available in superblock";
@@ -1533,64 +1566,47 @@ have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
u64s = (sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
- err = "no space in superblock for member info";
- dev_mi = bch2_sb_resize_members(&sb, u64s);
- if (!dev_mi)
- goto err_unlock;
+ err = "no space in superblock for member info";
+ ret = -ENOSPC;
- mi = bch2_fs_sb_resize_members(c, u64s);
+ mi = bch2_sb_resize_members(&c->disk_sb, u64s);
if (!mi)
goto err_unlock;
- memcpy(dev_mi, mi, u64s * sizeof(u64));
- dev_mi->members[dev_idx] = saved_mi;
+ /* success: */
- sb.sb->uuid = c->disk_sb->uuid;
- sb.sb->dev_idx = dev_idx;
- sb.sb->nr_devices = nr_devices;
+ mi->members[dev_idx] = dev_mi;
+ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds());
+ c->disk_sb.sb->nr_devices = nr_devices;
- /* commit new member info */
- memcpy(mi, dev_mi, u64s * sizeof(u64));
- c->disk_sb->nr_devices = nr_devices;
- c->sb.nr_devices = nr_devices;
+ ca->disk_sb.sb->dev_idx = dev_idx;
+ bch2_dev_attach(c, ca, dev_idx);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (bch2_dev_alloc(c, dev_idx)) {
- err = "cannot allocate memory";
- ret = -ENOMEM;
- goto err;
- }
-
- if (__bch2_dev_online(c, &sb)) {
- err = "bch2_dev_online() error";
- ret = -ENOMEM;
- goto err;
- }
-
- ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
- goto err;
-
- err = "journal alloc failed";
- if (bch2_dev_journal_alloc(c, ca))
- goto err;
+ goto err_late;
}
mutex_unlock(&c->state_lock);
return 0;
+
err_unlock:
mutex_unlock(&c->sb_lock);
-err:
mutex_unlock(&c->state_lock);
+err:
+ if (ca)
+ bch2_dev_free(ca);
bch2_free_super(&sb);
-
bch_err(c, "Unable to add device: %s", err);
- return ret ?: -EINVAL;
+ return ret;
+err_late:
+ bch_err(c, "Error going rw after adding device: %s", err);
+ return -EINVAL;
}
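
The rewritten bch2_dev_add unwinds through three layered labels: err_unlock drops the locks and falls through, err frees the not-yet-visible device and superblock, and err_late handles failures after the commit point, when the superblock has been written and the device is live, so nothing may be torn down. A runnable toy of that control-flow shape; every step and name here is illustrative, not the in-tree code:

	#include <stdlib.h>

	struct toy_dev { int dummy; };
	static void toy_lock(void)   {}
	static void toy_unlock(void) {}

	static int toy_dev_add(int fail_early, int fail_locked, int fail_late)
	{
		struct toy_dev *ca;
		int ret;

		ca = calloc(1, sizeof(*ca));		/* taken before any locks */
		if (!ca)
			return -1;

		ret = fail_early ? -1 : 0;		/* cf. journal alloc */
		if (ret)
			goto err;

		toy_lock();				/* cf. state_lock + sb_lock */
		ret = fail_locked ? -1 : 0;		/* cf. superblock resize */
		if (ret)
			goto err_unlock;
		/* commit point: superblock written, device now visible */
		toy_unlock();

		ret = fail_late ? -1 : 0;		/* cf. going read-write */
		if (ret)
			goto err_late;
		return 0;

	err_unlock:
		toy_unlock();		/* drop locks, then fall through */
	err:
		free(ca);		/* not visible yet, so safe to free */
		return ret;
	err_late:
		/* device is already live: report the error, do not tear down */
		return ret;
	}

The ordering matters: resources that were taken before the locks are released after them, and anything past the commit point is deliberately excluded from the teardown path.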
/* Hot add existing device to running filesystem: */
@@ -1613,12 +1629,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- err = bch2_dev_in_fs(c->disk_sb, sb.sb);
+ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
if (err)
goto err;
- if (__bch2_dev_online(c, &sb)) {
- err = "__bch2_dev_online() error";
+ if (bch2_dev_attach_bdev(c, &sb)) {
+ err = "bch2_dev_attach_bdev() error";
goto err;
}
@@ -1688,7 +1704,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
}
mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
mi->nbuckets = cpu_to_le64(nbuckets);
bch2_write_super(c);
@@ -1721,74 +1737,6 @@ found:
return ca;
}
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *label)
-{
- struct bch_sb_field_disk_groups *groups;
- struct bch_disk_group *g;
- struct bch_member *mi;
- unsigned i, v, nr_groups;
- int ret;
-
- if (strlen(label) > BCH_SB_LABEL_SIZE)
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
- groups = bch2_sb_get_disk_groups(c->disk_sb);
- nr_groups = disk_groups_nr(groups);
-
- if (!strcmp(label, "none")) {
- v = 0;
- goto write_sb;
- }
-
- ret = __bch2_disk_group_find(groups, label);
- if (ret >= 0) {
- v = ret + 1;
- goto write_sb;
- }
-
- /* not found - create a new disk group: */
-
- for (i = 0;
- i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
- i++)
- ;
-
- if (i == nr_groups) {
- unsigned u64s =
- (sizeof(struct bch_sb_field_disk_groups) +
- sizeof(struct bch_disk_group) * (nr_groups + 1)) /
- sizeof(u64);
-
- groups = bch2_fs_sb_resize_disk_groups(c, u64s);
- if (!groups) {
- mutex_unlock(&c->sb_lock);
- return -ENOSPC;
- }
-
- nr_groups = disk_groups_nr(groups);
- }
-
- BUG_ON(i >= nr_groups);
-
- g = &groups->entries[i];
- v = i + 1;
-
- memcpy(g->label, label, strlen(label));
- if (strlen(label) < sizeof(g->label))
- g->label[strlen(label)] = '\0';
- SET_BCH_GROUP_DELETED(g, 0);
- SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
-write_sb:
- mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
- SET_BCH_MEMBER_GROUP(mi, v);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
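
bch2_dev_group_set is deleted here; given the new disk_groups.c/disk_groups.h in the diffstat, it has presumably moved there rather than been dropped. The removed body still documents the encoding the member field uses: the stored value is the group index plus one, with zero meaning "no group" (v = 0 for "none", v = ret + 1 for a found group), and the sysfs label path later in this diff decodes it the same way (ca->mi.group - 1). A pair of hypothetical helpers making that encoding explicit; these names are not in-tree:

	/* hypothetical helpers for the 1-based member encoding above */
	static inline unsigned group_idx_to_field(int idx)
	{
		return idx < 0 ? 0 : (unsigned) idx + 1;	/* 0 == "none" */
	}

	static inline int group_field_to_idx(unsigned v)
	{
		return v ? (int) v - 1 : -1;			/* -1 == "none" */
	}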
/* Filesystem open: */
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
@@ -1845,7 +1793,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
err = "bch2_dev_online() error";
mutex_lock(&c->state_lock);
for (i = 0; i < nr_devices; i++)
- if (__bch2_dev_online(c, &sb[i])) {
+ if (bch2_dev_attach_bdev(c, &sb[i])) {
mutex_unlock(&c->state_lock);
goto err_print;
}
@@ -1856,15 +1804,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err_print;
if (!c->opts.nostart) {
- err = __bch2_fs_start(c);
+ err = bch2_fs_start(c);
if (err)
goto err_print;
}
-
- err = bch2_fs_online(c);
- if (err)
- goto err_print;
-
out:
kfree(sb);
module_put(THIS_MODULE);
@@ -1900,7 +1843,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
if (c) {
closure_get(&c->cl);
- err = bch2_dev_in_fs(c->disk_sb, sb->sb);
+ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
if (err)
goto err;
} else {
@@ -1915,22 +1858,18 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
err = "bch2_dev_online() error";
mutex_lock(&c->sb_lock);
- if (__bch2_dev_online(c, sb)) {
+ if (bch2_dev_attach_bdev(c, sb)) {
mutex_unlock(&c->sb_lock);
goto err;
}
mutex_unlock(&c->sb_lock);
if (!c->opts.nostart && bch2_fs_may_start(c)) {
- err = __bch2_fs_start(c);
+ err = bch2_fs_start(c);
if (err)
goto err;
}
- err = __bch2_fs_online(c);
- if (err)
- goto err;
-
closure_put(&c->cl);
mutex_unlock(&bch_fs_list_lock);
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 652a572f..a52ee3bb 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -195,7 +195,6 @@ int bch2_dev_online(struct bch_fs *, const char *);
int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
-int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index f5468182..ab83ade9 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -7,6 +7,9 @@ struct bch_sb_handle {
struct bio *bio;
unsigned page_order;
fmode_t mode;
+ unsigned have_layout:1;
+ unsigned have_bio:1;
+ unsigned fs_sb:1;
};
struct bch_devs_mask {
@@ -44,8 +47,9 @@ struct bch_replicas_cpu {
};
struct bch_disk_group_cpu {
- struct bch_devs_mask devs;
bool deleted;
+ u16 parent;
+ struct bch_devs_mask devs;
};
struct bch_disk_groups_cpu {
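
Two shapes worth noting in this header: bch_sb_handle now tracks its own state in single-bit flags (have_layout, have_bio, fs_sb), and bch_disk_group_cpu gains a u16 parent, which is what allows labels to form a hierarchy that bch2_disk_path_print (used in the sysfs hunk below) can presumably walk. A standalone sketch of a parent-chain walk under that assumption; the separator and printing logic are illustrative, not the in-tree behavior:

	#include <stdio.h>

	struct toy_group {
		unsigned	parent;		/* 1-based; 0 == root/none */
		const char	*label;
	};

	/* print "parent.child" by recursing up the parent chain first */
	static void toy_path_print(const struct toy_group *groups, unsigned v)
	{
		if (groups[v].parent)
			toy_path_print(groups, groups[v].parent - 1);
		printf("%s%s", groups[v].parent ? "." : "", groups[v].label);
	}

	int main(void)
	{
		struct toy_group g[] = {
			{ .parent = 0, .label = "ssd" },
			{ .parent = 1, .label = "fast" },	/* child of g[0] */
		};

		toy_path_print(g, 1);	/* prints "ssd.fast" */
		printf("\n");
		return 0;
	}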
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 82457348..e8089db9 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -18,11 +18,13 @@
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "disk_groups.h"
#include "inode.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "opts.h"
+#include "replicas.h"
#include "super-io.h"
#include "tier.h"
@@ -140,10 +142,10 @@ read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(durability);
read_attribute(iostats);
-read_attribute(read_priority_stats);
-read_attribute(write_priority_stats);
-read_attribute(fragmentation_stats);
-read_attribute(oldest_gen_stats);
+read_attribute(last_read_quantiles);
+read_attribute(last_write_quantiles);
+read_attribute(fragmentation_quantiles);
+read_attribute(oldest_gen_quantiles);
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@@ -167,7 +169,7 @@ rw_attribute(journal_reclaim_delay_ms);
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
-rw_attribute(group);
+rw_attribute(label);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
@@ -546,7 +548,7 @@ STORE(bch2_fs_opts_dir)
if (opt->set_sb != SET_NO_SB_OPT) {
mutex_lock(&c->sb_lock);
- opt->set_sb(c->disk_sb, v);
+ opt->set_sb(c->disk_sb.sb, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
@@ -621,36 +623,41 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-typedef unsigned (bucket_map_fn)(struct bch_dev *, size_t, void *);
+typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
+ size_t, void *);
-static unsigned bucket_priority_fn(struct bch_dev *ca, size_t b,
- void *private)
+static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, void *private)
{
- struct bucket *g = bucket(ca, b);
int rw = (private ? 1 : 0);
- return ca->fs->prio_clock[rw].hand - g->prio[rw];
+ return bucket_last_io(c, bucket(ca, b), rw);
}
-static unsigned bucket_sectors_used_fn(struct bch_dev *ca, size_t b,
- void *private)
+static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, void *private)
{
struct bucket *g = bucket(ca, b);
return bucket_sectors_used(g->mark);
}
-static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, size_t b,
- void *private)
+static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, void *private)
{
return bucket_gc_gen(ca, b);
}
-static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
- bucket_map_fn *fn, void *private)
+static int unsigned_cmp(const void *_l, const void *_r)
{
- int cmp(const void *l, const void *r)
- { return *((unsigned *) r) - *((unsigned *) l); }
+ unsigned l = *((unsigned *) _l);
+ unsigned r = *((unsigned *) _r);
+
+ return (l > r) - (l < r);
+}
+static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
+ char *buf, bucket_map_fn *fn, void *private)
+{
size_t i, n;
/* Compute 31 quantiles */
unsigned q[31], *p;
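
Two real fixes hide in the hunk above. The old comparator was a nested function, a GNU C extension, and it compared unsigned values by subtraction: the unsigned difference is converted to int on return, so any pair differing by more than INT_MAX comes back with a meaningless sign (it also sorted descending, where the replacement sorts ascending). The (l > r) - (l < r) idiom yields exactly -1, 0, or 1 with no overflow. A standalone demonstration of the misbehavior:

	#include <stdio.h>
	#include <stdlib.h>

	/* the removed pattern: unsigned difference truncated to int, so any
	 * pair differing by more than INT_MAX gets a meaningless sign */
	static int subtract_cmp(const void *l, const void *r)
	{
		return *(const unsigned *) r - *(const unsigned *) l;
	}

	/* the replacement idiom: branch-free -1/0/1, no overflow */
	static int unsigned_cmp(const void *_l, const void *_r)
	{
		unsigned l = *(const unsigned *) _l;
		unsigned r = *(const unsigned *) _r;

		return (l > r) - (l < r);
	}

	int main(void)
	{
		unsigned a[] = { 1, 3000000000u, 2 };
		unsigned b[] = { 1, 3000000000u, 2 };

		/* inconsistent comparator: resulting order is unreliable */
		qsort(a, 3, sizeof(a[0]), subtract_cmp);
		/* well-defined ascending order: 1 2 3000000000 */
		qsort(b, 3, sizeof(b[0]), unsigned_cmp);

		printf("subtract: %u %u %u\n", a[0], a[1], a[2]);
		printf("safe:     %u %u %u\n", b[0], b[1], b[2]);
		return 0;
	}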
@@ -666,9 +673,9 @@ static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
}
for (i = ca->mi.first_bucket; i < n; i++)
- p[i] = fn(ca, i, private);
+ p[i] = fn(c, ca, i, private);
- sort(p, n, sizeof(unsigned), cmp, NULL);
+ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
up_read(&ca->bucket_lock);
while (n &&
@@ -804,24 +811,18 @@ SHOW(bch2_dev)
sysfs_print(durability, ca->mi.durability);
sysfs_print(discard, ca->mi.discard);
- if (attr == &sysfs_group) {
- struct bch_sb_field_disk_groups *groups;
- struct bch_disk_group *g;
- unsigned len;
-
- if (!ca->mi.group)
- return scnprintf(out, end - out, "none\n");
-
- mutex_lock(&c->sb_lock);
- groups = bch2_sb_get_disk_groups(c->disk_sb);
-
- g = &groups->entries[ca->mi.group - 1];
- len = strnlen(g->label, sizeof(g->label));
- memcpy(buf, g->label, len);
- mutex_unlock(&c->sb_lock);
+ if (attr == &sysfs_label) {
+ if (ca->mi.group) {
+ mutex_lock(&c->sb_lock);
+ out += bch2_disk_path_print(&c->disk_sb, out, end - out,
+ ca->mi.group - 1);
+ mutex_unlock(&c->sb_lock);
+ } else {
+ out += scnprintf(out, end - out, "none");
+ }
- buf[len++] = '\n';
- return len;
+ out += scnprintf(out, end - out, "\n");
+ return out - buf;
}
if (attr == &sysfs_has_data) {
@@ -852,14 +853,16 @@ SHOW(bch2_dev)
if (attr == &sysfs_iostats)
return show_dev_iostats(ca, buf);
- if (attr == &sysfs_read_priority_stats)
- return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0);
- if (attr == &sysfs_write_priority_stats)
- return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1);
- if (attr == &sysfs_fragmentation_stats)
- return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL);
- if (attr == &sysfs_oldest_gen_stats)
- return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL);
+
+ if (attr == &sysfs_last_read_quantiles)
+ return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
+ if (attr == &sysfs_last_write_quantiles)
+ return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
+ if (attr == &sysfs_fragmentation_quantiles)
+ return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
+ if (attr == &sysfs_oldest_gen_quantiles)
+ return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
+
if (attr == &sysfs_reserve_stats)
return show_reserve_stats(ca, buf);
if (attr == &sysfs_alloc_debug)
@@ -880,7 +883,7 @@ STORE(bch2_dev)
bool v = strtoul_or_return(buf);
mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
if (v != BCH_MEMBER_DISCARD(mi)) {
SET_BCH_MEMBER_DISCARD(mi, v);
@@ -896,7 +899,7 @@ STORE(bch2_dev)
return v;
mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
SET_BCH_MEMBER_REPLACEMENT(mi, v);
@@ -905,7 +908,7 @@ STORE(bch2_dev)
mutex_unlock(&c->sb_lock);
}
- if (attr == &sysfs_group) {
+ if (attr == &sysfs_label) {
char *tmp;
int ret;
@@ -938,16 +941,16 @@ struct attribute *bch2_dev_files[] = {
&sysfs_discard,
&sysfs_cache_replacement_policy,
&sysfs_state_rw,
- &sysfs_group,
+ &sysfs_label,
&sysfs_has_data,
&sysfs_iostats,
/* alloc info - other stats: */
- &sysfs_read_priority_stats,
- &sysfs_write_priority_stats,
- &sysfs_fragmentation_stats,
- &sysfs_oldest_gen_stats,
+ &sysfs_last_read_quantiles,
+ &sysfs_last_write_quantiles,
+ &sysfs_fragmentation_quantiles,
+ &sysfs_oldest_gen_quantiles,
&sysfs_reserve_stats,
/* debug: */
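
Every STORE path in this file follows the same superblock read-modify-write shape visible above: take sb_lock, fetch the member entry, write the field and call bch2_write_super only if the value actually changed, then drop the lock. A runnable toy of that shape; the lock and write step are stand-ins, and only the compare-before-write structure is the point:

	#include <stdio.h>

	struct toy_member { unsigned discard; };
	struct toy_fs {
		int		  sb_lock;	/* stand-in for a mutex */
		struct toy_member member;
		int		  writes;
	};

	static void toy_write_super(struct toy_fs *c) { c->writes++; }

	static void toy_store_discard(struct toy_fs *c, unsigned v)
	{
		c->sb_lock = 1;			/* mutex_lock(&c->sb_lock) */
		if (v != c->member.discard) {	/* skip no-op writes */
			c->member.discard = v;
			toy_write_super(c);	/* bch2_write_super(c) */
		}
		c->sb_lock = 0;			/* mutex_unlock(&c->sb_lock) */
	}

	int main(void)
	{
		struct toy_fs c = { 0 };

		toy_store_discard(&c, 1);
		toy_store_discard(&c, 1);	/* unchanged: no write */
		printf("superblock writes: %d\n", c.writes);	/* 1 */
		return 0;
	}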
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index 211a844c..a15a0fa9 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
+#include "disk_groups.h"
#include "extents.h"
#include "io.h"
#include "move.h"
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 81e942e5..79a98f75 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -86,8 +86,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
.cmp_bkey = xattr_cmp_bkey,
};
-static const char *bch2_xattr_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr;
@@ -126,8 +125,8 @@ static const char *bch2_xattr_invalid(const struct bch_fs *c,
}
}
-static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
+void bch2_xattr_to_text(struct bch_fs *c, char *buf,
+ size_t size, struct bkey_s_c k)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr;
@@ -159,11 +158,6 @@ static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
}
}
-const struct bkey_ops bch2_bkey_xattr_ops = {
- .key_invalid = bch2_xattr_invalid,
- .val_to_text = bch2_xattr_to_text,
-};
-
int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
const char *name, void *buffer, size_t size, int type)
{
diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h
index 9c815a2d..a58e7e30 100644
--- a/libbcachefs/xattr.h
+++ b/libbcachefs/xattr.h
@@ -4,7 +4,14 @@
#include "str_hash.h"
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-extern const struct bkey_ops bch2_bkey_xattr_ops;
+
+const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_xattr_ops (struct bkey_ops) { \
+ .key_invalid = bch2_xattr_invalid, \
+ .val_to_text = bch2_xattr_to_text, \
+}
struct dentry;
struct xattr_handler;
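
The header change above replaces an out-of-line bch2_bkey_xattr_ops object with a compound-literal macro, presumably so a caller (for instance a per-key-type table in bkey_methods.c, which this commit also touches) can expand it directly into an initializer instead of linking against a shared const object. A standalone sketch of the pattern with toy types; the table use is an assumption:

	#include <stdio.h>
	#include <stddef.h>

	struct toy_ops {
		const char *(*key_invalid)(const void *key);
		void (*val_to_text)(char *buf, size_t size, const void *key);
	};

	static const char *xattr_invalid(const void *key) { return NULL; }
	static void xattr_to_text(char *buf, size_t size, const void *key)
	{
		snprintf(buf, size, "xattr");
	}

	/* compound-literal macro, mirroring the bch2_bkey_xattr_ops change */
	#define toy_xattr_ops (struct toy_ops) {		\
		.key_invalid	= xattr_invalid,		\
		.val_to_text	= xattr_to_text,		\
	}

	int main(void)
	{
		/* expands in place as an initializer, e.g. keyed by type */
		struct toy_ops ops[4] = { [2] = toy_xattr_ops };
		char buf[16];

		ops[2].val_to_text(buf, sizeof(buf), NULL);
		puts(buf);
		return 0;
	}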