author     Kent Overstreet <kent.overstreet@gmail.com>   2017-12-13 16:01:18 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>   2017-12-13 16:12:38 -0500
commit     ea83a3985d28372d56ec7cea6e73907551869f63 (patch)
tree       42b8b0d3da3b1fa96eb4400455559e60a78c4294
parent     f2feceddae6f3bd3722247f3458860b955f539bc (diff)
Update bcachefs sources to e57b5958cf bcachefs: fix for building in userspace
-rw-r--r--  .bcachefs_revision                   |    2
-rw-r--r--  cmd_migrate.c                        |    2
-rw-r--r--  include/trace/events/bcachefs.h      |  226
-rw-r--r--  libbcachefs/alloc.c                  |  921
-rw-r--r--  libbcachefs/alloc.h                  |   87
-rw-r--r--  libbcachefs/alloc_types.h            |   25
-rw-r--r--  libbcachefs/bcachefs.h               |   85
-rw-r--r--  libbcachefs/bcachefs_format.h        |   14
-rw-r--r--  libbcachefs/bkey.c                   |    8
-rw-r--r--  libbcachefs/bset.h                   |   24
-rw-r--r--  libbcachefs/btree_gc.c               |  193
-rw-r--r--  libbcachefs/btree_gc.h               |   10
-rw-r--r--  libbcachefs/btree_io.c               |    8
-rw-r--r--  libbcachefs/btree_io.h               |    1
-rw-r--r--  libbcachefs/btree_locking.h          |    2
-rw-r--r--  libbcachefs/btree_types.h            |   30
-rw-r--r--  libbcachefs/btree_update_interior.c  |   96
-rw-r--r--  libbcachefs/btree_update_leaf.c      |    5
-rw-r--r--  libbcachefs/buckets.c                |  350
-rw-r--r--  libbcachefs/buckets.h                |   76
-rw-r--r--  libbcachefs/buckets_types.h          |   15
-rw-r--r--  libbcachefs/checksum.c               |  168
-rw-r--r--  libbcachefs/checksum.h               |   36
-rw-r--r--  libbcachefs/compress.c               |  135
-rw-r--r--  libbcachefs/compress.h               |   10
-rw-r--r--  libbcachefs/extents.c                |  539
-rw-r--r--  libbcachefs/extents.h                |  339
-rw-r--r--  libbcachefs/extents_types.h          |   27
-rw-r--r--  libbcachefs/eytzinger.h              |   86
-rw-r--r--  libbcachefs/fs-io.c                  |  193
-rw-r--r--  libbcachefs/fs-io.h                  |   65
-rw-r--r--  libbcachefs/fs.c                     |   26
-rw-r--r--  libbcachefs/io.c                     | 1209
-rw-r--r--  libbcachefs/io.h                     |   93
-rw-r--r--  libbcachefs/io_types.h               |   53
-rw-r--r--  libbcachefs/journal.c                |   50
-rw-r--r--  libbcachefs/keylist.h                |    5
-rw-r--r--  libbcachefs/migrate.c                |  125
-rw-r--r--  libbcachefs/move.c                   |  466
-rw-r--r--  libbcachefs/move.h                   |   80
-rw-r--r--  libbcachefs/movinggc.c               |  303
-rw-r--r--  libbcachefs/movinggc.h               |   28
-rw-r--r--  libbcachefs/super-io.c               |    5
-rw-r--r--  libbcachefs/super.c                  |   33
-rw-r--r--  libbcachefs/super.h                  |   24
-rw-r--r--  libbcachefs/super_types.h            |   29
-rw-r--r--  libbcachefs/sysfs.c                  |   80
-rw-r--r--  libbcachefs/tier.c                   |  126
-rw-r--r--  libbcachefs/util.c                   |  135
-rw-r--r--  libbcachefs/util.h                   |   24
50 files changed, 3405 insertions, 3267 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 04ebc308..77247162 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-192d759a491f50d92c89c2e842639d2307c815a5
+e57b5958cf4e8530d26f7c36a6e1427fb284cc70
diff --git a/cmd_migrate.c b/cmd_migrate.c
index d683a5f2..58c0bb96 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -265,7 +265,7 @@ static void write_data(struct bch_fs *c,
if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret));
- bch2_write_op_init(&op, c, res, NULL, 0,
+ bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl);
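Aside: the second argument changes here because bch2_write_op_init() now takes a
struct write_point_specifier rather than a bare unsigned long. A minimal sketch
of the convention, paraphrased from the alloc.h/alloc.c hunks further down -- the
low bit of the specifier says whether it is a hashed identifier or a direct
pointer:
	/* illustration only -- the real definitions are in libbcachefs/alloc.h */
	struct write_point;

	struct write_point_specifier {
		unsigned long	v;
	};

	/* identify a write point by an arbitrary hashed value, e.g. an inode
	 * number; the low bit is set to mark it as hashed: */
	static inline struct write_point_specifier writepoint_hashed(unsigned long v)
	{
		return (struct write_point_specifier) { .v = v | 1 };
	}

	/* refer to one specific write point directly; pointers are aligned,
	 * so the low bit stays clear: */
	static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
	{
		return (struct write_point_specifier) { .v = (unsigned long) wp };
	}
writepoint_find() checks that bit: an even value is cast straight back to a
struct write_point *, an odd value is looked up in the write point hash table.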
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 0c9f3de5..bf187f5e 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -98,23 +98,6 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
-DECLARE_EVENT_CLASS(page_alloc_fail,
- TP_PROTO(struct bch_fs *c, u64 size),
- TP_ARGS(c, size),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(u64, size )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->size = size;
- ),
-
- TP_printk("%pU size %llu", __entry->uuid, __entry->size)
-);
-
/* io.c: */
DEFINE_EVENT(bio, read_split,
@@ -137,34 +120,6 @@ DEFINE_EVENT(bio, promote,
TP_ARGS(bio)
);
-TRACE_EVENT(write_throttle,
- TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, u64 delay),
- TP_ARGS(c, inode, bio, delay),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(u64, inode )
- __field(sector_t, sector )
- __field(unsigned int, nr_sector )
- __array(char, rwbs, 6 )
- __field(u64, delay )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->inode = inode;
- __entry->sector = bio->bi_iter.bi_sector;
- __entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
- __entry->delay = delay;
- ),
-
- TP_printk("%pU inode %llu %s %llu + %u delay %llu",
- __entry->uuid, __entry->inode,
- __entry->rwbs, (unsigned long long)__entry->sector,
- __entry->nr_sector, __entry->delay)
-);
-
/* Journal */
DEFINE_EVENT(bch_fs, journal_full,
@@ -439,16 +394,6 @@ TRACE_EVENT(alloc_batch,
__entry->uuid, __entry->free, __entry->total)
);
-DEFINE_EVENT(bch_dev, prio_write_start,
- TP_PROTO(struct bch_dev *ca),
- TP_ARGS(ca)
-);
-
-DEFINE_EVENT(bch_dev, prio_write_end,
- TP_PROTO(struct bch_dev *ca),
- TP_ARGS(ca)
-);
-
TRACE_EVENT(invalidate,
TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
TP_ARGS(ca, offset, sectors),
@@ -502,174 +447,77 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
TP_ARGS(ca, reserve)
);
-TRACE_EVENT(freelist_empty_fail,
- TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve,
- struct closure *cl),
- TP_ARGS(c, reserve, cl),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(enum alloc_reserve, reserve )
- __field(struct closure *, cl )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->reserve = reserve;
- __entry->cl = cl;
- ),
-
- TP_printk("%pU reserve %d cl %p", __entry->uuid, __entry->reserve,
- __entry->cl)
-);
-
-DECLARE_EVENT_CLASS(open_bucket_alloc,
- TP_PROTO(struct bch_fs *c, struct closure *cl),
- TP_ARGS(c, cl),
-
- TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(struct closure *, cl )
- ),
-
- TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->cl = cl;
- ),
-
- TP_printk("%pU cl %p",
- __entry->uuid, __entry->cl)
-);
-
-DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc,
- TP_PROTO(struct bch_fs *c, struct closure *cl),
- TP_ARGS(c, cl)
-);
-
-DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail,
- TP_PROTO(struct bch_fs *c, struct closure *cl),
- TP_ARGS(c, cl)
+DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
+ TP_ARGS(ca, reserve)
);
/* Moving IO */
-DECLARE_EVENT_CLASS(moving_io,
- TP_PROTO(struct bkey *k),
- TP_ARGS(k),
-
- TP_STRUCT__entry(
- __field(__u32, inode )
- __field(__u64, offset )
- __field(__u32, sectors )
- ),
-
- TP_fast_assign(
- __entry->inode = k->p.inode;
- __entry->offset = k->p.offset;
- __entry->sectors = k->size;
- ),
-
- TP_printk("%u:%llu sectors %u",
- __entry->inode, __entry->offset, __entry->sectors)
-);
-
-DEFINE_EVENT(moving_io, move_read,
- TP_PROTO(struct bkey *k),
- TP_ARGS(k)
-);
-
-DEFINE_EVENT(moving_io, move_read_done,
- TP_PROTO(struct bkey *k),
+DEFINE_EVENT(bkey, move_extent,
+ TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-DEFINE_EVENT(moving_io, move_write,
- TP_PROTO(struct bkey *k),
+DEFINE_EVENT(bkey, move_alloc_fail,
+ TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-DEFINE_EVENT(moving_io, copy_collision,
- TP_PROTO(struct bkey *k),
+DEFINE_EVENT(bkey, move_race,
+ TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-/* Copy GC */
-
-DEFINE_EVENT(page_alloc_fail, moving_gc_alloc_fail,
- TP_PROTO(struct bch_fs *c, u64 size),
- TP_ARGS(c, size)
-);
-
-DEFINE_EVENT(bch_dev, moving_gc_start,
- TP_PROTO(struct bch_dev *ca),
- TP_ARGS(ca)
-);
-
-TRACE_EVENT(moving_gc_end,
- TP_PROTO(struct bch_dev *ca, u64 sectors_moved, u64 keys_moved,
- u64 buckets_moved),
- TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved),
+TRACE_EVENT(move_data,
+ TP_PROTO(struct bch_fs *c, u64 sectors_moved,
+ u64 keys_moved),
+ TP_ARGS(c, sectors_moved, keys_moved),
TP_STRUCT__entry(
__array(char, uuid, 16 )
__field(u64, sectors_moved )
__field(u64, keys_moved )
- __field(u64, buckets_moved )
),
TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->sectors_moved = sectors_moved;
__entry->keys_moved = keys_moved;
- __entry->buckets_moved = buckets_moved;
),
- TP_printk("%pU sectors_moved %llu keys_moved %llu buckets_moved %llu",
- __entry->uuid, __entry->sectors_moved, __entry->keys_moved,
- __entry->buckets_moved)
-);
-
-DEFINE_EVENT(bkey, gc_copy,
- TP_PROTO(const struct bkey *k),
- TP_ARGS(k)
-);
-
-/* Tiering */
-
-DEFINE_EVENT(page_alloc_fail, tiering_alloc_fail,
- TP_PROTO(struct bch_fs *c, u64 size),
- TP_ARGS(c, size)
+ TP_printk("%pU sectors_moved %llu keys_moved %llu",
+ __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
);
-DEFINE_EVENT(bch_fs, tiering_start,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-TRACE_EVENT(tiering_end,
- TP_PROTO(struct bch_fs *c, u64 sectors_moved,
- u64 keys_moved),
- TP_ARGS(c, sectors_moved, keys_moved),
+TRACE_EVENT(copygc,
+ TP_PROTO(struct bch_dev *ca,
+ u64 sectors_moved, u64 sectors_not_moved,
+ u64 buckets_moved, u64 buckets_not_moved),
+ TP_ARGS(ca,
+ sectors_moved, sectors_not_moved,
+ buckets_moved, buckets_not_moved),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(u64, sectors_moved )
- __field(u64, keys_moved )
+ __array(char, uuid, 16 )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_not_moved )
+ __field(u64, buckets_moved )
+ __field(u64, buckets_not_moved )
),
TP_fast_assign(
- memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
- __entry->sectors_moved = sectors_moved;
- __entry->keys_moved = keys_moved;
+ memcpy(__entry->uuid, ca->uuid.b, 16);
+ __entry->sectors_moved = sectors_moved;
+ __entry->sectors_not_moved = sectors_not_moved;
+ __entry->buckets_moved = buckets_moved;
+ __entry->buckets_not_moved = buckets_not_moved;
),
- TP_printk("%pU sectors_moved %llu keys_moved %llu",
- __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
-);
-
-DEFINE_EVENT(bkey, tiering_copy,
- TP_PROTO(const struct bkey *k),
- TP_ARGS(k)
+ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
+ __entry->uuid,
+ __entry->sectors_moved, __entry->sectors_not_moved,
+ __entry->buckets_moved, __entry->buckets_not_moved)
);
#endif /* _TRACE_BCACHE_H */
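Aside: the net effect of the tracepoint changes above is that the per-device
moving_gc_*/tiering_* events, write_throttle, prio_write_start/end,
freelist_empty_fail and the page_alloc_fail and open_bucket_alloc classes are
gone, replaced by the bkey-class move_extent/move_alloc_fail/move_race events, a
filesystem-wide move_data event and a per-device copygc event that also reports
what could not be moved. Hypothetical call sites matching the new TP_PROTOs --
the real callers live in libbcachefs/move.c and movinggc.c, which this page
lists but does not show:
	trace_move_data(c, sectors_moved, keys_moved);

	trace_copygc(ca,
		     sectors_moved, sectors_not_moved,
		     buckets_moved, buckets_not_moved);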
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index dc7348fc..d29d871a 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -56,6 +56,7 @@
#include "bcachefs.h"
#include "alloc.h"
#include "btree_update.h"
+#include "btree_gc.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
@@ -76,7 +77,7 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-static void bch2_recalc_min_prio(struct bch_dev *, int);
+static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int);
/* Ratelimiting/PD controllers */
@@ -92,8 +93,6 @@ static void pd_controllers_update(struct work_struct *work)
u64 faster_tiers_size = 0;
u64 faster_tiers_dirty = 0;
- u64 fastest_tier_size = 0;
- u64 fastest_tier_free = 0;
u64 copygc_can_free = 0;
rcu_read_lock();
@@ -105,7 +104,7 @@ static void pd_controllers_update(struct work_struct *work)
-1);
for_each_member_device_rcu(ca, c, iter, &c->tiers[i].devs) {
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
u64 size = bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket) << 9;
@@ -125,18 +124,12 @@ static void pd_controllers_update(struct work_struct *work)
fragmented = max(0LL, fragmented);
- bch2_pd_controller_update(&ca->moving_gc_pd,
+ bch2_pd_controller_update(&ca->copygc_pd,
free, fragmented, -1);
faster_tiers_size += size;
faster_tiers_dirty += dirty;
- if (!c->fastest_tier ||
- c->fastest_tier == &c->tiers[i]) {
- fastest_tier_size += size;
- fastest_tier_free += free;
- }
-
copygc_can_free += fragmented;
}
}
@@ -157,14 +150,6 @@ static void pd_controllers_update(struct work_struct *work)
if (c->fastest_tier)
copygc_can_free = U64_MAX;
- bch2_pd_controller_update(&c->foreground_write_pd,
- min(copygc_can_free,
- div_u64(fastest_tier_size *
- c->foreground_target_percent,
- 100)),
- fastest_tier_free,
- -1);
-
schedule_delayed_work(&c->pd_controllers_update,
c->pd_controllers_update_seconds * HZ);
}
@@ -295,6 +280,8 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
struct journal_replay *r;
struct btree_iter iter;
struct bkey_s_c k;
+ struct bch_dev *ca;
+ unsigned i;
int ret;
if (!c->btree_roots[BTREE_ID_ALLOC].b)
@@ -318,6 +305,11 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
+ for_each_member_device(ca, c, i) {
+ bch2_recalc_min_prio(c, ca, READ);
+ bch2_recalc_min_prio(c, ca, WRITE);
+ }
+
return 0;
}
@@ -436,7 +428,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
if (gc_count != c->gc_count)
ca->inc_gen_really_needs_gc = 0;
- if ((ssize_t) (dev_buckets_available(ca) -
+ if ((ssize_t) (dev_buckets_available(c, ca) -
ca->inc_gen_really_needs_gc) >=
(ssize_t) fifo_free(&ca->free_inc))
break;
@@ -451,9 +443,10 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
-static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket)
+static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
+ size_t bucket)
{
- if (expensive_debug_checks(ca->fs)) {
+ if (expensive_debug_checks(c)) {
size_t iter;
long i;
unsigned j;
@@ -468,9 +461,8 @@ static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket)
/* Bucket heap / gen */
-void bch2_recalc_min_prio(struct bch_dev *ca, int rw)
+void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
{
- struct bch_fs *c = ca->fs;
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket *g;
u16 max_delta = 1;
@@ -478,14 +470,14 @@ void bch2_recalc_min_prio(struct bch_dev *ca, int rw)
lockdep_assert_held(&c->bucket_lock);
- /* Determine min prio for this particular cache */
+ /* Determine min prio for this particular device */
for_each_bucket(g, ca)
max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
ca->min_prio[rw] = clock->hand - max_delta;
/*
- * This may possibly increase the min prio for the whole cache, check
+ * This may possibly increase the min prio for the whole device, check
* that as well.
*/
max_delta = 1;
@@ -511,7 +503,7 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
g->prio[rw] = clock->hand -
(clock->hand - g->prio[rw]) / 2;
- bch2_recalc_min_prio(ca, rw);
+ bch2_recalc_min_prio(c, ca, rw);
}
}
@@ -588,20 +580,20 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
return can_inc_bucket_gen(ca, g);
}
-static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
+static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g)
{
- struct bch_fs *c = ca->fs;
struct bucket_mark m;
- spin_lock(&ca->freelist_lock);
- if (!bch2_invalidate_bucket(ca, g, &m)) {
- spin_unlock(&ca->freelist_lock);
+ spin_lock(&c->freelist_lock);
+ if (!bch2_invalidate_bucket(c, ca, g, &m)) {
+ spin_unlock(&c->freelist_lock);
return;
}
- verify_not_on_freelist(ca, g - ca->buckets);
+ verify_not_on_freelist(c, ca, g - ca->buckets);
BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
- spin_unlock(&ca->freelist_lock);
+ spin_unlock(&c->freelist_lock);
g->prio[READ] = c->prio_clock[READ].hand;
g->prio[WRITE] = c->prio_clock[WRITE].hand;
@@ -641,9 +633,8 @@ static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
* number wraparound.
*/
-static unsigned long bucket_sort_key(struct bch_dev *ca,
- struct bucket *g,
- struct bucket_mark m)
+static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g, struct bucket_mark m)
{
/*
* Time since last read, scaled to [0, 8) where larger value indicates
@@ -651,14 +642,14 @@ static unsigned long bucket_sort_key(struct bch_dev *ca,
*/
unsigned long hotness =
(g->prio[READ] - ca->min_prio[READ]) * 7 /
- (ca->fs->prio_clock[READ].hand - ca->min_prio[READ]);
+ (c->prio_clock[READ].hand - ca->min_prio[READ]);
/* How much we want to keep the data in this bucket: */
unsigned long data_wantness =
(hotness + 1) * bucket_sectors_used(m);
unsigned long needs_journal_commit =
- bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk);
+ bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
return (data_wantness << 9) |
(needs_journal_commit << 8) |
@@ -672,16 +663,16 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
return (l.key > r.key) - (l.key < r.key);
}
-static void invalidate_buckets_lru(struct bch_dev *ca)
+static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct alloc_heap_entry e;
struct bucket *g;
ca->alloc_heap.used = 0;
- mutex_lock(&ca->fs->bucket_lock);
- bch2_recalc_min_prio(ca, READ);
- bch2_recalc_min_prio(ca, WRITE);
+ mutex_lock(&c->bucket_lock);
+ bch2_recalc_min_prio(c, ca, READ);
+ bch2_recalc_min_prio(c, ca, WRITE);
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@@ -696,7 +687,7 @@ static void invalidate_buckets_lru(struct bch_dev *ca)
e = (struct alloc_heap_entry) {
.bucket = g - ca->buckets,
- .key = bucket_sort_key(ca, g, m)
+ .key = bucket_sort_key(c, ca, g, m)
};
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
@@ -710,12 +701,12 @@ static void invalidate_buckets_lru(struct bch_dev *ca)
*/
while (!fifo_full(&ca->free_inc) &&
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
- bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]);
+ bch2_invalidate_one_bucket(c, ca, &ca->buckets[e.bucket]);
- mutex_unlock(&ca->fs->bucket_lock);
+ mutex_unlock(&c->bucket_lock);
}
-static void invalidate_buckets_fifo(struct bch_dev *ca)
+static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_mark m;
struct bucket *g;
@@ -730,14 +721,14 @@ static void invalidate_buckets_fifo(struct bch_dev *ca)
m = READ_ONCE(g->mark);
if (bch2_can_invalidate_bucket(ca, g, m))
- bch2_invalidate_one_bucket(ca, g);
+ bch2_invalidate_one_bucket(c, ca, g);
if (++checked >= ca->mi.nbuckets)
return;
}
}
-static void invalidate_buckets_random(struct bch_dev *ca)
+static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_mark m;
struct bucket *g;
@@ -752,27 +743,27 @@ static void invalidate_buckets_random(struct bch_dev *ca)
m = READ_ONCE(g->mark);
if (bch2_can_invalidate_bucket(ca, g, m))
- bch2_invalidate_one_bucket(ca, g);
+ bch2_invalidate_one_bucket(c, ca, g);
if (++checked >= ca->mi.nbuckets / 2)
return;
}
}
-static void invalidate_buckets(struct bch_dev *ca)
+static void invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
case CACHE_REPLACEMENT_LRU:
- invalidate_buckets_lru(ca);
+ invalidate_buckets_lru(c, ca);
break;
case CACHE_REPLACEMENT_FIFO:
- invalidate_buckets_fifo(ca);
+ invalidate_buckets_fifo(c, ca);
break;
case CACHE_REPLACEMENT_RANDOM:
- invalidate_buckets_random(ca);
+ invalidate_buckets_random(c, ca);
break;
}
}
@@ -812,7 +803,8 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
* Given an invalidated, ready to use bucket: issue a discard to it if enabled,
* then add it to the freelist, waiting until there's room if necessary:
*/
-static void discard_invalidated_bucket(struct bch_dev *ca, long bucket)
+static void discard_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca,
+ long bucket)
{
if (ca->mi.discard &&
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
@@ -830,15 +822,15 @@ static void discard_invalidated_bucket(struct bch_dev *ca, long bucket)
* Don't remove from free_inc until after it's added to
* freelist, so gc can find it:
*/
- spin_lock(&ca->freelist_lock);
+ spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
- closure_wake_up(&ca->fs->freelist_wait);
+ closure_wake_up(&c->freelist_wait);
pushed = true;
break;
}
- spin_unlock(&ca->freelist_lock);
+ spin_unlock(&c->freelist_lock);
if (pushed)
break;
@@ -877,7 +869,7 @@ static int bch2_allocator_thread(void *arg)
BUG_ON(fifo_empty(&ca->free_inc));
bucket = fifo_peek(&ca->free_inc);
- discard_invalidated_bucket(ca, bucket);
+ discard_invalidated_bucket(c, ca, bucket);
if (kthread_should_stop())
return 0;
--ca->nr_invalidated;
@@ -924,7 +916,7 @@ static int bch2_allocator_thread(void *arg)
* another cache tier
*/
- invalidate_buckets(ca);
+ invalidate_buckets(c, ca);
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
@@ -949,12 +941,12 @@ static int bch2_allocator_thread(void *arg)
BUG_ON(ca->free_inc.front);
- spin_lock(&ca->freelist_lock);
+ spin_lock(&c->freelist_lock);
sort(ca->free_inc.data,
ca->free_inc.back,
sizeof(ca->free_inc.data[0]),
size_t_cmp, NULL);
- spin_unlock(&ca->freelist_lock);
+ spin_unlock(&c->freelist_lock);
/*
* free_inc is now full of newly-invalidated buckets: next,
@@ -966,6 +958,55 @@ static int bch2_allocator_thread(void *arg)
/* Allocation */
/*
+ * Open buckets represent a bucket that's currently being allocated from. They
+ * serve two purposes:
+ *
+ * - They track buckets that have been partially allocated, allowing for
+ * sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ * - They provide a reference to the buckets they own that mark and sweep GC
+ * can find, until the new allocation has a pointer to it inserted into the
+ * btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
+
+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = c->devs[ob->ptr.dev];
+
+ spin_lock(&ob->lock);
+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
+ gc_pos_alloc(c, ob), 0);
+ ob->valid = false;
+ spin_unlock(&ob->lock);
+
+ spin_lock(&c->freelist_lock);
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
+ c->open_buckets_nr_free++;
+ spin_unlock(&c->freelist_lock);
+
+ closure_wake_up(&c->open_buckets_wait);
+}
+
+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
+
+ ob = c->open_buckets + c->open_buckets_freelist;
+ c->open_buckets_freelist = ob->freelist;
+ atomic_set(&ob->pin, 1);
+
+ c->open_buckets_nr_free--;
+ return ob;
+}
+
+/*
* XXX: allocation on startup is still sketchy. There is insufficient
* synchronization for bch2_bucket_alloc_startup() to work correctly after
* bch2_alloc_write() has been called, and we aren't currently doing anything
@@ -994,7 +1035,7 @@ static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
for_each_bucket(g, ca)
if (!g->mark.touched_this_mount &&
is_available_bucket(g->mark) &&
- bch2_mark_alloc_bucket_startup(ca, g)) {
+ bch2_mark_alloc_bucket_startup(c, ca, g)) {
r = g - ca->buckets;
set_bit(r, ca->bucket_dirty);
break;
@@ -1004,69 +1045,105 @@ out:
return r;
}
+static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
+{
+ switch (reserve) {
+ case RESERVE_ALLOC:
+ return 0;
+ case RESERVE_BTREE:
+ return BTREE_NODE_RESERVE / 2;
+ default:
+ return BTREE_NODE_RESERVE;
+ }
+}
+
/**
* bch_bucket_alloc - allocate a single bucket from a specific device
*
* Returns index of bucket on success, 0 on failure
* */
-long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum alloc_reserve reserve)
+int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ bool may_alloc_partial,
+ struct closure *cl)
{
- size_t r;
+ struct open_bucket *ob;
+ long bucket;
+
+ spin_lock(&c->freelist_lock);
+ if (may_alloc_partial &&
+ ca->open_buckets_partial_nr) {
+ int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+ c->open_buckets[ret].on_partial_list = false;
+ spin_unlock(&c->freelist_lock);
+ return ret;
+ }
+
+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
+ if (cl)
+ closure_wait(&c->open_buckets_wait, cl);
+ spin_unlock(&c->freelist_lock);
+ trace_open_bucket_alloc_fail(ca, reserve);
+ return OPEN_BUCKETS_EMPTY;
+ }
- spin_lock(&ca->freelist_lock);
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], r)))
+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
goto out;
switch (reserve) {
case RESERVE_ALLOC:
- if (fifo_pop(&ca->free[RESERVE_BTREE], r))
+ if (fifo_pop(&ca->free[RESERVE_BTREE], bucket))
goto out;
break;
case RESERVE_BTREE:
if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
ca->free[RESERVE_BTREE].size &&
- fifo_pop(&ca->free[RESERVE_BTREE], r))
+ fifo_pop(&ca->free[RESERVE_BTREE], bucket))
goto out;
break;
case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r))
+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
goto out;
break;
default:
break;
}
- spin_unlock(&ca->freelist_lock);
-
if (unlikely(!ca->alloc_thread_started) &&
(reserve == RESERVE_ALLOC) &&
- (r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
- verify_not_on_freelist(ca, r);
- goto out2;
- }
+ (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0)
+ goto out;
+
+ spin_unlock(&c->freelist_lock);
trace_bucket_alloc_fail(ca, reserve);
- return -1;
+ return FREELIST_EMPTY;
out:
- verify_not_on_freelist(ca, r);
- spin_unlock(&ca->freelist_lock);
+ verify_not_on_freelist(c, ca, bucket);
+
+ ob = bch2_open_bucket_alloc(c);
+
+ spin_lock(&ob->lock);
+ ob->valid = true;
+ ob->sectors_free = ca->mi.bucket_size;
+ ob->ptr = (struct bch_extent_ptr) {
+ .gen = ca->buckets[bucket].mark.gen,
+ .offset = bucket_to_sector(ca, bucket),
+ .dev = ca->dev_idx,
+ };
+ spin_unlock(&ob->lock);
+
+ spin_unlock(&c->freelist_lock);
bch2_wake_allocator(ca);
-out2:
- ca->buckets[r].prio[READ] = c->prio_clock[READ].hand;
- ca->buckets[r].prio[WRITE] = c->prio_clock[WRITE].hand;
+
+ ca->buckets[bucket].prio[READ] = c->prio_clock[READ].hand;
+ ca->buckets[bucket].prio[WRITE] = c->prio_clock[WRITE].hand;
trace_bucket_alloc(ca, reserve);
- return r;
+ return ob - c->open_buckets;
}
-enum bucket_alloc_ret {
- ALLOC_SUCCESS,
- NO_DEVICES, /* -EROFS */
- FREELIST_EMPTY, /* Allocator thread not keeping up */
-};
-
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
struct write_point *wp,
struct bch_devs_mask *devs)
@@ -1091,11 +1168,7 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
break;
}
- memmove(&ret.devs[j + 1],
- &ret.devs[j],
- sizeof(ret.devs[0]) * (ret.nr - j));
- ret.nr++;
- ret.devs[j] = i;
+ array_insert_item(ret.devs, ret.nr, j, i);
}
return ret;
@@ -1112,63 +1185,46 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
struct write_point *wp,
- struct open_bucket *ob,
unsigned nr_replicas,
enum alloc_reserve reserve,
- struct bch_devs_mask *devs)
+ struct bch_devs_mask *devs,
+ struct closure *cl)
{
enum bucket_alloc_ret ret = NO_DEVICES;
struct dev_alloc_list devs_sorted;
u64 buckets_free;
unsigned i;
- BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
+ BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs));
- if (ob->nr_ptrs >= nr_replicas)
+ if (wp->nr_ptrs >= nr_replicas)
return ALLOC_SUCCESS;
rcu_read_lock();
devs_sorted = bch2_wp_alloc_list(c, wp, devs);
- spin_lock(&ob->lock);
for (i = 0; i < devs_sorted.nr; i++) {
struct bch_dev *ca =
rcu_dereference(c->devs[devs_sorted.devs[i]]);
- struct open_bucket_ptr ptr;
+ int ob;
if (!ca)
continue;
- if (wp->type == BCH_DATA_USER &&
- ca->open_buckets_partial_nr) {
- ptr = ca->open_buckets_partial[--ca->open_buckets_partial_nr];
- } else {
- long bucket = bch2_bucket_alloc(c, ca, reserve);
- if (bucket < 0) {
- ret = FREELIST_EMPTY;
- continue;
- }
-
- ptr = (struct open_bucket_ptr) {
- .ptr.gen = ca->buckets[bucket].mark.gen,
- .ptr.offset = bucket_to_sector(ca, bucket),
- .ptr.dev = ca->dev_idx,
- .sectors_free = ca->mi.bucket_size,
- };
+ ob = bch2_bucket_alloc(c, ca, reserve,
+ wp->type == BCH_DATA_USER, cl);
+ if (ob < 0) {
+ ret = ob;
+ if (ret == OPEN_BUCKETS_EMPTY)
+ break;
+ continue;
}
- /*
- * open_bucket_add_buckets expects new pointers at the head of
- * the list:
- */
- BUG_ON(ob->nr_ptrs >= ARRAY_SIZE(ob->ptrs));
- memmove(&ob->ptrs[1],
- &ob->ptrs[0],
- ob->nr_ptrs * sizeof(ob->ptrs[0]));
- ob->nr_ptrs++;
- ob->ptrs[0] = ptr;
-
- buckets_free = U64_MAX, dev_buckets_free(ca);
+ BUG_ON(ob <= 0 || ob > U8_MAX);
+ BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs));
+ wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob;
+
+ buckets_free = U64_MAX, dev_buckets_free(c, ca);
if (buckets_free)
wp->next_alloc[ca->dev_idx] +=
div64_u64(U64_MAX, buckets_free *
@@ -1179,20 +1235,21 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
__clear_bit(ca->dev_idx, devs->d);
- if (ob->nr_ptrs == nr_replicas) {
+ if (wp->nr_ptrs == nr_replicas) {
ret = ALLOC_SUCCESS;
break;
}
}
- EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
- spin_unlock(&ob->lock);
+ EBUG_ON(reserve == RESERVE_MOVINGGC &&
+ ret != ALLOC_SUCCESS &&
+ ret != OPEN_BUCKETS_EMPTY);
rcu_read_unlock();
return ret;
}
static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
- struct open_bucket *ob, unsigned nr_replicas,
+ unsigned nr_replicas,
enum alloc_reserve reserve,
struct bch_devs_mask *devs,
struct closure *cl)
@@ -1200,8 +1257,8 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
bool waiting = false;
while (1) {
- switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
- reserve, devs)) {
+ switch (__bch2_bucket_alloc_set(c, wp, nr_replicas,
+ reserve, devs, cl)) {
case ALLOC_SUCCESS:
if (waiting)
closure_wake_up(&c->freelist_wait);
@@ -1214,10 +1271,6 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
return -EROFS;
case FREELIST_EMPTY:
- if (!cl || waiting)
- trace_freelist_empty_fail(c,
- reserve, cl);
-
if (!cl)
return -ENOSPC;
@@ -1228,226 +1281,89 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
closure_wait(&c->freelist_wait, cl);
waiting = true;
break;
+ case OPEN_BUCKETS_EMPTY:
+ return cl ? -EAGAIN : -ENOSPC;
default:
BUG();
}
}
}
-/* Open buckets: */
-
-/*
- * Open buckets represent one or more buckets (on multiple devices) that are
- * currently being allocated from. They serve two purposes:
- *
- * - They track buckets that have been partially allocated, allowing for
- * sub-bucket sized allocations - they're used by the sector allocator below
- *
- * - They provide a reference to the buckets they own that mark and sweep GC
- * can find, until the new allocation has a pointer to it inserted into the
- * btree
- *
- * When allocating some space with the sector allocator, the allocation comes
- * with a reference to an open bucket - the caller is required to put that
- * reference _after_ doing the index update that makes its allocation reachable.
- */
+/* Sector allocator */
-void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+static void writepoint_drop_ptrs(struct bch_fs *c,
+ struct write_point *wp,
+ struct bch_devs_mask *devs,
+ unsigned nr_ptrs_dislike)
{
- const struct open_bucket_ptr *ptr;
- u8 new_ob;
+ int i;
- if (!atomic_dec_and_test(&ob->pin))
+ if (!nr_ptrs_dislike)
return;
- down_read(&c->alloc_gc_lock);
- spin_lock(&ob->lock);
-
- open_bucket_for_each_ptr(ob, ptr) {
- struct bch_dev *ca = c->devs[ptr->ptr.dev];
+ for (i = wp->nr_ptrs - 1; i >= 0; --i) {
+ struct open_bucket *ob = wp->ptrs[i];
+ struct bch_dev *ca = c->devs[ob->ptr.dev];
- if (ptr->sectors_free) {
- /*
- * This is a ptr to a bucket that still has free space,
- * but we don't want to use it
- */
+ if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
BUG_ON(ca->open_buckets_partial_nr >=
ARRAY_SIZE(ca->open_buckets_partial));
- spin_lock(&ca->freelist_lock);
- ca->open_buckets_partial[ca->open_buckets_partial_nr++]
- = *ptr;
- spin_unlock(&ca->freelist_lock);
- } else {
- bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), false);
- }
- }
- ob->nr_ptrs = 0;
-
- spin_unlock(&ob->lock);
- up_read(&c->alloc_gc_lock);
-
- new_ob = ob->new_ob;
- ob->new_ob = 0;
-
- spin_lock(&c->open_buckets_lock);
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
- c->open_buckets_nr_free++;
- spin_unlock(&c->open_buckets_lock);
-
- closure_wake_up(&c->open_buckets_wait);
-
- if (new_ob)
- bch2_open_bucket_put(c, c->open_buckets + new_ob);
-}
-
-static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c,
- unsigned nr_reserved,
- struct closure *cl)
-{
- struct open_bucket *ret;
-
- spin_lock(&c->open_buckets_lock);
-
- if (c->open_buckets_nr_free > nr_reserved) {
- BUG_ON(!c->open_buckets_freelist);
-
- ret = c->open_buckets + c->open_buckets_freelist;
- c->open_buckets_freelist = ret->freelist;
- atomic_set(&ret->pin, 1); /* XXX */
+ spin_lock(&c->freelist_lock);
+ ob->on_partial_list = true;
+ ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
+ ob - c->open_buckets;
+ spin_unlock(&c->freelist_lock);
- BUG_ON(ret->new_ob);
- BUG_ON(ret->nr_ptrs);
-
- c->open_buckets_nr_free--;
- trace_open_bucket_alloc(c, cl);
- } else {
- trace_open_bucket_alloc_fail(c, cl);
-
- if (cl) {
- closure_wait(&c->open_buckets_wait, cl);
- ret = ERR_PTR(-EAGAIN);
- } else
- ret = ERR_PTR(-ENOSPC);
- }
-
- spin_unlock(&c->open_buckets_lock);
-
- return ret;
-}
-
-static unsigned open_bucket_sectors_free(struct bch_fs *c,
- struct open_bucket *ob,
- unsigned nr_replicas)
-{
- unsigned sectors_free = UINT_MAX;
- struct open_bucket_ptr *ptr;
-
- open_bucket_for_each_ptr(ob, ptr)
- sectors_free = min(sectors_free, ptr->sectors_free);
-
- return sectors_free != UINT_MAX ? sectors_free : 0;
-}
-
-static void open_bucket_move_ptrs(struct bch_fs *c,
- struct open_bucket *dst,
- struct open_bucket *src,
- struct bch_devs_mask *devs,
- unsigned nr_ptrs_dislike)
-{
- bool moved_ptr = false;
- int i;
-
- down_read(&c->alloc_gc_lock);
-
- if (dst < src) {
- spin_lock(&dst->lock);
- spin_lock_nested(&src->lock, 1);
- } else {
- spin_lock(&src->lock);
- spin_lock_nested(&dst->lock, 1);
- }
+ closure_wake_up(&c->open_buckets_wait);
+ closure_wake_up(&c->freelist_wait);
- for (i = src->nr_ptrs - 1; i >= 0; --i) {
- if (!src->ptrs[i].sectors_free) {
- /*
- * Don't do anything: leave the ptr on the old
- * open_bucket for gc to find
- */
- } else if (nr_ptrs_dislike &&
- !test_bit(src->ptrs[i].ptr.dev, devs->d)) {
- /*
- * We don't want this pointer; bch2_open_bucket_put()
- * will stick it on ca->open_buckets_partial to be
- * reused
- */
+ array_remove_item(wp->ptrs, wp->nr_ptrs, i);
--nr_ptrs_dislike;
- } else {
- BUG_ON(dst->nr_ptrs >= ARRAY_SIZE(dst->ptrs));
-
- dst->ptrs[dst->nr_ptrs++] = src->ptrs[i];
-
- src->nr_ptrs--;
- memmove(&src->ptrs[i],
- &src->ptrs[i + 1],
- (src->nr_ptrs - i) * sizeof(src->ptrs[0]));
-
- moved_ptr = true;
}
}
-
- if (moved_ptr) {
- BUG_ON(src->new_ob);
-
- atomic_inc(&dst->pin);
- src->new_ob = dst - c->open_buckets;
- }
-
- spin_unlock(&dst->lock);
- spin_unlock(&src->lock);
- up_read(&c->alloc_gc_lock);
}
-static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob)
+static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- const struct open_bucket_ptr *ptr;
+ struct open_bucket *ob;
+ unsigned i;
- open_bucket_for_each_ptr(ob, ptr) {
- struct bch_dev *ca = c->devs[ptr->ptr.dev];
+ writepoint_for_each_ptr(wp, ob, i) {
+ struct bch_dev *ca = c->devs[ob->ptr.dev];
- BUG_ON(ptr_stale(ca, &ptr->ptr));
+ BUG_ON(ptr_stale(ca, &ob->ptr));
}
#endif
}
-/* Sector allocator */
-
static int open_bucket_add_buckets(struct bch_fs *c,
- struct write_point *wp,
struct bch_devs_mask *_devs,
- struct open_bucket *ob,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
unsigned nr_replicas,
enum alloc_reserve reserve,
struct closure *cl)
{
struct bch_devs_mask devs = c->rw_devs[wp->type];
- struct open_bucket_ptr *ptr;
+ struct open_bucket *ob;
+ unsigned i;
- if (ob->nr_ptrs >= nr_replicas)
+ if (wp->nr_ptrs >= nr_replicas)
return 0;
+ /* Don't allocate from devices we already have pointers to: */
+ for (i = 0; i < devs_have->nr; i++)
+ __clear_bit(devs_have->devs[i], devs.d);
+
+ writepoint_for_each_ptr(wp, ob, i)
+ __clear_bit(ob->ptr.dev, devs.d);
+
if (_devs)
bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX);
- /* Don't allocate from devices we already have pointers to: */
- open_bucket_for_each_ptr(ob, ptr)
- if (ptr->sectors_free)
- __clear_bit(ptr->ptr.dev, devs.d);
-
- return bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
- reserve, &devs, cl);
+ return bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
}
static struct write_point *__writepoint_find(struct hlist_head *head,
@@ -1455,15 +1371,9 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
{
struct write_point *wp;
- hlist_for_each_entry_rcu(wp, head, node) {
- if (wp->write_point == write_point)
- continue;
-
- mutex_lock(&wp->lock);
+ hlist_for_each_entry_rcu(wp, head, node)
if (wp->write_point == write_point)
return wp;
- mutex_unlock(&wp->lock);
- }
return NULL;
}
@@ -1478,47 +1388,49 @@ static struct hlist_head *writepoint_hash(struct bch_fs *c,
}
static struct write_point *writepoint_find(struct bch_fs *c,
- enum bch_data_type data_type,
unsigned long write_point)
{
- struct write_point *wp, *oldest = NULL;
+ struct write_point *wp, *oldest;
struct hlist_head *head;
- switch (data_type) {
- case BCH_DATA_BTREE:
- wp = &c->btree_write_point;
+ if (!(write_point & 1UL)) {
+ wp = (struct write_point *) write_point;
mutex_lock(&wp->lock);
return wp;
- case BCH_DATA_USER:
- break;
- default:
- BUG();
}
head = writepoint_hash(c, write_point);
+restart_find:
wp = __writepoint_find(head, write_point);
- if (wp)
- goto out;
-
- mutex_lock(&c->write_points_hash_lock);
- wp = __writepoint_find(head, write_point);
- if (wp)
- goto out_unlock;
+ if (wp) {
+lock_wp:
+ mutex_lock(&wp->lock);
+ if (wp->write_point == write_point)
+ goto out;
+ mutex_unlock(&wp->lock);
+ goto restart_find;
+ }
+ oldest = NULL;
for (wp = c->write_points;
wp < c->write_points + ARRAY_SIZE(c->write_points);
wp++)
if (!oldest || time_before64(wp->last_used, oldest->last_used))
oldest = wp;
- wp = oldest;
- BUG_ON(!wp);
+ mutex_lock(&oldest->lock);
+ mutex_lock(&c->write_points_hash_lock);
+ wp = __writepoint_find(head, write_point);
+ if (wp && wp != oldest) {
+ mutex_unlock(&c->write_points_hash_lock);
+ mutex_unlock(&oldest->lock);
+ goto lock_wp;
+ }
- mutex_lock(&wp->lock);
+ wp = oldest;
hlist_del_rcu(&wp->node);
wp->write_point = write_point;
hlist_add_head_rcu(&wp->node, head);
-out_unlock:
mutex_unlock(&c->write_points_hash_lock);
out:
wp->last_used = sched_clock();
@@ -1529,97 +1441,81 @@ out:
* Get us an open_bucket we can allocate from, return with it locked:
*/
struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_mask *devs,
- unsigned long write_point,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- unsigned flags,
- struct closure *cl)
+ struct bch_devs_mask *devs,
+ struct write_point_specifier write_point,
+ struct bch_devs_list *devs_have,
+ unsigned nr_replicas,
+ unsigned nr_replicas_required,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl)
{
- struct open_bucket *ob;
struct write_point *wp;
- struct open_bucket_ptr *ptr;
- unsigned open_buckets_reserved = data_type == BCH_DATA_BTREE
- ? 0 : BTREE_NODE_RESERVE;
- unsigned nr_ptrs_empty = 0, nr_ptrs_dislike = 0;
+ struct open_bucket *ob;
+ unsigned i, nr_ptrs_dislike = 0, nr_ptrs_have = 0;
int ret;
- BUG_ON(!nr_replicas);
+ BUG_ON(!nr_replicas || !nr_replicas_required);
- wp = writepoint_find(c, data_type, write_point);
- BUG_ON(wp->type != data_type);
-
- wp->last_used = sched_clock();
-
- ob = wp->ob;
+ wp = writepoint_find(c, write_point.v);
/* does ob have ptrs we don't need? */
- open_bucket_for_each_ptr(ob, ptr) {
- if (!ptr->sectors_free)
- nr_ptrs_empty++;
- else if (devs && !test_bit(ptr->ptr.dev, devs->d))
+ writepoint_for_each_ptr(wp, ob, i)
+ if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev))
+ nr_ptrs_have++;
+ else if (devs && !test_bit(ob->ptr.dev, devs->d))
nr_ptrs_dislike++;
- }
- ret = open_bucket_add_buckets(c, wp, devs, ob,
- nr_replicas + nr_ptrs_empty + nr_ptrs_dislike,
+ ret = open_bucket_add_buckets(c, devs, wp, devs_have,
+ nr_replicas + nr_ptrs_have + nr_ptrs_dislike,
reserve, cl);
if (ret && ret != -EROFS)
goto err;
- if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
- goto alloc_done;
-
- /*
- * XXX:
- * Should this allocation be _forced_ to used the specified device (e.g.
- * internal migration), or should we fall back to allocating from all
- * devices?
- */
- ret = open_bucket_add_buckets(c, wp, NULL, ob,
- nr_replicas + nr_ptrs_empty,
- reserve, cl);
- if (ret && ret != -EROFS)
- goto err;
-alloc_done:
- if (ob->nr_ptrs - nr_ptrs_empty -
- ((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0)
- < nr_replicas_required) {
+ if (wp->nr_ptrs <
+ nr_ptrs_have + nr_ptrs_dislike + nr_replicas_required) {
ret = -EROFS;
goto err;
}
+ if ((int) wp->nr_ptrs - nr_ptrs_dislike < nr_replicas)
+ nr_ptrs_dislike = clamp_t(int, wp->nr_ptrs - nr_replicas,
+ 0, nr_ptrs_dislike);
+
+ /* Remove pointers we don't want to use: */
+ writepoint_drop_ptrs(c, wp, devs, nr_ptrs_dislike);
+
/*
- * If ob->sectors_free == 0, one or more of the buckets ob points to is
- * full. We can't drop pointers from an open bucket - garbage collection
- * still needs to find them; instead, we must allocate a new open bucket
- * and copy any pointers to non-full buckets into the new open bucket.
+ * Move pointers to devices we already have to end of open bucket
+ * pointer list - note that removing pointers we don't want to use might
+ * have changed nr_ptrs_have:
*/
- BUG_ON(ob->nr_ptrs - nr_ptrs_empty - nr_replicas > nr_ptrs_dislike);
- nr_ptrs_dislike = ob->nr_ptrs - nr_ptrs_empty - nr_replicas;
-
- if (nr_ptrs_empty || nr_ptrs_dislike) {
- ob = bch2_open_bucket_get(c, open_buckets_reserved, cl);
- if (IS_ERR(ob)) {
- ret = PTR_ERR(ob);
- goto err;
- }
+ if (nr_ptrs_have) {
+ i = nr_ptrs_have = 0;
+ while (i < wp->nr_ptrs - nr_ptrs_have)
+ if (bch2_dev_list_has_dev(*devs_have, wp->ptrs[i]->ptr.dev)) {
+ nr_ptrs_have++;
+ swap(wp->ptrs[i], wp->ptrs[wp->nr_ptrs - nr_ptrs_have]);
+ } else {
+ i++;
+ }
+ }
- /* Remove pointers we don't want to use: */
+ wp->nr_ptrs_can_use =
+ min_t(unsigned, nr_replicas, wp->nr_ptrs - nr_ptrs_have);
- open_bucket_move_ptrs(c, ob, wp->ob, devs, nr_ptrs_dislike);
- bch2_open_bucket_put(c, wp->ob);
- wp->ob = ob;
- }
+ BUG_ON(wp->nr_ptrs_can_use < nr_replicas_required ||
+ wp->nr_ptrs_can_use > wp->nr_ptrs);
+
+ wp->sectors_free = UINT_MAX;
- BUG_ON(ob->nr_ptrs < nr_replicas_required);
+ for (i = 0; i < wp->nr_ptrs_can_use; i++)
+ wp->sectors_free = min(wp->sectors_free,
+ wp->ptrs[i]->sectors_free);
- wp->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
+ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
- BUG_ON(!wp->sectors_free);
- verify_not_stale(c, ob);
+ verify_not_stale(c, wp);
return wp;
err:
@@ -1631,31 +1527,27 @@ err:
* Append pointers to the space we just allocated to @k, and mark @sectors space
* as allocated out of @ob
*/
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
- unsigned nr_replicas, struct open_bucket *ob,
- unsigned sectors)
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
+ struct bkey_i_extent *e, unsigned sectors)
{
- struct bch_extent_ptr tmp;
- struct open_bucket_ptr *ptr;
+ unsigned i;
- /*
- * We're keeping any existing pointer k has, and appending new pointers:
- * __bch2_write() will only write to the pointers we add here:
- */
+ BUG_ON(sectors > wp->sectors_free);
+ wp->sectors_free -= sectors;
- for (ptr = ob->ptrs;
- ptr < ob->ptrs + min_t(u8, ob->nr_ptrs, nr_replicas); ptr++) {
- struct bch_dev *ca = c->devs[ptr->ptr.dev];
+ for (i = 0; i < wp->nr_ptrs_can_use; i++) {
+ struct open_bucket *ob = wp->ptrs[i];
+ struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_extent_ptr tmp = ob->ptr;
- EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ptr->ptr.dev));
+ EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
- tmp = ptr->ptr;
tmp.cached = bkey_extent_is_cached(&e->k);
- tmp.offset += ca->mi.bucket_size - ptr->sectors_free;
+ tmp.offset += ca->mi.bucket_size - ob->sectors_free;
extent_ptr_append(e, tmp);
- BUG_ON(sectors > ptr->sectors_free);
- ptr->sectors_free -= sectors;
+ BUG_ON(sectors > ob->sectors_free);
+ ob->sectors_free -= sectors;
}
}
@@ -1665,76 +1557,20 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
*/
void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
- struct open_bucket *ob = wp->ob, *new_ob = NULL;
- struct open_bucket_ptr *ptr;
- bool empty = false;
-
- open_bucket_for_each_ptr(ob, ptr)
- empty |= !ptr->sectors_free;
+ int i;
- if (empty)
- new_ob = bch2_open_bucket_get(c, 0, NULL);
+ for (i = wp->nr_ptrs - 1; i >= 0; --i) {
+ struct open_bucket *ob = wp->ptrs[i];
- if (!IS_ERR_OR_NULL(new_ob)) {
- /* writepoint's ref becomes our ref: */
- wp->ob = new_ob;
- open_bucket_move_ptrs(c, new_ob, ob, 0, 0);
- } else {
- atomic_inc(&ob->pin);
+ if (!ob->sectors_free) {
+ array_remove_item(wp->ptrs, wp->nr_ptrs, i);
+ bch2_open_bucket_put(c, ob);
+ }
}
mutex_unlock(&wp->lock);
}
-/*
- * Allocates some space in the cache to write to, and k to point to the newly
- * allocated space, and updates k->size and k->offset (to point to the
- * end of the newly allocated space).
- *
- * May allocate fewer sectors than @sectors, k->size indicates how many
- * sectors were actually allocated.
- *
- * Return codes:
- * - -EAGAIN: closure was added to waitlist
- * - -ENOSPC: out of space and no closure provided
- *
- * @c - filesystem.
- * @wp - write point to use for allocating sectors.
- * @k - key to return the allocated space information.
- * @cl - closure to wait for a bucket
- */
-struct open_bucket *bch2_alloc_sectors(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_mask *devs,
- unsigned long write_point,
- struct bkey_i_extent *e,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- unsigned flags,
- struct closure *cl)
-{
- struct write_point *wp;
- struct open_bucket *ob;
-
- wp = bch2_alloc_sectors_start(c, data_type, devs, write_point,
- nr_replicas, nr_replicas_required,
- reserve, flags, cl);
- if (IS_ERR_OR_NULL(wp))
- return ERR_CAST(wp);
-
- ob = wp->ob;
-
- if (e->k.size > wp->sectors_free)
- bch2_key_resize(&e->k, wp->sectors_free);
-
- bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size);
-
- bch2_alloc_sectors_done(c, wp);
-
- return ob;
-}
-
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
@@ -1839,46 +1675,15 @@ set_capacity:
closure_wake_up(&c->freelist_wait);
}
-static bool open_bucket_has_device(struct open_bucket *ob,
- struct bch_dev *ca)
-{
- struct open_bucket_ptr *ptr;
- bool ret = false;
-
- spin_lock(&ob->lock);
- open_bucket_for_each_ptr(ob, ptr)
- ret |= ptr->ptr.dev == ca->dev_idx;
- spin_unlock(&ob->lock);
-
- return ret;
-}
-
static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
{
- struct open_bucket *ob;
- struct closure cl;
+ struct bch_devs_mask not_self;
- closure_init_stack(&cl);
-retry:
- mutex_lock(&wp->lock);
- if (!open_bucket_has_device(wp->ob, ca)) {
- mutex_unlock(&wp->lock);
- return;
- }
-
- ob = bch2_open_bucket_get(c, 0, &cl);
- if (IS_ERR(ob)) {
- mutex_unlock(&wp->lock);
- closure_sync(&cl);
- goto retry;
-
- }
-
- open_bucket_move_ptrs(c, ob, wp->ob, &ca->self, ob->nr_ptrs);
- bch2_open_bucket_put(c, wp->ob);
- wp->ob = ob;
+ bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX);
+ mutex_lock(&wp->lock);
+ writepoint_drop_ptrs(c, wp, &not_self, wp->nr_ptrs);
mutex_unlock(&wp->lock);
}
@@ -1889,9 +1694,13 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++)
- if (atomic_read(&ob->pin))
- ret |= open_bucket_has_device(ob, ca);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list &&
+ ob->ptr.dev == ca->dev_idx)
+ ret = true;
+ spin_unlock(&ob->lock);
+ }
return ret;
}
@@ -1899,13 +1708,10 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
- struct closure cl;
unsigned i;
BUG_ON(ca->alloc_thread);
- closure_init_stack(&cl);
-
/* First, remove device from allocation groups: */
clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
@@ -1920,6 +1726,9 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* Next, close write points that point to this device... */
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
bch2_stop_write_point(c, ca, &c->write_points[i]);
+
+ bch2_stop_write_point(c, ca, &ca->copygc_write_point);
+ bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp);
bch2_stop_write_point(c, ca, &c->btree_write_point);
mutex_lock(&c->btree_reserve_cache_lock);
@@ -1927,7 +1736,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
struct btree_alloc *a =
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
- bch2_open_bucket_put(c, a->ob);
+ bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs);
}
mutex_unlock(&c->btree_reserve_cache_lock);
@@ -1945,16 +1754,8 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* Now wait for any in flight writes: */
- while (1) {
- closure_wait(&c->open_buckets_wait, &cl);
-
- if (!bch2_dev_has_open_write_point(c, ca)) {
- closure_wake_up(&c->open_buckets_wait);
- break;
- }
-
- closure_sync(&cl);
- }
+ closure_wait_event(&c->open_buckets_wait,
+ !bch2_dev_has_open_write_point(c, ca));
}
/* device goes rw: */
@@ -2015,10 +1816,10 @@ void bch2_fs_allocator_init(struct bch_fs *c)
{
struct open_bucket *ob;
struct write_point *wp;
+ unsigned i;
mutex_init(&c->write_points_hash_lock);
- init_rwsem(&c->alloc_gc_lock);
- spin_lock_init(&c->open_buckets_lock);
+ spin_lock_init(&c->freelist_lock);
bch2_prio_timer_init(c, READ);
bch2_prio_timer_init(c, WRITE);
@@ -2034,40 +1835,20 @@ void bch2_fs_allocator_init(struct bch_fs *c)
c->open_buckets_freelist = ob - c->open_buckets;
}
- mutex_init(&c->btree_write_point.lock);
- c->btree_write_point.type = BCH_DATA_BTREE;
- c->btree_write_point.ob = bch2_open_bucket_get(c, 0, NULL);
- BUG_ON(IS_ERR(c->btree_write_point.ob));
+ writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
+
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
+ writepoint_init(&c->tiers[i].wp, BCH_DATA_USER);
for (wp = c->write_points;
wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
- mutex_init(&wp->lock);
- wp->type = BCH_DATA_USER;
- wp->ob = bch2_open_bucket_get(c, 0, NULL);
- wp->last_used = sched_clock();
+ writepoint_init(wp, BCH_DATA_USER);
+ wp->last_used = sched_clock();
wp->write_point = (unsigned long) wp;
hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
-
- BUG_ON(IS_ERR(wp->ob));
}
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
-
- spin_lock_init(&c->foreground_write_pd_lock);
- bch2_pd_controller_init(&c->foreground_write_pd);
- /*
- * We do not want the write rate to have an effect on the computed
- * rate, for two reasons:
- *
- * We do not call bch2_ratelimit_delay() at all if the write rate
- * exceeds 1GB/s. In this case, the PD controller will think we are
- * not "keeping up" and not change the rate.
- */
- c->foreground_write_pd.backpressure = 0;
- init_timer(&c->foreground_write_wakeup);
-
- c->foreground_write_wakeup.data = (unsigned long) c;
- c->foreground_write_wakeup.function = bch2_wake_delayed_writes;
}
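Aside: taken together, the alloc.c changes above turn an open_bucket into a
single device bucket (rather than a bundle of up to 2 * BCH_REPLICAS_MAX
pointers), make the write point own an array of open_bucket pointers, and drop
the all-in-one bch2_alloc_sectors() in favour of callers driving the sequence
themselves. A sketch of the new calling convention -- local names (inum,
devs_have, nr_refs, refs, e, cl) are illustrative, not taken from this patch:
	struct write_point *wp;

	wp = bch2_alloc_sectors_start(c, NULL,	/* no device mask restriction */
				      writepoint_hashed((unsigned long) inum),
				      &devs_have, /* devices the extent already has */
				      nr_replicas, nr_replicas_required,
				      RESERVE_NONE, 0, cl);
	if (IS_ERR(wp))
		return PTR_ERR(wp);

	/* trim the extent to what this write point can hold: */
	if (e->k.size > wp->sectors_free)
		bch2_key_resize(&e->k, wp->sectors_free);

	bch2_alloc_sectors_append_ptrs(c, wp, e, e->k.size);

	/* pin the open buckets across the index update: */
	bch2_open_bucket_get(c, wp, &nr_refs, refs);
	bch2_alloc_sectors_done(c, wp);

	/* ... submit the write and insert the key, then drop the pins: */
	bch2_open_bucket_put_refs(c, &nr_refs, refs);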
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index 1ea747d2..8dffb864 100644
--- a/libbcachefs/alloc.h
+++ b/libbcachefs/alloc.h
@@ -8,7 +8,7 @@ struct bkey;
struct bucket;
struct bch_dev;
struct bch_fs;
-struct dev_group;
+struct bch_devs_list;
struct dev_alloc_list {
unsigned nr;
@@ -24,33 +24,61 @@ void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
-long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve);
+enum bucket_alloc_ret {
+ ALLOC_SUCCESS = 0,
+ OPEN_BUCKETS_EMPTY = -1,
+ FREELIST_EMPTY = -2, /* Allocator thread not keeping up */
+ NO_DEVICES = -3, /* -EROFS */
+};
+
+int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
+ struct closure *);
+
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ if (atomic_dec_and_test(&ob->pin))
+ __bch2_open_bucket_put(c, ob);
+}
+
+static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
+{
+ unsigned i;
+
+ for (i = 0; i < *nr; i++)
+ bch2_open_bucket_put(c, c->open_buckets + refs[i]);
+
+ *nr = 0;
+}
+
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+ struct write_point *wp,
+ u8 *nr, u8 *refs)
+{
+ unsigned i;
-void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+ for (i = 0; i < wp->nr_ptrs_can_use; i++) {
+ struct open_bucket *ob = wp->ptrs[i];
+
+ atomic_inc(&ob->pin);
+ refs[(*nr)++] = ob - c->open_buckets;
+ }
+}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
- enum bch_data_type,
struct bch_devs_mask *,
- unsigned long,
+ struct write_point_specifier,
+ struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *,
- unsigned, struct open_bucket *, unsigned);
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+ struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-struct open_bucket *bch2_alloc_sectors(struct bch_fs *,
- enum bch_data_type,
- struct bch_devs_mask *,
- unsigned long,
- struct bkey_i_extent *,
- unsigned, unsigned,
- enum alloc_reserve,
- unsigned,
- struct closure *);
-
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
struct task_struct *p;
@@ -61,10 +89,20 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
rcu_read_unlock();
}
-#define open_bucket_for_each_ptr(_ob, _ptr) \
- for ((_ptr) = (_ob)->ptrs; \
- (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \
- (_ptr)++)
+#define writepoint_for_each_ptr(_wp, _ob, _i) \
+ for ((_i) = 0; \
+ (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \
+ (_i)++)
+
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+ return (struct write_point_specifier) { .v = v | 1 };
+}
+
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+ return (struct write_point_specifier) { .v = (unsigned long) wp };
+}
void bch2_recalc_capacity(struct bch_fs *);
@@ -74,6 +112,13 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
+static inline void writepoint_init(struct write_point *wp,
+ enum bch_data_type type)
+{
+ mutex_init(&wp->lock);
+ wp->type = type;
+}
+
void bch2_fs_allocator_init(struct bch_fs *);
extern const struct bkey_ops bch2_bkey_alloc_ops;
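Aside: writepoint_for_each_ptr() replaces open_bucket_for_each_ptr(), iterating
open buckets through the write point instead of through a single open_bucket. A
purely illustrative helper (not part of the patch; assumes wp->lock is held):
	static void wp_print_buckets(struct write_point *wp)
	{
		struct open_bucket *ob;
		unsigned i;

		writepoint_for_each_ptr(wp, ob, i)
			pr_info("dev %u offset %llu gen %u sectors_free %u\n",
				ob->ptr.dev,
				(u64) ob->ptr.offset,
				ob->ptr.gen,
				ob->sectors_free);
	}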
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index c48d0aaa..90123ff7 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -47,19 +47,14 @@ enum alloc_reserve {
#define OPEN_BUCKETS_COUNT 256
#define WRITE_POINT_COUNT 32
-struct open_bucket_ptr {
- struct bch_extent_ptr ptr;
- unsigned sectors_free;
-};
-
struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
- u8 new_ob;
- u8 nr_ptrs;
-
- struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2];
+ bool valid;
+ bool on_partial_list;
+ unsigned sectors_free;
+ struct bch_extent_ptr ptr;
};
struct write_point {
@@ -69,13 +64,23 @@ struct write_point {
unsigned long write_point;
enum bch_data_type type;
+ u8 nr_ptrs;
+ /*
+ * number of pointers at the start of @ptrs the current allocation can
+ * use; the rest are on devices the extent already has pointers to:
+ */
+ u8 nr_ptrs_can_use;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
- struct open_bucket *ob;
+ struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2];
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
+struct write_point_specifier {
+ unsigned long v;
+};
+
struct alloc_heap_entry {
size_t bucket;
unsigned long key;
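Aside: per the alloc.c changes, wp->sectors_free is derived from the first
nr_ptrs_can_use open buckets; the remaining wp->ptrs entries stay open but are
skipped because the extent being written already has pointers on those devices.
A small inline sketch of that relationship (not part of the patch):
	static inline unsigned wp_usable_sectors(const struct write_point *wp)
	{
		unsigned i, sectors = UINT_MAX;

		for (i = 0; i < wp->nr_ptrs_can_use; i++)
			sectors = min(sectors, wp->ptrs[i]->sectors_free);

		return sectors != UINT_MAX ? sectors : 0;
	}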
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 58d4723e..b679dd16 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -251,9 +251,6 @@ do { \
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
- BCH_DEBUG_PARAM(version_stress_test, \
- "Assigns random version numbers to newly written " \
- "extents, to test overlapping extent cases") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
@@ -310,8 +307,9 @@ struct crypto_blkcipher;
struct crypto_ahash;
enum gc_phase {
- GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
+ GC_PHASE_SB = BTREE_ID_NR + 1,
GC_PHASE_PENDING_DELETE,
+ GC_PHASE_ALLOC,
GC_PHASE_DONE
};
@@ -321,30 +319,6 @@ struct gc_pos {
unsigned level;
};
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u8 state;
- u8 tier;
- u8 replacement;
- u8 discard;
- u8 data_allowed;
- u8 valid;
-};
-
-struct bch_replicas_cpu_entry {
- u8 data_type;
- u8 devs[BCH_SB_MEMBERS_MAX / 8];
-};
-
-struct bch_replicas_cpu {
- struct rcu_head rcu;
- unsigned nr;
- unsigned entry_size;
- struct bch_replicas_cpu_entry entries[];
-};
-
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
@@ -372,7 +346,7 @@ struct bch_dev {
struct bch_devs_mask self;
- /* biosets used in cloned bios for replicas and moving_gc */
+ /* biosets used in cloned bios for writing multiple replicas */
struct bio_set replica_set;
struct task_struct *alloc_thread;
@@ -392,7 +366,7 @@ struct bch_dev {
unsigned nr_invalidated;
bool alloc_thread_started;
- struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT];
+ u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
size_t fifo_last_bucket;
@@ -422,18 +396,20 @@ struct bch_dev {
bool allocator_invalidating_data;
alloc_heap alloc_heap;
- bucket_heap copygc_heap;
- /* Moving GC: */
- struct task_struct *moving_gc_read;
-
- struct bch_pd_controller moving_gc_pd;
+ /* Copying GC: */
+ struct task_struct *copygc_thread;
+ copygc_heap copygc_heap;
+ struct bch_pd_controller copygc_pd;
+ struct write_point copygc_write_point;
struct journal_device journal;
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
+ atomic_t latency[2];
+
struct io_count __percpu *io_done;
};
@@ -473,6 +449,7 @@ struct bch_tier {
struct bch_pd_controller pd;
struct bch_devs_mask devs;
+ struct write_point wp;
};
enum bch_fs_state {
@@ -557,10 +534,7 @@ struct bch_fs {
* when allocating btree reserves fail halfway through) - instead, we
* can stick them here:
*/
- struct btree_alloc {
- struct open_bucket *ob;
- BKEY_PADDED(k);
- } btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
unsigned btree_reserve_cache_nr;
struct mutex btree_reserve_cache_lock;
@@ -573,15 +547,9 @@ struct bch_fs {
struct workqueue_struct *copygc_wq;
/* ALLOCATION */
- struct rw_semaphore alloc_gc_lock;
- struct bch_pd_controller foreground_write_pd;
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
- spinlock_t foreground_write_pd_lock;
- struct bch_write_op *write_wait_head;
- struct bch_write_op *write_wait_tail;
- struct timer_list foreground_write_wakeup;
/*
* These contain all r/w devices - i.e. devices we can currently
@@ -622,8 +590,8 @@ struct bch_fs {
struct io_clock io_clock[2];
- /* SECTOR ALLOCATOR */
- spinlock_t open_buckets_lock;
+ /* ALLOCATOR */
+ spinlock_t freelist_lock;
u8 open_buckets_freelist;
u8 open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
@@ -635,15 +603,6 @@ struct bch_fs {
struct hlist_head write_points_hash[WRITE_POINT_COUNT];
struct mutex write_points_hash_lock;
- /*
- * This write point is used for migrating data off a device
- * and can point to any other device.
- * We can't use the normal write points because those will
- * gang up n replicas, and for migration we want only one new
- * replica.
- */
- struct write_point migration_write_point;
-
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
@@ -688,6 +647,11 @@ struct bch_fs {
atomic64_t key_version;
+ /* VFS IO PATH - fs-io.c */
+ struct bio_set writepage_bioset;
+ struct bio_set dio_write_bioset;
+ struct bio_set dio_read_bioset;
+
struct bio_list btree_write_error_list;
struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;
@@ -728,19 +692,14 @@ struct bch_fs {
/* The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races;
+ atomic_long_t extent_migrate_done;
+ atomic_long_t extent_migrate_raced;
unsigned btree_gc_periodic:1;
- unsigned foreground_write_ratelimit_enabled:1;
unsigned copy_gc_enabled:1;
unsigned tiering_enabled:1;
unsigned tiering_percent;
- /*
- * foreground writes will be throttled when the number of free
- * buckets is below this percentage
- */
- unsigned foreground_target_percent;
-
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 16a1edd1..2dc9a7e0 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -344,11 +344,13 @@ struct bch_csum {
enum bch_csum_type {
BCH_CSUM_NONE = 0,
- BCH_CSUM_CRC32C = 1,
- BCH_CSUM_CRC64 = 2,
+ BCH_CSUM_CRC32C_NONZERO = 1,
+ BCH_CSUM_CRC64_NONZERO = 2,
BCH_CSUM_CHACHA20_POLY1305_80 = 3,
BCH_CSUM_CHACHA20_POLY1305_128 = 4,
- BCH_CSUM_NR = 5,
+ BCH_CSUM_CRC32C = 5,
+ BCH_CSUM_CRC64 = 6,
+ BCH_CSUM_NR = 7,
};
static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
@@ -550,7 +552,7 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION);
/* Maximum possible size of an entire extent value: */
/* There's a hack in the keylist code that needs to be fixed.. */
#define BKEY_EXTENT_VAL_U64s_MAX \
- (BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX)
+ (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
/* * Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@@ -734,11 +736,13 @@ BKEY_VAL_TYPE(alloc, BCH_ALLOC);
/*
* Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS
* BCH_MEMBER_DATA_ALLOWED
+ * Version 9: incompatible extent nonce change
*/
#define BCH_SB_VERSION_MIN 7
#define BCH_SB_VERSION_EXTENT_MAX 8
-#define BCH_SB_VERSION_MAX 8
+#define BCH_SB_VERSION_EXTENT_NONCE_V1 9
+#define BCH_SB_VERSION_MAX 9
#define BCH_SB_SECTOR 8
#define BCH_SB_LABEL_SIZE 32
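
The bch_csum_type hunk above renames the existing CRC types rather than reusing their numbers: values 1 and 2 keep their old on-disk meaning under *_NONZERO names (seeded with all ones and inverted at the end, per the checksum.c changes later in this patch), and the new zero-seeded CRC32C/CRC64 are appended as 5 and 6. A small annotated restatement; the compatibility reasoning in the comments is inferred from the renames, not stated in the patch:

    /*
     * Sketch of the renumbering: on-disk values 1 and 2 keep their old
     * semantics under new *_NONZERO names, and the zero-seeded variants are
     * appended rather than renumbered, so extents written by older versions
     * still decode the same way.
     */
    enum bch_csum_type_sketch {
            SKETCH_CSUM_NONE                  = 0,
            SKETCH_CSUM_CRC32C_NONZERO        = 1, /* was BCH_CSUM_CRC32C */
            SKETCH_CSUM_CRC64_NONZERO         = 2, /* was BCH_CSUM_CRC64 */
            SKETCH_CSUM_CHACHA20_POLY1305_80  = 3,
            SKETCH_CSUM_CHACHA20_POLY1305_128 = 4,
            SKETCH_CSUM_CRC32C                = 5, /* new: seed 0, no final xor */
            SKETCH_CSUM_CRC64                 = 6, /* new: seed 0, no final xor */
            SKETCH_CSUM_NR                    = 7,
    };

The superblock version bump to 9 in the same hunk gates the incompatible extent nonce change noted in the version comment.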
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index d33bc4e1..73089a90 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -4,6 +4,14 @@
#include "bset.h"
#include "util.h"
+#undef EBUG_ON
+
+#ifdef DEBUG_BKEYS
+#define EBUG_ON(cond) BUG_ON(cond)
+#else
+#define EBUG_ON(cond)
+#endif
+
const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index a1337bf8..c195cd91 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -146,6 +146,17 @@
* first key in that range of bytes again.
*/
+extern bool bch2_expensive_debug_checks;
+
+static inline bool btree_keys_expensive_checks(const struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ return bch2_expensive_debug_checks || *b->expensive_debug_checks;
+#else
+ return false;
+#endif
+}
+
struct btree_node_iter;
struct btree_node_iter_set;
@@ -188,7 +199,7 @@ bkey_unpack_key_format_checked(const struct btree *b,
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(&dst, src);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ if (btree_keys_expensive_checks(b)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
/*
@@ -260,17 +271,6 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
#define for_each_bset(_b, _t) \
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-extern bool bch2_expensive_debug_checks;
-
-static inline bool btree_keys_expensive_checks(struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- return bch2_expensive_debug_checks || *b->expensive_debug_checks;
-#else
- return false;
-#endif
-}
-
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index b0901965..1198fe39 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -24,6 +24,7 @@
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
+#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <trace/events/bcachefs.h>
@@ -111,19 +112,35 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
/*
* For runtime mark and sweep:
*/
-static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k, unsigned flags)
+static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k, unsigned flags)
{
+ struct gc_pos pos = { 0 };
+ struct bch_fs_usage *stats;
+ u8 ret = 0;
+
+ preempt_disable();
+ stats = this_cpu_ptr(c->usage_percpu);
switch (type) {
case BKEY_TYPE_BTREE:
- bch2_gc_mark_key(c, k, c->opts.btree_node_size, true, flags);
- return 0;
+ bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
+ 0, flags|
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
+ break;
case BKEY_TYPE_EXTENTS:
- bch2_gc_mark_key(c, k, k.k->size, false, flags);
- return bch2_btree_key_recalc_oldest_gen(c, k);
+ bch2_mark_key(c, k, k.k->size, false, pos, stats,
+ 0, flags|
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
+ ret = bch2_btree_key_recalc_oldest_gen(c, k);
+ break;
default:
BUG();
}
+ preempt_enable();
+
+ return ret;
}
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
@@ -182,7 +199,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
max_t(u64, k.k->version.lo,
atomic64_read(&c->key_version)));
- bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
+ bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
fsck_err:
return ret;
}
@@ -200,7 +217,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
btree_node_is_extents(b),
&unpacked) {
bch2_bkey_debugcheck(c, b, k);
- stale = max(stale, bch2_btree_mark_key(c, type, k, 0));
+ stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
}
return stale;
@@ -267,123 +284,79 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
- bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
+ bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
mutex_unlock(&c->btree_root_lock);
return 0;
}
-static void bch2_mark_allocator_buckets(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct open_bucket *ob;
- const struct open_bucket_ptr *ptr;
- size_t i, j, iter;
- unsigned ci;
-
- down_write(&c->alloc_gc_lock);
-
- for_each_member_device(ca, c, ci) {
- spin_lock(&ca->freelist_lock);
-
- fifo_for_each_entry(i, &ca->free_inc, iter)
- bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
-
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- bch2_mark_alloc_bucket(ca, &ca->buckets[i], true);
-
- for (ptr = ca->open_buckets_partial;
- ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr;
- ptr++)
- bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
-
- spin_unlock(&ca->freelist_lock);
- }
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- open_bucket_for_each_ptr(ob, ptr) {
- ca = c->devs[ptr->ptr.dev];
- bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true);
- }
- spin_unlock(&ob->lock);
- }
-
- up_write(&c->alloc_gc_lock);
-}
-
-static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
- enum bucket_data_type type)
+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
+ u64 start, u64 end,
+ enum bucket_data_type type,
+ unsigned flags)
{
u64 b = sector_to_bucket(ca, start);
do {
- bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true);
+ bch2_mark_metadata_bucket(c, ca, ca->buckets + b, type,
+ gc_phase(GC_PHASE_SB), flags);
b++;
} while (b < sector_to_bucket(ca, end));
}
-static void bch2_dev_mark_superblocks(struct bch_dev *ca)
+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+ unsigned flags)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
unsigned i;
+ u64 b;
+
+ lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR)
- mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
- BUCKET_SB);
+ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
+ BUCKET_SB, flags);
- mark_metadata_sectors(ca,
+ mark_metadata_sectors(c, ca,
layout->sb_offset[i],
layout->sb_offset[i] +
(1 << layout->sb_max_size_bits),
- BUCKET_SB);
+ BUCKET_SB, flags);
}
-}
-
-/*
- * Mark non btree metadata - prios, journal
- */
-void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
-{
- unsigned i;
- u64 b;
-
- lockdep_assert_held(&c->sb_lock);
-
- bch2_dev_mark_superblocks(ca);
spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
- bch2_mark_metadata_bucket(ca, ca->buckets + b,
- BUCKET_JOURNAL, true);
+ bch2_mark_metadata_bucket(c, ca, ca->buckets + b,
+ BUCKET_JOURNAL,
+ gc_phase(GC_PHASE_SB), flags);
}
spin_unlock(&c->journal.lock);
}
-static void bch2_mark_metadata(struct bch_fs *c)
+static void bch2_mark_superblocks(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
mutex_lock(&c->sb_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA));
+ gc_pos_set(c, gc_phase(GC_PHASE_SB));
for_each_online_member(ca, c, i)
- bch2_mark_dev_metadata(c, ca);
+ bch2_mark_dev_superblock(c, ca,
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
mutex_unlock(&c->sb_lock);
}
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
+ struct gc_pos pos = { 0 };
struct bch_fs_usage stats = { 0 };
struct btree_update *as;
struct pending_btree_node_free *d;
@@ -393,10 +366,11 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
- __bch2_mark_key(c, bkey_i_to_s_c(&d->key),
- c->opts.btree_node_size, true,
- &stats, 0,
- BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+ bch2_mark_key(c, bkey_i_to_s_c(&d->key),
+ c->opts.btree_node_size, true, pos,
+ &stats, 0,
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
@@ -405,6 +379,51 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
mutex_unlock(&c->btree_interior_update_lock);
}
+static void bch2_mark_allocator_buckets(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ struct open_bucket *ob;
+ size_t i, j, iter;
+ unsigned ci;
+
+ spin_lock(&c->freelist_lock);
+ gc_pos_set(c, gc_pos_alloc(c, NULL));
+
+ for_each_member_device(ca, c, ci) {
+ fifo_for_each_entry(i, &ca->free_inc, iter)
+ bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
+ gc_pos_alloc(c, NULL),
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
+
+
+
+ for (j = 0; j < RESERVE_NR; j++)
+ fifo_for_each_entry(i, &ca->free[j], iter)
+ bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
+ gc_pos_alloc(c, NULL),
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
+ }
+
+ spin_unlock(&c->freelist_lock);
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid) {
+ gc_pos_set(c, gc_pos_alloc(c, ob));
+ ca = c->devs[ob->ptr.dev];
+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
+ gc_pos_alloc(c, ob),
+ BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
+ BCH_BUCKET_MARK_GC_LOCK_HELD);
+ }
+ spin_unlock(&ob->lock);
+ }
+}
+
void bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -495,9 +514,6 @@ void bch2_gc(struct bch_fs *c)
bch2_gc_start(c);
- /* Walk allocator's references: */
- bch2_mark_allocator_buckets(c);
-
/* Walk btree: */
while (c->gc_pos.phase < (int) BTREE_ID_NR) {
int ret = c->btree_roots[c->gc_pos.phase].b
@@ -513,8 +529,9 @@ void bch2_gc(struct bch_fs *c)
gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
}
- bch2_mark_metadata(c);
+ bch2_mark_superblocks(c);
bch2_mark_pending_btree_node_frees(c);
+ bch2_mark_allocator_buckets(c);
for_each_member_device(ca, c, i)
atomic_long_set(&ca->saturated_count, 0);
@@ -570,7 +587,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
struct bkey_format new_format;
memset(new_nodes, 0, sizeof(new_nodes));
- bch2_keylist_init(&keylist, NULL, 0);
+ bch2_keylist_init(&keylist, NULL);
/* Count keys that are not deleted */
for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
@@ -1023,8 +1040,6 @@ again:
if (ret)
return ret;
- bch2_mark_metadata(c);
-
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
@@ -1043,6 +1058,8 @@ again:
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
+ bch2_mark_superblocks(c);
+
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 27dcc06c..4d1ab9db 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -13,7 +13,7 @@ int bch2_initial_gc(struct bch_fs *, struct list_head *);
u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
-void bch2_mark_dev_metadata(struct bch_fs *, struct bch_dev *);
+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
/*
* For concurrent mark and sweep (with other index updates), we define a total
@@ -88,6 +88,14 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
};
}
+static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
+{
+ return (struct gc_pos) {
+ .phase = GC_PHASE_ALLOC,
+ .pos = POS(ob ? ob - c->open_buckets : 0, 0),
+ };
+}
+
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index d50e9e8e..38c373c6 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -146,9 +146,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
BUG_ON(iter->data->k > iter->data->end);
if (iter->data->k == iter->data->end)
- memmove(&iter->data[0],
- &iter->data[1],
- sizeof(iter->data[0]) * --iter->used);
+ array_remove_item(iter->data, iter->used, 0);
else
sort_iter_sift(iter, cmp);
}
@@ -1307,6 +1305,8 @@ static void btree_node_read_endio(struct bio *bio)
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
+ bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
+
INIT_WORK(&rb->work, btree_node_read_work);
schedule_work(&rb->work);
}
@@ -1471,6 +1471,8 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
+ bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+
if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
bch2_meta_write_fault("btree"))
set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index f3290f98..61165a63 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -10,6 +10,7 @@ struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
+ unsigned submit_time_us;
u64 start_time;
struct extent_pick_ptr pick;
struct work_struct work;
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 0c174e4e..c2711892 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -91,7 +91,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
int lock_type = btree_node_locked_type(iter, level);
- EBUG_ON(iter->flags & BTREE_ITER_UPTODATE);
+ EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE);
if (lock_type != BTREE_NODE_UNLOCKED)
six_unlock_type(&iter->nodes[level]->lock, lock_type);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 8b4df034..f1e06a37 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -55,6 +55,16 @@ struct btree_write {
struct closure_waitlist wait;
};
+struct btree_ob_ref {
+ u8 nr;
+ u8 refs[BCH_REPLICAS_MAX];
+};
+
+struct btree_alloc {
+ struct btree_ob_ref ob;
+ BKEY_PADDED(k);
+};
+
struct btree {
/* Hottest entries first */
struct rhash_head hash;
@@ -118,7 +128,7 @@ struct btree {
*/
struct btree_update *will_make_reachable;
- struct open_bucket *ob;
+ struct btree_ob_ref ob;
/* lru list */
struct list_head list;
@@ -317,18 +327,6 @@ struct btree_root {
struct btree_iter;
struct btree_node_iter;
-enum extent_insert_hook_ret {
- BTREE_HOOK_DO_INSERT,
- BTREE_HOOK_NO_INSERT,
- BTREE_HOOK_RESTART_TRANS,
-};
-
-struct extent_insert_hook {
- enum extent_insert_hook_ret
- (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
- struct bkey_s_c, const struct bkey_i *);
-};
-
enum btree_insert_ret {
BTREE_INSERT_OK,
/* extent spanned multiple leaf nodes: have to traverse to next node: */
@@ -342,6 +340,12 @@ enum btree_insert_ret {
BTREE_INSERT_NEED_GC_LOCK,
};
+struct extent_insert_hook {
+ enum btree_insert_ret
+ (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
+ struct bkey_s_c, const struct bkey_i *);
+};
+
enum btree_gc_coalesce_fail_reason {
BTREE_GC_COALESCE_FAIL_RESERVE_GET,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
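
The new struct btree_ob_ref above records the open buckets a btree node was allocated from as up to BCH_REPLICAS_MAX one-byte indices into c->open_buckets plus a count, instead of the old single struct open_bucket pointer; buckets are pinned on get (see the atomic_inc() loop in the alloc.h hunk at the start of this section) and released through bch2_open_bucket_put_refs(). A minimal, non-atomic sketch of the index-array pattern; the names and the BCH_REPLICAS_MAX value here are illustrative, and the real put path presumably also returns fully released buckets to the allocator's freelist:

    #include <stdio.h>

    #define OPEN_BUCKETS_COUNT      256
    #define BCH_REPLICAS_MAX        4       /* illustrative value */

    struct open_bucket { int pin; };

    struct btree_ob_ref {
            unsigned char nr;
            unsigned char refs[BCH_REPLICAS_MAX];   /* indices into the global array */
    };

    static struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];

    /* pin a bucket and remember it by index (simplified bch2_open_bucket_get()) */
    static void ob_ref_get(struct btree_ob_ref *r, unsigned idx)
    {
            open_buckets[idx].pin++;
            r->refs[r->nr++] = idx;
    }

    /* drop every pin and reset the count (simplified bch2_open_bucket_put_refs()) */
    static void ob_ref_put_all(struct btree_ob_ref *r)
    {
            unsigned i;

            for (i = 0; i < r->nr; i++)
                    open_buckets[r->refs[i]].pin--;
            r->nr = 0;
    }

    int main(void)
    {
            struct btree_ob_ref r = { 0 };

            ob_ref_get(&r, 3);
            ob_ref_get(&r, 7);
            ob_ref_put_all(&r);
            printf("pins: %d %d, remaining refs: %d\n",
                   open_buckets[3].pin, open_buckets[7].pin, (int) r.nr);
            return 0;
    }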
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 2efb01c1..1fe8fff8 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -211,7 +211,7 @@ found:
-c->opts.btree_node_size, true, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id),
- &tmp, 0);
+ &tmp, 0, 0);
/*
* Don't apply tmp - pending deletes aren't tracked in
* bch_alloc_stats:
@@ -229,7 +229,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_need_write(b));
BUG_ON(b == btree_node_root(c, b));
- BUG_ON(b->ob);
+ BUG_ON(b->ob.nr);
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable);
@@ -254,17 +254,17 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
{
- struct open_bucket *ob = b->ob;
+ struct btree_ob_ref ob = b->ob;
btree_update_drop_new_node(c, b);
- b->ob = NULL;
+ b->ob.nr = 0;
clear_btree_node_dirty(b);
__btree_node_free(c, b, NULL);
- bch2_open_bucket_put(c, ob);
+ bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
}
void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
@@ -287,7 +287,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-c->opts.btree_node_size, true,
gc_phase(GC_PHASE_PENDING_DELETE),
- &stats, 0);
+ &stats, 0, 0);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
@@ -296,8 +296,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
{
- bch2_open_bucket_put(c, b->ob);
- b->ob = NULL;
+ bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@@ -305,9 +304,12 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
struct closure *cl,
unsigned flags)
{
- BKEY_PADDED(k) tmp;
- struct open_bucket *ob;
+ struct write_point *wp;
struct btree *b;
+ BKEY_PADDED(k) tmp;
+ struct bkey_i_extent *e;
+ struct btree_ob_ref ob;
+ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
@@ -335,31 +337,41 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- /* alloc_sectors is weird, I suppose */
- bkey_extent_init(&tmp.k);
- tmp.k.k.size = c->opts.btree_node_size,
-
- ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0,
- bkey_i_to_extent(&tmp.k),
- res->nr_replicas,
- c->opts.metadata_replicas_required,
- alloc_reserve, 0, cl);
- if (IS_ERR(ob))
- return ERR_CAST(ob);
-
- if (tmp.k.k.size < c->opts.btree_node_size) {
- bch2_open_bucket_put(c, ob);
+ wp = bch2_alloc_sectors_start(c, NULL,
+ writepoint_ptr(&c->btree_write_point),
+ &devs_have,
+ res->nr_replicas,
+ c->opts.metadata_replicas_required,
+ alloc_reserve, 0, cl);
+ if (IS_ERR(wp))
+ return ERR_CAST(wp);
+
+ if (wp->sectors_free < c->opts.btree_node_size) {
+ struct open_bucket *ob;
+ unsigned i;
+
+ writepoint_for_each_ptr(wp, ob, i)
+ if (ob->sectors_free < c->opts.btree_node_size)
+ ob->sectors_free = 0;
+
+ bch2_alloc_sectors_done(c, wp);
goto retry;
}
+
+ e = bkey_extent_init(&tmp.k);
+ bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
+
+ ob.nr = 0;
+ bch2_open_bucket_get(c, wp, &ob.nr, ob.refs);
+ bch2_alloc_sectors_done(c, wp);
mem_alloc:
b = bch2_btree_node_mem_alloc(c);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
- BUG_ON(b->ob);
+ BUG_ON(b->ob.nr);
bkey_copy(&b->key, &tmp.k);
- b->key.k.size = 0;
b->ob = ob;
return b;
@@ -466,11 +478,10 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
a->ob = b->ob;
- b->ob = NULL;
+ b->ob.nr = 0;
bkey_copy(&a->k, &b->key);
} else {
- bch2_open_bucket_put(c, b->ob);
- b->ob = NULL;
+ bch2_btree_open_bucket_put(c, b);
}
__btree_node_free(c, b, NULL);
@@ -857,10 +868,7 @@ static void __btree_interior_update_drop_new_node(struct btree *b)
BUG();
found:
- as->nr_new_nodes--;
- memmove(&as->new_nodes[i],
- &as->new_nodes[i + 1],
- sizeof(struct btree *) * (as->nr_new_nodes - i));
+ array_remove_item(as->new_nodes, as->nr_new_nodes, i);
b->will_make_reachable = NULL;
}
@@ -1000,8 +1008,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
as->reserve = reserve;
INIT_LIST_HEAD(&as->write_blocked_list);
- bch2_keylist_init(&as->parent_keys, as->inline_keys,
- ARRAY_SIZE(as->inline_keys));
+ bch2_keylist_init(&as->parent_keys, as->inline_keys);
mutex_lock(&c->btree_interior_update_lock);
list_add(&as->list, &c->btree_interior_update_list);
@@ -1037,7 +1044,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
bch2_mark_key(c, bkey_i_to_s_c(&b->key),
c->opts.btree_node_size, true,
gc_pos_btree_root(b->btree_id),
- &stats, 0);
+ &stats, 0, 0);
if (old)
bch2_btree_node_free_index(as, NULL,
@@ -1121,7 +1128,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
if (bkey_extent_is_data(&insert->k))
bch2_mark_key(c, bkey_i_to_s_c(insert),
c->opts.btree_node_size, true,
- gc_pos_btree_node(b), &stats, 0);
+ gc_pos_btree_node(b), &stats, 0, 0);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
!btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
@@ -1479,6 +1486,13 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
struct closure cl;
int ret = 0;
+ /*
+ * We already have a disk reservation and open buckets pinned; this
+ * allocation must not block:
+ */
+ if (iter->btree_id == BTREE_ID_EXTENTS)
+ btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
+
closure_init_stack(&cl);
/* Hack, because gc and splitting nodes doesn't mix yet: */
@@ -1519,6 +1533,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_set_locks_want(iter, 1);
out:
up_read(&c->gc_lock);
+ closure_sync(&cl);
return ret;
}
@@ -1904,7 +1919,7 @@ retry:
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
gc_pos_btree_root(b->btree_id),
- &stats, 0);
+ &stats, 0, 0);
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
&stats);
@@ -1928,6 +1943,7 @@ out:
}
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
+ closure_sync(&cl);
return ret;
err:
if (as)
@@ -1965,13 +1981,13 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
+ closure_sync(&cl);
+
if (!IS_ERR(as))
break;
if (PTR_ERR(as) == -ENOSPC)
return PTR_ERR(as);
-
- closure_sync(&cl);
}
b = __btree_root_alloc(as, 0);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 6c490dd3..e62e0d2e 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -355,6 +355,11 @@ retry:
multi_lock_write(c, trans);
+ if (race_fault()) {
+ ret = -EINTR;
+ goto unlock;
+ }
+
u64s = 0;
trans_for_each_entry(trans, i) {
/* Multiple inserts might go to same leaf: */
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 6fdbb464..b73002de 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -101,9 +101,41 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
stats.online_reserved);
}
+static void bch2_dev_stats_verify(struct bch_dev *ca)
+{
+ struct bch_dev_usage stats =
+ __bch2_dev_usage_read(ca);
+ u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
+
+ BUG_ON(stats.buckets[S_META] > n);
+ BUG_ON(stats.buckets[S_DIRTY] > n);
+ BUG_ON(stats.buckets_cached > n);
+ BUG_ON(stats.buckets_alloc > n);
+ BUG_ON(stats.buckets_unavailable > n);
+}
+
+static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
+{
+ if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
+ u64 used = __bch2_fs_sectors_used(c);
+ u64 cached = 0;
+ u64 avail = atomic64_read(&c->sectors_available);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
+
+ if (used + avail + cached > c->capacity)
+ panic("used %llu avail %llu cached %llu capacity %llu\n",
+ used, avail, cached, c->capacity);
+ }
+}
+
#else
static void bch2_fs_stats_verify(struct bch_fs *c) {}
+static void bch2_dev_stats_verify(struct bch_dev *ca) {}
+static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
#endif
@@ -171,11 +203,9 @@ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
return bch2_usage_read_raw(ca->usage_percpu);
}
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
- return bch2_usage_read_cached(ca->fs,
- ca->usage_cached,
- ca->usage_percpu);
+ return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
}
struct bch_fs_usage
@@ -208,6 +238,11 @@ static inline int is_cached_bucket(struct bucket_mark m)
!m.dirty_sectors && !!m.cached_sectors;
}
+static inline int is_unavailable_bucket(struct bucket_mark m)
+{
+ return !is_available_bucket(m);
+}
+
static inline enum s_alloc bucket_type(struct bucket_mark m)
{
return is_meta_bucket(m) ? S_META : S_DIRTY;
@@ -256,12 +291,15 @@ void bch2_fs_usage_apply(struct bch_fs *c,
memset(stats, 0, sizeof(*stats));
}
-static void bch2_dev_usage_update(struct bch_dev *ca,
- struct bucket_mark old, struct bucket_mark new)
+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g, struct bucket_mark old,
+ struct bucket_mark new)
{
- struct bch_fs *c = ca->fs;
struct bch_dev_usage *dev_usage;
+ BUG_ON((g - ca->buckets) < ca->mi.first_bucket ||
+ (g - ca->buckets) >= ca->mi.nbuckets);
+
bch2_fs_inconsistent_on(old.data_type && new.data_type &&
old.data_type != new.data_type, c,
"different types of metadata in same bucket: %u, %u",
@@ -270,38 +308,44 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
preempt_disable();
dev_usage = this_cpu_ptr(ca->usage_percpu);
- dev_usage->sectors_cached +=
- (int) new.cached_sectors - (int) old.cached_sectors;
-
- dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
- dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;
-
+ dev_usage->buckets[S_META] +=
+ is_meta_bucket(new) - is_meta_bucket(old);
+ dev_usage->buckets[S_DIRTY] +=
+ is_dirty_bucket(new) - is_dirty_bucket(old);
+ dev_usage->buckets_cached +=
+ is_cached_bucket(new) - is_cached_bucket(old);
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
+ dev_usage->buckets_unavailable +=
+ is_unavailable_bucket(new) - is_unavailable_bucket(old);
- dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old);
- dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old);
- dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
+ dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
+ dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;
+ dev_usage->sectors_cached +=
+ (int) new.cached_sectors - (int) old.cached_sectors;
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
bch2_wake_allocator(ca);
+
+ bch2_dev_stats_verify(ca);
}
-#define bucket_data_cmpxchg(ca, g, new, expr) \
+#define bucket_data_cmpxchg(c, ca, g, new, expr) \
({ \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
- bch2_dev_usage_update(ca, _old, new); \
+ bch2_dev_usage_update(c, ca, g, _old, new); \
_old; \
})
-bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
- struct bucket_mark *old)
+bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g, struct bucket_mark *old)
{
struct bucket_mark new;
- *old = bucket_data_cmpxchg(ca, g, new, ({
+ lg_local_lock(&c->usage_lock);
+ *old = bucket_data_cmpxchg(c, ca, g, new, ({
if (!is_available_bucket(new))
return false;
@@ -312,6 +356,7 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
new.dirty_sectors = 0;
new.gen++;
}));
+ lg_local_unlock(&c->usage_lock);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets),
@@ -319,11 +364,13 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
return true;
}
-bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
+bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g)
{
struct bucket_mark new, old;
- old = bucket_data_cmpxchg(ca, g, new, ({
+ lg_local_lock(&c->usage_lock);
+ old = bucket_data_cmpxchg(c, ca, g, new, ({
if (new.touched_this_mount ||
!is_available_bucket(new))
return false;
@@ -331,37 +378,32 @@ bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g)
new.owned_by_allocator = 1;
new.touched_this_mount = 1;
}));
+ lg_local_unlock(&c->usage_lock);
return true;
}
-void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g, bool owned_by_allocator,
+ struct gc_pos pos, unsigned flags)
{
struct bucket_mark old, new;
- old = bucket_data_cmpxchg(ca, g, new, ({
- new.touched_this_mount = 1;
- new.owned_by_allocator = 0;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- }));
-
- BUG_ON(bucket_became_unavailable(ca->fs, old, new));
-}
-
-void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
- bool owned_by_allocator)
-{
- struct bucket_mark old, new;
+ lg_local_lock(&c->usage_lock);
+ if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+ gc_will_visit(c, pos)) {
+ lg_local_unlock(&c->usage_lock);
+ return;
+ }
- old = bucket_data_cmpxchg(ca, g, new, ({
+ old = bucket_data_cmpxchg(c, ca, g, new, ({
new.touched_this_mount = 1;
new.owned_by_allocator = owned_by_allocator;
}));
+ lg_local_unlock(&c->usage_lock);
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
- ca->fs->gc_pos.phase == GC_PHASE_DONE);
+ c->gc_pos.phase == GC_PHASE_DONE);
}
#define saturated_add(ca, dst, src, max) \
@@ -377,41 +419,49 @@ do { \
} \
} while (0)
-void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
- enum bucket_data_type type,
- bool may_make_unavailable)
+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *g, enum bucket_data_type type,
+ struct gc_pos pos, unsigned flags)
{
struct bucket_mark old, new;
BUG_ON(!type);
- old = bucket_data_cmpxchg(ca, g, new, ({
+ lg_local_lock(&c->usage_lock);
+ if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+ gc_will_visit(c, pos)) {
+ lg_local_unlock(&c->usage_lock);
+ return;
+ }
+
+ old = bucket_data_cmpxchg(c, ca, g, new, ({
saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size,
GC_MAX_SECTORS_USED);
new.data_type = type;
new.touched_this_mount = 1;
}));
+ lg_local_unlock(&c->usage_lock);
if (old.data_type != type &&
(old.data_type ||
old.cached_sectors ||
old.dirty_sectors))
- bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
+ bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
g - ca->buckets, old.data_type, new.data_type);
- BUG_ON(!may_make_unavailable &&
- bucket_became_unavailable(ca->fs, old, new));
+ BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
+ bucket_became_unavailable(c, old, new));
}
/* Reverting this until the copygc + compression issue is fixed: */
-static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
+static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
{
if (!sectors)
return 0;
- return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc),
- crc_uncompressed_size(NULL, crc)));
+ return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
+ crc.uncompressed_size));
}
/*
@@ -420,12 +470,12 @@ static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
* that with the gc pos seqlock held.
*/
static void bch2_mark_pointer(struct bch_fs *c,
- struct bkey_s_c_extent e,
- const union bch_extent_crc *crc,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum s_alloc type,
- struct bch_fs_usage *stats,
- u64 journal_seq, unsigned flags)
+ struct bkey_s_c_extent e,
+ const struct bch_extent_ptr *ptr,
+ struct bch_extent_crc_unpacked crc,
+ s64 sectors, enum s_alloc type,
+ struct bch_fs_usage *stats,
+ u64 journal_seq, unsigned flags)
{
struct bucket_mark old, new;
unsigned saturated;
@@ -435,7 +485,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
? BUCKET_BTREE : BUCKET_DATA;
u64 v;
- if (crc_compression_type(crc)) {
+ if (crc.compression_type) {
unsigned old_sectors, new_sectors;
if (sectors > 0) {
@@ -512,13 +562,13 @@ static void bch2_mark_pointer(struct bch_fs *c,
old.counter,
new.counter)) != old.counter);
- bch2_dev_usage_update(ca, old, new);
+ bch2_dev_usage_update(c, ca, g, old, new);
if (old.data_type != data_type &&
(old.data_type ||
old.cached_sectors ||
old.dirty_sectors))
- bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
+ bch_err(c, "bucket %zu has multiple types of data (%u, %u)",
g - ca->buckets, old.data_type, new.data_type);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
@@ -535,71 +585,12 @@ static void bch2_mark_pointer(struct bch_fs *c,
}
}
-static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
- s64 sectors, bool metadata,
- struct bch_fs_usage *stats,
- u64 journal_seq, unsigned flags)
-{
- const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
- enum s_alloc type = metadata ? S_META : S_DIRTY;
- unsigned replicas = 0;
-
- BUG_ON(metadata && bkey_extent_is_cached(e.k));
- BUG_ON(!sectors);
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- bch2_mark_pointer(c, e, crc, ptr, sectors, type,
- stats, journal_seq, flags);
- replicas += !ptr->cached;
- }
-
- BUG_ON(replicas >= BCH_REPLICAS_MAX);
-
- if (replicas)
- stats->s[replicas - 1].data[type] += sectors;
-}
-
-void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, bool metadata,
- struct bch_fs_usage *stats,
- u64 journal_seq, unsigned flags)
-{
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
- stats, journal_seq, flags);
- break;
- case BCH_RESERVATION: {
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
- if (r.v->nr_replicas)
- stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
- break;
- }
- }
-}
-
-void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, bool metadata, unsigned flags)
-{
- struct bch_fs_usage stats = { 0 };
-
- __bch2_mark_key(c, k, sectors, metadata, &stats, 0,
- flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
-
- preempt_disable();
- bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
- preempt_enable();
-}
-
void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, bool metadata, struct gc_pos gc_pos,
- struct bch_fs_usage *stats, u64 journal_seq)
+ s64 sectors, bool metadata,
+ struct gc_pos pos,
+ struct bch_fs_usage *stats,
+ u64 journal_seq, unsigned flags)
{
- unsigned flags = gc_will_visit(c, gc_pos)
- ? BCH_BUCKET_MARK_GC_WILL_VISIT : 0;
/*
* synchronization w.r.t. GC:
*
@@ -614,69 +605,104 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
* To know whether we should mark a given reference (GC either isn't
* running, or has already marked references at this position) we
* construct a total order for everything GC walks. Then, we can simply
- * compare the position of the reference we're marking - @gc_pos - with
+ * compare the position of the reference we're marking - @pos - with
* GC's current position. If GC is going to mark this reference, GC's
- * current position will be less than @gc_pos; if GC's current position
- * is greater than @gc_pos GC has either already walked this position,
- * or isn't running.
+ * current position will be less than @pos; if GC's current position is
+ * greater than @pos GC has either already walked this position, or
+ * isn't running.
*
* To avoid racing with GC's position changing, we have to deal with
* - GC's position being set to GC_POS_MIN when GC starts:
* usage_lock guards against this
- * - GC's position overtaking @gc_pos: we guard against this with
+ * - GC's position overtaking @pos: we guard against this with
* whatever lock protects the data structure the reference lives in
* (e.g. the btree node lock, or the relevant allocator lock).
*/
+
lg_local_lock(&c->usage_lock);
- __bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags);
- bch2_fs_stats_verify(c);
+ if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+ gc_will_visit(c, pos))
+ flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+ enum s_alloc type = metadata ? S_META : S_DIRTY;
+ unsigned replicas = 0;
+
+ BUG_ON(metadata && bkey_extent_is_cached(e.k));
+ BUG_ON(!sectors);
+
+ extent_for_each_ptr_crc(e, ptr, crc) {
+ bch2_mark_pointer(c, e, ptr, crc, sectors, type,
+ stats, journal_seq, flags);
+ replicas += !ptr->cached;
+ }
+
+ BUG_ON(replicas >= BCH_REPLICAS_MAX);
+
+ if (replicas)
+ stats->s[replicas - 1].data[type] += sectors;
+ break;
+ }
+ case BCH_RESERVATION: {
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+ if (r.v->nr_replicas)
+ stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
+ break;
+ }
+ }
lg_local_unlock(&c->usage_lock);
}
-static u64 __recalc_sectors_available(struct bch_fs *c)
-{
- return c->capacity - bch2_fs_sectors_used(c);
-}
+/* Disk reservations: */
-/* Used by gc when it's starting: */
-void bch2_recalc_sectors_available(struct bch_fs *c)
+static u64 __recalc_sectors_available(struct bch_fs *c)
{
+ u64 avail;
int cpu;
- lg_global_lock(&c->usage_lock);
-
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
- atomic64_set(&c->sectors_available,
- __recalc_sectors_available(c));
+ avail = c->capacity - bch2_fs_sectors_used(c);
+ avail <<= RESERVE_FACTOR;
+ avail /= (1 << RESERVE_FACTOR) + 1;
+ return avail;
+}
+
+/* Used by gc when it's starting: */
+void bch2_recalc_sectors_available(struct bch_fs *c)
+{
+ lg_global_lock(&c->usage_lock);
+ atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
lg_global_unlock(&c->usage_lock);
}
-void bch2_disk_reservation_put(struct bch_fs *c,
- struct disk_reservation *res)
+void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
- if (res->sectors) {
- lg_local_lock(&c->usage_lock);
- this_cpu_sub(c->usage_percpu->online_reserved,
- res->sectors);
+ lg_local_lock(&c->usage_lock);
+ this_cpu_sub(c->usage_percpu->online_reserved,
+ res->sectors);
- bch2_fs_stats_verify(c);
- lg_local_unlock(&c->usage_lock);
+ bch2_fs_stats_verify(c);
+ lg_local_unlock(&c->usage_lock);
- res->sectors = 0;
- }
+ res->sectors = 0;
}
#define SECTORS_CACHE 1024
-int bch2_disk_reservation_add(struct bch_fs *c,
- struct disk_reservation *res,
- unsigned sectors, int flags)
+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ unsigned sectors, int flags)
{
struct bch_fs_usage *stats;
- u64 old, new, v;
+ u64 old, v, get;
s64 sectors_available;
int ret;
@@ -685,27 +711,29 @@ int bch2_disk_reservation_add(struct bch_fs *c,
lg_local_lock(&c->usage_lock);
stats = this_cpu_ptr(c->usage_percpu);
- if (sectors >= stats->available_cache)
+ if (sectors <= stats->available_cache)
goto out;
v = atomic64_read(&c->sectors_available);
do {
old = v;
- if (old < sectors) {
+ get = min((u64) sectors + SECTORS_CACHE, old);
+
+ if (get < sectors) {
lg_local_unlock(&c->usage_lock);
goto recalculate;
}
-
- new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
} while ((v = atomic64_cmpxchg(&c->sectors_available,
- old, new)) != old);
+ old, old - get)) != old);
+
+ stats->available_cache += get;
- stats->available_cache += old - new;
out:
stats->available_cache -= sectors;
stats->online_reserved += sectors;
res->sectors += sectors;
+ bch2_disk_reservations_verify(c, flags);
bch2_fs_stats_verify(c);
lg_local_unlock(&c->usage_lock);
return 0;
@@ -738,6 +766,8 @@ recalculate:
stats->online_reserved += sectors;
res->sectors += sectors;
ret = 0;
+
+ bch2_disk_reservations_verify(c, flags);
} else {
atomic64_set(&c->sectors_available, sectors_available);
ret = -ENOSPC;
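
The long comment in bch2_mark_key() above describes the scheme that keeps concurrent marking and GC consistent: everything GC walks gets a position in a single total order (the btree ids, then GC_PHASE_SB, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC), and a reference only has to be accounted immediately if GC has already walked past it; otherwise the BCH_BUCKET_MARK_GC_WILL_VISIT and early-return paths leave it for GC. A simplified standalone model of that ordering; the field types and the exact comparison are assumptions (upstream compares real bkey positions, not a flat integer):

    #include <stdbool.h>
    #include <stdio.h>

    /* simplified stand-in for struct gc_pos: phase, then position within the phase */
    struct gc_pos_sketch {
            int                     phase;  /* btree id, then SB, PENDING_DELETE, ALLOC */
            unsigned long long      pos;    /* bkey/bucket position, flattened here */
            unsigned                level;
    };

    /* lexicographic comparison: the "total order for everything GC walks" */
    static int gc_pos_cmp(struct gc_pos_sketch l, struct gc_pos_sketch r)
    {
            if (l.phase != r.phase)
                    return l.phase < r.phase ? -1 : 1;
            if (l.pos != r.pos)
                    return l.pos < r.pos ? -1 : 1;
            if (l.level != r.level)
                    return l.level < r.level ? -1 : 1;
            return 0;
    }

    /*
     * If GC's current position is still before the reference being marked,
     * GC will account for it when it gets there, so the caller skips (or
     * defers) the filesystem-level accounting; the BCH_BUCKET_MARK_GC_WILL_VISIT
     * checks above implement this.
     */
    static bool gc_will_visit(struct gc_pos_sketch gc_cur, struct gc_pos_sketch ref)
    {
            return gc_pos_cmp(gc_cur, ref) < 0;
    }

    int main(void)
    {
            struct gc_pos_sketch gc_cur = { .phase = 2, .pos = 1000 }; /* mid btree walk */
            struct gc_pos_sketch alloc  = { .phase = 100 };            /* allocator phase, later */

            printf("gc will visit allocator buckets: %d\n",
                   gc_will_visit(gc_cur, alloc));
            return 0;
    }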
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 141aa4ad..7d2b08cb 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -95,37 +95,39 @@ static inline bool bucket_unused(struct bucket_mark mark)
/* Per device stats: */
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
- return max_t(s64, 0,
- ca->mi.nbuckets - ca->mi.first_bucket -
- stats.buckets[S_META] -
- stats.buckets[S_DIRTY] -
- stats.buckets_alloc);
+ u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+
+ if (WARN_ONCE(stats.buckets_unavailable > total,
+ "buckets_unavailable overflow\n"))
+ return 0;
+
+ return total - stats.buckets_unavailable;
}
/*
* Number of reclaimable buckets - only for use by the allocator thread:
*/
-static inline u64 dev_buckets_available(struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
+ return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
}
static inline u64 __dev_buckets_free(struct bch_dev *ca,
- struct bch_dev_usage stats)
+ struct bch_dev_usage stats)
{
return __dev_buckets_available(ca, stats) +
fifo_used(&ca->free[RESERVE_NONE]) +
fifo_used(&ca->free_inc);
}
-static inline u64 dev_buckets_free(struct bch_dev *ca)
+static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
{
- return __dev_buckets_free(ca, bch2_dev_usage_read(ca));
+ return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
}
/* Cache set stats: */
@@ -133,7 +135,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca)
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *, struct gc_pos);
+ struct disk_reservation *, struct gc_pos);
struct fs_usage_sum {
u64 data;
@@ -155,11 +157,18 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
return sum;
}
+#define RESERVE_FACTOR 6
+
+static u64 reserve_factor(u64 r)
+{
+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
+}
+
static inline u64 __bch2_fs_sectors_used(struct bch_fs *c)
{
struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c));
- return sum.data + sum.reserved + (sum.reserved >> 7);
+ return sum.data + reserve_factor(sum.reserved);
}
static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
@@ -184,30 +193,35 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
void bch2_bucket_seq_cleanup(struct bch_fs *);
-bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *,
- struct bucket_mark *);
-bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *);
-void bch2_mark_free_bucket(struct bch_dev *, struct bucket *);
-void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
-void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
- enum bucket_data_type, bool);
+bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
+ struct bucket *, struct bucket_mark *);
+bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *,
+ struct bucket *);
+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
+ struct bucket *, bool,
+ struct gc_pos, unsigned);
+void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+ struct bucket *, enum bucket_data_type,
+ struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
-#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1)
-#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2)
-
-void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
- struct bch_fs_usage *, u64, unsigned);
+#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
+#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
+#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
-void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c,
- s64, bool, unsigned);
-void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
- struct gc_pos, struct bch_fs_usage *, u64);
+void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
+ struct bch_fs_usage *, u64, unsigned);
void bch2_recalc_sectors_available(struct bch_fs *);
-void bch2_disk_reservation_put(struct bch_fs *,
- struct disk_reservation *);
+void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
+
+static inline void bch2_disk_reservation_put(struct bch_fs *c,
+ struct disk_reservation *res)
+{
+ if (res->sectors)
+ __bch2_disk_reservation_put(c, res);
+}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
#define BCH_DISK_RESERVATION_METADATA (1 << 1)
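
reserve_factor() in the buckets.h hunk above pads reserved sectors by roughly 1/2^RESERVE_FACTOR (1/64, about 1.6%, with RESERVE_FACTOR 6) when computing used space, and __recalc_sectors_available() in buckets.c applies the matching 64/65 scaling to the space handed out to reservations, presumably to keep a safety margin between reservations and true capacity. A quick standalone check of the arithmetic; round_up here is a local stand-in for the kernel helper:

    #include <stdio.h>

    #define RESERVE_FACTOR  6
    /* kernel-style round_up for power-of-two alignment */
    #define round_up(x, y)  ((((x) - 1) | ((y) - 1)) + 1)

    static unsigned long long reserve_factor(unsigned long long r)
    {
            return r + (round_up(r, 1ULL << RESERVE_FACTOR) >> RESERVE_FACTOR);
    }

    int main(void)
    {
            unsigned long long r;

            /* reserved sectors are padded by ~1/64 (~1.6%) */
            for (r = 64; r <= 1000000; r *= 10)
                    printf("%llu -> %llu\n", r, reserve_factor(r));

            /* the available-space side applies the inverse 64/65 scaling */
            unsigned long long avail = 1000000;
            printf("avail %llu -> %llu\n", avail,
                   (avail << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1));
            return 0;
    }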
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 63f1b27f..0bd8d2d8 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -59,6 +59,7 @@ struct bch_dev_usage {
u64 buckets[S_ALLOC_NR];
u64 buckets_cached;
u64 buckets_alloc;
+ u64 buckets_unavailable;
/* _compressed_ sectors: */
u64 sectors[S_ALLOC_NR];
@@ -79,13 +80,6 @@ struct bch_fs_usage {
u64 available_cache;
};
-struct bucket_heap_entry {
- size_t bucket;
- struct bucket_mark mark;
-};
-
-typedef HEAP(struct bucket_heap_entry) bucket_heap;
-
/*
* A reservation for space on disk:
*/
@@ -95,4 +89,11 @@ struct disk_reservation {
unsigned nr_replicas;
};
+struct copygc_heap_entry {
+ u64 offset;
+ struct bucket_mark mark;
+};
+
+typedef HEAP(struct copygc_heap_entry) copygc_heap;
+
#endif /* _BUCKETS_TYPES_H */
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 01bdc867..08755853 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -141,10 +141,14 @@ static u64 bch2_checksum_init(unsigned type)
switch (type) {
case BCH_CSUM_NONE:
return 0;
- case BCH_CSUM_CRC32C:
+ case BCH_CSUM_CRC32C_NONZERO:
return U32_MAX;
- case BCH_CSUM_CRC64:
+ case BCH_CSUM_CRC64_NONZERO:
return U64_MAX;
+ case BCH_CSUM_CRC32C:
+ return 0;
+ case BCH_CSUM_CRC64:
+ return 0;
default:
BUG();
}
@@ -155,10 +159,14 @@ static u64 bch2_checksum_final(unsigned type, u64 crc)
switch (type) {
case BCH_CSUM_NONE:
return 0;
- case BCH_CSUM_CRC32C:
+ case BCH_CSUM_CRC32C_NONZERO:
return crc ^ U32_MAX;
- case BCH_CSUM_CRC64:
+ case BCH_CSUM_CRC64_NONZERO:
return crc ^ U64_MAX;
+ case BCH_CSUM_CRC32C:
+ return crc;
+ case BCH_CSUM_CRC64:
+ return crc;
default:
BUG();
}
@@ -169,8 +177,10 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
switch (type) {
case BCH_CSUM_NONE:
return 0;
+ case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC32C:
return crc32c(crc, data, len);
+ case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC64:
return bch2_crc64_update(crc, data, len);
default:
@@ -243,6 +253,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
{
switch (type) {
case BCH_CSUM_NONE:
+ case BCH_CSUM_CRC32C_NONZERO:
+ case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64: {
u64 crc = bch2_checksum_init(type);
@@ -250,7 +262,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
crc = bch2_checksum_update(type, crc, data, len);
crc = bch2_checksum_final(type, crc);
- return (struct bch_csum) { .lo = crc };
+ return (struct bch_csum) { .lo = cpu_to_le64(crc) };
}
case BCH_CSUM_CHACHA20_POLY1305_80:
@@ -281,28 +293,36 @@ void bch2_encrypt(struct bch_fs *c, unsigned type,
do_encrypt(c->chacha20, nonce, data, len);
}
-struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio,
+ struct bvec_iter *iter)
{
struct bio_vec bv;
- struct bvec_iter iter;
switch (type) {
case BCH_CSUM_NONE:
return (struct bch_csum) { 0 };
+ case BCH_CSUM_CRC32C_NONZERO:
+ case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64: {
u64 crc = bch2_checksum_init(type);
- bio_for_each_contig_segment(bv, bio, iter) {
+#ifdef CONFIG_HIGHMEM
+ __bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
crc = bch2_checksum_update(type,
crc, p, bv.bv_len);
kunmap_atomic(p);
}
-
+#else
+ __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ crc = bch2_checksum_update(type, crc,
+ page_address(bv.bv_page) + bv.bv_offset,
+ bv.bv_len);
+#endif
crc = bch2_checksum_final(type, crc);
- return (struct bch_csum) { .lo = crc };
+ return (struct bch_csum) { .lo = cpu_to_le64(crc) };
}
case BCH_CSUM_CHACHA20_POLY1305_80:
@@ -313,13 +333,19 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
gen_poly_key(c, desc, nonce);
- bio_for_each_contig_segment(bv, bio, iter) {
+#ifdef CONFIG_HIGHMEM
+ __bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
crypto_shash_update(desc, p, bv.bv_len);
kunmap_atomic(p);
}
-
+#else
+ __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ crypto_shash_update(desc,
+ page_address(bv.bv_page) + bv.bv_offset,
+ bv.bv_len);
+#endif
crypto_shash_final(desc, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
@@ -330,6 +356,14 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
}
}
+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ struct bvec_iter iter = bio->bi_iter;
+
+ return __bch2_checksum_bio(c, type, nonce, bio, &iter);
+}
+
void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
@@ -343,12 +377,12 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
sg_init_table(sgl, ARRAY_SIZE(sgl));
- bio_for_each_contig_segment(bv, bio, iter) {
+ bio_for_each_segment(bv, bio, iter) {
if (sg == sgl + ARRAY_SIZE(sgl)) {
sg_mark_end(sg - 1);
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
- le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE);
+ nonce = nonce_add(nonce, bytes);
bytes = 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
@@ -357,13 +391,115 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
bytes += bv.bv_len;
-
}
sg_mark_end(sg - 1);
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
}
+static inline bool bch2_checksum_mergeable(unsigned type)
+{
+
+ switch (type) {
+ case BCH_CSUM_NONE:
+ case BCH_CSUM_CRC32C:
+ case BCH_CSUM_CRC64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct bch_csum bch2_checksum_merge(unsigned type,
+ struct bch_csum a,
+ struct bch_csum b, size_t b_len)
+{
+ BUG_ON(!bch2_checksum_mergeable(type));
+
+ while (b_len) {
+ unsigned b = min(b_len, PAGE_SIZE);
+
+ a.lo = bch2_checksum_update(type, a.lo,
+ page_address(ZERO_PAGE(0)), b);
+ b_len -= b;
+ }
+
+ a.lo ^= b.lo;
+ a.hi ^= b.hi;
+ return a;
+}
+
+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
+ struct bversion version,
+ struct bch_extent_crc_unpacked crc_old,
+ struct bch_extent_crc_unpacked *crc_a,
+ struct bch_extent_crc_unpacked *crc_b,
+ unsigned len_a, unsigned len_b,
+ unsigned new_csum_type)
+{
+ struct bvec_iter iter = bio->bi_iter;
+ struct nonce nonce = extent_nonce(version, crc_old);
+ struct bch_csum merged = { 0 };
+ struct crc_split {
+ struct bch_extent_crc_unpacked *crc;
+ unsigned len;
+ unsigned csum_type;
+ struct bch_csum csum;
+ } splits[3] = {
+ { crc_a, len_a, new_csum_type },
+ { crc_b, len_b, new_csum_type },
+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
+ }, *i;
+ bool mergeable = crc_old.csum_type == new_csum_type &&
+ bch2_checksum_mergeable(new_csum_type);
+ unsigned crc_nonce = crc_old.nonce;
+
+ BUG_ON(len_a + len_b > bio_sectors(bio));
+ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
+ BUG_ON(crc_old.compression_type);
+ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
+ bch2_csum_type_is_encryption(new_csum_type));
+
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+ iter.bi_size = i->len << 9;
+ if (mergeable || i->crc)
+ i->csum = __bch2_checksum_bio(c, i->csum_type,
+ nonce, bio, &iter);
+ else
+ bio_advance_iter(bio, &iter, i->len << 9);
+ nonce = nonce_add(nonce, i->len << 9);
+ }
+
+ if (mergeable)
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
+ merged = bch2_checksum_merge(new_csum_type, merged,
+ i->csum, i->len << 9);
+ else
+ merged = bch2_checksum_bio(c, crc_old.csum_type,
+ extent_nonce(version, crc_old), bio);
+
+ if (bch2_crc_cmp(merged, crc_old.csum))
+ return -EIO;
+
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+ if (i->crc)
+ *i->crc = (struct bch_extent_crc_unpacked) {
+ .csum_type = i->csum_type,
+ .compressed_size = i->len,
+ .uncompressed_size = i->len,
+ .offset = 0,
+ .live_size = i->len,
+ .nonce = crc_nonce,
+ .csum = i->csum,
+ };
+
+ if (bch2_csum_type_is_encryption(new_csum_type))
+ crc_nonce += i->len;
+ }
+
+ return 0;
+}
+
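
A note on the rechecksum path added above: bch2_checksum_merge() combines per-range checksums by extending the first one over zero bytes and XORing in the second, which only holds for the plain CRC32C/CRC64 types (hence bch2_checksum_mergeable()); bch2_rechecksum_bio() checksums each requested range separately, then verifies either the merged ranges or a whole-bio re-checksum against crc_old.csum before handing the per-range crcs back, so a corrupted in-memory copy cannot silently acquire a fresh valid checksum. A minimal hypothetical caller, assuming an uncompressed extent whose bio covers all of crc_old.uncompressed_size (c, bio, version and crc_old are taken as given; lengths are in sectors), might look like:

	struct bch_extent_crc_unpacked crc_front, crc_live;

	/* illustrative sketch only, not part of this patch */
	if (bch2_rechecksum_bio(c, bio, version, crc_old,
				&crc_front, &crc_live,
				crc_old.offset,		/* dead sectors in front */
				crc_old.live_size,	/* sectors to keep */
				crc_old.csum_type))
		return -EIO;	/* existing checksum failed to verify */

	/* crc_live now covers only the live range of the extent */
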
#ifdef __KERNEL__
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
{
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index e8f6ef41..1a089417 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -2,6 +2,7 @@
#define _BCACHEFS_CHECKSUM_H
#include "bcachefs.h"
+#include "extents_types.h"
#include "super-io.h"
#include <crypto/chacha20.h>
@@ -36,7 +37,14 @@ void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
void *data, size_t);
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
+ struct nonce, struct bio *);
+
+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
+ struct bch_extent_crc_unpacked,
+ struct bch_extent_crc_unpacked *,
+ struct bch_extent_crc_unpacked *,
+ unsigned, unsigned, unsigned);
+
void bch2_encrypt_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
@@ -49,15 +57,16 @@ int bch2_enable_encryption(struct bch_fs *, bool);
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type)
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+ bool data)
{
switch (type) {
case BCH_CSUM_OPT_NONE:
return BCH_CSUM_NONE;
case BCH_CSUM_OPT_CRC32C:
- return BCH_CSUM_CRC32C;
+ return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
case BCH_CSUM_OPT_CRC64:
- return BCH_CSUM_CRC64;
+ return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
default:
BUG();
}
@@ -70,7 +79,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
- return bch2_csum_opt_to_type(c->opts.data_checksum);
+ return bch2_csum_opt_to_type(c->opts.data_checksum, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@@ -78,7 +87,7 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
if (c->sb.encryption_type)
return BCH_CSUM_CHACHA20_POLY1305_128;
- return bch2_csum_opt_to_type(c->opts.metadata_checksum);
+ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
static inline enum bch_compression_type
@@ -134,6 +143,21 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
return nonce;
}
+static inline struct nonce extent_nonce(struct bversion version,
+ struct bch_extent_crc_unpacked crc)
+{
+ unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
+ struct nonce nonce = (struct nonce) {{
+ [0] = cpu_to_le32(size << 22),
+ [1] = cpu_to_le32(version.lo),
+ [2] = cpu_to_le32(version.lo >> 32),
+ [3] = cpu_to_le32(version.hi|
+ (crc.compression_type << 24))^BCH_NONCE_EXTENT,
+ }};
+
+ return nonce_add(nonce, crc.nonce << 9);
+}
+
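
extent_nonce() above derives the ChaCha20/Poly1305 nonce for an extent from its bkey version, its compression type and (for compressed extents) the uncompressed size, XORed with BCH_NONCE_EXTENT so extent nonces stay disjoint from the other nonce spaces, then advanced by crc.nonce sectors so each checksum entry of an encrypted extent starts at a distinct counter. Roughly, a read path that has bounced the whole encoded extent into @src could decrypt it like this (a hedged sketch, not taken from this patch):

	struct nonce nonce = extent_nonce(version, crc);

	if (bch2_csum_type_is_encryption(crc.csum_type))
		bch2_encrypt_bio(c, crc.csum_type, nonce, src);
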
static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
{
return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 7b45bb78..64079981 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -1,4 +1,5 @@
#include "bcachefs.h"
+#include "checksum.h"
#include "compress.h"
#include "extents.h"
#include "io.h"
@@ -145,11 +146,11 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace)
}
static int __bio_uncompress(struct bch_fs *c, struct bio *src,
- void *dst_data, struct bch_extent_crc128 crc)
+ void *dst_data, struct bch_extent_crc_unpacked crc)
{
struct bbuf src_data = { NULL };
size_t src_len = src->bi_iter.bi_size;
- size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
+ size_t dst_len = crc.uncompressed_size << 9;
int ret;
src_data = bio_map_or_bounce(c, src, READ);
@@ -212,65 +213,58 @@ err:
}
int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
- unsigned live_data_sectors,
- struct bch_extent_crc128 crc)
+ struct bch_extent_crc_unpacked *crc)
{
- struct bbuf dst_data = { NULL };
- size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
- int ret = -ENOMEM;
+ struct bbuf data = { NULL };
+ size_t dst_len = crc->uncompressed_size << 9;
- BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
+ /* bio must own its pages: */
+ BUG_ON(!bio->bi_vcnt);
+ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
- if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
- crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
+ if (crc->uncompressed_size > c->sb.encoded_extent_max ||
+ crc->compressed_size > c->sb.encoded_extent_max) {
+ bch_err(c, "error rewriting existing data: extent too big");
return -EIO;
+ }
- dst_data = __bounce_alloc(c, dst_len, WRITE);
-
- ret = __bio_uncompress(c, bio, dst_data.b, crc);
- if (ret)
- goto err;
-
- while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
-
- bv->bv_page = alloc_page(GFP_NOIO);
- if (!bv->bv_page)
- goto use_mempool;
+ data = __bounce_alloc(c, dst_len, WRITE);
- bv->bv_len = PAGE_SIZE;
- bv->bv_offset = 0;
- bio->bi_vcnt++;
+ if (__bio_uncompress(c, bio, data.b, *crc)) {
+ bch_err(c, "error rewriting existing data: decompression error");
+ bio_unmap_or_unbounce(c, data);
+ return -EIO;
}
- bio->bi_iter.bi_size = live_data_sectors << 9;
-copy_data:
- memcpy_to_bio(bio, bio->bi_iter, dst_data.b + (crc.offset << 9));
-err:
- bio_unmap_or_unbounce(c, dst_data);
- return ret;
-use_mempool:
/*
- * We already allocated from mempool, we can't allocate from it again
- * without freeing the pages we already allocated or else we could
- * deadlock:
+ * might have to free existing pages and retry allocation from mempool -
+ * do this _after_ decompressing:
*/
+ bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);
+
+ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
- bch2_bio_free_pages_pool(c, bio);
- bch2_bio_alloc_pages_pool(c, bio, live_data_sectors << 9);
- goto copy_data;
+ crc->csum_type = 0;
+ crc->compression_type = 0;
+ crc->compressed_size = crc->live_size;
+ crc->uncompressed_size = crc->live_size;
+ crc->offset = 0;
+ crc->csum = (struct bch_csum) { 0, 0 };
+
+ bio_unmap_or_unbounce(c, data);
+ return 0;
}
int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
struct bio *dst, struct bvec_iter dst_iter,
- struct bch_extent_crc128 crc)
+ struct bch_extent_crc_unpacked crc)
{
struct bbuf dst_data = { NULL };
- size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
+ size_t dst_len = crc.uncompressed_size << 9;
int ret = -ENOMEM;
- if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max ||
- crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max)
+ if (crc.uncompressed_size > c->sb.encoded_extent_max ||
+ crc.compressed_size > c->sb.encoded_extent_max)
return -EIO;
dst_data = dst_len == dst_iter.bi_size
@@ -288,21 +282,25 @@ err:
return ret;
}
-static int __bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- unsigned *compression_type)
+static unsigned __bio_compress(struct bch_fs *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ unsigned compression_type)
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
unsigned pad;
int ret = 0;
+ /* If it's only one block, don't bother trying to compress: */
+ if (bio_sectors(src) <= c->opts.block_size)
+ goto err;
+
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
- switch (*compression_type) {
+ switch (compression_type) {
case BCH_COMPRESSION_LZ4_OLD:
- *compression_type = BCH_COMPRESSION_LZ4;
+ compression_type = BCH_COMPRESSION_LZ4;
case BCH_COMPRESSION_LZ4: {
void *workspace;
@@ -403,19 +401,24 @@ zlib_err:
if (dst_data.type != BB_NONE)
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
+
+ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
+ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
+ BUG_ON(*dst_len & (block_bytes(c) - 1));
+ BUG_ON(*src_len & (block_bytes(c) - 1));
out:
bio_unmap_or_unbounce(c, src_data);
bio_unmap_or_unbounce(c, dst_data);
- return ret;
+ return compression_type;
err:
- ret = -1;
+ compression_type = 0;
goto out;
}
-void bch2_bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- unsigned *compression_type)
+unsigned bch2_bio_compress(struct bch_fs *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ unsigned compression_type)
{
unsigned orig_dst = dst->bi_iter.bi_size;
unsigned orig_src = src->bi_iter.bi_size;
@@ -423,29 +426,15 @@ void bch2_bio_compress(struct bch_fs *c,
/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
c->sb.encoded_extent_max << 9);
-
/* Don't generate a bigger output than input: */
- dst->bi_iter.bi_size =
- min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+ compression_type =
+ __bio_compress(c, dst, dst_len, src, src_len, compression_type);
- /* If it's only one block, don't bother trying to compress: */
- if (*compression_type != BCH_COMPRESSION_NONE &&
- bio_sectors(src) > c->opts.block_size &&
- !__bio_compress(c, dst, dst_len, src, src_len, compression_type))
- goto out;
-
- /* If compressing failed (didn't get smaller), just copy: */
- *compression_type = BCH_COMPRESSION_NONE;
- *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- bio_copy_data(dst, src);
-out:
dst->bi_iter.bi_size = orig_dst;
src->bi_iter.bi_size = orig_src;
-
- BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
- BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
- BUG_ON(*dst_len & (block_bytes(c) - 1));
- BUG_ON(*src_len & (block_bytes(c) - 1));
+ return compression_type;
}
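
With this change bch2_bio_compress() reports its result through the return value: the compression type actually used, or 0 (BCH_COMPRESSION_NONE) when the data was left alone because it was a single block or did not get smaller. The old fallback copy is gone (callers now handle the incompressible case themselves) and the size/alignment assertions moved into __bio_compress() so they only apply to the success path. A hypothetical write-side caller, assuming the filesystem-wide compression option, might look like this (dst_len/src_len are only meaningful when a nonzero type comes back):

	size_t dst_len = 0, src_len = 0;
	unsigned compression_type =
		bch2_bio_compress(c, dst, &dst_len, src, &src_len,
				  c->opts.compression);

	if (compression_type == BCH_COMPRESSION_NONE) {
		/* incompressible: write @src uncompressed */
	} else {
		/* @dst holds dst_len bytes covering the first src_len bytes of @src */
	}
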
/* doesn't write superblock: */
diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h
index ad1ba25d..06fff6a5 100644
--- a/libbcachefs/compress.h
+++ b/libbcachefs/compress.h
@@ -1,12 +1,14 @@
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H
+#include "extents_types.h"
+
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
- unsigned, struct bch_extent_crc128);
+ struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
- struct bvec_iter, struct bch_extent_crc128);
-void bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
- struct bio *, size_t *, unsigned *);
+ struct bvec_iter, struct bch_extent_crc_unpacked);
+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
+ struct bio *, size_t *, unsigned);
int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 7d2f5ccb..6e79f491 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -19,6 +19,7 @@
#include "inode.h"
#include "journal.h"
#include "super-io.h"
+#include "util.h"
#include "xattr.h"
#include <trace/events/bcachefs.h>
@@ -155,6 +156,44 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
+unsigned bch2_extent_is_compressed(struct bkey_s_c k)
+{
+ struct bkey_s_c_extent e;
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+ unsigned ret = 0;
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_c_to_extent(k);
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (!ptr->cached &&
+ crc.compression_type != BCH_COMPRESSION_NONE &&
+ crc.compressed_size < crc.live_size)
+ ret = max_t(unsigned, ret, crc.compressed_size);
+ }
+
+ return ret;
+}
+
+bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
+ struct bch_extent_ptr m, u64 offset)
+{
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+
+ extent_for_each_ptr_crc(e, ptr, crc)
+ if (ptr->dev == m.dev &&
+ ptr->gen == m.gen &&
+ (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
+ (s64) m.offset - offset)
+ return ptr;
+
+ return NULL;
+}
+
/* Doesn't cleanup redundant crcs */
void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
{
@@ -186,24 +225,30 @@ found:
bch2_extent_drop_ptr(e, ptr);
}
-/* returns true if equal */
-static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
+ struct bch_extent_crc_unpacked n)
{
- return extent_crc_type(l) == extent_crc_type(r) &&
- !memcmp(l, r, extent_entry_bytes(to_entry(l)));
+ return !u.compression_type &&
+ u.csum_type &&
+ u.uncompressed_size > u.live_size &&
+ bch2_csum_type_is_encryption(u.csum_type) ==
+ bch2_csum_type_is_encryption(n.csum_type);
}
-/* Increment pointers after @crc by crc's offset until the next crc entry: */
-void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
+ struct bch_extent_crc_unpacked n)
{
- union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
- extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
- if (!extent_entry_is_ptr(entry))
- return;
+ if (!n.csum_type)
+ return false;
- entry->ptr.offset += crc_offset(crc);
- }
+ extent_for_each_crc(e, crc, i)
+ if (can_narrow_crc(crc, n))
+ return true;
+
+ return false;
}
/*
@@ -214,96 +259,50 @@ void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_cr
* not compressed, we can modify them to point to only the data that is
* currently live (so that readers won't have to bounce) while we've got the
* checksum we need:
- *
- * XXX: to guard against data being corrupted while in memory, instead of
- * recomputing the checksum here, it would be better in the read path to instead
- * of computing the checksum of the entire extent:
- *
- * | extent |
- *
- * compute the checksums of the live and dead data separately
- * | dead data || live data || dead data |
- *
- * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
- * use crc_live here (that we verified was correct earlier)
- *
- * note: doesn't work with encryption
*/
-void bch2_extent_narrow_crcs(struct bkey_s_extent e)
+bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
+ struct bch_extent_crc_unpacked n)
{
- union bch_extent_crc *crc;
- bool have_wide = false, have_narrow = false;
- struct bch_csum csum = { 0 };
- unsigned csum_type = 0;
-
- extent_for_each_crc(e, crc) {
- if (crc_compression_type(crc) ||
- bch2_csum_type_is_encryption(crc_csum_type(crc)))
- continue;
-
- if (crc_uncompressed_size(e.k, crc) != e.k->size) {
- have_wide = true;
- } else {
- have_narrow = true;
- csum = crc_csum(crc);
- csum_type = crc_csum_type(crc);
- }
- }
-
- if (!have_wide || !have_narrow)
- return;
-
- extent_for_each_crc(e, crc) {
- if (crc_compression_type(crc))
- continue;
-
- if (crc_uncompressed_size(e.k, crc) != e.k->size) {
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- BUG();
- case BCH_EXTENT_CRC32:
- if (bch_crc_bytes[csum_type] > 4)
- continue;
-
- bch2_extent_crc_narrow_pointers(e, crc);
- crc->crc32._compressed_size = e.k->size - 1;
- crc->crc32._uncompressed_size = e.k->size - 1;
- crc->crc32.offset = 0;
- crc->crc32.csum_type = csum_type;
- crc->crc32.csum = csum.lo;
+ struct bch_extent_crc_unpacked u;
+ struct bch_extent_ptr *ptr;
+ union bch_extent_entry *i;
+
+ /* Find a checksum entry that covers only live data: */
+ if (!n.csum_type)
+ extent_for_each_crc(extent_i_to_s(e), u, i)
+ if (!u.compression_type &&
+ u.csum_type &&
+ u.live_size == u.uncompressed_size) {
+ n = u;
break;
- case BCH_EXTENT_CRC64:
- if (bch_crc_bytes[csum_type] > 10)
- continue;
+ }
- bch2_extent_crc_narrow_pointers(e, crc);
- crc->crc64._compressed_size = e.k->size - 1;
- crc->crc64._uncompressed_size = e.k->size - 1;
- crc->crc64.offset = 0;
- crc->crc64.csum_type = csum_type;
- crc->crc64.csum_lo = csum.lo;
- crc->crc64.csum_hi = csum.hi;
- break;
- case BCH_EXTENT_CRC128:
- if (bch_crc_bytes[csum_type] > 16)
- continue;
+ if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
+ return false;
- bch2_extent_crc_narrow_pointers(e, crc);
- crc->crc128._compressed_size = e.k->size - 1;
- crc->crc128._uncompressed_size = e.k->size - 1;
- crc->crc128.offset = 0;
- crc->crc128.csum_type = csum_type;
- crc->crc128.csum = csum;
- break;
- }
+ BUG_ON(n.compression_type);
+ BUG_ON(n.offset);
+ BUG_ON(n.live_size != e->k.size);
+
+ bch2_extent_crc_append(e, n);
+restart_narrow_pointers:
+ extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
+ if (can_narrow_crc(u, n)) {
+ ptr->offset += u.offset;
+ extent_ptr_append(e, *ptr);
+ __bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
+ goto restart_narrow_pointers;
}
- }
+
+ bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
+ return true;
}
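
The rewritten bch2_extent_narrow_crcs() takes the checksum it should narrow to as an argument (or, when passed a zeroed crc, searches the existing entries for one that already covers exactly the live range), appends it as a new crc entry, re-points every pointer whose old crc can be narrowed at it by folding the old crc's offset into ptr->offset, and lets bch2_extent_drop_redundant_crcs() discard the now-unused wide entries. A caller that has just verified a narrowed checksum, for instance via bch2_rechecksum_bio(), might use it along these lines (hypothetical sketch; @e is a struct bkey_i_extent *):

	if (bch2_extent_narrow_crcs(e, new_crc)) {
		/*
		 * the key now carries a checksum covering only live data;
		 * update it in the btree so future reads avoid bouncing
		 */
	}
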
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
{
union bch_extent_entry *entry = e.v->start;
union bch_extent_crc *crc, *prev = NULL;
+ struct bch_extent_crc_unpacked u, prev_u;
while (entry != extent_entry_last(e)) {
union bch_extent_entry *next = extent_entry_next(entry);
@@ -313,6 +312,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto next;
crc = entry_to_crc(entry);
+ u = bch2_extent_crc_unpack(e.k, crc);
if (next == extent_entry_last(e)) {
/* crc entry with no pointers after it: */
@@ -324,20 +324,28 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto drop;
}
- if (prev && crc_cmp(crc, prev)) {
+ if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
/* identical to previous crc entry: */
goto drop;
}
if (!prev &&
- !crc_csum_type(crc) &&
- !crc_compression_type(crc)) {
+ !u.csum_type &&
+ !u.compression_type) {
/* null crc entry: */
- bch2_extent_crc_narrow_pointers(e, crc);
+ union bch_extent_entry *e2;
+
+ extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
+ if (!extent_entry_is_ptr(e2))
+ break;
+
+ e2->ptr.offset += u.offset;
+ }
goto drop;
}
prev = crc;
+ prev_u = u;
next:
entry = next;
continue;
@@ -453,7 +461,7 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
{
char *out = buf, *end = buf + size;
const union bch_extent_entry *entry;
- const union bch_extent_crc *crc;
+ struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
struct bch_dev *ca;
bool first = true;
@@ -468,13 +476,14 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
- crc = entry_to_crc(entry);
-
- p("crc: c_size %u size %u offset %u csum %u compress %u",
- crc_compressed_size(e.k, crc),
- crc_uncompressed_size(e.k, crc),
- crc_offset(crc), crc_csum_type(crc),
- crc_compression_type(crc));
+ crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
+
+ p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+ crc.compressed_size,
+ crc.uncompressed_size,
+ crc.offset, crc.nonce,
+ crc.csum_type,
+ crc.compression_type);
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
@@ -499,13 +508,24 @@ out:
return out - buf;
}
+static inline bool dev_latency_better(struct bch_dev *dev1,
+ struct bch_dev *dev2)
+{
+ unsigned l1 = atomic_read(&dev1->latency[READ]);
+ unsigned l2 = atomic_read(&dev2->latency[READ]);
+
+ /* Pick at random, biased in favor of the faster device: */
+
+ return bch2_rand_range(l1 + l2) > l1;
+}
+
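
Read replica selection above no longer prefers the lower tier unconditionally: each candidate device replaces the current pick with a probability weighted by the measured read latencies, so faster replicas take most of the traffic while slower ones still see some. Spelled out (an illustrative restatement, assuming bch2_rand_range(n) returns a uniform value in [0, n)):

	/*
	 * dev1 is preferred with probability roughly l2 / (l1 + l2);
	 * e.g. l1 = 2000, l2 = 6000 (same units as the latency estimate)
	 * keeps dev1 about three times out of four.
	 */
	static bool dev1_preferred(unsigned l1, unsigned l2)
	{
		return bch2_rand_range(l1 + l2) > l1;
	}
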
static void extent_pick_read_device(struct bch_fs *c,
struct bkey_s_c_extent e,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
- const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = c->devs[ptr->dev];
@@ -516,12 +536,18 @@ static void extent_pick_read_device(struct bch_fs *c,
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
- if (avoid && test_bit(ca->dev_idx, avoid->d))
- continue;
+ if (avoid) {
+ if (test_bit(ca->dev_idx, avoid->d))
+ continue;
- if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
- continue;
+ if (pick->ca &&
+ test_bit(pick->ca->dev_idx, avoid->d))
+ goto use;
+ }
+ if (pick->ca && !dev_latency_better(ca, pick->ca))
+ continue;
+use:
if (!percpu_ref_tryget(&ca->io_ref))
continue;
@@ -530,11 +556,9 @@ static void extent_pick_read_device(struct bch_fs *c,
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
+ .crc = crc,
.ca = ca,
};
-
- if (e.k->size)
- pick->crc = crc_to_128(e.k, crc);
}
}
@@ -557,14 +581,17 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
const char *reason;
- extent_for_each_entry(e, entry)
+ extent_for_each_entry(e, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
- extent_for_each_ptr_crc(e, ptr, crc) {
+ if (extent_entry_is_crc(entry))
+ return "has crc field";
+ }
+
+ extent_for_each_ptr(e, ptr) {
reason = extent_ptr_invalid(c, e, ptr,
c->opts.btree_node_size,
true);
@@ -572,9 +599,6 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
return reason;
}
- if (crc)
- return "has crc field";
-
return NULL;
}
@@ -699,28 +723,28 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
__set_bkey_deleted(k.k);
else if (bkey_extent_is_data(k.k)) {
struct bkey_s_extent e = bkey_s_to_extent(k);
- struct bch_extent_ptr *ptr;
- union bch_extent_crc *crc, *prev_crc = NULL;
+ union bch_extent_entry *entry;
+ bool seen_crc = false;
- extent_for_each_ptr_crc(e, ptr, crc) {
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- ptr->offset += e.k->size - len;
+ extent_for_each_entry(e, entry) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ if (!seen_crc)
+ entry->ptr.offset += e.k->size - len;
break;
- case BCH_EXTENT_CRC32:
- if (prev_crc != crc)
- crc->crc32.offset += e.k->size - len;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.offset += e.k->size - len;
break;
- case BCH_EXTENT_CRC64:
- if (prev_crc != crc)
- crc->crc64.offset += e.k->size - len;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.offset += e.k->size - len;
break;
- case BCH_EXTENT_CRC128:
- if (prev_crc != crc)
- crc->crc128.offset += e.k->size - len;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.offset += e.k->size - len;
break;
}
- prev_crc = crc;
+
+ if (extent_entry_is_crc(entry))
+ seen_crc = true;
}
}
@@ -989,7 +1013,7 @@ static void bch2_add_sectors(struct extent_insert_state *s,
return;
bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
- &s->stats, s->trans->journal_res.seq);
+ &s->stats, s->trans->journal_res.seq, 0);
}
static void bch2_subtract_sectors(struct extent_insert_state *s,
@@ -1123,7 +1147,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
bkey_cmp(s->committed, insert->k.p) &&
- bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
+ bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
/* XXX: possibly need to increase our reservation? */
bch2_cut_subtract_back(s, s->committed,
bkey_i_to_s(&split.k));
@@ -1152,46 +1176,24 @@ done:
s->trans->did_work = true;
}
-static enum extent_insert_hook_ret
+static enum btree_insert_ret
__extent_insert_advance_pos(struct extent_insert_state *s,
struct bpos next_pos,
struct bkey_s_c k)
{
struct extent_insert_hook *hook = s->trans->hook;
- enum extent_insert_hook_ret ret;
-#if 0
- /*
- * Currently disabled for encryption - broken with fcollapse. Will have
- * to reenable when versions are exposed for send/receive - versions
- * will have to be monotonic then:
- */
- if (k.k && k.k->size &&
- !bversion_zero(s->insert->k->k.version) &&
- bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
- ret = BTREE_HOOK_NO_INSERT;
- } else
-#endif
+ enum btree_insert_ret ret;
+
if (hook)
ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
else
- ret = BTREE_HOOK_DO_INSERT;
+ ret = BTREE_INSERT_OK;
EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
- switch (ret) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- extent_insert_committed(s);
- bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
-
- bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
- break;
- case BTREE_HOOK_RESTART_TRANS:
- return ret;
- }
+ if (ret == BTREE_INSERT_OK)
+ s->committed = next_pos;
- s->committed = next_pos;
return ret;
}
@@ -1199,39 +1201,28 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
* Update iter->pos, marking how much of @insert we've processed, and call hook
* fn:
*/
-static enum extent_insert_hook_ret
+static enum btree_insert_ret
extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
{
struct btree *b = s->insert->iter->nodes[0];
struct bpos next_pos = bpos_min(s->insert->k->k.p,
k.k ? k.k->p : b->key.k.p);
+ enum btree_insert_ret ret;
+
+ if (race_fault())
+ return BTREE_INSERT_NEED_TRAVERSE;
/* hole? */
if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
- bool have_uncommitted = bkey_cmp(s->committed,
- bkey_start_pos(&s->insert->k->k)) > 0;
-
- switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
- bkey_s_c_null)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- /*
- * we had to split @insert and insert the committed
- * part - need to bail out and recheck journal
- * reservation/btree node before we advance pos past @k:
- */
- if (have_uncommitted)
- return BTREE_HOOK_NO_INSERT;
- break;
- case BTREE_HOOK_RESTART_TRANS:
- return BTREE_HOOK_RESTART_TRANS;
- }
+ ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
+ bkey_s_c_null);
+ if (ret != BTREE_INSERT_OK)
+ return ret;
}
/* avoid redundant calls to hook fn: */
if (!bkey_cmp(s->committed, next_pos))
- return BTREE_HOOK_DO_INSERT;
+ return BTREE_INSERT_OK;
return __extent_insert_advance_pos(s, next_pos, k);
}
@@ -1245,7 +1236,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
unsigned sectors;
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bkey_extent_is_compressed(k))) {
+ (sectors = bch2_extent_is_compressed(k))) {
int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
if (s->trans->flags & BTREE_INSERT_NOFAIL)
@@ -1277,6 +1268,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
struct btree_iter *iter = s->insert->iter;
struct btree *b = iter->nodes[0];
struct btree_node_iter *node_iter = &iter->node_iters[0];
+ enum btree_insert_ret ret;
switch (overlap) {
case BCH_EXTENT_OVERLAP_FRONT:
@@ -1322,9 +1314,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
k.k->p = orig_pos;
extent_save(b, node_iter, _k, k.k);
- if (extent_insert_advance_pos(s, k.s_c) ==
- BTREE_HOOK_RESTART_TRANS)
- return BTREE_INSERT_NEED_TRAVERSE;
+ ret = extent_insert_advance_pos(s, k.s_c);
+ if (ret != BTREE_INSERT_OK)
+ return ret;
extent_insert_committed(s);
/*
@@ -1420,15 +1412,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
if (ret != BTREE_INSERT_OK)
goto stop;
- switch (extent_insert_advance_pos(s, k.s_c)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- continue;
- case BTREE_HOOK_RESTART_TRANS:
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ ret = extent_insert_advance_pos(s, k.s_c);
+ if (ret)
goto stop;
- }
s->do_journal = true;
@@ -1469,10 +1455,9 @@ next:
bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
}
- if (bkey_cmp(s->committed, insert->k.p) < 0 &&
- ret == BTREE_INSERT_OK &&
- extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ if (ret == BTREE_INSERT_OK &&
+ bkey_cmp(s->committed, insert->k.p) < 0)
+ ret = extent_insert_advance_pos(s, bkey_s_c_null);
stop:
extent_insert_committed(s);
@@ -1594,18 +1579,10 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
/*
* Only call advance pos & call hook for nonzero size extents:
- * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
- * overlaps with @k:
*/
- switch (extent_insert_advance_pos(&s, k.s_c)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- continue;
- case BTREE_HOOK_RESTART_TRANS:
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ ret = extent_insert_advance_pos(&s, k.s_c);
+ if (ret != BTREE_INSERT_OK)
goto stop;
- }
if (k.k->size &&
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
@@ -1623,10 +1600,9 @@ squash:
goto stop;
}
- if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
- ret == BTREE_INSERT_OK &&
- extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
- ret = BTREE_INSERT_NEED_TRAVERSE;
+ if (ret == BTREE_INSERT_OK &&
+ bkey_cmp(s.committed, insert->k->k.p) < 0)
+ ret = extent_insert_advance_pos(&s, bkey_s_c_null);
stop:
extent_insert_committed(&s);
/*
@@ -1669,29 +1645,37 @@ static const char *bch2_extent_invalid(const struct bch_fs *c,
case BCH_EXTENT_CACHED: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
- const union bch_extent_crc *crc;
+ struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
unsigned size_ondisk = e.k->size;
const char *reason;
+ unsigned nonce = UINT_MAX;
extent_for_each_entry(e, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return "invalid extent entry type";
if (extent_entry_is_crc(entry)) {
- crc = entry_to_crc(entry);
+ crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
- if (crc_offset(crc) + e.k->size >
- crc_uncompressed_size(e.k, crc))
+ if (crc.offset + e.k->size >
+ crc.uncompressed_size)
return "checksum offset + key size > uncompressed size";
- size_ondisk = crc_compressed_size(e.k, crc);
+ size_ondisk = crc.compressed_size;
- if (!bch2_checksum_type_valid(c, crc_csum_type(crc)))
+ if (!bch2_checksum_type_valid(c, crc.csum_type))
return "invalid checksum type";
- if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
+ if (crc.compression_type >= BCH_COMPRESSION_NR)
return "invalid compression type";
+
+ if (bch2_csum_type_is_encryption(crc.csum_type)) {
+ if (nonce == UINT_MAX)
+ nonce = crc.offset + crc.nonce;
+ else if (nonce != crc.offset + crc.nonce)
+ return "incorrect nonce";
+ }
} else {
ptr = entry_to_ptr(entry);
@@ -1864,102 +1848,75 @@ static unsigned PTR_TIER(struct bch_fs *c,
}
static void bch2_extent_crc_init(union bch_extent_crc *crc,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type)
-{
- if (bch_crc_bytes[csum_type] <= 4 &&
- uncompressed_size <= CRC32_SIZE_MAX &&
- nonce <= CRC32_NONCE_MAX) {
+ struct bch_extent_crc_unpacked new)
+{
+#define common_fields(_crc) \
+ .csum_type = _crc.csum_type, \
+ .compression_type = _crc.compression_type, \
+ ._compressed_size = _crc.compressed_size - 1, \
+ ._uncompressed_size = _crc.uncompressed_size - 1, \
+ .offset = _crc.offset
+
+ if (bch_crc_bytes[new.csum_type] <= 4 &&
+ new.uncompressed_size <= CRC32_SIZE_MAX &&
+ new.nonce <= CRC32_NONCE_MAX) {
crc->crc32 = (struct bch_extent_crc32) {
.type = 1 << BCH_EXTENT_ENTRY_crc32,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum = *((__le32 *) &csum.lo),
+ common_fields(new),
+ .csum = *((__le32 *) &new.csum.lo),
};
return;
}
- if (bch_crc_bytes[csum_type] <= 10 &&
- uncompressed_size <= CRC64_SIZE_MAX &&
- nonce <= CRC64_NONCE_MAX) {
+ if (bch_crc_bytes[new.csum_type] <= 10 &&
+ new.uncompressed_size <= CRC64_SIZE_MAX &&
+ new.nonce <= CRC64_NONCE_MAX) {
crc->crc64 = (struct bch_extent_crc64) {
.type = 1 << BCH_EXTENT_ENTRY_crc64,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .nonce = nonce,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum_lo = csum.lo,
- .csum_hi = *((__le16 *) &csum.hi),
+ common_fields(new),
+ .nonce = new.nonce,
+ .csum_lo = new.csum.lo,
+ .csum_hi = *((__le16 *) &new.csum.hi),
};
return;
}
- if (bch_crc_bytes[csum_type] <= 16 &&
- uncompressed_size <= CRC128_SIZE_MAX &&
- nonce <= CRC128_NONCE_MAX) {
+ if (bch_crc_bytes[new.csum_type] <= 16 &&
+ new.uncompressed_size <= CRC128_SIZE_MAX &&
+ new.nonce <= CRC128_NONCE_MAX) {
crc->crc128 = (struct bch_extent_crc128) {
.type = 1 << BCH_EXTENT_ENTRY_crc128,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .nonce = nonce,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum = csum,
+ common_fields(new),
+ .nonce = new.nonce,
+ .csum = new.csum,
};
return;
}
-
+#undef common_fields
BUG();
}
void bch2_extent_crc_append(struct bkey_i_extent *e,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type)
+ struct bch_extent_crc_unpacked new)
{
- union bch_extent_crc *crc;
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
- BUG_ON(compressed_size > uncompressed_size);
- BUG_ON(uncompressed_size != e->k.size);
- BUG_ON(!compressed_size || !uncompressed_size);
+ BUG_ON(new.compressed_size > new.uncompressed_size);
+ BUG_ON(new.live_size != e->k.size);
+ BUG_ON(!new.compressed_size || !new.uncompressed_size);
/*
* Look up the last crc entry, so we can check if we need to add
* another:
*/
- extent_for_each_crc(extent_i_to_s(e), crc)
+ extent_for_each_crc(extent_i_to_s(e), crc, i)
;
- if (!crc && !csum_type && !compression_type)
- return;
-
- if (crc &&
- crc_compressed_size(&e->k, crc) == compressed_size &&
- crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
- crc_offset(crc) == 0 &&
- crc_nonce(crc) == nonce &&
- crc_csum_type(crc) == csum_type &&
- crc_compression_type(crc) == compression_type &&
- crc_csum(crc).lo == csum.lo &&
- crc_csum(crc).hi == csum.hi)
+ if (!memcmp(&crc, &new, sizeof(crc)))
return;
- bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
- compressed_size,
- uncompressed_size,
- compression_type,
- nonce, csum, csum_type);
+ bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
__extent_entry_push(e);
}
@@ -2011,16 +1968,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
}
void bch2_extent_mark_replicas_cached(struct bch_fs *c,
- struct bkey_s_extent e,
- unsigned nr_cached)
+ struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
+ unsigned tier = 0, nr_cached = 0, nr_good = 0;
bool have_higher_tier;
- unsigned tier = 0;
- if (!nr_cached)
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached &&
+ c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
+ nr_good++;
+
+ if (nr_good <= c->opts.data_replicas)
return;
+ nr_cached = nr_good - c->opts.data_replicas;
+
do {
have_higher_tier = false;
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 634159f2..1ec2db5e 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -3,7 +3,7 @@
#include "bcachefs.h"
#include "bkey.h"
-#include "io_types.h"
+#include "extents_types.h"
struct bch_fs;
struct journal_res;
@@ -38,11 +38,17 @@ bch2_insert_fixup_extent(struct btree_insert *,
struct btree_insert_entry *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch2_extent_mark_replicas_cached(struct bch_fs *,
- struct bkey_s_extent, unsigned);
+void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
+
+const struct bch_extent_ptr *
+bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_is_compressed(struct bkey_s_c);
+
+bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
+ struct bch_extent_ptr, u64);
static inline bool bkey_extent_is_data(const struct bkey *k)
{
@@ -67,6 +73,12 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
}
}
+static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
+{
+ return bkey_extent_is_allocation(k.k) &&
+ !bch2_extent_is_compressed(k);
+}
+
static inline bool bkey_extent_is_cached(const struct bkey *k)
{
return k->type == BCH_EXTENT_CACHED;
@@ -170,6 +182,8 @@ union bch_extent_crc {
(struct bch_extent_ptr *) (_entry)); \
})
+/* checksum entries: */
+
enum bch_extent_crc_type {
BCH_EXTENT_CRC_NONE,
BCH_EXTENT_CRC32,
@@ -208,6 +222,50 @@ __extent_crc_type(const union bch_extent_crc *crc)
: __extent_crc_type((union bch_extent_crc *) _crc); \
})
+static inline struct bch_extent_crc_unpacked
+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
+{
+#define common_fields(_crc) \
+ .csum_type = _crc.csum_type, \
+ .compression_type = _crc.compression_type, \
+ .compressed_size = _crc._compressed_size + 1, \
+ .uncompressed_size = _crc._uncompressed_size + 1, \
+ .offset = _crc.offset, \
+ .live_size = k->size
+
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ return (struct bch_extent_crc_unpacked) {
+ .compressed_size = k->size,
+ .uncompressed_size = k->size,
+ .live_size = k->size,
+ };
+ case BCH_EXTENT_CRC32:
+ return (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc32),
+ .csum.lo = crc->crc32.csum,
+ };
+ case BCH_EXTENT_CRC64:
+ return (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc64),
+ .nonce = crc->crc64.nonce,
+ .csum.lo = crc->crc64.csum_lo,
+ .csum.hi = crc->crc64.csum_hi,
+ };
+ case BCH_EXTENT_CRC128:
+ return (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc128),
+ .nonce = crc->crc128.nonce,
+ .csum = crc->crc128.csum,
+ };
+ default:
+ BUG();
+ }
+#undef common_fields
+}
+
+/* Extent entry iteration: */
+
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
@@ -226,7 +284,7 @@ __extent_crc_type(const union bch_extent_crc *crc)
/* Iterate over crcs only: */
-#define extent_crc_next(_e, _p) \
+#define __extent_crc_next(_e, _p) \
({ \
typeof(&(_e).v->start[0]) _entry = _p; \
\
@@ -237,25 +295,41 @@ __extent_crc_type(const union bch_extent_crc *crc)
entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
})
-#define extent_for_each_crc(_e, _crc) \
- for ((_crc) = extent_crc_next(_e, (_e).v->start); \
+#define __extent_for_each_crc(_e, _crc) \
+ for ((_crc) = __extent_crc_next(_e, (_e).v->start); \
(_crc); \
- (_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
+ (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
+
+#define extent_crc_next(_e, _crc, _iter) \
+({ \
+ extent_for_each_entry_from(_e, _iter, _iter) \
+ if (extent_entry_is_crc(_iter)) { \
+ (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
+ break; \
+ } \
+ \
+ (_iter) < extent_entry_last(_e); \
+})
+
+#define extent_for_each_crc(_e, _crc, _iter) \
+ for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
+ (_iter) = (_e).v->start; \
+ extent_crc_next(_e, _crc, _iter); \
+ (_iter) = extent_entry_next(_iter))
/* Iterate over pointers, with crcs: */
-#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
+#define extent_ptr_crc_next(_e, _ptr, _crc) \
({ \
__label__ out; \
typeof(&(_e).v->start[0]) _entry; \
\
extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
if (extent_entry_is_crc(_entry)) { \
- (_crc) = entry_to_crc(_entry); \
+ (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
} else { \
_ptr = entry_to_ptr(_entry); \
- if (_filter) \
- goto out; \
+ goto out; \
} \
\
_ptr = NULL; \
@@ -263,35 +337,26 @@ out: \
_ptr; \
})
-#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
- for ((_crc) = NULL, \
+#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
+ for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
(_ptr) = &(_e).v->start->ptr; \
- ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
+ ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \
(_ptr)++)
-#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
- extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
-
/* Iterate over pointers only, and from a given position: */
-#define extent_ptr_next_filter(_e, _ptr, _filter) \
+#define extent_ptr_next(_e, _ptr) \
({ \
- typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \
+ struct bch_extent_crc_unpacked _crc; \
\
- extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
+ extent_ptr_crc_next(_e, _ptr, _crc); \
})
-#define extent_ptr_next(_e, _ptr) \
- extent_ptr_next_filter(_e, _ptr, true)
-
-#define extent_for_each_ptr_filter(_e, _ptr, _filter) \
+#define extent_for_each_ptr(_e, _ptr) \
for ((_ptr) = &(_e).v->start->ptr; \
- ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
+ ((_ptr) = extent_ptr_next(_e, _ptr)); \
(_ptr)++)
-#define extent_for_each_ptr(_e, _ptr) \
- extent_for_each_ptr_filter(_e, _ptr, true)
-
#define extent_ptr_prev(_e, _ptr) \
({ \
typeof(&(_e).v->start->ptr) _p; \
@@ -315,8 +380,8 @@ out: \
(_ptr); \
(_ptr) = extent_ptr_prev(_e, _ptr))
-void bch2_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
- unsigned, unsigned, struct bch_csum, unsigned);
+void bch2_extent_crc_append(struct bkey_i_extent *,
+ struct bch_extent_crc_unpacked);
static inline void __extent_entry_push(struct bkey_i_extent *e)
{
@@ -336,226 +401,26 @@ static inline void extent_ptr_append(struct bkey_i_extent *e,
__extent_entry_push(e);
}
-static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k,
- const union bch_extent_crc *crc)
+static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
{
- EBUG_ON(!k->size);
-
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return (struct bch_extent_crc128) {
- ._compressed_size = k->size - 1,
- ._uncompressed_size = k->size - 1,
- };
- case BCH_EXTENT_CRC32:
- return (struct bch_extent_crc128) {
- .type = 1 << BCH_EXTENT_ENTRY_crc128,
- ._compressed_size = crc->crc32._compressed_size,
- ._uncompressed_size = crc->crc32._uncompressed_size,
- .offset = crc->crc32.offset,
- .csum_type = crc->crc32.csum_type,
- .compression_type = crc->crc32.compression_type,
- .csum.lo = crc->crc32.csum,
- };
- case BCH_EXTENT_CRC64:
- return (struct bch_extent_crc128) {
- .type = 1 << BCH_EXTENT_ENTRY_crc128,
- ._compressed_size = crc->crc64._compressed_size,
- ._uncompressed_size = crc->crc64._uncompressed_size,
- .offset = crc->crc64.offset,
- .nonce = crc->crc64.nonce,
- .csum_type = crc->crc64.csum_type,
- .compression_type = crc->crc64.compression_type,
- .csum.lo = crc->crc64.csum_lo,
- .csum.hi = crc->crc64.csum_hi,
- };
- case BCH_EXTENT_CRC128:
- return crc->crc128;
- default:
- BUG();
- }
-}
-
-#define crc_compressed_size(_k, _crc) \
-({ \
- unsigned _size = 0; \
- \
- switch (extent_crc_type(_crc)) { \
- case BCH_EXTENT_CRC_NONE: \
- _size = ((const struct bkey *) (_k))->size; \
- break; \
- case BCH_EXTENT_CRC32: \
- _size = ((struct bch_extent_crc32 *) _crc) \
- ->_compressed_size + 1; \
- break; \
- case BCH_EXTENT_CRC64: \
- _size = ((struct bch_extent_crc64 *) _crc) \
- ->_compressed_size + 1; \
- break; \
- case BCH_EXTENT_CRC128: \
- _size = ((struct bch_extent_crc128 *) _crc) \
- ->_compressed_size + 1; \
- break; \
- } \
- _size; \
-})
-
-#define crc_uncompressed_size(_k, _crc) \
-({ \
- unsigned _size = 0; \
- \
- switch (extent_crc_type(_crc)) { \
- case BCH_EXTENT_CRC_NONE: \
- _size = ((const struct bkey *) (_k))->size; \
- break; \
- case BCH_EXTENT_CRC32: \
- _size = ((struct bch_extent_crc32 *) _crc) \
- ->_uncompressed_size + 1; \
- break; \
- case BCH_EXTENT_CRC64: \
- _size = ((struct bch_extent_crc64 *) _crc) \
- ->_uncompressed_size + 1; \
- break; \
- case BCH_EXTENT_CRC128: \
- _size = ((struct bch_extent_crc128 *) _crc) \
- ->_uncompressed_size + 1; \
- break; \
- } \
- _size; \
-})
-
-static inline unsigned crc_offset(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return 0;
- case BCH_EXTENT_CRC32:
- return crc->crc32.offset;
- case BCH_EXTENT_CRC64:
- return crc->crc64.offset;
- case BCH_EXTENT_CRC128:
- return crc->crc128.offset;
- default:
- BUG();
- }
-}
-
-static inline unsigned crc_nonce(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- case BCH_EXTENT_CRC32:
- return 0;
- case BCH_EXTENT_CRC64:
- return crc->crc64.nonce;
- case BCH_EXTENT_CRC128:
- return crc->crc128.nonce;
- default:
- BUG();
- }
-}
-
-static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return 0;
- case BCH_EXTENT_CRC32:
- return crc->crc32.csum_type;
- case BCH_EXTENT_CRC64:
- return crc->crc64.csum_type;
- case BCH_EXTENT_CRC128:
- return crc->crc128.csum_type;
- default:
- BUG();
- }
-}
-
-static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return 0;
- case BCH_EXTENT_CRC32:
- return crc->crc32.compression_type;
- case BCH_EXTENT_CRC64:
- return crc->crc64.compression_type;
- case BCH_EXTENT_CRC128:
- return crc->crc128.compression_type;
- default:
- BUG();
- }
-}
-
-static inline struct bch_csum crc_csum(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return (struct bch_csum) { 0 };
- case BCH_EXTENT_CRC32:
- return (struct bch_csum) { .lo = crc->crc32.csum };
- case BCH_EXTENT_CRC64:
- return (struct bch_csum) {
- .lo = crc->crc64.csum_lo,
- .hi = crc->crc64.csum_hi,
- };
- case BCH_EXTENT_CRC128:
- return crc->crc128.csum;
- default:
- BUG();
- }
-}
-
-static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k)
-{
- struct bkey_s_c_extent e;
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
- unsigned ret = 0;
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
-
- extent_for_each_ptr_crc(e, ptr, crc)
- if (!ptr->cached &&
- crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
- crc_compressed_size(e.k, crc) < k.k->size)
- ret = max_t(unsigned, ret,
- crc_compressed_size(e.k, crc));
- }
+ extent_for_each_ptr(e, ptr)
+ ret.devs[ret.nr++] = ptr->dev;
return ret;
}
-static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
-{
- const union bch_extent_crc *crc;
-
- extent_for_each_crc(e, crc)
- if (bch2_csum_type_is_encryption(crc_csum_type(crc)))
- return crc_offset(crc) + crc_nonce(crc);
-
- return 0;
-}
-
-void bch2_extent_narrow_crcs(struct bkey_s_extent);
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
+ struct bch_extent_crc_unpacked);
+bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
-struct bch_extent_ptr *
-bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent,
- struct bch_extent_ptr);
-struct bch_extent_ptr *
-bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent,
- struct bkey_s_c_extent);
-
bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);
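
The iteration macros above now hand back a struct bch_extent_crc_unpacked by value instead of a pointer into the packed key, which is what allows the crc_compressed_size()/crc_offset()/crc_csum_type() accessor boilerplate to be dropped. A typical caller now reads roughly like this (illustrative only):

	struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
	const struct bch_extent_ptr *ptr;
	struct bch_extent_crc_unpacked crc;

	extent_for_each_ptr_crc(e, ptr, crc)
		if (!ptr->cached && crc.compression_type)
			pr_debug("dev %u: %u/%u sectors compressed, offset %u",
				 ptr->dev, crc.compressed_size,
				 crc.uncompressed_size, crc.offset);
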
diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h
new file mode 100644
index 00000000..15805cd2
--- /dev/null
+++ b/libbcachefs/extents_types.h
@@ -0,0 +1,27 @@
+#ifndef _BCACHEFS_EXTENTS_TYPES_H
+#define _BCACHEFS_EXTENTS_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_extent_crc_unpacked {
+ u8 csum_type;
+ u8 compression_type;
+
+ u16 compressed_size;
+ u16 uncompressed_size;
+
+ u16 offset;
+ u16 live_size;
+
+ u16 nonce;
+
+ struct bch_csum csum;
+};
+
+struct extent_pick_ptr {
+ struct bch_extent_ptr ptr;
+ struct bch_extent_crc_unpacked crc;
+ struct bch_dev *ca;
+};
+
+#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h
index 04dcfc50..66fa227c 100644
--- a/libbcachefs/eytzinger.h
+++ b/libbcachefs/eytzinger.h
@@ -80,7 +80,7 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
EBUG_ON(i >= size);
if (eytzinger1_left_child(i) < size) {
- i = eytzinger1_left_child(i);
+ i = eytzinger1_left_child(i) + 1;
i <<= __fls(size) - __fls(i);
i -= 1;
@@ -163,38 +163,6 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
-#if 0
-void eytzinger0_test(void)
-{
- unsigned i, j, size;
-
- for (size = 2;
- size < 65536000;
- size++) {
- if (!(size % 4096))
- printk(KERN_INFO "tree size %u\n", size);
-
- assert(eytzinger1_prev(0, size) == eytzinger1_last(size));
- assert(eytzinger1_next(0, size) == eytzinger1_first(size));
-
- assert(eytzinger1_prev(eytzinger1_first(size), size) == 0);
- assert(eytzinger1_next(eytzinger1_last(size), size) == 0);
-
- eytzinger1_for_each(j, size) {
- assert(from_inorder(i, size) == j);
- assert(to_inorder(j, size) == i);
-
- if (j != eytzinger1_last(size)) {
- unsigned next = eytzinger1_next(j, size);
-
- assert(eytzinger1_prev(next, size) == j);
- }
- }
- }
-
-}
-#endif
-
/* Zero based indexing version: */
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
@@ -214,27 +182,29 @@ static inline unsigned eytzinger0_right_child(unsigned i)
return eytzinger0_child(i, 1);
}
-#if 0
static inline unsigned eytzinger0_first(unsigned size)
{
+ return eytzinger1_first(size + 1) - 1;
}
static inline unsigned eytzinger0_last(unsigned size)
{
+ return eytzinger1_last(size + 1) - 1;
}
static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
+ return eytzinger1_next(i + 1, size + 1) - 1;
}
static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
+ return eytzinger1_prev(i + 1, size + 1) - 1;
}
-#endif
static inline unsigned eytzinger0_extra(unsigned size)
{
- return (size + 1 - rounddown_pow_of_two(size)) << 1;
+ return eytzinger1_extra(size + 1);
}
static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
@@ -259,10 +229,41 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
}
+#define eytzinger0_for_each(_i, _size) \
+ for ((_i) = eytzinger0_first((_size)); \
+ (_i) != -1; \
+ (_i) = eytzinger0_next((_i), (_size)))
+
typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
+/* return greatest node <= @search, or -1 if not found */
+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
+ eytzinger_cmp_fn cmp, const void *search)
+{
+ unsigned i, n = 0;
+
+ if (!nr)
+ return -1;
+
+ do {
+ i = n;
+ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ } while (n < nr);
+
+ if (n & 1) {
+ /* @i was greater than @search, return previous node: */
+
+ if (i == eytzinger0_first(nr))
+ return -1;
+
+ return eytzinger0_prev(i, nr);
+ } else {
+ return i;
+ }
+}
+
static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, void *search)
+ eytzinger_cmp_fn cmp, const void *search)
{
size_t i = 0;
int res;
@@ -271,17 +272,6 @@ static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
(res = cmp(search, base + i * size, size)))
i = eytzinger0_child(i, res > 0);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- bool found1 = i < nr, found2 = false;
- size_t j;
-
- for (j = 0; j < nr; j++)
- if (!cmp(base + j * size, search, size))
- found2 = true;
-
- BUG_ON(found1 != found2);
- }
-
return i;
}
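
The eytzinger0_* helpers are now real implementations, defined as thin shims over the 1-based variants (shift the index by one on the way in and out), and the new eytzinger0_find_le() descends the implicit BFS-ordered tree like a binary search, stepping to the right child whenever the probed element is <= the search key and backing up one in-order position when the final element examined was greater. A hedged usage sketch, with a comparison callback matching the eytzinger_cmp_fn typedef:

	static int cmp_u64(const void *_l, const void *_r, size_t size)
	{
		const u64 *l = _l, *r = _r;

		return *l < *r ? -1 : *l > *r ? 1 : 0;
	}

	/* @tree holds @nr values laid out in eytzinger (BFS) order: */
	static ssize_t find_le_example(u64 *tree, size_t nr, u64 want)
	{
		return eytzinger0_find_le(tree, nr, sizeof(*tree),
					  cmp_u64, &want);
	}
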
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 8b41be87..298e3592 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -26,9 +26,67 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
-struct bio_set *bch2_writepage_bioset;
-struct bio_set *bch2_dio_read_bioset;
-struct bio_set *bch2_dio_write_bioset;
+struct i_sectors_hook {
+ struct extent_insert_hook hook;
+ s64 sectors;
+ struct bch_inode_info *inode;
+};
+
+struct bchfs_write_op {
+ struct bch_inode_info *inode;
+ s64 sectors_added;
+ bool is_dio;
+ bool unalloc;
+ u64 new_i_size;
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
+static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
+ struct bch_inode_info *inode,
+ bool is_dio)
+{
+ op->inode = inode;
+ op->sectors_added = 0;
+ op->is_dio = is_dio;
+ op->unalloc = false;
+ op->new_i_size = U64_MAX;
+}
+
+struct bch_writepage_io {
+ struct closure cl;
+
+ /* must be last: */
+ struct bchfs_write_op op;
+};
+
+struct dio_write {
+ struct closure cl;
+ struct kiocb *req;
+ struct bch_fs *c;
+ long written;
+ long error;
+ loff_t offset;
+
+ struct disk_reservation res;
+
+ struct iovec *iovec;
+ struct iovec inline_vecs[UIO_FASTIOV];
+ struct iov_iter iter;
+
+ struct task_struct *task;
+
+ /* must be last: */
+ struct bchfs_write_op iop;
+};
+
+struct dio_read {
+ struct closure cl;
+ struct kiocb *req;
+ long ret;
+ struct bch_read_bio rbio;
+};
/* pagecache_block must be held */
static int write_invalidate_inode_pages_range(struct address_space *mapping,
@@ -101,7 +159,7 @@ static inline void i_size_dirty_get(struct bch_inode_info *inode)
/* i_sectors accounting: */
-static enum extent_insert_hook_ret
+static enum btree_insert_ret
i_sectors_hook_fn(struct extent_insert_hook *hook,
struct bpos committed_pos,
struct bpos next_pos,
@@ -119,7 +177,7 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
h->sectors += sectors * sign;
- return BTREE_HOOK_DO_INSERT;
+ return BTREE_INSERT_OK;
}
static int inode_set_i_sectors_dirty(struct bch_inode_info *inode,
@@ -208,7 +266,7 @@ struct bchfs_extent_trans_hook {
bool need_inode_update;
};
-static enum extent_insert_hook_ret
+static enum btree_insert_ret
bchfs_extent_update_hook(struct extent_insert_hook *hook,
struct bpos committed_pos,
struct bpos next_pos,
@@ -224,6 +282,10 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
bool do_pack = false;
+ if (h->op->unalloc &&
+ !bch2_extent_is_fully_allocated(k))
+ return BTREE_INSERT_ENOSPC;
+
BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
/* XXX: inode->i_size locking */
@@ -232,7 +294,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (!h->need_inode_update) {
h->need_inode_update = true;
- return BTREE_HOOK_RESTART_TRANS;
+ return BTREE_INSERT_NEED_TRAVERSE;
}
h->inode_u.bi_size = offset;
@@ -247,7 +309,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (sectors) {
if (!h->need_inode_update) {
h->need_inode_update = true;
- return BTREE_HOOK_RESTART_TRANS;
+ return BTREE_INSERT_NEED_TRAVERSE;
}
h->inode_u.bi_sectors += sectors;
@@ -267,7 +329,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
if (do_pack)
bch2_inode_pack(&h->inode_p, &h->inode_u);
- return BTREE_HOOK_DO_INSERT;
+ return BTREE_INSERT_OK;
}
static int bchfs_write_index_update(struct bch_write_op *wop)
@@ -352,12 +414,16 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(&extent_iter, k));
}
+
+ BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
+ BUG_ON(!ret != !k->k.size);
err:
if (ret == -EINTR)
continue;
if (ret)
break;
+ BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0);
bch2_keylist_pop_front(keys);
} while (!bch2_keylist_empty(keys));
@@ -748,8 +814,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(bio, k);
- if (!bkey_extent_is_allocation(k.k) ||
- bkey_extent_is_compressed(k))
+ if (!bch2_extent_is_fully_allocated(k))
bch2_mark_pages_unalloc(bio);
if (pick.ca) {
@@ -759,7 +824,8 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
trace_read_split(&rbio->bio);
}
- bch2_read_extent(c, rbio, k, &pick, flags);
+ bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
+ &pick, flags);
} else {
zero_fill_bio(bio);
@@ -963,22 +1029,20 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
alloc_io:
w->io = container_of(bio_alloc_bioset(GFP_NOFS,
BIO_MAX_PAGES,
- bch2_writepage_bioset),
+ &c->writepage_bioset),
struct bch_writepage_io, op.op.wbio.bio);
closure_init(&w->io->cl, NULL);
- w->io->op.inode = inode;
- w->io->op.sectors_added = 0;
- w->io->op.is_dio = false;
+ bch2_fswrite_op_init(&w->io->op, inode, false);
bch2_write_op_init(&w->io->op.op, c,
(struct disk_reservation) {
.nr_replicas = c->opts.data_replicas,
},
c->fastest_devs,
- inode->ei_last_dirtied,
+ writepoint_hashed(inode->ei_last_dirtied),
POS(inum, 0),
&inode->ei_journal_seq,
- BCH_WRITE_THROTTLE);
+ 0);
w->io->op.op.index_update_fn = bchfs_write_index_update;
}
@@ -1409,7 +1473,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req,
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
- bch2_dio_read_bioset);
+ &c->dio_read_bioset);
bio->bi_end_io = bch2_direct_IO_read_endio;
@@ -1541,20 +1605,19 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
return;
}
- dio->iop.inode = inode;
dio->iop.sectors_added = 0;
- dio->iop.is_dio = true;
- dio->iop.new_i_size = U64_MAX;
bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
dio->c->fastest_devs,
- (unsigned long) dio->task,
+ writepoint_hashed((unsigned long) dio->task),
POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
&inode->ei_journal_seq,
- flags|BCH_WRITE_THROTTLE);
+ flags);
dio->iop.op.index_update_fn = bchfs_write_index_update;
- dio->res.sectors -= bio_sectors(bio);
- dio->iop.op.res.sectors = bio_sectors(bio);
+ if (!dio->iop.unalloc) {
+ dio->res.sectors -= bio_sectors(bio);
+ dio->iop.op.res.sectors = bio_sectors(bio);
+ }
task_io_account_write(bio->bi_iter.bi_size);
@@ -1589,6 +1652,31 @@ static void bch2_dio_write_loop_async(struct closure *cl)
}
}
+static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos,
+ u64 size)
+{
+ struct btree_iter iter;
+ struct bpos end = pos;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ end.offset += size;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+ BTREE_ITER_WITH_HOLES, k) {
+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+ break;
+
+ if (!bch2_extent_is_fully_allocated(k)) {
+ ret = -ENOSPC;
+ break;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
static int bch2_direct_IO_write(struct bch_fs *c,
struct kiocb *req, struct file *file,
struct bch_inode_info *inode,
@@ -1610,17 +1698,18 @@ static int bch2_direct_IO_write(struct bch_fs *c,
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
- bch2_dio_write_bioset);
+ &c->dio_write_bioset);
dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
- dio->req = req;
- dio->c = c;
- dio->written = 0;
- dio->error = 0;
- dio->offset = offset;
- dio->iovec = NULL;
- dio->iter = *iter;
- dio->task = current;
closure_init(&dio->cl, NULL);
+ dio->req = req;
+ dio->c = c;
+ dio->written = 0;
+ dio->error = 0;
+ dio->offset = offset;
+ dio->iovec = NULL;
+ dio->iter = *iter;
+ dio->task = current;
+ bch2_fswrite_op_init(&dio->iop, inode, true);
if (offset + iter->count > inode->v.i_size)
sync = true;
@@ -1635,9 +1724,15 @@ static int bch2_direct_IO_write(struct bch_fs *c,
*/
ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
if (unlikely(ret)) {
- closure_debug_destroy(&dio->cl);
- bio_put(bio);
- return ret;
+ if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
+ offset >> 9),
+ iter->count >> 9)) {
+ closure_debug_destroy(&dio->cl);
+ bio_put(bio);
+ return ret;
+ }
+
+ dio->iop.unalloc = true;
}
inode_dio_begin(&inode->v);
@@ -2318,7 +2413,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
if (reservation.v.nr_replicas < replicas ||
- bkey_extent_is_compressed(k)) {
+ bch2_extent_is_compressed(k)) {
ret = bch2_disk_reservation_get(c, &disk_res,
sectors, 0);
if (ret)
@@ -2564,4 +2659,24 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
+void bch2_fs_fsio_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->dio_write_bioset);
+ bioset_exit(&c->dio_read_bioset);
+ bioset_exit(&c->writepage_bioset);
+}
+
+int bch2_fs_fsio_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->writepage_bioset,
+ 4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) ||
+ bioset_init(&c->dio_read_bioset,
+ 4, offsetof(struct dio_read, rbio.bio)) ||
+ bioset_init(&c->dio_write_bioset,
+ 4, offsetof(struct dio_write, iop.op.wbio.bio)))
+ return -ENOMEM;
+
+ return 0;
+}
+
#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h
index 505cea73..30d1ea9d 100644
--- a/libbcachefs/fs-io.h
+++ b/libbcachefs/fs-io.h
@@ -1,7 +1,11 @@
#ifndef _BCACHEFS_FS_IO_H
#define _BCACHEFS_FS_IO_H
+#ifndef NO_BCACHEFS_FS
+
#include "buckets.h"
+#include "io_types.h"
+
#include <linux/uio.h>
int bch2_set_page_dirty(struct page *);
@@ -35,60 +39,11 @@ int bch2_releasepage(struct page *, gfp_t);
int bch2_migrate_page(struct address_space *, struct page *,
struct page *, enum migrate_mode);
-struct i_sectors_hook {
- struct extent_insert_hook hook;
- s64 sectors;
- struct bch_inode_info *inode;
-};
-
-struct bchfs_write_op {
- struct bch_inode_info *inode;
- s64 sectors_added;
- bool is_dio;
- u64 new_i_size;
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-struct bch_writepage_io {
- struct closure cl;
-
- /* must be last: */
- struct bchfs_write_op op;
-};
-
-extern struct bio_set *bch2_writepage_bioset;
-
-struct dio_write {
- struct closure cl;
- struct kiocb *req;
- struct bch_fs *c;
- long written;
- long error;
- loff_t offset;
-
- struct disk_reservation res;
-
- struct iovec *iovec;
- struct iovec inline_vecs[UIO_FASTIOV];
- struct iov_iter iter;
-
- struct task_struct *task;
-
- /* must be last: */
- struct bchfs_write_op iop;
-};
-
-extern struct bio_set *bch2_dio_write_bioset;
-
-struct dio_read {
- struct closure cl;
- struct kiocb *req;
- long ret;
- struct bch_read_bio rbio;
-};
-
-extern struct bio_set *bch2_dio_read_bioset;
+void bch2_fs_fsio_exit(struct bch_fs *);
+int bch2_fs_fsio_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
+#endif
#endif /* _BCACHEFS_FS_IO_H */
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 081ae140..43688cd3 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -654,17 +654,17 @@ static int bch2_fill_extent(struct fiemap_extent_info *info,
if (bkey_extent_is_data(&k->k)) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
+ struct bch_extent_crc_unpacked crc;
int ret;
extent_for_each_ptr_crc(e, ptr, crc) {
int flags2 = 0;
u64 offset = ptr->offset;
- if (crc_compression_type(crc))
+ if (crc.compression_type)
flags2 |= FIEMAP_EXTENT_ENCODED;
else
- offset += crc_offset(crc);
+ offset += crc.offset;
if ((offset & (PAGE_SECTORS - 1)) ||
(e.k->size & (PAGE_SECTORS - 1)))
@@ -1336,12 +1336,6 @@ MODULE_ALIAS_FS("bcachefs");
void bch2_vfs_exit(void)
{
unregister_filesystem(&bcache_fs_type);
- if (bch2_dio_write_bioset)
- bioset_free(bch2_dio_write_bioset);
- if (bch2_dio_read_bioset)
- bioset_free(bch2_dio_read_bioset);
- if (bch2_writepage_bioset)
- bioset_free(bch2_writepage_bioset);
if (bch2_inode_cache)
kmem_cache_destroy(bch2_inode_cache);
}
@@ -1354,20 +1348,6 @@ int __init bch2_vfs_init(void)
if (!bch2_inode_cache)
goto err;
- bch2_writepage_bioset =
- bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio));
- if (!bch2_writepage_bioset)
- goto err;
-
- bch2_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio));
- if (!bch2_dio_read_bioset)
- goto err;
-
- bch2_dio_write_bioset =
- bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio));
- if (!bch2_dio_write_bioset)
- goto err;
-
ret = register_filesystem(&bcache_fs_type);
if (ret)
goto err;
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index e5fc72da..0c41e411 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -29,6 +29,29 @@
/* Allocate, free from mempool: */
+void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw)
+{
+ u64 now = local_clock();
+ unsigned io_latency = (now >> 10) - submit_time_us;
+ atomic_t *latency = &ca->latency[rw];
+ unsigned old, new, v = atomic_read(latency);
+
+ do {
+ old = v;
+
+ /*
+ * If the io latency was reasonably close to the current
+ * latency, skip doing the update and atomic operation - most of
+ * the time:
+ */
+ if (abs((int) (old - io_latency)) < (old >> 1) &&
+ now & ~(~0 << 5))
+ break;
+
+ new = ewma_add((u64) old, io_latency, 6);
+ } while ((v = atomic_cmpxchg(latency, old, new)) != old);
+}
+
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
struct bio_vec *bv;
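
The new bch2_latency_acct() above keeps a per-device moving average of IO latency, updated locklessly with a cmpxchg loop and skipped entirely when a sample is close to the current value. A minimal sketch of the shift-based EWMA it leans on is below; the exact ewma_add() macro lives in bcachefs's util.h and may differ in detail, so treat the formula as an assumption.

#include <stdint.h>

/*
 * Assumed shift-based EWMA (weight 6, as passed to ewma_add() above):
 * new = old * (2^w - 1) / 2^w + val / 2^w, in integer arithmetic.
 */
static inline uint64_t ewma_add_sketch(uint64_t ewma, uint64_t val, unsigned weight)
{
	return ((ewma << weight) - ewma + val) >> weight;
}

With weight 6 each new sample contributes 1/64 of the average, so the occasional skipped update (the early break above) costs little accuracy.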
@@ -63,10 +86,12 @@ pool_alloc:
}
void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t bytes)
+ size_t bytes)
{
bool using_mempool = false;
+ BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
+
bio->bi_iter.bi_size = bytes;
while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
@@ -76,7 +101,35 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
mutex_unlock(&c->bio_bounce_pages_lock);
}
-/* Bios with headers */
+void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
+ size_t bytes)
+{
+ while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+
+ BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
+
+ bv->bv_page = alloc_page(GFP_NOIO);
+ if (!bv->bv_page) {
+ /*
+			 * We already allocated from the mempool; we can't allocate from it again
+			 * without freeing the pages we already allocated, or else we could
+ * deadlock:
+ */
+ bch2_bio_free_pages_pool(c, bio);
+ bch2_bio_alloc_pages_pool(c, bio, bytes);
+ return;
+ }
+
+ bv->bv_len = PAGE_SIZE;
+ bv->bv_offset = 0;
+ bio->bi_vcnt++;
+ }
+
+ bio->bi_iter.bi_size = bytes;
+}
+
+/* Writes */
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
@@ -137,17 +190,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
}
}
-/* IO errors */
-
-/* Writes */
-
-static struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
- return op->alloc_reserve == RESERVE_MOVINGGC
- ? op->c->copygc_wq
- : op->c->wq;
-}
-
static void __bch2_write(struct closure *);
static void bch2_write_done(struct closure *cl)
@@ -176,7 +218,7 @@ static u64 keylist_sectors(struct keylist *keys)
return ret;
}
-static int bch2_write_index_default(struct bch_write_op *op)
+int bch2_write_index_default(struct bch_write_op *op)
{
struct keylist *keys = &op->insert_keys;
struct btree_iter iter;
@@ -202,7 +244,6 @@ static void bch2_write_index(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
- unsigned i;
op->flags |= BCH_WRITE_LOOPED;
@@ -220,13 +261,7 @@ static void bch2_write_index(struct closure *cl)
}
}
- for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
- if (op->open_buckets[i]) {
- bch2_open_bucket_put(c,
- c->open_buckets +
- op->open_buckets[i]);
- op->open_buckets[i] = 0;
- }
+ bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
if (!(op->flags & BCH_WRITE_DONE))
continue_at(cl, __bch2_write, op->io_wq);
@@ -287,6 +322,8 @@ static void bch2_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
+ bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+
if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
set_bit(ca->dev_idx, op->failed.d);
set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
@@ -307,179 +344,364 @@ static void bch2_write_endio(struct bio *bio)
closure_put(cl);
}
-static struct nonce extent_nonce(struct bversion version,
- unsigned nonce,
- unsigned uncompressed_size,
- unsigned compression_type)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32((nonce << 12) |
- (uncompressed_size << 22)),
- [1] = cpu_to_le32(version.lo),
- [2] = cpu_to_le32(version.lo >> 32),
- [3] = cpu_to_le32(version.hi|
- (compression_type << 24))^BCH_NONCE_EXTENT,
- }};
-}
-
static void init_append_extent(struct bch_write_op *op,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type,
- struct open_bucket *ob)
+ struct write_point *wp,
+ struct bversion version,
+ struct bch_extent_crc_unpacked crc)
{
struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
- op->pos.offset += uncompressed_size;
+ op->pos.offset += crc.uncompressed_size;
e->k.p = op->pos;
- e->k.size = uncompressed_size;
- e->k.version = op->version;
+ e->k.size = crc.uncompressed_size;
+ e->k.version = version;
bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
- bch2_extent_crc_append(e, compressed_size,
- uncompressed_size,
- compression_type,
- nonce, csum, csum_type);
-
- bch2_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas,
- ob, compressed_size);
+ bch2_extent_crc_append(e, crc);
+ bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size);
bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED));
bch2_keylist_push(&op->insert_keys);
}
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+ struct write_point *wp,
+ struct bio *src,
+ bool *page_alloc_failed)
{
- struct bch_fs *c = op->c;
- struct bio *orig = &op->wbio.bio;
- struct bio *bio;
struct bch_write_bio *wbio;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
- struct bkey_i *key_to_write;
- unsigned csum_type = op->csum_type;
- unsigned compression_type = op->compression_type;
- int ret, more;
+ struct bio *bio;
+ unsigned output_available =
+ min(wp->sectors_free << 9, src->bi_iter.bi_size);
+ unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
+
+ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
+ wbio = wbio_init(bio);
+ wbio->bounce = true;
+ wbio->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ wbio->bio.bi_opf = src->bi_opf;
+
+ /*
+ * We can't use mempool for more than c->sb.encoded_extent_max
+ * worth of pages, but we'd like to allocate more if we can:
+ */
+ while (bio->bi_iter.bi_size < output_available) {
+ unsigned len = min_t(unsigned, PAGE_SIZE,
+ output_available - bio->bi_iter.bi_size);
+ struct page *p;
+
+ p = alloc_page(GFP_NOIO);
+ if (!p) {
+ unsigned pool_max =
+ min_t(unsigned, output_available,
+ c->sb.encoded_extent_max << 9);
+
+ if (bio_sectors(bio) < pool_max)
+ bch2_bio_alloc_pages_pool(c, bio, pool_max);
+ break;
+ }
+
+ bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
+ .bv_page = p,
+ .bv_len = len,
+ .bv_offset = 0,
+ };
+ bio->bi_iter.bi_size += len;
+ }
- /* don't refetch csum type/compression type */
- barrier();
+ *page_alloc_failed = bio->bi_vcnt < pages;
+ return bio;
+}
+
+static int bch2_write_rechecksum(struct bch_fs *c,
+ struct bch_write_op *op,
+ unsigned new_csum_type)
+{
+ struct bio *bio = &op->wbio.bio;
+ struct bch_extent_crc_unpacked new_crc;
+ int ret;
- BUG_ON(!bio_sectors(orig));
+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(new_csum_type))
+ new_csum_type = op->crc.csum_type;
+
+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
+ if (ret)
+ return ret;
+
+ bio_advance(bio, op->crc.offset << 9);
+ bio->bi_iter.bi_size = op->crc.live_size << 9;
+ op->crc = new_crc;
+ return 0;
+}
+
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ struct bch_csum csum;
+
+ if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+ return 0;
+
+ /*
+ * If we need to decrypt data in the write path, we'll no longer be able
+ * to verify the existing checksum (poly1305 mac, in this case) after
+ * it's decrypted - this is the last point we'll be able to reverify the
+ * checksum:
+ */
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ if (bch2_crc_cmp(op->crc.csum, csum))
+ return -EIO;
+
+ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ return 0;
+}
+
+static enum prep_encoded_ret {
+ PREP_ENCODED_OK,
+ PREP_ENCODED_ERR,
+ PREP_ENCODED_CHECKSUM_ERR,
+ PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+ struct bch_fs *c = op->c;
+ struct bio *bio = &op->wbio.bio;
- /* Need to decompress data? */
- if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
- (crc_uncompressed_size(NULL, &op->crc) != op->size ||
- crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) {
- int ret;
+ if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+ return PREP_ENCODED_OK;
- ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc);
- if (ret)
- return ret;
+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
- op->flags &= ~BCH_WRITE_DATA_COMPRESSED;
+ /* Can we just write the entire extent as is? */
+ if (op->crc.uncompressed_size == op->crc.live_size &&
+ op->crc.compressed_size <= wp->sectors_free &&
+ op->crc.compression_type == op->compression_type) {
+ if (!op->crc.compression_type &&
+ op->csum_type != op->crc.csum_type &&
+ bch2_write_rechecksum(c, op, op->csum_type))
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ return PREP_ENCODED_DO_WRITE;
}
- if (op->flags & BCH_WRITE_DATA_COMPRESSED) {
- init_append_extent(op,
- crc_compressed_size(NULL, &op->crc),
- crc_uncompressed_size(NULL, &op->crc),
- op->crc.compression_type,
- op->crc.nonce,
- op->crc.csum,
- op->crc.csum_type,
- wp->ob);
-
- bio = orig;
- wbio = wbio_init(bio);
- more = 0;
- } else if (csum_type != BCH_CSUM_NONE ||
- compression_type != BCH_COMPRESSION_NONE) {
- /* all units here in bytes */
- unsigned total_output = 0, output_available =
- min(wp->sectors_free << 9, orig->bi_iter.bi_size);
- unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type)
- ? op->nonce : 0;
+ /*
+ * If the data is compressed and we couldn't write the entire extent as
+ * is, we have to decompress it:
+ */
+ if (op->crc.compression_type) {
struct bch_csum csum;
- struct nonce nonce;
- bio = bio_alloc_bioset(GFP_NOIO,
- DIV_ROUND_UP(output_available, PAGE_SIZE),
- &c->bio_write);
- wbio = wbio_init(bio);
- wbio->bounce = true;
- wbio->put_bio = true;
- /* copy WRITE_SYNC flag */
- wbio->bio.bi_opf = orig->bi_opf;
+ if (bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
- /*
- * XXX: can't use mempool for more than
- * BCH_COMPRESSED_EXTENT_MAX worth of pages
- */
- bch2_bio_alloc_pages_pool(c, bio, output_available);
+ /* Last point we can still verify checksum: */
+ csum = bch2_checksum_bio(c, op->crc.csum_type,
+ extent_nonce(op->version, op->crc),
+ bio);
+ if (bch2_crc_cmp(op->crc.csum, csum))
+ return PREP_ENCODED_CHECKSUM_ERR;
- do {
- unsigned fragment_compression_type = compression_type;
- size_t dst_len, src_len;
+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+ return PREP_ENCODED_ERR;
+ }
- bch2_bio_compress(c, bio, &dst_len,
- orig, &src_len,
- &fragment_compression_type);
+ /*
+ * No longer have compressed data after this point - data might be
+ * encrypted:
+ */
- nonce = extent_nonce(op->version,
- crc_nonce,
- src_len >> 9,
- fragment_compression_type);
+ /*
+ * If the data is checksummed and we're only writing a subset,
+ * rechecksum and adjust bio to point to currently live data:
+ */
+ if ((op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) &&
+ bch2_write_rechecksum(c, op, op->csum_type))
+ return PREP_ENCODED_CHECKSUM_ERR;
- swap(bio->bi_iter.bi_size, dst_len);
- bch2_encrypt_bio(c, csum_type, nonce, bio);
+ /*
+ * If we want to compress the data, it has to be decrypted:
+ */
+ if ((op->compression_type ||
+ bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(op->csum_type)) &&
+ bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
- csum = bch2_checksum_bio(c, csum_type, nonce, bio);
- swap(bio->bi_iter.bi_size, dst_len);
+ return PREP_ENCODED_OK;
+}
- init_append_extent(op,
- dst_len >> 9, src_len >> 9,
- fragment_compression_type,
- crc_nonce, csum, csum_type, wp->ob);
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
+{
+ struct bch_fs *c = op->c;
+ struct bio *src = &op->wbio.bio, *dst = src;
+ struct bvec_iter saved_iter;
+ struct bkey_i *key_to_write;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+ unsigned total_output = 0;
+ bool bounce = false, page_alloc_failed = false;
+ int ret, more = 0;
- total_output += dst_len;
- bio_advance(bio, dst_len);
- bio_advance(orig, src_len);
- } while (bio->bi_iter.bi_size &&
- orig->bi_iter.bi_size &&
- !bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX));
+ BUG_ON(!bio_sectors(src));
- BUG_ON(total_output > output_available);
+ switch (bch2_write_prep_encoded_data(op, wp)) {
+ case PREP_ENCODED_OK:
+ break;
+ case PREP_ENCODED_ERR:
+ ret = -EIO;
+ goto err;
+ case PREP_ENCODED_CHECKSUM_ERR:
+ goto csum_err;
+ case PREP_ENCODED_DO_WRITE:
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
+ }
- memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
- bio->bi_iter.bi_size = total_output;
+ if (op->compression_type ||
+ (op->csum_type &&
+ !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ (bch2_csum_type_is_encryption(op->csum_type) &&
+ !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
+ bounce = true;
+ }
- /*
- * Free unneeded pages after compressing:
- */
- while (bio->bi_vcnt * PAGE_SIZE >
- round_up(bio->bi_iter.bi_size, PAGE_SIZE))
- mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
- &c->bio_bounce_pages);
+ saved_iter = dst->bi_iter;
- more = orig->bi_iter.bi_size != 0;
- } else {
- bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO,
- &c->bio_write);
- wbio = wbio_init(bio);
- wbio->put_bio = bio != orig;
+ do {
+ struct bch_extent_crc_unpacked crc =
+ (struct bch_extent_crc_unpacked) { 0 };
+ struct bversion version = op->version;
+ size_t dst_len, src_len;
+
+ if (page_alloc_failed &&
+ bio_sectors(dst) < wp->sectors_free &&
+ bio_sectors(dst) < c->sb.encoded_extent_max)
+ break;
- init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
- compression_type, 0,
- (struct bch_csum) { 0 }, csum_type, wp->ob);
+ BUG_ON(op->compression_type &&
+ (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_csum_type_is_encryption(op->crc.csum_type));
+ BUG_ON(op->compression_type && !bounce);
+
+ crc.compression_type = op->compression_type
+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+ op->compression_type)
+ : 0;
+ if (!crc.compression_type) {
+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+ if (op->csum_type)
+ dst_len = min_t(unsigned, dst_len,
+ c->sb.encoded_extent_max << 9);
+
+ if (bounce) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bio_copy_data(dst, src);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
- more = bio != orig;
+ src_len = dst_len;
+ }
+
+ BUG_ON(!src_len || !dst_len);
+
+ if (bch2_csum_type_is_encryption(op->csum_type)) {
+ if (bversion_zero(version)) {
+ version.lo = atomic64_inc_return(&c->key_version) + 1;
+ } else {
+ crc.nonce = op->nonce;
+ op->nonce += src_len >> 9;
+ }
+ }
+
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ !crc.compression_type &&
+ bch2_csum_type_is_encryption(op->crc.csum_type) ==
+ bch2_csum_type_is_encryption(op->csum_type)) {
+ /*
+ * Note: when we're using rechecksum(), we need to be
+ * checksumming @src because it has all the data our
+ * existing checksum covers - if we bounced (because we
+ * were trying to compress), @dst will only have the
+ * part of the data the new checksum will cover.
+ *
+ * But normally we want to be checksumming post bounce,
+ * because part of the reason for bouncing is so the
+ * data can't be modified (by userspace) while it's in
+ * flight.
+ */
+ if (bch2_rechecksum_bio(c, src, version, op->crc,
+ &crc, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->csum_type))
+ goto csum_err;
+ } else {
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_rechecksum_bio(c, src, version, op->crc,
+ NULL, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->crc.csum_type))
+ goto csum_err;
+
+ crc.compressed_size = dst_len >> 9;
+ crc.uncompressed_size = src_len >> 9;
+ crc.live_size = src_len >> 9;
+
+ swap(dst->bi_iter.bi_size, dst_len);
+ bch2_encrypt_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ crc.csum = bch2_checksum_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ crc.csum_type = op->csum_type;
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+
+ init_append_extent(op, wp, version, crc);
+
+ if (dst != src)
+ bio_advance(dst, dst_len);
+ bio_advance(src, src_len);
+ total_output += dst_len;
+ } while (dst->bi_iter.bi_size &&
+ src->bi_iter.bi_size &&
+ wp->sectors_free &&
+ !bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX));
+
+ more = src->bi_iter.bi_size != 0;
+
+ dst->bi_iter = saved_iter;
+
+ if (!bounce && more) {
+ dst = bio_split(src, total_output >> 9,
+ GFP_NOIO, &c->bio_write);
+ wbio_init(dst)->put_bio = true;
}
+ dst->bi_iter.bi_size = total_output;
+
+ /* Free unneeded pages after compressing: */
+ if (bounce)
+ while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
+ mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
+ &c->bio_bounce_pages);
+do_write:
/* might have done a realloc... */
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
@@ -487,30 +709,40 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
BCH_DATA_USER);
if (ret)
- return ret;
+ goto err;
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ dst->bi_end_io = bch2_write_endio;
+ dst->bi_private = &op->cl;
+ bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
- closure_get(bio->bi_private);
+ closure_get(dst->bi_private);
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+ bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
key_to_write);
return more;
+csum_err:
+ bch_err(c, "error verifying existing checksum while "
+ "rewriting existing data (memory corruption?)");
+ ret = -EIO;
+err:
+ if (bounce) {
+ bch2_bio_free_pages_pool(c, dst);
+ bio_put(dst);
+ }
+
+ return ret;
}
static void __bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
- unsigned open_bucket_nr = 0;
struct write_point *wp;
- struct open_bucket *ob;
int ret;
do {
- if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
+ if (op->open_buckets_nr + op->nr_replicas >
+ ARRAY_SIZE(op->open_buckets))
continue_at(cl, bch2_write_index, index_update_wq(op));
/* for the device pointers and 1 for the chksum */
@@ -520,11 +752,12 @@ static void __bch2_write(struct closure *cl)
BKEY_EXTENT_U64s_MAX))
continue_at(cl, bch2_write_index, index_update_wq(op));
- wp = bch2_alloc_sectors_start(c, BCH_DATA_USER,
+ wp = bch2_alloc_sectors_start(c,
op->devs,
op->write_point,
+ &op->devs_have,
op->nr_replicas,
- c->opts.data_replicas_required,
+ op->nr_replicas_required,
op->alloc_reserve,
op->flags,
(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
@@ -565,14 +798,13 @@ static void __bch2_write(struct closure *cl)
continue;
}
- ob = wp->ob;
-
- BUG_ON(ob - c->open_buckets == 0 ||
- ob - c->open_buckets > U8_MAX);
- op->open_buckets[open_bucket_nr++] = ob - c->open_buckets;
-
ret = bch2_write_extent(op, wp);
+ BUG_ON(op->open_buckets_nr + wp->nr_ptrs_can_use >
+ ARRAY_SIZE(op->open_buckets));
+ bch2_open_bucket_get(c, wp,
+ &op->open_buckets_nr,
+ op->open_buckets);
bch2_alloc_sectors_done(c, wp);
if (ret < 0)
@@ -603,30 +835,6 @@ err:
: bch2_write_done, index_update_wq(op));
}
-void bch2_wake_delayed_writes(unsigned long data)
-{
- struct bch_fs *c = (void *) data;
- struct bch_write_op *op;
- unsigned long flags;
-
- spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
-
- while ((op = c->write_wait_head)) {
- if (time_after(op->expires, jiffies)) {
- mod_timer(&c->foreground_write_wakeup, op->expires);
- break;
- }
-
- c->write_wait_head = op->next;
- if (!c->write_wait_head)
- c->write_wait_tail = NULL;
-
- closure_put(&op->cl);
- }
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
-}
-
/**
* bch_write - handle a write to a cache device or flash only volume
*
@@ -646,9 +854,17 @@ void bch2_wake_delayed_writes(unsigned long data)
void bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
- u64 inode = op->pos.inode;
+
+ BUG_ON(!op->nr_replicas);
+ BUG_ON(!op->write_point.v);
+ BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+ BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
+
+ memset(&op->failed, 0, sizeof(op->failed));
+
+ bch2_keylist_init(&op->insert_keys, op->inline_keys);
+ wbio_init(&op->wbio.bio)->put_bio = false;
if (c->opts.nochanges ||
!percpu_ref_tryget(&c->writes)) {
@@ -658,102 +874,11 @@ void bch2_write(struct closure *cl)
closure_return(cl);
}
- if (bversion_zero(op->version) &&
- bch2_csum_type_is_encryption(op->csum_type))
- op->version.lo =
- atomic64_inc_return(&c->key_version) + 1;
-
- bch2_increment_clock(c, bio_sectors(bio), WRITE);
-
- /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */
-
- if ((op->flags & BCH_WRITE_THROTTLE) &&
- c->foreground_write_ratelimit_enabled &&
- c->foreground_write_pd.rate.rate < (1 << 30)) {
- unsigned long flags;
- u64 delay;
-
- spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
- bch2_ratelimit_increment(&c->foreground_write_pd.rate,
- bio->bi_iter.bi_size);
-
- delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate);
-
- if (delay >= HZ / 100) {
- trace_write_throttle(c, inode, bio, delay);
-
- closure_get(&op->cl); /* list takes a ref */
-
- op->expires = jiffies + delay;
- op->next = NULL;
-
- if (c->write_wait_tail)
- c->write_wait_tail->next = op;
- else
- c->write_wait_head = op;
- c->write_wait_tail = op;
-
- if (!timer_pending(&c->foreground_write_wakeup))
- mod_timer(&c->foreground_write_wakeup,
- op->expires);
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock,
- flags);
- continue_at(cl, __bch2_write, index_update_wq(op));
- }
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
- }
+ bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
continue_at_nobarrier(cl, __bch2_write, NULL);
}
-void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct disk_reservation res,
- struct bch_devs_mask *devs,
- unsigned long write_point,
- struct bpos pos,
- u64 *journal_seq, unsigned flags)
-{
- EBUG_ON(res.sectors && !res.nr_replicas);
-
- op->c = c;
- op->io_wq = index_update_wq(op);
- op->written = 0;
- op->error = 0;
- op->flags = flags;
- op->csum_type = bch2_data_checksum_type(c);
- op->compression_type =
- bch2_compression_opt_to_type(c->opts.compression);
- op->nr_replicas = res.nr_replicas;
- op->alloc_reserve = RESERVE_NONE;
- op->nonce = 0;
- op->pos = pos;
- op->version = ZERO_VERSION;
- op->res = res;
- op->devs = devs;
- op->write_point = write_point;
-
- if (journal_seq) {
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
- } else {
- op->journal_seq = 0;
- }
-
- op->index_update_fn = bch2_write_index_default;
-
- memset(op->open_buckets, 0, sizeof(op->open_buckets));
- memset(&op->failed, 0, sizeof(op->failed));
-
- bch2_keylist_init(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys));
-
- if (version_stress_test(c))
- get_random_bytes(&op->version, sizeof(op->version));
-}
-
/* Cache promotion on read */
struct promote_op {
@@ -787,11 +912,20 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
trace_promote(&rbio->bio);
/* we now own pages: */
+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * bio->bi_vcnt);
rbio->promote = NULL;
+ __bch2_write_op_init(&op->write.op, c);
+
+ op->write.move_dev = -1;
+ op->write.op.devs = c->fastest_devs;
+ op->write.op.write_point = writepoint_hashed((unsigned long) current);
+ op->write.op.flags |= BCH_WRITE_ALLOC_NOWAIT;
+ op->write.op.flags |= BCH_WRITE_CACHED;
+
+ bch2_migrate_write_init(&op->write, rbio);
+
closure_init(cl, NULL);
closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
closure_return_with_destructor(cl, promote_done);
@@ -801,57 +935,27 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
* XXX: multiple promotes can race with each other, wastefully. Keep a list of
* outstanding promotes?
*/
-static struct promote_op *promote_alloc(struct bch_fs *c,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_pick_ptr *pick,
- bool read_full)
+static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
{
struct promote_op *op;
struct bio *bio;
- /*
- * biovec needs to be big enough to hold decompressed data, if
- * bch2_write_extent() has to decompress/recompress it:
- */
- unsigned sectors = max_t(unsigned, k.k->size,
- crc_uncompressed_size(NULL, &pick->crc));
- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ /* data might have to be decompressed in the write path: */
+ unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
+ PAGE_SECTORS);
- op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+ BUG_ON(!rbio->bounce);
+ BUG_ON(pages < rbio->bio.bi_vcnt);
+
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages,
+ GFP_NOIO);
if (!op)
return NULL;
bio = &op->write.op.wbio.bio;
bio_init(bio, bio->bi_inline_vecs, pages);
- bio->bi_iter = iter;
-
- if (pick->crc.compression_type) {
- op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
- op->write.op.crc = pick->crc;
- op->write.op.size = k.k->size;
- } else if (read_full) {
- /*
- * Adjust bio to correspond to _live_ portion of @k -
- * which might be less than what we're actually reading:
- */
- bio->bi_iter.bi_size = sectors << 9;
- bio_advance(bio, pick->crc.offset << 9);
- BUG_ON(bio_sectors(bio) < k.k->size);
- bio->bi_iter.bi_size = k.k->size << 9;
- } else {
- /*
- * Set insert pos to correspond to what we're actually
- * reading:
- */
- op->write.op.pos.offset = iter.bi_sector;
- }
- bch2_migrate_write_init(c, &op->write,
- c->fastest_devs,
- k, NULL,
- BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_CACHED);
- op->write.promote = true;
+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
return op;
}
@@ -863,9 +967,6 @@ static bool should_promote(struct bch_fs *c,
if (!(flags & BCH_READ_MAY_PROMOTE))
return false;
- if (flags & BCH_READ_IN_RETRY)
- return false;
-
if (percpu_ref_is_dying(&c->writes))
return false;
@@ -875,10 +976,20 @@ static bool should_promote(struct bch_fs *c,
/* Read */
+static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *,
+ struct bvec_iter, u64,
+ struct bch_devs_mask *, unsigned);
+
#define READ_RETRY_AVOID 1
#define READ_RETRY 2
#define READ_ERR 3
+enum rbio_context {
+ RBIO_CONTEXT_NULL,
+ RBIO_CONTEXT_HIGHPRI,
+ RBIO_CONTEXT_UNBOUND,
+};
+
static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
@@ -887,14 +998,14 @@ bch2_rbio_parent(struct bch_read_bio *rbio)
__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+ enum rbio_context context,
struct workqueue_struct *wq)
{
-
- if (!wq || rbio->process_context) {
+ if (context <= rbio->context) {
fn(&rbio->work);
} else {
rbio->work.func = fn;
- rbio->process_context = true;
+ rbio->context = context;
queue_work(wq, &rbio->work);
}
}
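
The reworked bch2_rbio_punt() above replaces the old process_context boolean with a small ordered enum: work only ever moves to a heavier context, and runs inline when the rbio is already in one at least as heavy as requested. A tiny sketch of that invariant, with a name that is illustrative only:

/*
 * Illustrative: RBIO_CONTEXT_NULL(0) < HIGHPRI(1) < UNBOUND(2).
 * Punt to a workqueue only when more context is needed than we already have.
 */
static inline bool rbio_needs_punt(enum rbio_context have, enum rbio_context want)
{
	return want > have;
}

bch2_read_endio() later in this patch asks for UNBOUND when decompression, decryption, or crc narrowing is required and HIGHPRI for plain checksum verification, which keeps the heavyweight work out of the bio completion context.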
@@ -932,7 +1043,7 @@ static void bch2_rbio_retry(struct work_struct *work)
struct bch_fs *c = rbio->c;
struct bvec_iter iter = rbio->bvec_iter;
unsigned flags = rbio->flags;
- u64 inode = rbio->inode;
+ u64 inode = rbio->pos.inode;
struct bch_devs_mask avoid;
trace_read_retry(&rbio->bio);
@@ -942,15 +1053,24 @@ static void bch2_rbio_retry(struct work_struct *work)
if (rbio->retry == READ_RETRY_AVOID)
__set_bit(rbio->pick.ca->dev_idx, avoid.d);
+ if (rbio->promote)
+ kfree(rbio->promote);
+ rbio->promote = NULL;
+
if (rbio->split)
rbio = bch2_rbio_free(rbio);
else
rbio->bio.bi_error = 0;
- flags |= BCH_READ_MUST_CLONE;
+ if (!(flags & BCH_READ_NODECODE))
+ flags |= BCH_READ_MUST_CLONE;
flags |= BCH_READ_IN_RETRY;
+ flags &= ~BCH_READ_MAY_PROMOTE;
- __bch2_read(c, rbio, iter, inode, &avoid, flags);
+ if (flags & BCH_READ_NODECODE)
+ bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags);
+ else
+ __bch2_read(c, rbio, iter, inode, &avoid, flags);
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
@@ -964,108 +1084,175 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
bch2_rbio_parent(rbio)->bio.bi_error = error;
bch2_rbio_done(rbio);
} else {
- bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq);
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ }
+}
+
+static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = rbio->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_extent *e;
+ BKEY_PADDED(k) new;
+ struct bch_extent_crc_unpacked new_crc;
+ unsigned offset;
+ int ret;
+
+ if (rbio->pick.crc.compression_type)
+ return;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos,
+ BTREE_ITER_INTENT);
+retry:
+ k = bch2_btree_iter_peek(&iter);
+ if (IS_ERR_OR_NULL(k.k))
+ goto out;
+
+ if (!bkey_extent_is_data(k.k))
+ goto out;
+
+ bkey_reassemble(&new.k, k);
+ e = bkey_i_to_extent(&new.k);
+
+ if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset) ||
+ bversion_cmp(e->k.version, rbio->version))
+ goto out;
+
+ /* Extent was merged? */
+ if (bkey_start_offset(&e->k) < rbio->pos.offset ||
+ e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
+ goto out;
+
+ /* The extent might have been partially overwritten since we read it: */
+ offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
+
+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+ rbio->pick.crc, NULL, &new_crc,
+ offset, e->k.size,
+ rbio->pick.crc.csum_type)) {
+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+ goto out;
}
+
+ if (!bch2_extent_narrow_crcs(e, new_crc))
+ goto out;
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOWAIT,
+ BTREE_INSERT_ENTRY(&iter, &e->k_i));
+ if (ret == -EINTR)
+ goto retry;
+out:
+ bch2_btree_iter_unlock(&iter);
+}
+
+static bool should_narrow_crcs(struct bkey_s_c_extent e,
+ struct extent_pick_ptr *pick,
+ unsigned flags)
+{
+ return !(flags & BCH_READ_IN_RETRY) &&
+ bch2_can_narrow_extent_crcs(e, pick->crc);
}
-static int bch2_rbio_checksum_uncompress(struct bio *dst,
- struct bch_read_bio *rbio)
+/* Inner part that may run in process context */
+static void __bch2_read_endio(struct work_struct *work)
{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio;
+ struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->bvec_iter;
- struct nonce nonce = extent_nonce(rbio->version,
- rbio->pick.crc.nonce,
- crc_uncompressed_size(NULL, &rbio->pick.crc),
- rbio->pick.crc.compression_type);
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
struct bch_csum csum;
- int ret = 0;
- /*
- * reset iterator for checksumming and copying bounced data: here we've
- * set rbio->compressed_size to the amount of data we actually read,
- * which was not necessarily the full extent if we were only bouncing
- * in order to promote
- */
+ /* Reset iterator for checksumming and copying bounced data: */
if (rbio->bounce) {
- src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->pick.crc) << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
+ src->bi_iter.bi_size = crc.compressed_size << 9;
+ src->bi_iter.bi_idx = 0;
+ src->bi_iter.bi_bvec_done = 0;
} else {
- src->bi_iter = rbio->bvec_iter;
+ src->bi_iter = rbio->bvec_iter;
}
- csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src);
- if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum),
- rbio->pick.ca,
- "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
- rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9,
- rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo,
- rbio->pick.crc.csum_type))
- ret = -EIO;
+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
+ goto csum_err;
- /*
- * If there was a checksum error, still copy the data back - unless it
- * was compressed, we don't want to decompress bad data:
- */
- if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) {
- if (!ret) {
- bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
- ret = bch2_bio_uncompress(c, src, dst,
- dst_iter, rbio->pick.crc);
- if (ret)
- __bcache_io_error(c, "decompression error");
- }
- } else if (rbio->bounce) {
- bio_advance(src, rbio->pick.crc.offset << 9);
-
- /* don't need to decrypt the entire bio: */
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
+ if (unlikely(rbio->narrow_crcs))
+ bch2_rbio_narrow_crcs(rbio);
- nonce = nonce_add(nonce, rbio->pick.crc.offset << 9);
+ if (rbio->flags & BCH_READ_NODECODE)
+ goto nodecode;
- bch2_encrypt_bio(c, rbio->pick.crc.csum_type,
- nonce, src);
+ /* Adjust crc to point to subset of data we want: */
+ crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset;
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
- bio_copy_data_iter(dst, &dst_iter,
- src, &src->bi_iter);
+ if (crc.compression_type != BCH_COMPRESSION_NONE) {
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
+ goto decompression_err;
} else {
- bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
- }
+ /* don't need to decrypt the entire bio: */
+ nonce = nonce_add(nonce, crc.offset << 9);
+ bio_advance(src, crc.offset << 9);
- return ret;
-}
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+ src->bi_iter.bi_size = dst_iter.bi_size;
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- int ret;
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio);
- if (ret) {
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
- rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, ret);
- } else {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, ret);
+ if (rbio->bounce) {
+ struct bvec_iter src_iter = src->bi_iter;
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
- return;
}
- if (rbio->promote)
+ if (rbio->promote) {
+ /*
+		 * Re-encrypt data we decrypted, so it's consistent with
+ * rbio->crc:
+ */
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
promote_start(rbio->promote, rbio);
-
+ }
+nodecode:
if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
bch2_rbio_done(rbio);
+ return;
+csum_err:
+ /*
+ * Checksum error: if the bio wasn't bounced, we may have been
+ * reading into buffers owned by userspace (that userspace can
+ * scribble over) - retry the read, bouncing it this time:
+ */
+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+ rbio->flags |= BCH_READ_MUST_BOUNCE;
+ bch2_rbio_error(rbio, READ_RETRY, -EIO);
+ return;
+ }
+
+ bch2_dev_io_error(rbio->pick.ca,
+ "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
+ rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+ csum.hi, csum.lo, crc.csum_type);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO);
+ return;
+decompression_err:
+ __bcache_io_error(c, "decompression error, inode %llu offset %llu",
+ rbio->pos.inode,
+ (u64) rbio->bvec_iter.bi_sector);
+ bch2_rbio_error(rbio, READ_ERR, -EIO);
+ return;
}
static void bch2_read_endio(struct bio *bio)
@@ -1074,6 +1261,9 @@ static void bch2_read_endio(struct bio *bio)
container_of(bio, struct bch_read_bio, bio);
struct bch_fs *c = rbio->c;
struct workqueue_struct *wq = NULL;
+ enum rbio_context context = RBIO_CONTEXT_NULL;
+
+ bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ);
percpu_ref_put(&rbio->pick.ca->io_ref);
@@ -1097,38 +1287,45 @@ static void bch2_read_endio(struct bio *bio)
return;
}
- if (rbio->pick.crc.compression_type ||
+ if (rbio->narrow_crcs ||
+ rbio->pick.crc.compression_type ||
bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
- wq = system_unbound_wq;
+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
else if (rbio->pick.crc.csum_type)
- wq = system_highpri_wq;
+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
- bch2_rbio_punt(rbio, __bch2_read_endio, wq);
+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}
int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c k,
+ struct bvec_iter iter, struct bkey_s_c_extent e,
struct extent_pick_ptr *pick, unsigned flags)
{
struct bch_read_bio *rbio;
- struct promote_op *promote_op = NULL;
- unsigned skip = iter.bi_sector - bkey_start_offset(k.k);
- bool bounce = false, split, read_full = false;
+ bool split = false, bounce = false, read_full = false;
+ bool promote = false, narrow_crcs = false;
+ struct bpos pos = bkey_start_pos(e.k);
int ret = 0;
- bch2_increment_clock(c, bio_sectors(&orig->bio), READ);
PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand;
- EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
- k.k->p.offset < bvec_iter_end_sector(iter));
+ narrow_crcs = should_narrow_crcs(e, pick, flags);
+
+ if (flags & BCH_READ_NODECODE) {
+ BUG_ON(iter.bi_size < pick->crc.compressed_size << 9);
+ iter.bi_size = pick->crc.compressed_size << 9;
+ goto noclone;
+ }
+
+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+ flags |= BCH_READ_MUST_BOUNCE;
+
+ EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector ||
+ e.k->p.offset < bvec_iter_end_sector(iter));
- /*
- * note: if compression_type and crc_type both == none, then
- * compressed/uncompressed size is zero
- */
if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
(pick->crc.csum_type != BCH_CSUM_NONE &&
- (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) ||
+ (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick->crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
(flags & BCH_READ_MUST_BOUNCE)))) {
@@ -1136,17 +1333,30 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
bounce = true;
}
- if (should_promote(c, pick, flags))
- promote_op = promote_alloc(c, iter, k, pick, read_full);
-
+ promote = should_promote(c, pick, flags);
/* could also set read_full */
- if (promote_op)
+ if (promote)
bounce = true;
+ if (!read_full) {
+ EBUG_ON(pick->crc.compression_type);
+ EBUG_ON(pick->crc.csum_type &&
+ (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
+ bvec_iter_sectors(iter) != pick->crc.live_size ||
+ pick->crc.offset ||
+ iter.bi_sector != pos.offset));
+
+ pick->ptr.offset += pick->crc.offset +
+ (iter.bi_sector - pos.offset);
+ pick->crc.compressed_size = bvec_iter_sectors(iter);
+ pick->crc.uncompressed_size = bvec_iter_sectors(iter);
+ pick->crc.offset = 0;
+ pick->crc.live_size = bvec_iter_sectors(iter);
+ pos.offset = iter.bi_sector;
+ }
+
if (bounce) {
- unsigned sectors = read_full
- ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size)
- : bvec_iter_sectors(iter);
+ unsigned sectors = pick->crc.compressed_size;
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
@@ -1163,41 +1373,38 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
- rbio = rbio_init(bio_clone_fast(&orig->bio,
- GFP_NOIO, &c->bio_read_split));
+ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
+ &c->bio_read_split));
rbio->bio.bi_iter = iter;
split = true;
} else {
+noclone:
rbio = orig;
rbio->bio.bi_iter = iter;
split = false;
BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
- rbio->c = c;
+ BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size);
+ rbio->c = c;
if (split)
rbio->parent = orig;
else
rbio->end_io = orig->bio.bi_end_io;
-
rbio->bvec_iter = iter;
+ rbio->submit_time_us = local_clock_us();
rbio->flags = flags;
rbio->bounce = bounce;
rbio->split = split;
- rbio->process_context = false;
+ rbio->narrow_crcs = narrow_crcs;
rbio->retry = 0;
+ rbio->context = 0;
+ rbio->devs_have = bch2_extent_devs(e);
rbio->pick = *pick;
- /*
- * crc.compressed_size will be 0 if there wasn't any checksum
- * information, also we need to stash the original size of the bio if we
- * bounced (which isn't necessarily the original key size, if we bounced
- * only for promoting)
- */
- rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1;
- rbio->version = k.k->version;
- rbio->promote = promote_op;
- rbio->inode = k.k->p.inode;
+ rbio->pos = pos;
+ rbio->version = e.k->version;
+ rbio->promote = promote ? promote_alloc(rbio) : NULL;
INIT_WORK(&rbio->work, NULL);
rbio->bio.bi_bdev = pick->ca->disk_sb.bdev;
@@ -1205,16 +1412,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
- if (read_full)
- rbio->pick.crc.offset += skip;
- else
- rbio->bio.bi_iter.bi_sector += skip;
-
- rbio->submit_time_us = local_clock_us();
-
if (bounce)
trace_read_bounce(&rbio->bio);
+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
bio_sectors(&rbio->bio));
@@ -1223,7 +1424,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
} else {
submit_bio_wait(&rbio->bio);
- rbio->process_context = true;
+ rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
@@ -1234,6 +1435,79 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
return ret;
}
+static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
+{
+ struct extent_pick_ptr pick;
+ struct btree_iter iter;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector),
+ BTREE_ITER_WITH_HOLES);
+retry:
+ k = bch2_btree_iter_peek_with_holes(&iter);
+ if (btree_iter_err(k)) {
+ bch2_btree_iter_unlock(&iter);
+ goto err;
+ }
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ if (!bkey_extent_is_data(k.k) ||
+ !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset) ||
+ bkey_start_offset(k.k) != bvec_iter.bi_sector)
+ goto err;
+
+ bch2_extent_pick_ptr(c, k, avoid, &pick);
+ if (IS_ERR(pick.ca)) {
+ bcache_io_error(c, &rbio->bio, "no device to read from");
+ bio_endio(&rbio->bio);
+ return;
+ }
+
+ if (!pick.ca)
+ goto err;
+
+ if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) {
+ percpu_ref_put(&pick.ca->io_ref);
+ goto err;
+
+ }
+
+ ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k),
+ &pick, flags);
+ switch (ret) {
+ case READ_RETRY_AVOID:
+ __set_bit(pick.ca->dev_idx, avoid->d);
+ case READ_RETRY:
+ goto retry;
+ case READ_ERR:
+ bio_endio(&rbio->bio);
+ return;
+ };
+
+ return;
+err:
+ /*
+ * extent we wanted to read no longer exists, or
+ * was merged or partially overwritten (and thus
+ * possibly bigger than the memory that was
+ * originally allocated)
+ */
+ rbio->bio.bi_error = -EINTR;
+ bio_endio(&rbio->bio);
+ return;
+}
+
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, u64 inode,
struct bch_devs_mask *avoid, unsigned flags)
@@ -1241,6 +1515,8 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct btree_iter iter;
struct bkey_s_c k;
int ret;
+
+ EBUG_ON(flags & BCH_READ_NODECODE);
retry:
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
POS(inode, bvec_iter.bi_sector),
@@ -1277,7 +1553,8 @@ retry:
}
ret = __bch2_read_extent(c, rbio, fragment,
- k, &pick, flags);
+ bkey_s_c_to_extent(k),
+ &pick, flags);
switch (ret) {
case READ_RETRY_AVOID:
__set_bit(pick.ca->dev_idx, avoid->d);
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 658c15a5..bd0d7c43 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -2,6 +2,8 @@
#define _BCACHEFS_IO_H
#include <linux/hash.h>
+#include "alloc.h"
+#include "checksum.h"
#include "io_types.h"
#define to_wbio(_bio) \
@@ -12,6 +14,9 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
@@ -20,14 +25,15 @@ enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_FLUSH = (1 << 2),
- BCH_WRITE_DATA_COMPRESSED = (1 << 3),
- BCH_WRITE_THROTTLE = (1 << 4),
- BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5),
+ BCH_WRITE_DATA_ENCODED = (1 << 3),
+ BCH_WRITE_PAGES_STABLE = (1 << 4),
+ BCH_WRITE_PAGES_OWNED = (1 << 5),
+ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
/* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
- BCH_WRITE_DONE = (1 << 7),
- BCH_WRITE_LOOPED = (1 << 8),
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
+ BCH_WRITE_DONE = (1 << 8),
+ BCH_WRITE_LOOPED = (1 << 9),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -36,11 +42,60 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
-void bch2_write_op_init(struct bch_write_op *, struct bch_fs *,
- struct disk_reservation,
- struct bch_devs_mask *,
- unsigned long,
- struct bpos, u64 *, unsigned);
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+ return op->alloc_reserve == RESERVE_MOVINGGC
+ ? op->c->copygc_wq
+ : op->c->wq;
+}
+
+int bch2_write_index_default(struct bch_write_op *);
+
+static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
+{
+ op->c = c;
+ op->io_wq = index_update_wq(op);
+ op->flags = 0;
+ op->written = 0;
+ op->error = 0;
+ op->csum_type = bch2_data_checksum_type(c);
+ op->compression_type =
+ bch2_compression_opt_to_type(c->opts.compression);
+ op->nr_replicas = 0;
+ op->nr_replicas_required = c->opts.data_replicas_required;
+ op->alloc_reserve = RESERVE_NONE;
+ op->open_buckets_nr = 0;
+ op->devs_have.nr = 0;
+ op->pos = POS_MAX;
+ op->version = ZERO_VERSION;
+ op->devs = NULL;
+ op->write_point = (struct write_point_specifier) { 0 };
+ op->res = (struct disk_reservation) { 0 };
+ op->journal_seq = 0;
+ op->index_update_fn = bch2_write_index_default;
+}
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+ struct disk_reservation res,
+ struct bch_devs_mask *devs,
+ struct write_point_specifier write_point,
+ struct bpos pos,
+ u64 *journal_seq, unsigned flags)
+{
+ __bch2_write_op_init(op, c);
+ op->flags = flags;
+ op->nr_replicas = res.nr_replicas;
+ op->pos = pos;
+ op->res = res;
+ op->devs = devs;
+ op->write_point = write_point;
+
+ if (journal_seq) {
+ op->journal_seq_p = journal_seq;
+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+ }
+}
+
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
@@ -51,14 +106,13 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
return wbio;
}
-void bch2_wake_delayed_writes(unsigned long data);
-
struct bch_devs_mask;
struct cache_promote_op;
struct extent_pick_ptr;
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- struct bkey_s_c k, struct extent_pick_ptr *, unsigned);
+ struct bkey_s_c_extent e, struct extent_pick_ptr *,
+ unsigned);
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
u64, struct bch_devs_mask *, unsigned);
@@ -66,21 +120,22 @@ enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
+ BCH_READ_NODECODE = 1 << 3,
/* internal: */
- BCH_READ_MUST_BOUNCE = 1 << 3,
- BCH_READ_MUST_CLONE = 1 << 4,
- BCH_READ_IN_RETRY = 1 << 5,
+ BCH_READ_MUST_BOUNCE = 1 << 4,
+ BCH_READ_MUST_CLONE = 1 << 5,
+ BCH_READ_IN_RETRY = 1 << 6,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
- struct bkey_s_c k,
+ struct bkey_s_c_extent e,
struct extent_pick_ptr *pick,
unsigned flags)
{
rbio->_state = 0;
- __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags);
+ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
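The io.h hunks above split write-op setup in two: __bch2_write_op_init() fills every field with its default, while bch2_write_op_init() layers the reservation, device mask, write point and position on top for the common path. Callers that need non-default fields (the move path further down is one) use the bare variant and assign the rest themselves. A minimal sketch of that pattern, using only fields shown above; the helper name is made up for illustration:

static void example_init_move_style_op(struct bch_write_op *op,
				       struct bch_fs *c,
				       struct bch_devs_mask *devs,
				       struct write_point_specifier wp)
{
	/* start from the defaults (RESERVE_NONE, POS_MAX, default csum/compression): */
	__bch2_write_op_init(op, c);

	/* then override only what this caller cares about: */
	op->devs		  = devs;
	op->write_point		  = wp;
	op->nr_replicas		  = 1;
	op->nr_replicas_required  = 1;
}

bch2_write_op_init() itself is just this pattern wrapped up for callers that already have a disk reservation in hand.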
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index f77106be..ed9a4bbe 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -1,20 +1,16 @@
#ifndef _BCACHEFS_IO_TYPES_H
#define _BCACHEFS_IO_TYPES_H
+#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
+#include "extents_types.h"
#include "keylist_types.h"
#include "super_types.h"
#include <linux/llist.h>
#include <linux/workqueue.h>
-struct extent_pick_ptr {
- struct bch_extent_crc128 crc;
- struct bch_extent_ptr ptr;
- struct bch_dev *ca;
-};
-
struct bch_read_bio {
struct bch_fs *c;
@@ -44,26 +40,22 @@ struct bch_read_bio {
struct {
u8 bounce:1,
split:1,
- process_context:1,
- retry:2;
+ narrow_crcs:1,
+ retry:2,
+ context:2;
};
u8 _state;
};
+ struct bch_devs_list devs_have;
+
struct extent_pick_ptr pick;
+ /* start pos of data we read (may not be pos of data we want) */
+ struct bpos pos;
struct bversion version;
struct promote_op *promote;
- /*
- * If we have to retry the read (IO error, checksum failure, read stale
- * data (raced with allocator), we retry the portion of the parent bio
- * that failed (i.e. this bio's portion, bvec_iter).
- *
- * But we need to stash the inode somewhere:
- */
- u64 inode;
-
struct work_struct work;
struct bio bio;
@@ -98,36 +90,33 @@ struct bch_write_op {
struct bch_fs *c;
struct workqueue_struct *io_wq;
- unsigned written; /* sectors */
-
- short error;
-
u16 flags;
+ u16 written; /* sectors */
+ s8 error;
+
unsigned csum_type:4;
unsigned compression_type:4;
unsigned nr_replicas:4;
+ unsigned nr_replicas_required:4;
unsigned alloc_reserve:4;
- unsigned nonce:14;
+
+ u8 open_buckets_nr;
+ struct bch_devs_list devs_have;
+ u16 target;
+ u16 nonce;
struct bpos pos;
struct bversion version;
- /* For BCH_WRITE_DATA_COMPRESSED: */
- struct bch_extent_crc128 crc;
- unsigned size;
+ /* For BCH_WRITE_DATA_ENCODED: */
+ struct bch_extent_crc_unpacked crc;
struct bch_devs_mask *devs;
- unsigned long write_point;
+ struct write_point_specifier write_point;
struct disk_reservation res;
- union {
u8 open_buckets[16];
- struct {
- struct bch_write_op *next;
- unsigned long expires;
- };
- };
/*
* If caller wants to flush but hasn't passed us a journal_seq ptr, we
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 37b342b9..5d9a298d 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -464,7 +464,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j,
if (invalid) {
bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf),
bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf);
+ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
+ type, invalid, buf);
le16_add_cpu(&entry->u64s, -k->k.u64s);
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -1568,35 +1569,31 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
+ spin_unlock(&j->lock);
while (ja->nr < nr) {
- /* must happen under journal lock, to avoid racing with gc: */
- long b = bch2_bucket_alloc(c, ca, RESERVE_ALLOC);
- if (b < 0) {
- if (!closure_wait(&c->freelist_wait, &cl)) {
- spin_unlock(&j->lock);
+ struct open_bucket *ob;
+ size_t bucket;
+ int ob_idx;
+
+ ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
+ if (ob_idx < 0) {
+ if (!closure_wait(&c->freelist_wait, &cl))
closure_sync(&cl);
- spin_lock(&j->lock);
- }
continue;
}
- bch2_mark_metadata_bucket(ca, &ca->buckets[b],
- BUCKET_JOURNAL, false);
- bch2_mark_alloc_bucket(ca, &ca->buckets[b], false);
+ ob = c->open_buckets + ob_idx;
+ bucket = sector_to_bucket(ca, ob->ptr.offset);
- memmove(ja->buckets + ja->last_idx + 1,
- ja->buckets + ja->last_idx,
- (ja->nr - ja->last_idx) * sizeof(u64));
- memmove(ja->bucket_seq + ja->last_idx + 1,
- ja->bucket_seq + ja->last_idx,
- (ja->nr - ja->last_idx) * sizeof(u64));
- memmove(journal_buckets->buckets + ja->last_idx + 1,
- journal_buckets->buckets + ja->last_idx,
- (ja->nr - ja->last_idx) * sizeof(u64));
+ spin_lock(&j->lock);
+ __array_insert_item(ja->buckets, ja->nr, ja->last_idx);
+ __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
+ __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
- ja->buckets[ja->last_idx] = b;
- journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
+ ja->buckets[ja->last_idx] = bucket;
+ ja->bucket_seq[ja->last_idx] = 0;
+ journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
if (ja->last_idx < ja->nr) {
if (ja->cur_idx >= ja->last_idx)
@@ -1604,9 +1601,14 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
ja->last_idx++;
}
ja->nr++;
+ spin_unlock(&j->lock);
+
+ bch2_mark_metadata_bucket(c, ca, &ca->buckets[bucket],
+ BUCKET_JOURNAL,
+ gc_phase(GC_PHASE_SB), 0);
+ bch2_open_bucket_put(c, ob);
}
- spin_unlock(&j->lock);
BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
@@ -1623,6 +1625,8 @@ err:
if (!ret)
bch2_dev_allocator_add(c, ca);
+ closure_sync(&cl);
+
return ret;
}
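The rewritten loop above allocates journal buckets through the regular open-bucket path (bch2_bucket_alloc() plus bch2_open_bucket_put()) instead of grabbing a raw bucket under the journal lock, and it drops the three open-coded memmove()s in favour of __array_insert_item() on each array. Judging from the removed lines and the separate ja->nr++ that follows, the shift each of those calls performs is equivalent to the hypothetical helper below (an illustration, not the bcachefs macro itself):

#include <stdint.h>
#include <string.h>

/*
 * Open a slot at index pos in an array currently holding nr elements
 * (the backing storage must have room for nr + 1):
 */
static void example_array_insert_slot(uint64_t *array, size_t nr, size_t pos)
{
	memmove(&array[pos + 1], &array[pos],
		(nr - pos) * sizeof(array[0]));
}

In the patch, pos is ja->last_idx and nr is ja->nr, matching the arguments of the memmove()s that were removed.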
diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h
index ea65f8e0..b7c8a861 100644
--- a/libbcachefs/keylist.h
+++ b/libbcachefs/keylist.h
@@ -7,8 +7,7 @@ int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
void bch2_keylist_pop_front(struct keylist *);
-static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys,
- size_t nr_inline_u64s)
+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
{
l->top_p = l->keys_p = inline_keys;
}
@@ -17,7 +16,7 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
if (l->keys_p != inline_keys)
kfree(l->keys_p);
- memset(l, 0, sizeof(*l));
+ bch2_keylist_init(l, inline_keys);
}
static inline void bch2_keylist_push(struct keylist *l)
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index d7f27a3d..8d1c0ee0 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -13,31 +13,16 @@
#include "move.h"
#include "super-io.h"
-static int issue_migration_move(struct bch_dev *ca,
- struct moving_context *ctxt,
- struct bch_devs_mask *devs,
- struct bkey_s_c k)
+static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
{
- struct bch_fs *c = ca->fs;
- struct disk_reservation res;
+ struct bch_dev *ca = arg;
const struct bch_extent_ptr *ptr;
- int ret;
-
- if (bch2_disk_reservation_get(c, &res, k.k->size, 0))
- return -ENOSPC;
- extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
+ extent_for_each_ptr(e, ptr)
if (ptr->dev == ca->dev_idx)
- goto found;
+ return true;
- BUG();
-found:
- /* XXX: we need to be doing something with the disk reservation */
-
- ret = bch2_data_move(c, ctxt, devs, k, ptr);
- if (ret)
- bch2_disk_reservation_put(c, &res);
- return ret;
+ return false;
}
#define MAX_DATA_OFF_ITER 10
@@ -58,10 +43,11 @@ found:
int bch2_move_data_off_device(struct bch_dev *ca)
{
- struct moving_context ctxt;
struct bch_fs *c = ca->fs;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 keys_moved, sectors_moved;
unsigned pass = 0;
- u64 seen_key_count;
int ret = 0;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
@@ -69,12 +55,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
return 0;
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
-
- bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
- __set_bit(ca->dev_idx, ctxt.avoid.d);
-
/*
* In theory, only one pass should be necessary as we've
* quiesced all writes before calling this.
@@ -91,69 +71,43 @@ int bch2_move_data_off_device(struct bch_dev *ca)
* Thus this scans the tree one more time than strictly necessary,
* but that can be viewed as a verification pass.
*/
-
do {
- struct btree_iter iter;
- struct bkey_s_c k;
-
- seen_key_count = 0;
- atomic_set(&ctxt.error_count, 0);
- atomic_set(&ctxt.error_flags, 0);
-
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_PREFETCH);
-
- while (!bch2_move_ctxt_wait(&ctxt) &&
- (k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = btree_iter_err(k))) {
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_has_device(bkey_s_c_to_extent(k),
- ca->dev_idx))
- goto next;
-
- ret = issue_migration_move(ca, &ctxt, NULL, k);
- if (ret == -ENOMEM) {
- bch2_btree_iter_unlock(&iter);
-
- /*
- * memory allocation failure, wait for some IO
- * to finish
- */
- bch2_move_ctxt_wait_for_io(&ctxt);
- continue;
- }
- if (ret == -ENOSPC)
- break;
- BUG_ON(ret);
+ ret = bch2_move_data(c, NULL,
+ SECTORS_IN_FLIGHT_PER_DEVICE,
+ NULL,
+ writepoint_hashed((unsigned long) current),
+ 0,
+ ca->dev_idx,
+ migrate_pred, ca,
+ &keys_moved,
+ &sectors_moved);
+ if (ret) {
+ bch_err(c, "error migrating data: %i", ret);
+ return ret;
+ }
+ } while (keys_moved && pass++ < MAX_DATA_OFF_ITER);
- seen_key_count++;
- continue;
-next:
- if (bkey_extent_is_data(k.k)) {
- ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
- BCH_DATA_USER);
- if (ret)
- break;
- }
- bch2_btree_iter_advance_pos(&iter);
- bch2_btree_iter_cond_resched(&iter);
+ if (keys_moved) {
+ bch_err(c, "unable to migrate all data in %d iterations",
+ MAX_DATA_OFF_ITER);
+ return -1;
+ }
- }
- bch2_btree_iter_unlock(&iter);
- bch2_move_ctxt_exit(&ctxt);
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
- if (ret)
- goto err;
- } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
+ if (!bkey_extent_is_data(k.k))
+ continue;
- if (seen_key_count) {
- pr_err("Unable to migrate all data in %d iterations.",
- MAX_DATA_OFF_ITER);
- ret = -1;
- goto err;
+ ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k),
+ BCH_DATA_USER);
+ if (ret) {
+ bch_err(c, "error migrating data %i from check_mark_super()", ret);
+ break;
+ }
}
-err:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
@@ -167,14 +121,11 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
struct btree_iter iter;
- struct closure cl;
struct btree *b;
int ret;
BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
- closure_init_stack(&cl);
-
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 0c5b924c..5eaf0cf8 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -9,41 +9,38 @@
#include "keylist.h"
#include <linux/ioprio.h>
+#include <linux/kthread.h>
#include <trace/events/bcachefs.h>
-static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
- struct bkey_s_extent e,
- struct bch_extent_ptr ptr)
-{
- struct bch_extent_ptr *ptr2;
- struct bch_dev *ca = c->devs[ptr.dev];
+struct moving_io {
+ struct list_head list;
+ struct closure cl;
+ bool read_completed;
+ unsigned sectors;
- extent_for_each_ptr(e, ptr2)
- if (ptr2->dev == ptr.dev &&
- ptr2->gen == ptr.gen &&
- PTR_BUCKET_NR(ca, ptr2) ==
- PTR_BUCKET_NR(ca, &ptr))
- return ptr2;
+ struct bch_read_bio rbio;
- return NULL;
-}
+ struct migrate_write write;
+ /* Must be last since it is variable size */
+ struct bio_vec bi_inline_vecs[0];
+};
-static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m,
- struct bkey_s_extent e)
-{
- const struct bch_extent_ptr *ptr;
- struct bch_extent_ptr *ret;
+struct moving_context {
+ /* Closure for waiting on all reads and writes to complete */
+ struct closure cl;
- if (m->move)
- ret = bkey_find_ptr(m->op.c, e, m->move_ptr);
- else
- extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr)
- if ((ret = bkey_find_ptr(m->op.c, e, *ptr)))
- break;
+ /* Key and sector moves issued, updated from submission context */
+ u64 keys_moved;
+ u64 sectors_moved;
+ atomic64_t sectors_raced;
- return ret;
-}
+ struct list_head reads;
+
+ atomic_t sectors_in_flight;
+
+ wait_queue_head_t wait;
+};
static int bch2_migrate_index_update(struct bch_write_op *op)
{
@@ -59,71 +56,78 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
BTREE_ITER_INTENT);
while (1) {
- struct bkey_s_extent insert =
- bkey_i_to_s_extent(bch2_keylist_front(keys));
struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
+ struct bkey_i_extent *insert, *new =
+ bkey_i_to_extent(bch2_keylist_front(keys));
+ BKEY_PADDED(k) _new, _insert;
struct bch_extent_ptr *ptr;
- struct bkey_s_extent e;
- BKEY_PADDED(k) new;
+ struct bch_extent_crc_unpacked crc;
+ bool did_work = false;
- if (!k.k) {
+ if (btree_iter_err(k)) {
ret = bch2_btree_iter_unlock(&iter);
break;
}
- if (!bkey_extent_is_data(k.k))
+ if (bversion_cmp(k.k->version, new->k.version) ||
+ !bkey_extent_is_data(k.k) ||
+ !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
+ m->ptr, m->offset))
goto nomatch;
- bkey_reassemble(&new.k, k);
- bch2_cut_front(iter.pos, &new.k);
- bch2_cut_back(insert.k->p, &new.k.k);
- e = bkey_i_to_s_extent(&new.k);
-
- /* hack - promotes can race: */
- if (m->promote)
- extent_for_each_ptr(insert, ptr)
- if (bch2_extent_has_device(e.c, ptr->dev))
- goto nomatch;
-
- ptr = bch2_migrate_matching_ptr(m, e);
- if (ptr) {
- int nr_new_dirty = bch2_extent_nr_dirty_ptrs(insert.s_c);
- unsigned insert_flags =
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL;
+ bkey_reassemble(&_insert.k, k);
+ insert = bkey_i_to_extent(&_insert.k);
+
+ bkey_copy(&_new.k, bch2_keylist_front(keys));
+ new = bkey_i_to_extent(&_new.k);
+
+ bch2_cut_front(iter.pos, &insert->k_i);
+ bch2_cut_back(new->k.p, &insert->k);
+ bch2_cut_back(insert->k.p, &new->k);
+
+ if (m->move_dev >= 0 &&
+ (ptr = (struct bch_extent_ptr *)
+ bch2_extent_has_device(extent_i_to_s_c(insert),
+ m->move_dev)))
+ bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
- /* copygc uses btree node reserve: */
- if (m->move)
- insert_flags |= BTREE_INSERT_USE_RESERVE;
- if (m->move) {
- nr_new_dirty -= !ptr->cached;
- __bch2_extent_drop_ptr(e, ptr);
+ extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
+ if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
+ /*
+ * raced with another move op? extent already
+ * has a pointer to the device we just wrote
+ * data to
+ */
+ continue;
}
- BUG_ON(nr_new_dirty < 0);
-
- memcpy_u64s(extent_entry_last(e),
- insert.v,
- bkey_val_u64s(insert.k));
- e.k->u64s += bkey_val_u64s(insert.k);
-
- bch2_extent_narrow_crcs(e);
- bch2_extent_drop_redundant_crcs(e);
- bch2_extent_normalize(c, e.s);
- bch2_extent_mark_replicas_cached(c, e, nr_new_dirty);
-
- ret = bch2_btree_insert_at(c, &op->res,
- NULL, op_journal_seq(op),
- insert_flags,
- BTREE_INSERT_ENTRY(&iter, &new.k));
- if (ret && ret != -EINTR)
- break;
- } else {
-nomatch:
- bch2_btree_iter_advance_pos(&iter);
+ bch2_extent_crc_append(insert, crc);
+ extent_ptr_append(insert, *ptr);
+ did_work = true;
}
+ if (!did_work)
+ goto nomatch;
+
+ bch2_extent_narrow_crcs(insert,
+ (struct bch_extent_crc_unpacked) { 0 });
+ bch2_extent_normalize(c, extent_i_to_s(insert).s);
+ bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
+
+ ret = bch2_btree_insert_at(c, &op->res,
+ NULL, op_journal_seq(op),
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ m->btree_insert_flags,
+ BTREE_INSERT_ENTRY(&iter, &insert->k_i));
+ if (!ret)
+ atomic_long_inc(&c->extent_migrate_done);
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret)
+ break;
+next:
while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
bch2_keylist_pop_front(keys);
if (bch2_keylist_empty(keys))
@@ -131,96 +135,83 @@ nomatch:
}
bch2_cut_front(iter.pos, bch2_keylist_front(keys));
+ continue;
+nomatch:
+ if (m->ctxt)
+ atomic64_add(k.k->p.offset - iter.pos.offset,
+ &m->ctxt->sectors_raced);
+ atomic_long_inc(&c->extent_migrate_raced);
+ trace_move_race(&new->k);
+ bch2_btree_iter_advance_pos(&iter);
+ goto next;
}
out:
bch2_btree_iter_unlock(&iter);
return ret;
}
-void bch2_migrate_write_init(struct bch_fs *c,
- struct migrate_write *m,
- struct bch_devs_mask *devs,
- struct bkey_s_c k,
- const struct bch_extent_ptr *move_ptr,
- unsigned flags)
+void bch2_migrate_write_init(struct migrate_write *m,
+ struct bch_read_bio *rbio)
{
- bkey_reassemble(&m->key, k);
-
- m->promote = false;
- m->move = move_ptr != NULL;
- if (move_ptr)
- m->move_ptr = *move_ptr;
-
- if (bkey_extent_is_cached(k.k) ||
- (move_ptr && move_ptr->cached))
- flags |= BCH_WRITE_CACHED;
+ /* write bio must own pages: */
+ BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+ m->ptr = rbio->pick.ptr;
+ m->offset = rbio->pos.offset - rbio->pick.crc.offset;
+ m->op.devs_have = rbio->devs_have;
+ m->op.pos = rbio->pos;
+ m->op.version = rbio->version;
+ m->op.crc = rbio->pick.crc;
+
+ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
+ m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
+ m->op.csum_type = m->op.crc.csum_type;
+ }
- bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 },
- devs, (unsigned long) current,
- bkey_start_pos(k.k), NULL,
- flags|BCH_WRITE_ONLY_SPECIFIED_DEVS);
+ if (m->move_dev >= 0)
+ bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev);
- if (m->move)
+ if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE)
m->op.alloc_reserve = RESERVE_MOVINGGC;
- m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k));
+ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
+ BCH_WRITE_PAGES_STABLE|
+ BCH_WRITE_PAGES_OWNED|
+ BCH_WRITE_DATA_ENCODED;
+
+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
m->op.nr_replicas = 1;
+ m->op.nr_replicas_required = 1;
m->op.index_update_fn = bch2_migrate_index_update;
}
-static void migrate_bio_init(struct moving_io *io, struct bio *bio,
- unsigned sectors)
+static void move_free(struct closure *cl)
{
- bio_init(bio, io->bi_inline_vecs,
- DIV_ROUND_UP(sectors, PAGE_SECTORS));
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
-
- bio->bi_iter.bi_size = sectors << 9;
- bio->bi_private = &io->cl;
- bch2_bio_map(bio, NULL);
-}
-
-static void moving_io_free(struct moving_io *io)
-{
- struct moving_context *ctxt = io->ctxt;
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_context *ctxt = io->write.ctxt;
struct bio_vec *bv;
int i;
- atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
- wake_up(&ctxt->wait);
-
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
if (bv->bv_page)
__free_page(bv->bv_page);
- kfree(io);
-}
-
-static void moving_error(struct moving_context *ctxt, unsigned flag)
-{
- atomic_inc(&ctxt->error_count);
- //atomic_or(flag, &ctxt->error_flags);
-}
-static void moving_write_done(struct closure *cl)
-{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
-
- if (io->write.op.error)
- moving_error(io->ctxt, MOVING_FLAG_WRITE);
-
- //if (io->replace.failures)
- // trace_copy_collision(q, &io->key.k);
+ atomic_sub(io->sectors, &ctxt->sectors_in_flight);
+ wake_up(&ctxt->wait);
- moving_io_free(io);
+ kfree(io);
}
-static void write_moving(struct closure *cl)
+static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct bch_write_op *op = &io->write.op;
- closure_call(&op->cl, bch2_write, NULL, &io->cl);
- closure_return_with_destructor(&io->cl, moving_write_done);
+ if (likely(!io->rbio.bio.bi_error)) {
+ bch2_migrate_write_init(&io->write, &io->rbio);
+ closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+ }
+
+ closure_return_with_destructor(cl, move_free);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@@ -231,16 +222,10 @@ static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
return io && io->read_completed ? io : NULL;
}
-static void read_moving_endio(struct bio *bio)
+static void move_read_endio(struct bio *bio)
{
- struct closure *cl = bio->bi_private;
- struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct moving_context *ctxt = io->ctxt;
-
- trace_move_read_done(&io->write.key.k);
-
- if (bio->bi_error)
- moving_error(io->ctxt, MOVING_FLAG_READ);
+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+ struct moving_context *ctxt = io->write.ctxt;
io->read_completed = true;
if (next_pending_write(ctxt))
@@ -249,58 +234,81 @@ static void read_moving_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
-int bch2_data_move(struct bch_fs *c,
- struct moving_context *ctxt,
- struct bch_devs_mask *devs,
- struct bkey_s_c k,
- const struct bch_extent_ptr *move_ptr)
+static int bch2_move_extent(struct bch_fs *c,
+ struct moving_context *ctxt,
+ struct bch_devs_mask *devs,
+ struct write_point_specifier wp,
+ int btree_insert_flags,
+ int move_device,
+ struct bkey_s_c k)
{
struct extent_pick_ptr pick;
struct moving_io *io;
+ const struct bch_extent_ptr *ptr;
+ struct bch_extent_crc_unpacked crc;
+ unsigned sectors = k.k->size, pages;
- bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick);
+ bch2_extent_pick_ptr(c, k, NULL, &pick);
if (IS_ERR_OR_NULL(pick.ca))
return pick.ca ? PTR_ERR(pick.ca) : 0;
- io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
- DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL);
- if (!io)
- return -ENOMEM;
+ /* write path might have to decompress data: */
+ extent_for_each_ptr_crc(bkey_s_c_to_extent(k), ptr, crc)
+ sectors = max_t(unsigned, sectors, crc.uncompressed_size);
- io->ctxt = ctxt;
+ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ io = kzalloc(sizeof(struct moving_io) +
+ sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ if (!io)
+ goto err;
- migrate_bio_init(io, &io->rbio.bio, k.k->size);
+ io->write.ctxt = ctxt;
+ io->sectors = k.k->size;
- bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
- io->rbio.bio.bi_end_io = read_moving_endio;
+ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
+ bio_set_prio(&io->write.op.wbio.bio,
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
- if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
+ bch2_bio_map(&io->write.op.wbio.bio, NULL);
+ if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) {
kfree(io);
- return -ENOMEM;
+ goto err;
}
- migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size);
+ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
+ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ io->rbio.bio.bi_iter.bi_size = sectors << 9;
- bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0);
+ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
+ io->rbio.bio.bi_end_io = move_read_endio;
- trace_move_read(&io->write.key.k);
+ __bch2_write_op_init(&io->write.op, c);
+ io->write.btree_insert_flags = btree_insert_flags;
+ io->write.move_dev = move_device;
+ io->write.op.devs = devs;
+ io->write.op.write_point = wp;
ctxt->keys_moved++;
ctxt->sectors_moved += k.k->size;
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
- atomic_add(k.k->size, &ctxt->sectors_in_flight);
+ trace_move_extent(k.k);
+
+ atomic_add(io->sectors, &ctxt->sectors_in_flight);
list_add_tail(&io->list, &ctxt->reads);
/*
- * dropped by read_moving_endio() - guards against use after free of
+ * dropped by move_read_endio() - guards against use after free of
* ctxt when doing wakeup
*/
- closure_get(&io->ctxt->cl);
- bch2_read_extent(c, &io->rbio, k, &pick, 0);
+ closure_get(&ctxt->cl);
+ bch2_read_extent(c, &io->rbio, bkey_s_c_to_extent(k),
+ &pick, BCH_READ_NODECODE);
return 0;
+err:
+ trace_move_alloc_fail(k.k);
+ return -ENOMEM;
}
static void do_pending_writes(struct moving_context *ctxt)
@@ -309,14 +317,7 @@ static void do_pending_writes(struct moving_context *ctxt)
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
-
- if (io->rbio.bio.bi_error) {
- moving_io_free(io);
- continue;
- }
-
- trace_move_write(&io->write.key.k);
- closure_call(&io->cl, write_moving, NULL, &ctxt->cl);
+ closure_call(&io->cl, move_write, NULL, &ctxt->cl);
}
}
@@ -330,18 +331,7 @@ do { \
next_pending_write(_ctxt) || (_cond)); \
} while (1)
-int bch2_move_ctxt_wait(struct moving_context *ctxt)
-{
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->sectors_in_flight) <
- ctxt->max_sectors_in_flight);
-
- return ctxt->rate
- ? bch2_ratelimit_wait_freezable_stoppable(ctxt->rate)
- : 0;
-}
-
-void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight);
@@ -350,7 +340,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
}
-void bch2_move_ctxt_exit(struct moving_context *ctxt)
+static void bch2_move_ctxt_exit(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight));
closure_sync(&ctxt->cl);
@@ -359,16 +349,92 @@ void bch2_move_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->sectors_in_flight));
}
-void bch2_move_ctxt_init(struct moving_context *ctxt,
- struct bch_ratelimit *rate,
- unsigned max_sectors_in_flight)
+static void bch2_move_ctxt_init(struct moving_context *ctxt)
{
memset(ctxt, 0, sizeof(*ctxt));
closure_init_stack(&ctxt->cl);
- ctxt->rate = rate;
- ctxt->max_sectors_in_flight = max_sectors_in_flight;
-
INIT_LIST_HEAD(&ctxt->reads);
init_waitqueue_head(&ctxt->wait);
}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bch_ratelimit *rate,
+ unsigned sectors_in_flight,
+ struct bch_devs_mask *devs,
+ struct write_point_specifier wp,
+ int btree_insert_flags,
+ int move_device,
+ move_pred_fn pred, void *arg,
+ u64 *keys_moved,
+ u64 *sectors_moved)
+{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct moving_context ctxt;
+ struct btree_iter iter;
+ BKEY_PADDED(k) tmp;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_move_ctxt_init(&ctxt);
+ bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+ BTREE_ITER_PREFETCH);
+
+ if (rate)
+ bch2_ratelimit_reset(rate);
+
+ while (!kthread || !(ret = kthread_should_stop())) {
+ if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) {
+ bch2_btree_iter_unlock(&iter);
+ move_ctxt_wait_event(&ctxt,
+ atomic_read(&ctxt.sectors_in_flight) <
+ sectors_in_flight);
+ }
+
+ if (rate &&
+ bch2_ratelimit_delay(rate) &&
+ (bch2_btree_iter_unlock(&iter),
+ (ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
+ break;
+
+ k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+ ret = btree_iter_err(k);
+ if (ret)
+ break;
+
+ if (!bkey_extent_is_data(k.k) ||
+ !pred(arg, bkey_s_c_to_extent(k)))
+ goto next;
+
+ /* unlock before doing IO: */
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+ bch2_btree_iter_unlock(&iter);
+
+ if (bch2_move_extent(c, &ctxt, devs, wp,
+ btree_insert_flags,
+ move_device, k)) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+
+ if (rate)
+ bch2_ratelimit_increment(rate, k.k->size);
+next:
+ bch2_btree_iter_advance_pos(&iter);
+ bch2_btree_iter_cond_resched(&iter);
+ }
+
+ bch2_btree_iter_unlock(&iter);
+ bch2_move_ctxt_exit(&ctxt);
+
+ trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved);
+
+ *keys_moved = ctxt.keys_moved;
+ *sectors_moved = ctxt.sectors_moved;
+
+ return ret;
+}
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index a756a462..2e884ce0 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -4,77 +4,31 @@
#include "buckets.h"
#include "io_types.h"
-enum moving_flag_bitnos {
- MOVING_FLAG_BITNO_READ = 0,
- MOVING_FLAG_BITNO_WRITE,
-};
-
-#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ)
-#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE)
+struct bch_read_bio;
+struct moving_context;
struct migrate_write {
- BKEY_PADDED(key);
- bool promote;
- bool move;
- struct bch_extent_ptr move_ptr;
- struct bch_write_op op;
-};
-
-void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
- struct bch_devs_mask *, struct bkey_s_c,
- const struct bch_extent_ptr *, unsigned);
-
-#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
-
-struct moving_context {
- /* Closure for waiting on all reads and writes to complete */
- struct closure cl;
-
- /* Number and types of errors reported */
- atomic_t error_count;
- atomic_t error_flags;
-
- /* Key and sector moves issued, updated from submission context */
- u64 keys_moved;
- u64 sectors_moved;
-
- /* Rate-limiter counting submitted reads */
- struct bch_ratelimit *rate;
-
- /* Try to avoid reading the following device */
- struct bch_devs_mask avoid;
-
- struct list_head reads;
+ struct moving_context *ctxt;
- /* Configuration */
- unsigned max_sectors_in_flight;
- atomic_t sectors_in_flight;
+ /* what we read: */
+ struct bch_extent_ptr ptr;
+ u64 offset;
- wait_queue_head_t wait;
+ int move_dev;
+ int btree_insert_flags;
+ struct bch_write_op op;
};
-struct moving_io {
- struct list_head list;
- struct rb_node node;
- struct closure cl;
- struct moving_context *ctxt;
- struct migrate_write write;
- bool read_completed;
-
- struct bch_read_bio rbio;
- /* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[0];
-};
+void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
-int bch2_data_move(struct bch_fs *, struct moving_context *,
- struct bch_devs_mask *, struct bkey_s_c,
- const struct bch_extent_ptr *);
+#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
-int bch2_move_ctxt_wait(struct moving_context *);
-void bch2_move_ctxt_wait_for_io(struct moving_context *);
+typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
-void bch2_move_ctxt_exit(struct moving_context *);
-void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *,
- unsigned);
+int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
+ unsigned, struct bch_devs_mask *,
+ struct write_point_specifier,
+ int, int, move_pred_fn, void *,
+ u64 *, u64 *);
#endif /* _BCACHEFS_MOVE_H */
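move.h now reduces the old per-caller machinery to one entry point: the caller supplies a move_pred_fn that decides, per extent, whether to move it, and bch2_move_data() handles iteration, rate limiting, read/write pipelining and the moved-keys/sectors accounting. A sketch of a caller, modelled on migrate_pred() and bch2_move_data_off_device() in migrate.c above (the example_* names are illustrative only):

/* move every extent that still has a pointer on ca: */
static bool example_has_dev_pred(void *arg, struct bkey_s_c_extent e)
{
	struct bch_dev *ca = arg;
	const struct bch_extent_ptr *ptr;

	extent_for_each_ptr(e, ptr)
		if (ptr->dev == ca->dev_idx)
			return true;

	return false;
}

static int example_evacuate_dev(struct bch_fs *c, struct bch_dev *ca)
{
	u64 keys_moved, sectors_moved;

	return bch2_move_data(c, NULL,			/* no rate limit */
			      SECTORS_IN_FLIGHT_PER_DEVICE,
			      NULL,			/* no device mask */
			      writepoint_hashed((unsigned long) current),
			      0,			/* btree_insert_flags */
			      ca->dev_idx,		/* move_device: drop this ptr */
			      example_has_dev_pred, ca,
			      &keys_moved, &sectors_moved);
}

copygc_pred()/bch2_copygc() in movinggc.c below use the same interface with a device mask, a reserved write point and BTREE_INSERT_USE_RESERVE.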
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 125159ee..728be2ba 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -6,6 +6,7 @@
#include "bcachefs.h"
#include "btree_iter.h"
+#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "extents.h"
@@ -23,137 +24,63 @@
#include <linux/sort.h>
#include <linux/wait.h>
-/* Moving GC - IO loop */
-
-static int bucket_idx_cmp(const void *_l, const void *_r, size_t size)
-{
- const struct bucket_heap_entry *l = _l;
- const struct bucket_heap_entry *r = _r;
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+#define COPYGC_BUCKETS_PER_ITER(ca) \
+ ((ca)->free[RESERVE_MOVINGGC].size / 2)
- if (l->bucket < r->bucket)
- return -1;
- if (l->bucket > r->bucket)
- return 1;
- return 0;
-}
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+#define COPYGC_SECTORS_PER_ITER(ca) \
+ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
-static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca,
- struct bkey_s_c k)
+static inline int sectors_used_cmp(copygc_heap *heap,
+ struct copygc_heap_entry l,
+ struct copygc_heap_entry r)
{
- bucket_heap *h = &ca->copygc_heap;
- const struct bch_extent_ptr *ptr;
-
- if (bkey_extent_is_data(k.k) &&
- (ptr = bch2_extent_has_device(bkey_s_c_to_extent(k),
- ca->dev_idx))) {
- struct bucket_heap_entry search = {
- .bucket = PTR_BUCKET_NR(ca, ptr)
- };
-
- size_t i = eytzinger0_find(h->data, h->used,
- sizeof(h->data[0]),
- bucket_idx_cmp, &search);
-
- if (i < h->used)
- return ptr;
- }
-
- return NULL;
+ return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
}
-static int issue_moving_gc_move(struct bch_dev *ca,
- struct moving_context *ctxt,
- struct bkey_s_c k)
+static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
- struct bch_fs *c = ca->fs;
- const struct bch_extent_ptr *ptr;
- int ret;
+ const struct copygc_heap_entry *l = _l;
+ const struct copygc_heap_entry *r = _r;
- ptr = moving_pred(ca, k);
- if (!ptr) /* We raced - bucket's been reused */
- return 0;
-
- ret = bch2_data_move(c, ctxt, &ca->self, k, ptr);
- if (!ret)
- trace_gc_copy(k.k);
- else
- trace_moving_gc_alloc_fail(c, k.k->size);
- return ret;
+ return (l->offset > r->offset) - (l->offset < r->offset);
}
-static void read_moving(struct bch_dev *ca, size_t buckets_to_move,
- u64 sectors_to_move)
+static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
{
- struct bch_fs *c = ca->fs;
- bucket_heap *h = &ca->copygc_heap;
- struct moving_context ctxt;
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 sectors_not_moved = 0;
- size_t buckets_not_moved = 0;
- struct bucket_heap_entry *i;
-
- bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
- bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
- SECTORS_IN_FLIGHT_PER_DEVICE);
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_PREFETCH);
-
- while (1) {
- if (kthread_should_stop())
- goto out;
- if (bch2_move_ctxt_wait(&ctxt))
- goto out;
- k = bch2_btree_iter_peek(&iter);
- if (!k.k)
- break;
- if (btree_iter_err(k))
- goto out;
-
- if (!moving_pred(ca, k))
- goto next;
+ struct bch_dev *ca = arg;
+ copygc_heap *h = &ca->copygc_heap;
+ const struct bch_extent_ptr *ptr =
+ bch2_extent_has_device(e, ca->dev_idx);
- if (issue_moving_gc_move(ca, &ctxt, k)) {
- bch2_btree_iter_unlock(&iter);
+ if (ptr) {
+ struct copygc_heap_entry search = { .offset = ptr->offset };
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(&ctxt);
- continue;
- }
-next:
- bch2_btree_iter_advance_pos(&iter);
- //bch2_btree_iter_cond_resched(&iter);
+ size_t i = eytzinger0_find_le(h->data, h->used,
+ sizeof(h->data[0]),
+ bucket_offset_cmp, &search);
- /* unlock before calling moving_context_wait() */
- bch2_btree_iter_unlock(&iter);
- cond_resched();
+ return (i >= 0 &&
+ ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+ ptr->gen == h->data[i].mark.gen);
}
- bch2_btree_iter_unlock(&iter);
- bch2_move_ctxt_exit(&ctxt);
- trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
- buckets_to_move);
-
- /* don't check this if we bailed out early: */
- for (i = h->data; i < h->data + h->used; i++) {
- struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark);
-
- if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
- sectors_not_moved += bucket_sectors_used(m);
- buckets_not_moved++;
- }
- }
-
- if (sectors_not_moved)
- bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
- sectors_not_moved, sectors_to_move,
- buckets_not_moved, buckets_to_move);
- return;
-out:
- bch2_btree_iter_unlock(&iter);
- bch2_move_ctxt_exit(&ctxt);
- trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
- buckets_to_move);
+ return false;
}
static bool have_copygc_reserve(struct bch_dev *ca)
@@ -168,38 +95,17 @@ static bool have_copygc_reserve(struct bch_dev *ca)
return ret;
}
-static inline int sectors_used_cmp(bucket_heap *heap,
- struct bucket_heap_entry l,
- struct bucket_heap_entry r)
-{
- return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
-}
-
-static void bch2_moving_gc(struct bch_dev *ca)
+static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_fs *c = ca->fs;
+ copygc_heap *h = &ca->copygc_heap;
+ struct copygc_heap_entry e, *i;
struct bucket *g;
- u64 sectors_to_move = 0;
- size_t buckets_to_move, buckets_unused = 0;
- struct bucket_heap_entry e, *i;
- int reserve_sectors;
-
- if (!have_copygc_reserve(ca)) {
- struct closure cl;
-
- closure_init_stack(&cl);
- while (1) {
- closure_wait(&c->freelist_wait, &cl);
- if (have_copygc_reserve(ca))
- break;
- closure_sync(&cl);
- }
- closure_wake_up(&c->freelist_wait);
- }
-
- reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
+ u64 keys_moved, sectors_moved;
+ u64 sectors_to_move = 0, sectors_not_moved = 0;
+ u64 buckets_to_move, buckets_not_moved = 0;
+ int ret;
- trace_moving_gc_start(ca);
+ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
/*
* Find buckets with lowest sector counts, skipping completely
@@ -213,48 +119,73 @@ static void bch2_moving_gc(struct bch_dev *ca)
* them:
*/
down_read(&c->gc_lock);
- ca->copygc_heap.used = 0;
+ h->used = 0;
for_each_bucket(g, ca) {
struct bucket_mark m = READ_ONCE(g->mark);
- struct bucket_heap_entry e = { g - ca->buckets, m };
-
- if (bucket_unused(m)) {
- buckets_unused++;
- continue;
- }
+ struct copygc_heap_entry e;
if (m.owned_by_allocator ||
- m.data_type != BUCKET_DATA)
+ m.data_type != BUCKET_DATA ||
+ !bucket_sectors_used(m) ||
+ bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
- if (bucket_sectors_used(m) >= ca->mi.bucket_size)
- continue;
-
- heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp);
+ e = (struct copygc_heap_entry) {
+ .offset = bucket_to_sector(ca, g - ca->buckets),
+ .mark = m
+ };
+ heap_add_or_replace(h, e, -sectors_used_cmp);
}
up_read(&c->gc_lock);
- for (i = ca->copygc_heap.data;
- i < ca->copygc_heap.data + ca->copygc_heap.used;
- i++)
+ for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += bucket_sectors_used(i->mark);
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
- BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp));
+ BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
sectors_to_move -= bucket_sectors_used(e.mark);
}
- buckets_to_move = ca->copygc_heap.used;
+ buckets_to_move = h->used;
+
+ if (!buckets_to_move)
+ return;
+
+ eytzinger0_sort(h->data, h->used,
+ sizeof(h->data[0]),
+ bucket_offset_cmp, NULL);
+
+ ret = bch2_move_data(c, &ca->copygc_pd.rate,
+ SECTORS_IN_FLIGHT_PER_DEVICE,
+ &ca->self,
+ writepoint_ptr(&ca->copygc_write_point),
+ BTREE_INSERT_USE_RESERVE,
+ ca->dev_idx,
+ copygc_pred, ca,
+ &keys_moved,
+ &sectors_moved);
+
+ for (i = h->data; i < h->data + h->used; i++) {
+ size_t bucket = sector_to_bucket(ca, i->offset);
+ struct bucket_mark m = READ_ONCE(ca->buckets[bucket].mark);
+
+ if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
+ sectors_not_moved += bucket_sectors_used(m);
+ buckets_not_moved++;
+ }
+ }
- eytzinger0_sort(ca->copygc_heap.data,
- ca->copygc_heap.used,
- sizeof(ca->copygc_heap.data[0]),
- bucket_idx_cmp, NULL);
+ if (sectors_not_moved && !ret)
+ bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
+ sectors_not_moved, sectors_to_move,
+ buckets_not_moved, buckets_to_move);
- read_moving(ca, buckets_to_move, sectors_to_move);
+ trace_copygc(ca,
+ sectors_moved, sectors_not_moved,
+ buckets_to_move, buckets_not_moved);
}
-static int bch2_moving_gc_thread(void *arg)
+static int bch2_copygc_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
@@ -273,7 +204,7 @@ static int bch2_moving_gc_thread(void *arg)
* don't start copygc until less than half the gc reserve is
* available:
*/
- available = dev_buckets_available(ca);
+ available = dev_buckets_available(c, ca);
want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
c->opts.gc_reserve_percent, 200);
if (available > want) {
@@ -283,46 +214,46 @@ static int bch2_moving_gc_thread(void *arg)
continue;
}
- bch2_moving_gc(ca);
+ bch2_copygc(c, ca);
}
return 0;
}
-void bch2_moving_gc_stop(struct bch_dev *ca)
+void bch2_copygc_stop(struct bch_dev *ca)
{
- ca->moving_gc_pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
+ ca->copygc_pd.rate.rate = UINT_MAX;
+ bch2_ratelimit_reset(&ca->copygc_pd.rate);
- if (ca->moving_gc_read)
- kthread_stop(ca->moving_gc_read);
- ca->moving_gc_read = NULL;
+ if (ca->copygc_thread)
+ kthread_stop(ca->copygc_thread);
+ ca->copygc_thread = NULL;
}
-int bch2_moving_gc_start(struct bch_dev *ca)
+int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
{
struct task_struct *t;
- BUG_ON(ca->moving_gc_read);
+ BUG_ON(ca->copygc_thread);
- if (ca->fs->opts.nochanges)
+ if (c->opts.nochanges)
return 0;
- if (bch2_fs_init_fault("moving_gc_start"))
+ if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
- t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read");
+ t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
if (IS_ERR(t))
return PTR_ERR(t);
- ca->moving_gc_read = t;
- wake_up_process(ca->moving_gc_read);
+ ca->copygc_thread = t;
+ wake_up_process(ca->copygc_thread);
return 0;
}
-void bch2_dev_moving_gc_init(struct bch_dev *ca)
+void bch2_dev_copygc_init(struct bch_dev *ca)
{
- bch2_pd_controller_init(&ca->moving_gc_pd);
- ca->moving_gc_pd.d_term = 0;
+ bch2_pd_controller_init(&ca->copygc_pd);
+ ca->copygc_pd.d_term = 0;
}
diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h
index d835d138..c46fa1f1 100644
--- a/libbcachefs/movinggc.h
+++ b/libbcachefs/movinggc.h
@@ -1,30 +1,8 @@
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
-/*
- * We can't use the entire copygc reserve in one iteration of copygc: we may
- * need the buckets we're freeing up to go back into the copygc reserve to make
- * forward progress, but if the copygc reserve is full they'll be available for
- * any allocation - and it's possible that in a given iteration, we free up most
- * of the buckets we're going to free before we allocate most of the buckets
- * we're going to allocate.
- *
- * If we only use half of the reserve per iteration, then in steady state we'll
- * always have room in the reserve for the buckets we're going to need in the
- * next iteration:
- */
-#define COPYGC_BUCKETS_PER_ITER(ca) \
- ((ca)->free[RESERVE_MOVINGGC].size / 2)
-
-/*
- * Max sectors to move per iteration: Have to take into account internal
- * fragmentation from the multiple write points for each generation:
- */
-#define COPYGC_SECTORS_PER_ITER(ca) \
- ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
-
-void bch2_moving_gc_stop(struct bch_dev *);
-int bch2_moving_gc_start(struct bch_dev *);
-void bch2_dev_moving_gc_init(struct bch_dev *);
+void bch2_copygc_stop(struct bch_dev *);
+int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
+void bch2_dev_copygc_init(struct bch_dev *);
#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 1e4eafb2..a3ecfb92 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -425,6 +425,11 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (err)
return err;
+ if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
+ bch2_sb_get_crypt(sb) &&
+ BCH_SB_INITIALIZED(sb))
+ return "Incompatible extent nonces";
+
sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
return NULL;
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 0342778d..4e8b0a51 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -20,6 +20,7 @@
#include "debug.h"
#include "error.h"
#include "fs.h"
+#include "fs-io.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
@@ -209,7 +210,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_tiering_stop(c);
for_each_member_device(ca, c, i)
- bch2_moving_gc_stop(ca);
+ bch2_copygc_stop(ca);
bch2_gc_thread_stop(c);
@@ -258,12 +259,8 @@ void bch2_fs_read_only(struct bch_fs *c)
*/
percpu_ref_kill(&c->writes);
- del_timer(&c->foreground_write_wakeup);
cancel_delayed_work(&c->pd_controllers_update);
- c->foreground_write_pd.rate.rate = UINT_MAX;
- bch2_wake_delayed_writes((unsigned long) c);
-
/*
* If we're not doing an emergency shutdown, we want to wait on
* outstanding writes to complete so they don't see spurious errors due
@@ -348,9 +345,9 @@ const char *bch2_fs_read_write(struct bch_fs *c)
if (bch2_gc_thread_start(c))
goto err;
- err = "error starting moving GC thread";
+ err = "error starting copygc thread";
for_each_rw_member(ca, c, i)
- if (bch2_moving_gc_start(ca)) {
+ if (bch2_copygc_start(c, ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
@@ -375,6 +372,7 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
+ bch2_fs_fsio_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
@@ -411,7 +409,6 @@ static void bch2_fs_exit(struct bch_fs *c)
{
unsigned i;
- del_timer_sync(&c->foreground_write_wakeup);
cancel_delayed_work_sync(&c->pd_controllers_update);
cancel_work_sync(&c->read_only_work);
@@ -535,8 +532,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->tiering_enabled = 1;
c->tiering_percent = 10;
- c->foreground_target_percent = 20;
-
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
c->journal.blocked_time = &c->journal_blocked_time;
@@ -600,7 +595,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_cache_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
- bch2_check_set_has_compressed_data(c, c->opts.compression))
+ bch2_check_set_has_compressed_data(c, c->opts.compression) ||
+ bch2_fs_fsio_init(c))
goto err;
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
@@ -1105,8 +1101,10 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
+ writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
+
spin_lock_init(&ca->freelist_lock);
- bch2_dev_moving_gc_init(ca);
+ bch2_dev_copygc_init(ca);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@@ -1224,10 +1222,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
- lg_local_lock(&c->usage_lock);
- if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
- bch2_mark_dev_metadata(c, ca);
- lg_local_unlock(&c->usage_lock);
+ bch2_mark_dev_superblock(c, ca, 0);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@@ -1324,7 +1319,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
- bch2_moving_gc_stop(ca);
+ bch2_copygc_stop(ca);
/*
* This stops new data writes (e.g. to existing open data
@@ -1347,8 +1342,8 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
if (bch2_dev_allocator_start(ca))
return "error starting allocator thread";
- if (bch2_moving_gc_start(ca))
- return "error starting moving GC thread";
+ if (bch2_copygc_start(c, ca))
+ return "error starting copygc thread";
if (bch2_tiering_start(c))
return "error starting tiering thread";
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 18e36c08..eb1d2f3d 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -35,6 +35,30 @@ static inline unsigned dev_mask_nr(struct bch_devs_mask *devs)
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
}
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+ unsigned dev)
+{
+ unsigned i;
+
+ for (i = 0; i < devs.nr; i++)
+ if (devs.devs[i] == dev)
+ return true;
+
+ return false;
+}
+
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ unsigned i;
+
+ for (i = 0; i < devs->nr; i++)
+ if (devs->devs[i] == dev) {
+ array_remove_item(devs->devs, devs->nr, i);
+ return;
+ }
+}
+
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 756dfeba..35c8bebf 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -13,4 +13,33 @@ struct bch_devs_mask {
unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
};
+struct bch_devs_list {
+ u8 nr;
+ u8 devs[BCH_REPLICAS_MAX];
+};
+
+struct bch_member_cpu {
+ u64 nbuckets; /* device size */
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u8 state;
+ u8 tier;
+ u8 replacement;
+ u8 discard;
+ u8 data_allowed;
+ u8 valid;
+};
+
+struct bch_replicas_cpu_entry {
+ u8 data_type;
+ u8 devs[BCH_SB_MEMBERS_MAX / 8];
+};
+
+struct bch_replicas_cpu {
+ struct rcu_head rcu;
+ unsigned nr;
+ unsigned entry_size;
+ struct bch_replicas_cpu_entry entries[];
+};
+
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index c20769b7..35f1e561 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -161,8 +161,11 @@ read_attribute(meta_buckets);
read_attribute(alloc_buckets);
read_attribute(has_data);
read_attribute(alloc_debug);
+write_attribute(wake_allocator);
read_attribute(read_realloc_races);
+read_attribute(extent_migrate_done);
+read_attribute(extent_migrate_raced);
rw_attribute(journal_write_delay_ms);
rw_attribute(journal_reclaim_delay_ms);
@@ -170,7 +173,6 @@ rw_attribute(journal_reclaim_delay_ms);
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
-rw_attribute(foreground_write_ratelimit_enabled);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
@@ -179,12 +181,9 @@ rw_attribute(tiering_enabled);
rw_attribute(tiering_percent);
sysfs_pd_controller_attribute(tiering);
-sysfs_pd_controller_attribute(foreground_write);
rw_attribute(pd_controllers_update_seconds);
-rw_attribute(foreground_target_percent);
-
read_attribute(meta_replicas_have);
read_attribute(data_replicas_have);
@@ -272,18 +271,18 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
if (k.k->type == BCH_EXTENT) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
+ struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
- if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) {
+ if (crc.compression_type == BCH_COMPRESSION_NONE) {
nr_uncompressed_extents++;
uncompressed_sectors += e.k->size;
} else {
nr_compressed_extents++;
compressed_sectors_compressed +=
- crc_compressed_size(e.k, crc);
+ crc.compressed_size;
compressed_sectors_uncompressed +=
- crc_uncompressed_size(e.k, crc);
+ crc.uncompressed_size;
}
/* only looking at the first ptr */
@@ -323,17 +322,17 @@ SHOW(bch2_fs)
sysfs_print(read_realloc_races,
atomic_long_read(&c->read_realloc_races));
+ sysfs_print(extent_migrate_done,
+ atomic_long_read(&c->extent_migrate_done));
+ sysfs_print(extent_migrate_raced,
+ atomic_long_read(&c->extent_migrate_raced));
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
- sysfs_printf(foreground_write_ratelimit_enabled, "%i",
- c->foreground_write_ratelimit_enabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
- sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd);
sysfs_print(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
- sysfs_print(foreground_target_percent, c->foreground_target_percent);
sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
sysfs_print(tiering_percent, c->tiering_percent);
@@ -371,9 +370,6 @@ STORE(__bch2_fs)
sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
- sysfs_strtoul(foreground_write_ratelimit_enabled,
- c->foreground_write_ratelimit_enabled);
-
if (attr == &sysfs_btree_gc_periodic) {
ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
?: (ssize_t) size;
@@ -389,8 +385,8 @@ STORE(__bch2_fs)
?: (ssize_t) size;
for_each_member_device(ca, c, i)
- if (ca->moving_gc_read)
- wake_up_process(ca->moving_gc_read);
+ if (ca->copygc_thread)
+ wake_up_process(ca->copygc_thread);
return ret;
}
@@ -402,11 +398,8 @@ STORE(__bch2_fs)
return ret;
}
- sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
-
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
- sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
sysfs_strtoul(tiering_percent, c->tiering_percent);
sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */
@@ -466,7 +459,6 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
- &sysfs_foreground_target_percent,
&sysfs_tiering_percent,
&sysfs_compression_stats,
@@ -494,17 +486,17 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_pins,
&sysfs_read_realloc_races,
+ &sysfs_extent_migrate_done,
+ &sysfs_extent_migrate_raced,
&sysfs_trigger_journal_flush,
&sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
&sysfs_prune_cache,
- &sysfs_foreground_write_ratelimit_enabled,
&sysfs_copy_gc_enabled,
&sysfs_tiering_enabled,
sysfs_pd_controller_files(tiering),
- sysfs_pd_controller_files(foreground_write),
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -710,17 +702,23 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
{
struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n"
"free[RESERVE_BTREE]: %zu/%zu\n"
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
"free[RESERVE_NONE]: %zu/%zu\n"
- "alloc: %llu/%llu\n"
- "meta: %llu/%llu\n"
- "dirty: %llu/%llu\n"
- "available: %llu/%llu\n"
+ "buckets:\n"
+ " capacity: %llu\n"
+ " alloc: %llu\n"
+ " meta: %llu\n"
+ " dirty: %llu\n"
+ " available: %llu\n"
+ "sectors:\n"
+ " meta: %llu\n"
+ " dirty: %llu\n"
+ " cached: %llu\n"
"freelist_wait: %s\n"
"open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n",
@@ -728,10 +726,14 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
- stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
- stats.buckets[S_META], ca->mi.nbuckets - ca->mi.first_bucket,
- stats.buckets[S_DIRTY], ca->mi.nbuckets - ca->mi.first_bucket,
- __dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
+ ca->mi.nbuckets - ca->mi.first_bucket,
+ stats.buckets_alloc,
+ stats.buckets[S_META],
+ stats.buckets[S_DIRTY],
+ __dev_buckets_available(ca, stats),
+ stats.sectors[S_META],
+ stats.sectors[S_DIRTY],
+ stats.sectors_cached,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty");
@@ -769,7 +771,7 @@ SHOW(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
char *out = buf, *end = buf + PAGE_SIZE;
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
@@ -788,8 +790,8 @@ SHOW(bch2_dev)
sysfs_print(cached_buckets, stats.buckets_cached);
sysfs_print(meta_buckets, stats.buckets[S_META]);
sysfs_print(alloc_buckets, stats.buckets_alloc);
- sysfs_print(available_buckets, dev_buckets_available(ca));
- sysfs_print(free_buckets, dev_buckets_free(ca));
+ sysfs_print(available_buckets, __dev_buckets_available(ca, stats));
+ sysfs_print(free_buckets, __dev_buckets_free(ca, stats));
if (attr == &sysfs_has_data) {
out += bch2_scnprint_flag_list(out, end - out,
@@ -799,7 +801,7 @@ SHOW(bch2_dev)
return out - buf;
}
- sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
+ sysfs_pd_controller_show(copy_gc, &ca->copygc_pd);
if (attr == &sysfs_cache_replacement_policy) {
out += bch2_scnprint_string_list(out, end - out,
@@ -843,7 +845,7 @@ STORE(bch2_dev)
struct bch_fs *c = ca->fs;
struct bch_member *mi;
- sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
+ sysfs_pd_controller_store(copy_gc, &ca->copygc_pd);
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@@ -899,6 +901,9 @@ STORE(bch2_dev)
bch2_tiering_start(c);
}
+ if (attr == &sysfs_wake_allocator)
+ bch2_wake_allocator(ca);
+
return size;
}
SYSFS_OPS(bch2_dev);
@@ -942,6 +947,7 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
+ &sysfs_wake_allocator,
sysfs_pd_controller_files(copy_gc),
NULL
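
Note on the new per-device wake_allocator file: like the trigger_* files in
bch2_fs_internal_files, it is a write-only trigger; the STORE hook above ignores
the written bytes and simply calls bch2_wake_allocator(ca). A minimal sketch of
that trigger pattern, using hypothetical sysfs_my_trigger / my_kick() names that
are not part of this patch:

static ssize_t my_dev_store(struct kobject *kobj, struct attribute *attr,
			    const char *buf, size_t size)
{
	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);

	if (attr == &sysfs_my_trigger)
		my_kick(ca);		/* hypothetical: wake the worker thread */

	/* the written data itself is ignored; any write acts as the signal */
	return size;
}

From userspace, writing anything to the file (e.g. a single "1") fires the trigger.
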
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index cbfcfccc..2e29f741 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -15,105 +15,23 @@
#include <linux/kthread.h>
#include <trace/events/bcachefs.h>
-struct tiering_state {
- struct bch_tier *tier;
- unsigned sectors;
- unsigned stripe_size;
- unsigned dev_idx;
- struct bch_dev *ca;
-};
-
-static bool tiering_pred(struct bch_fs *c,
- struct bch_tier *tier,
- struct bkey_s_c k)
+static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
{
- if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- unsigned replicas = 0;
-
- /* Make sure we have room to add a new pointer: */
- if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
- BKEY_EXTENT_VAL_U64s_MAX)
- return false;
-
- extent_for_each_ptr(e, ptr)
- if (c->devs[ptr->dev]->mi.tier >= tier->idx)
- replicas++;
-
- return replicas < c->opts.data_replicas;
- }
-
- return false;
-}
-
-static int issue_tiering_move(struct bch_fs *c,
- struct bch_tier *tier,
- struct moving_context *ctxt,
- struct bkey_s_c k)
-{
- int ret;
-
- ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL);
- if (!ret)
- trace_tiering_copy(k.k);
- else
- trace_tiering_alloc_fail(c, k.k->size);
-
- return ret;
-}
-
-/**
- * tiering_next_cache - issue a move to write an extent to the next cache
- * device in round robin order
- */
-static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
-{
- struct moving_context ctxt;
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned nr_devices = dev_mask_nr(&tier->devs);
- int ret;
-
- if (!nr_devices)
- return 0;
-
- trace_tiering_start(c);
-
- bch2_move_ctxt_init(&ctxt, &tier->pd.rate,
- nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
- bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
- BTREE_ITER_PREFETCH);
-
- while (!kthread_should_stop() &&
- !bch2_move_ctxt_wait(&ctxt) &&
- (k = bch2_btree_iter_peek(&iter)).k &&
- !btree_iter_err(k)) {
- if (!tiering_pred(c, tier, k))
- goto next;
-
- ret = issue_tiering_move(c, tier, &ctxt, k);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
-
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(&ctxt);
- continue;
- }
-next:
- bch2_btree_iter_advance_pos(&iter);
- //bch2_btree_iter_cond_resched(&iter);
+ struct bch_tier *tier = arg;
+ struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
+ const struct bch_extent_ptr *ptr;
+ unsigned replicas = 0;
- /* unlock before calling moving_context_wait() */
- bch2_btree_iter_unlock(&iter);
- cond_resched();
- }
+ /* Make sure we have room to add a new pointer: */
+ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+ BKEY_EXTENT_VAL_U64s_MAX)
+ return false;
- bch2_btree_iter_unlock(&iter);
- bch2_move_ctxt_exit(&ctxt);
- trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
+ extent_for_each_ptr(e, ptr)
+ if (c->devs[ptr->dev]->mi.tier >= tier->idx)
+ replicas++;
- return ctxt.sectors_moved;
+ return replicas < c->opts.data_replicas;
}
static int bch2_tiering_thread(void *arg)
@@ -122,15 +40,15 @@ static int bch2_tiering_thread(void *arg)
struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
struct io_clock *clock = &c->io_clock[WRITE];
struct bch_dev *ca;
- u64 tier_capacity, available_sectors;
+ u64 tier_capacity, available_sectors, keys_moved, sectors_moved;
unsigned long last;
- unsigned i;
+ unsigned i, nr_devices;
set_freezable();
while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->tiering_enabled &&
- dev_mask_nr(&tier->devs)))
+ (nr_devices = dev_mask_nr(&tier->devs))))
break;
while (1) {
@@ -151,7 +69,7 @@ static int bch2_tiering_thread(void *arg)
ca->mi.first_bucket);
available_sectors +=
bucket_to_sector(ca,
- dev_buckets_available(ca));
+ dev_buckets_available(c, ca));
}
rcu_read_unlock();
}
@@ -167,7 +85,15 @@ static int bch2_tiering_thread(void *arg)
return 0;
}
- read_tiering(c, tier);
+ bch2_move_data(c, &tier->pd.rate,
+ SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
+ &tier->devs,
+ writepoint_ptr(&tier->wp),
+ 0,
+ -1,
+ tiering_pred, tier,
+ &keys_moved,
+ &sectors_moved);
}
return 0;
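
The per-tier read_tiering() walker is gone: the thread now hands tiering_pred
plus its bch_tier argument to the generic bch2_move_data() path, which iterates
the extent btree itself and moves only the extents the predicate accepts. A
minimal sketch of wiring a different predicate into the same call, assuming the
signatures shown in this hunk (always_move is a hypothetical example predicate;
the 0 and -1 arguments are passed through exactly as in the tiering call above):

/* hypothetical predicate: accept every extent, ignore the context argument */
static bool always_move(void *arg, struct bkey_s_c_extent e)
{
	return true;
}

	u64 keys_moved, sectors_moved;

	bch2_move_data(c, &tier->pd.rate,
		       SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
		       &tier->devs,
		       writepoint_ptr(&tier->wp),
		       0, -1,
		       always_move, NULL,
		       &keys_moved, &sectors_moved);
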
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 2eb8ca72..fa853750 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -291,13 +291,15 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+
while (1) {
u64 delay = bch2_ratelimit_delay(d);
if (delay)
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
+ if (kthread && kthread_should_stop())
return 1;
if (!delay)
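
The PF_KTHREAD check exists because kthread_should_stop() may only be called
from a kernel thread (it reads kthread-private state hanging off current); with
this change the ratelimit wait can also be used from ordinary process context,
where it simply never reports a stop request. A sketch of the same guard in
isolation (wait_or_stop is our name, not part of the patch):

static int wait_or_stop(void)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;

	set_current_state(TASK_INTERRUPTIBLE);

	/* only a kthread may ask whether kthread_stop() was called on it */
	if (kthread && kthread_should_stop()) {
		__set_current_state(TASK_RUNNING);
		return 1;	/* tell the caller to exit its loop */
	}

	schedule();		/* sleep until something wakes us */
	return 0;
}
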
@@ -434,8 +436,11 @@ size_t bch2_rand_range(size_t max)
{
size_t rand;
+ if (!max)
+ return 0;
+
do {
- get_random_bytes(&rand, sizeof(rand));
+ rand = get_random_long();
rand &= roundup_pow_of_two(max) - 1;
} while (rand >= max);
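
bch2_rand_range() stays uniform by rejection sampling: the random value is
masked down to the next power of two at or above max, and anything >= max is
thrown away and redrawn, so no modulo bias is introduced (the added !max check
avoids an endless loop when max == 0, since rand >= 0 would always hold). A
worked userspace sketch of the same idea, assuming some uniform 64-bit source
rand64() (hypothetical):

/* same technique in userspace: mask to a power of two, redraw on overshoot */
static size_t rand_range(size_t max)
{
	size_t mask, r;

	if (!max)
		return 0;

	/* e.g. max = 100 -> mask = 127: every draw lands in 0..127 */
	for (mask = 1; mask < max; mask <<= 1)
		;
	mask -= 1;

	do {
		r = rand64() & mask;	/* rand64(): hypothetical uniform source */
	} while (r >= max);		/* reject 100..127, ~1.28 draws on average */

	return r;			/* uniform over 0..max-1 */
}
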
@@ -642,3 +647,129 @@ void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
return vpmalloc(size, gfp_mask);
}
+
+#if 0
+void eytzinger1_test(void)
+{
+ unsigned inorder, eytz, size;
+
+ pr_info("1 based eytzinger test:");
+
+ for (size = 2;
+ size < 65536;
+ size++) {
+ unsigned extra = eytzinger1_extra(size);
+
+ if (!(size % 4096))
+ pr_info("tree size %u", size);
+
+ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
+ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
+
+ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
+ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
+
+ inorder = 1;
+ eytzinger1_for_each(eytz, size) {
+ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
+ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
+ BUG_ON(eytz != eytzinger1_last(size) &&
+ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
+
+ inorder++;
+ }
+ }
+}
+
+void eytzinger0_test(void)
+{
+

+ unsigned inorder, eytz, size;
+
+ pr_info("0 based eytzinger test:");
+
+ for (size = 1;
+ size < 65536;
+ size++) {
+ unsigned extra = eytzinger0_extra(size);
+
+ if (!(size % 4096))
+ pr_info("tree size %u", size);
+
+ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
+ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
+
+ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
+ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
+
+ inorder = 0;
+ eytzinger0_for_each(eytz, size) {
+ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
+ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
+ BUG_ON(eytz != eytzinger0_last(size) &&
+ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
+
+ inorder++;
+ }
+ }
+}
+
+static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+{
+ const u16 *l = _l, *r = _r;
+
+	return (*l > *r) - (*r > *l);	/* three-way compare: -1, 0 or 1 */
+}
+
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+ int i, c1 = -1, c2 = -1;
+ ssize_t r;
+
+ r = eytzinger0_find_le(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0)
+ c1 = test_array[r];
+
+ for (i = 0; i < nr; i++)
+ if (test_array[i] <= search && test_array[i] > c2)
+ c2 = test_array[i];
+
+ if (c1 != c2) {
+ eytzinger0_for_each(i, nr)
+ pr_info("[%3u] = %12u", i, test_array[i]);
+ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
+			search, r, c1, c2);
+ }
+}
+
+void eytzinger0_find_test(void)
+{
+ unsigned i, nr, allocated = 1 << 12;
+ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
+
+ for (nr = 1; nr < allocated; nr++) {
+ pr_info("testing %u elems", nr);
+
+ get_random_bytes(test_array, nr * sizeof(test_array[0]));
+ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
+
+ /* verify array is sorted correctly: */
+ eytzinger0_for_each(i, nr)
+ BUG_ON(i != eytzinger0_last(nr) &&
+ test_array[i] > test_array[eytzinger0_next(i, nr)]);
+
+ for (i = 0; i < U16_MAX; i += 1 << 12)
+ eytzinger0_find_test_val(test_array, nr, i);
+
+ for (i = 0; i < nr; i++) {
+ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
+ eytzinger0_find_test_val(test_array, nr, test_array[i]);
+ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
+ }
+ }
+
+ kfree(test_array);
+}
+#endif
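
The (currently #if 0'd) tests exercise the eytzinger helpers: a 1-based
eytzinger array is a binary search tree stored breadth-first, so node i has its
children at 2*i and 2*i+1 and an in-order walk of that tree visits the elements
in sorted order; the tests check that prev/next traversal and the
inorder<->eytzinger index conversions round-trip. A small sketch of how such a
layout can be built from sorted input (the fill helper is ours, not part of the
tree):

/* assign sorted values to a 1-based eytzinger array by in-order recursion */
static void fill_eytzinger1(const int *sorted, int *out,
			    unsigned i, unsigned size, unsigned *pos)
{
	if (i > size)
		return;
	fill_eytzinger1(sorted, out, 2 * i,     size, pos);	/* left subtree  */
	out[i] = sorted[(*pos)++];				/* this node     */
	fill_eytzinger1(sorted, out, 2 * i + 1, size, pos);	/* right subtree */
}

/*
 * For sorted = {1,2,3,4,5,6,7} and size = 7 this yields
 * out[1..7] = {4, 2, 6, 1, 3, 5, 7}: a binary search starts at out[1] and only
 * ever jumps to index 2*i or 2*i + 1, which is what makes the layout cache-friendly.
 */
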
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index b91b2dc8..a251bf9c 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -789,4 +789,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
+/* just the memmove, doesn't update @_nr */
+#define __array_insert_item(_array, _nr, _pos) \
+ memmove(&(_array)[(_pos) + 1], \
+ &(_array)[(_pos)], \
+ sizeof((_array)[0]) * ((_nr) - (_pos)))
+
+#define array_insert_item(_array, _nr, _pos, _new_item) \
+do { \
+ __array_insert_item(_array, _nr, _pos); \
+ (_nr)++; \
+ (_array)[(_pos)] = (_new_item); \
+} while (0)
+
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
+do { \
+ (_nr) -= (_nr_to_remove); \
+ memmove(&(_array)[(_pos)], \
+ &(_array)[(_pos) + (_nr_to_remove)], \
+ sizeof((_array)[0]) * ((_nr) - (_pos))); \
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos) \
+ array_remove_items(_array, _nr, _pos, 1)
+
#endif /* _BCACHEFS_UTIL_H */
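
The new array helpers shift the tail of a plain C array with a single memmove
and keep the element count in step; the caller is still responsible for the
array having room for the insert. A short usage sketch (keys/nr are hypothetical
locals, not from the tree):

	int keys[8] = { 1, 2, 4, 5 };
	size_t nr = 4;

	array_insert_item(keys, nr, 2, 3);	/* keys: 1 2 3 4 5, nr = 5 */
	array_remove_item(keys, nr, 0);		/* keys: 2 3 4 5,   nr = 4 */
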