Diffstat (limited to 'fs/bcachefs')
-rw-r--r--   fs/bcachefs/Makefile         |    6
-rw-r--r--   fs/bcachefs/alloc.c          |   13
-rw-r--r--   fs/bcachefs/alloc_types.h    |    2
-rw-r--r--   fs/bcachefs/bcache.h         |   51
-rw-r--r--   fs/bcachefs/btree_gc.c       |   18
-rw-r--r--   fs/bcachefs/buckets.h        |    8
-rw-r--r--   fs/bcachefs/buckets_types.h  |    3
-rw-r--r--   fs/bcachefs/extents.c        |    8
-rw-r--r--   fs/bcachefs/keylist.c        |  190
-rw-r--r--   fs/bcachefs/keylist.h        |   44
-rw-r--r--   fs/bcachefs/keylist_types.h  |   45
-rw-r--r--   fs/bcachefs/migrate.c        |  166
-rw-r--r--   fs/bcachefs/move.c           |  581
-rw-r--r--   fs/bcachefs/move.h           |  149
-rw-r--r--   fs/bcachefs/move_types.h     |   65
-rw-r--r--   fs/bcachefs/movinggc.c       |  367
-rw-r--r--   fs/bcachefs/movinggc.h       |    9
-rw-r--r--   fs/bcachefs/rebalance.c      |  467
-rw-r--r--   fs/bcachefs/rebalance.h      |    7
-rw-r--r--   fs/bcachefs/super.c          |   80
-rw-r--r--   fs/bcachefs/sysfs.c          |   34
-rw-r--r--   fs/bcachefs/tier.c           |  466
-rw-r--r--   fs/bcachefs/tier.h           |   12
23 files changed, 631 insertions(+), 2160 deletions(-)
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 390c167819c4..5a688e85732c 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -5,6 +5,6 @@ bcache-y := acl.o alloc.o bkey.o bkey_methods.o blockdev.o\
bset.o btree_cache.o btree_gc.o btree_io.o btree_iter.o btree_update.o\
buckets.o chardev.o checksum.o clock.o closure.o debug.o dirent.o\
error.o extents.o fs.o fs-gc.o fs-io.o inode.o io.o journal.o keybuf.o\
- keylist.o migrate.o move.o movinggc.o notify.o opts.o request.o\
- siphash.o six.o stats.o super.o sysfs.o tier.o trace.o util.o\
- writeback.o xattr.o
+ keylist.o migrate.o move.o notify.o opts.o rebalance.o request.o\
+ siphash.o six.o stats.o super.o sysfs.o trace.o util.o writeback.o\
+ xattr.o
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
index 1d1d302c84a1..3485019c535a 100644
--- a/fs/bcachefs/alloc.c
+++ b/fs/bcachefs/alloc.c
@@ -148,10 +148,10 @@ static void pd_controllers_update(struct work_struct *work)
if (fragmented < 0)
fragmented = 0;
-
+#if 0
bch_pd_controller_update(&ca->moving_gc_pd,
free, fragmented, -1);
-
+#endif
if (i == 0)
tier0_can_free += fragmented;
@@ -165,11 +165,12 @@ static void pd_controllers_update(struct work_struct *work)
u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
tier0_can_free = max_t(s64, 0, tier_dirty[0] - target);
-
+#if 0
bch_pd_controller_update(&c->tiering_pd,
target,
tier_dirty[0],
-1);
+#endif
}
/*
@@ -579,7 +580,6 @@ static void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *g)
g->read_prio = ca->set->prio_clock[READ].hand;
g->write_prio = ca->set->prio_clock[WRITE].hand;
- g->copygc_gen = 0;
verify_not_on_freelist(ca, g - ca->buckets);
}
@@ -1643,11 +1643,10 @@ void bch_cache_allocator_stop(struct cache *ca)
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
bch_stop_write_point(ca, &c->write_points[i]);
- for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++)
- bch_stop_write_point(ca, &ca->gc_buckets[i]);
+ for (i = 0; i < ARRAY_SIZE(c->rebalance); i++)
+ bch_stop_write_point(ca, &c->rebalance[i].wp);
bch_stop_write_point(ca, &c->promote_write_point);
- bch_stop_write_point(ca, &ca->tiering_write_point);
bch_stop_write_point(ca, &c->migration_write_point);
bch_stop_write_point(ca, &c->btree_write_point);
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 065b9c02f185..1372fc26ccb1 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -38,7 +38,7 @@ enum alloc_reserve {
RESERVE_PRIO,
RESERVE_BTREE,
RESERVE_METADATA_LAST = RESERVE_BTREE,
- RESERVE_MOVINGGC,
+ RESERVE_MOVINGGC, /* hrm */
RESERVE_NONE,
RESERVE_NR,
diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h
index 7f856f7fd1aa..a6bbd38f4316 100644
--- a/fs/bcachefs/bcache.h
+++ b/fs/bcachefs/bcache.h
@@ -278,7 +278,6 @@
#include "journal_types.h"
#include "keylist_types.h"
#include "keybuf_types.h"
-#include "move_types.h"
#include "stats_types.h"
#include "super_types.h"
@@ -356,8 +355,6 @@ struct cache {
struct cache_set *set;
- struct cache_group self;
-
/*
* Cached version of this device's member info from superblock
* Committed by write_super()
@@ -433,25 +430,6 @@ struct cache {
struct mutex heap_lock;
DECLARE_HEAP(struct bucket_heap_entry, heap);
- /* Moving GC: */
- struct task_struct *moving_gc_read;
-
- struct moving_queue moving_gc_queue;
- struct bch_pd_controller moving_gc_pd;
-
- /* Tiering: */
- struct moving_queue tiering_queue;
- struct write_point tiering_write_point;
- unsigned tiering_stripe_size;
-
- /*
- * open buckets used in moving garbage collection
- * NOTE: GC_GEN == 0 signifies no moving gc, so accessing the
- * gc_buckets array is always GC_GEN-1.
- */
-#define NUM_GC_GENS 8
- struct write_point gc_buckets[NUM_GC_GENS];
-
struct journal_device journal;
struct work_struct io_error_work;
@@ -504,6 +482,26 @@ struct btree_debug {
struct dentry *btree_format;
};
+struct rebalance_bucket_entry {
+ size_t bucket;
+ u8 dev;
+ u8 gen;
+ unsigned sectors;
+};
+
+struct rebalance_thread {
+ unsigned tier;
+ unsigned initialized;
+ struct task_struct *p;
+ struct bch_pd_controller pd;
+ struct write_point wp;
+
+ struct workqueue_struct *wq;
+
+ struct mutex heap_lock;
+ DECLARE_HEAP(struct rebalance_bucket_entry, heap);
+};
+
struct cache_set {
struct closure cl;
@@ -702,10 +700,6 @@ struct cache_set {
struct task_struct *gc_thread;
atomic_t kick_gc;
- /* This is a list of scan_keylists for btree GC to scan */
- struct list_head gc_scan_keylists;
- struct mutex gc_scan_keylist_lock;
-
/*
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
* has been marked by GC.
@@ -751,9 +745,8 @@ struct cache_set {
/* FILESYSTEM */
atomic_long_t nr_inodes;
- /* TIERING */
- struct task_struct *tiering_read;
- struct bch_pd_controller tiering_pd;
+ /* REBALANCE */
+ struct rebalance_thread rebalance[CACHE_TIERS];
/* NOTIFICATIONS */
struct mutex uevent_lock;
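[Editor's sketch] The bcache.h hunk above replaces the per-device tiering and copygc state with one rebalance_thread per tier, each carrying a fixed-size heap of rebalance_bucket_entry that rebalance.c (added later in this patch) fills with the buckets holding the fewest live sectors, i.e. the cheapest buckets to evacuate. Below is a minimal user-space sketch of that bounded keep-the-smallest selection; it uses a plain array instead of the kernel DECLARE_HEAP()/heap_add()/heap_sift() macros, and the names and heap size are illustrative only.

/* Bounded "k buckets with the fewest live sectors" selection as a plain
 * max-heap: the root is the fullest bucket currently kept, so a new
 * candidate only replaces it when the candidate has fewer live sectors. */
#include <stdio.h>

struct entry {
	size_t		bucket;
	unsigned	sectors;	/* live sectors in the bucket */
};

#define HEAP_SIZE 8			/* illustrative only */

static struct entry heap[HEAP_SIZE];
static unsigned heap_used;

static void sift_down(unsigned i)
{
	for (;;) {
		unsigned l = 2 * i + 1, r = l + 1, biggest = i;

		if (l < heap_used && heap[l].sectors > heap[biggest].sectors)
			biggest = l;
		if (r < heap_used && heap[r].sectors > heap[biggest].sectors)
			biggest = r;
		if (biggest == i)
			break;

		struct entry tmp = heap[i];
		heap[i] = heap[biggest];
		heap[biggest] = tmp;
		i = biggest;
	}
}

static void heap_push(struct entry new)
{
	if (heap_used < HEAP_SIZE) {
		/* still room: append and sift up */
		unsigned i = heap_used++;

		heap[i] = new;
		while (i && heap[(i - 1) / 2].sectors < heap[i].sectors) {
			struct entry tmp = heap[i];
			heap[i] = heap[(i - 1) / 2];
			heap[(i - 1) / 2] = tmp;
			i = (i - 1) / 2;
		}
	} else if (new.sectors < heap[0].sectors) {
		/* full: evict the fullest kept bucket for a better candidate */
		heap[0] = new;
		sift_down(0);
	}
}

int main(void)
{
	for (size_t b = 0; b < 100; b++)
		heap_push((struct entry) {
			.bucket = b,
			.sectors = (unsigned) ((b * 37) % 257),
		});

	for (unsigned i = 0; i < heap_used; i++)
		printf("bucket %zu: %u sectors\n", heap[i].bucket, heap[i].sectors);
	return 0;
}

This is the same shape as rebalance_heap_push() added in rebalance.c below, which only replaces the root when rebalance_entry_sectors_cmp() says the new entry is smaller.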
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 965b4a58ba9a..65222eb0b0cb 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -301,23 +301,6 @@ static void bch_mark_pending_btree_node_frees(struct cache_set *c)
mutex_unlock(&c->btree_interior_update_lock);
}
-static void bch_mark_scan_keylists(struct cache_set *c)
-{
- struct scan_keylist *kl;
-
- mutex_lock(&c->gc_scan_keylist_lock);
-
- /* What the goddamn fuck? */
- list_for_each_entry(kl, &c->gc_scan_keylists, mark_list) {
- if (kl->owner == NULL)
- bch_keylist_recalc_oldest_gens(c, kl);
- else
- bch_queue_recalc_oldest_gens(c, kl->owner);
- }
-
- mutex_unlock(&c->gc_scan_keylist_lock);
-}
-
/**
* bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
*/
@@ -419,7 +402,6 @@ void bch_gc(struct cache_set *c)
bch_mark_metadata(c);
bch_mark_pending_btree_node_frees(c);
bch_writeback_recalc_oldest_gens(c);
- bch_mark_scan_keylists(c);
for_each_cache(ca, c, i)
atomic_long_set(&ca->saturated_count, 0);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index e878ac09a0f2..be225cb850c7 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -33,6 +33,14 @@ static inline struct cache *PTR_CACHE(const struct cache_set *c,
return rcu_dereference(c->cache[ptr->dev]);
}
+static inline unsigned PTR_TIER(const struct cache_member_rcu *mi,
+ const struct bch_extent_ptr *ptr)
+{
+ return ptr->dev < mi->nr_in_set
+ ? mi->m[ptr->dev].tier
+ : UINT_MAX;
+}
+
static inline size_t PTR_BUCKET_NR(const struct cache *ca,
const struct bch_extent_ptr *ptr)
{
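[Editor's sketch] Moving PTR_TIER() into buckets.h makes the pointer-to-tier lookup usable outside extents.c; the new rebalance path uses it to decide how many copies of an extent already sit at or above a given tier (see should_tier_extent() in rebalance.c below). A small user-space sketch of that calculation, with hypothetical stand-in types in place of cache_member_rcu and bch_extent_ptr:

#include <stdio.h>

#define TIER_UNKNOWN	(~0u)	/* the UINT_MAX sentinel from PTR_TIER() */

struct member_info { unsigned tier; };

struct members {
	unsigned		nr_in_set;
	struct member_info	*m;
};

struct extent_ptr { unsigned dev; };

static unsigned ptr_tier(const struct members *mi, const struct extent_ptr *ptr)
{
	return ptr->dev < mi->nr_in_set ? mi->m[ptr->dev].tier : TIER_UNKNOWN;
}

/* How many more replicas does this extent need at or above @tier? */
static unsigned replicas_needed(const struct members *mi,
				const struct extent_ptr *ptrs, unsigned nr_ptrs,
				unsigned tier, unsigned data_replicas)
{
	unsigned replicas = 0;

	for (unsigned i = 0; i < nr_ptrs; i++)
		if (ptr_tier(mi, &ptrs[i]) >= tier)
			replicas++;

	return replicas < data_replicas ? data_replicas - replicas : 0;
}

int main(void)
{
	struct member_info m[] = { { .tier = 0 }, { .tier = 1 } };
	struct members mi = { .nr_in_set = 2, .m = m };
	struct extent_ptr ptrs[] = { { .dev = 0 } };	/* one copy on tier 0 */

	/* want one replica on tier >= 1: one more copy is needed */
	printf("%u\n", replicas_needed(&mi, ptrs, 1, 1, 1));
	return 0;
}

Note that the UINT_MAX sentinel for an out-of-range device index compares as greater than any real tier, so such a pointer counts toward the target tier rather than triggering an extra copy.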
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index a1914404531e..78cdcafcf155 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -42,9 +42,6 @@ struct bucket {
struct bucket_mark mark;
/* Most out of date gen in the btree */
u8 oldest_gen;
-
- /* generation copygc is going to move this bucket into */
- u8 copygc_gen;
};
struct bucket_stats_cache {
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index d041f0cfbdc7..6890bcc77dae 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1649,14 +1649,6 @@ static void bch_extent_to_text(struct cache_set *c, char *buf,
#undef p
}
-static unsigned PTR_TIER(struct cache_member_rcu *mi,
- const struct bch_extent_ptr *ptr)
-{
- return ptr->dev < mi->nr_in_set
- ? mi->m[ptr->dev].tier
- : UINT_MAX;
-}
-
void bch_extent_entry_append(struct bkey_i_extent *e,
union bch_extent_entry *entry)
{
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
index 638596300575..644734b1d4f2 100644
--- a/fs/bcachefs/keylist.c
+++ b/fs/bcachefs/keylist.c
@@ -117,193 +117,3 @@ void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
BUG_ON(l->top_p > l->end_keys_p);
bkey_copy(where, insert);
}
-
-/* Scan keylists simple utilities */
-
-void bch_scan_keylist_init(struct scan_keylist *kl,
- struct cache_set *c,
- unsigned max_size)
-
-{
- kl->c = c;
- kl->owner = NULL;
-
- mutex_init(&kl->lock);
- kl->max_size = max_size;
- bch_keylist_init(&kl->list, NULL, 0);
-
- /*
- * Order of initialization is tricky, and this makes sure that
- * we have a valid cache set in case the order of
- * initialization chages and breaks things.
- */
- BUG_ON(c == NULL);
- mutex_lock(&c->gc_scan_keylist_lock);
- list_add_tail(&kl->mark_list, &c->gc_scan_keylists);
- mutex_unlock(&c->gc_scan_keylist_lock);
-}
-
-void bch_scan_keylist_destroy(struct scan_keylist *kl)
-{
- if (kl->c) {
- mutex_lock(&kl->c->gc_scan_keylist_lock);
- list_del(&kl->mark_list);
- mutex_unlock(&kl->c->gc_scan_keylist_lock);
- }
-
- mutex_lock(&kl->lock);
- bch_keylist_free(&kl->list);
- mutex_unlock(&kl->lock);
-}
-
-void bch_scan_keylist_reset(struct scan_keylist *kl)
-{
- mutex_lock(&kl->lock);
- kl->list.bot_p = kl->list.top_p = kl->list.start_keys_p;
- mutex_unlock(&kl->lock);
-}
-
-/*
- * This should only be called from sysfs, and holding a lock that prevents
- * re-entrancy.
- */
-void bch_scan_keylist_resize(struct scan_keylist *kl,
- unsigned max_size)
-{
- mutex_lock(&kl->lock);
- kl->max_size = max_size; /* May be smaller than current size */
- mutex_unlock(&kl->lock);
-}
-
-/**
- * bch_keylist_recalc_oldest_gens - update oldest_gen pointers from keylist keys
- *
- * This prevents us from wrapping around gens for a bucket only referenced from
- * the tiering or moving GC keylists. We don't actually care that the data in
- * those buckets is marked live, only that we don't wrap the gens.
- *
- * Note: This interlocks with insertions, but not all dequeues interlock.
- * The particular case in which dequeues don't interlock is when a
- * scan list used by the copy offload ioctls is used as a plain
- * keylist for btree insertion.
- * The btree insertion code doesn't go through
- * bch_scan_keylist_dequeue below, and instead uses plain
- * bch_keylist_dequeue. The other pointers (top, start, end) are
- * unchanged in this case.
- * A little care with the bottomp pointer suffices in this case.
- * Of course, we may end up marking stuff that we don't need to mark,
- * but was recently valid and we have likely just inserted in the tree
- * anyway.
- */
-void bch_keylist_recalc_oldest_gens(struct cache_set *c,
- struct scan_keylist *kl)
-{
- struct bkey_i *k;
-
- mutex_lock(&kl->lock);
-
- for_each_keylist_key(&kl->list, k)
- bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(k));
-
- mutex_unlock(&kl->lock);
-}
-
-int bch_scan_keylist_add(struct scan_keylist *kl, struct bkey_s_c k)
-{
- int ret;
-
- mutex_lock(&kl->lock);
- ret = bch_keylist_realloc_max(&kl->list,
- k.k->u64s,
- kl->max_size);
-
- if (!ret) {
- bkey_reassemble(kl->list.top, k);
- bch_keylist_enqueue(&kl->list);
- atomic64_add(k.k->size, &kl->sectors);
- }
- mutex_unlock(&kl->lock);
-
- return ret;
-}
-
-/* Actual scanning functionality of scan_keylists */
-
-static void bch_refill_scan_keylist(struct cache_set *c,
- struct scan_keylist *kl,
- struct bpos *last_scanned,
- struct bpos end,
- scan_keylist_pred_fn *pred)
-{
- struct bpos start = *last_scanned;
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned nr_found = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, *last_scanned, k) {
- if (bkey_cmp(k.k->p, end) >= 0) {
- *last_scanned = k.k->p;
- goto done;
- }
-
- if (pred(kl, k)) {
- if (bch_scan_keylist_add(kl, k))
- goto done;
-
- nr_found++;
- }
-
- *last_scanned = k.k->p;
- bch_btree_iter_cond_resched(&iter);
- }
-
- /* If we end up here, it means:
- * - the map_fn didn't fill up the keybuf
- * - the map_fn didn't see the end key
- * - there were no more keys to map over
- * Therefore, we are at the end of the key space */
- *last_scanned = POS_MAX;
-done:
- bch_btree_iter_unlock(&iter);
-
- trace_bcache_keyscan(nr_found,
- start.inode, start.offset,
- last_scanned->inode,
- last_scanned->offset);
-}
-
-struct bkey_i *bch_scan_keylist_next(struct scan_keylist *kl)
-{
- if (bch_keylist_empty(&kl->list))
- return NULL;
-
- return bch_keylist_front(&kl->list);
-}
-
-struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c,
- struct scan_keylist *kl,
- struct bpos *last_scanned,
- struct bpos end,
- scan_keylist_pred_fn *pred)
-{
- if (bch_keylist_empty(&kl->list)) {
- if (bkey_cmp(*last_scanned, end) >= 0)
- return NULL;
-
- bch_refill_scan_keylist(c, kl, last_scanned, end, pred);
- }
-
- return bch_scan_keylist_next(kl);
-}
-
-void bch_scan_keylist_dequeue(struct scan_keylist *kl)
-{
- u64 sectors;
-
- mutex_lock(&kl->lock);
- sectors = kl->list.bot->k.size;
- bch_keylist_dequeue(&kl->list);
- mutex_unlock(&kl->lock);
-
- BUG_ON(atomic64_sub_return(sectors, &kl->sectors) < 0);
-}
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
index 028552757527..8fc92986f22f 100644
--- a/fs/bcachefs/keylist.h
+++ b/fs/bcachefs/keylist.h
@@ -116,49 +116,5 @@ void bch_keylist_add_in_order(struct keylist *, struct bkey_i *);
int bch_keylist_realloc(struct keylist *, unsigned);
int bch_keylist_realloc_max(struct keylist *, unsigned, unsigned);
-void bch_scan_keylist_init(struct scan_keylist *kl,
- struct cache_set *c,
- unsigned max_size);
-
-void bch_scan_keylist_reset(struct scan_keylist *kl);
-
-/* The keylist is dynamically adjusted. This just clamps the maxima */
-
-static inline unsigned bch_scan_keylist_size(struct scan_keylist *kl)
-{
- return kl->max_size;
-}
-
-static inline u64 bch_scan_keylist_sectors(struct scan_keylist *kl)
-{
- return atomic64_read(&kl->sectors);
-}
-
-void bch_scan_keylist_resize(struct scan_keylist *kl,
- unsigned max_size);
-
-void bch_scan_keylist_destroy(struct scan_keylist *kl);
-
-/*
- * IMPORTANT: The caller of bch_scan_keylist_next or
- * bch_scan_keylist_next_rescan needs to copy any
- * non-null return value before calling either again!
- * These functions return a pointer into the internal structure.
- * Furthermore, they need to call bch_scan_keylist_advance after
- * copying the structure.
- */
-
-struct bkey_i *bch_scan_keylist_next(struct scan_keylist *);
-
-struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c,
- struct scan_keylist *kl,
- struct bpos *last_scanned,
- struct bpos end,
- scan_keylist_pred_fn *pred);
-
-int bch_scan_keylist_add(struct scan_keylist *, struct bkey_s_c);
-void bch_scan_keylist_dequeue(struct scan_keylist *);
-
-void bch_keylist_recalc_oldest_gens(struct cache_set *, struct scan_keylist *);
#endif /* _BCACHE_KEYLIST_H */
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
index 569cdc2480e2..156fbe0745fd 100644
--- a/fs/bcachefs/keylist_types.h
+++ b/fs/bcachefs/keylist_types.h
@@ -48,49 +48,4 @@ struct keylist {
bool has_buf;
};
-/*
- * scan_keylists are conceptually similar to keybufs, but they don't
- * have an internal RB tree.
- * keybufs should be used when read or write operations need to
- * examine keys in flight, as for writeback.
- * But for moving operations (moving gc, tiering, moving data off
- * devices), read and writes don't need to look at all, so we don't
- * need the RB tree and use scan_keylists instead.
- *
- * Note that unlike keybufs, they don't contain a semaphore to limit
- * bios. That must be done externally, if necessary.
- */
-
-#define DFLT_SCAN_KEYLIST_MAX_SIZE 512
-
-struct scan_keylist {
- struct list_head mark_list; /* For GC marking */
-
- struct cache_set *c; /* For destroying */
-
- /*
- * Only one thread is allowed to mutate the keylist. Other threads can
- * read it. The mutex has to be taken by the mutator thread when
- * mutating the keylist, and by other threads when reading, but not by
- * the mutator thread when reading.
- */
- struct mutex lock;
- /*
- * Maximum size, in u64s. The keylist will not grow beyond this size.
- */
- unsigned max_size;
- /*
- * Number of sectors in keys currently on the keylist.
- */
- atomic64_t sectors;
- /*
- * The underlying keylist.
- */
- struct keylist list;
-
- struct moving_queue *owner;
-};
-
-typedef bool (scan_keylist_pred_fn)(struct scan_keylist *, struct bkey_s_c);
-
#endif /* _BCACHE_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index c33606865eb2..aa9e0dd80227 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -12,39 +12,30 @@
#include "migrate.h"
#include "move.h"
-static bool migrate_data_pred(struct scan_keylist *kl, struct bkey_s_c k)
-{
- struct cache *ca = container_of(kl, struct cache,
- moving_gc_queue.keys);
-
- return bkey_extent_is_data(k.k) &&
- bch_extent_has_device(bkey_s_c_to_extent(k),
- ca->sb.nr_this_dev);
-}
-
static void bch_extent_drop_dev_ptrs(struct bkey_s_extent e, unsigned dev)
{
struct bch_extent_ptr *ptr;
+ unsigned dropped = 0;
extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == dev)
+ if (ptr->dev == dev) {
bch_extent_drop_ptr(e, ptr);
+ dropped++;
+ }
+
+ BUG_ON(dropped > 1);
}
-static int issue_migration_move(struct cache *ca,
- struct moving_context *ctxt,
- struct bkey_s_c k,
- u64 *seen_key_count)
+static int migrate_extent(struct cache_set *c, struct cache *ca,
+ struct bkey_s_c k, struct move_context *m)
{
- struct moving_queue *q = &ca->moving_gc_queue;
- struct cache_set *c = ca->set;
struct moving_io *io;
struct disk_reservation res;
if (bch_disk_reservation_get(c, &res, k.k->size, 0))
return -ENOSPC;
- io = moving_io_alloc(k);
+ io = bch_moving_io_alloc(k);
if (!io) {
bch_disk_reservation_put(c, &res);
return -ENOMEM;
@@ -60,33 +51,14 @@ static int issue_migration_move(struct cache *ca,
0);
io->op.nr_replicas = 1;
- io->op.io_wq = q->wq;
-
bch_extent_drop_dev_ptrs(bkey_i_to_s_extent(&io->op.insert_key),
ca->sb.nr_this_dev);
- bch_data_move(q, ctxt, io);
- (*seen_key_count)++;
-
- /*
- * IMPORTANT: We must call bch_data_move before we dequeue so
- * that the key can always be found in either the pending list
- * in the moving queue or in the scan keylist list in the
- * moving queue.
- * If we reorder, there is a window where a key is not found
- * by btree gc marking.
- */
- bch_scan_keylist_dequeue(&q->keys);
+ bch_data_move(m, io);
return 0;
}
-#define MIGRATION_DEBUG 0
-
#define MAX_DATA_OFF_ITER 10
-#define PASS_LOW_LIMIT (MIGRATION_DEBUG ? 0 : 2)
-#define MIGRATE_NR 64
-#define MIGRATE_READ_NR 32
-#define MIGRATE_WRITE_NR 32
/*
* This moves only the data off, leaving the meta-data (if any) in place.
@@ -104,37 +76,9 @@ static int issue_migration_move(struct cache *ca,
int bch_move_data_off_device(struct cache *ca)
{
- int ret;
- struct bkey_i *k;
- unsigned pass;
- u64 seen_key_count;
- unsigned last_error_count;
- unsigned last_error_flags;
- struct moving_context context;
struct cache_set *c = ca->set;
- struct moving_queue *queue = &ca->moving_gc_queue;
-
- /*
- * This reuses the moving gc queue as it is no longer in use
- * by moving gc, which must have been stopped to call this.
- */
-
- BUG_ON(ca->moving_gc_read != NULL);
-
- /*
- * This may actually need to start the work queue because the
- * device may have always been read-only and never have had it
- * started (moving gc usually starts it but not for RO
- * devices).
- */
-
- bch_queue_start(queue);
-
- queue_io_resize(queue, MIGRATE_NR, MIGRATE_READ_NR, MIGRATE_WRITE_NR);
-
- BUG_ON(queue->wq == NULL);
- bch_moving_context_init(&context, NULL, MOVING_PURPOSE_MIGRATION);
- context.avoid = ca;
+ u64 seen_key_count = 1;
+ unsigned pass;
/*
* In theory, only one pass should be necessary as we've
@@ -153,82 +97,44 @@ int bch_move_data_off_device(struct cache *ca)
* but that can be viewed as a verification pass.
*/
- seen_key_count = 1;
- last_error_count = 0;
- last_error_flags = 0;
-
for (pass = 0;
(seen_key_count != 0 && (pass < MAX_DATA_OFF_ITER));
pass++) {
- bool again;
-
- seen_key_count = 0;
- atomic_set(&context.error_count, 0);
- atomic_set(&context.error_flags, 0);
- context.last_scanned = POS_MIN;
-
-again:
- again = false;
-
- while (1) {
- if (bch_queue_full(queue)) {
- if (queue->rotational) {
- again = true;
- break;
- } else {
- bch_moving_wait(&context);
- continue;
- }
- }
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct move_context m;
- k = bch_scan_keylist_next_rescan(c,
- &queue->keys,
- &context.last_scanned,
- POS_MAX,
- migrate_data_pred);
- if (k == NULL)
- break;
+ move_context_init(&m);
- if (issue_migration_move(ca, &context, bkey_i_to_s_c(k),
- &seen_key_count)) {
- /*
- * Memory allocation failed; we will wait for
- * all queued moves to finish and continue
- * scanning starting from the same key
- */
- again = true;
- break;
- }
- }
-
- bch_queue_run(queue, &context);
- if (again)
- goto again;
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) {
+ if (bkey_extent_is_data(k.k) &&
+ bch_extent_has_device(bkey_s_c_to_extent(k),
+ ca->sb.nr_this_dev)) {
+ BKEY_PADDED(k) tmp;
- if ((pass >= PASS_LOW_LIMIT)
- && (seen_key_count != (MIGRATION_DEBUG ? ~0ULL : 0))) {
- pr_notice("found %llu keys on pass %u.",
- seen_key_count, pass);
- }
+ bkey_reassemble(&tmp.k, k);
+ bch_btree_iter_unlock(&iter);
- last_error_count = atomic_read(&context.error_count);
- last_error_flags = atomic_read(&context.error_flags);
+ seen_key_count++;
+ migrate_extent(c, ca,
+ bkey_i_to_s_c(&tmp.k),
+ &m);
+ }
- if (last_error_count != 0) {
- pr_notice("pass %u: error count = %u, error flags = 0x%x",
- pass, last_error_count, last_error_flags);
+ bch_btree_iter_cond_resched(&iter);
}
+ bch_btree_iter_unlock(&iter);
+
+ closure_sync(&m.cl);
}
- if (seen_key_count != 0 || last_error_count != 0) {
+ if (seen_key_count) {
pr_err("Unable to migrate all data in %d iterations.",
MAX_DATA_OFF_ITER);
- ret = -EDEADLK;
- } else if (MIGRATION_DEBUG)
- pr_notice("Migrated all data in %d iterations", pass);
+ return -EDEADLK;
+ }
- bch_queue_run(queue, &context);
- return ret;
+ return 0;
}
/*
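[Editor's sketch] Both migrate_extent() above and rebalance_extent() later in this patch drop pointers while walking an extent, and both do so with extent_for_each_ptr_backwards(). Iterating from the end matters because dropping a pointer compacts the array in place: positions not yet visited keep their indices. A user-space sketch of the same loop shape over a plain array (the function name and types are invented for illustration):

#include <stdio.h>

/* Drop every element matching @victim, walking backwards so that removing
 * element i (and shifting the tail down) never disturbs the elements still
 * to be visited at indices < i. */
static unsigned drop_matching(unsigned *devs, unsigned nr, unsigned victim)
{
	for (int i = (int) nr - 1; i >= 0; i--)
		if (devs[i] == victim) {
			for (unsigned j = i; j + 1 < nr; j++)
				devs[j] = devs[j + 1];
			nr--;
		}

	return nr;	/* number of elements left */
}

int main(void)
{
	unsigned devs[] = { 0, 2, 1, 2 };
	unsigned nr = drop_matching(devs, 4, 2);

	for (unsigned i = 0; i < nr; i++)
		printf("%u ", devs[i]);		/* prints: 0 1 */
	printf("\n");
	return 0;
}

In the migration path, the new BUG_ON(dropped > 1) in bch_extent_drop_dev_ptrs() additionally asserts that a real extent never carries two pointers to the same device, so at most one drop happens per call there.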
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index bbfcbdae2f37..8e8ae4acb74c 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -1,75 +1,28 @@
#include "bcache.h"
-#include "btree_gc.h"
-#include "buckets.h"
+#include "extents.h"
#include "io.h"
#include "move.h"
-#include "super.h"
-#include "keylist.h"
#include <trace/events/bcache.h>
-static void moving_error(struct moving_context *ctxt, unsigned flag)
+void bch_moving_io_free(struct moving_io *io)
{
- atomic_inc(&ctxt->error_count);
- atomic_or(flag, &ctxt->error_flags);
-}
-
-void bch_moving_context_init(struct moving_context *ctxt,
- struct bch_ratelimit *rate,
- enum moving_purpose purpose)
-{
- memset(ctxt, 0, sizeof(*ctxt));
- ctxt->task = current;
- ctxt->rate = rate;
- ctxt->purpose = purpose;
- closure_init_stack(&ctxt->cl);
-}
-
-/*
- * bch_moving_wait() -- wait for a bch_moving_notify() call
- *
- * To deal with lost wakeups, we make this return immediately if notify
- * was already called.
- */
-void bch_moving_wait(struct moving_context *ctxt)
-{
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (atomic_xchg(&ctxt->pending, 0))
- break;
- schedule();
- }
- __set_current_state(TASK_RUNNING);
-}
-
-static void bch_moving_notify(struct moving_context *ctxt)
-{
- atomic_set(&ctxt->pending, 1);
- wake_up_process(ctxt->task);
-}
-
-static bool __bch_queue_reads_pending(struct moving_queue *q)
-{
- return (q->read_count > 0 || !RB_EMPTY_ROOT(&q->tree));
+ bch_bio_free_pages(&io->bio.bio.bio);
+ kfree(io);
}
-static bool bch_queue_reads_pending(struct moving_queue *q)
+static void bch_moving_io_destructor(struct closure *cl)
{
- unsigned long flags;
- bool pending;
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct move_context *m = container_of(cl->parent,
+ struct move_context, cl);
+ unsigned nr_pages = DIV_ROUND_UP(io->key.k.size, PAGE_SECTORS);
- spin_lock_irqsave(&q->lock, flags);
- pending = __bch_queue_reads_pending(q);
- spin_unlock_irqrestore(&q->lock, flags);
+ while (nr_pages--)
+ up(&m->nr_pages_limit);
- return pending;
-}
-
-static void bch_queue_write(struct moving_queue *q)
-{
- BUG_ON(q->wq == NULL);
- queue_work(q->wq, &q->work);
+ bch_moving_io_free(io);
}
static void moving_init(struct moving_io *io, struct bio *bio)
@@ -86,531 +39,83 @@ static void moving_init(struct moving_io *io, struct bio *bio)
bch_bio_map(bio, NULL);
}
-struct moving_io *moving_io_alloc(struct bkey_s_c k)
-{
- struct moving_io *io;
-
- io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
- * DIV_ROUND_UP(k.k->size, PAGE_SECTORS),
- GFP_KERNEL);
- if (!io)
- return NULL;
-
- bkey_reassemble(&io->key, k);
-
- moving_init(io, &io->rbio.bio);
-
- if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
- kfree(io);
- return NULL;
- }
-
- return io;
-}
-
-void moving_io_free(struct moving_io *io)
-{
- bch_bio_free_pages(&io->wbio.bio.bio);
- kfree(io);
-}
-
-static void moving_io_destructor(struct closure *cl)
+static void write_moving(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct moving_queue *q = io->q;
- struct moving_context *ctxt = io->context;
- unsigned long flags;
- bool kick_writes = true;
-
- if (io->replace.failures)
- trace_bcache_copy_collision(q, &io->key.k);
-
- spin_lock_irqsave(&q->lock, flags);
-
- BUG_ON(!q->count);
- q->count--;
-
- if (io->read_issued) {
- BUG_ON(!q->read_count);
- q->read_count--;
- }
-
- if (io->write_issued) {
- BUG_ON(!q->write_count);
- q->write_count--;
- trace_bcache_move_write_done(q, &io->key.k);
- }
-
- list_del_init(&io->list);
-
- if ((q->count == 0) && (q->stop_waitcl != NULL)) {
- closure_put(q->stop_waitcl);
- q->stop_waitcl = NULL;
- }
-
- if (q->rotational && __bch_queue_reads_pending(q))
- kick_writes = false;
-
- if (list_empty(&q->pending))
- kick_writes = false;
-
- spin_unlock_irqrestore(&q->lock, flags);
-
- moving_io_free(io);
-
- if (kick_writes)
- bch_queue_write(q);
-
- bch_moving_notify(ctxt);
-}
-
-static void moving_io_after_write(struct closure *cl)
-{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct moving_context *ctxt = io->context;
if (io->op.error)
- moving_error(ctxt, MOVING_FLAG_WRITE);
-
- moving_io_destructor(cl);
-}
-
-static void write_moving(struct moving_io *io)
-{
- bool stopped;
- unsigned long flags;
- struct bch_write_op *op = &io->op;
-
- spin_lock_irqsave(&io->q->lock, flags);
- BUG_ON(io->q->count == 0);
- stopped = io->q->stopped;
- spin_unlock_irqrestore(&io->q->lock, flags);
-
- /*
- * If the queue has been stopped, prevent the write from occurring.
- * This stops all writes on a device going read-only as quickly
- * as possible.
- */
-
- if (op->error || stopped)
- closure_return_with_destructor(&io->cl, moving_io_destructor);
- else {
- moving_init(io, &io->wbio.bio.bio);
-
- op->bio->bio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k);
-
- closure_call(&op->cl, bch_write, NULL, &io->cl);
- closure_return_with_destructor(&io->cl, moving_io_after_write);
- }
-}
-
-static void bch_queue_write_work(struct work_struct *work)
-{
- struct moving_queue *q = container_of(work, struct moving_queue, work);
- struct moving_io *io;
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
-
- if (q->rotational && __bch_queue_reads_pending(q)) {
- /* All reads should have finished before writes start */
- spin_unlock_irqrestore(&q->lock, flags);
- return;
- }
-
- while (!q->stopped && q->write_count < q->max_write_count) {
- io = list_first_entry_or_null(&q->pending,
- struct moving_io, list);
- /*
- * We only issue the writes in insertion order to preserve
- * any linearity in the original key list/tree, so if we
- * find an io whose read hasn't completed, we don't
- * scan beyond it. Eventually that read will complete,
- * at which point we may issue multiple writes (for it
- * and any following entries whose reads had already
- * completed and we had not examined here).
- */
- if (!io || !io->read_completed)
- break;
-
- BUG_ON(io->write_issued);
- q->write_count++;
- io->write_issued = 1;
- list_del(&io->list);
- list_add_tail(&io->list, &q->write_pending);
- trace_bcache_move_write(q, &io->key.k);
- spin_unlock_irqrestore(&q->lock, flags);
- write_moving(io);
- spin_lock_irqsave(&q->lock, flags);
- }
-
- spin_unlock_irqrestore(&q->lock, flags);
-}
-
-/*
- * IMPORTANT: The caller of queue_init must have zero-filled it when it
- * allocates it.
- */
-
-int bch_queue_init(struct moving_queue *q,
- struct cache_set *c,
- unsigned max_size,
- unsigned max_count,
- unsigned max_read_count,
- unsigned max_write_count,
- bool rotational,
- const char *name)
-{
- INIT_WORK(&q->work, bch_queue_write_work);
-
- q->keys.owner = q;
- q->max_count = max_count;
- q->max_read_count = max_read_count;
- q->max_write_count = max_write_count;
- q->rotational = rotational;
+ closure_return_with_destructor(&io->cl, bch_moving_io_destructor);
- spin_lock_init(&q->lock);
- INIT_LIST_HEAD(&q->pending);
- INIT_LIST_HEAD(&q->write_pending);
- q->tree = RB_ROOT;
+ moving_init(io);
- q->wq = alloc_workqueue(name,
- WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1);
- if (!q->wq)
- return -ENOMEM;
+ io->op.bio->bio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k);
- return 0;
-}
-
-void bch_queue_start(struct moving_queue *q)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- q->stopped = false;
- spin_unlock_irqrestore(&q->lock, flags);
-
- bch_scan_keylist_reset(&q->keys);
-}
-
-void queue_io_resize(struct moving_queue *q,
- unsigned max_io,
- unsigned max_read,
- unsigned max_write)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- q->max_count = max_io;
- q->max_read_count = max_read;
- q->max_write_count = max_write;
- spin_unlock_irqrestore(&q->lock, flags);
-}
-
-void bch_queue_destroy(struct moving_queue *q)
-{
- if (q->wq)
- destroy_workqueue(q->wq);
- q->wq = NULL;
-
- bch_scan_keylist_destroy(&q->keys);
-}
-
-static void bch_queue_cancel_writes(struct moving_queue *q)
-{
- struct moving_io *io;
- unsigned long flags;
- bool read_issued, read_completed;
-
- spin_lock_irqsave(&q->lock, flags);
-
- while (1) {
- io = list_first_entry_or_null(&q->pending,
- struct moving_io,
- list);
- if (!io)
- break;
-
- BUG_ON(io->write_issued);
- list_del_init(&io->list);
- read_issued = io->read_issued;
- read_completed = io->read_completed;
- if (!read_issued && !read_completed && q->rotational)
- rb_erase(&io->node, &q->tree);
- spin_unlock_irqrestore(&q->lock, flags);
- if (read_completed)
- closure_return_with_destructor_noreturn(&io->cl,
- moving_io_destructor);
- else if (!read_issued)
- moving_io_destructor(&io->cl);
- spin_lock_irqsave(&q->lock, flags);
- }
-
- spin_unlock_irqrestore(&q->lock, flags);
-}
-
-void bch_queue_stop(struct moving_queue *q)
-{
- unsigned long flags;
- struct closure waitcl;
-
- closure_init_stack(&waitcl);
-
- spin_lock_irqsave(&q->lock, flags);
- if (q->stopped)
- BUG_ON(q->stop_waitcl != NULL);
- else {
- q->stopped = true;
- if (q->count != 0) {
- q->stop_waitcl = &waitcl;
- closure_get(&waitcl);
- }
- }
- spin_unlock_irqrestore(&q->lock, flags);
-
- bch_queue_cancel_writes(q);
-
- closure_sync(&waitcl);
-}
-
-static void pending_recalc_oldest_gens(struct cache_set *c, struct list_head *l)
-{
- struct moving_io *io;
-
- list_for_each_entry(io, l, list) {
- /*
- * This only marks the (replacement) key and not the
- * insertion key in the bch_write_op, as the insertion
- * key should be a subset of the replacement key except
- * for any new pointers added by the write, and those
- * don't need to be marked because they are pointing
- * to open buckets until the write completes
- */
- bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(&io->key));
- }
-}
-
-void bch_queue_recalc_oldest_gens(struct cache_set *c, struct moving_queue *q)
-{
- unsigned long flags;
-
- /* 1st, mark the keylist keys */
- bch_keylist_recalc_oldest_gens(c, &q->keys);
-
- /* 2nd, mark the keys in the I/Os */
- spin_lock_irqsave(&q->lock, flags);
-
- pending_recalc_oldest_gens(c, &q->pending);
- pending_recalc_oldest_gens(c, &q->write_pending);
-
- spin_unlock_irqrestore(&q->lock, flags);
+ closure_call(&io->op.cl, bch_write, NULL, &io->cl);
+ closure_return_with_destructor(&io->cl, bch_moving_io_destructor);
}
static void read_moving_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct moving_queue *q = io->q;
- struct moving_context *ctxt = io->context;
- bool stopped;
- unsigned long flags;
-
- if (bio->bi_error) {
+ if (bio->bi_error)
io->op.error = bio->bi_error;
- moving_error(io->context, MOVING_FLAG_READ);
- }
-
- bio_put(bio);
-
- spin_lock_irqsave(&q->lock, flags);
-
- trace_bcache_move_read_done(q, &io->key.k);
-
- BUG_ON(!io->read_issued);
- BUG_ON(io->read_completed);
- io->read_issued = 0;
- io->read_completed = 1;
- BUG_ON(!q->read_count);
- q->read_count--;
- stopped = q->stopped;
- if (stopped)
- list_del_init(&io->list);
- spin_unlock_irqrestore(&q->lock, flags);
- if (stopped)
- closure_return_with_destructor(&io->cl,
- moving_io_destructor);
- else if (!q->rotational)
- bch_queue_write(q);
-
- bch_moving_notify(ctxt);
+ closure_put(cl);
}
static void __bch_data_move(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct extent_pick_ptr pick;
- u64 size = io->key.k.size;
- bch_extent_pick_ptr_avoiding(io->op.c, bkey_i_to_s_c(&io->key),
- io->context->avoid, &pick);
+ bch_extent_pick_ptr(io->op.c,
+ bkey_i_to_s_c(&io->key),
+ &pick);
if (IS_ERR_OR_NULL(pick.ca))
- closure_return_with_destructor(cl, moving_io_destructor);
-
- io->context->keys_moved++;
- io->context->sectors_moved += size;
- if (io->context->rate)
- bch_ratelimit_increment(io->context->rate, size);
+ closure_return_with_destructor(cl, bch_moving_io_destructor);
io->rbio.bio.bi_rw = READ;
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k);
io->rbio.bio.bi_end_io = read_moving_endio;
+ closure_get(cl);
bch_read_extent(io->op.c, &io->rbio,
bkey_i_to_s_c(&io->key),
&pick, BCH_READ_IS_LAST);
-}
-
-/*
- * bch_queue_full() - return if more reads can be queued with bch_data_move().
- *
- * In rotational mode, always returns false if no reads are in flight (see
- * how max_count is initialized in bch_queue_init()).
- */
-bool bch_queue_full(struct moving_queue *q)
-{
- unsigned long flags;
- bool full;
- spin_lock_irqsave(&q->lock, flags);
- BUG_ON(q->count > q->max_count);
- BUG_ON(q->read_count > q->max_read_count);
- full = (q->count == q->max_count ||
- q->read_count == q->max_read_count);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return full;
-}
-
-static int moving_io_cmp(struct moving_io *io1, struct moving_io *io2)
-{
- if (io1->sort_key < io2->sort_key)
- return -1;
- else if (io1->sort_key > io2->sort_key)
- return 1;
- else {
- /* We don't want duplicate keys. Eventually, we will have
- * support for GC with duplicate pointers -- for now,
- * just sort them randomly instead */
- if (io1 < io2)
- return -1;
- else if (io1 > io2)
- return 1;
- BUG();
- }
+ continue_at(cl, write_moving, io->op.io_wq); /* XXX different wq */
}
-void bch_data_move(struct moving_queue *q,
- struct moving_context *ctxt,
- struct moving_io *io)
+void bch_data_move(struct move_context *m, struct moving_io *io)
{
- unsigned long flags;
- bool stopped = false;
-
- BUG_ON(q->wq == NULL);
- io->q = q;
- io->context = ctxt;
-
- spin_lock_irqsave(&q->lock, flags);
- if (q->stopped) {
- stopped = true;
- goto out;
- }
-
- q->count++;
- list_add_tail(&io->list, &q->pending);
- trace_bcache_move_read(q, &io->key.k);
-
- if (q->rotational)
- BUG_ON(RB_INSERT(&q->tree, io, node, moving_io_cmp));
- else {
- BUG_ON(io->read_issued);
- io->read_issued = 1;
- q->read_count++;
- }
+ unsigned nr_pages = DIV_ROUND_UP(io->key.k.size, PAGE_SECTORS);
-out:
- spin_unlock_irqrestore(&q->lock, flags);
+ while (nr_pages--)
+ down(&m->nr_pages_limit);
- if (stopped)
- moving_io_free(io);
- else if (!q->rotational)
- closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl);
+ closure_call(&io->cl, __bch_data_move, NULL, &m->cl);
}
-/* Rotational device queues */
-
-static bool bch_queue_read(struct moving_queue *q,
- struct moving_context *ctxt)
+struct moving_io *bch_moving_io_alloc(struct bkey_s_c k)
{
- unsigned long flags;
- struct rb_node *node;
struct moving_io *io;
- bool stopped;
-
- BUG_ON(!q->rotational);
-
- spin_lock_irqsave(&q->lock, flags);
- node = rb_first(&q->tree);
- if (!node) {
- spin_unlock_irqrestore(&q->lock, flags);
- return false;
- }
-
- io = rb_entry(node, struct moving_io, node);
- rb_erase(node, &q->tree);
- io->read_issued = 1;
- q->read_count++;
- stopped = q->stopped;
- spin_unlock_irqrestore(&q->lock, flags);
- if (stopped) {
- moving_io_destructor(&io->cl);
- return false;
- } else {
- closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl);
- return true;
- }
-}
-
-void bch_queue_run(struct moving_queue *q, struct moving_context *ctxt)
-{
- unsigned long flags;
- bool full;
-
- if (!q->rotational)
- goto sync;
+ io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
+ * DIV_ROUND_UP(k.k->size, PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ return NULL;
- while (!bch_moving_context_wait(ctxt)) {
- spin_lock_irqsave(&q->lock, flags);
- full = (q->read_count == q->max_read_count);
- spin_unlock_irqrestore(&q->lock, flags);
+ bkey_reassemble(&io->key, k);
- if (full) {
- bch_moving_wait(ctxt);
- continue;
- }
+ moving_init(io);
- if (!bch_queue_read(q, ctxt))
- break;
+ if (bio_alloc_pages(&io->bio.bio.bio, GFP_KERNEL)) {
+ kfree(io);
+ return NULL;
}
- while (bch_queue_reads_pending(q))
- bch_moving_wait(ctxt);
-
- bch_queue_write(q);
-
-sync:
- closure_sync(&ctxt->cl);
+ return io;
}
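[Editor's sketch] In the rewritten move.c, __bch_data_move() takes closure_get() before handing the bio to bch_read_extent(), read_moving_endio() drops that reference with closure_put(), and continue_at() arranges for write_moving() to run on the write workqueue once the reference count drains. A rough, single-threaded illustration of that get/put-then-continue pattern follows; the struct and function names are invented for the sketch and are not the kernel closure API.

#include <stdio.h>

struct toy_closure {
	int remaining;
	void (*next)(struct toy_closure *);
};

static void toy_get(struct toy_closure *cl) { cl->remaining++; }

static void toy_put(struct toy_closure *cl)
{
	if (--cl->remaining == 0 && cl->next)
		cl->next(cl);		/* continue_at() analogue */
}

static void write_stage(struct toy_closure *cl)
{
	(void) cl;
	printf("read finished, issuing write\n");
}

static void fake_read_completion(struct toy_closure *cl)
{
	toy_put(cl);			/* read_moving_endio() analogue */
}

int main(void)
{
	struct toy_closure cl = { .remaining = 1, .next = write_stage };

	toy_get(&cl);			/* matches closure_get() before the read */
	fake_read_completion(&cl);	/* the "read" completes */
	toy_put(&cl);			/* submitter drops its own reference */
	return 0;
}

Either ordering works: whichever of the completion handler and the submitter drops the last reference is the one that kicks off the write stage.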
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 2f9998e66e7f..4c5433e36abf 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -3,95 +3,25 @@
#include "buckets.h"
#include "io_types.h"
+#include <linux/semaphore.h>
-enum moving_purpose {
- MOVING_PURPOSE_UNKNOWN, /* Un-init */
- MOVING_PURPOSE_MIGRATION,
- MOVING_PURPOSE_TIERING,
- MOVING_PURPOSE_COPY_GC,
-};
-
-enum moving_flag_bitnos {
- MOVING_FLAG_BITNO_READ = 0,
- MOVING_FLAG_BITNO_WRITE,
-};
-
-#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ)
-#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE)
-
-struct moving_context {
- /* Closure for waiting on all reads and writes to complete */
+struct move_context {
struct closure cl;
-
- /* Number and types of errors reported */
- atomic_t error_count;
- atomic_t error_flags;
-
- /* If != 0, @task is waiting for a read or write to complete */
- atomic_t pending;
- struct task_struct *task;
-
- /* Key and sector moves issued, updated from submission context */
- u64 keys_moved;
- u64 sectors_moved;
-
- /* Last key scanned */
- struct bpos last_scanned;
-
- /* Rate-limiter counting submitted reads */
- struct bch_ratelimit *rate;
-
- /* Try to avoid reading the following device */
- struct cache *avoid;
-
- /* Debugging... */
- enum moving_purpose purpose;
+ struct semaphore nr_pages_limit;
};
-void bch_moving_context_init(struct moving_context *, struct bch_ratelimit *,
- enum moving_purpose);
-
-static inline int bch_moving_context_wait(struct moving_context *ctxt)
+static inline void move_context_init(struct move_context *m)
{
- if (ctxt->rate == NULL)
- return 0;
-
- return bch_ratelimit_wait_freezable_stoppable(ctxt->rate, &ctxt->cl);
+ closure_init_stack(&m->cl);
+ sema_init(&m->nr_pages_limit, (8 << 20) / PAGE_SIZE);
}
-void bch_moving_wait(struct moving_context *);
-
struct moving_io {
- struct list_head list;
- struct rb_node node;
struct closure cl;
- struct moving_queue *q;
+
struct bch_write_op op;
struct bch_replace_info replace;
- struct moving_context *context;
BKEY_PADDED(key);
- /* Sort key for moving_queue->tree */
- u64 sort_key;
- /* Protected by q->lock */
-
- /*
- * 1) !read_issued && !read_completed
- * - Closure is not running yet, starts when read_issued is set
- * - IO is in q->tree (if q->rotational) and q->pending
- * 2) !write_issued && !write_completed:
- * - IO is in q->pending
- * 3) write_issued:
- * - IO is in q->write_pending
- * 4) write_completed:
- * - Closure is about to return and the IO is about to be freed
- *
- * If read_issued, we hold a reference on q->read_count
- * If write_issued, we hold a reference on q->write_count
- * Until IO is freed, we hold a reference on q->count
- */
- unsigned read_issued:1;
- unsigned read_completed:1;
- unsigned write_issued:1;
struct bch_read_bio rbio;
struct bch_write_bio wbio;
@@ -99,67 +29,8 @@ struct moving_io {
struct bio_vec bi_inline_vecs[0];
};
-struct moving_io *moving_io_alloc(struct bkey_s_c);
-void moving_io_free(struct moving_io *);
-
-typedef struct moving_io *(moving_queue_fn)(struct moving_queue *,
- struct moving_context *);
-
-int bch_queue_init(struct moving_queue *,
- struct cache_set *,
- unsigned max_keys,
- unsigned max_ios,
- unsigned max_reads,
- unsigned max_writes,
- bool rotational,
- const char *);
-void bch_queue_start(struct moving_queue *);
-bool bch_queue_full(struct moving_queue *);
-void bch_data_move(struct moving_queue *,
- struct moving_context *,
- struct moving_io *);
-void queue_io_resize(struct moving_queue *,
- unsigned,
- unsigned,
- unsigned);
-void bch_queue_destroy(struct moving_queue *);
-void bch_queue_stop(struct moving_queue *);
-
-void bch_queue_recalc_oldest_gens(struct cache_set *, struct moving_queue *);
-
-void bch_queue_run(struct moving_queue *, struct moving_context *);
-
-#define sysfs_queue_attribute(name) \
- rw_attribute(name##_max_count); \
- rw_attribute(name##_max_read_count); \
- rw_attribute(name##_max_write_count); \
- rw_attribute(name##_max_keys)
-
-#define sysfs_queue_files(name) \
- &sysfs_##name##_max_count, \
- &sysfs_##name##_max_read_count, \
- &sysfs_##name##_max_write_count, \
- &sysfs_##name##_max_keys
-
-#define sysfs_queue_show(name, var) \
-do { \
- sysfs_hprint(name##_max_count, (var)->max_count); \
- sysfs_print(name##_max_read_count, (var)->max_read_count); \
- sysfs_print(name##_max_write_count, (var)->max_write_count);\
- sysfs_print(name##_max_keys, bch_scan_keylist_size(&(var)->keys));\
-} while (0)
-
-#define sysfs_queue_store(name, var) \
-do { \
- sysfs_strtoul(name##_max_count, (var)->max_count); \
- sysfs_strtoul(name##_max_read_count, (var)->max_read_count); \
- sysfs_strtoul(name##_max_write_count, (var)->max_write_count); \
- if (attr == &sysfs_##name##_max_keys) { \
- int v = strtoi_h_or_return(buf); \
- \
- v = clamp(v, 2, KEYLIST_MAX); \
- bch_scan_keylist_resize(&(var)->keys, v); \
- } \
-} while (0)
+void bch_moving_io_free(struct moving_io *);
+struct moving_io *bch_moving_io_alloc(struct bkey_s_c);
+void bch_data_move(struct move_context *, struct moving_io *);
#endif /* _BCACHE_MOVE_H */
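[Editor's sketch] move_context replaces the old moving_queue counters with a single counting semaphore: bch_data_move() downs nr_pages_limit once per page of the extent before issuing the read, and the io destructor ups it once per page, so roughly 8 MB of moving data (the value from move_context_init() above) is in flight at a time. A user-space sketch of the same throttle using POSIX semaphores; the names, the byte-based interface, and the 4 KB page size are assumptions of the sketch.

#include <semaphore.h>
#include <stdio.h>

#define PAGE_SIZE		4096u
#define MOVE_BYTES_IN_FLIGHT	(8u << 20)	/* from move_context_init() */

struct toy_move_context {
	sem_t nr_pages_limit;
};

static void toy_move_context_init(struct toy_move_context *m)
{
	sem_init(&m->nr_pages_limit, 0, MOVE_BYTES_IN_FLIGHT / PAGE_SIZE);
}

/* called before issuing the read for an extent of @bytes */
static void toy_move_start(struct toy_move_context *m, unsigned bytes)
{
	unsigned nr_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;

	while (nr_pages--)
		sem_wait(&m->nr_pages_limit);	/* blocks when ~8 MB is in flight */
}

/* called from the io destructor once the write completes */
static void toy_move_finish(struct toy_move_context *m, unsigned bytes)
{
	unsigned nr_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;

	while (nr_pages--)
		sem_post(&m->nr_pages_limit);
}

int main(void)
{
	struct toy_move_context m;

	toy_move_context_init(&m);
	toy_move_start(&m, 128 << 10);	/* "read" a 128 KB extent */
	toy_move_finish(&m, 128 << 10);	/* and complete it */
	printf("ok\n");
	return 0;
}

Because the down blocks the submitting thread, the btree scans in migrate.c and rebalance.c naturally stall when the pipeline is full, which is what the removed max_count/read_count/write_count bookkeeping used to accomplish.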
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
deleted file mode 100644
index d5e1a4a968fa..000000000000
--- a/fs/bcachefs/move_types.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef _BCACHE_MOVE_TYPES_H
-#define _BCACHE_MOVE_TYPES_H
-
-/*
- * We rely on moving_queue being kzalloc'd so that the initial value of
- * the flags is 0.
- */
-
-struct moving_queue {
- struct work_struct work;
- struct scan_keylist keys;
- struct workqueue_struct *wq;
-
- /* Configuration */
- unsigned max_count;
- unsigned max_read_count;
- unsigned max_write_count;
-
- /*
- * If true, reads are coming from rotational media. All reads
- * are queued up on @tree and sorted by physical location prior
- * to being submitted.
- */
- bool rotational;
-
- /* This can be examined without locking */
- bool stopped;
-
- /* Protects everything below */
- spinlock_t lock;
-
- struct closure *stop_waitcl;
-
- /*
- * Tree of struct moving_io, sorted by moving_io->sort_key.
- * Contains reads which have not yet been issued; when a read is
- * issued, it is removed from the tree.
- *
- * Only used if @rotational is set.
- */
- struct rb_root tree;
-
- /*
- * List of struct moving_io, sorted by logical offset.
- * Contains writes which have not yet been issued; when a write is
- * issued, it is removed from the list.
- *
- * Writes are issued in logical offset order, and only when all
- * prior writes have been issued.
- */
- struct list_head pending;
-
- /*
- * List of struct moving_io, sorted by logical offset.
- *
- * Contains writes which are in-flight.
- */
- struct list_head write_pending;
-
- unsigned count;
- unsigned read_count;
- unsigned write_count;
-};
-
-#endif /* _BCACHE_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
deleted file mode 100644
index 0c77ea6c808c..000000000000
--- a/fs/bcachefs/movinggc.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Moving/copying garbage collector
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "buckets.h"
-#include "clock.h"
-#include "extents.h"
-#include "io.h"
-#include "keylist.h"
-#include "move.h"
-#include "movinggc.h"
-
-#include <trace/events/bcache.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-
-/* Moving GC - IO loop */
-
-static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k)
-{
- struct cache *ca = container_of(kl, struct cache,
- moving_gc_queue.keys);
- struct cache_set *c = ca->set;
- const struct bch_extent_ptr *ptr;
- bool ret = false;
-
- if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
- rcu_read_lock();
- extent_for_each_ptr(e, ptr)
- if (PTR_CACHE(c, ptr) == ca &&
- PTR_BUCKET(ca, ptr)->copygc_gen)
- ret = true;
- rcu_read_unlock();
- }
-
- return ret;
-}
-
-static int issue_moving_gc_move(struct moving_queue *q,
- struct moving_context *ctxt,
- struct bkey_i *k)
-{
- struct cache *ca = container_of(q, struct cache, moving_gc_queue);
- struct cache_set *c = ca->set;
- struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
- struct moving_io *io;
- unsigned gen;
-
- io = moving_io_alloc(bkey_i_to_s_c(k));
- if (!io) {
- trace_bcache_moving_gc_alloc_fail(c, k->k.size);
- return -ENOMEM;
- }
-
- bch_replace_init(&io->replace, bkey_i_to_s_c(k));
-
- bch_write_op_init(&io->op, c, &io->wbio,
- (struct disk_reservation) { 0 },
- NULL, bkey_i_to_s_c(k),
- &io->replace.hook, NULL,
- bkey_extent_is_cached(&k->k)
- ? BCH_WRITE_CACHED : 0);
- io->op.nr_replicas = 1;
-
- e = bkey_i_to_s_extent(&io->op.insert_key);
-
- extent_for_each_ptr(e, ptr)
- if ((ca->sb.nr_this_dev == ptr->dev) &&
- (gen = PTR_BUCKET(ca, ptr)->copygc_gen)) {
- gen--;
- BUG_ON(gen > ARRAY_SIZE(ca->gc_buckets));
- io->op.wp = &ca->gc_buckets[gen];
- io->sort_key = ptr->offset;
- bch_extent_drop_ptr(e, ptr);
- goto found;
- }
-
- /* We raced - bucket's been reused */
- moving_io_free(io);
- goto out;
-found:
- trace_bcache_gc_copy(&k->k);
-
- /*
- * IMPORTANT: We must call bch_data_move before we dequeue so
- * that the key can always be found in either the pending list
- * in the moving queue or in the scan keylist list in the
- * moving queue.
- * If we reorder, there is a window where a key is not found
- * by btree gc marking.
- */
- bch_data_move(q, ctxt, io);
-out:
- bch_scan_keylist_dequeue(&q->keys);
- return 0;
-}
-
-static void read_moving(struct cache *ca, struct moving_context *ctxt)
-{
- struct bkey_i *k;
- bool again;
-
- bch_ratelimit_reset(&ca->moving_gc_pd.rate);
-
- do {
- again = false;
-
- while (!bch_moving_context_wait(ctxt)) {
- if (bch_queue_full(&ca->moving_gc_queue)) {
- if (ca->moving_gc_queue.rotational) {
- again = true;
- break;
- } else {
- bch_moving_wait(ctxt);
- continue;
- }
- }
-
- k = bch_scan_keylist_next_rescan(
- ca->set,
- &ca->moving_gc_queue.keys,
- &ctxt->last_scanned,
- POS_MAX,
- moving_pred);
-
- if (k == NULL)
- break;
-
- if (issue_moving_gc_move(&ca->moving_gc_queue,
- ctxt, k)) {
- /*
- * Memory allocation failed; we will wait for
- * all queued moves to finish and continue
- * scanning starting from the same key
- */
- again = true;
- break;
- }
- }
-
- bch_queue_run(&ca->moving_gc_queue, ctxt);
- } while (!kthread_should_stop() && again);
-}
-
-static void bch_moving_gc(struct cache *ca)
-{
- struct cache_set *c = ca->set;
- struct bucket *g;
-
- u64 sectors_to_move, sectors_gen, gen_current, sectors_total;
- size_t buckets_to_move, buckets_unused = 0;
- struct bucket_heap_entry e;
- unsigned sectors_used, i;
- int reserve_sectors;
-
- struct moving_context ctxt;
-
- bch_moving_context_init(&ctxt, &ca->moving_gc_pd.rate,
- MOVING_PURPOSE_COPY_GC);
-
- /*
- * We won't fill up the moving GC reserve completely if the data
- * being copied is from different generations. In the worst case,
- * there will be NUM_GC_GENS buckets of internal fragmentation
- */
-
- spin_lock(&ca->freelist_lock);
- reserve_sectors = ca->mi.bucket_size *
- (fifo_used(&ca->free[RESERVE_MOVINGGC]) - NUM_GC_GENS);
- spin_unlock(&ca->freelist_lock);
-
- if (reserve_sectors < (int) c->sb.block_size) {
- trace_bcache_moving_gc_reserve_empty(ca);
- return;
- }
-
- trace_bcache_moving_gc_start(ca);
-
- /*
- * Find buckets with lowest sector counts, skipping completely
- * empty buckets, by building a maxheap sorted by sector count,
- * and repeatedly replacing the maximum element until all
- * buckets have been visited.
- */
-
- mutex_lock(&ca->heap_lock);
- ca->heap.used = 0;
- for_each_bucket(g, ca) {
- g->copygc_gen = 0;
-
- if (bucket_unused(g)) {
- buckets_unused++;
- continue;
- }
-
- if (g->mark.owned_by_allocator ||
- g->mark.is_metadata)
- continue;
-
- sectors_used = bucket_sectors_used(g);
-
- if (sectors_used >= ca->mi.bucket_size)
- continue;
-
- bucket_heap_push(ca, g, sectors_used);
- }
-
- sectors_to_move = 0;
- for (i = 0; i < ca->heap.used; i++)
- sectors_to_move += ca->heap.data[i].val;
-
- /* XXX: calculate this threshold rigorously */
-
- if (ca->heap.used < ca->free_inc.size / 2 &&
- sectors_to_move < reserve_sectors) {
- mutex_unlock(&ca->heap_lock);
- trace_bcache_moving_gc_no_work(ca);
- return;
- }
-
- while (sectors_to_move > reserve_sectors) {
- BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
- sectors_to_move -= e.val;
- }
-
- buckets_to_move = ca->heap.used;
-
- /*
- * resort by write_prio to group into generations, attempts to
- * keep hot and cold data in the same locality.
- */
-
- mutex_lock(&ca->set->bucket_lock);
- for (i = 0; i < ca->heap.used; i++) {
- struct bucket_heap_entry *e = &ca->heap.data[i];
-
- e->val = (c->prio_clock[WRITE].hand - e->g->write_prio);
- }
-
- heap_resort(&ca->heap, bucket_max_cmp);
-
- sectors_gen = sectors_to_move / NUM_GC_GENS;
- gen_current = 1;
- sectors_total = 0;
-
- while (heap_pop(&ca->heap, e, bucket_max_cmp)) {
- sectors_total += bucket_sectors_used(e.g);
- e.g->copygc_gen = gen_current;
- if (gen_current < NUM_GC_GENS &&
- sectors_total >= sectors_gen * gen_current)
- gen_current++;
- }
- mutex_unlock(&ca->set->bucket_lock);
-
- mutex_unlock(&ca->heap_lock);
-
- read_moving(ca, &ctxt);
-
- trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
- buckets_to_move);
-}
-
-static int bch_moving_gc_thread(void *arg)
-{
- struct cache *ca = arg;
- struct cache_set *c = ca->set;
- struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last;
- s64 next;
-
- set_freezable();
-
- while (!kthread_should_stop()) {
- if (kthread_wait_freezable(c->copy_gc_enabled))
- break;
-
- last = atomic_long_read(&clock->now);
- /*
- * don't start copygc until less than half the gc reserve is
- * available:
- */
- next = (buckets_available_cache(ca) -
- div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
- c->opts.gc_reserve_percent, 200)) *
- ca->mi.bucket_size;
-
- if (next <= 0)
- bch_moving_gc(ca);
- else
- bch_kthread_io_clock_wait(clock, last + next);
- }
-
- return 0;
-}
-
-#define MOVING_GC_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE
-#define MOVING_GC_NR 64
-#define MOVING_GC_READ_NR 32
-#define MOVING_GC_WRITE_NR 32
-
-int bch_moving_init_cache(struct cache *ca)
-{
- bool rotational = !blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev));
-
- bch_pd_controller_init(&ca->moving_gc_pd);
- ca->moving_gc_pd.d_term = 0;
-
- return bch_queue_init(&ca->moving_gc_queue,
- ca->set,
- MOVING_GC_KEYS_MAX_SIZE,
- MOVING_GC_NR,
- MOVING_GC_READ_NR,
- MOVING_GC_WRITE_NR,
- rotational,
- "bch_copygc_write");
-}
-
-int bch_moving_gc_thread_start(struct cache *ca)
-{
- struct task_struct *t;
-
- /* The moving gc read thread must be stopped */
- BUG_ON(ca->moving_gc_read != NULL);
-
- bch_queue_start(&ca->moving_gc_queue);
-
- if (cache_set_init_fault("moving_gc_start"))
- return -ENOMEM;
-
- t = kthread_create(bch_moving_gc_thread, ca, "bch_copygc_read");
- if (IS_ERR(t))
- return PTR_ERR(t);
-
- ca->moving_gc_read = t;
- wake_up_process(ca->moving_gc_read);
-
- return 0;
-}
-
-void bch_moving_gc_stop(struct cache *ca)
-{
- ca->moving_gc_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&ca->moving_gc_pd.rate);
-
- bch_queue_stop(&ca->moving_gc_queue);
-
- if (ca->moving_gc_read)
- kthread_stop(ca->moving_gc_read);
- ca->moving_gc_read = NULL;
-
- /*
- * Make sure that it is empty so that gc marking doesn't keep
- * marking stale entries from when last used.
- */
- bch_scan_keylist_reset(&ca->moving_gc_queue.keys);
-}
-
-void bch_moving_gc_destroy(struct cache *ca)
-{
- bch_queue_destroy(&ca->moving_gc_queue);
-}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
deleted file mode 100644
index 5d09e0fa3ae1..000000000000
--- a/fs/bcachefs/movinggc.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _BCACHE_MOVINGGC_H
-#define _BCACHE_MOVINGGC_H
-
-int bch_moving_init_cache(struct cache *);
-void bch_moving_gc_stop(struct cache *);
-int bch_moving_gc_thread_start(struct cache *);
-void bch_moving_gc_destroy(struct cache *);
-
-#endif
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
new file mode 100644
index 000000000000..cedad5462da3
--- /dev/null
+++ b/fs/bcachefs/rebalance.c
@@ -0,0 +1,467 @@
+/*
+ * Copygc, tiering:
+ */
+
+#include "bcache.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "io.h"
+#include "move.h"
+
+#include <trace/events/bcache.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+
+/*
+ * XXX preserve ordering when reads complete out of order
+ *
+ * do performance testing with disk write cache off
+ */
+
+static inline bool rebalance_entry_sectors_cmp(struct rebalance_bucket_entry l,
+ struct rebalance_bucket_entry r)
+{
+ return l.sectors < r.sectors;
+}
+
+static int rebalance_entry_bucket_cmp(const void *_l, const void *_r)
+{
+ const struct rebalance_bucket_entry *l = _l;
+ const struct rebalance_bucket_entry *r = _r;
+
+ if (l->dev != r->dev)
+ return l->dev < r->dev ? -1 : 1;
+ if (l->bucket != r->bucket)
+ return l->bucket < r->bucket ? -1 : 1;
+ return 0;
+}
+
+static inline void rebalance_heap_push(struct rebalance_thread *r,
+ size_t bucket, u8 dev,
+ u8 gen, unsigned sectors)
+{
+ struct rebalance_bucket_entry new = {
+ .bucket = bucket,
+ .dev = dev,
+ .gen = gen,
+ .sectors = sectors,
+ };
+
+ if (!heap_full(&r->heap))
+ heap_add(&r->heap, new, rebalance_entry_sectors_cmp);
+ else if (rebalance_entry_sectors_cmp(new, heap_peek(&r->heap))) {
+ r->heap.data[0] = new;
+ heap_sift(&r->heap, 0, rebalance_entry_sectors_cmp);
+ }
+}
+
+/* returns nr of extents that should be written to this tier: */
+static unsigned should_tier_extent(struct cache_set *c,
+ struct rebalance_thread *r,
+ struct cache_member_rcu *mi,
+ struct bkey_s_c_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned replicas = 0;
+
+ /* Make sure we have room to add a new pointer: */
+ if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+ BKEY_EXTENT_VAL_U64s_MAX)
+ return false;
+
+ extent_for_each_ptr(e, ptr)
+ if (PTR_TIER(mi, ptr) >= r->tier)
+ replicas++;
+
+ return replicas < c->opts.data_replicas
+ ? c->opts.data_replicas - replicas
+ : 0;
+}
+
+static bool should_copygc_ptr(struct cache_set *c,
+ struct rebalance_thread *r,
+ struct cache_member_rcu *mi,
+ const struct bch_extent_ptr *ptr)
+{
+ struct cache *ca;
+ bool ret = false;
+
+ if (PTR_TIER(mi, ptr) == r->tier &&
+ (ca = PTR_CACHE(c, ptr))) {
+ struct rebalance_bucket_entry *e, s = {
+ .dev = ptr->dev,
+ .bucket = PTR_BUCKET_NR(ca, ptr),
+ };
+
+ mutex_lock(&r->heap_lock);
+
+ e = bsearch(&s,
+ r->heap.data,
+ r->heap.used,
+ sizeof(r->heap.data[0]),
+ rebalance_entry_bucket_cmp);
+ if (e &&
+ e->gen == ptr->gen &&
+ e->gen == PTR_BUCKET_GEN(ca, ptr))
+ ret = true;
+
+ mutex_unlock(&r->heap_lock);
+ }
+
+ return ret;
+}
+
+static bool rebalance_pred(struct cache_set *c,
+ struct rebalance_thread *r,
+ struct bkey_s_c k)
+{
+ bool need_tier = false, need_copygc = false;
+
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_rcu *mi = cache_member_info_get(c);
+
+ if (should_tier_extent(c, r, mi, e))
+ need_tier = true;
+
+ extent_for_each_ptr(e, ptr)
+ if (should_copygc_ptr(c, r, mi, ptr))
+ need_copygc = true;
+
+ cache_member_info_put();
+ }
+
+ return need_tier || need_copygc;
+}
+
+static int rebalance_extent(struct cache_set *c,
+ struct rebalance_thread *r,
+ struct bkey_s_c k,
+ struct move_context *m)
+{
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct moving_io *io;
+	unsigned nr_new_replicas;
+ bool have_faster_extent = false;
+ struct cache_member_rcu *mi;
+
+ io = bch_moving_io_alloc(k);
+ if (!io) {
+ //trace_bcache_moving_gc_alloc_fail(c, k.k->size);
+ return -ENOMEM;
+ }
+
+ bch_replace_init(&io->replace, k);
+
+	/* XXX: how are reserves going to work here? */
+
+ bch_write_op_init(&io->op, c, &io->bio,
+ (struct disk_reservation) { 0 },
+ &r->wp, k,
+ &io->replace.hook, NULL,
+ bkey_extent_is_cached(k.k)
+ ? BCH_WRITE_CACHED : 0);
+
+ io->op.io_wq = r->wq;
+
+ e = bkey_i_to_s_extent(&io->op.insert_key);
+
+ mi = cache_member_info_get(c);
+
+	nr_new_replicas = should_tier_extent(c, r, mi, e.c);
+
+ extent_for_each_ptr_backwards(e, ptr) {
+ if (PTR_TIER(mi, ptr) < r->tier) {
+ if (have_faster_extent)
+ bch_extent_drop_ptr(e, ptr);
+ else
+ have_faster_extent = true;
+ }
+
+ if (should_copygc_ptr(c, r, mi, ptr)) {
+ bch_extent_drop_ptr(e, ptr);
+			nr_new_replicas++;
+ }
+ }
+
+ cache_member_info_put();
+
+	if (!nr_new_replicas) {
+ /* We raced - bucket's been reused */
+ bch_moving_io_free(io);
+ return 0;
+ }
+	io->op.nr_replicas = nr_new_replicas;
+
+ bch_data_move(m, io);
+ return 0;
+}
+
+static void rebalance_walk_extents(struct cache_set *c,
+ struct rebalance_thread *r)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct move_context m;
+
+ move_context_init(&m);
+ bch_ratelimit_reset(&r->pd.rate);
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) {
+ if (kthread_should_stop())
+ break;
+
+ if (rebalance_pred(c, r, k)) {
+ BKEY_PADDED(k) tmp;
+
+ bkey_reassemble(&tmp.k, k);
+ bch_btree_iter_unlock(&iter);
+
+ rebalance_extent(c, r,
+ bkey_i_to_s_c(&tmp.k),
+ &m);
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ bch_btree_iter_unlock(&iter);
+
+ closure_sync(&m.cl);
+}
+
+static void bch_rebalance(struct cache_set *c, struct rebalance_thread *r)
+{
+ struct cache_group devs, *tier = &c->cache_tiers[r->tier];
+ struct rebalance_bucket_entry e;
+ unsigned i, seq, sectors_used;
+ u64 sectors_to_move, reserve_sectors = 0;
+ size_t buckets_unused = 0;
+
+ rcu_read_lock();
+
+ do {
+ seq = read_seqcount_begin(&tier->lock);
+ devs = *tier;
+ } while (read_seqcount_retry(&tier->lock, seq));
+
+ for (i = 0; i < devs.nr_devices; i++)
+ percpu_ref_get(&rcu_dereference(devs.devices[i])->ref);
+
+ rcu_read_unlock();
+
+ mutex_lock(&r->heap_lock);
+
+ r->heap.used = 0;
+
+ for (i = 0; i < devs.nr_devices; i++) {
+ struct cache *ca =
+ rcu_dereference_protected(devs.devices[i], 1);
+ size_t bucket;
+
+ spin_lock(&ca->freelist_lock);
+ reserve_sectors += ca->mi.bucket_size *
+ fifo_used(&ca->free[RESERVE_MOVINGGC]);
+ spin_unlock(&ca->freelist_lock);
+
+ for (bucket = ca->mi.first_bucket;
+ bucket < ca->mi.nbuckets;
+ bucket++) {
+ struct bucket *g = ca->buckets + bucket;
+
+ if (bucket_unused(g)) {
+ buckets_unused++;
+ continue;
+ }
+
+ if (g->mark.owned_by_allocator ||
+ g->mark.is_metadata)
+ continue;
+
+ sectors_used = bucket_sectors_used(g);
+
+ if (sectors_used >= ca->mi.bucket_size)
+ continue;
+
+ rebalance_heap_push(r, bucket, ca->sb.nr_this_dev,
+ ca->bucket_gens[bucket],
+ sectors_used);
+ }
+ }
+
+	/*
+	 * XXX: if the reserve is empty, should we wait on the allocator?
+	 * Perhaps it just hasn't invalidated/discarded the buckets we
+	 * freed up on our last run?
+	 */
+ if (!reserve_sectors)
+ goto out_put;
+
+ sectors_to_move = 0;
+ for (i = 0; i < r->heap.used; i++)
+ sectors_to_move += r->heap.data[i].sectors;
+
+ /*
+ * If there's not enough work to do, bail out so we aren't scanning the
+ * btree unnecessarily:
+ *
+ * XXX: calculate this threshold rigorously
+ */
+#if 0
+ if (r->heap.used < ca->free_inc.size / 2 &&
+ sectors_to_move < reserve_sectors)
+ goto out_put;
+#endif
+
+	/* Pop buckets off until what's left fits into our reserve: */
+ while (sectors_to_move > reserve_sectors) {
+ BUG_ON(!heap_pop(&r->heap, e, rebalance_entry_sectors_cmp));
+ sectors_to_move -= e.sectors;
+ }
+
+ sort(r->heap.data,
+ r->heap.used,
+ sizeof(r->heap.data[0]),
+ rebalance_entry_bucket_cmp,
+ NULL);
+
+ mutex_unlock(&r->heap_lock);
+
+ for (i = 0; i < devs.nr_devices; i++)
+ percpu_ref_put(&rcu_dereference_protected(devs.devices[i],
+ 1)->ref);
+
+ rebalance_walk_extents(c, r);
+ return;
+
+out_put:
+ mutex_unlock(&r->heap_lock);
+ for (i = 0; i < devs.nr_devices; i++)
+ percpu_ref_put(&rcu_dereference(devs.devices[i])->ref);
+}
+
+static int bch_rebalance_thread(void *arg)
+{
+ struct rebalance_thread *r = arg;
+ struct cache_set *c = container_of(r, struct cache_set,
+ rebalance[r->tier]);
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last;
+ //bool moved;
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(c->copy_gc_enabled ||
+ c->tiering_enabled))
+ break;
+
+ last = atomic_long_read(&clock->now);
+
+ bch_rebalance(c, r);
+
+		/*
+		 * This really should be library code, but it has to be
+		 * kthread-specific... ugh
+		 */
+#if 0
+ if (!moved)
+ bch_kthread_io_clock_wait(clock,
+ last + ca->free_inc.size / 2);
+#endif
+ }
+
+ return 0;
+}
+
+static void bch_rebalance_exit_tier(struct rebalance_thread *r)
+{
+ if (r->p)
+ kthread_stop(r->p);
+ r->p = NULL;
+ if (r->wq)
+ destroy_workqueue(r->wq);
+ r->wq = NULL;
+ free_heap(&r->heap);
+}
+
+void bch_rebalance_exit(struct cache_set *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->rebalance); i++)
+ bch_rebalance_exit_tier(&c->rebalance[i]);
+}
+
+/*
+ * Called whenever we add a device: initializes the per-tier rebalance thread,
+ * or resizes the heap if necessary
+ */
+int bch_rebalance_init(struct cache_set *c, struct cache *ca)
+{
+ unsigned tier = ca->mi.tier;
+ struct rebalance_thread *r = &c->rebalance[tier];
+ struct task_struct *p;
+ u64 nbuckets = 0;
+ size_t heap_size;
+ unsigned i;
+ typeof(r->heap) old_heap;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (!r->initialized) {
+ r->tier = tier;
+ mutex_init(&r->heap_lock);
+ r->wp.group = &c->cache_tiers[tier];
+ r->wp.reserve = RESERVE_MOVINGGC; /* XXX */
+ r->initialized = 1;
+ }
+
+ if (!r->wq)
+ r->wq = create_workqueue("bch_rebalance_io");
+ if (!r->wq)
+ return -ENOMEM;
+
+ if (!r->p) {
+ p = kthread_create(bch_rebalance_thread, r,
+ "bch_rebalance");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ r->p = p;
+ }
+
+ /* ca hasn't been added to array of devices yet: */
+ nbuckets += ca->mi.nbuckets;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i)
+ if (ca->mi.tier == tier)
+ nbuckets += ca->mi.nbuckets;
+ rcu_read_unlock();
+
+ mutex_lock(&r->heap_lock);
+ old_heap = r->heap;
+
+ heap_size = max_t(size_t, nbuckets >> 7, old_heap.used);
+ BUG_ON(!heap_size);
+
+ if (!init_heap(&r->heap, heap_size, GFP_KERNEL)) {
+ mutex_unlock(&r->heap_lock);
+ return -ENOMEM;
+ }
+
+ if (old_heap.data) {
+ memcpy(r->heap.data,
+ old_heap.data,
+ sizeof(old_heap.data[0]) * old_heap.used);
+ r->heap.used = old_heap.used;
+ free_heap(&old_heap);
+ }
+
+ mutex_unlock(&r->heap_lock);
+
+ return 0;
+}
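
The candidate selection above may be easier to follow in isolation: bch_rebalance() fills a bounded heap with the emptiest buckets in the tier, pops entries until what's left fits in the movinggc reserve, then sorts the survivors by (dev, bucket) so that should_copygc_ptr() can bsearch them for each extent pointer it is asked about. Below is a minimal userspace sketch of the keep-the-emptiest selection and the sorted lookup, assuming simplified, hypothetical types -- struct candidate, pick_candidates() and is_candidate() are illustrative only, and a linear scan stands in for the kernel's bounded heap macros.

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

struct candidate {
	size_t		bucket;
	unsigned char	dev;
	unsigned char	gen;
	unsigned	sectors;	/* live sectors left in this bucket */
};

static int cmp_dev_bucket(const void *_l, const void *_r)
{
	const struct candidate *l = _l, *r = _r;

	if (l->dev != r->dev)
		return l->dev < r->dev ? -1 : 1;
	if (l->bucket != r->bucket)
		return l->bucket < r->bucket ? -1 : 1;
	return 0;
}

/*
 * Keep the nr emptiest buckets out of all[0..n): a cheaper bucket evicts the
 * current worst candidate, which is what rebalance_heap_push() does with a
 * bounded heap instead of this linear scan.
 */
static size_t pick_candidates(struct candidate *out, size_t nr,
			      const struct candidate *all, size_t n)
{
	size_t used = 0, i, j, worst;

	for (i = 0; i < n; i++) {
		if (used < nr) {
			out[used++] = all[i];
			continue;
		}

		worst = 0;
		for (j = 1; j < used; j++)
			if (out[j].sectors > out[worst].sectors)
				worst = j;

		if (all[i].sectors < out[worst].sectors)
			out[worst] = all[i];
	}

	/*
	 * Sort by (dev, bucket) so extent pointers can be looked up with
	 * bsearch(), as should_copygc_ptr() does against r->heap.data:
	 */
	qsort(out, used, sizeof(*out), cmp_dev_bucket);
	return used;
}

static bool is_candidate(const struct candidate *sorted, size_t used,
			 unsigned char dev, size_t bucket, unsigned char gen)
{
	struct candidate key = { .bucket = bucket, .dev = dev };
	const struct candidate *e = bsearch(&key, sorted, used,
					    sizeof(*sorted), cmp_dev_bucket);

	/* a stale pointer (generation mismatch) is not worth copying */
	return e && e->gen == gen;
}

int main(void)
{
	const struct candidate buckets[] = {
		{ .bucket = 10, .dev = 0, .gen = 3, .sectors = 12 },
		{ .bucket = 11, .dev = 0, .gen = 7, .sectors = 900 },
		{ .bucket = 4,  .dev = 1, .gen = 1, .sectors = 40 },
	};
	struct candidate out[2];
	size_t used = pick_candidates(out, 2, buckets, 3);

	/* bucket 11 is nearly full, so it is not picked: */
	return !( is_candidate(out, used, 0, 10, 3) &&
		  is_candidate(out, used, 1, 4, 1) &&
		 !is_candidate(out, used, 0, 11, 7));
}

The sketch only compares one generation number; the patch additionally checks that the pointer's gen still matches the bucket's current gen (PTR_BUCKET_GEN), since the bucket may have been reused between heap construction and the btree walk.
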
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
new file mode 100644
index 000000000000..3a15dff7bdff
--- /dev/null
+++ b/fs/bcachefs/rebalance.h
@@ -0,0 +1,7 @@
+#ifndef _BCACHE_REBALANCE_H
+#define _BCACHE_REBALANCE_H
+
+void bch_rebalance_exit(struct cache_set *);
+int bch_rebalance_init(struct cache_set *, struct cache *);
+
+#endif /* _BCACHE_REBALANCE_H */
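
The tiering side of rebalance_pred() comes down to counting how many of an extent's pointers already live on the target tier and asking the write path for the difference; rebalance_extent() then sets op.nr_replicas to that count plus however many pointers it dropped for copygc. Here is a standalone sketch of the counting step, again with hypothetical simplified types -- struct extent, dev_to_tier[] and replicas_wanted() are stand-ins for the extent and cache-member-info machinery, not part of the patch.

#include <stdio.h>

#define MAX_PTRS	4

struct extent {
	unsigned	nr_ptrs;
	unsigned	ptr_dev[MAX_PTRS];	/* device index of each pointer */
};

/* example layout: devices 0-1 are tier 0 (fast), 2-3 are tier 1 (slow) */
static const unsigned dev_to_tier[] = { 0, 0, 1, 1 };

static unsigned replicas_wanted(const struct extent *e,
				unsigned target_tier, unsigned data_replicas)
{
	unsigned i, have = 0;

	/* count pointers that already live on the target tier or higher */
	for (i = 0; i < e->nr_ptrs; i++)
		if (dev_to_tier[e->ptr_dev[i]] >= target_tier)
			have++;

	return have < data_replicas ? data_replicas - have : 0;
}

int main(void)
{
	/* a freshly written extent: one replica, on fast device 0 */
	struct extent e = { .nr_ptrs = 1, .ptr_dev = { 0 } };

	printf("new tier-1 replicas wanted: %u\n",
	       replicas_wanted(&e, 1, 1));	/* prints 1 */
	return 0;
}

With data_replicas = 1 and a single tier-0 pointer, the sketch reports one new tier-1 replica wanted, which is the case that makes rebalance_pred() pick up a freshly written extent for tiering.
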
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index ef8fb0dac003..beb0587be4ce 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -24,11 +24,10 @@
#include "keylist.h"
#include "move.h"
#include "migrate.h"
-#include "movinggc.h"
#include "notify.h"
+#include "rebalance.h"
#include "stats.h"
#include "super.h"
-#include "tier.h"
#include "writeback.h"
#include <linux/backing-dev.h>
@@ -683,15 +682,6 @@ static void __bch_cache_set_read_only(struct cache_set *c)
struct cache *ca;
unsigned i;
- c->tiering_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&c->tiering_pd.rate);
- bch_tiering_read_stop(c);
-
- for_each_cache(ca, c, i) {
- bch_tiering_write_stop(ca);
- bch_moving_gc_stop(ca);
- }
-
bch_gc_thread_stop(c);
bch_btree_flush(c);
@@ -804,7 +794,6 @@ void bch_cache_set_read_only_sync(struct cache_set *c)
static const char *__bch_cache_set_read_write(struct cache_set *c)
{
- struct cache *ca;
const char *err;
unsigned i;
@@ -822,22 +811,9 @@ static const char *__bch_cache_set_read_write(struct cache_set *c)
if (bch_gc_thread_start(c))
goto err;
- for_each_cache(ca, c, i) {
- if (ca->mi.state != CACHE_ACTIVE)
- continue;
-
- err = "error starting moving GC thread";
- if (bch_moving_gc_thread_start(ca)) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
-
- bch_tiering_write_start(ca);
- }
-
- err = "error starting tiering thread";
- if (bch_tiering_read_start(c))
- goto err;
+ for (i = 0; i < ARRAY_SIZE(c->rebalance); i++)
+ if (c->rebalance[i].p)
+ wake_up_process(c->rebalance[i].p);
schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
@@ -877,6 +853,7 @@ static void cache_set_free(struct cache_set *c)
cancel_work_sync(&c->bio_submit_work);
cancel_work_sync(&c->read_retry_work);
+ bch_rebalance_exit(c);
bch_bset_sort_state_free(&c->sort);
bch_btree_cache_free(c);
bch_journal_free(&c->journal);
@@ -1061,11 +1038,8 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work);
mutex_init(&c->mi_lock);
-
init_rwsem(&c->gc_lock);
mutex_init(&c->trigger_gc_lock);
- mutex_init(&c->gc_scan_keylist_lock);
- INIT_LIST_HEAD(&c->gc_scan_keylists);
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
spin_lock_init(&c->name##_time.lock);
@@ -1073,7 +1047,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
#undef BCH_TIME_STAT
bch_open_buckets_init(c);
- bch_tiering_init_cache_set(c);
INIT_LIST_HEAD(&c->list);
INIT_LIST_HEAD(&c->cached_devs);
@@ -1507,8 +1480,7 @@ static void __bch_cache_read_only(struct cache *ca)
{
trace_bcache_cache_read_only(ca);
- bch_tiering_write_stop(ca);
- bch_moving_gc_stop(ca);
+ /* XXX do stuff with rebalance thread */
/*
* This stops new data writes (e.g. to existing open data
@@ -1564,19 +1536,12 @@ static const char *__bch_cache_read_write(struct cache *ca)
trace_bcache_cache_read_write(ca);
- bch_tiering_write_start(ca);
-
- trace_bcache_cache_read_write_done(ca);
-
- /* XXX wtf? */
- return NULL;
-
- err = "error starting moving GC thread";
- if (!bch_moving_gc_thread_start(ca))
- err = NULL;
+ if (bch_cache_allocator_start(ca))
+ return "error starting allocator thread";
- wake_up_process(ca->set->tiering_read);
+ /* XXX notify rebalance thread? */
+ trace_bcache_cache_read_write_done(ca);
bch_notify_cache_read_write(ca);
return err;
@@ -1633,8 +1598,6 @@ static void bch_cache_free_work(struct work_struct *work)
* to unregister them before we drop our reference to
* @c.
*/
- bch_moving_gc_destroy(ca);
- bch_tiering_write_destroy(ca);
cancel_work_sync(&ca->io_error_work);
@@ -1890,9 +1853,6 @@ static const char *cache_alloc(struct bcache_superblock *sb,
kobject_init(&ca->kobj, &bch_cache_ktype);
- seqcount_init(&ca->self.lock);
- ca->self.nr_devices = 1;
- rcu_assign_pointer(ca->self.devices[0], ca);
ca->sb.nr_this_dev = sb->sb->nr_this_dev;
INIT_WORK(&ca->free_work, bch_cache_free_work);
@@ -1919,8 +1879,7 @@ static const char *cache_alloc(struct bcache_superblock *sb,
ca->bucket_bits = ilog2(ca->mi.bucket_size);
/* XXX: tune these */
- movinggc_reserve = max_t(size_t, NUM_GC_GENS * 2,
- ca->mi.nbuckets >> 7);
+ movinggc_reserve = ca->mi.nbuckets >> 7;
reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
free_inc_reserve = reserve_none << 1;
heap_size = max_t(size_t, free_inc_reserve, movinggc_reserve);
@@ -1946,8 +1905,7 @@ static const char *cache_alloc(struct bcache_superblock *sb,
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio.bio)) ||
!(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
- bch_moving_init_cache(ca) ||
- bch_tiering_init_cache(ca))
+ bch_rebalance_init(c, ca))
goto err;
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
@@ -1957,20 +1915,6 @@ static const char *cache_alloc(struct bcache_superblock *sb,
total_reserve += ca->free[i].size;
pr_debug("%zu buckets reserved", total_reserve);
- for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++) {
- ca->gc_buckets[i].reserve = RESERVE_MOVINGGC;
- ca->gc_buckets[i].group = &ca->self;
- }
-
- ca->tiering_write_point.reserve = RESERVE_NONE;
- ca->tiering_write_point.group = &ca->self;
-
- /* XXX: scan keylists will die */
- bch_scan_keylist_init(&ca->moving_gc_queue.keys, c,
- DFLT_SCAN_KEYLIST_MAX_SIZE);
- bch_scan_keylist_init(&ca->tiering_queue.keys, c,
- DFLT_SCAN_KEYLIST_MAX_SIZE);
-
kobject_get(&c->kobj);
ca->set = c;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index a901b5d8368a..446552c460ec 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -138,14 +138,14 @@ rw_attribute(cache_replacement_policy);
rw_attribute(foreground_write_ratelimit_enabled);
rw_attribute(copy_gc_enabled);
-sysfs_queue_attribute(copy_gc);
-sysfs_pd_controller_attribute(copy_gc);
+//sysfs_queue_attribute(copy_gc);
+//sysfs_pd_controller_attribute(copy_gc);
rw_attribute(tiering_enabled);
rw_attribute(tiering_percent);
sysfs_pd_controller_attribute(tiering);
-sysfs_queue_attribute(tiering);
-rw_attribute(tiering_stripe_size);
+//sysfs_queue_attribute(tiering);
+//rw_attribute(tiering_stripe_size);
sysfs_pd_controller_attribute(foreground_write);
@@ -701,7 +701,7 @@ SHOW(bch_cache_set)
sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
sysfs_print(tiering_percent, c->tiering_percent);
- sysfs_pd_controller_show(tiering, &c->tiering_pd);
+ //sysfs_pd_controller_show(tiering, &c->tiering_pd);
sysfs_print(btree_flush_delay, c->btree_flush_delay);
@@ -781,23 +781,26 @@ STORE(__bch_cache_set)
c->foreground_write_ratelimit_enabled);
if (attr == &sysfs_copy_gc_enabled) {
- struct cache *ca;
- unsigned i;
ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
?: (ssize_t) size;
+#if 0
+ struct cache *ca;
+ unsigned i;
for_each_cache(ca, c, i)
if (ca->moving_gc_read)
wake_up_process(ca->moving_gc_read);
+#endif
return ret;
}
if (attr == &sysfs_tiering_enabled) {
ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
?: (ssize_t) size;
-
+#if 0
if (c->tiering_read)
wake_up_process(c->tiering_read);
+#endif
return ret;
}
@@ -807,7 +810,6 @@ STORE(__bch_cache_set)
if (attr == &sysfs_journal_flush) {
bch_journal_meta_async(&c->journal, NULL);
-
return size;
}
@@ -816,7 +818,7 @@ STORE(__bch_cache_set)
sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
sysfs_strtoul(tiering_percent, c->tiering_percent);
- sysfs_pd_controller_store(tiering, &c->tiering_pd);
+ //sysfs_pd_controller_store(tiering, &c->tiering_pd);
/* Debugging: */
@@ -1210,13 +1212,13 @@ SHOW(bch_cache)
sysfs_print(free_buckets, buckets_free_cache(ca, RESERVE_NONE));
sysfs_print(has_data, ca->mi.has_data);
sysfs_print(has_metadata, ca->mi.has_metadata);
-
+#if 0
sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
sysfs_queue_show(copy_gc, &ca->moving_gc_queue);
sysfs_queue_show(tiering, &ca->tiering_queue);
sysfs_print(tiering_stripe_size, ca->tiering_stripe_size);
-
+#endif
if (attr == &sysfs_cache_replacement_policy)
return bch_snprint_string_list(buf, PAGE_SIZE,
cache_replacement_policies,
@@ -1250,13 +1252,13 @@ STORE(__bch_cache)
struct cache *ca = container_of(kobj, struct cache, kobj);
struct cache_set *c = ca->set;
struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev];
-
+#if 0
sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
sysfs_queue_store(copy_gc, &ca->moving_gc_queue);
sysfs_queue_store(tiering, &ca->tiering_queue);
sysfs_strtoul(tiering_stripe_size, ca->tiering_stripe_size);
-
+#endif
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@@ -1377,10 +1379,6 @@ static struct attribute *bch_cache_files[] = {
&sysfs_state_rw,
&sysfs_alloc_debug,
- sysfs_pd_controller_files(copy_gc),
- sysfs_queue_files(copy_gc),
- sysfs_queue_files(tiering),
- &sysfs_tiering_stripe_size,
NULL
};
KTYPE(bch_cache);
diff --git a/fs/bcachefs/tier.c b/fs/bcachefs/tier.c
deleted file mode 100644
index caf6b3df2c9c..000000000000
--- a/fs/bcachefs/tier.c
+++ /dev/null
@@ -1,466 +0,0 @@
-
-#include "bcache.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "clock.h"
-#include "extents.h"
-#include "io.h"
-#include "keylist.h"
-#include "move.h"
-#include "tier.h"
-
-#include <linux/delay.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <trace/events/bcache.h>
-
-/**
- * tiering_pred - check if tiering should copy an extent to tier 1
- */
-static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k)
-{
- struct cache *ca = container_of(kl, struct cache,
- tiering_queue.keys);
- struct cache_set *c = ca->set;
-
- if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- struct cache_member_rcu *mi;
- unsigned replicas = 0;
-
- /* Make sure we have room to add a new pointer: */
- if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
- BKEY_EXTENT_VAL_U64s_MAX)
- return false;
-
- mi = cache_member_info_get(c);
- extent_for_each_ptr(e, ptr)
- if (ptr->dev < mi->nr_in_set &&
- mi->m[ptr->dev].tier)
- replicas++;
- cache_member_info_put();
-
- return replicas < c->opts.data_replicas;
- }
-
- return false;
-}
-
-struct tiering_refill {
- struct bpos start;
- struct cache *ca;
- int cache_iter;
- u64 sectors;
-};
-
-static void refill_done(struct tiering_refill *refill)
-{
- if (refill->ca) {
- percpu_ref_put(&refill->ca->ref);
- refill->ca = NULL;
- }
-}
-
-/**
- * refill_next - move on to refilling the next cache's tiering keylist
- */
-static void refill_next(struct cache_set *c, struct tiering_refill *refill)
-{
- struct cache_group *tier;
-
- refill_done(refill);
-
- rcu_read_lock();
- tier = &c->cache_tiers[1];
- if (tier->nr_devices == 0)
- goto out;
-
- while (1) {
- while (refill->cache_iter < tier->nr_devices) {
- refill->ca = rcu_dereference(
- tier->devices[refill->cache_iter]);
- if (refill->ca != NULL) {
- percpu_ref_get(&refill->ca->ref);
- goto out;
- }
- refill->cache_iter++;
- }
-
- /* Reached the end, wrap around */
- refill->cache_iter = 0;
- }
-
-out:
- rcu_read_unlock();
-}
-
-/*
- * refill_init - Start refilling a random cache device -- this ensures we
- * distribute data sanely even if each tiering pass discovers only a few
- * keys to tier
- */
-static void refill_init(struct cache_set *c, struct tiering_refill *refill)
-{
- struct cache_group *tier;
-
- memset(refill, 0, sizeof(*refill));
- refill->start = POS_MIN;
-
- rcu_read_lock();
- tier = &c->cache_tiers[1];
- if (tier->nr_devices != 0)
- refill->cache_iter = bch_rand_range(tier->nr_devices);
- rcu_read_unlock();
-
- refill_next(c, refill);
-}
-
-/**
- * tiering_keylist_full - we accumulate tiering_stripe_size sectors in a cache
- * device's tiering keylist before we move on to the next cache device
- */
-static bool tiering_keylist_full(struct tiering_refill *refill)
-{
- return (refill->sectors >= refill->ca->tiering_stripe_size);
-}
-
-/**
- * tiering_keylist_empty - to prevent a keylist from growing to more than twice
- * the tiering stripe size, we stop refill when a keylist has more than a single
- * stripe of sectors
- */
-static bool tiering_keylist_empty(struct cache *ca)
-{
- return (bch_scan_keylist_sectors(&ca->tiering_queue.keys)
- <= ca->tiering_stripe_size);
-}
-
-/**
- * tiering_refill - to keep all queues busy as much as possible, we add
- * up to a single stripe of sectors to each cache device's queue, iterating
- * over all cache devices twice, so each one has two stripe's of writes
- * queued up, before we have to wait for move operations to complete.
- */
-static void tiering_refill(struct cache_set *c, struct tiering_refill *refill)
-{
- struct scan_keylist *keys;
- struct btree_iter iter;
- struct bkey_s_c k;
-
- if (bkey_cmp(refill->start, POS_MAX) >= 0)
- return;
-
- if (refill->ca == NULL)
- return;
-
- if (!tiering_keylist_empty(refill->ca))
- return;
-
- trace_bcache_tiering_refill_start(c);
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, refill->start, k) {
- keys = &refill->ca->tiering_queue.keys;
-
- if (!tiering_pred(keys, k)) {
- refill->start = k.k->p;
- goto next;
- }
-
- /* Growing the keylist might fail */
- if (bch_scan_keylist_add(keys, k))
- goto done;
-
- /* TODO: split key if refill->sectors is now > stripe_size */
- refill->sectors += k.k->size;
- refill->start = k.k->p;
-
- /* Check if we've added enough keys to this keylist */
- if (tiering_keylist_full(refill)) {
- /* Move on to refill the next cache device's keylist */
- refill->sectors = 0;
- refill->cache_iter++;
- refill_next(c, refill);
-
- /* All cache devices got removed somehow */
- if (refill->ca == NULL)
- goto done;
-
- /*
- * If the next cache's keylist is not sufficiently
- * empty, wait for it to drain before refilling
- * anything. We prioritize even distribution of data
- * over maximizing write bandwidth.
- */
- if (!tiering_keylist_empty(refill->ca))
- goto done;
- }
-next:
- bch_btree_iter_cond_resched(&iter);
- }
- /* Reached the end of the keyspace */
- refill->start = POS_MAX;
-done:
- bch_btree_iter_unlock(&iter);
-
- trace_bcache_tiering_refill_end(c);
-}
-
-static int issue_tiering_move(struct moving_queue *q,
- struct moving_context *ctxt,
- struct bkey_s_c k)
-{
- struct cache *ca = container_of(q, struct cache, tiering_queue);
- struct cache_set *c = ca->set;
- struct moving_io *io;
-
- io = moving_io_alloc(k);
- if (!io) {
- trace_bcache_tiering_alloc_fail(c, k.k->size);
- return -ENOMEM;
- }
-
- bch_replace_init(&io->replace, bkey_i_to_s_c(&io->key));
-
- bch_write_op_init(&io->op, c, &io->wbio,
- (struct disk_reservation) { 0 },
- &ca->tiering_write_point,
- bkey_i_to_s_c(&io->key),
- &io->replace.hook, NULL, 0);
- io->op.io_wq = q->wq;
- io->op.nr_replicas = 1;
-
- trace_bcache_tiering_copy(k.k);
-
- /*
- * IMPORTANT: We must call bch_data_move before we dequeue so
- * that the key can always be found in either the pending list
- * in the moving queue or in the scan keylist list in the
- * moving queue.
- * If we reorder, there is a window where a key is not found
- * by btree gc marking.
- */
- bch_data_move(q, ctxt, io);
- bch_scan_keylist_dequeue(&q->keys);
- return 0;
-}
-
-/**
- * tiering_next_cache - issue a move to write an extent to the next cache
- * device in round robin order
- */
-static int tiering_next_cache(struct cache_set *c,
- int *cache_iter,
- struct moving_context *ctxt,
- struct tiering_refill *refill)
-{
- struct cache_group *tier;
- int start = *cache_iter;
- struct cache *ca;
-
- /* If true at the end of the loop, all keylists were empty, so we
- * have reached the end of the keyspace */
- bool done = true;
- /* If true at the end of the loop, all queues were full, so we must
- * wait for some ops to finish */
- bool full = true;
-
- do {
- rcu_read_lock();
- tier = &c->cache_tiers[1];
- if (tier->nr_devices == 0) {
- rcu_read_unlock();
- return 0;
- }
-
- if (*cache_iter >= tier->nr_devices) {
- rcu_read_unlock();
- *cache_iter = 0;
- continue;
- }
-
- ca = rcu_dereference(tier->devices[*cache_iter]);
- if (ca == NULL ||
- ca->mi.state != CACHE_ACTIVE ||
- ca->tiering_queue.stopped) {
- rcu_read_unlock();
- (*cache_iter)++;
- continue;
- }
-
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
- (*cache_iter)++;
-
- tiering_refill(c, refill);
-
- if (bch_queue_full(&ca->tiering_queue)) {
- done = false;
- } else {
- struct bkey_i *k =
- bch_scan_keylist_next(&ca->tiering_queue.keys);
- if (k) {
- issue_tiering_move(&ca->tiering_queue, ctxt,
- bkey_i_to_s_c(k));
- done = false;
- full = false;
- }
- }
-
- percpu_ref_put(&ca->ref);
- } while (*cache_iter != start);
-
- if (done) {
- /*
- * All devices have an empty keylist now, just wait for
- * pending moves to finish and we're done.
- */
- return 0;
- } else if (full) {
- /*
- * No device with keys still remaining on its keylist has a
- * queue that is not full. In this case, we have to wait for
- * at least one read to complete before trying again.
- * Otherwise, we could issue a read for this device.
- */
- return -EAGAIN;
- } else {
- /* Try again immediately */
- return -EIOCBQUEUED;
- }
-}
-
-static u64 read_tiering(struct cache_set *c)
-{
- struct moving_context ctxt;
- struct tiering_refill refill;
- int cache_iter = 0;
- int ret;
-
- trace_bcache_tiering_start(c);
-
- refill_init(c, &refill);
-
- bch_moving_context_init(&ctxt, &c->tiering_pd.rate,
- MOVING_PURPOSE_TIERING);
-
- while (!bch_moving_context_wait(&ctxt)) {
- cond_resched();
-
- ret = tiering_next_cache(c, &cache_iter, &ctxt, &refill);
- if (ret == -EAGAIN)
- bch_moving_wait(&ctxt);
- else if (!ret)
- break;
- }
-
- closure_sync(&ctxt.cl);
- refill_done(&refill);
-
- trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
-
- return ctxt.sectors_moved;
-}
-
-static int bch_tiering_thread(void *arg)
-{
- struct cache_set *c = arg;
- struct io_clock *clock = &c->io_clock[WRITE];
- struct cache *ca;
- u64 sectors, tier_capacity;
- unsigned long last;
- unsigned i;
-
- set_freezable();
-
- while (!kthread_should_stop()) {
- if (kthread_wait_freezable(c->tiering_enabled &&
- c->cache_tiers[1].nr_devices))
- break;
-
- last = atomic_long_read(&clock->now);
-
- sectors = read_tiering(c);
-
- tier_capacity = 0;
- rcu_read_lock();
- group_for_each_cache_rcu(ca, &c->cache_tiers[0], i)
- tier_capacity +=
- (ca->mi.nbuckets -
- ca->mi.first_bucket) << ca->bucket_bits;
- rcu_read_unlock();
-
- if (sectors < tier_capacity >> 4)
- bch_kthread_io_clock_wait(clock,
- last + (tier_capacity >> 5));
- }
-
- return 0;
-}
-
-#define TIERING_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE
-#define TIERING_NR 64
-#define TIERING_READ_NR 8
-#define TIERING_WRITE_NR 32
-
-void bch_tiering_init_cache_set(struct cache_set *c)
-{
- bch_pd_controller_init(&c->tiering_pd);
-}
-
-int bch_tiering_init_cache(struct cache *ca)
-{
- ca->tiering_stripe_size = ca->mi.bucket_size * 2;
-
- return bch_queue_init(&ca->tiering_queue,
- ca->set,
- TIERING_KEYS_MAX_SIZE,
- TIERING_NR,
- TIERING_READ_NR,
- TIERING_WRITE_NR,
- false,
- "bch_tier_write");
-}
-
-void bch_tiering_write_start(struct cache *ca)
-{
- bch_queue_start(&ca->tiering_queue);
-}
-
-int bch_tiering_read_start(struct cache_set *c)
-{
- struct task_struct *t;
-
- t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
- if (IS_ERR(t))
- return PTR_ERR(t);
-
- c->tiering_read = t;
- wake_up_process(c->tiering_read);
-
- return 0;
-}
-
-void bch_tiering_write_destroy(struct cache *ca)
-{
- bch_queue_destroy(&ca->tiering_queue);
-}
-
-void bch_tiering_write_stop(struct cache *ca)
-{
- bch_queue_stop(&ca->tiering_queue);
-
- /*
- * Make sure that it is empty so that gc marking doesn't keep
- * marking stale entries from when last used.
- */
- bch_scan_keylist_reset(&ca->tiering_queue.keys);
-}
-
-void bch_tiering_read_stop(struct cache_set *c)
-{
- if (!IS_ERR_OR_NULL(c->tiering_read)) {
- kthread_stop(c->tiering_read);
- c->tiering_read = NULL;
- }
-}
diff --git a/fs/bcachefs/tier.h b/fs/bcachefs/tier.h
deleted file mode 100644
index 57b4acf86fb5..000000000000
--- a/fs/bcachefs/tier.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _BCACHE_TIER_H
-#define _BCACHE_TIER_H
-
-void bch_tiering_init_cache_set(struct cache_set *);
-int bch_tiering_init_cache(struct cache *);
-int bch_tiering_read_start(struct cache_set *);
-void bch_tiering_write_start(struct cache *);
-void bch_tiering_write_destroy(struct cache *);
-void bch_tiering_write_stop(struct cache *);
-void bch_tiering_read_stop(struct cache_set *);
-
-#endif