Diffstat (limited to 'libbcache/alloc.c')
-rw-r--r--  libbcache/alloc.c  189
1 file changed, 103 insertions(+), 86 deletions(-)
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
index 8cb31944..93f0c2f1 100644
--- a/libbcache/alloc.c
+++ b/libbcache/alloc.c
@@ -73,7 +73,6 @@
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>
-static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
static void __bch_bucket_free(struct cache *, struct bucket *);
/* Allocation groups: */
@@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca)
spin_lock(&grp->lock);
- for (i = 0; i < grp->nr_devices; i++)
+ for (i = 0; i < grp->nr; i++)
if (rcu_access_pointer(grp->d[i].dev) == ca) {
- grp->nr_devices--;
+ grp->nr--;
memmove(&grp->d[i],
&grp->d[i + 1],
- (grp->nr_devices - i) * sizeof(grp->d[0]));
+ (grp->nr - i) * sizeof(grp->d[0]));
break;
}
@@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca)
unsigned i;
spin_lock(&grp->lock);
- for (i = 0; i < grp->nr_devices; i++)
+ for (i = 0; i < grp->nr; i++)
if (rcu_access_pointer(grp->d[i].dev) == ca)
goto out;
- BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX);
+ BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
- rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+ rcu_assign_pointer(grp->d[grp->nr++].dev, ca);
out:
spin_unlock(&grp->lock);
}
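
For reference, the group-remove path above deletes a device from the fixed-size array by shifting the tail down with memmove() after decrementing the count. A minimal user-space sketch of that pattern (hypothetical type and field names, no locking or RCU):

#include <string.h>
#include <stddef.h>

struct dev_entry { void *dev; };

struct dev_group {
	size_t nr;			/* was nr_devices before this patch */
	struct dev_entry d[64];
};

static void group_remove(struct dev_group *grp, void *dev)
{
	for (size_t i = 0; i < grp->nr; i++)
		if (grp->d[i].dev == dev) {
			grp->nr--;
			/* (grp->nr - i) entries remain after slot i; shift them down */
			memmove(&grp->d[i], &grp->d[i + 1],
				(grp->nr - i) * sizeof(grp->d[0]));
			break;
		}
}
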
@@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work)
struct cache_set,
pd_controllers_update);
struct cache *ca;
- unsigned iter;
- int i;
+ unsigned i, iter;
/* All units are in bytes */
- u64 tier_size[BCH_TIER_MAX];
- u64 tier_free[BCH_TIER_MAX];
- u64 tier_dirty[BCH_TIER_MAX];
- u64 tier0_can_free = 0;
+ u64 faster_tiers_size = 0;
+ u64 faster_tiers_dirty = 0;
- memset(tier_size, 0, sizeof(tier_size));
- memset(tier_free, 0, sizeof(tier_free));
- memset(tier_dirty, 0, sizeof(tier_dirty));
+ u64 fastest_tier_size = 0;
+ u64 fastest_tier_free = 0;
+ u64 copygc_can_free = 0;
rcu_read_lock();
- for (i = BCH_TIER_MAX - 1; i >= 0; --i)
- group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+ bch_pd_controller_update(&c->tiers[i].pd,
+ div_u64(faster_tiers_size *
+ c->tiering_percent, 100),
+ faster_tiers_dirty,
+ -1);
+
+ group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) {
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
+ u64 size = (ca->mi.nbuckets -
+ ca->mi.first_bucket) << bucket_bits;
+ u64 dirty = stats.buckets_dirty << bucket_bits;
+ u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
/*
* Bytes of internal fragmentation, which can be
* reclaimed by copy GC
@@ -149,41 +155,30 @@ static void pd_controllers_update(struct work_struct *work)
((stats.sectors_dirty +
stats.sectors_cached) << 9);
- u64 dev_size = (ca->mi.nbuckets -
- ca->mi.first_bucket) << bucket_bits;
-
- u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
-
if (fragmented < 0)
fragmented = 0;
bch_pd_controller_update(&ca->moving_gc_pd,
free, fragmented, -1);
- if (i == 0)
- tier0_can_free += fragmented;
-
- tier_size[i] += dev_size;
- tier_free[i] += free;
- tier_dirty[i] += stats.buckets_dirty << bucket_bits;
- }
- rcu_read_unlock();
-
- if (tier_size[1]) {
- u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
+ faster_tiers_size += size;
+ faster_tiers_dirty += dirty;
- tier0_can_free = max_t(s64, 0, tier_dirty[0] - target);
+ if (!c->fastest_tier ||
+ c->fastest_tier == &c->tiers[i]) {
+ fastest_tier_size += size;
+ fastest_tier_free += free;
+ }
- bch_pd_controller_update(&c->tiering_pd,
- target,
- tier_dirty[0],
- -1);
+ copygc_can_free += fragmented;
+ }
}
+ rcu_read_unlock();
+
/*
* Throttle foreground writes if tier 0 is running out of free buckets,
- * and either tiering or copygc can free up space (but don't take both
- * into account).
+ * and either tiering or copygc can free up space.
*
* Target will be small if there isn't any work to do - we don't want to
* throttle foreground writes if we currently have all the free space
@@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work)
* Otherwise, if there's work to do, try to keep 20% of tier0 available
* for foreground writes.
*/
+ if (c->fastest_tier)
+ copygc_can_free = U64_MAX;
+
bch_pd_controller_update(&c->foreground_write_pd,
- min(tier0_can_free,
- div_u64(tier_size[0] *
+ min(copygc_can_free,
+ div_u64(fastest_tier_size *
c->foreground_target_percent,
100)),
- tier_free[0],
+ fastest_tier_free,
-1);
schedule_delayed_work(&c->pd_controllers_update,
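
The throttle input fed to the foreground-write controller above is the smaller of what copygc can reclaim and a percentage of the fastest tier's size; when a faster tier exists, the copygc cap is dropped entirely because tiering can also free space. A standalone sketch of that arithmetic (plain C stand-ins for min()/div_u64(); names are hypothetical):

#include <stdint.h>

static uint64_t foreground_throttle_target(uint64_t fastest_tier_size,
					   uint64_t copygc_can_free,
					   unsigned foreground_target_percent,
					   int have_multiple_tiers)
{
	/* With a faster tier in front, tiering (not just copygc) can free
	 * space, so the patch stops capping by what copygc alone reclaims: */
	if (have_multiple_tiers)
		copygc_can_free = UINT64_MAX;

	uint64_t target = fastest_tier_size * foreground_target_percent / 100;

	return copygc_can_free < target ? copygc_can_free : target;
}
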
@@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca)
* it getting gc'd from under us
*/
ca->prio_buckets[i] = r;
- bch_mark_metadata_bucket(ca, ca->buckets + r, false);
+ bch_mark_metadata_bucket(ca, ca->buckets + r,
+ BUCKET_PRIOS, false);
spin_unlock(&ca->prio_buckets_lock);
SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
@@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca)
do {
unsigned u64s = jset_u64s(0);
+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
+ break;
+
ret = bch_journal_res_get(j, &res, u64s, u64s);
if (ret)
return ret;
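
The new test_bit() check simply skips the journal update during early startup, before the journal is accepting reservations. A minimal model of that guard (hypothetical names):

#include <stdbool.h>

struct journal_state { bool started; };

static int journal_prio_update(struct journal_state *j)
{
	if (!j->started)
		return 0;	/* nothing to reserve against yet */

	/* ... take a journal reservation and emit the prio-bucket entry ... */
	return 0;
}
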
@@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca)
if (is_available_bucket(m) &&
!m.cached_sectors &&
!m.had_metadata &&
- (!m.wait_on_journal ||
- ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) {
+ !bucket_needs_journal_commit(m, last_seq_ondisk)) {
spin_lock(&ca->freelist_lock);
bch_mark_alloc_bucket(ca, g, true);
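
The open-coded sequence-number test is folded into a helper here. A standalone sketch of the condition bucket_needs_journal_commit() presumably encapsulates, derived from the inline test it replaces (hypothetical field types):

#include <stdint.h>
#include <stdbool.h>

struct bucket_mark_model {
	bool     wait_on_journal;
	uint16_t journal_seq;
};

static bool needs_journal_commit(struct bucket_mark_model m,
				 uint16_t last_seq_ondisk)
{
	/* Inverse of the old "reusable" test: the bucket still needs a
	 * journal commit if it is waiting on the journal and its sequence
	 * number has not yet reached disk; the s16 casts keep the comparison
	 * meaningful across sequence-number wraparound. */
	return m.wait_on_journal &&
		(int16_t)last_seq_ondisk - (int16_t)m.journal_seq < 0;
}
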
@@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg)
set_freezable();
+ bch_find_empty_buckets(c, ca);
+
while (1) {
/*
* First, we pull buckets off of the free_inc list, possibly
@@ -894,7 +897,7 @@ static int bch_allocator_thread(void *arg)
* See if we have buckets we can reuse without invalidating them
* or forcing a journal commit:
*/
- bch_find_empty_buckets(c, ca);
+ //bch_find_empty_buckets(c, ca);
if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
up_read(&c->gc_lock);
@@ -967,7 +970,7 @@ out:
*
* Returns index of bucket on success, 0 on failure
* */
-static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
+size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
{
struct bucket *g;
long r;
@@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c,
u64 available_buckets = 1; /* avoid a divide by zero... */
unsigned i;
- for (i = 0; i < devs->nr_devices; i++) {
+ for (i = 0; i < devs->nr; i++) {
ca = devs->d[i].dev;
devs->d[i].weight = buckets_free_cache(ca);
available_buckets += devs->d[i].weight;
}
- for (i = 0; i < devs->nr_devices; i++) {
+ for (i = 0; i < devs->nr; i++) {
const unsigned min_weight = U32_MAX >> 4;
const unsigned max_weight = U32_MAX;
devs->d[i].weight =
min_weight +
div64_u64(devs->d[i].weight *
- devs->nr_devices *
+ devs->nr *
(max_weight - min_weight),
available_buckets);
devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
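
The weight calculation above scales each device's share of free buckets into a fixed [min_weight, max_weight] range, with available_buckets seeded at 1 to avoid dividing by zero. A standalone sketch of the same arithmetic (hypothetical types; div64_u64() becomes plain 64-bit division, intermediate overflow ignored):

#include <stdint.h>
#include <stddef.h>

struct dev_weight { uint64_t free_buckets; uint64_t weight; };

static void recalc_weights(struct dev_weight *d, size_t nr)
{
	const uint64_t min_weight = UINT32_MAX >> 4;
	const uint64_t max_weight = UINT32_MAX;
	uint64_t available = 1;		/* avoid a divide by zero */

	for (size_t i = 0; i < nr; i++)
		available += d[i].free_buckets;

	for (size_t i = 0; i < nr; i++) {
		uint64_t w = min_weight +
			d[i].free_buckets * nr * (max_weight - min_weight) /
			available;

		d[i].weight = w < max_weight ? w : max_weight;
	}
}
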
@@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
rcu_read_lock();
spin_lock(&devs->lock);
- for (i = 0; i < devs->nr_devices; i++)
+ for (i = 0; i < devs->nr; i++)
available += !test_bit(devs->d[i].dev->dev_idx,
caches_used);
@@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
}
i++;
- i %= devs->nr_devices;
+ i %= devs->nr;
ret = FREELIST_EMPTY;
if (i == fail_idx)
@@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
enum alloc_reserve reserve,
long *caches_used)
{
+ struct bch_tier *tier;
/*
* this should implement policy - for a given type of allocation, decide
* which devices to allocate from:
*
* XXX: switch off wp->type and do something more intelligent here
*/
+ if (wp->group)
+ return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+ wp->group, caches_used);
- /* foreground writes: prefer tier 0: */
- if (wp->group == &c->cache_all)
+ /* foreground writes: prefer fastest tier: */
+ tier = READ_ONCE(c->fastest_tier);
+ if (tier)
bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- &c->cache_tiers[0], caches_used);
+ &tier->devs, caches_used);
return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- wp->group, caches_used);
+ &c->cache_all, caches_used);
}
static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
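
The policy after this patch is: an explicit group on the write point wins; otherwise foreground writes try the fastest tier first and fall back to the full device set. A compact user-space sketch of that control flow (stand-in types, stubbed allocator):

enum alloc_ret { ALLOC_OK, FREELIST_EMPTY };

struct group;					/* opaque stand-in for struct cache_group */

struct write_point_model { struct group *group; };
struct fs_model {
	struct group *fastest_tier;		/* NULL when only one tier is populated */
	struct group *cache_all;
};

/* stub standing in for bch_bucket_alloc_group() */
static enum alloc_ret alloc_from(struct group *g)
{
	(void) g;
	return FREELIST_EMPTY;
}

static enum alloc_ret alloc_set(struct fs_model *c, struct write_point_model *wp)
{
	/* an explicit group on the write point short-circuits the policy */
	if (wp->group)
		return alloc_from(wp->group);

	/* foreground writes: try the fastest tier first, but ignore its
	 * result so any missing replicas come from the full device set */
	if (c->fastest_tier)
		alloc_from(c->fastest_tier);

	return alloc_from(c->cache_all);
}
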
@@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
? 0 : BTREE_NODE_RESERVE;
int ret;
- BUG_ON(!wp->group);
BUG_ON(!reserve);
BUG_ON(!nr_replicas);
retry:
@@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
unsigned nr_replicas, struct open_bucket *ob,
unsigned sectors)
{
- struct bch_extent_ptr tmp, *ptr;
+ struct bch_extent_ptr tmp;
struct cache *ca;
bool has_data = false;
unsigned i;
@@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
if (nr_replicas < ob->nr_ptrs)
has_data = true;
+ rcu_read_lock();
+
for (i = 0; i < nr_replicas; i++) {
EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
@@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
extent_ptr_append(e, tmp);
ob->ptr_offset[i] += sectors;
+
+ if ((ca = PTR_CACHE(c, &ob->ptrs[i])))
+ this_cpu_add(*ca->sectors_written, sectors);
}
- open_bucket_for_each_online_device(c, ob, ptr, ca)
- this_cpu_add(*ca->sectors_written, sectors);
+ rcu_read_unlock();
}
/*
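
The accounting change above folds the sectors_written update into the per-replica loop, so each appended pointer bumps its own device's counter while the device lookup stays inside the read-side critical section. A user-space model of the resulting loop (hypothetical names; PTR_CACHE() and this_cpu_add() replaced with plain stand-ins):

#include <stdint.h>
#include <stddef.h>

struct dev_model { uint64_t sectors_written; };

struct ptr_model { unsigned dev_idx; };

/* stand-in for PTR_CACHE(): may return NULL if the device went away */
static struct dev_model *ptr_to_dev(struct dev_model **devs, struct ptr_model *p)
{
	return devs[p->dev_idx];
}

static void append_ptrs(struct dev_model **devs, struct ptr_model *ptrs,
			size_t nr_replicas, unsigned sectors)
{
	for (size_t i = 0; i < nr_replicas; i++) {
		struct dev_model *d = ptr_to_dev(devs, &ptrs[i]);

		/* ... append the pointer to the extent here ... */

		if (d)
			d->sectors_written += sectors;	/* this_cpu_add() in-kernel */
	}
}
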
@@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c,
/* Startup/shutdown (ro/rw): */
-static void bch_recalc_capacity(struct cache_set *c)
+void bch_recalc_capacity(struct cache_set *c)
{
- struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers);
+ struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
struct cache *ca;
u64 total_capacity, capacity = 0, reserved_sectors = 0;
unsigned long ra_pages = 0;
@@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c)
c->bdi.ra_pages = ra_pages;
+ /* Find fastest, slowest tiers with devices: */
+
+ for (tier = c->tiers;
+ tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+ if (!tier->devs.nr)
+ continue;
+ if (!fastest_tier)
+ fastest_tier = tier;
+ slowest_tier = tier;
+ }
+
+ c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
+
+ c->promote_write_point.group = &fastest_tier->devs;
+
+ if (!fastest_tier)
+ goto set_capacity;
+
/*
* Capacity of the cache set is the capacity of all the devices in the
* slowest (highest) tier - we don't include lower tier devices.
*/
- for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1;
- tier > c->cache_tiers && !tier->nr_devices;
- --tier)
- ;
-
- group_for_each_cache_rcu(ca, tier, i) {
+ group_for_each_cache_rcu(ca, &slowest_tier->devs, i) {
size_t reserve = 0;
/*
@@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c)
ca->mi.first_bucket) <<
ca->bucket_bits;
}
+set_capacity:
rcu_read_unlock();
-
total_capacity = capacity;
capacity *= (100 - c->opts.gc_reserve_percent);
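
bch_recalc_capacity() now finds the fastest and slowest populated tiers in one pass, records c->fastest_tier only when they differ, and bases usable capacity on the slowest tier minus the GC reserve. A standalone sketch of that selection and reserve math (hypothetical types):

#include <stdint.h>
#include <stddef.h>

struct tier_model { size_t nr_devs; uint64_t capacity; };

static uint64_t recalc_capacity(struct tier_model *tiers, size_t nr_tiers,
				struct tier_model **fastest_out,
				unsigned gc_reserve_percent)
{
	struct tier_model *fastest = NULL, *slowest = NULL;

	for (size_t i = 0; i < nr_tiers; i++) {
		if (!tiers[i].nr_devs)
			continue;
		if (!fastest)
			fastest = &tiers[i];
		slowest = &tiers[i];
	}

	/* only meaningful when more than one tier is populated */
	*fastest_out = fastest != slowest ? fastest : NULL;

	if (!slowest)
		return 0;

	/* capacity counts the slowest tier only, minus the GC reserve */
	return slowest->capacity * (100 - gc_reserve_percent) / 100;
}
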
@@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca)
void bch_dev_allocator_stop(struct cache *ca)
{
struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
struct task_struct *p;
struct closure cl;
unsigned i;
@@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca)
int bch_dev_allocator_start(struct cache *ca)
{
struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
struct task_struct *k;
/*
@@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca)
bch_dev_group_add(tier, ca);
bch_dev_group_add(&c->cache_all, ca);
+ bch_dev_group_add(&c->journal.devs, ca);
bch_recalc_capacity(c);
@@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca)
return 0;
}
-void bch_open_buckets_init(struct cache_set *c)
+void bch_fs_allocator_init(struct cache_set *c)
{
unsigned i;
@@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c)
spin_lock_init(&c->cache_all.lock);
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
- c->write_points[i].throttle = true;
- c->write_points[i].group = &c->cache_tiers[0];
- }
-
- for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
- spin_lock_init(&c->cache_tiers[i].lock);
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
+ spin_lock_init(&c->tiers[i].devs.lock);
- c->promote_write_point.group = &c->cache_tiers[0];
-
- c->migration_write_point.group = &c->cache_all;
-
- c->btree_write_point.group = &c->cache_all;
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ c->write_points[i].throttle = true;
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);