author    Kent Overstreet <kent.overstreet@gmail.com>  2016-09-23 16:08:31 -0800
committer Kent Overstreet <kent.overstreet@gmail.com>  2016-09-23 16:20:53 -0800
commit    5a6a7df52aa32889b19d6e64c3ccd8d4ae4b148c (patch)
tree      aa6a35570e9be05d94f669fa090f389bdc3b856e
parent    3ef3f7a98d06152d17423a5e679d06fdb970df6e (diff)
bcache: fix a lost wakeup
The way bch_next_cache() was using buckets_free_cache() was racy: device selection sampled per-device free-bucket counts under a seqcount and retried whenever the counters changed. Protect struct cache_group with a spinlock instead, pick the next device with a weighted random rotation, and drop the now-unused reserve argument from buckets_free_cache().
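
For reference, a minimal userspace sketch of the selection scheme the new bch_next_cache() uses: free-bucket counts are scaled into a fixed acceptance range, and devices are tried in rotating order, each kept with probability proportional to its weight. The struct layout, the rand_u32() helper and the demo in main() are illustrative assumptions, not bcache code.

/*
 * Sketch only -- not the kernel implementation.  Mirrors the weighting and
 * rotation logic added to bch_next_cache() in this patch.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct dev {
	uint64_t free_buckets;	/* input: free buckets on this device */
	uint64_t weight;	/* derived acceptance threshold */
};

/* Portable stand-in for the kernel's get_random_int(). */
static uint32_t rand_u32(void)
{
	return ((uint32_t)(rand() & 0xffff) << 16) | (uint32_t)(rand() & 0xffff);
}

/* Pick a device index, biased toward devices with more free buckets. */
static unsigned pick_device(struct dev *d, unsigned nr, unsigned *cur)
{
	const uint64_t min_w = (uint64_t)UINT32_MAX >> 4, max_w = UINT32_MAX;
	uint64_t total = 0;
	unsigned i;

	for (i = 0; i < nr; i++)
		total += d[i].free_buckets;

	/* Scale each weight into [min_w, max_w], as the patch does. */
	for (i = 0; i < nr; i++) {
		uint64_t w = total
			? min_w + d[i].free_buckets * nr * (max_w - min_w) / total
			: min_w;
		d[i].weight = w > max_w ? max_w : w;
	}

	/* Rotate from the last pick; accept with probability weight / 2^32. */
	i = *cur;
	while (rand_u32() > d[i].weight)
		i = (i + 1) % nr;

	*cur = (i + 1) % nr;	/* next call starts at the following device */
	return i;
}

int main(void)
{
	struct dev d[3] = { { 100, 0 }, { 400, 0 }, { 500, 0 } };
	unsigned cur = 0, hits[3] = { 0, 0, 0 };

	for (int n = 0; n < 100000; n++)
		hits[pick_device(d, 3, &cur)]++;

	for (int i = 0; i < 3; i++)
		printf("dev %d: %u picks\n", i, hits[i]);
	return 0;
}

Because every weight is clamped to at least U32_MAX >> 4, the rotation always terminates, even when one device holds nearly all the free space.
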
-rw-r--r--  drivers/md/bcache/alloc.c        145
-rw-r--r--  drivers/md/bcache/alloc.h         40
-rw-r--r--  drivers/md/bcache/alloc_types.h   10
-rw-r--r--  drivers/md/bcache/bcache.h         6
-rw-r--r--  drivers/md/bcache/buckets.h       17
-rw-r--r--  drivers/md/bcache/journal.c        1
-rw-r--r--  drivers/md/bcache/super.c          4
-rw-r--r--  drivers/md/bcache/super.h         37
-rw-r--r--  drivers/md/bcache/sysfs.c          2
-rw-r--r--  drivers/md/bcache/tier.c           5
10 files changed, 128 insertions(+), 139 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 3a58d51ff645..1a6d5a8c7ed3 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,7 @@
#include <linux/blkdev.h>
#include <linux/kthread.h>
+#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>
@@ -80,27 +81,27 @@ static void bch_cache_group_remove_cache(struct cache_group *grp, struct cache *
{
unsigned i;
- write_seqcount_begin(&grp->lock);
+ spin_lock(&grp->lock);
for (i = 0; i < grp->nr_devices; i++)
- if (rcu_access_pointer(grp->devices[i]) == ca) {
+ if (rcu_access_pointer(grp->d[i].dev) == ca) {
grp->nr_devices--;
- memmove(&grp->devices[i],
- &grp->devices[i + 1],
- (grp->nr_devices - i) * sizeof(ca));
+ memmove(&grp->d[i],
+ &grp->d[i + 1],
+ (grp->nr_devices - i) * sizeof(grp->d[0]));
break;
}
- write_seqcount_end(&grp->lock);
+ spin_unlock(&grp->lock);
}
static void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
{
- write_seqcount_begin(&grp->lock);
+ spin_lock(&grp->lock);
BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
- rcu_assign_pointer(grp->devices[grp->nr_devices++], ca);
- write_seqcount_end(&grp->lock);
+ rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+ spin_unlock(&grp->lock);
}
/* Ratelimiting/PD controllers */
@@ -143,8 +144,7 @@ static void pd_controllers_update(struct work_struct *work)
u64 dev_size = (ca->mi.nbuckets -
ca->mi.first_bucket) << bucket_bits;
- u64 free = __buckets_free_cache(ca, stats,
- RESERVE_NONE) << bucket_bits;
+ u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
if (fragmented < 0)
fragmented = 0;
@@ -920,7 +920,6 @@ static void __bch_bucket_free(struct cache *ca, struct bucket *g)
enum bucket_alloc_ret {
ALLOC_SUCCESS,
CACHE_SET_FULL, /* -ENOSPC */
- BUCKETS_NOT_AVAILABLE, /* Device full */
FREELIST_EMPTY, /* Allocator thread not keeping up */
};
@@ -930,50 +929,63 @@ static struct cache *bch_next_cache(struct cache_set *c,
long *cache_used)
{
struct cache *ca;
- size_t bucket_count = 0, rand;
- unsigned i;
-
- /*
- * first ptr allocation will always go to the specified tier,
- * 2nd and greater can go to any. If one tier is significantly larger
- * it is likely to go that tier.
- */
+ unsigned i, weight;
+ u64 available_buckets = 0;
- group_for_each_cache_rcu(ca, devs, i) {
- if (test_bit(ca->sb.nr_this_dev, cache_used))
- continue;
-
- bucket_count += buckets_free_cache(ca, reserve);
- }
+ spin_lock(&devs->lock);
- if (!bucket_count)
- return ERR_PTR(-BUCKETS_NOT_AVAILABLE);
+ if (devs->nr_devices == 0)
+ goto err;
- /*
- * We create a weighted selection by using the number of free buckets
- * in each cache. You can think of this like lining up the caches
- * linearly so each as a given range, corresponding to the number of
- * free buckets in that cache, and then randomly picking a number
- * within that range.
- */
-
- rand = bch_rand_range(bucket_count);
-
- group_for_each_cache_rcu(ca, devs, i) {
+ if (devs->nr_devices == 1) {
+ ca = devs->d[0].dev;
if (test_bit(ca->sb.nr_this_dev, cache_used))
- continue;
+ goto err;
+ goto out;
+ }
- bucket_count -= buckets_free_cache(ca, reserve);
+ /* recalculate weightings: XXX don't do this on every call */
+ for (i = 0; i < devs->nr_devices; i++) {
+ ca = devs->d[i].dev;
- if (rand >= bucket_count)
- return ca;
+ devs->d[i].weight = buckets_free_cache(ca);
+ available_buckets += devs->d[i].weight;
}
- /*
- * If we fall off the end, it means we raced because of bucket counters
- * changing - return NULL so __bch_bucket_alloc_set() knows to retry
- */
+ for (i = 0; i < devs->nr_devices; i++) {
+ const unsigned min_weight = U32_MAX >> 4;
+ const unsigned max_weight = U32_MAX;
+
+ devs->d[i].weight =
+ min_weight +
+ div64_u64(devs->d[i].weight *
+ devs->nr_devices *
+ (max_weight - min_weight),
+ available_buckets);
+ devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
+ }
+ for (i = 0; i < devs->nr_devices; i++)
+ if (!test_bit(devs->d[i].dev->sb.nr_this_dev, cache_used))
+ goto available;
+
+ /* no unused devices: */
+ goto err;
+available:
+ i = devs->cur_device;
+ do {
+ weight = devs->d[i].weight;
+ ca = devs->d[i].dev;
+ i++;
+ i %= devs->nr_devices;
+ } while (test_bit(ca->sb.nr_this_dev, cache_used) ||
+ get_random_int() > weight);
+ devs->cur_device = i;
+out:
+ spin_unlock(&devs->lock);
+ return ca;
+err:
+ spin_unlock(&devs->lock);
return NULL;
}
@@ -999,38 +1011,23 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
/* sort by free space/prio of oldest data in caches */
while (ob->nr_ptrs < nr_replicas) {
+ struct cache_group *d;
struct cache *ca;
- unsigned seq;
size_t r;
/* first ptr goes to the specified tier, the rest to any */
- do {
- struct cache_group *d;
-
- seq = read_seqcount_begin(&devs->lock);
-
- d = (!ob->nr_ptrs && devs == &c->cache_all &&
- c->cache_tiers[0].nr_devices)
- ? &c->cache_tiers[0]
- : devs;
- ca = devs->nr_devices
- ? bch_next_cache(c, reserve, d, caches_used)
- : ERR_PTR(-CACHE_SET_FULL);
+ d = (!ob->nr_ptrs && devs == &c->cache_all &&
+ c->cache_tiers[0].nr_devices)
+ ? &c->cache_tiers[0]
+ : devs;
- /*
- * If ca == NULL, we raced because of bucket counters
- * changing
- */
- } while (read_seqcount_retry(&devs->lock, seq) || !ca);
-
- if (IS_ERR(ca)) {
- ret = -PTR_ERR(ca);
+ ca = bch_next_cache(c, reserve, d, caches_used);
+ if (!ca) {
+ ret = CACHE_SET_FULL;
goto err;
}
- __set_bit(ca->sb.nr_this_dev, caches_used);
-
r = bch_bucket_alloc(ca, reserve);
if (!r) {
ret = FREELIST_EMPTY;
@@ -1050,6 +1047,8 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
.offset = bucket_to_sector(ca, r),
.dev = ca->sb.nr_this_dev,
};
+
+ __set_bit(ca->sb.nr_this_dev, caches_used);
}
rcu_read_unlock();
@@ -1083,8 +1082,6 @@ static int bch_bucket_alloc_set(struct cache_set *c, struct open_bucket *ob,
closure_wake_up(&c->freelist_wait);
return -ENOSPC;
- case BUCKETS_NOT_AVAILABLE:
- trace_bcache_buckets_unavailable_fail(c, reserve, cl);
case FREELIST_EMPTY:
if (!cl)
return -ENOSPC;
@@ -1806,7 +1803,7 @@ void bch_open_buckets_init(struct cache_set *c)
list_add(&c->open_buckets[i].list, &c->open_buckets_free);
}
- seqcount_init(&c->cache_all.lock);
+ spin_lock_init(&c->cache_all.lock);
for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
c->write_points[i].throttle = true;
@@ -1815,7 +1812,7 @@ void bch_open_buckets_init(struct cache_set *c)
}
for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
- seqcount_init(&c->cache_tiers[i].lock);
+ spin_lock_init(&c->cache_tiers[i].lock);
c->promote_write_point.group = &c->cache_tiers[0];
c->promote_write_point.reserve = RESERVE_NONE;
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index 3da349c581b3..b02a307b0265 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -35,15 +35,43 @@ static inline void bch_wake_allocator(struct cache *ca)
if ((p = ACCESS_ONCE(ca->alloc_thread)))
wake_up_process(p);
rcu_read_unlock();
+}
+
+static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
+ unsigned *iter)
+{
+ struct cache *ret = NULL;
+
+ while (*iter < devs->nr_devices &&
+ !(ret = rcu_dereference(devs->d[*iter].dev)))
+ (*iter)++;
- /*
- * XXX: this is only needed because of ca->reserve_buckets_count, but is
- * reserve_buckets_count needed anymore? It predates modern
- * reservations.
- */
- closure_wake_up(&ca->set->freelist_wait);
+ return ret;
}
+#define group_for_each_cache_rcu(ca, devs, iter) \
+ for ((iter) = 0; \
+ ((ca) = cache_group_next_rcu((devs), &(iter))); \
+ (iter)++)
+
+static inline struct cache *cache_group_next(struct cache_group *devs,
+ unsigned *iter)
+{
+ struct cache *ret;
+
+ rcu_read_lock();
+ if ((ret = cache_group_next_rcu(devs, iter)))
+ percpu_ref_get(&ret->ref);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+#define group_for_each_cache(ca, devs, iter) \
+ for ((iter) = 0; \
+ (ca = cache_group_next(devs, &(iter))); \
+ percpu_ref_put(&ca->ref), (iter)++)
+
#define __open_bucket_next_online_device(_c, _ob, _ptr, _ca) \
({ \
(_ca) = NULL; \
diff --git a/drivers/md/bcache/alloc_types.h b/drivers/md/bcache/alloc_types.h
index 065b9c02f185..0bfafbf8c91f 100644
--- a/drivers/md/bcache/alloc_types.h
+++ b/drivers/md/bcache/alloc_types.h
@@ -49,6 +49,16 @@ static inline bool allocation_is_metadata(enum alloc_reserve id)
return id <= RESERVE_METADATA_LAST;
}
+struct cache_group {
+ spinlock_t lock;
+ unsigned nr_devices;
+ unsigned cur_device;
+ struct {
+ u64 weight;
+ struct cache __rcu *dev;
+ } d[MAX_CACHES_PER_SET];
+};
+
/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT 256
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 318616955570..aee5451a762d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -321,12 +321,6 @@ struct gc_pos {
unsigned level;
};
-struct cache_group {
- seqcount_t lock;
- unsigned nr_devices;
- struct cache __rcu *devices[MAX_CACHES_PER_SET];
-};
-
struct cache_member_cpu {
u64 nbuckets; /* device size */
u16 first_bucket; /* index of first bucket used */
diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h
index da077a3fe5a6..42fcd620cefe 100644
--- a/drivers/md/bcache/buckets.h
+++ b/drivers/md/bcache/buckets.h
@@ -171,23 +171,16 @@ static inline size_t buckets_available_cache(struct cache *ca)
}
static inline size_t __buckets_free_cache(struct cache *ca,
- struct bucket_stats_cache stats,
- enum alloc_reserve reserve)
+ struct bucket_stats_cache stats)
{
- size_t free = __buckets_available_cache(ca, stats) +
- fifo_used(&ca->free[reserve]) +
+ return __buckets_available_cache(ca, stats) +
+ fifo_used(&ca->free[RESERVE_NONE]) +
fifo_used(&ca->free_inc);
-
- if (reserve == RESERVE_NONE)
- free = max_t(ssize_t, 0, free - ca->reserve_buckets_count);
-
- return free;
}
-static inline size_t buckets_free_cache(struct cache *ca,
- enum alloc_reserve reserve)
+static inline size_t buckets_free_cache(struct cache *ca)
{
- return __buckets_free_cache(ca, bch_bucket_stats_read_cache(ca), reserve);
+ return __buckets_free_cache(ca, bch_bucket_stats_read_cache(ca));
}
/* Cache set stats: */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 3dd1c7503cf4..1a9f27dcdc0d 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -5,6 +5,7 @@
*/
#include "bcache.h"
+#include "alloc.h"
#include "bkey_methods.h"
#include "buckets.h"
#include "btree_gc.h"
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 66f06ec0ff1b..7b0b165efe7b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1903,9 +1903,9 @@ static const char *cache_alloc(struct bcache_superblock *sb,
kobject_init(&ca->kobj, &bch_cache_ktype);
- seqcount_init(&ca->self.lock);
+ spin_lock_init(&ca->self.lock);
ca->self.nr_devices = 1;
- rcu_assign_pointer(ca->self.devices[0], ca);
+ rcu_assign_pointer(ca->self.d[0].dev, ca);
ca->sb.nr_this_dev = sb->sb->nr_this_dev;
INIT_WORK(&ca->free_work, bch_cache_free_work);
diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h
index 517047938aa2..635e1a6f5cf3 100644
--- a/drivers/md/bcache/super.h
+++ b/drivers/md/bcache/super.h
@@ -59,41 +59,6 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
(ca = bch_get_next_cache(c, &(iter))); \
percpu_ref_put(&ca->ref), (iter)++)
-static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
- unsigned *iter)
-{
- struct cache *ret = NULL;
-
- while (*iter < devs->nr_devices &&
- !(ret = rcu_dereference(devs->devices[*iter])))
- (*iter)++;
-
- return ret;
-}
-
-#define group_for_each_cache_rcu(ca, devs, iter) \
- for ((iter) = 0; \
- ((ca) = cache_group_next_rcu((devs), &(iter))); \
- (iter)++)
-
-static inline struct cache *cache_group_next(struct cache_group *devs,
- unsigned *iter)
-{
- struct cache *ret;
-
- rcu_read_lock();
- if ((ret = cache_group_next_rcu(devs, iter)))
- percpu_ref_get(&ret->ref);
- rcu_read_unlock();
-
- return ret;
-}
-
-#define group_for_each_cache(ca, devs, iter) \
- for ((iter) = 0; \
- (ca = cache_group_next(devs, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
-
void bch_check_mark_super_slowpath(struct cache_set *,
const struct bkey_i *, bool);
@@ -151,7 +116,7 @@ static inline bool bch_cache_may_remove(struct cache *ca)
*/
return tier->nr_devices != 1 ||
- rcu_access_pointer(tier->devices[0]) != ca;
+ rcu_access_pointer(tier->d[0].dev) != ca;
}
void free_super(struct bcache_superblock *);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 52ebb718e376..60436b0ed3a1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -1205,7 +1205,7 @@ SHOW(bch_cache)
sysfs_print(meta_buckets, stats.buckets_meta);
sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, buckets_available_cache(ca));
- sysfs_print(free_buckets, buckets_free_cache(ca, RESERVE_NONE));
+ sysfs_print(free_buckets, buckets_free_cache(ca));
sysfs_print(has_data, ca->mi.has_data);
sysfs_print(has_metadata, ca->mi.has_metadata);
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index 524a304c367f..02ceb586239d 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -1,5 +1,6 @@
#include "bcache.h"
+#include "alloc.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
@@ -79,7 +80,7 @@ static void refill_next(struct cache_set *c, struct tiering_refill *refill)
while (1) {
while (refill->cache_iter < tier->nr_devices) {
refill->ca = rcu_dereference(
- tier->devices[refill->cache_iter]);
+ tier->d[refill->cache_iter].dev);
if (refill->ca != NULL) {
percpu_ref_get(&refill->ca->ref);
goto out;
@@ -274,7 +275,7 @@ static int tiering_next_cache(struct cache_set *c,
continue;
}
- ca = rcu_dereference(tier->devices[*cache_iter]);
+ ca = rcu_dereference(tier->d[*cache_iter].dev);
if (ca == NULL ||
ca->mi.state != CACHE_ACTIVE ||
ca->tiering_queue.stopped) {