author     Kent Overstreet <kent.overstreet@gmail.com>   2017-03-01 01:45:15 -0900
committer  Kent Overstreet <kent.overstreet@gmail.com>   2017-03-09 09:14:11 -0900
commit     a17f7bcec7ed810a247c24e56229af8f43a9a6ae (patch)
tree       1b2d60b21661bd2991324e3efaa83b3cdd87a783 /libbcache
parent     171ee48e57be78f4e95954c99851553fa523bf91 (diff)

cmd_migrate
Diffstat (limited to 'libbcache')
 libbcache/alloc.c         | 189
 libbcache/alloc.h         |   7
 libbcache/alloc_types.h   |   2
 libbcache/bcache.h        |  47
 libbcache/blockdev.c      |  12
 libbcache/btree_cache.c   |   7
 libbcache/btree_cache.h   |   6
 libbcache/btree_gc.c      | 105
 libbcache/buckets.c       | 103
 libbcache/buckets.h       |  15
 libbcache/buckets_types.h |  27
 libbcache/chardev.c       |   2
 libbcache/checksum.c      |   7
 libbcache/checksum.h      |   2
 libbcache/compress.c      |  14
 libbcache/compress.h      |   4
 libbcache/debug.c         |  10
 libbcache/debug.h         |   8
 libbcache/error.c         |   6
 libbcache/extents.c       |  43
 libbcache/fs-gc.c         |  10
 libbcache/fs.c            |  24
 libbcache/fs.h            |   8
 libbcache/io.c            |  25
 libbcache/journal.c       | 318
 libbcache/journal.h       |  17
 libbcache/movinggc.c      |  27
 libbcache/movinggc.h      |   4
 libbcache/opts.h          |  10
 libbcache/super-io.c      | 170
 libbcache/super-io.h      |  46
 libbcache/super.c         | 709
 libbcache/super.h         |  32
 libbcache/super_types.h   |   1
 libbcache/sysfs.c         |  60
 libbcache/tier.c          | 101
 libbcache/tier.h          |   6
37 files changed, 1264 insertions, 920 deletions
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
index 8cb31944..93f0c2f1 100644
--- a/libbcache/alloc.c
+++ b/libbcache/alloc.c
@@ -73,7 +73,6 @@
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>
-static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
static void __bch_bucket_free(struct cache *, struct bucket *);
/* Allocation groups: */
@@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca)
spin_lock(&grp->lock);
- for (i = 0; i < grp->nr_devices; i++)
+ for (i = 0; i < grp->nr; i++)
if (rcu_access_pointer(grp->d[i].dev) == ca) {
- grp->nr_devices--;
+ grp->nr--;
memmove(&grp->d[i],
&grp->d[i + 1],
- (grp->nr_devices - i) * sizeof(grp->d[0]));
+ (grp->nr - i) * sizeof(grp->d[0]));
break;
}
@@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca)
unsigned i;
spin_lock(&grp->lock);
- for (i = 0; i < grp->nr_devices; i++)
+ for (i = 0; i < grp->nr; i++)
if (rcu_access_pointer(grp->d[i].dev) == ca)
goto out;
- BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX);
+ BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
- rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+ rcu_assign_pointer(grp->d[grp->nr++].dev, ca);
out:
spin_unlock(&grp->lock);
}
@@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work)
struct cache_set,
pd_controllers_update);
struct cache *ca;
- unsigned iter;
- int i;
+ unsigned i, iter;
/* All units are in bytes */
- u64 tier_size[BCH_TIER_MAX];
- u64 tier_free[BCH_TIER_MAX];
- u64 tier_dirty[BCH_TIER_MAX];
- u64 tier0_can_free = 0;
+ u64 faster_tiers_size = 0;
+ u64 faster_tiers_dirty = 0;
- memset(tier_size, 0, sizeof(tier_size));
- memset(tier_free, 0, sizeof(tier_free));
- memset(tier_dirty, 0, sizeof(tier_dirty));
+ u64 fastest_tier_size = 0;
+ u64 fastest_tier_free = 0;
+ u64 copygc_can_free = 0;
rcu_read_lock();
- for (i = BCH_TIER_MAX - 1; i >= 0; --i)
- group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+ bch_pd_controller_update(&c->tiers[i].pd,
+ div_u64(faster_tiers_size *
+ c->tiering_percent, 100),
+ faster_tiers_dirty,
+ -1);
+
+ group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) {
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
+ u64 size = (ca->mi.nbuckets -
+ ca->mi.first_bucket) << bucket_bits;
+ u64 dirty = stats.buckets_dirty << bucket_bits;
+ u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
/*
* Bytes of internal fragmentation, which can be
* reclaimed by copy GC
@@ -149,41 +155,30 @@ static void pd_controllers_update(struct work_struct *work)
((stats.sectors_dirty +
stats.sectors_cached) << 9);
- u64 dev_size = (ca->mi.nbuckets -
- ca->mi.first_bucket) << bucket_bits;
-
- u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
-
if (fragmented < 0)
fragmented = 0;
bch_pd_controller_update(&ca->moving_gc_pd,
free, fragmented, -1);
- if (i == 0)
- tier0_can_free += fragmented;
-
- tier_size[i] += dev_size;
- tier_free[i] += free;
- tier_dirty[i] += stats.buckets_dirty << bucket_bits;
- }
- rcu_read_unlock();
-
- if (tier_size[1]) {
- u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
+ faster_tiers_size += size;
+ faster_tiers_dirty += dirty;
- tier0_can_free = max_t(s64, 0, tier_dirty[0] - target);
+ if (!c->fastest_tier ||
+ c->fastest_tier == &c->tiers[i]) {
+ fastest_tier_size += size;
+ fastest_tier_free += free;
+ }
- bch_pd_controller_update(&c->tiering_pd,
- target,
- tier_dirty[0],
- -1);
+ copygc_can_free += fragmented;
+ }
}
+ rcu_read_unlock();
+
/*
* Throttle foreground writes if tier 0 is running out of free buckets,
- * and either tiering or copygc can free up space (but don't take both
- * into account).
+ * and either tiering or copygc can free up space.
*
* Target will be small if there isn't any work to do - we don't want to
* throttle foreground writes if we currently have all the free space
@@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work)
* Otherwise, if there's work to do, try to keep 20% of tier0 available
* for foreground writes.
*/
+ if (c->fastest_tier)
+ copygc_can_free = U64_MAX;
+
bch_pd_controller_update(&c->foreground_write_pd,
- min(tier0_can_free,
- div_u64(tier_size[0] *
+ min(copygc_can_free,
+ div_u64(fastest_tier_size *
c->foreground_target_percent,
100)),
- tier_free[0],
+ fastest_tier_free,
-1);
schedule_delayed_work(&c->pd_controllers_update,
@@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca)
* it getting gc'd from under us
*/
ca->prio_buckets[i] = r;
- bch_mark_metadata_bucket(ca, ca->buckets + r, false);
+ bch_mark_metadata_bucket(ca, ca->buckets + r,
+ BUCKET_PRIOS, false);
spin_unlock(&ca->prio_buckets_lock);
SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
@@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca)
do {
unsigned u64s = jset_u64s(0);
+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
+ break;
+
ret = bch_journal_res_get(j, &res, u64s, u64s);
if (ret)
return ret;
@@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca)
if (is_available_bucket(m) &&
!m.cached_sectors &&
!m.had_metadata &&
- (!m.wait_on_journal ||
- ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) {
+ !bucket_needs_journal_commit(m, last_seq_ondisk)) {
spin_lock(&ca->freelist_lock);
bch_mark_alloc_bucket(ca, g, true);
@@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg)
set_freezable();
+ bch_find_empty_buckets(c, ca);
+
while (1) {
/*
* First, we pull buckets off of the free_inc list, possibly
@@ -894,7 +897,7 @@ static int bch_allocator_thread(void *arg)
* See if we have buckets we can reuse without invalidating them
* or forcing a journal commit:
*/
- bch_find_empty_buckets(c, ca);
+ //bch_find_empty_buckets(c, ca);
if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
up_read(&c->gc_lock);
@@ -967,7 +970,7 @@ out:
*
* Returns index of bucket on success, 0 on failure
* */
-static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
+size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
{
struct bucket *g;
long r;
@@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c,
u64 available_buckets = 1; /* avoid a divide by zero... */
unsigned i;
- for (i = 0; i < devs->nr_devices; i++) {
+ for (i = 0; i < devs->nr; i++) {
ca = devs->d[i].dev;
devs->d[i].weight = buckets_free_cache(ca);
available_buckets += devs->d[i].weight;
}
- for (i = 0; i < devs->nr_devices; i++) {
+ for (i = 0; i < devs->nr; i++) {
const unsigned min_weight = U32_MAX >> 4;
const unsigned max_weight = U32_MAX;
devs->d[i].weight =
min_weight +
div64_u64(devs->d[i].weight *
- devs->nr_devices *
+ devs->nr *
(max_weight - min_weight),
available_buckets);
devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
@@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
rcu_read_lock();
spin_lock(&devs->lock);
- for (i = 0; i < devs->nr_devices; i++)
+ for (i = 0; i < devs->nr; i++)
available += !test_bit(devs->d[i].dev->dev_idx,
caches_used);
@@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
}
i++;
- i %= devs->nr_devices;
+ i %= devs->nr;
ret = FREELIST_EMPTY;
if (i == fail_idx)
@@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
enum alloc_reserve reserve,
long *caches_used)
{
+ struct bch_tier *tier;
/*
* this should implement policy - for a given type of allocation, decide
* which devices to allocate from:
*
* XXX: switch off wp->type and do something more intelligent here
*/
+ if (wp->group)
+ return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+ wp->group, caches_used);
- /* foreground writes: prefer tier 0: */
- if (wp->group == &c->cache_all)
+ /* foreground writes: prefer fastest tier: */
+ tier = READ_ONCE(c->fastest_tier);
+ if (tier)
bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- &c->cache_tiers[0], caches_used);
+ &tier->devs, caches_used);
return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- wp->group, caches_used);
+ &c->cache_all, caches_used);
}
static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
@@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
? 0 : BTREE_NODE_RESERVE;
int ret;
- BUG_ON(!wp->group);
BUG_ON(!reserve);
BUG_ON(!nr_replicas);
retry:
@@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
unsigned nr_replicas, struct open_bucket *ob,
unsigned sectors)
{
- struct bch_extent_ptr tmp, *ptr;
+ struct bch_extent_ptr tmp;
struct cache *ca;
bool has_data = false;
unsigned i;
@@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
if (nr_replicas < ob->nr_ptrs)
has_data = true;
+ rcu_read_lock();
+
for (i = 0; i < nr_replicas; i++) {
EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
@@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
extent_ptr_append(e, tmp);
ob->ptr_offset[i] += sectors;
+
+ if ((ca = PTR_CACHE(c, &ob->ptrs[i])))
+ this_cpu_add(*ca->sectors_written, sectors);
}
- open_bucket_for_each_online_device(c, ob, ptr, ca)
- this_cpu_add(*ca->sectors_written, sectors);
+ rcu_read_unlock();
}
/*
@@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c,
/* Startup/shutdown (ro/rw): */
-static void bch_recalc_capacity(struct cache_set *c)
+void bch_recalc_capacity(struct cache_set *c)
{
- struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers);
+ struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
struct cache *ca;
u64 total_capacity, capacity = 0, reserved_sectors = 0;
unsigned long ra_pages = 0;
@@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c)
c->bdi.ra_pages = ra_pages;
+ /* Find fastest, slowest tiers with devices: */
+
+ for (tier = c->tiers;
+ tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+ if (!tier->devs.nr)
+ continue;
+ if (!fastest_tier)
+ fastest_tier = tier;
+ slowest_tier = tier;
+ }
+
+ c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
+
+ c->promote_write_point.group = &fastest_tier->devs;
+
+ if (!fastest_tier)
+ goto set_capacity;
+
/*
* Capacity of the cache set is the capacity of all the devices in the
* slowest (highest) tier - we don't include lower tier devices.
*/
- for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1;
- tier > c->cache_tiers && !tier->nr_devices;
- --tier)
- ;
-
- group_for_each_cache_rcu(ca, tier, i) {
+ group_for_each_cache_rcu(ca, &slowest_tier->devs, i) {
size_t reserve = 0;
/*
@@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c)
ca->mi.first_bucket) <<
ca->bucket_bits;
}
+set_capacity:
rcu_read_unlock();
-
total_capacity = capacity;
capacity *= (100 - c->opts.gc_reserve_percent);
@@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca)
void bch_dev_allocator_stop(struct cache *ca)
{
struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
struct task_struct *p;
struct closure cl;
unsigned i;
@@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca)
int bch_dev_allocator_start(struct cache *ca)
{
struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+ struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
struct task_struct *k;
/*
@@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca)
bch_dev_group_add(tier, ca);
bch_dev_group_add(&c->cache_all, ca);
+ bch_dev_group_add(&c->journal.devs, ca);
bch_recalc_capacity(c);
@@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca)
return 0;
}
-void bch_open_buckets_init(struct cache_set *c)
+void bch_fs_allocator_init(struct cache_set *c)
{
unsigned i;
@@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c)
spin_lock_init(&c->cache_all.lock);
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
- c->write_points[i].throttle = true;
- c->write_points[i].group = &c->cache_tiers[0];
- }
-
- for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
- spin_lock_init(&c->cache_tiers[i].lock);
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
+ spin_lock_init(&c->tiers[i].devs.lock);
- c->promote_write_point.group = &c->cache_tiers[0];
-
- c->migration_write_point.group = &c->cache_all;
-
- c->btree_write_point.group = &c->cache_all;
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ c->write_points[i].throttle = true;
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
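
(Illustration, not part of the patch.) recalc_alloc_group_weights() above maps each device's free-bucket count into a fixed integer weight range so that emptier devices are preferred during allocation. A minimal standalone sketch of that normalization, with made-up free-bucket counts:

/* sketch of the weight normalization in recalc_alloc_group_weights();
 * device count and free-bucket numbers are made up for illustration */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t free_buckets[] = { 1000, 250, 10 };	/* hypothetical devices */
	unsigned nr = 3, i;
	uint64_t available = 1;				/* avoid a divide by zero */
	const uint64_t min_weight = UINT32_MAX >> 4;
	const uint64_t max_weight = UINT32_MAX;

	for (i = 0; i < nr; i++)
		available += free_buckets[i];

	for (i = 0; i < nr; i++) {
		uint64_t w = min_weight +
			free_buckets[i] * nr * (max_weight - min_weight) / available;

		if (w > max_weight)
			w = max_weight;
		printf("dev %u: weight %llu\n", i, (unsigned long long) w);
	}
	return 0;
}
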
diff --git a/libbcache/alloc.h b/libbcache/alloc.h
index 09139a59..9573dd2c 100644
--- a/libbcache/alloc.h
+++ b/libbcache/alloc.h
@@ -27,6 +27,8 @@ int bch_prio_read(struct cache *);
void bch_recalc_min_prio(struct cache *, int);
+size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
+
void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
@@ -58,7 +60,7 @@ static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
{
struct cache *ret = NULL;
- while (*iter < devs->nr_devices &&
+ while (*iter < devs->nr &&
!(ret = rcu_dereference(devs->d[*iter].dev)))
(*iter)++;
@@ -103,8 +105,9 @@ static inline struct cache *cache_group_next(struct cache_group *devs,
((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\
(_ptr)++)
+void bch_recalc_capacity(struct cache_set *);
void bch_dev_allocator_stop(struct cache *);
int bch_dev_allocator_start(struct cache *);
-void bch_open_buckets_init(struct cache_set *);
+void bch_fs_allocator_init(struct cache_set *);
#endif /* _BCACHE_ALLOC_H */
diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h
index fbe8b75c..f408bd97 100644
--- a/libbcache/alloc_types.h
+++ b/libbcache/alloc_types.h
@@ -51,7 +51,7 @@ static inline bool allocation_is_metadata(enum alloc_reserve id)
struct cache_group {
spinlock_t lock;
- unsigned nr_devices;
+ unsigned nr;
unsigned cur_device;
struct {
u64 weight;
diff --git a/libbcache/bcache.h b/libbcache/bcache.h
index babc08db..5b668c71 100644
--- a/libbcache/bcache.h
+++ b/libbcache/bcache.h
@@ -464,24 +464,10 @@ struct cache {
* BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
* all the backing devices first (their cached data gets invalidated, and they
* won't automatically reattach).
- *
- * BCH_FS_STOPPING always gets set first when we're closing down a cache set;
- * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e.
- * flushing dirty data).
- *
- * BCH_FS_RUNNING means all cache devices have been registered and journal
- * replay is complete.
*/
enum {
- /* Startup: */
BCH_FS_INITIAL_GC_DONE,
- BCH_FS_RUNNING,
-
- /* Shutdown: */
BCH_FS_DETACHING,
- BCH_FS_STOPPING,
- BCH_FS_RO,
- BCH_FS_RO_COMPLETE,
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_GC_STOPPING,
@@ -498,6 +484,21 @@ struct btree_debug {
struct dentry *failed;
};
+struct bch_tier {
+ unsigned idx;
+ struct task_struct *migrate;
+ struct bch_pd_controller pd;
+
+ struct cache_group devs;
+};
+
+enum bch_fs_state {
+ BCH_FS_STARTING = 0,
+ BCH_FS_STOPPING,
+ BCH_FS_RO,
+ BCH_FS_RW,
+};
+
struct cache_set {
struct closure cl;
@@ -506,7 +507,6 @@ struct cache_set {
struct kobject internal;
struct kobject opts_dir;
struct kobject time_stats;
- struct completion *stop_completion;
unsigned long flags;
int minor;
@@ -514,6 +514,10 @@ struct cache_set {
struct super_block *vfs_sb;
char name[40];
+ /* ro/rw, add/remove devices: */
+ struct mutex state_lock;
+ enum bch_fs_state state;
+
/* Counts outstanding writes, for clean transition to read-only */
struct percpu_ref writes;
struct work_struct read_only_work;
@@ -640,7 +644,9 @@ struct cache_set {
* allocate from:
*/
struct cache_group cache_all;
- struct cache_group cache_tiers[BCH_TIER_MAX];
+ struct bch_tier tiers[BCH_TIER_MAX];
+ /* NULL if we only have devices in one tier: */
+ struct bch_tier *fastest_tier;
u64 capacity; /* sectors */
@@ -753,10 +759,6 @@ struct cache_set {
unsigned writeback_pages_max;
atomic_long_t nr_inodes;
- /* TIERING */
- struct task_struct *tiering_read;
- struct bch_pd_controller tiering_pd;
-
/* NOTIFICATIONS */
struct mutex uevent_lock;
struct kobj_uevent_env uevent_env;
@@ -828,6 +830,11 @@ struct cache_set {
#undef BCH_TIME_STAT
};
+static inline bool bch_fs_running(struct cache_set *c)
+{
+ return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+}
+
static inline unsigned bucket_pages(const struct cache *ca)
{
return ca->mi.bucket_size / PAGE_SECTORS;
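
(Illustration, not part of the patch.) The new enum bch_fs_state replaces the old BCH_FS_RUNNING/BCH_FS_STOPPING flag bits, and bch_fs_running() above counts both the RO and RW states as "running". A trivial standalone sketch of that predicate, with the enum values mirrored from the patch and exercised outside their real context:

#include <stdbool.h>
#include <stdio.h>

enum bch_fs_state {
	BCH_FS_STARTING = 0,
	BCH_FS_STOPPING,
	BCH_FS_RO,
	BCH_FS_RW,
};

/* mirrors bch_fs_running(), minus the struct cache_set argument */
static bool fs_running(enum bch_fs_state state)
{
	return state == BCH_FS_RO || state == BCH_FS_RW;
}

int main(void)
{
	printf("starting: %d  stopping: %d  ro: %d  rw: %d\n",
	       fs_running(BCH_FS_STARTING), fs_running(BCH_FS_STOPPING),
	       fs_running(BCH_FS_RO), fs_running(BCH_FS_RW));
	return 0;
}
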
diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c
index 82b07f59..ba2e9a8c 100644
--- a/libbcache/blockdev.c
+++ b/libbcache/blockdev.c
@@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
bool found;
int ret;
+ lockdep_assert_held(&c->state_lock);
+
bdevname(dc->disk_sb.bdev, buf);
if (memcmp(&dc->disk_sb.sb->set_uuid,
@@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
return -EINVAL;
}
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
- return 0;
-
- if (test_bit(BCH_FS_STOPPING, &c->flags)) {
- pr_err("Can't attach %s: shutting down", buf);
+ if (!bch_fs_running(c)) {
+ pr_err("Can't attach %s: not running", buf);
return -EINVAL;
}
@@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c)
struct cached_dev *dc, *t;
lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
list_for_each_entry_safe(dc, t, &uncached_devices, list)
bch_cached_dev_attach(dc, c);
@@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c)
struct bkey_s_c_inode_blockdev inode;
int ret = 0;
- if (test_bit(BCH_FS_STOPPING, &c->flags))
+ if (!bch_fs_running(c))
return -EINVAL;
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c
index 4d5efdbd..4d0c6d4d 100644
--- a/libbcache/btree_cache.c
+++ b/libbcache/btree_cache.c
@@ -11,8 +11,9 @@
#define DEF_BTREE_ID(kwd, val, name) name,
-const char *bch_btree_id_names[BTREE_ID_NR] = {
+const char * const bch_btree_ids[] = {
DEFINE_BCH_BTREE_IDS()
+ NULL
};
#undef DEF_BTREE_ID
@@ -311,7 +312,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
return mca_can_free(c) * btree_pages(c);
}
-void bch_btree_cache_free(struct cache_set *c)
+void bch_fs_btree_exit(struct cache_set *c)
{
struct btree *b;
unsigned i;
@@ -358,7 +359,7 @@ void bch_btree_cache_free(struct cache_set *c)
rhashtable_destroy(&c->btree_cache_table);
}
-int bch_btree_cache_alloc(struct cache_set *c)
+int bch_fs_btree_init(struct cache_set *c)
{
unsigned i;
int ret;
diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h
index c26489d1..4d67704b 100644
--- a/libbcache/btree_cache.h
+++ b/libbcache/btree_cache.h
@@ -6,7 +6,7 @@
struct btree_iter;
-extern const char *bch_btree_id_names[BTREE_ID_NR];
+extern const char * const bch_btree_ids[];
void bch_recalc_btree_reserve(struct cache_set *);
@@ -22,8 +22,8 @@ struct btree *mca_alloc(struct cache_set *);
struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *,
unsigned, enum six_lock_type);
-void bch_btree_cache_free(struct cache_set *);
-int bch_btree_cache_alloc(struct cache_set *);
+void bch_fs_btree_exit(struct cache_set *);
+int bch_fs_btree_init(struct cache_set *);
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \
diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c
index 0eb7290c..b90807f7 100644
--- a/libbcache/btree_gc.c
+++ b/libbcache/btree_gc.c
@@ -262,30 +262,72 @@ static void bch_mark_allocator_buckets(struct cache_set *c)
}
}
+static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end,
+ enum bucket_data_type type)
+{
+ u64 b = start >> ca->bucket_bits;
+
+ do {
+ bch_mark_metadata_bucket(ca, ca->buckets + b, type, true);
+ b++;
+ } while (b < end >> ca->bucket_bits);
+}
+
/*
* Mark non btree metadata - prios, journal
*/
-static void bch_mark_metadata(struct cache_set *c)
+static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca)
{
- struct cache *ca;
- unsigned i, j;
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ unsigned i;
u64 b;
- for_each_cache(ca, c, i) {
- for (j = 0; j < ca->journal.nr; j++) {
- b = ca->journal.buckets[j];
- bch_mark_metadata_bucket(ca, ca->buckets + b, true);
- }
+ /* Mark superblocks: */
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ if (layout->sb_offset[i] == BCH_SB_SECTOR)
+ mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
+ BUCKET_SB);
+
+ mark_metadata_sectors(ca,
+ layout->sb_offset[i],
+ layout->sb_offset[i] +
+ (1 << layout->sb_max_size_bits),
+ BUCKET_SB);
+ }
- spin_lock(&ca->prio_buckets_lock);
+ spin_lock(&c->journal.lock);
- for (j = 0; j < prio_buckets(ca) * 2; j++) {
- b = ca->prio_buckets[j];
- bch_mark_metadata_bucket(ca, ca->buckets + b, true);
- }
+ for (i = 0; i < ca->journal.nr; i++) {
+ b = ca->journal.buckets[i];
+ bch_mark_metadata_bucket(ca, ca->buckets + b,
+ BUCKET_JOURNAL, true);
+ }
+
+ spin_unlock(&c->journal.lock);
+
+ spin_lock(&ca->prio_buckets_lock);
- spin_unlock(&ca->prio_buckets_lock);
+ for (i = 0; i < prio_buckets(ca) * 2; i++) {
+ b = ca->prio_buckets[i];
+ if (b)
+ bch_mark_metadata_bucket(ca, ca->buckets + b,
+ BUCKET_PRIOS, true);
}
+
+ spin_unlock(&ca->prio_buckets_lock);
+}
+
+static void bch_mark_metadata(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&c->sb_lock);
+
+ for_each_cache(ca, c, i)
+ bch_mark_dev_metadata(c, ca);
+
+ mutex_unlock(&c->sb_lock);
}
/* Also see bch_pending_btree_node_free_insert_done() */
@@ -389,7 +431,7 @@ void bch_gc(struct cache_set *c)
for_each_bucket(g, ca) {
bucket_cmpxchg(g, new, ({
new.owned_by_allocator = 0;
- new.is_metadata = 0;
+ new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
@@ -750,9 +792,6 @@ void bch_coalesce(struct cache_set *c)
u64 start_time;
enum btree_id id;
- if (btree_gc_coalesce_disabled(c))
- return;
-
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return;
@@ -811,7 +850,8 @@ static int bch_gc_thread(void *arg)
last_kick = atomic_read(&c->kick_gc);
bch_gc(c);
- bch_coalesce(c);
+ if (!btree_gc_coalesce_disabled(c))
+ bch_coalesce(c);
debug_check_no_locks_held();
}
@@ -823,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c)
{
set_bit(BCH_FS_GC_STOPPING, &c->flags);
- if (!IS_ERR_OR_NULL(c->gc_thread))
+ if (c->gc_thread)
kthread_stop(c->gc_thread);
+
+ c->gc_thread = NULL;
+ clear_bit(BCH_FS_GC_STOPPING, &c->flags);
}
int bch_gc_thread_start(struct cache_set *c)
{
- clear_bit(BCH_FS_GC_STOPPING, &c->flags);
+ struct task_struct *p;
- c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
- if (IS_ERR(c->gc_thread))
- return PTR_ERR(c->gc_thread);
+ BUG_ON(c->gc_thread);
+ p = kthread_create(bch_gc_thread, c, "bcache_gc");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ c->gc_thread = p;
wake_up_process(c->gc_thread);
return 0;
}
@@ -883,12 +929,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
{
enum btree_id id;
- if (journal) {
- for (id = 0; id < BTREE_ID_NR; id++)
- bch_initial_gc_btree(c, id);
+ bch_mark_metadata(c);
+ for (id = 0; id < BTREE_ID_NR; id++)
+ bch_initial_gc_btree(c, id);
+
+ if (journal)
bch_journal_mark(c, journal);
- }
/*
* Skip past versions that might have possibly been used (as nonces),
@@ -897,8 +944,6 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
- bch_mark_metadata(c);
-
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
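
(Illustration, not part of the patch.) mark_metadata_sectors() above converts a sector range into the buckets it overlaps by shifting sector offsets down by bucket_bits, so the superblock and journal regions get marked one whole bucket at a time. A standalone sketch of that iteration, with a made-up bucket size and sector range:

/* sketch of the sector-range to bucket-range walk in mark_metadata_sectors();
 * bucket_bits and the sector range are made-up values */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned bucket_bits = 7;		/* hypothetical 128-sector buckets */
	uint64_t start = 8, end = 520;		/* sector range to mark */
	uint64_t b = start >> bucket_bits;

	do {
		printf("mark bucket %llu as metadata\n", (unsigned long long) b);
		b++;
	} while (b < end >> bucket_bits);

	return 0;
}
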
diff --git a/libbcache/buckets.c b/libbcache/buckets.c
index 315cfbec..ec4ee54a 100644
--- a/libbcache/buckets.c
+++ b/libbcache/buckets.c
@@ -66,6 +66,7 @@
#include "alloc.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "error.h"
#include <linux/preempt.h>
#include <trace/events/bcache.h>
@@ -102,6 +103,10 @@ static void bch_fs_stats_verify(struct cache_set *c) {}
#endif
+/*
+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent
+ * wraparound:
+ */
void bch_bucket_seq_cleanup(struct cache_set *c)
{
u16 last_seq_ondisk = c->journal.last_seq_ondisk;
@@ -113,12 +118,11 @@ void bch_bucket_seq_cleanup(struct cache_set *c)
for_each_cache(ca, c, i)
for_each_bucket(g, ca) {
bucket_cmpxchg(g, m, ({
- if (!m.wait_on_journal ||
- ((s16) last_seq_ondisk -
- (s16) m.journal_seq < 0))
+ if (!m.journal_seq_valid ||
+ bucket_needs_journal_commit(m, last_seq_ondisk))
break;
- m.wait_on_journal = 0;
+ m.journal_seq_valid = 0;
}));
}
}
@@ -186,17 +190,18 @@ bch_bucket_stats_read_cache_set(struct cache_set *c)
static inline int is_meta_bucket(struct bucket_mark m)
{
- return !m.owned_by_allocator && m.is_metadata;
+ return m.data_type != BUCKET_DATA;
}
static inline int is_dirty_bucket(struct bucket_mark m)
{
- return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors;
+ return m.data_type == BUCKET_DATA && !!m.dirty_sectors;
}
static inline int is_cached_bucket(struct bucket_mark m)
{
- return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors;
+ return m.data_type == BUCKET_DATA &&
+ !m.dirty_sectors && !!m.cached_sectors;
}
void bch_fs_stats_apply(struct cache_set *c,
@@ -236,29 +241,37 @@ void bch_fs_stats_apply(struct cache_set *c,
memset(stats, 0, sizeof(*stats));
}
+static bool bucket_became_unavailable(struct cache_set *c,
+ struct bucket_mark old,
+ struct bucket_mark new)
+{
+ return is_available_bucket(old) &&
+ !is_available_bucket(new) &&
+ c->gc_pos.phase == GC_PHASE_DONE;
+}
+
static void bucket_stats_update(struct cache *ca,
struct bucket_mark old, struct bucket_mark new,
- bool may_make_unavailable,
struct bucket_stats_cache_set *bch_alloc_stats)
{
struct cache_set *c = ca->set;
struct bucket_stats_cache *cache_stats;
- BUG_ON(!may_make_unavailable &&
- is_available_bucket(old) &&
- !is_available_bucket(new) &&
- c->gc_pos.phase == GC_PHASE_DONE);
+ bch_fs_inconsistent_on(old.data_type && new.data_type &&
+ old.data_type != new.data_type, c,
+ "different types of metadata in same bucket: %u, %u",
+ old.data_type, new.data_type);
if (bch_alloc_stats) {
bch_alloc_stats->s[S_COMPRESSED][S_CACHED] +=
(int) new.cached_sectors - (int) old.cached_sectors;
bch_alloc_stats->s[S_COMPRESSED]
- [old.is_metadata ? S_META : S_DIRTY] -=
+ [is_meta_bucket(old) ? S_META : S_DIRTY] -=
old.dirty_sectors;
bch_alloc_stats->s[S_COMPRESSED]
- [new.is_metadata ? S_META : S_DIRTY] +=
+ [is_meta_bucket(new) ? S_META : S_DIRTY] +=
new.dirty_sectors;
}
@@ -268,12 +281,12 @@ static void bucket_stats_update(struct cache *ca,
cache_stats->sectors_cached +=
(int) new.cached_sectors - (int) old.cached_sectors;
- if (old.is_metadata)
+ if (is_meta_bucket(old))
cache_stats->sectors_meta -= old.dirty_sectors;
else
cache_stats->sectors_dirty -= old.dirty_sectors;
- if (new.is_metadata)
+ if (is_meta_bucket(new))
cache_stats->sectors_meta += new.dirty_sectors;
else
cache_stats->sectors_dirty += new.dirty_sectors;
@@ -290,6 +303,15 @@ static void bucket_stats_update(struct cache *ca,
bch_wake_allocator(ca);
}
+#define bucket_data_cmpxchg(ca, g, new, expr) \
+({ \
+ struct bucket_stats_cache_set _stats = { 0 }; \
+ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
+ \
+ bucket_stats_update(ca, _old, new, &_stats); \
+ _old; \
+})
+
void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
{
struct bucket_stats_cache_set stats = { 0 };
@@ -297,16 +319,17 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
old = bucket_cmpxchg(g, new, ({
new.owned_by_allocator = 1;
- new.is_metadata = 0;
+ new.had_metadata = 0;
+ new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
new.copygc = 0;
new.gen++;
}));
- BUG_ON(old.dirty_sectors);
+ bucket_stats_update(ca, old, new, &stats);
- bucket_stats_update(ca, old, new, true, &stats);
+ BUG_ON(old.dirty_sectors);
/*
* Ick:
@@ -329,45 +352,45 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
void bch_mark_free_bucket(struct cache *ca, struct bucket *g)
{
- struct bucket_stats_cache_set stats = { 0 };
struct bucket_mark old, new;
- old = bucket_cmpxchg(g, new, ({
+ old = bucket_data_cmpxchg(ca, g, new, ({
new.owned_by_allocator = 0;
- new.is_metadata = 0;
+ new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
}));
- bucket_stats_update(ca, old, new, false, &stats);
+ BUG_ON(bucket_became_unavailable(ca->set, old, new));
}
void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g,
bool owned_by_allocator)
{
- struct bucket_stats_cache_set stats = { 0 };
- struct bucket_mark old, new;
+ struct bucket_mark new;
- old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator);
-
- bucket_stats_update(ca, old, new, true, &stats);
+ bucket_data_cmpxchg(ca, g, new, ({
+ new.owned_by_allocator = owned_by_allocator;
+ }));
}
void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g,
+ enum bucket_data_type type,
bool may_make_unavailable)
{
- struct bucket_stats_cache_set stats = { 0 };
struct bucket_mark old, new;
- old = bucket_cmpxchg(g, new, ({
- new.is_metadata = 1;
+ BUG_ON(!type);
+
+ old = bucket_data_cmpxchg(ca, g, new, ({
+ new.data_type = type;
new.had_metadata = 1;
}));
BUG_ON(old.cached_sectors);
BUG_ON(old.dirty_sectors);
-
- bucket_stats_update(ca, old, new, may_make_unavailable, &stats);
+ BUG_ON(!may_make_unavailable &&
+ bucket_became_unavailable(ca->set, old, new));
}
#define saturated_add(ca, dst, src, max) \
@@ -487,22 +510,26 @@ static void bch_mark_pointer(struct cache_set *c,
if (!new.dirty_sectors &&
!new.cached_sectors) {
- new.is_metadata = false;
+ new.data_type = 0;
if (journal_seq) {
- new.wait_on_journal = true;
+ new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
} else {
- new.is_metadata = (type == S_META);
+ new.data_type = type == S_META
+ ? BUCKET_BTREE : BUCKET_DATA;
}
- new.had_metadata |= new.is_metadata;
+ new.had_metadata |= is_meta_bucket(new);
} while ((v = cmpxchg(&g->_mark.counter,
old.counter,
new.counter)) != old.counter);
- bucket_stats_update(ca, old, new, may_make_unavailable, NULL);
+ bucket_stats_update(ca, old, new, NULL);
+
+ BUG_ON(!may_make_unavailable &&
+ bucket_became_unavailable(c, old, new));
if (saturated &&
atomic_long_add_return(saturated,
diff --git a/libbcache/buckets.h b/libbcache/buckets.h
index 9c6e4385..6d70103e 100644
--- a/libbcache/buckets.h
+++ b/libbcache/buckets.h
@@ -235,8 +235,16 @@ static inline u64 sectors_available(struct cache_set *c)
static inline bool is_available_bucket(struct bucket_mark mark)
{
return (!mark.owned_by_allocator &&
- !mark.is_metadata &&
- !mark.dirty_sectors);
+ mark.data_type == BUCKET_DATA &&
+ !mark.dirty_sectors &&
+ !mark.nouse);
+}
+
+static inline bool bucket_needs_journal_commit(struct bucket_mark m,
+ u16 last_seq_ondisk)
+{
+ return m.journal_seq_valid &&
+ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
}
void bch_bucket_seq_cleanup(struct cache_set *);
@@ -244,7 +252,8 @@ void bch_bucket_seq_cleanup(struct cache_set *);
void bch_invalidate_bucket(struct cache *, struct bucket *);
void bch_mark_free_bucket(struct cache *, struct bucket *);
void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool);
-void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool);
+void bch_mark_metadata_bucket(struct cache *, struct bucket *,
+ enum bucket_data_type, bool);
void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
struct bucket_stats_cache_set *);
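
(Illustration, not part of the patch.) bucket_needs_journal_commit() above compares truncated 16-bit journal sequence numbers with a signed subtraction, which keeps the "newer than" test correct across wraparound of the low 16 bits. A standalone sketch of why that works, with made-up sequence values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* mirrors the (s16) comparison in bucket_needs_journal_commit() */
static bool seq_newer(uint16_t bucket_seq, uint16_t last_seq_ondisk)
{
	return (int16_t) (bucket_seq - last_seq_ondisk) > 0;
}

int main(void)
{
	/* 0x0005 was assigned after 0xfffe even though it is numerically smaller */
	printf("%d\n", seq_newer(0x0005, 0xfffe));	/* 1: bucket still needs a commit */
	printf("%d\n", seq_newer(0x0005, 0x0005));	/* 0: already on disk */
	return 0;
}
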
diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h
index 6bbdcd26..f42e09d8 100644
--- a/libbcache/buckets_types.h
+++ b/libbcache/buckets_types.h
@@ -1,6 +1,14 @@
#ifndef _BUCKETS_TYPES_H
#define _BUCKETS_TYPES_H
+enum bucket_data_type {
+ BUCKET_DATA = 0,
+ BUCKET_BTREE,
+ BUCKET_PRIOS,
+ BUCKET_JOURNAL,
+ BUCKET_SB,
+};
+
struct bucket_mark {
union {
struct {
@@ -12,23 +20,30 @@ struct bucket_mark {
/* generation copygc is going to move this bucket into */
unsigned copygc:1;
- unsigned wait_on_journal:1;
+
+ unsigned journal_seq_valid:1;
/*
- * If this bucket ever had metadata in it, the allocator must
- * increment its gen before we reuse it:
+ * If this bucket had metadata while at the current generation
+ * number, the allocator must increment its gen before we reuse
+ * it:
*/
unsigned had_metadata:1;
unsigned owned_by_allocator:1;
- unsigned is_metadata:1;
- u16 cached_sectors;
+ unsigned data_type:3;
+
+ unsigned nouse:1;
+
u16 dirty_sectors;
+ u16 cached_sectors;
/*
* low bits of journal sequence number when this bucket was most
- * recently modified:
+ * recently modified: if journal_seq_valid is set, this bucket
+ * can't be reused until the journal sequence number written to
+ * disk is >= the bucket's journal sequence number:
*/
u16 journal_seq;
};
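
(Illustration, not part of the patch.) struct bucket_mark packs its flags and sector counts into a union with a single counter word, so the whole mark can be updated with one cmpxchg (the bucket_cmpxchg()/bucket_data_cmpxchg() loops in buckets.c). A minimal sketch of that union-of-bitfields pattern; the field widths here are abbreviated and this is not the real struct bucket_mark:

#include <stdint.h>
#include <stdio.h>

struct mark {
	union {
		struct {
			uint32_t gen:8;
			uint32_t data_type:3;
			uint32_t journal_seq_valid:1;
			uint32_t dirty_sectors:16;
		};
		uint32_t counter;	/* whole mark as one word, for cmpxchg */
	};
};

int main(void)
{
	struct mark old, new;

	old.counter = 0;
	new = old;
	new.data_type = 2;		/* e.g. BUCKET_PRIOS */
	new.dirty_sectors = 12;

	/* in the kernel: cmpxchg(&g->_mark.counter, old.counter, new.counter) */
	printf("old word %#x -> new word %#x\n", old.counter, new.counter);
	return 0;
}
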
diff --git a/libbcache/chardev.c b/libbcache/chardev.c
index b142d7b2..049aa910 100644
--- a/libbcache/chardev.c
+++ b/libbcache/chardev.c
@@ -107,7 +107,7 @@ static long bch_global_ioctl(unsigned cmd, void __user *arg)
static long bch_ioctl_stop(struct cache_set *c)
{
- bch_fs_stop(c);
+ bch_fs_stop_async(c);
return 0;
}
diff --git a/libbcache/checksum.c b/libbcache/checksum.c
index dae52d49..92036db4 100644
--- a/libbcache/checksum.c
+++ b/libbcache/checksum.c
@@ -539,15 +539,12 @@ int bch_enable_encryption(struct cache_set *c, bool keyed)
if (ret)
goto err;
- crypt = container_of_or_null(bch_fs_sb_field_resize(c, NULL,
- sizeof(*crypt) / sizeof(u64)),
- struct bch_sb_field_crypt, field);
+ crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
if (!crypt) {
ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
goto err;
}
- crypt->field.type = BCH_SB_FIELD_crypt;
crypt->key = key;
/* write superblock */
@@ -560,7 +557,7 @@ err:
return ret;
}
-void bch_fs_encryption_free(struct cache_set *c)
+void bch_fs_encryption_exit(struct cache_set *c)
{
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
diff --git a/libbcache/checksum.h b/libbcache/checksum.h
index 137c9155..9d4da08d 100644
--- a/libbcache/checksum.h
+++ b/libbcache/checksum.h
@@ -43,7 +43,7 @@ void bch_encrypt_bio(struct cache_set *, unsigned,
int bch_disable_encryption(struct cache_set *);
int bch_enable_encryption(struct cache_set *, bool);
-void bch_fs_encryption_free(struct cache_set *);
+void bch_fs_encryption_exit(struct cache_set *);
int bch_fs_encryption_init(struct cache_set *);
static inline unsigned bch_data_checksum_type(struct cache_set *c)
diff --git a/libbcache/compress.c b/libbcache/compress.c
index f81a8143..89da31e5 100644
--- a/libbcache/compress.c
+++ b/libbcache/compress.c
@@ -434,10 +434,10 @@ int bch_check_set_has_compressed_data(struct cache_set *c,
break;
}
- return bch_compress_init(c);
+ return bch_fs_compress_init(c);
}
-void bch_compress_free(struct cache_set *c)
+void bch_fs_compress_exit(struct cache_set *c)
{
vfree(c->zlib_workspace);
mempool_exit(&c->lz4_workspace_pool);
@@ -450,15 +450,11 @@ void bch_compress_free(struct cache_set *c)
max_t(size_t, zlib_inflate_workspacesize(), \
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
-int bch_compress_init(struct cache_set *c)
+int bch_fs_compress_init(struct cache_set *c)
{
unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
int ret, cpu;
- if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
- !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
- return 0;
-
if (!c->bio_decompress_worker) {
c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
if (!c->bio_decompress_worker)
@@ -474,6 +470,10 @@ int bch_compress_init(struct cache_set *c)
}
}
+ if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
+ !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
+ return 0;
+
if (!mempool_initialized(&c->compression_bounce[READ])) {
ret = mempool_init_page_pool(&c->compression_bounce[READ],
1, order);
diff --git a/libbcache/compress.h b/libbcache/compress.h
index 485acd95..4604b065 100644
--- a/libbcache/compress.h
+++ b/libbcache/compress.h
@@ -9,7 +9,7 @@ void bch_bio_compress(struct cache_set *, struct bio *, size_t *,
struct bio *, size_t *, unsigned *);
int bch_check_set_has_compressed_data(struct cache_set *, unsigned);
-void bch_compress_free(struct cache_set *);
-int bch_compress_init(struct cache_set *);
+void bch_fs_compress_exit(struct cache_set *);
+int bch_fs_compress_init(struct cache_set *);
#endif /* _BCACHE_COMPRESS_H */
diff --git a/libbcache/debug.c b/libbcache/debug.c
index d25c32ae..16cc72b9 100644
--- a/libbcache/debug.c
+++ b/libbcache/debug.c
@@ -409,13 +409,13 @@ static const struct file_operations bfloat_failed_debug_ops = {
.read = bch_read_bfloat_failed,
};
-void bch_debug_exit_cache_set(struct cache_set *c)
+void bch_fs_debug_exit(struct cache_set *c)
{
if (!IS_ERR_OR_NULL(c->debug))
debugfs_remove_recursive(c->debug);
}
-void bch_debug_init_cache_set(struct cache_set *c)
+void bch_fs_debug_init(struct cache_set *c)
{
struct btree_debug *bd;
char name[100];
@@ -432,18 +432,18 @@ void bch_debug_init_cache_set(struct cache_set *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- bd->btree = debugfs_create_file(bch_btree_id_names[bd->id],
+ bd->btree = debugfs_create_file(bch_btree_ids[bd->id],
0400, c->debug, bd,
&btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
- bch_btree_id_names[bd->id]);
+ bch_btree_ids[bd->id]);
bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
&btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch_btree_id_names[bd->id]);
+ bch_btree_ids[bd->id]);
bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
&bfloat_failed_debug_ops);
diff --git a/libbcache/debug.h b/libbcache/debug.h
index a3635e60..d34a95a0 100644
--- a/libbcache/debug.h
+++ b/libbcache/debug.h
@@ -52,11 +52,11 @@ static inline void bch_btree_verify(struct cache_set *c, struct btree *b)
}
#ifdef CONFIG_DEBUG_FS
-void bch_debug_exit_cache_set(struct cache_set *);
-void bch_debug_init_cache_set(struct cache_set *);
+void bch_fs_debug_exit(struct cache_set *);
+void bch_fs_debug_init(struct cache_set *);
#else
-static inline void bch_debug_exit_cache_set(struct cache_set *c) {}
-static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+static inline void bch_fs_debug_exit(struct cache_set *c) {}
+static inline void bch_fs_debug_init(struct cache_set *c) {}
#endif
void bch_debug_exit(void);
diff --git a/libbcache/error.c b/libbcache/error.c
index 9f39be1b..f4109da6 100644
--- a/libbcache/error.c
+++ b/libbcache/error.c
@@ -14,7 +14,7 @@ void bch_inconsistent_error(struct cache_set *c)
case BCH_ON_ERROR_RO:
if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
/* XXX do something better here? */
- bch_fs_stop(c);
+ bch_fs_stop_async(c);
return;
}
@@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
} else {
bch_notify_dev_error(ca, true);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
dev = bch_dev_may_remove(ca);
if (dev
? bch_dev_read_only(ca)
@@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
"too many IO errors on %s, setting %s RO",
bdevname(ca->disk_sb.bdev, buf),
dev ? "device" : "filesystem");
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
}
}
diff --git a/libbcache/extents.c b/libbcache/extents.c
index 523f3f48..c5e0e375 100644
--- a/libbcache/extents.c
+++ b/libbcache/extents.c
@@ -547,7 +547,7 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- !g->mark.is_metadata;
+ g->mark.data_type != BUCKET_BTREE;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
err = "inconsistent";
@@ -602,6 +602,7 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
+ struct extent_pick_ptr pick = { .ca = NULL };
struct cache *ca;
rcu_read_lock();
@@ -621,15 +622,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
PTR_BUCKET_NR(ca, ptr)))
continue;
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
+ if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
+ continue;
- return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca };
+ pick.ca = ca;
+ pick.ptr = *ptr;
}
+ if (pick.ca)
+ percpu_ref_get(&pick.ca->ref);
+
rcu_read_unlock();
- return (struct extent_pick_ptr) { .ca = NULL, };
+ return pick;
}
const struct bkey_ops bch_bkey_btree_ops = {
@@ -1880,7 +1885,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
if (stale)
break;
- bad = (mark.is_metadata ||
+ bad = (mark.data_type != BUCKET_DATA ||
(gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
!mark.owned_by_allocator &&
!(ptr->cached
@@ -2193,17 +2198,21 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
rcu_read_lock();
ret->ca = NULL;
- extent_for_each_online_device_crc(c, e, crc, ptr, ca)
- if (!ptr_stale(ca, ptr)) {
- *ret = (struct extent_pick_ptr) {
- .crc = crc_to_128(e.k, crc),
- .ptr = *ptr,
- .ca = ca,
- };
-
- if (ca != avoid)
- break;
- }
+ extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+ if (ptr_stale(ca, ptr))
+ continue;
+
+ if (ret->ca &&
+ (ca == avoid ||
+ ret->ca->mi.tier < ca->mi.tier))
+ continue;
+
+ *ret = (struct extent_pick_ptr) {
+ .crc = crc_to_128(e.k, crc),
+ .ptr = *ptr,
+ .ca = ca,
+ };
+ }
if (ret->ca)
percpu_ref_get(&ret->ca->ref);
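
(Illustration, not part of the patch.) Both pointer-picking paths above now scan every online replica and keep the one on the lowest-numbered (fastest) tier instead of returning the first usable pointer. A standalone sketch of that selection rule over made-up replicas:

#include <stdio.h>

struct replica { int dev; int tier; };

int main(void)
{
	struct replica replicas[] = { { 0, 1 }, { 1, 0 }, { 2, 1 } };
	struct replica *pick = NULL;
	unsigned i;

	for (i = 0; i < 3; i++) {
		/* mirrors: if (pick.ca && pick.ca->mi.tier < ca->mi.tier) continue; */
		if (pick && pick->tier < replicas[i].tier)
			continue;
		pick = &replicas[i];
	}

	printf("picked dev %d on tier %d\n", pick->dev, pick->tier);
	return 0;
}
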
diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c
index e9585fd5..e2f1427f 100644
--- a/libbcache/fs-gc.c
+++ b/libbcache/fs-gc.c
@@ -545,9 +545,9 @@ struct nlink {
u32 dir_count;
};
-DECLARE_GENRADIX_TYPE(nlinks, struct nlink);
+typedef GENRADIX(struct nlink) nlink_table;
-static void inc_link(struct cache_set *c, struct nlinks *links,
+static void inc_link(struct cache_set *c, nlink_table *links,
u64 range_start, u64 *range_end,
u64 inum, bool dir)
{
@@ -570,7 +570,7 @@ static void inc_link(struct cache_set *c, struct nlinks *links,
}
noinline_for_stack
-static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links,
+static int bch_gc_walk_dirents(struct cache_set *c, nlink_table *links,
u64 range_start, u64 *range_end)
{
struct btree_iter iter;
@@ -776,7 +776,7 @@ fsck_err:
noinline_for_stack
static int bch_gc_walk_inodes(struct cache_set *c,
struct bch_inode_unpacked *lostfound_inode,
- struct nlinks *links,
+ nlink_table *links,
u64 range_start, u64 range_end)
{
struct btree_iter iter;
@@ -850,7 +850,7 @@ noinline_for_stack
static int check_inode_nlinks(struct cache_set *c,
struct bch_inode_unpacked *lostfound_inode)
{
- struct nlinks links;
+ nlink_table links;
u64 this_iter_range_start, next_iter_range_start = 0;
int ret = 0;
diff --git a/libbcache/fs.c b/libbcache/fs.c
index ab0d9728..ec70a3e3 100644
--- a/libbcache/fs.c
+++ b/libbcache/fs.c
@@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name,
if (!c)
goto err_unlock;
- if (!test_bit(BCH_FS_RUNNING, &c->flags)) {
+ mutex_lock(&c->state_lock);
+
+ if (!bch_fs_running(c)) {
+ mutex_unlock(&c->state_lock);
err = "incomplete cache set";
c = NULL;
goto err_unlock;
}
closure_get(&c->cl);
+ mutex_unlock(&c->state_lock);
mutex_unlock(&bch_register_lock);
}
@@ -1291,22 +1295,19 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
if (ret)
return ret;
- mutex_lock(&bch_register_lock);
-
if (opts.read_only >= 0 &&
opts.read_only != c->opts.read_only) {
const char *err = NULL;
if (opts.read_only) {
- bch_fs_read_only_sync(c);
+ bch_fs_read_only(c);
sb->s_flags |= MS_RDONLY;
} else {
err = bch_fs_read_write(c);
if (err) {
bch_err(c, "error going rw: %s", err);
- ret = -EINVAL;
- goto unlock;
+ return -EINVAL;
}
sb->s_flags &= ~MS_RDONLY;
@@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
if (opts.errors >= 0)
c->opts.errors = opts.errors;
-unlock:
- mutex_unlock(&bch_register_lock);
-
return ret;
}
@@ -1449,7 +1447,7 @@ static void bch_kill_sb(struct super_block *sb)
generic_shutdown_super(sb);
if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
- bch_fs_stop_sync(c);
+ bch_fs_stop(c);
else
closure_put(&c->cl);
}
@@ -1464,7 +1462,7 @@ static struct file_system_type bcache_fs_type = {
MODULE_ALIAS_FS("bcache");
-void bch_fs_exit(void)
+void bch_vfs_exit(void)
{
unregister_filesystem(&bcache_fs_type);
if (bch_dio_write_bioset)
@@ -1477,7 +1475,7 @@ void bch_fs_exit(void)
kmem_cache_destroy(bch_inode_cache);
}
-int __init bch_fs_init(void)
+int __init bch_vfs_init(void)
{
int ret = -ENOMEM;
@@ -1504,6 +1502,6 @@ int __init bch_fs_init(void)
return 0;
err:
- bch_fs_exit();
+ bch_vfs_exit();
return ret;
}
diff --git a/libbcache/fs.h b/libbcache/fs.h
index 933fb6de..2a29b132 100644
--- a/libbcache/fs.h
+++ b/libbcache/fs.h
@@ -52,13 +52,13 @@ int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *,
int __must_check bch_write_inode(struct cache_set *,
struct bch_inode_info *);
-void bch_fs_exit(void);
-int bch_fs_init(void);
+void bch_vfs_exit(void);
+int bch_vfs_init(void);
#else
-static inline void bch_fs_exit(void) {}
-static inline int bch_fs_init(void) { return 0; }
+static inline void bch_vfs_exit(void) {}
+static inline int bch_vfs_init(void) { return 0; }
#endif
diff --git a/libbcache/io.c b/libbcache/io.c
index be99a973..a3df3794 100644
--- a/libbcache/io.c
+++ b/libbcache/io.c
@@ -722,9 +722,7 @@ void bch_wake_delayed_writes(unsigned long data)
spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
while ((op = c->write_wait_head)) {
- if (!test_bit(BCH_FS_RO, &c->flags) &&
- !test_bit(BCH_FS_STOPPING, &c->flags) &&
- time_after(op->expires, jiffies)) {
+ if (time_after(op->expires, jiffies)) {
mod_timer(&c->foreground_write_wakeup, op->expires);
break;
}
@@ -1068,9 +1066,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio)
return;
}
- if (rbio->promote &&
- !test_bit(BCH_FS_RO, &c->flags) &&
- !test_bit(BCH_FS_STOPPING, &c->flags)) {
+ if (rbio->promote) {
struct cache_promote_op *promote = rbio->promote;
struct closure *cl = &promote->cl;
@@ -1133,13 +1129,26 @@ static void bch_read_endio(struct bio *bio)
preempt_disable();
d = this_cpu_ptr(c->bio_decompress_worker);
llist_add(&rbio->list, &d->bio_list);
- queue_work(system_unbound_wq, &d->work);
+ queue_work(system_highpri_wq, &d->work);
preempt_enable();
} else {
__bch_read_endio(c, rbio);
}
}
+static bool should_promote(struct cache_set *c,
+ struct extent_pick_ptr *pick, unsigned flags)
+{
+ if (!(flags & BCH_READ_PROMOTE))
+ return false;
+
+ if (percpu_ref_is_dying(&c->writes))
+ return false;
+
+ return c->fastest_tier &&
+ c->fastest_tier < c->tiers + pick->ca->mi.tier;
+}
+
void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
struct bvec_iter iter, struct bkey_s_c k,
struct extent_pick_ptr *pick, unsigned flags)
@@ -1158,7 +1167,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
* XXX: multiple promotes can race with each other, wastefully. Keep a
* list of outstanding promotes?
*/
- if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) {
+ if (should_promote(c, pick, flags)) {
/*
* biovec needs to be big enough to hold decompressed data, if
* the bch_write_extent() has to decompress/recompress it:
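
(Illustration, not part of the patch.) should_promote() above only promotes a read when the filesystem is still accepting writes and the replica it was served from sits on a slower tier than the fastest one; with all devices in a single tier, c->fastest_tier is NULL and promotion is skipped entirely. A standalone sketch of that tier check, using pointer comparison into a made-up tier array:

#include <stdbool.h>
#include <stdio.h>

struct tier { int idx; };

/* mirrors the tail of should_promote(); all values below are made up */
static bool tier_wants_promote(struct tier *tiers, struct tier *fastest_tier,
			       unsigned dev_tier)
{
	return fastest_tier && fastest_tier < tiers + dev_tier;
}

int main(void)
{
	struct tier tiers[2] = { { 0 }, { 1 } };

	printf("%d\n", tier_wants_promote(tiers, &tiers[0], 1));	/* 1: read came from tier 1 */
	printf("%d\n", tier_wants_promote(tiers, &tiers[0], 0));	/* 0: already on the fastest tier */
	printf("%d\n", tier_wants_promote(tiers, NULL, 1));		/* 0: only one tier in use */
	return 0;
}
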
diff --git a/libbcache/journal.c b/libbcache/journal.c
index 99dd9f26..b2838376 100644
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c,
return BCH_FSCK_UNKNOWN_VERSION;
}
- if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 ||
- bytes > c->journal.entry_size_max, c,
+ if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
"journal entry too big (%zu bytes), sector %lluu",
bytes, sector)) {
/* XXX: note we might have missing journal entries */
@@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c)
{
struct journal *j = &c->journal;
struct journal_seq_blacklist *bl;
- struct cache *ca;
u64 new_seq = 0;
- unsigned i;
-
- for_each_cache(ca, c, i)
- if (is_journal_device(ca))
- bch_dev_group_add(&c->journal.devs, ca);
list_for_each_entry(bl, &j->seq_blacklist, list)
new_seq = max(new_seq, bl->seq);
@@ -1534,48 +1527,111 @@ err:
return ret;
}
-static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
+static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
+ unsigned nr, bool write_super)
{
+ struct journal *j = &c->journal;
struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets =
- bch_sb_get_journal(ca->disk_sb.sb);
- struct bch_sb_field *f;
- u64 *p;
+ struct bch_sb_field_journal *journal_buckets;
+ struct disk_reservation disk_res = { 0, 0 };
+ struct closure cl;
+ u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ int ret = 0;
- p = krealloc(ja->bucket_seq, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
+ closure_init_stack(&cl);
- ja->bucket_seq = p;
+ mutex_lock(&c->sb_lock);
- p = krealloc(ja->buckets, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
+ /* don't handle reducing nr of buckets yet: */
+ if (nr <= ja->nr)
+ goto err;
- ja->buckets = p;
+ /*
+ * note: journal buckets aren't really counted as _sectors_ used yet, so
+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+ * when space used goes up without a reservation - but we do need the
+ * reservation to ensure we'll actually be able to allocate:
+ */
- f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
- sizeof(*journal_buckets) / sizeof(u64));
- if (!f)
- return -ENOMEM;
- f->type = BCH_SB_FIELD_journal;
+ ret = ENOSPC;
+ if (bch_disk_reservation_get(c, &disk_res,
+ (nr - ja->nr) << ca->bucket_bits, 0))
+ goto err;
- ja->nr = nr;
- return 0;
+ ret = -ENOMEM;
+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+ if (!new_buckets || !new_bucket_seq)
+ goto err;
+
+ journal_buckets = bch_sb_resize_journal(&ca->disk_sb,
+ nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (!journal_buckets)
+ goto err;
+
+ spin_lock(&j->lock);
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+
+ while (ja->nr < nr) {
+ /* must happen under journal lock, to avoid racing with gc: */
+ u64 b = bch_bucket_alloc(ca, RESERVE_NONE);
+ if (!b) {
+ if (!closure_wait(&c->freelist_wait, &cl)) {
+ spin_unlock(&j->lock);
+ closure_sync(&cl);
+ spin_lock(&j->lock);
+ }
+ continue;
+ }
+
+ bch_mark_metadata_bucket(ca, &ca->buckets[b],
+ BUCKET_JOURNAL, false);
+ bch_mark_alloc_bucket(ca, &ca->buckets[b], false);
+
+ memmove(ja->buckets + ja->last_idx + 1,
+ ja->buckets + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+ memmove(ja->bucket_seq + ja->last_idx + 1,
+ ja->bucket_seq + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+ memmove(journal_buckets->buckets + ja->last_idx + 1,
+ journal_buckets->buckets + ja->last_idx,
+ (ja->nr - ja->last_idx) * sizeof(u64));
+
+ ja->buckets[ja->last_idx] = b;
+ journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
+
+ if (ja->last_idx < ja->nr) {
+ if (ja->cur_idx >= ja->last_idx)
+ ja->cur_idx++;
+ ja->last_idx++;
+ }
+ ja->nr++;
+
+ }
+ spin_unlock(&j->lock);
+
+ BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
+
+ if (write_super)
+ bch_write_super(c);
+
+ ret = 0;
+err:
+ mutex_unlock(&c->sb_lock);
+
+ kfree(new_bucket_seq);
+ kfree(new_buckets);
+ bch_disk_reservation_put(c, &disk_res);
+
+ return ret;
}
int bch_dev_journal_alloc(struct cache *ca)
{
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- int ret;
- unsigned i;
-
- if (ca->mi.tier != 0)
- return 0;
-
if (dynamic_fault("bcache:add:journal_alloc"))
return -ENOMEM;
@@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca)
* clamp journal size to 1024 buckets or 512MB (in sectors), whichever
* is smaller:
*/
- ret = bch_set_nr_journal_buckets(ca,
+ return bch_set_nr_journal_buckets(ca->set, ca,
clamp_t(unsigned, ca->mi.nbuckets >> 8,
BCH_JOURNAL_BUCKETS_MIN,
min(1 << 10,
- (1 << 20) / ca->mi.bucket_size)));
- if (ret)
- return ret;
-
- journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
-
- for (i = 0; i < ja->nr; i++) {
- u64 bucket = ca->mi.first_bucket + i;
-
- ja->buckets[i] = bucket;
- journal_buckets->buckets[i] = cpu_to_le64(bucket);
-
- bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
- }
-
- return 0;
+ (1 << 20) / ca->mi.bucket_size)),
+ false);
}
/* Journalling */
@@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j,
fifo_entry_idx(&j->pin, pin->pin_list))) {
if (journal_pin_active(pin))
__journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list,
- pin, NULL);
+ __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
}
spin_unlock_irq(&j->pin_lock);
}
-
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush)
{
@@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush)
return ret;
}
+static bool journal_has_pins(struct journal *j)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ journal_reclaim_fast(j);
+ ret = fifo_used(&j->pin) > 1 ||
+ atomic_read(&fifo_peek_front(&j->pin).count) > 1;
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+void bch_journal_flush_pins(struct journal *j)
+{
+ struct journal_entry_pin *pin;
+
+ while ((pin = journal_get_next_pin(j, U64_MAX)))
+ pin->flush(j, pin);
+
+ wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j));
+}
+
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
@@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
struct cache_set *c = container_of(j, struct cache_set, journal);
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
struct bch_extent_ptr *ptr;
+ struct journal_device *ja;
struct cache *ca;
- unsigned iter, replicas, replicas_want =
+ bool swapped;
+ unsigned i, replicas, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
spin_lock(&j->lock);
@@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
replicas = bch_extent_nr_ptrs(e.c);
+ spin_lock(&j->devs.lock);
+
+ /* Sort by tier: */
+ do {
+ swapped = false;
+
+ for (i = 0; i + 1 < j->devs.nr; i++)
+ if (j->devs.d[i + 0].dev->mi.tier >
+ j->devs.d[i + 1].dev->mi.tier) {
+ swap(j->devs.d[i], j->devs.d[i + 1]);
+ swapped = true;
+ }
+ } while (swapped);
+
/*
- * Determine location of the next journal write:
- * XXX: sort caches by free journal space
+ * Pick devices for next journal write:
+ * XXX: sort devices by free journal space?
*/
- group_for_each_cache_rcu(ca, &j->devs, iter) {
- struct journal_device *ja = &ca->journal;
+ for (i = 0; i < j->devs.nr; i++) {
+ ca = j->devs.d[i].dev;
+ ja = &ca->journal;
if (replicas >= replicas_want)
break;
@@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
}
-
+ spin_unlock(&j->devs.lock);
rcu_read_unlock();
j->prev_buf_sectors = 0;
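
The device list is tiny (at most BCH_SB_MEMBERS_MAX entries), so a simple in-place bubble sort under devs.lock is enough. The same sort applied to a plain array, as a standalone sketch with made-up device names:

#include <stdio.h>

struct dev { unsigned tier; const char *name; };

/* After sorting, faster (lower-numbered) tiers come first, so journal
 * writes are allocated from them preferentially. */
static void sort_by_tier(struct dev *d, unsigned nr)
{
    unsigned i;
    int swapped;

    do {
        swapped = 0;
        for (i = 0; i + 1 < nr; i++)
            if (d[i].tier > d[i + 1].tier) {
                struct dev tmp = d[i];
                d[i] = d[i + 1];
                d[i + 1] = tmp;
                swapped = 1;
            }
    } while (swapped);
}

int main(void)
{
    struct dev devs[] = { { 1, "hdd0" }, { 0, "ssd0" }, { 1, "hdd1" } };
    unsigned i;

    sort_by_tier(devs, 3);
    for (i = 0; i < 3; i++)
        printf("%s (tier %u)\n", devs[i].name, devs[i].tier);  /* ssd0 first */
    return 0;
}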
@@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j)
return bch_journal_flush_seq(j, seq);
}
-void bch_journal_free(struct journal *j)
-{
- unsigned order = get_order(j->entry_size_max);
-
- free_pages((unsigned long) j->buf[1].data, order);
- free_pages((unsigned long) j->buf[0].data, order);
- free_fifo(&j->pin);
-}
-
-int bch_journal_alloc(struct journal *j, unsigned entry_size_max)
-{
- static struct lock_class_key res_key;
- unsigned order = get_order(entry_size_max);
-
- spin_lock_init(&j->lock);
- spin_lock_init(&j->pin_lock);
- init_waitqueue_head(&j->wait);
- INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
- mutex_init(&j->blacklist_lock);
- INIT_LIST_HEAD(&j->seq_blacklist);
- spin_lock_init(&j->devs.lock);
- mutex_init(&j->reclaim_lock);
-
- lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
- j->entry_size_max = entry_size_max;
- j->write_delay_ms = 100;
- j->reclaim_delay_ms = 100;
-
- bkey_extent_init(&j->key);
-
- atomic64_set(&j->reservations.counter,
- ((union journal_res_state)
- { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
- !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
- return -ENOMEM;
-
- return 0;
-}
-
ssize_t bch_journal_print_debug(struct journal *j, char *buf)
{
union journal_res_state *s = &j->reservations;
@@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca)
return ret;
}
-void bch_journal_free_cache(struct cache *ca)
+void bch_fs_journal_stop(struct journal *j)
+{
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return;
+
+ /*
+ * Empty out the journal by first flushing everything pinning existing
+ * journal entries, then force a brand new empty journal entry to be
+ * written:
+ */
+ bch_journal_flush_pins(j);
+ bch_journal_flush_async(j, NULL);
+ bch_journal_meta(j);
+
+ cancel_delayed_work_sync(&j->write_work);
+ cancel_delayed_work_sync(&j->reclaim_work);
+}
+
+void bch_dev_journal_exit(struct cache *ca)
{
kfree(ca->journal.buckets);
kfree(ca->journal.bucket_seq);
}
-int bch_journal_init_cache(struct cache *ca)
+int bch_dev_journal_init(struct cache *ca)
{
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
@@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca)
return 0;
}
+
+void bch_fs_journal_exit(struct journal *j)
+{
+ unsigned order = get_order(j->entry_size_max);
+
+ free_pages((unsigned long) j->buf[1].data, order);
+ free_pages((unsigned long) j->buf[0].data, order);
+ free_fifo(&j->pin);
+}
+
+int bch_fs_journal_init(struct journal *j, unsigned entry_size_max)
+{
+ static struct lock_class_key res_key;
+ unsigned order = get_order(entry_size_max);
+
+ spin_lock_init(&j->lock);
+ spin_lock_init(&j->pin_lock);
+ init_waitqueue_head(&j->wait);
+ INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+ mutex_init(&j->blacklist_lock);
+ INIT_LIST_HEAD(&j->seq_blacklist);
+ spin_lock_init(&j->devs.lock);
+ mutex_init(&j->reclaim_lock);
+
+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+ j->entry_size_max = entry_size_max;
+ j->write_delay_ms = 100;
+ j->reclaim_delay_ms = 100;
+
+ bkey_extent_init(&j->key);
+
+ atomic64_set(&j->reservations.counter,
+ ((union journal_res_state)
+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
+ !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/libbcache/journal.h b/libbcache/journal.h
index 02a6e676..d3a1db0c 100644
--- a/libbcache/journal.h
+++ b/libbcache/journal.h
@@ -111,7 +111,6 @@
#include <linux/hash.h>
#include "journal_types.h"
-//#include "super-io.h"
/*
* Only used for holding the journal entries we read in btree_journal_read()
@@ -136,6 +135,7 @@ void bch_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
+void bch_journal_flush_pins(struct journal *);
struct closure;
struct cache_set;
@@ -330,11 +330,6 @@ static inline int bch_journal_error(struct journal *j)
? -EIO : 0;
}
-static inline bool is_journal_device(struct cache *ca)
-{
- return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0;
-}
-
static inline bool journal_flushes_device(struct cache *ca)
{
return true;
@@ -356,9 +351,6 @@ static inline void bch_journal_set_replay_done(struct journal *j)
spin_unlock(&j->lock);
}
-void bch_journal_free(struct journal *);
-int bch_journal_alloc(struct journal *, unsigned);
-
ssize_t bch_journal_print_debug(struct journal *, char *);
int bch_dev_journal_alloc(struct cache *);
@@ -372,7 +364,10 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j)
int bch_journal_move(struct cache *);
-void bch_journal_free_cache(struct cache *);
-int bch_journal_init_cache(struct cache *);
+void bch_fs_journal_stop(struct journal *);
+void bch_dev_journal_exit(struct cache *);
+int bch_dev_journal_init(struct cache *);
+void bch_fs_journal_exit(struct journal *);
+int bch_fs_journal_init(struct journal *, unsigned);
#endif /* _BCACHE_JOURNAL_H */
diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c
index e40dfbca..27f5c63c 100644
--- a/libbcache/movinggc.c
+++ b/libbcache/movinggc.c
@@ -191,7 +191,7 @@ static void bch_moving_gc(struct cache *ca)
}
if (g->mark.owned_by_allocator ||
- g->mark.is_metadata)
+ g->mark.data_type != BUCKET_DATA)
continue;
sectors_used = bucket_sectors_used(g);
@@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg)
return 0;
}
-void bch_moving_init_cache(struct cache *ca)
+void bch_moving_gc_stop(struct cache *ca)
{
- bch_pd_controller_init(&ca->moving_gc_pd);
- ca->moving_gc_pd.d_term = 0;
+ ca->moving_gc_pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+
+ if (ca->moving_gc_read)
+ kthread_stop(ca->moving_gc_read);
+ ca->moving_gc_read = NULL;
}
-int bch_moving_gc_thread_start(struct cache *ca)
+int bch_moving_gc_start(struct cache *ca)
{
struct task_struct *t;
- /* The moving gc read thread must be stopped */
- BUG_ON(ca->moving_gc_read != NULL);
+ BUG_ON(ca->moving_gc_read);
if (ca->set->opts.nochanges)
return 0;
@@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca)
return 0;
}
-void bch_moving_gc_stop(struct cache *ca)
+void bch_dev_moving_gc_init(struct cache *ca)
{
- ca->moving_gc_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&ca->moving_gc_pd.rate);
-
- if (ca->moving_gc_read)
- kthread_stop(ca->moving_gc_read);
- ca->moving_gc_read = NULL;
+ bch_pd_controller_init(&ca->moving_gc_pd);
+ ca->moving_gc_pd.d_term = 0;
}
diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h
index 5f153085..e8ae95e5 100644
--- a/libbcache/movinggc.h
+++ b/libbcache/movinggc.h
@@ -23,8 +23,8 @@
#define COPYGC_SECTORS_PER_ITER(ca) \
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
-void bch_moving_init_cache(struct cache *);
void bch_moving_gc_stop(struct cache *);
-int bch_moving_gc_thread_start(struct cache *);
+int bch_moving_gc_start(struct cache *);
+void bch_dev_moving_gc_init(struct cache *);
#endif
diff --git a/libbcache/opts.h b/libbcache/opts.h
index 95184db1..9b10310d 100644
--- a/libbcache/opts.h
+++ b/libbcache/opts.h
@@ -86,11 +86,17 @@ enum opt_type {
BCH_OPT(noreplay, 0444, NO_SB_OPT, \
s8, OPT_BOOL()) \
BCH_OPT(norecovery, 0444, NO_SB_OPT, \
- s8, OPT_BOOL())
+ s8, OPT_BOOL()) \
+ BCH_OPT(noexcl, 0444, NO_SB_OPT, \
+ s8, OPT_BOOL()) \
+ BCH_OPT(sb, 0444, NO_SB_OPT, \
+ s64, OPT_UINT(0, S64_MAX)) \
#define BCH_OPTS() \
BCH_OPT(read_only, 0444, NO_SB_OPT, \
s8, OPT_BOOL()) \
+ BCH_OPT(nostart, 0444, NO_SB_OPT, \
+ s8, OPT_BOOL()) \
BCH_VISIBLE_OPTS()
struct bch_opts {
@@ -145,6 +151,8 @@ static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src)
#undef BCH_OPT
}
+#define opt_defined(_opt) ((_opt) >= 0)
+
void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch_sb_opts(struct bch_sb *);
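
The three new options feed the superblock-read path further down in this patch: noexcl drops FMODE_EXCL, nochanges drops FMODE_WRITE, and sb overrides the read offset. A value below zero means "not set", which is exactly what opt_defined() tests. A minimal sketch of that decoding, assuming the default BCH_SB_SECTOR is 8:

#include <stdbool.h>
#include <stdio.h>

struct opts { signed char noexcl; long long sb; signed char nochanges; };

#define opt_defined(_opt) ((_opt) >= 0)

int main(void)
{
    struct opts opts = { .noexcl = -1, .sb = -1, .nochanges = -1 };  /* nothing set */

    bool excl  = !(opt_defined(opts.noexcl)    && opts.noexcl);
    bool write = !(opt_defined(opts.nochanges) && opts.nochanges);
    long long sb_offset = opt_defined(opts.sb) ? opts.sb : 8;  /* assumed BCH_SB_SECTOR */

    printf("exclusive=%d write=%d sb_offset=%lld\n", excl, write, sb_offset);
    return 0;
}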
diff --git a/libbcache/super-io.c b/libbcache/super-io.c
index be27d3ee..f50a5ee8 100644
--- a/libbcache/super-io.c
+++ b/libbcache/super-io.c
@@ -10,6 +10,7 @@
#include "vstructs.h"
#include <linux/backing-dev.h>
+#include <linux/sort.h>
static inline void __bch_sb_layout_size_assert(void)
{
@@ -17,7 +18,7 @@ static inline void __bch_sb_layout_size_assert(void)
}
struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb,
- enum bch_sb_field_types type)
+ enum bch_sb_field_type type)
{
struct bch_sb_field *f;
@@ -34,7 +35,7 @@ void bch_free_super(struct bcache_superblock *sb)
if (sb->bio)
bio_put(sb->bio);
if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ blkdev_put(sb->bdev, sb->mode);
free_pages((unsigned long) sb->sb, sb->page_order);
memset(sb, 0, sizeof(*sb));
@@ -74,7 +75,7 @@ static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
return 0;
}
-int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
+static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
{
u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
@@ -140,13 +141,29 @@ static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb,
le32_add_cpu(&sb->u64s, u64s - old_u64s);
return f;
+}
+
+struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb,
+ enum bch_sb_field_type type,
+ unsigned u64s)
+{
+ struct bch_sb_field *f = bch_sb_field_get(sb->sb, type);
+ ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ ssize_t d = -old_u64s + u64s;
+ if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+ return NULL;
+
+ f = __bch_sb_field_resize(sb->sb, f, u64s);
+ f->type = type;
+ return f;
}
struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
- struct bch_sb_field *f,
+ enum bch_sb_field_type type,
unsigned u64s)
{
+ struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type);
ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
ssize_t d = -old_u64s + u64s;
struct cache *ca;
@@ -160,26 +177,15 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
for_each_cache(ca, c, i) {
struct bcache_superblock *sb = &ca->disk_sb;
- if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+ if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
percpu_ref_put(&ca->ref);
return NULL;
}
}
- return __bch_sb_field_resize(c->disk_sb, f, u64s);
-}
-
-struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb,
- struct bch_sb_field *f,
- unsigned u64s)
-{
- ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- ssize_t d = -old_u64s + u64s;
-
- if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
- return NULL;
-
- return __bch_sb_field_resize(sb->sb, f, u64s);
+ f = __bch_sb_field_resize(c->disk_sb, f, u64s);
+ f->type = type;
+ return f;
}
static const char *validate_sb_layout(struct bch_sb_layout *layout)
@@ -203,9 +209,6 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
prev_offset = le64_to_cpu(layout->sb_offset[0]);
- if (prev_offset != BCH_SB_SECTOR)
- return "Invalid superblock layout: doesn't have default superblock location";
-
for (i = 1; i < layout->nr_superblocks; i++) {
offset = le64_to_cpu(layout->sb_offset[i]);
@@ -217,16 +220,70 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
return NULL;
}
+static int u64_cmp(const void *_l, const void *_r)
+{
+ u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
+
+ return l < r ? -1 : l > r ? 1 : 0;
+}
+
+const char *bch_validate_journal_layout(struct bch_sb *sb,
+ struct cache_member_cpu mi)
+{
+ struct bch_sb_field_journal *journal;
+ const char *err;
+ unsigned nr;
+ unsigned i;
+ u64 *b;
+
+ journal = bch_sb_get_journal(sb);
+ if (!journal)
+ return NULL;
+
+ nr = bch_nr_journal_buckets(journal);
+ if (!nr)
+ return NULL;
+
+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
+ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+ if (!b)
+ return "cannot allocate memory";
+
+ for (i = 0; i < nr; i++)
+ b[i] = le64_to_cpu(journal->buckets[i]);
+
+ sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+ err = "journal bucket at sector 0";
+ if (!b[0])
+ goto err;
+
+ err = "journal bucket before first bucket";
+ if (b[0] < mi.first_bucket)
+ goto err;
+
+ err = "journal bucket past end of device";
+ if (b[nr - 1] >= mi.nbuckets)
+ goto err;
+
+ err = "duplicate journal buckets";
+ for (i = 0; i + 1 < nr; i++)
+ if (b[i] == b[i + 1])
+ goto err;
+
+ err = NULL;
+err:
+ kfree(b);
+ return err;
+}
+
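
The same checks lifted into a userspace sketch: sorting first means the range checks only need the two ends of the array and the duplicate check only adjacent pairs.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int u64_cmp(const void *_l, const void *_r)
{
    uint64_t l = *(const uint64_t *) _l, r = *(const uint64_t *) _r;

    return l < r ? -1 : l > r ? 1 : 0;
}

/* Caller guarantees nr > 0, as in the patch. */
static const char *validate(uint64_t *b, unsigned nr,
                            uint64_t first_bucket, uint64_t nbuckets)
{
    unsigned i;

    qsort(b, nr, sizeof(b[0]), u64_cmp);

    if (!b[0])
        return "journal bucket at sector 0";
    if (b[0] < first_bucket)
        return "journal bucket before first bucket";
    if (b[nr - 1] >= nbuckets)
        return "journal bucket past end of device";
    for (i = 0; i + 1 < nr; i++)
        if (b[i] == b[i + 1])
            return "duplicate journal buckets";
    return NULL;
}

int main(void)
{
    uint64_t buckets[] = { 12, 10, 11, 11 };
    const char *err = validate(buckets, 4, 10, 1000);

    printf("%s\n", err ? err : "ok");   /* prints "duplicate journal buckets" */
    return 0;
}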
const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
struct bch_sb_field_members *sb_mi;
- struct bch_sb_field_journal *journal;
struct cache_member_cpu mi;
const char *err;
u16 block_size;
- unsigned i;
switch (le64_to_cpu(sb->version)) {
case BCACHE_SB_VERSION_CDEV_V4:
@@ -324,14 +381,6 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx);
- for (i = 0; i < sb->layout.nr_superblocks; i++) {
- u64 offset = le64_to_cpu(sb->layout.sb_offset[i]);
- u64 max_size = 1 << sb->layout.sb_max_size_bits;
-
- if (offset + max_size > mi.first_bucket * mi.bucket_size)
- return "Invalid superblock: first bucket comes before end of super";
- }
-
if (mi.nbuckets > LONG_MAX)
return "Too many buckets";
@@ -347,16 +396,9 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
mi.bucket_size * mi.nbuckets)
return "Invalid superblock: device too small";
- /* Validate journal buckets: */
- journal = bch_sb_get_journal(sb);
- if (journal) {
- for (i = 0; i < bch_nr_journal_buckets(journal); i++) {
- u64 b = le64_to_cpu(journal->buckets[i]);
-
- if (b < mi.first_bucket || b >= mi.nbuckets)
- return "bad journal bucket";
- }
- }
+ err = bch_validate_journal_layout(sb, mi);
+ if (err)
+ return err;
return NULL;
}
@@ -382,19 +424,19 @@ static bool bch_is_open_cache(struct block_device *bdev)
static bool bch_is_open(struct block_device *bdev)
{
- lockdep_assert_held(&bch_register_lock);
+ bool ret;
+
+ mutex_lock(&bch_register_lock);
+ ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+ mutex_unlock(&bch_register_lock);
- return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+ return ret;
}
-static const char *bch_blkdev_open(const char *path, void *holder,
- struct bch_opts opts,
- struct block_device **ret)
+static const char *bch_blkdev_open(const char *path, fmode_t mode,
+ void *holder, struct block_device **ret)
{
struct block_device *bdev;
- fmode_t mode = opts.nochanges > 0
- ? FMODE_READ
- : FMODE_READ|FMODE_WRITE|FMODE_EXCL;
const char *err;
*ret = NULL;
@@ -548,7 +590,7 @@ int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca)
unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
int ret;
- ret = bch_dev_sb_realloc(&ca->disk_sb, u64s);
+ ret = bch_sb_realloc(&ca->disk_sb, u64s);
if (ret)
return ret;
@@ -567,7 +609,7 @@ static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
reread:
bio_reset(sb->bio);
sb->bio->bi_bdev = sb->bdev;
- sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR;
+ sb->bio->bi_iter.bi_sector = offset;
sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch_bio_map(sb->bio, sb->sb);
@@ -610,15 +652,21 @@ const char *bch_read_super(struct bcache_superblock *sb,
struct bch_opts opts,
const char *path)
{
+ u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR;
struct bch_sb_layout layout;
const char *err;
unsigned i;
- lockdep_assert_held(&bch_register_lock);
-
memset(sb, 0, sizeof(*sb));
+ sb->mode = FMODE_READ;
+
+ if (!(opt_defined(opts.noexcl) && opts.noexcl))
+ sb->mode |= FMODE_EXCL;
- err = bch_blkdev_open(path, &sb, opts, &sb->bdev);
+ if (!(opt_defined(opts.nochanges) && opts.nochanges))
+ sb->mode |= FMODE_WRITE;
+
+ err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev);
if (err)
return err;
@@ -630,11 +678,16 @@ const char *bch_read_super(struct bcache_superblock *sb,
if (bch_fs_init_fault("read_super"))
goto err;
- err = read_one_super(sb, BCH_SB_SECTOR);
+ err = read_one_super(sb, offset);
if (!err)
goto got_super;
- pr_err("error reading default super: %s", err);
+ if (offset != BCH_SB_SECTOR) {
+ pr_err("error reading superblock: %s", err);
+ goto err;
+ }
+
+ pr_err("error reading default superblock: %s", err);
/*
* Error reading primary superblock - read location of backup
@@ -747,6 +800,9 @@ void bch_write_super(struct cache_set *c)
lockdep_assert_held(&c->sb_lock);
+ if (c->opts.nochanges)
+ return;
+
closure_init_stack(cl);
le64_add_cpu(&c->disk_sb->seq, 1);
diff --git a/libbcache/super-io.h b/libbcache/super-io.h
index 665de811..ae1e8b9d 100644
--- a/libbcache/super-io.h
+++ b/libbcache/super-io.h
@@ -6,16 +6,35 @@
#include <asm/byteorder.h>
-struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types);
-
-#define BCH_SB_FIELD_TYPE(_name) \
-static inline struct bch_sb_field_##_name * \
-bch_sb_get_##_name(struct bch_sb *sb) \
-{ \
- struct bch_sb_field *f = \
- bch_sb_field_get(sb, BCH_SB_FIELD_##_name); \
- \
- return container_of_or_null(f, struct bch_sb_field_##_name, field);\
+struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
+struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *,
+ enum bch_sb_field_type, unsigned);
+struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
+ enum bch_sb_field_type, unsigned);
+
+#define field_to_type(_f, _name) \
+ container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+#define BCH_SB_FIELD_TYPE(_name) \
+static inline struct bch_sb_field_##_name * \
+bch_sb_get_##_name(struct bch_sb *sb) \
+{ \
+ return field_to_type(bch_sb_field_get(sb, \
+ BCH_SB_FIELD_##_name), _name); \
+} \
+ \
+static inline struct bch_sb_field_##_name * \
+bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \
+{ \
+ return field_to_type(bch_sb_field_resize(sb, \
+ BCH_SB_FIELD_##_name, u64s), _name); \
+} \
+ \
+static inline struct bch_sb_field_##_name * \
+bch_fs_sb_resize_##_name(struct cache_set *c, unsigned u64s) \
+{ \
+ return field_to_type(bch_fs_sb_field_resize(c, \
+ BCH_SB_FIELD_##_name, u64s), _name); \
}
BCH_SB_FIELD_TYPE(journal);
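
The reworked BCH_SB_FIELD_TYPE() now stamps out three typed wrappers per field (bch_sb_get_journal(), bch_sb_resize_journal(), bch_fs_sb_resize_journal(), and likewise for members) around the generic lookup/resize. The container_of_or_null() conversion at its core looks like this, with toy types standing in for the real superblock structures:

#include <stddef.h>
#include <stdio.h>

struct field { int type; };
struct field_journal { struct field field; int nr_buckets; };

#define container_of_or_null(ptr, type, member) \
    ((ptr) ? (type *) ((char *) (ptr) - offsetof(type, member)) : (type *) NULL)

/* Generic lookup over the common headers, like bch_sb_field_get(). */
static struct field *field_get(struct field **fields, int n, int type)
{
    int i;

    for (i = 0; i < n; i++)
        if (fields[i]->type == type)
            return fields[i];
    return NULL;
}

/* Roughly what a generated bch_sb_get_journal() does, modulo names. */
static struct field_journal *get_journal(struct field **fields, int n)
{
    return container_of_or_null(field_get(fields, n, 1),
                                struct field_journal, field);
}

int main(void)
{
    struct field_journal j = { { 1 }, 8 };
    struct field *fields[] = { &j.field };

    printf("%d\n", get_journal(fields, 1)->nr_buckets);  /* prints 8 */
    return 0;
}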
@@ -85,14 +104,11 @@ int bch_fs_mi_update(struct cache_set *, struct bch_member *, unsigned);
int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *);
int bch_sb_from_cache_set(struct cache_set *, struct cache *);
-struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
- struct bch_sb_field *, unsigned);
-struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *,
- struct bch_sb_field *, unsigned);
-
void bch_free_super(struct bcache_superblock *);
int bch_super_realloc(struct bcache_superblock *, unsigned);
+const char *bch_validate_journal_layout(struct bch_sb *,
+ struct cache_member_cpu);
const char *bch_validate_cache_super(struct bcache_superblock *);
const char *bch_read_super(struct bcache_superblock *,
diff --git a/libbcache/super.c b/libbcache/super.c
index fab34805..5535639c 100644
--- a/libbcache/super.c
+++ b/libbcache/super.c
@@ -69,7 +69,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
struct workqueue_struct *bcache_io_wq;
struct crypto_shash *bch_sha256;
-static void bch_dev_stop(struct cache *);
+static void bch_dev_free(struct cache *);
static int bch_dev_online(struct cache *);
static int bch_congested_fn(void *data, int bdi_bits)
@@ -92,8 +92,11 @@ static int bch_congested_fn(void *data, int bdi_bits)
}
}
} else {
- /* Writes only go to tier 0: */
- group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
+ /* Writes prefer fastest tier: */
+ struct bch_tier *tier = READ_ONCE(c->fastest_tier);
+ struct cache_group *grp = tier ? &tier->devs : &c->cache_all;
+
+ group_for_each_cache_rcu(ca, grp, i) {
bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
if (bdi_congested(bdi, bdi_bits)) {
@@ -107,7 +110,7 @@ static int bch_congested_fn(void *data, int bdi_bits)
return ret;
}
-/* Cache set RO/RW: */
+/* Filesystem RO/RW: */
/*
* For startup/shutdown of RW stuff, the dependencies are:
@@ -129,9 +132,7 @@ static void __bch_fs_read_only(struct cache_set *c)
struct cache *ca;
unsigned i;
- c->tiering_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&c->tiering_pd.rate);
- bch_tiering_read_stop(c);
+ bch_tiering_stop(c);
for_each_cache(ca, c, i)
bch_moving_gc_stop(ca);
@@ -143,20 +144,7 @@ static void __bch_fs_read_only(struct cache_set *c)
for_each_cache(ca, c, i)
bch_dev_allocator_stop(ca);
- /*
- * Write a journal entry after flushing the btree, so we don't end up
- * replaying everything we just flushed:
- */
- if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
- int ret;
-
- bch_journal_flush_async(&c->journal, NULL);
- ret = bch_journal_meta(&c->journal);
- BUG_ON(ret && !bch_journal_error(&c->journal));
- }
-
- cancel_delayed_work_sync(&c->journal.write_work);
- cancel_delayed_work_sync(&c->journal.reclaim_work);
+ bch_fs_journal_stop(&c->journal);
}
static void bch_writes_disabled(struct percpu_ref *writes)
@@ -167,12 +155,27 @@ static void bch_writes_disabled(struct percpu_ref *writes)
wake_up(&bch_read_only_wait);
}
-static void bch_fs_read_only_work(struct work_struct *work)
+void bch_fs_read_only(struct cache_set *c)
{
- struct cache_set *c =
- container_of(work, struct cache_set, read_only_work);
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STARTING &&
+ c->state != BCH_FS_RW)
+ goto out;
+
+ if (test_bit(BCH_FS_ERROR, &c->flags))
+ goto out;
- percpu_ref_put(&c->writes);
+ trace_fs_read_only(c);
+
+ /*
+ * Block new foreground-end write operations from starting - any new
+ * writes will return -EROFS:
+ *
+ * (This is really blocking new _allocations_, writes to previously
+ * allocated space can still happen until stopping the allocator in
+ * bch_dev_allocator_stop()).
+ */
+ percpu_ref_kill(&c->writes);
del_timer(&c->foreground_write_wakeup);
cancel_delayed_work(&c->pd_controllers_update);
@@ -180,98 +183,77 @@ static void bch_fs_read_only_work(struct work_struct *work)
c->foreground_write_pd.rate.rate = UINT_MAX;
bch_wake_delayed_writes((unsigned long) c);
- if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
- /*
- * If we're not doing an emergency shutdown, we want to wait on
- * outstanding writes to complete so they don't see spurious
- * errors due to shutting down the allocator:
- */
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ /*
+ * If we're not doing an emergency shutdown, we want to wait on
+ * outstanding writes to complete so they don't see spurious errors due
+ * to shutting down the allocator:
+ *
+ * If we are doing an emergency shutdown outstanding writes may
+ * hang until we shutdown the allocator so we don't want to wait
+ * on outstanding writes before shutting everything down - but
+ * we do need to wait on them before returning and signalling
+ * that going RO is complete:
+ */
+ wait_event(bch_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
- __bch_fs_read_only(c);
+ __bch_fs_read_only(c);
- if (!bch_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb, true);
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
- } else {
- /*
- * If we are doing an emergency shutdown outstanding writes may
- * hang until we shutdown the allocator so we don't want to wait
- * on outstanding writes before shutting everything down - but
- * we do need to wait on them before returning and signalling
- * that going RO is complete:
- */
- __bch_fs_read_only(c);
+ wait_event(bch_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ if (!bch_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_ERROR, &c->flags)) {
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_CLEAN(c->disk_sb, true);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
+ c->state = BCH_FS_RO;
bch_notify_fs_read_only(c);
trace_fs_read_only_done(c);
-
- set_bit(BCH_FS_RO_COMPLETE, &c->flags);
- wake_up(&bch_read_only_wait);
+out:
+ mutex_unlock(&c->state_lock);
}
-bool bch_fs_read_only(struct cache_set *c)
+static void bch_fs_read_only_work(struct work_struct *work)
{
- if (test_and_set_bit(BCH_FS_RO, &c->flags))
- return false;
-
- trace_fs_read_only(c);
-
- percpu_ref_get(&c->writes);
+ struct cache_set *c =
+ container_of(work, struct cache_set, read_only_work);
- /*
- * Block new foreground-end write operations from starting - any new
- * writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch_dev_allocator_stop()).
- */
- percpu_ref_kill(&c->writes);
+ bch_fs_read_only(c);
+}
- queue_work(system_freezable_wq, &c->read_only_work);
- return true;
+static void bch_fs_read_only_async(struct cache_set *c)
+{
+ queue_work(system_long_wq, &c->read_only_work);
}
bool bch_fs_emergency_read_only(struct cache_set *c)
{
bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
- bch_fs_read_only(c);
+ bch_fs_read_only_async(c);
bch_journal_halt(&c->journal);
wake_up(&bch_read_only_wait);
return ret;
}
-void bch_fs_read_only_sync(struct cache_set *c)
-{
- /* so we don't race with bch_fs_read_write() */
- lockdep_assert_held(&bch_register_lock);
-
- bch_fs_read_only(c);
-
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_RO_COMPLETE, &c->flags) &&
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
-}
-
-static const char *__bch_fs_read_write(struct cache_set *c)
+const char *bch_fs_read_write(struct cache_set *c)
{
struct cache *ca;
- const char *err;
+ const char *err = NULL;
unsigned i;
- lockdep_assert_held(&bch_register_lock);
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STARTING &&
+ c->state != BCH_FS_RO)
+ goto out;
err = "error starting allocator thread";
for_each_cache(ca, c, i)
@@ -285,67 +267,43 @@ static const char *__bch_fs_read_write(struct cache_set *c)
if (bch_gc_thread_start(c))
goto err;
- for_each_cache(ca, c, i) {
- if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
- continue;
-
- err = "error starting moving GC thread";
- if (bch_moving_gc_thread_start(ca)) {
+ err = "error starting moving GC thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
+ bch_moving_gc_start(ca)) {
percpu_ref_put(&ca->ref);
goto err;
}
- }
err = "error starting tiering thread";
- if (bch_tiering_read_start(c))
+ if (bch_tiering_start(c))
goto err;
schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
- return NULL;
+ if (c->state != BCH_FS_STARTING)
+ percpu_ref_reinit(&c->writes);
+
+ c->state = BCH_FS_RW;
+ err = NULL;
+out:
+ mutex_unlock(&c->state_lock);
+ return err;
err:
__bch_fs_read_only(c);
- return err;
-}
-
-const char *bch_fs_read_write(struct cache_set *c)
-{
- const char *err;
-
- lockdep_assert_held(&bch_register_lock);
-
- if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags))
- return NULL;
-
- err = __bch_fs_read_write(c);
- if (err)
- return err;
-
- percpu_ref_reinit(&c->writes);
-
- clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- clear_bit(BCH_FS_EMERGENCY_RO, &c->flags);
- clear_bit(BCH_FS_RO_COMPLETE, &c->flags);
- clear_bit(BCH_FS_RO, &c->flags);
- return NULL;
+ goto out;
}
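
Taken together, the RO/RW rework replaces the old flag dance (BCH_FS_RO, BCH_FS_RO_COMPLETE, ...) with a single state field guarded by the new state_lock. Summarizing the transitions as they appear in this patch (the enum itself lives in the headers; the ordering here is illustrative):

enum bch_fs_state {
    BCH_FS_STARTING,    /* from bch_fs_alloc() until __bch_fs_start() finishes */
    BCH_FS_RO,          /* set at the end of bch_fs_read_only() */
    BCH_FS_RW,          /* set at the end of bch_fs_read_write() */
    BCH_FS_STOPPING,    /* set by bch_fs_stop() / bch_fs_stop_async() */
};

/*
 * bch_fs_read_only():  STARTING or RW -> RO   (no-op otherwise)
 * bch_fs_read_write(): STARTING or RO -> RW   (no-op otherwise)
 * bch_fs_stop*():      any state      -> STOPPING
 * All transitions happen under c->state_lock.
 */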
-/* Cache set startup/shutdown: */
+/* Filesystem startup/shutdown: */
static void bch_fs_free(struct cache_set *c)
{
- del_timer_sync(&c->foreground_write_wakeup);
- cancel_delayed_work_sync(&c->pd_controllers_update);
- cancel_work_sync(&c->read_only_work);
- cancel_work_sync(&c->bio_submit_work);
- cancel_work_sync(&c->read_retry_work);
-
- bch_fs_encryption_free(c);
- bch_btree_cache_free(c);
- bch_journal_free(&c->journal);
+ bch_fs_encryption_exit(c);
+ bch_fs_btree_exit(c);
+ bch_fs_journal_exit(&c->journal);
bch_io_clock_exit(&c->io_clock[WRITE]);
bch_io_clock_exit(&c->io_clock[READ]);
- bch_compress_free(c);
+ bch_fs_compress_exit(c);
bch_fs_blockdev_exit(c);
bdi_destroy(&c->bdi);
lg_lock_free(&c->bucket_stats_lock);
@@ -372,6 +330,52 @@ static void bch_fs_free(struct cache_set *c)
module_put(THIS_MODULE);
}
+static void bch_fs_exit(struct cache_set *c)
+{
+ unsigned i;
+
+ del_timer_sync(&c->foreground_write_wakeup);
+ cancel_delayed_work_sync(&c->pd_controllers_update);
+ cancel_work_sync(&c->read_only_work);
+ cancel_work_sync(&c->bio_submit_work);
+ cancel_work_sync(&c->read_retry_work);
+
+ for (i = 0; i < c->sb.nr_devices; i++)
+ if (c->cache[i])
+ bch_dev_free(c->cache[i]);
+
+ closure_debug_destroy(&c->cl);
+ kobject_put(&c->kobj);
+}
+
+static void bch_fs_offline(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&bch_register_lock);
+ list_del(&c->list);
+ mutex_unlock(&bch_register_lock);
+
+ if (c->kobj.state_in_sysfs)
+ kobject_del(&c->kobj);
+
+ for_each_cache(ca, c, i)
+ if (ca->kobj.state_in_sysfs)
+ kobject_del(&ca->kobj);
+
+ bch_fs_debug_exit(c);
+ bch_fs_chardev_exit(c);
+
+ bch_cache_accounting_destroy(&c->accounting);
+
+ kobject_put(&c->time_stats);
+ kobject_put(&c->opts_dir);
+ kobject_put(&c->internal);
+
+ __bch_fs_read_only(c);
+}
+
/*
* should be __bch_fs_stop4 - block devices are closed, now we can finally
* free it
@@ -379,15 +383,9 @@ static void bch_fs_free(struct cache_set *c)
void bch_fs_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
- struct completion *stop_completion = c->stop_completion;
bch_notify_fs_stopped(c);
- bch_info(c, "stopped");
-
bch_fs_free(c);
-
- if (stop_completion)
- complete(stop_completion);
}
/*
@@ -396,18 +394,8 @@ void bch_fs_release(struct kobject *kobj)
static void __bch_fs_stop3(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, cl);
- struct cache *ca;
- unsigned i;
- mutex_lock(&bch_register_lock);
- for_each_cache(ca, c, i)
- bch_dev_stop(ca);
-
- list_del(&c->list);
- mutex_unlock(&bch_register_lock);
-
- closure_debug_destroy(&c->cl);
- kobject_put(&c->kobj);
+ bch_fs_exit(c);
}
/*
@@ -418,28 +406,14 @@ static void __bch_fs_stop2(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
- bch_debug_exit_cache_set(c);
- bch_fs_chardev_exit(c);
-
- if (c->kobj.state_in_sysfs)
- kobject_del(&c->kobj);
-
- bch_cache_accounting_destroy(&c->accounting);
-
- kobject_put(&c->time_stats);
- kobject_put(&c->opts_dir);
- kobject_put(&c->internal);
-
- mutex_lock(&bch_register_lock);
- bch_fs_read_only_sync(c);
- mutex_unlock(&bch_register_lock);
+ bch_fs_offline(c);
closure_return(cl);
}
/*
- * First phase of the shutdown process that's kicked off by bch_fs_stop(); we
- * haven't waited for anything to stop yet, we're just punting to process
+ * First phase of the shutdown process that's kicked off by bch_fs_stop_async();
+ * we haven't waited for anything to stop yet, we're just punting to process
* context to shut down block devices:
*/
static void __bch_fs_stop1(struct closure *cl)
@@ -451,29 +425,42 @@ static void __bch_fs_stop1(struct closure *cl)
continue_at(cl, __bch_fs_stop2, system_wq);
}
-void bch_fs_stop(struct cache_set *c)
+void bch_fs_stop_async(struct cache_set *c)
{
- if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags))
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STOPPING) {
+ c->state = BCH_FS_STOPPING;
closure_queue(&c->caching);
+ }
+ mutex_unlock(&c->state_lock);
}
-void bch_fs_stop_sync(struct cache_set *c)
+void bch_fs_stop(struct cache_set *c)
{
- DECLARE_COMPLETION_ONSTACK(complete);
+ mutex_lock(&c->state_lock);
+ BUG_ON(c->state == BCH_FS_STOPPING);
+ c->state = BCH_FS_STOPPING;
+ mutex_unlock(&c->state_lock);
+
+ bch_blockdevs_stop(c);
+
+ closure_sync(&c->caching);
+ closure_debug_destroy(&c->caching);
+
+ bch_fs_offline(c);
- c->stop_completion = &complete;
- bch_fs_stop(c);
closure_put(&c->cl);
+ closure_sync(&c->cl);
- /* Killable? */
- wait_for_completion(&complete);
+ bch_fs_exit(c);
+ kobject_put(&c->kobj);
}
/* Stop, detaching from backing devices: */
void bch_fs_detach(struct cache_set *c)
{
if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
- bch_fs_stop(c);
+ bch_fs_stop_async(c);
}
static unsigned bch_fs_nr_devices(struct cache_set *c)
@@ -520,6 +507,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->minor = -1;
+ mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
mutex_init(&c->btree_cache_lock);
@@ -534,8 +522,8 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BCH_TIME_STATS()
#undef BCH_TIME_STAT
- bch_open_buckets_init(c);
- bch_tiering_init_cache_set(c);
+ bch_fs_allocator_init(c);
+ bch_fs_tiering_init(c);
INIT_LIST_HEAD(&c->list);
INIT_LIST_HEAD(&c->cached_devs);
@@ -636,10 +624,10 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch_fs_blockdev_init(c) ||
bch_io_clock_init(&c->io_clock[READ]) ||
bch_io_clock_init(&c->io_clock[WRITE]) ||
- bch_journal_alloc(&c->journal, journal_entry_bytes) ||
- bch_btree_cache_alloc(c) ||
+ bch_fs_journal_init(&c->journal, journal_entry_bytes) ||
+ bch_fs_btree_init(c) ||
bch_fs_encryption_init(c) ||
- bch_compress_init(c) ||
+ bch_fs_compress_init(c) ||
bch_check_set_has_compressed_data(c, c->opts.compression))
goto err;
@@ -664,6 +652,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
closure_init(&c->caching, &c->cl);
set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
+ closure_get(&c->cl);
continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
return c;
err:
@@ -671,7 +660,20 @@ err:
return NULL;
}
-static int bch_fs_online(struct cache_set *c)
+static struct cache_set *bch_fs_lookup(uuid_le uuid)
+{
+ struct cache_set *c;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
+ return c;
+
+ return NULL;
+}
+
+static const char *__bch_fs_online(struct cache_set *c)
{
struct cache *ca;
unsigned i;
@@ -680,31 +682,58 @@ static int bch_fs_online(struct cache_set *c)
lockdep_assert_held(&bch_register_lock);
if (!list_empty(&c->list))
- return 0;
+ return NULL;
- list_add(&c->list, &bch_fs_list);
+ if (bch_fs_lookup(c->sb.uuid))
+ return "filesystem UUID already open";
ret = bch_fs_chardev_init(c);
if (ret)
- return ret;
+ return "error creating character device";
+
+ bch_fs_debug_init(c);
if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
kobject_add(&c->internal, &c->kobj, "internal") ||
kobject_add(&c->opts_dir, &c->kobj, "options") ||
kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
- return -1;
+ return "error creating sysfs objects";
for_each_cache(ca, c, i)
if (bch_dev_online(ca)) {
percpu_ref_put(&ca->ref);
- return -1;
+ return "error creating sysfs objects";
}
+ mutex_lock(&c->state_lock);
+
+ if (bch_blockdev_volumes_start(c)) {
+ mutex_unlock(&c->state_lock);
+ return "can't bring up blockdev volumes";
+ }
+
+ bch_attach_backing_devs(c);
+
+ mutex_unlock(&c->state_lock);
+
+ list_add(&c->list, &bch_fs_list);
+
return 0;
}
-static const char *bch_fs_start(struct cache_set *c)
+static const char *bch_fs_online(struct cache_set *c)
+{
+ const char *err;
+
+ mutex_lock(&bch_register_lock);
+ err = __bch_fs_online(c);
+ mutex_unlock(&bch_register_lock);
+
+ return err;
+}
+
+static const char *__bch_fs_start(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
@@ -715,11 +744,7 @@ static const char *bch_fs_start(struct cache_set *c)
struct jset *j;
int ret = -EINVAL;
- lockdep_assert_held(&bch_register_lock);
- BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags));
-
- /* We don't want bch_fatal_error() to free underneath us */
- closure_get(&c->caching);
+ BUG_ON(c->state != BCH_FS_STARTING);
/*
* Make sure that each cache object's mi is up to date before
@@ -826,6 +851,16 @@ static const char *bch_fs_start(struct cache_set *c)
bch_notice(c, "initializing new filesystem");
+ bch_initial_gc(c, NULL);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
+ bch_dev_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
err = "unable to allocate journal buckets";
for_each_cache(ca, c, i)
if (bch_dev_journal_alloc(ca)) {
@@ -833,8 +868,6 @@ static const char *bch_fs_start(struct cache_set *c)
goto err;
}
- bch_initial_gc(c, NULL);
-
/*
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
@@ -842,14 +875,6 @@ static const char *bch_fs_start(struct cache_set *c)
bch_journal_start(c);
bch_journal_set_replay_done(&c->journal);
- err = "error starting allocator thread";
- for_each_cache(ca, c, i)
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
- bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
-
err = "cannot allocate new btree root";
for (id = 0; id < BTREE_ID_NR; id++)
if (bch_btree_root_alloc(c, id, &cl)) {
@@ -877,10 +902,14 @@ static const char *bch_fs_start(struct cache_set *c)
goto err;
}
recovery_done:
+ err = "dynamic fault";
+ if (bch_fs_init_fault("fs_start"))
+ goto err;
+
if (c->opts.read_only) {
- bch_fs_read_only_sync(c);
+ bch_fs_read_only(c);
} else {
- err = __bch_fs_read_write(c);
+ err = bch_fs_read_write(c);
if (err)
goto err;
}
@@ -901,27 +930,9 @@ recovery_done:
bch_write_super(c);
mutex_unlock(&c->sb_lock);
- err = "dynamic fault";
- if (bch_fs_init_fault("fs_start"))
- goto err;
-
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err;
-
- err = "can't bring up blockdev volumes";
- if (bch_blockdev_volumes_start(c))
- goto err;
-
- bch_debug_init_cache_set(c);
- set_bit(BCH_FS_RUNNING, &c->flags);
- bch_attach_backing_devs(c);
-
- bch_notify_fs_read_write(c);
err = NULL;
out:
bch_journal_entries_free(&journal);
- closure_put(&c->caching);
return err;
err:
switch (ret) {
@@ -955,6 +966,11 @@ err:
goto out;
}
+const char *bch_fs_start(struct cache_set *c)
+{
+ return __bch_fs_start(c) ?: bch_fs_online(c);
+}
+
static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c)
{
struct bch_sb_field_members *sb_mi;
@@ -999,7 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
return NULL;
}
-/* Cache device */
+/* Device startup/shutdown, ro/rw: */
bool bch_dev_read_only(struct cache *ca)
{
@@ -1009,14 +1025,14 @@ bool bch_dev_read_only(struct cache *ca)
bdevname(ca->disk_sb.bdev, buf);
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
return false;
if (!bch_dev_may_remove(ca)) {
bch_err(c, "required member %s going RO, forcing fs RO", buf);
- bch_fs_read_only_sync(c);
+ bch_fs_read_only(c);
}
trace_bcache_cache_read_only(ca);
@@ -1053,7 +1069,7 @@ bool bch_dev_read_only(struct cache *ca)
static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
{
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
return NULL;
@@ -1066,12 +1082,11 @@ static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
if (bch_dev_allocator_start(ca))
return "error starting allocator thread";
- if (bch_moving_gc_thread_start(ca))
+ if (bch_moving_gc_start(ca))
return "error starting moving GC thread";
- bch_dev_group_add(&c->journal.devs, ca);
-
- wake_up_process(c->tiering_read);
+ if (bch_tiering_start(c))
+ return "error starting tiering thread";
bch_notify_dev_read_write(ca);
trace_bcache_cache_read_write_done(ca);
@@ -1099,22 +1114,15 @@ const char *bch_dev_read_write(struct cache *ca)
return NULL;
}
-/*
- * bch_dev_stop has already returned, so we no longer hold the register
- * lock at the point this is called.
- */
-
void bch_dev_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
- percpu_ref_exit(&ca->ref);
kfree(ca);
}
-static void bch_dev_free_work(struct work_struct *work)
+static void bch_dev_free(struct cache *ca)
{
- struct cache *ca = container_of(work, struct cache, free_work);
struct cache_set *c = ca->set;
unsigned i;
@@ -1131,15 +1139,7 @@ static void bch_dev_free_work(struct work_struct *work)
kobject_del(&ca->kobj);
bch_free_super(&ca->disk_sb);
-
- /*
- * bch_dev_stop can be called in the middle of initialization
- * of the struct cache object.
- * As such, not all the sub-structures may be initialized.
- * However, they were zeroed when the object was allocated.
- */
-
- bch_journal_free_cache(ca);
+ bch_dev_journal_exit(ca);
free_percpu(ca->sectors_written);
bioset_exit(&ca->replica_set);
free_percpu(ca->bucket_stats_percpu);
@@ -1155,12 +1155,20 @@ static void bch_dev_free_work(struct work_struct *work)
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
+ percpu_ref_exit(&ca->ref);
kobject_put(&ca->kobj);
if (c)
kobject_put(&c->kobj);
}
+static void bch_dev_free_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, free_work);
+
+ bch_dev_free(ca);
+}
+
static void bch_dev_percpu_ref_release(struct percpu_ref *ref)
{
struct cache *ca = container_of(ref, struct cache, ref);
@@ -1193,12 +1201,10 @@ static void bch_dev_stop(struct cache *ca)
{
struct cache_set *c = ca->set;
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
- if (c) {
- BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
- rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
- }
+ BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
+ rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
call_rcu(&ca->free_rcu, bch_dev_free_rcu);
}
@@ -1281,7 +1287,8 @@ static void bch_dev_remove_work(struct work_struct *work)
*/
closure_get(&c->cl);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
+
bch_dev_stop(ca);
/*
@@ -1290,8 +1297,6 @@ static void bch_dev_remove_work(struct work_struct *work)
*/
synchronize_rcu();
- lockdep_assert_held(&bch_register_lock);
-
/*
* Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
@@ -1301,23 +1306,20 @@ static void bch_dev_remove_work(struct work_struct *work)
memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
bch_write_super(c);
- mutex_unlock(&c->sb_lock);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->sb_lock);
+ mutex_unlock(&c->state_lock);
closure_put(&c->cl);
}
-bool bch_dev_remove(struct cache *ca, bool force)
+static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force)
{
- mutex_lock(&bch_register_lock);
-
if (test_bit(BCH_DEV_REMOVING, &ca->flags))
return false;
if (!bch_dev_may_remove(ca)) {
- bch_err(ca->set, "Can't remove last device in tier %u",
- ca->mi.tier);
+ bch_err(ca->set, "Can't remove last RW device");
bch_notify_dev_remove_failed(ca);
return false;
}
@@ -1327,23 +1329,32 @@ bool bch_dev_remove(struct cache *ca, bool force)
if (force)
set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
+
set_bit(BCH_DEV_REMOVING, &ca->flags);
bch_notify_dev_removing(ca);
- mutex_unlock(&bch_register_lock);
-
/* Migrate the data and finish removal asynchronously: */
queue_work(system_long_wq, &ca->remove_work);
return true;
}
+bool bch_dev_remove(struct cache *ca, bool force)
+{
+ struct cache_set *c = ca->set;
+ bool ret;
+
+ mutex_lock(&c->state_lock);
+ ret = __bch_dev_remove(c, ca, force);
+ mutex_unlock(&c->state_lock);
+
+ return ret;
+}
+
static int bch_dev_online(struct cache *ca)
{
char buf[12];
- lockdep_assert_held(&bch_register_lock);
-
sprintf(buf, "cache%u", ca->dev_idx);
if (kobject_add(&ca->kobj,
@@ -1386,7 +1397,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
kobject_init(&ca->kobj, &bch_dev_ktype);
spin_lock_init(&ca->self.lock);
- ca->self.nr_devices = 1;
+ ca->self.nr = 1;
rcu_assign_pointer(ca->self.d[0].dev, ca);
ca->dev_idx = sb->sb->dev_idx;
@@ -1395,10 +1406,11 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
spin_lock_init(&ca->freelist_lock);
spin_lock_init(&ca->prio_buckets_lock);
mutex_init(&ca->heap_lock);
- bch_moving_init_cache(ca);
+ bch_dev_moving_gc_init(ca);
ca->disk_sb = *sb;
- ca->disk_sb.bdev->bd_holder = ca;
+ if (sb->mode & FMODE_EXCL)
+ ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
@@ -1444,7 +1456,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
!(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
- bch_journal_init_cache(ca))
+ bch_dev_journal_init(ca))
goto err;
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
@@ -1482,7 +1494,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
err = "error creating kobject";
if (c->kobj.state_in_sysfs &&
bch_dev_online(ca))
- goto err;
+ pr_warn("error creating sysfs objects");
if (ret)
*ret = ca;
@@ -1490,49 +1502,34 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
kobject_put(&ca->kobj);
return NULL;
err:
- bch_dev_stop(ca);
+ bch_dev_free(ca);
return err;
}
-static struct cache_set *bch_fs_lookup(uuid_le uuid)
-{
- struct cache_set *c;
-
- lockdep_assert_held(&bch_register_lock);
-
- list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
- return c;
-
- return NULL;
-}
-
int bch_dev_add(struct cache_set *c, const char *path)
{
struct bcache_superblock sb;
const char *err;
struct cache *ca;
- struct bch_sb_field *f;
struct bch_sb_field_members *mi, *dev_mi;
struct bch_member saved_mi;
unsigned dev_idx, nr_devices, u64s;
int ret = -EINVAL;
- mutex_lock(&bch_register_lock);
-
err = bch_read_super(&sb, c->opts, path);
if (err)
- goto err_unlock_register;
+ return -EINVAL;
err = bch_validate_cache_super(&sb);
if (err)
- goto err_unlock_register;
-
- mutex_lock(&c->sb_lock);
+ return -EINVAL;
err = bch_dev_may_add(sb.sb, c);
if (err)
- goto err_unlock;
+ return -EINVAL;
+
+ mutex_lock(&c->state_lock);
+ mutex_lock(&c->sb_lock);
/*
* Preserve the old cache member information (esp. tier)
@@ -1571,17 +1568,14 @@ have_slot:
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
err = "no space in superblock for member info";
- f = bch_fs_sb_field_resize(c, &mi->field, u64s);
- if (!f)
+ mi = bch_fs_sb_resize_members(c, u64s);
+ if (!mi)
goto err_unlock;
- mi = container_of(f, struct bch_sb_field_members, field);
-
- f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s);
- if (!f)
+ dev_mi = bch_sb_resize_members(&sb, u64s);
+ if (!dev_mi)
goto err_unlock;
- dev_mi = container_of(f, struct bch_sb_field_members, field);
memcpy(dev_mi, mi, u64s * sizeof(u64));
dev_mi->members[dev_idx] = saved_mi;
@@ -1619,14 +1613,13 @@ have_slot:
kobject_put(&ca->kobj);
mutex_unlock(&c->sb_lock);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
return 0;
err_put:
bch_dev_stop(ca);
err_unlock:
mutex_unlock(&c->sb_lock);
-err_unlock_register:
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
bch_free_super(&sb);
bch_err(c, "Unable to add device: %s", err);
@@ -1639,11 +1632,8 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
const char *err;
struct cache_set *c = NULL;
struct bcache_superblock *sb;
- uuid_le uuid;
unsigned i;
- memset(&uuid, 0, sizeof(uuid_le));
-
if (!nr_devices)
return "need at least one device";
@@ -1655,60 +1645,49 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
if (!sb)
goto err;
- /*
- * bch_read_super() needs to happen under register_lock, so that the
- * exclusive open is atomic with adding the new cache set to the list of
- * cache sets:
- */
- mutex_lock(&bch_register_lock);
-
for (i = 0; i < nr_devices; i++) {
err = bch_read_super(&sb[i], opts, devices[i]);
if (err)
- goto err_unlock;
+ goto err;
err = "attempting to register backing device";
if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
- goto err_unlock;
+ goto err;
err = bch_validate_cache_super(&sb[i]);
if (err)
- goto err_unlock;
+ goto err;
}
- err = "cache set already registered";
- if (bch_fs_lookup(sb->sb->uuid))
- goto err_unlock;
-
err = "cannot allocate memory";
c = bch_fs_alloc(sb[0].sb, opts);
if (!c)
- goto err_unlock;
+ goto err;
for (i = 0; i < nr_devices; i++) {
err = bch_dev_alloc(&sb[i], c, NULL);
if (err)
- goto err_unlock;
+ goto err;
}
err = "insufficient devices";
if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c))
- goto err_unlock;
+ goto err;
- err = bch_fs_start(c);
- if (err)
- goto err_unlock;
+ if (!c->opts.nostart) {
+ err = __bch_fs_start(c);
+ if (err)
+ goto err;
+ }
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err_unlock;
+ err = bch_fs_online(c);
+ if (err)
+ goto err;
- if (ret) {
- closure_get(&c->cl);
+ if (ret)
*ret = c;
- }
-
- mutex_unlock(&bch_register_lock);
+ else
+ closure_put(&c->cl);
err = NULL;
out:
@@ -1717,20 +1696,18 @@ out:
if (err)
c = NULL;
return err;
-err_unlock:
+err:
if (c)
bch_fs_stop(c);
- mutex_unlock(&bch_register_lock);
-err:
+
for (i = 0; i < nr_devices; i++)
bch_free_super(&sb[i]);
goto out;
}
static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
- struct bch_opts opts)
+ struct bch_opts opts)
{
- char name[BDEVNAME_SIZE];
const char *err;
struct cache_set *c;
bool allocated_cache_set = false;
@@ -1739,17 +1716,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (err)
return err;
- bdevname(sb->bdev, name);
-
+ mutex_lock(&bch_register_lock);
c = bch_fs_lookup(sb->sb->uuid);
if (c) {
+ closure_get(&c->cl);
+
err = bch_dev_in_fs(sb->sb, c);
if (err)
- return err;
+ goto err;
} else {
c = bch_fs_alloc(sb->sb, opts);
+ err = "cannot allocate memory";
if (!c)
- return "cannot allocate memory";
+ goto err;
allocated_cache_set = true;
}
@@ -1758,21 +1737,29 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (err)
goto err;
- if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) {
- err = bch_fs_start(c);
+ if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) &&
+ !c->opts.nostart) {
+ err = __bch_fs_start(c);
if (err)
goto err;
- } else {
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err;
}
- bch_info(c, "started");
+ err = __bch_fs_online(c);
+ if (err)
+ goto err;
+
+ closure_put(&c->cl);
+ mutex_unlock(&bch_register_lock);
+
return NULL;
err:
+ mutex_unlock(&bch_register_lock);
+
if (allocated_cache_set)
bch_fs_stop(c);
+ else if (c)
+ closure_put(&c->cl);
+
return err;
}
@@ -1782,20 +1769,20 @@ const char *bch_fs_open_incremental(const char *path)
struct bch_opts opts = bch_opts_empty();
const char *err;
- mutex_lock(&bch_register_lock);
-
err = bch_read_super(&sb, opts, path);
if (err)
- goto err;
+ return err;
- if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
+ if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) {
+ mutex_lock(&bch_register_lock);
err = bch_backing_dev_register(&sb);
- else
+ mutex_unlock(&bch_register_lock);
+ } else {
err = __bch_fs_open_incremental(&sb, opts);
+ }
bch_free_super(&sb);
-err:
- mutex_unlock(&bch_register_lock);
+
return err;
}
@@ -1854,10 +1841,10 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
pr_info("Setting all devices read only:");
list_for_each_entry(c, &bch_fs_list, list)
- bch_fs_read_only(c);
+ bch_fs_read_only_async(c);
list_for_each_entry(c, &bch_fs_list, list)
- bch_fs_read_only_sync(c);
+ bch_fs_read_only(c);
mutex_unlock(&bch_register_lock);
}
@@ -1882,7 +1869,7 @@ kobj_attribute_write(reboot, reboot_test);
static void bcache_exit(void)
{
bch_debug_exit();
- bch_fs_exit();
+ bch_vfs_exit();
bch_blockdev_exit();
bch_chardev_exit();
if (bcache_kset)
@@ -1917,7 +1904,7 @@ static int __init bcache_init(void)
sysfs_create_files(&bcache_kset->kobj, files) ||
bch_chardev_init() ||
bch_blockdev_init() ||
- bch_fs_init() ||
+ bch_vfs_init() ||
bch_debug_init())
goto err;
diff --git a/libbcache/super.h b/libbcache/super.h
index bcf7d983..bafd88e0 100644
--- a/libbcache/super.h
+++ b/libbcache/super.h
@@ -57,27 +57,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
static inline bool bch_dev_may_remove(struct cache *ca)
{
struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
-
- /*
- * Right now, we can't remove the last device from a tier,
- * - For tier 0, because all metadata lives in tier 0 and because
- * there is no way to have foreground writes go directly to tier 1.
- * - For tier 1, because the code doesn't completely support an
- * empty tier 1.
- */
-
- /*
- * Turning a device read-only removes it from the cache group,
- * so there may only be one read-write device in a tier, and yet
- * the device we are removing is in the same tier, so we have
- * to check for identity.
- * Removing the last RW device from a tier requires turning the
- * whole cache set RO.
- */
-
- return tier->nr_devices != 1 ||
- rcu_access_pointer(tier->d[0].dev) != ca;
+ struct cache_group *grp = &c->cache_all;
+
+ /* Can't remove the last RW device: */
+ return grp->nr != 1 ||
+ rcu_access_pointer(grp->d[0].dev) != ca;
}
void bch_dev_release(struct kobject *);
@@ -89,15 +73,15 @@ int bch_dev_add(struct cache_set *, const char *);
void bch_fs_detach(struct cache_set *);
-bool bch_fs_read_only(struct cache_set *);
bool bch_fs_emergency_read_only(struct cache_set *);
-void bch_fs_read_only_sync(struct cache_set *);
+void bch_fs_read_only(struct cache_set *);
const char *bch_fs_read_write(struct cache_set *);
void bch_fs_release(struct kobject *);
+void bch_fs_stop_async(struct cache_set *);
void bch_fs_stop(struct cache_set *);
-void bch_fs_stop_sync(struct cache_set *);
+const char *bch_fs_start(struct cache_set *);
const char *bch_fs_open(char * const *, unsigned, struct bch_opts,
struct cache_set **);
const char *bch_fs_open_incremental(const char *path);
diff --git a/libbcache/super_types.h b/libbcache/super_types.h
index 41eaf0dd..69c747de 100644
--- a/libbcache/super_types.h
+++ b/libbcache/super_types.h
@@ -6,6 +6,7 @@ struct bcache_superblock {
struct block_device *bdev;
struct bio *bio;
unsigned page_order;
+ fmode_t mode;
};
#endif /* _BCACHE_SUPER_TYPES_H */
diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c
index 9f45a6b0..48f9f1f6 100644
--- a/libbcache/sysfs.c
+++ b/libbcache/sysfs.c
@@ -22,6 +22,7 @@
#include "opts.h"
#include "request.h"
#include "super-io.h"
+#include "tier.h"
#include "writeback.h"
#include <linux/blkdev.h>
@@ -121,6 +122,8 @@ rw_attribute(cache_replacement_policy);
rw_attribute(foreground_write_ratelimit_enabled);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
+
+rw_attribute(tier);
rw_attribute(tiering_enabled);
rw_attribute(tiering_percent);
sysfs_pd_controller_attribute(tiering);
@@ -134,7 +137,6 @@ rw_attribute(foreground_target_percent);
rw_attribute(size);
read_attribute(meta_replicas_have);
read_attribute(data_replicas_have);
-read_attribute(tier);
#define BCH_DEBUG_PARAM(name, description) \
rw_attribute(name);
@@ -680,7 +682,8 @@ SHOW(bch_fs)
sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
sysfs_print(tiering_percent, c->tiering_percent);
- sysfs_pd_controller_show(tiering, &c->tiering_pd);
+
+ sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have);
sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have);
@@ -694,7 +697,7 @@ SHOW(bch_fs)
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
+ if (!bch_fs_running(c))
return -EPERM;
if (attr == &sysfs_bset_tree_stats)
@@ -723,7 +726,7 @@ STORE(__bch_fs)
}
if (attr == &sysfs_stop) {
- bch_fs_stop(c);
+ bch_fs_stop_async(c);
return size;
}
@@ -773,25 +776,18 @@ STORE(__bch_fs)
ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
?: (ssize_t) size;
- if (c->tiering_read)
- wake_up_process(c->tiering_read);
+ bch_tiering_start(c); /* issue wakeups */
return ret;
}
sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
- if (attr == &sysfs_journal_flush) {
- bch_journal_meta_async(&c->journal, NULL);
-
- return size;
- }
-
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
sysfs_strtoul(tiering_percent, c->tiering_percent);
- sysfs_pd_controller_store(tiering, &c->tiering_pd);
+ sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */
/* Debugging: */
@@ -799,11 +795,14 @@ STORE(__bch_fs)
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
+ if (!bch_fs_running(c))
return -EPERM;
- if (test_bit(BCH_FS_STOPPING, &c->flags))
- return -EINTR;
+ if (attr == &sysfs_journal_flush) {
+ bch_journal_meta_async(&c->journal, NULL);
+
+ return size;
+ }
if (attr == &sysfs_blockdev_volume_create) {
u64 v = strtoi_h_or_return(buf);
@@ -836,9 +835,9 @@ STORE(bch_fs)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
size = __bch_fs_store(kobj, attr, buf, size);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
if (attr == &sysfs_add_device) {
char *path = kstrdup(buf, GFP_KERNEL);
@@ -1273,6 +1272,31 @@ STORE(__bch_dev)
mutex_unlock(&c->sb_lock);
}
+ if (attr == &sysfs_tier) {
+ unsigned prev_tier;
+ unsigned v = strtoul_restrict_or_return(buf,
+ 0, BCH_TIER_MAX - 1);
+
+ mutex_lock(&c->sb_lock);
+ prev_tier = ca->mi.tier;
+
+ if (v == ca->mi.tier) {
+ mutex_unlock(&c->sb_lock);
+ return size;
+ }
+
+ mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+ SET_BCH_MEMBER_TIER(mi, v);
+ bch_write_super(c);
+
+ bch_dev_group_remove(&c->tiers[prev_tier].devs, ca);
+ bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
+ mutex_unlock(&c->sb_lock);
+
+ bch_recalc_capacity(c);
+ bch_tiering_start(c);
+ }
+
if (attr == &sysfs_state_rw) {
char name[BDEVNAME_SIZE];
const char *err = NULL;
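[Note] The sysfs.c hunks above turn "tier" into a writable per-device attribute: the store handler parses and range-checks the value, returns early if unchanged, updates the member's tier in the superblock under sb_lock, moves the device between tier groups, then recalculates capacity and kicks tiering. A rough user-space model of that control flow, with invented demo_* names and no locking:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_TIER_MAX 2

struct demo_dev {
	unsigned tier;
};

static int demo_store_tier(struct demo_dev *ca, const char *buf)
{
	char *end;
	unsigned long v = strtoul(buf, &end, 10);

	if (end == buf || v >= DEMO_TIER_MAX)
		return -EINVAL;                /* like strtoul_restrict_or_return */

	if (v == ca->tier)
		return 0;                      /* nothing to do */

	/* under c->sb_lock in the real code: */
	ca->tier = v;                          /* SET_BCH_MEMBER_TIER + write super */

	/* then: move between tier groups, recalc capacity, kick tiering */
	printf("device moved to tier %lu\n", v);
	return 0;
}

int main(void)
{
	struct demo_dev ca = { .tier = 0 };

	demo_store_tier(&ca, "1");
	return demo_store_tier(&ca, "9") == -EINVAL ? 0 : 1;
}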
diff --git a/libbcache/tier.c b/libbcache/tier.c
index 46864594..0ab17708 100644
--- a/libbcache/tier.c
+++ b/libbcache/tier.c
@@ -16,8 +16,7 @@
#include <trace/events/bcache.h>
struct tiering_state {
- struct cache_group *tier;
- unsigned tier_idx;
+ struct bch_tier *tier;
unsigned sectors;
unsigned stripe_size;
unsigned dev_idx;
@@ -42,7 +41,7 @@ static bool tiering_pred(struct cache_set *c,
mi = cache_member_info_get(c);
extent_for_each_ptr(e, ptr)
if (ptr->dev < mi->nr_devices &&
- mi->m[ptr->dev].tier >= s->tier_idx)
+ mi->m[ptr->dev].tier >= s->tier->idx)
replicas++;
cache_member_info_put();
@@ -69,15 +68,15 @@ static void tier_next_device(struct cache_set *c, struct tiering_state *s)
s->sectors = 0;
s->dev_idx++;
- spin_lock(&s->tier->lock);
- if (s->dev_idx >= s->tier->nr_devices)
+ spin_lock(&s->tier->devs.lock);
+ if (s->dev_idx >= s->tier->devs.nr)
s->dev_idx = 0;
- if (s->tier->nr_devices) {
- s->ca = s->tier->d[s->dev_idx].dev;
+ if (s->tier->devs.nr) {
+ s->ca = s->tier->devs.d[s->dev_idx].dev;
percpu_ref_get(&s->ca->ref);
}
- spin_unlock(&s->tier->lock);
+ spin_unlock(&s->tier->devs.lock);
}
}
@@ -103,13 +102,13 @@ static int issue_tiering_move(struct cache_set *c,
* tiering_next_cache - issue a move to write an extent to the next cache
* device in round robin order
*/
-static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
+static s64 read_tiering(struct cache_set *c, struct bch_tier *tier)
{
struct moving_context ctxt;
struct tiering_state s;
struct btree_iter iter;
struct bkey_s_c k;
- unsigned nr_devices = READ_ONCE(tier->nr_devices);
+ unsigned nr_devices = READ_ONCE(tier->devs.nr);
int ret;
if (!nr_devices)
@@ -119,10 +118,9 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
memset(&s, 0, sizeof(s));
s.tier = tier;
- s.tier_idx = tier - c->cache_tiers;
s.stripe_size = 2048; /* 1 mb for now */
- bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate,
+ bch_move_ctxt_init(&ctxt, &tier->pd.rate,
nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
@@ -164,8 +162,8 @@ next:
static int bch_tiering_thread(void *arg)
{
- struct cache_set *c = arg;
- struct cache_group *tier = &c->cache_tiers[1];
+ struct bch_tier *tier = arg;
+ struct cache_set *c = container_of(tier, struct cache_set, tiers[tier->idx]);
struct io_clock *clock = &c->io_clock[WRITE];
struct cache *ca;
u64 tier_capacity, available_sectors;
@@ -176,20 +174,20 @@ static int bch_tiering_thread(void *arg)
while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->tiering_enabled &&
- tier->nr_devices))
+ tier->devs.nr))
break;
while (1) {
- struct cache_group *faster_tier;
+ struct bch_tier *faster_tier;
last = atomic_long_read(&clock->now);
tier_capacity = available_sectors = 0;
rcu_read_lock();
- for (faster_tier = c->cache_tiers;
+ for (faster_tier = c->tiers;
faster_tier != tier;
faster_tier++) {
- group_for_each_cache_rcu(ca, faster_tier, i) {
+ group_for_each_cache_rcu(ca, &faster_tier->devs, i) {
tier_capacity +=
(ca->mi.nbuckets -
ca->mi.first_bucket) << ca->bucket_bits;
@@ -216,32 +214,73 @@ static int bch_tiering_thread(void *arg)
return 0;
}
-void bch_tiering_init_cache_set(struct cache_set *c)
+static void __bch_tiering_stop(struct bch_tier *tier)
{
- bch_pd_controller_init(&c->tiering_pd);
+ tier->pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&tier->pd.rate);
+
+ if (tier->migrate)
+ kthread_stop(tier->migrate);
+
+ tier->migrate = NULL;
+}
+
+void bch_tiering_stop(struct cache_set *c)
+{
+ struct bch_tier *tier;
+
+ for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
+ __bch_tiering_stop(tier);
+}
+
+static int __bch_tiering_start(struct bch_tier *tier)
+{
+ if (!tier->migrate) {
+ struct task_struct *p =
+ kthread_create(bch_tiering_thread, tier,
+ "bch_tier[%u]", tier->idx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ tier->migrate = p;
+ }
+
+ wake_up_process(tier->migrate);
+ return 0;
}
-int bch_tiering_read_start(struct cache_set *c)
+int bch_tiering_start(struct cache_set *c)
{
- struct task_struct *t;
+ struct bch_tier *tier;
+ bool have_faster_tier = false;
if (c->opts.nochanges)
return 0;
- t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
- if (IS_ERR(t))
- return PTR_ERR(t);
+ for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+ if (!tier->devs.nr)
+ continue;
- c->tiering_read = t;
- wake_up_process(c->tiering_read);
+ if (have_faster_tier) {
+ int ret = __bch_tiering_start(tier);
+ if (ret)
+ return ret;
+ } else {
+ __bch_tiering_stop(tier);
+ }
+
+ have_faster_tier = true;
+ }
return 0;
}
-void bch_tiering_read_stop(struct cache_set *c)
+void bch_fs_tiering_init(struct cache_set *c)
{
- if (!IS_ERR_OR_NULL(c->tiering_read)) {
- kthread_stop(c->tiering_read);
- c->tiering_read = NULL;
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+ c->tiers[i].idx = i;
+ bch_pd_controller_init(&c->tiers[i].pd);
}
}
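[Note] tier.c now runs one migration thread per tier instead of a single "bch_tier_read" thread, and bch_tiering_start() only spawns a thread for a tier once some faster tier is populated, since the fastest non-empty tier has nothing above it to pull data from. A minimal user-space model of that loop; tier contents and the start/stop actions are simulated, not the real kthreads:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_NR_TIERS 4

struct demo_tier {
	unsigned nr_devs;   /* devices in this tier */
	bool     running;   /* stands in for tier->migrate != NULL */
};

static void demo_tiering_start(struct demo_tier *tiers, unsigned nr)
{
	bool have_faster_tier = false;
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (!tiers[i].nr_devs)
			continue;               /* empty tier: neither start nor stop */

		if (have_faster_tier) {
			tiers[i].running = true;    /* __bch_tiering_start() */
			printf("tier %u: migration thread running\n", i);
		} else {
			tiers[i].running = false;   /* __bch_tiering_stop() */
			printf("tier %u: fastest populated tier, no thread\n", i);
		}

		have_faster_tier = true;
	}
}

int main(void)
{
	struct demo_tier tiers[DEMO_NR_TIERS] = {
		{ .nr_devs = 2 },   /* fast tier */
		{ .nr_devs = 0 },   /* empty     */
		{ .nr_devs = 3 },   /* slow tier */
		{ .nr_devs = 0 },
	};

	demo_tiering_start(tiers, DEMO_NR_TIERS);
	return 0;
}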
diff --git a/libbcache/tier.h b/libbcache/tier.h
index 89c2bffd..b53e83d9 100644
--- a/libbcache/tier.h
+++ b/libbcache/tier.h
@@ -1,8 +1,8 @@
#ifndef _BCACHE_TIER_H
#define _BCACHE_TIER_H
-void bch_tiering_init_cache_set(struct cache_set *);
-int bch_tiering_read_start(struct cache_set *);
-void bch_tiering_read_stop(struct cache_set *);
+void bch_tiering_stop(struct cache_set *);
+int bch_tiering_start(struct cache_set *);
+void bch_fs_tiering_init(struct cache_set *);
#endif
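[Note] The tier.h rename reflects the new lifecycle: bch_fs_tiering_init() sets up per-tier indices and pd controllers, bch_tiering_start() creates or wakes the per-tier threads, and bch_tiering_stop() resets the rates and stops them. The call sites live in super.c (not shown here), so the ordering below is an assumption, illustrated with stub functions:

#include <stdio.h>

static void demo_fs_tiering_init(void) { printf("init: per-tier idx + pd controllers\n"); }
static int  demo_tiering_start(void)   { printf("start: spawn/wake per-tier threads\n"); return 0; }
static void demo_tiering_stop(void)    { printf("stop: reset rates, stop threads\n"); }

int main(void)
{
	demo_fs_tiering_init();          /* once, when the cache set is allocated */

	if (demo_tiering_start())        /* going read-write; also reused as a "kick" */
		return 1;

	demo_tiering_stop();             /* going read-only / shutting down */
	return 0;
}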