author    | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-01 01:45:15 -0900
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-09 09:14:11 -0900
commit    | a17f7bcec7ed810a247c24e56229af8f43a9a6ae (patch)
tree      | 1b2d60b21661bd2991324e3efaa83b3cdd87a783 /libbcache
parent    | 171ee48e57be78f4e95954c99851553fa523bf91 (diff)
cmd_migrate
Diffstat (limited to 'libbcache')
37 files changed, 1264 insertions(+), 920 deletions(-)
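One recurring change in the hunks below is that `struct bucket_mark` stops carrying a single `is_metadata` bit and instead records a small `data_type` (data, btree, prios, journal, superblock), and the open-coded journal-sequence test in `bch_find_empty_buckets()` is factored into `bucket_needs_journal_commit()`. The following is a minimal, self-contained sketch of that logic in plain C; the struct layout, field widths, and the `main()` driver are simplified stand-ins for illustration, not the real packed `bucket_mark`.

```c
/*
 * Sketch of the bucket-metadata typing introduced below: a data_type enum
 * replaces the old is_metadata bit, and the "is this bucket still waiting on
 * a journal write?" test becomes bucket_needs_journal_commit(), which compares
 * truncated 16-bit sequence numbers with signed arithmetic so the comparison
 * survives wraparound. Simplified stand-in types, not the real bcache structs.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum bucket_data_type {
	BUCKET_DATA = 0,
	BUCKET_BTREE,
	BUCKET_PRIOS,
	BUCKET_JOURNAL,
	BUCKET_SB,
};

struct bucket_mark {
	unsigned	journal_seq_valid:1;
	unsigned	owned_by_allocator:1;
	unsigned	data_type:3;
	uint16_t	dirty_sectors;
	uint16_t	cached_sectors;
	uint16_t	journal_seq;	/* low bits of journal seq at last modification */
};

/* Bucket can't be reused until the journal entry referencing it is on disk: */
static bool bucket_needs_journal_commit(struct bucket_mark m, uint16_t last_seq_ondisk)
{
	return m.journal_seq_valid &&
		((int16_t) m.journal_seq - (int16_t) last_seq_ondisk > 0);
}

/* Simplified availability check: only untyped, clean, unclaimed buckets qualify. */
static bool is_available_bucket(struct bucket_mark m)
{
	return !m.owned_by_allocator &&
		m.data_type == BUCKET_DATA &&
		!m.dirty_sectors;
}

int main(void)
{
	struct bucket_mark m = {
		.journal_seq_valid = 1,
		.data_type	   = BUCKET_DATA,
		.journal_seq	   = 10,
	};

	/* Journal only flushed through seq 5: still needs a commit. */
	printf("needs commit: %d\n", bucket_needs_journal_commit(m, 5));
	/* Once seq 10 (or later) is on disk, it no longer does. */
	printf("needs commit: %d\n", bucket_needs_journal_commit(m, 12));
	printf("available:    %d\n", is_available_bucket(m));
	return 0;
}
```

With the type recorded per bucket, `bucket_stats_update()` in buckets.c (further down in the diff) can flag two different kinds of metadata landing in the same bucket as a filesystem inconsistency instead of leaving it undetected.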
diff --git a/libbcache/alloc.c b/libbcache/alloc.c index 8cb31944..93f0c2f1 100644 --- a/libbcache/alloc.c +++ b/libbcache/alloc.c @@ -73,7 +73,6 @@ #include <linux/rcupdate.h> #include <trace/events/bcache.h> -static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve); static void __bch_bucket_free(struct cache *, struct bucket *); /* Allocation groups: */ @@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca) spin_lock(&grp->lock); - for (i = 0; i < grp->nr_devices; i++) + for (i = 0; i < grp->nr; i++) if (rcu_access_pointer(grp->d[i].dev) == ca) { - grp->nr_devices--; + grp->nr--; memmove(&grp->d[i], &grp->d[i + 1], - (grp->nr_devices - i) * sizeof(grp->d[0])); + (grp->nr- i) * sizeof(grp->d[0])); break; } @@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca) unsigned i; spin_lock(&grp->lock); - for (i = 0; i < grp->nr_devices; i++) + for (i = 0; i < grp->nr; i++) if (rcu_access_pointer(grp->d[i].dev) == ca) goto out; - BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX); + BUG_ON(grp->nr>= BCH_SB_MEMBERS_MAX); - rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca); + rcu_assign_pointer(grp->d[grp->nr++].dev, ca); out: spin_unlock(&grp->lock); } @@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work) struct cache_set, pd_controllers_update); struct cache *ca; - unsigned iter; - int i; + unsigned i, iter; /* All units are in bytes */ - u64 tier_size[BCH_TIER_MAX]; - u64 tier_free[BCH_TIER_MAX]; - u64 tier_dirty[BCH_TIER_MAX]; - u64 tier0_can_free = 0; + u64 faster_tiers_size = 0; + u64 faster_tiers_dirty = 0; - memset(tier_size, 0, sizeof(tier_size)); - memset(tier_free, 0, sizeof(tier_free)); - memset(tier_dirty, 0, sizeof(tier_dirty)); + u64 fastest_tier_size = 0; + u64 fastest_tier_free = 0; + u64 copygc_can_free = 0; rcu_read_lock(); - for (i = BCH_TIER_MAX - 1; i >= 0; --i) - group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) { + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + bch_pd_controller_update(&c->tiers[i].pd, + div_u64(faster_tiers_size * + c->tiering_percent, 100), + faster_tiers_dirty, + -1); + + group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) { struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca); unsigned bucket_bits = ca->bucket_bits + 9; + u64 size = (ca->mi.nbuckets - + ca->mi.first_bucket) << bucket_bits; + u64 dirty = stats.buckets_dirty << bucket_bits; + u64 free = __buckets_free_cache(ca, stats) << bucket_bits; /* * Bytes of internal fragmentation, which can be * reclaimed by copy GC @@ -149,41 +155,30 @@ static void pd_controllers_update(struct work_struct *work) ((stats.sectors_dirty + stats.sectors_cached) << 9); - u64 dev_size = (ca->mi.nbuckets - - ca->mi.first_bucket) << bucket_bits; - - u64 free = __buckets_free_cache(ca, stats) << bucket_bits; - if (fragmented < 0) fragmented = 0; bch_pd_controller_update(&ca->moving_gc_pd, free, fragmented, -1); - if (i == 0) - tier0_can_free += fragmented; - - tier_size[i] += dev_size; - tier_free[i] += free; - tier_dirty[i] += stats.buckets_dirty << bucket_bits; - } - rcu_read_unlock(); - - if (tier_size[1]) { - u64 target = div_u64(tier_size[0] * c->tiering_percent, 100); + faster_tiers_size += size; + faster_tiers_dirty += dirty; - tier0_can_free = max_t(s64, 0, tier_dirty[0] - target); + if (!c->fastest_tier || + c->fastest_tier == &c->tiers[i]) { + fastest_tier_size += size; + fastest_tier_free += free; + } - bch_pd_controller_update(&c->tiering_pd, - target, - tier_dirty[0], - -1); + 
copygc_can_free += fragmented; + } } + rcu_read_unlock(); + /* * Throttle foreground writes if tier 0 is running out of free buckets, - * and either tiering or copygc can free up space (but don't take both - * into account). + * and either tiering or copygc can free up space. * * Target will be small if there isn't any work to do - we don't want to * throttle foreground writes if we currently have all the free space @@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work) * Otherwise, if there's work to do, try to keep 20% of tier0 available * for foreground writes. */ + if (c->fastest_tier) + copygc_can_free = U64_MAX; + bch_pd_controller_update(&c->foreground_write_pd, - min(tier0_can_free, - div_u64(tier_size[0] * + min(copygc_can_free, + div_u64(fastest_tier_size * c->foreground_target_percent, 100)), - tier_free[0], + fastest_tier_free, -1); schedule_delayed_work(&c->pd_controllers_update, @@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca) * it getting gc'd from under us */ ca->prio_buckets[i] = r; - bch_mark_metadata_bucket(ca, ca->buckets + r, false); + bch_mark_metadata_bucket(ca, ca->buckets + r, + BUCKET_PRIOS, false); spin_unlock(&ca->prio_buckets_lock); SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c)); @@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca) do { unsigned u64s = jset_u64s(0); + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) + break; + ret = bch_journal_res_get(j, &res, u64s, u64s); if (ret) return ret; @@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca) if (is_available_bucket(m) && !m.cached_sectors && !m.had_metadata && - (!m.wait_on_journal || - ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) { + !bucket_needs_journal_commit(m, last_seq_ondisk)) { spin_lock(&ca->freelist_lock); bch_mark_alloc_bucket(ca, g, true); @@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg) set_freezable(); + bch_find_empty_buckets(c, ca); + while (1) { /* * First, we pull buckets off of the free_inc list, possibly @@ -894,7 +897,7 @@ static int bch_allocator_thread(void *arg) * See if we have buckets we can reuse without invalidating them * or forcing a journal commit: */ - bch_find_empty_buckets(c, ca); + //bch_find_empty_buckets(c, ca); if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) { up_read(&c->gc_lock); @@ -967,7 +970,7 @@ out: * * Returns index of bucket on success, 0 on failure * */ -static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve) +size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve) { struct bucket *g; long r; @@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c, u64 available_buckets = 1; /* avoid a divide by zero... 
*/ unsigned i; - for (i = 0; i < devs->nr_devices; i++) { + for (i = 0; i < devs->nr; i++) { ca = devs->d[i].dev; devs->d[i].weight = buckets_free_cache(ca); available_buckets += devs->d[i].weight; } - for (i = 0; i < devs->nr_devices; i++) { + for (i = 0; i < devs->nr; i++) { const unsigned min_weight = U32_MAX >> 4; const unsigned max_weight = U32_MAX; devs->d[i].weight = min_weight + div64_u64(devs->d[i].weight * - devs->nr_devices * + devs->nr * (max_weight - min_weight), available_buckets); devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight); @@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, rcu_read_lock(); spin_lock(&devs->lock); - for (i = 0; i < devs->nr_devices; i++) + for (i = 0; i < devs->nr; i++) available += !test_bit(devs->d[i].dev->dev_idx, caches_used); @@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, } i++; - i %= devs->nr_devices; + i %= devs->nr; ret = FREELIST_EMPTY; if (i == fail_idx) @@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c, enum alloc_reserve reserve, long *caches_used) { + struct bch_tier *tier; /* * this should implement policy - for a given type of allocation, decide * which devices to allocate from: * * XXX: switch off wp->type and do something more intelligent here */ + if (wp->group) + return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, + wp->group, caches_used); - /* foreground writes: prefer tier 0: */ - if (wp->group == &c->cache_all) + /* foreground writes: prefer fastest tier: */ + tier = READ_ONCE(c->fastest_tier); + if (tier) bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - &c->cache_tiers[0], caches_used); + &tier->devs, caches_used); return bch_bucket_alloc_group(c, ob, reserve, nr_replicas, - wp->group, caches_used); + &c->cache_all, caches_used); } static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp, @@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c, ? 
0 : BTREE_NODE_RESERVE; int ret; - BUG_ON(!wp->group); BUG_ON(!reserve); BUG_ON(!nr_replicas); retry: @@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, unsigned nr_replicas, struct open_bucket *ob, unsigned sectors) { - struct bch_extent_ptr tmp, *ptr; + struct bch_extent_ptr tmp; struct cache *ca; bool has_data = false; unsigned i; @@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, if (nr_replicas < ob->nr_ptrs) has_data = true; + rcu_read_lock(); + for (i = 0; i < nr_replicas; i++) { EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); @@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, extent_ptr_append(e, tmp); ob->ptr_offset[i] += sectors; + + if ((ca = PTR_CACHE(c, &ob->ptrs[i]))) + this_cpu_add(*ca->sectors_written, sectors); } - open_bucket_for_each_online_device(c, ob, ptr, ca) - this_cpu_add(*ca->sectors_written, sectors); + rcu_read_unlock(); } /* @@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, /* Startup/shutdown (ro/rw): */ -static void bch_recalc_capacity(struct cache_set *c) +void bch_recalc_capacity(struct cache_set *c) { - struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers); + struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier; struct cache *ca; u64 total_capacity, capacity = 0, reserved_sectors = 0; unsigned long ra_pages = 0; @@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c) c->bdi.ra_pages = ra_pages; + /* Find fastest, slowest tiers with devices: */ + + for (tier = c->tiers; + tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; + if (!fastest_tier) + fastest_tier = tier; + slowest_tier = tier; + } + + c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL; + + c->promote_write_point.group = &fastest_tier->devs; + + if (!fastest_tier) + goto set_capacity; + /* * Capacity of the cache set is the capacity of all the devices in the * slowest (highest) tier - we don't include lower tier devices. 
*/ - for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1; - tier > c->cache_tiers && !tier->nr_devices; - --tier) - ; - - group_for_each_cache_rcu(ca, tier, i) { + group_for_each_cache_rcu(ca, &slowest_tier->devs, i) { size_t reserve = 0; /* @@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c) ca->mi.first_bucket) << ca->bucket_bits; } +set_capacity: rcu_read_unlock(); - total_capacity = capacity; capacity *= (100 - c->opts.gc_reserve_percent); @@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca) void bch_dev_allocator_stop(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *tier = &c->tiers[ca->mi.tier].devs; struct task_struct *p; struct closure cl; unsigned i; @@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca) int bch_dev_allocator_start(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; + struct cache_group *tier = &c->tiers[ca->mi.tier].devs; struct task_struct *k; /* @@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca) bch_dev_group_add(tier, ca); bch_dev_group_add(&c->cache_all, ca); + bch_dev_group_add(&c->journal.devs, ca); bch_recalc_capacity(c); @@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca) return 0; } -void bch_open_buckets_init(struct cache_set *c) +void bch_fs_allocator_init(struct cache_set *c) { unsigned i; @@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c) spin_lock_init(&c->cache_all.lock); - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) { - c->write_points[i].throttle = true; - c->write_points[i].group = &c->cache_tiers[0]; - } - - for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++) - spin_lock_init(&c->cache_tiers[i].lock); + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) + spin_lock_init(&c->tiers[i].devs.lock); - c->promote_write_point.group = &c->cache_tiers[0]; - - c->migration_write_point.group = &c->cache_all; - - c->btree_write_point.group = &c->cache_all; + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + c->write_points[i].throttle = true; c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/libbcache/alloc.h b/libbcache/alloc.h index 09139a59..9573dd2c 100644 --- a/libbcache/alloc.h +++ b/libbcache/alloc.h @@ -27,6 +27,8 @@ int bch_prio_read(struct cache *); void bch_recalc_min_prio(struct cache *, int); +size_t bch_bucket_alloc(struct cache *, enum alloc_reserve); + void bch_open_bucket_put(struct cache_set *, struct open_bucket *); struct open_bucket *bch_alloc_sectors_start(struct cache_set *, @@ -58,7 +60,7 @@ static inline struct cache *cache_group_next_rcu(struct cache_group *devs, { struct cache *ret = NULL; - while (*iter < devs->nr_devices && + while (*iter < devs->nr && !(ret = rcu_dereference(devs->d[*iter].dev))) (*iter)++; @@ -103,8 +105,9 @@ static inline struct cache *cache_group_next(struct cache_group *devs, ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\ (_ptr)++) +void bch_recalc_capacity(struct cache_set *); void bch_dev_allocator_stop(struct cache *); int bch_dev_allocator_start(struct cache *); -void bch_open_buckets_init(struct cache_set *); +void bch_fs_allocator_init(struct cache_set *); #endif /* _BCACHE_ALLOC_H */ diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h index fbe8b75c..f408bd97 100644 --- a/libbcache/alloc_types.h +++ b/libbcache/alloc_types.h @@ -51,7 +51,7 @@ 
static inline bool allocation_is_metadata(enum alloc_reserve id) struct cache_group { spinlock_t lock; - unsigned nr_devices; + unsigned nr; unsigned cur_device; struct { u64 weight; diff --git a/libbcache/bcache.h b/libbcache/bcache.h index babc08db..5b668c71 100644 --- a/libbcache/bcache.h +++ b/libbcache/bcache.h @@ -464,24 +464,10 @@ struct cache { * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching * all the backing devices first (their cached data gets invalidated, and they * won't automatically reattach). - * - * BCH_FS_STOPPING always gets set first when we're closing down a cache set; - * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e. - * flushing dirty data). - * - * BCH_FS_RUNNING means all cache devices have been registered and journal - * replay is complete. */ enum { - /* Startup: */ BCH_FS_INITIAL_GC_DONE, - BCH_FS_RUNNING, - - /* Shutdown: */ BCH_FS_DETACHING, - BCH_FS_STOPPING, - BCH_FS_RO, - BCH_FS_RO_COMPLETE, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_GC_STOPPING, @@ -498,6 +484,21 @@ struct btree_debug { struct dentry *failed; }; +struct bch_tier { + unsigned idx; + struct task_struct *migrate; + struct bch_pd_controller pd; + + struct cache_group devs; +}; + +enum bch_fs_state { + BCH_FS_STARTING = 0, + BCH_FS_STOPPING, + BCH_FS_RO, + BCH_FS_RW, +}; + struct cache_set { struct closure cl; @@ -506,7 +507,6 @@ struct cache_set { struct kobject internal; struct kobject opts_dir; struct kobject time_stats; - struct completion *stop_completion; unsigned long flags; int minor; @@ -514,6 +514,10 @@ struct cache_set { struct super_block *vfs_sb; char name[40]; + /* ro/rw, add/remove devices: */ + struct mutex state_lock; + enum bch_fs_state state; + /* Counts outstanding writes, for clean transition to read-only */ struct percpu_ref writes; struct work_struct read_only_work; @@ -640,7 +644,9 @@ struct cache_set { * allocate from: */ struct cache_group cache_all; - struct cache_group cache_tiers[BCH_TIER_MAX]; + struct bch_tier tiers[BCH_TIER_MAX]; + /* NULL if we only have devices in one tier: */ + struct bch_tier *fastest_tier; u64 capacity; /* sectors */ @@ -753,10 +759,6 @@ struct cache_set { unsigned writeback_pages_max; atomic_long_t nr_inodes; - /* TIERING */ - struct task_struct *tiering_read; - struct bch_pd_controller tiering_pd; - /* NOTIFICATIONS */ struct mutex uevent_lock; struct kobj_uevent_env uevent_env; @@ -828,6 +830,11 @@ struct cache_set { #undef BCH_TIME_STAT }; +static inline bool bch_fs_running(struct cache_set *c) +{ + return c->state == BCH_FS_RO || c->state == BCH_FS_RW; +} + static inline unsigned bucket_pages(const struct cache *ca) { return ca->mi.bucket_size / PAGE_SECTORS; diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c index 82b07f59..ba2e9a8c 100644 --- a/libbcache/blockdev.c +++ b/libbcache/blockdev.c @@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bool found; int ret; + lockdep_assert_held(&c->state_lock); + bdevname(dc->disk_sb.bdev, buf); if (memcmp(&dc->disk_sb.sb->set_uuid, @@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) return -EINVAL; } - if (!test_bit(BCH_FS_RUNNING, &c->flags)) - return 0; - - if (test_bit(BCH_FS_STOPPING, &c->flags)) { - pr_err("Can't attach %s: shutting down", buf); + if (!bch_fs_running(c)) { + pr_err("Can't attach %s: not running", buf); return -EINVAL; } @@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c) struct cached_dev *dc, *t; 
lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); list_for_each_entry_safe(dc, t, &uncached_devices, list) bch_cached_dev_attach(dc, c); @@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c) struct bkey_s_c_inode_blockdev inode; int ret = 0; - if (test_bit(BCH_FS_STOPPING, &c->flags)) + if (!bch_fs_running(c)) return -EINVAL; for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) { diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c index 4d5efdbd..4d0c6d4d 100644 --- a/libbcache/btree_cache.c +++ b/libbcache/btree_cache.c @@ -11,8 +11,9 @@ #define DEF_BTREE_ID(kwd, val, name) name, -const char *bch_btree_id_names[BTREE_ID_NR] = { +const char * const bch_btree_ids[] = { DEFINE_BCH_BTREE_IDS() + NULL }; #undef DEF_BTREE_ID @@ -311,7 +312,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, return mca_can_free(c) * btree_pages(c); } -void bch_btree_cache_free(struct cache_set *c) +void bch_fs_btree_exit(struct cache_set *c) { struct btree *b; unsigned i; @@ -358,7 +359,7 @@ void bch_btree_cache_free(struct cache_set *c) rhashtable_destroy(&c->btree_cache_table); } -int bch_btree_cache_alloc(struct cache_set *c) +int bch_fs_btree_init(struct cache_set *c) { unsigned i; int ret; diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h index c26489d1..4d67704b 100644 --- a/libbcache/btree_cache.h +++ b/libbcache/btree_cache.h @@ -6,7 +6,7 @@ struct btree_iter; -extern const char *bch_btree_id_names[BTREE_ID_NR]; +extern const char * const bch_btree_ids[]; void bch_recalc_btree_reserve(struct cache_set *); @@ -22,8 +22,8 @@ struct btree *mca_alloc(struct cache_set *); struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *, unsigned, enum six_lock_type); -void bch_btree_cache_free(struct cache_set *); -int bch_btree_cache_alloc(struct cache_set *); +void bch_fs_btree_exit(struct cache_set *); +int bch_fs_btree_init(struct cache_set *); #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \ diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c index 0eb7290c..b90807f7 100644 --- a/libbcache/btree_gc.c +++ b/libbcache/btree_gc.c @@ -262,30 +262,72 @@ static void bch_mark_allocator_buckets(struct cache_set *c) } } +static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end, + enum bucket_data_type type) +{ + u64 b = start >> ca->bucket_bits; + + do { + bch_mark_metadata_bucket(ca, ca->buckets + b, type, true); + b++; + } while (b < end >> ca->bucket_bits); +} + /* * Mark non btree metadata - prios, journal */ -static void bch_mark_metadata(struct cache_set *c) +static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca) { - struct cache *ca; - unsigned i, j; + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; u64 b; - for_each_cache(ca, c, i) { - for (j = 0; j < ca->journal.nr; j++) { - b = ca->journal.buckets[j]; - bch_mark_metadata_bucket(ca, ca->buckets + b, true); - } + /* Mark superblocks: */ + for (i = 0; i < layout->nr_superblocks; i++) { + if (layout->sb_offset[i] == BCH_SB_SECTOR) + mark_metadata_sectors(ca, 0, BCH_SB_SECTOR, + BUCKET_SB); + + mark_metadata_sectors(ca, + layout->sb_offset[i], + layout->sb_offset[i] + + (1 << layout->sb_max_size_bits), + BUCKET_SB); + } - spin_lock(&ca->prio_buckets_lock); + spin_lock(&c->journal.lock); - for (j = 0; j < prio_buckets(ca) * 2; j++) { - b = ca->prio_buckets[j]; - bch_mark_metadata_bucket(ca, ca->buckets + b, true); - } + for (i = 
0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; + bch_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_JOURNAL, true); + } + + spin_unlock(&c->journal.lock); + + spin_lock(&ca->prio_buckets_lock); - spin_unlock(&ca->prio_buckets_lock); + for (i = 0; i < prio_buckets(ca) * 2; i++) { + b = ca->prio_buckets[i]; + if (b) + bch_mark_metadata_bucket(ca, ca->buckets + b, + BUCKET_PRIOS, true); } + + spin_unlock(&ca->prio_buckets_lock); +} + +static void bch_mark_metadata(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + mutex_lock(&c->sb_lock); + + for_each_cache(ca, c, i) + bch_mark_dev_metadata(c, ca); + + mutex_unlock(&c->sb_lock); } /* Also see bch_pending_btree_node_free_insert_done() */ @@ -389,7 +431,7 @@ void bch_gc(struct cache_set *c) for_each_bucket(g, ca) { bucket_cmpxchg(g, new, ({ new.owned_by_allocator = 0; - new.is_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); @@ -750,9 +792,6 @@ void bch_coalesce(struct cache_set *c) u64 start_time; enum btree_id id; - if (btree_gc_coalesce_disabled(c)) - return; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) return; @@ -811,7 +850,8 @@ static int bch_gc_thread(void *arg) last_kick = atomic_read(&c->kick_gc); bch_gc(c); - bch_coalesce(c); + if (!btree_gc_coalesce_disabled(c)) + bch_coalesce(c); debug_check_no_locks_held(); } @@ -823,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c) { set_bit(BCH_FS_GC_STOPPING, &c->flags); - if (!IS_ERR_OR_NULL(c->gc_thread)) + if (c->gc_thread) kthread_stop(c->gc_thread); + + c->gc_thread = NULL; + clear_bit(BCH_FS_GC_STOPPING, &c->flags); } int bch_gc_thread_start(struct cache_set *c) { - clear_bit(BCH_FS_GC_STOPPING, &c->flags); + struct task_struct *p; - c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); - if (IS_ERR(c->gc_thread)) - return PTR_ERR(c->gc_thread); + BUG_ON(c->gc_thread); + p = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(p)) + return PTR_ERR(p); + + c->gc_thread = p; wake_up_process(c->gc_thread); return 0; } @@ -883,12 +929,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) { enum btree_id id; - if (journal) { - for (id = 0; id < BTREE_ID_NR; id++) - bch_initial_gc_btree(c, id); + bch_mark_metadata(c); + for (id = 0; id < BTREE_ID_NR; id++) + bch_initial_gc_btree(c, id); + + if (journal) bch_journal_mark(c, journal); - } /* * Skip past versions that might have possibly been used (as nonces), @@ -897,8 +944,6 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); - bch_mark_metadata(c); - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); diff --git a/libbcache/buckets.c b/libbcache/buckets.c index 315cfbec..ec4ee54a 100644 --- a/libbcache/buckets.c +++ b/libbcache/buckets.c @@ -66,6 +66,7 @@ #include "alloc.h" #include "btree_gc.h" #include "buckets.h" +#include "error.h" #include <linux/preempt.h> #include <trace/events/bcache.h> @@ -102,6 +103,10 @@ static void bch_fs_stats_verify(struct cache_set *c) {} #endif +/* + * Clear journal_seq_valid for buckets for which it's not needed, to prevent + * wraparound: + */ void bch_bucket_seq_cleanup(struct cache_set *c) { u16 last_seq_ondisk = c->journal.last_seq_ondisk; @@ -113,12 +118,11 @@ void bch_bucket_seq_cleanup(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(g, ca) { bucket_cmpxchg(g, m, ({ - if (!m.wait_on_journal || - ((s16) last_seq_ondisk - - (s16) m.journal_seq < 0)) + if 
(!m.journal_seq_valid || + bucket_needs_journal_commit(m, last_seq_ondisk)) break; - m.wait_on_journal = 0; + m.journal_seq_valid = 0; })); } } @@ -186,17 +190,18 @@ bch_bucket_stats_read_cache_set(struct cache_set *c) static inline int is_meta_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && m.is_metadata; + return m.data_type != BUCKET_DATA; } static inline int is_dirty_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors; + return m.data_type == BUCKET_DATA && !!m.dirty_sectors; } static inline int is_cached_bucket(struct bucket_mark m) { - return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors; + return m.data_type == BUCKET_DATA && + !m.dirty_sectors && !!m.cached_sectors; } void bch_fs_stats_apply(struct cache_set *c, @@ -236,29 +241,37 @@ void bch_fs_stats_apply(struct cache_set *c, memset(stats, 0, sizeof(*stats)); } +static bool bucket_became_unavailable(struct cache_set *c, + struct bucket_mark old, + struct bucket_mark new) +{ + return is_available_bucket(old) && + !is_available_bucket(new) && + c->gc_pos.phase == GC_PHASE_DONE; +} + static void bucket_stats_update(struct cache *ca, struct bucket_mark old, struct bucket_mark new, - bool may_make_unavailable, struct bucket_stats_cache_set *bch_alloc_stats) { struct cache_set *c = ca->set; struct bucket_stats_cache *cache_stats; - BUG_ON(!may_make_unavailable && - is_available_bucket(old) && - !is_available_bucket(new) && - c->gc_pos.phase == GC_PHASE_DONE); + bch_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of metadata in same bucket: %u, %u", + old.data_type, new.data_type); if (bch_alloc_stats) { bch_alloc_stats->s[S_COMPRESSED][S_CACHED] += (int) new.cached_sectors - (int) old.cached_sectors; bch_alloc_stats->s[S_COMPRESSED] - [old.is_metadata ? S_META : S_DIRTY] -= + [is_meta_bucket(old) ? S_META : S_DIRTY] -= old.dirty_sectors; bch_alloc_stats->s[S_COMPRESSED] - [new.is_metadata ? S_META : S_DIRTY] += + [is_meta_bucket(new) ? 
S_META : S_DIRTY] += new.dirty_sectors; } @@ -268,12 +281,12 @@ static void bucket_stats_update(struct cache *ca, cache_stats->sectors_cached += (int) new.cached_sectors - (int) old.cached_sectors; - if (old.is_metadata) + if (is_meta_bucket(old)) cache_stats->sectors_meta -= old.dirty_sectors; else cache_stats->sectors_dirty -= old.dirty_sectors; - if (new.is_metadata) + if (is_meta_bucket(new)) cache_stats->sectors_meta += new.dirty_sectors; else cache_stats->sectors_dirty += new.dirty_sectors; @@ -290,6 +303,15 @@ static void bucket_stats_update(struct cache *ca, bch_wake_allocator(ca); } +#define bucket_data_cmpxchg(ca, g, new, expr) \ +({ \ + struct bucket_stats_cache_set _stats = { 0 }; \ + struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ + \ + bucket_stats_update(ca, _old, new, &_stats); \ + _old; \ +}) + void bch_invalidate_bucket(struct cache *ca, struct bucket *g) { struct bucket_stats_cache_set stats = { 0 }; @@ -297,16 +319,17 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g) old = bucket_cmpxchg(g, new, ({ new.owned_by_allocator = 1; - new.is_metadata = 0; + new.had_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; new.copygc = 0; new.gen++; })); - BUG_ON(old.dirty_sectors); + bucket_stats_update(ca, old, new, &stats); - bucket_stats_update(ca, old, new, true, &stats); + BUG_ON(old.dirty_sectors); /* * Ick: @@ -329,45 +352,45 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g) void bch_mark_free_bucket(struct cache *ca, struct bucket *g) { - struct bucket_stats_cache_set stats = { 0 }; struct bucket_mark old, new; - old = bucket_cmpxchg(g, new, ({ + old = bucket_data_cmpxchg(ca, g, new, ({ new.owned_by_allocator = 0; - new.is_metadata = 0; + new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; })); - bucket_stats_update(ca, old, new, false, &stats); + BUG_ON(bucket_became_unavailable(ca->set, old, new)); } void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g, bool owned_by_allocator) { - struct bucket_stats_cache_set stats = { 0 }; - struct bucket_mark old, new; + struct bucket_mark new; - old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator); - - bucket_stats_update(ca, old, new, true, &stats); + bucket_data_cmpxchg(ca, g, new, ({ + new.owned_by_allocator = owned_by_allocator; + })); } void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g, + enum bucket_data_type type, bool may_make_unavailable) { - struct bucket_stats_cache_set stats = { 0 }; struct bucket_mark old, new; - old = bucket_cmpxchg(g, new, ({ - new.is_metadata = 1; + BUG_ON(!type); + + old = bucket_data_cmpxchg(ca, g, new, ({ + new.data_type = type; new.had_metadata = 1; })); BUG_ON(old.cached_sectors); BUG_ON(old.dirty_sectors); - - bucket_stats_update(ca, old, new, may_make_unavailable, &stats); + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(ca->set, old, new)); } #define saturated_add(ca, dst, src, max) \ @@ -487,22 +510,26 @@ static void bch_mark_pointer(struct cache_set *c, if (!new.dirty_sectors && !new.cached_sectors) { - new.is_metadata = false; + new.data_type = 0; if (journal_seq) { - new.wait_on_journal = true; + new.journal_seq_valid = 1; new.journal_seq = journal_seq; } } else { - new.is_metadata = (type == S_META); + new.data_type = type == S_META + ? 
BUCKET_BTREE : BUCKET_DATA; } - new.had_metadata |= new.is_metadata; + new.had_metadata |= is_meta_bucket(new); } while ((v = cmpxchg(&g->_mark.counter, old.counter, new.counter)) != old.counter); - bucket_stats_update(ca, old, new, may_make_unavailable, NULL); + bucket_stats_update(ca, old, new, NULL); + + BUG_ON(!may_make_unavailable && + bucket_became_unavailable(c, old, new)); if (saturated && atomic_long_add_return(saturated, diff --git a/libbcache/buckets.h b/libbcache/buckets.h index 9c6e4385..6d70103e 100644 --- a/libbcache/buckets.h +++ b/libbcache/buckets.h @@ -235,8 +235,16 @@ static inline u64 sectors_available(struct cache_set *c) static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && - !mark.is_metadata && - !mark.dirty_sectors); + mark.data_type == BUCKET_DATA && + !mark.dirty_sectors && + !mark.nouse); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); } void bch_bucket_seq_cleanup(struct cache_set *); @@ -244,7 +252,8 @@ void bch_bucket_seq_cleanup(struct cache_set *); void bch_invalidate_bucket(struct cache *, struct bucket *); void bch_mark_free_bucket(struct cache *, struct bucket *); void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool); -void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool); +void bch_mark_metadata_bucket(struct cache *, struct bucket *, + enum bucket_data_type, bool); void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool, struct bucket_stats_cache_set *); diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h index 6bbdcd26..f42e09d8 100644 --- a/libbcache/buckets_types.h +++ b/libbcache/buckets_types.h @@ -1,6 +1,14 @@ #ifndef _BUCKETS_TYPES_H #define _BUCKETS_TYPES_H +enum bucket_data_type { + BUCKET_DATA = 0, + BUCKET_BTREE, + BUCKET_PRIOS, + BUCKET_JOURNAL, + BUCKET_SB, +}; + struct bucket_mark { union { struct { @@ -12,23 +20,30 @@ struct bucket_mark { /* generation copygc is going to move this bucket into */ unsigned copygc:1; - unsigned wait_on_journal:1; + + unsigned journal_seq_valid:1; /* - * If this bucket ever had metadata in it, the allocator must - * increment its gen before we reuse it: + * If this bucket had metadata while at the current generation + * number, the allocator must increment its gen before we reuse + * it: */ unsigned had_metadata:1; unsigned owned_by_allocator:1; - unsigned is_metadata:1; - u16 cached_sectors; + unsigned data_type:3; + + unsigned nouse:1; + u16 dirty_sectors; + u16 cached_sectors; /* * low bits of journal sequence number when this bucket was most - * recently modified: + * recently modified: if journal_seq_valid is set, this bucket + * can't be reused until the journal sequence number written to + * disk is >= the bucket's journal sequence number: */ u16 journal_seq; }; diff --git a/libbcache/chardev.c b/libbcache/chardev.c index b142d7b2..049aa910 100644 --- a/libbcache/chardev.c +++ b/libbcache/chardev.c @@ -107,7 +107,7 @@ static long bch_global_ioctl(unsigned cmd, void __user *arg) static long bch_ioctl_stop(struct cache_set *c) { - bch_fs_stop(c); + bch_fs_stop_async(c); return 0; } diff --git a/libbcache/checksum.c b/libbcache/checksum.c index dae52d49..92036db4 100644 --- a/libbcache/checksum.c +++ b/libbcache/checksum.c @@ -539,15 +539,12 @@ int bch_enable_encryption(struct cache_set *c, bool keyed) if (ret) goto err; - crypt = 
container_of_or_null(bch_fs_sb_field_resize(c, NULL, - sizeof(*crypt) / sizeof(u64)), - struct bch_sb_field_crypt, field); + crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64)); if (!crypt) { ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ goto err; } - crypt->field.type = BCH_SB_FIELD_crypt; crypt->key = key; /* write superblock */ @@ -560,7 +557,7 @@ err: return ret; } -void bch_fs_encryption_free(struct cache_set *c) +void bch_fs_encryption_exit(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); diff --git a/libbcache/checksum.h b/libbcache/checksum.h index 137c9155..9d4da08d 100644 --- a/libbcache/checksum.h +++ b/libbcache/checksum.h @@ -43,7 +43,7 @@ void bch_encrypt_bio(struct cache_set *, unsigned, int bch_disable_encryption(struct cache_set *); int bch_enable_encryption(struct cache_set *, bool); -void bch_fs_encryption_free(struct cache_set *); +void bch_fs_encryption_exit(struct cache_set *); int bch_fs_encryption_init(struct cache_set *); static inline unsigned bch_data_checksum_type(struct cache_set *c) diff --git a/libbcache/compress.c b/libbcache/compress.c index f81a8143..89da31e5 100644 --- a/libbcache/compress.c +++ b/libbcache/compress.c @@ -434,10 +434,10 @@ int bch_check_set_has_compressed_data(struct cache_set *c, break; } - return bch_compress_init(c); + return bch_fs_compress_init(c); } -void bch_compress_free(struct cache_set *c) +void bch_fs_compress_exit(struct cache_set *c) { vfree(c->zlib_workspace); mempool_exit(&c->lz4_workspace_pool); @@ -450,15 +450,11 @@ void bch_compress_free(struct cache_set *c) max_t(size_t, zlib_inflate_workspacesize(), \ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)) -int bch_compress_init(struct cache_set *c) +int bch_fs_compress_init(struct cache_set *c) { unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9); int ret, cpu; - if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && - !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) - return 0; - if (!c->bio_decompress_worker) { c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker); if (!c->bio_decompress_worker) @@ -474,6 +470,10 @@ int bch_compress_init(struct cache_set *c) } } + if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && + !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) + return 0; + if (!mempool_initialized(&c->compression_bounce[READ])) { ret = mempool_init_page_pool(&c->compression_bounce[READ], 1, order); diff --git a/libbcache/compress.h b/libbcache/compress.h index 485acd95..4604b065 100644 --- a/libbcache/compress.h +++ b/libbcache/compress.h @@ -9,7 +9,7 @@ void bch_bio_compress(struct cache_set *, struct bio *, size_t *, struct bio *, size_t *, unsigned *); int bch_check_set_has_compressed_data(struct cache_set *, unsigned); -void bch_compress_free(struct cache_set *); -int bch_compress_init(struct cache_set *); +void bch_fs_compress_exit(struct cache_set *); +int bch_fs_compress_init(struct cache_set *); #endif /* _BCACHE_COMPRESS_H */ diff --git a/libbcache/debug.c b/libbcache/debug.c index d25c32ae..16cc72b9 100644 --- a/libbcache/debug.c +++ b/libbcache/debug.c @@ -409,13 +409,13 @@ static const struct file_operations bfloat_failed_debug_ops = { .read = bch_read_bfloat_failed, }; -void bch_debug_exit_cache_set(struct cache_set *c) +void bch_fs_debug_exit(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->debug)) debugfs_remove_recursive(c->debug); } -void bch_debug_init_cache_set(struct cache_set *c) +void bch_fs_debug_init(struct cache_set *c) { struct btree_debug *bd; 
char name[100]; @@ -432,18 +432,18 @@ void bch_debug_init_cache_set(struct cache_set *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - bd->btree = debugfs_create_file(bch_btree_id_names[bd->id], + bd->btree = debugfs_create_file(bch_btree_ids[bd->id], 0400, c->debug, bd, &btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", - bch_btree_id_names[bd->id]); + bch_btree_ids[bd->id]); bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", - bch_btree_id_names[bd->id]); + bch_btree_ids[bd->id]); bd->failed = debugfs_create_file(name, 0400, c->debug, bd, &bfloat_failed_debug_ops); diff --git a/libbcache/debug.h b/libbcache/debug.h index a3635e60..d34a95a0 100644 --- a/libbcache/debug.h +++ b/libbcache/debug.h @@ -52,11 +52,11 @@ static inline void bch_btree_verify(struct cache_set *c, struct btree *b) } #ifdef CONFIG_DEBUG_FS -void bch_debug_exit_cache_set(struct cache_set *); -void bch_debug_init_cache_set(struct cache_set *); +void bch_fs_debug_exit(struct cache_set *); +void bch_fs_debug_init(struct cache_set *); #else -static inline void bch_debug_exit_cache_set(struct cache_set *c) {} -static inline void bch_debug_init_cache_set(struct cache_set *c) {} +static inline void bch_fs_debug_exit(struct cache_set *c) {} +static inline void bch_fs_debug_init(struct cache_set *c) {} #endif void bch_debug_exit(void); diff --git a/libbcache/error.c b/libbcache/error.c index 9f39be1b..f4109da6 100644 --- a/libbcache/error.c +++ b/libbcache/error.c @@ -14,7 +14,7 @@ void bch_inconsistent_error(struct cache_set *c) case BCH_ON_ERROR_RO: if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { /* XXX do something better here? */ - bch_fs_stop(c); + bch_fs_stop_async(c); return; } @@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work) } else { bch_notify_dev_error(ca, true); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); dev = bch_dev_may_remove(ca); if (dev ? bch_dev_read_only(ca) @@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work) "too many IO errors on %s, setting %s RO", bdevname(ca->disk_sb.bdev, buf), dev ? 
"device" : "filesystem"); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); } } diff --git a/libbcache/extents.c b/libbcache/extents.c index 523f3f48..c5e0e375 100644 --- a/libbcache/extents.c +++ b/libbcache/extents.c @@ -547,7 +547,7 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b, do { seq = read_seqcount_begin(&c->gc_pos_lock); bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - !g->mark.is_metadata; + g->mark.data_type != BUCKET_BTREE; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); err = "inconsistent"; @@ -602,6 +602,7 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; + struct extent_pick_ptr pick = { .ca = NULL }; struct cache *ca; rcu_read_lock(); @@ -621,15 +622,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b) PTR_BUCKET_NR(ca, ptr))) continue; - percpu_ref_get(&ca->ref); - rcu_read_unlock(); + if (pick.ca && pick.ca->mi.tier < ca->mi.tier) + continue; - return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca }; + pick.ca = ca; + pick.ptr = *ptr; } + if (pick.ca) + percpu_ref_get(&pick.ca->ref); + rcu_read_unlock(); - return (struct extent_pick_ptr) { .ca = NULL, }; + return pick; } const struct bkey_ops bch_bkey_btree_ops = { @@ -1880,7 +1885,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b, if (stale) break; - bad = (mark.is_metadata || + bad = (mark.data_type != BUCKET_DATA || (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && !mark.owned_by_allocator && !(ptr->cached @@ -2193,17 +2198,21 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k, rcu_read_lock(); ret->ca = NULL; - extent_for_each_online_device_crc(c, e, crc, ptr, ca) - if (!ptr_stale(ca, ptr)) { - *ret = (struct extent_pick_ptr) { - .crc = crc_to_128(e.k, crc), - .ptr = *ptr, - .ca = ca, - }; - - if (ca != avoid) - break; - } + extent_for_each_online_device_crc(c, e, crc, ptr, ca) { + if (ptr_stale(ca, ptr)) + continue; + + if (ret->ca && + (ca == avoid || + ret->ca->mi.tier < ca->mi.tier)) + continue; + + *ret = (struct extent_pick_ptr) { + .crc = crc_to_128(e.k, crc), + .ptr = *ptr, + .ca = ca, + }; + } if (ret->ca) percpu_ref_get(&ret->ca->ref); diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c index e9585fd5..e2f1427f 100644 --- a/libbcache/fs-gc.c +++ b/libbcache/fs-gc.c @@ -545,9 +545,9 @@ struct nlink { u32 dir_count; }; -DECLARE_GENRADIX_TYPE(nlinks, struct nlink); +typedef GENRADIX(struct nlink) nlink_table; -static void inc_link(struct cache_set *c, struct nlinks *links, +static void inc_link(struct cache_set *c, nlink_table *links, u64 range_start, u64 *range_end, u64 inum, bool dir) { @@ -570,7 +570,7 @@ static void inc_link(struct cache_set *c, struct nlinks *links, } noinline_for_stack -static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links, +static int bch_gc_walk_dirents(struct cache_set *c, nlink_table *links, u64 range_start, u64 *range_end) { struct btree_iter iter; @@ -776,7 +776,7 @@ fsck_err: noinline_for_stack static int bch_gc_walk_inodes(struct cache_set *c, struct bch_inode_unpacked *lostfound_inode, - struct nlinks *links, + nlink_table *links, u64 range_start, u64 range_end) { struct btree_iter iter; @@ -850,7 +850,7 @@ noinline_for_stack static int check_inode_nlinks(struct cache_set *c, struct bch_inode_unpacked *lostfound_inode) { - struct nlinks links; + nlink_table links; u64 
this_iter_range_start, next_iter_range_start = 0; int ret = 0; diff --git a/libbcache/fs.c b/libbcache/fs.c index ab0d9728..ec70a3e3 100644 --- a/libbcache/fs.c +++ b/libbcache/fs.c @@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name, if (!c) goto err_unlock; - if (!test_bit(BCH_FS_RUNNING, &c->flags)) { + mutex_lock(&c->state_lock); + + if (!bch_fs_running(c)) { + mutex_unlock(&c->state_lock); err = "incomplete cache set"; c = NULL; goto err_unlock; } closure_get(&c->cl); + mutex_unlock(&c->state_lock); mutex_unlock(&bch_register_lock); } @@ -1291,22 +1295,19 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) if (ret) return ret; - mutex_lock(&bch_register_lock); - if (opts.read_only >= 0 && opts.read_only != c->opts.read_only) { const char *err = NULL; if (opts.read_only) { - bch_fs_read_only_sync(c); + bch_fs_read_only(c); sb->s_flags |= MS_RDONLY; } else { err = bch_fs_read_write(c); if (err) { bch_err(c, "error going rw: %s", err); - ret = -EINVAL; - goto unlock; + return -EINVAL; } sb->s_flags &= ~MS_RDONLY; @@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data) if (opts.errors >= 0) c->opts.errors = opts.errors; -unlock: - mutex_unlock(&bch_register_lock); - return ret; } @@ -1449,7 +1447,7 @@ static void bch_kill_sb(struct super_block *sb) generic_shutdown_super(sb); if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) - bch_fs_stop_sync(c); + bch_fs_stop(c); else closure_put(&c->cl); } @@ -1464,7 +1462,7 @@ static struct file_system_type bcache_fs_type = { MODULE_ALIAS_FS("bcache"); -void bch_fs_exit(void) +void bch_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); if (bch_dio_write_bioset) @@ -1477,7 +1475,7 @@ void bch_fs_exit(void) kmem_cache_destroy(bch_inode_cache); } -int __init bch_fs_init(void) +int __init bch_vfs_init(void) { int ret = -ENOMEM; @@ -1504,6 +1502,6 @@ int __init bch_fs_init(void) return 0; err: - bch_fs_exit(); + bch_vfs_exit(); return ret; } diff --git a/libbcache/fs.h b/libbcache/fs.h index 933fb6de..2a29b132 100644 --- a/libbcache/fs.h +++ b/libbcache/fs.h @@ -52,13 +52,13 @@ int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *, int __must_check bch_write_inode(struct cache_set *, struct bch_inode_info *); -void bch_fs_exit(void); -int bch_fs_init(void); +void bch_vfs_exit(void); +int bch_vfs_init(void); #else -static inline void bch_fs_exit(void) {} -static inline int bch_fs_init(void) { return 0; } +static inline void bch_vfs_exit(void) {} +static inline int bch_vfs_init(void) { return 0; } #endif diff --git a/libbcache/io.c b/libbcache/io.c index be99a973..a3df3794 100644 --- a/libbcache/io.c +++ b/libbcache/io.c @@ -722,9 +722,7 @@ void bch_wake_delayed_writes(unsigned long data) spin_lock_irqsave(&c->foreground_write_pd_lock, flags); while ((op = c->write_wait_head)) { - if (!test_bit(BCH_FS_RO, &c->flags) && - !test_bit(BCH_FS_STOPPING, &c->flags) && - time_after(op->expires, jiffies)) { + if (time_after(op->expires, jiffies)) { mod_timer(&c->foreground_write_wakeup, op->expires); break; } @@ -1068,9 +1066,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio) return; } - if (rbio->promote && - !test_bit(BCH_FS_RO, &c->flags) && - !test_bit(BCH_FS_STOPPING, &c->flags)) { + if (rbio->promote) { struct cache_promote_op *promote = rbio->promote; struct closure *cl = &promote->cl; @@ -1133,13 +1129,26 @@ static void bch_read_endio(struct bio *bio) preempt_disable(); d = 
this_cpu_ptr(c->bio_decompress_worker); llist_add(&rbio->list, &d->bio_list); - queue_work(system_unbound_wq, &d->work); + queue_work(system_highpri_wq, &d->work); preempt_enable(); } else { __bch_read_endio(c, rbio); } } +static bool should_promote(struct cache_set *c, + struct extent_pick_ptr *pick, unsigned flags) +{ + if (!(flags & BCH_READ_PROMOTE)) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + return c->fastest_tier && + c->fastest_tier < c->tiers + pick->ca->mi.tier; +} + void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) @@ -1158,7 +1167,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, * XXX: multiple promotes can race with each other, wastefully. Keep a * list of outstanding promotes? */ - if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) { + if (should_promote(c, pick, flags)) { /* * biovec needs to be big enough to hold decompressed data, if * the bch_write_extent() has to decompress/recompress it: diff --git a/libbcache/journal.c b/libbcache/journal.c index 99dd9f26..b2838376 100644 --- a/libbcache/journal.c +++ b/libbcache/journal.c @@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c, return BCH_FSCK_UNKNOWN_VERSION; } - if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 || - bytes > c->journal.entry_size_max, c, + if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c, "journal entry too big (%zu bytes), sector %lluu", bytes, sector)) { /* XXX: note we might have missing journal entries */ @@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c) { struct journal *j = &c->journal; struct journal_seq_blacklist *bl; - struct cache *ca; u64 new_seq = 0; - unsigned i; - - for_each_cache(ca, c, i) - if (is_journal_device(ca)) - bch_dev_group_add(&c->journal.devs, ca); list_for_each_entry(bl, &j->seq_blacklist, list) new_seq = max(new_seq, bl->seq); @@ -1534,48 +1527,111 @@ err: return ret; } -static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr) +static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca, + unsigned nr, bool write_super) { + struct journal *j = &c->journal; struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets = - bch_sb_get_journal(ca->disk_sb.sb); - struct bch_sb_field *f; - u64 *p; + struct bch_sb_field_journal *journal_buckets; + struct disk_reservation disk_res = { 0, 0 }; + struct closure cl; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; + int ret = 0; - p = krealloc(ja->bucket_seq, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; + closure_init_stack(&cl); - ja->bucket_seq = p; + mutex_lock(&c->sb_lock); - p = krealloc(ja->buckets, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; + /* don't handle reducing nr of buckets yet: */ + if (nr <= ja->nr) + goto err; - ja->buckets = p; + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + */ - f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr + - sizeof(*journal_buckets) / sizeof(u64)); - if (!f) - return -ENOMEM; - f->type = BCH_SB_FIELD_journal; + ret = ENOSPC; + if (bch_disk_reservation_get(c, &disk_res, + (nr - ja->nr) << 
ca->bucket_bits, 0)) + goto err; - ja->nr = nr; - return 0; + ret = -ENOMEM; + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); + if (!new_buckets || !new_bucket_seq) + goto err; + + journal_buckets = bch_sb_resize_journal(&ca->disk_sb, + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) + goto err; + + spin_lock(&j->lock); + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + + while (ja->nr < nr) { + /* must happen under journal lock, to avoid racing with gc: */ + u64 b = bch_bucket_alloc(ca, RESERVE_NONE); + if (!b) { + if (!closure_wait(&c->freelist_wait, &cl)) { + spin_unlock(&j->lock); + closure_sync(&cl); + spin_lock(&j->lock); + } + continue; + } + + bch_mark_metadata_bucket(ca, &ca->buckets[b], + BUCKET_JOURNAL, false); + bch_mark_alloc_bucket(ca, &ca->buckets[b], false); + + memmove(ja->buckets + ja->last_idx + 1, + ja->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(ja->bucket_seq + ja->last_idx + 1, + ja->bucket_seq + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + memmove(journal_buckets->buckets + ja->last_idx + 1, + journal_buckets->buckets + ja->last_idx, + (ja->nr - ja->last_idx) * sizeof(u64)); + + ja->buckets[ja->last_idx] = b; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); + + if (ja->last_idx < ja->nr) { + if (ja->cur_idx >= ja->last_idx) + ja->cur_idx++; + ja->last_idx++; + } + ja->nr++; + + } + spin_unlock(&j->lock); + + BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi)); + + if (write_super) + bch_write_super(c); + + ret = 0; +err: + mutex_unlock(&c->sb_lock); + + kfree(new_bucket_seq); + kfree(new_buckets); + bch_disk_reservation_put(c, &disk_res); + + return ret; } int bch_dev_journal_alloc(struct cache *ca) { - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; - int ret; - unsigned i; - - if (ca->mi.tier != 0) - return 0; - if (dynamic_fault("bcache:add:journal_alloc")) return -ENOMEM; @@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca) * clamp journal size to 1024 buckets or 512MB (in sectors), whichever * is smaller: */ - ret = bch_set_nr_journal_buckets(ca, + return bch_set_nr_journal_buckets(ca->set, ca, clamp_t(unsigned, ca->mi.nbuckets >> 8, BCH_JOURNAL_BUCKETS_MIN, min(1 << 10, - (1 << 20) / ca->mi.bucket_size))); - if (ret) - return ret; - - journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); - - for (i = 0; i < ja->nr; i++) { - u64 bucket = ca->mi.first_bucket + i; - - ja->buckets[i] = bucket; - journal_buckets->buckets[i] = cpu_to_le64(bucket); - - bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true); - } - - return 0; + (1 << 20) / ca->mi.bucket_size)), + false); } /* Journalling */ @@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j, fifo_entry_idx(&j->pin, pin->pin_list))) { if (journal_pin_active(pin)) __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, - pin, NULL); + __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); } spin_unlock_irq(&j->pin_lock); } - static struct journal_entry_pin * journal_get_next_pin(struct journal *j, u64 seq_to_flush) { @@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush) return ret; } +static bool journal_has_pins(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + journal_reclaim_fast(j); + ret = 
fifo_used(&j->pin) > 1 || + atomic_read(&fifo_peek_front(&j->pin).count) > 1; + spin_unlock(&j->lock); + + return ret; +} + +void bch_journal_flush_pins(struct journal *j) +{ + struct journal_entry_pin *pin; + + while ((pin = journal_get_next_pin(j, U64_MAX))) + pin->flush(j, pin); + + wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j)); +} + static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { bool ret; @@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) struct cache_set *c = container_of(j, struct cache_set, journal); struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); struct bch_extent_ptr *ptr; + struct journal_device *ja; struct cache *ca; - unsigned iter, replicas, replicas_want = + bool swapped; + unsigned i, replicas, replicas_want = READ_ONCE(c->opts.metadata_replicas); spin_lock(&j->lock); @@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) replicas = bch_extent_nr_ptrs(e.c); + spin_lock(&j->devs.lock); + + /* Sort by tier: */ + do { + swapped = false; + + for (i = 0; i + 1 < j->devs.nr; i++) + if (j->devs.d[i + 0].dev->mi.tier > + j->devs.d[i + 1].dev->mi.tier) { + swap(j->devs.d[i], j->devs.d[i + 1]); + swapped = true; + } + } while (swapped); + /* - * Determine location of the next journal write: - * XXX: sort caches by free journal space + * Pick devices for next journal write: + * XXX: sort devices by free journal space? */ - group_for_each_cache_rcu(ca, &j->devs, iter) { - struct journal_device *ja = &ca->journal; + for (i = 0; i < j->devs.nr; i++) { + ca = j->devs.d[i].dev; + ja = &ca->journal; if (replicas >= replicas_want) break; @@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx); } - + spin_unlock(&j->devs.lock); rcu_read_unlock(); j->prev_buf_sectors = 0; @@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j) return bch_journal_flush_seq(j, seq); } -void bch_journal_free(struct journal *j) -{ - unsigned order = get_order(j->entry_size_max); - - free_pages((unsigned long) j->buf[1].data, order); - free_pages((unsigned long) j->buf[0].data, order); - free_fifo(&j->pin); -} - -int bch_journal_alloc(struct journal *j, unsigned entry_size_max) -{ - static struct lock_class_key res_key; - unsigned order = get_order(entry_size_max); - - spin_lock_init(&j->lock); - spin_lock_init(&j->pin_lock); - init_waitqueue_head(&j->wait); - INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); - spin_lock_init(&j->devs.lock); - mutex_init(&j->reclaim_lock); - - lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - - j->entry_size_max = entry_size_max; - j->write_delay_ms = 100; - j->reclaim_delay_ms = 100; - - bkey_extent_init(&j->key); - - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || - !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) - return -ENOMEM; - - return 0; -} - ssize_t bch_journal_print_debug(struct journal *j, char *buf) { union journal_res_state *s = &j->reservations; @@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca) return ret; } -void bch_journal_free_cache(struct cache 
*ca) +void bch_fs_journal_stop(struct journal *j) +{ + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return; + + /* + * Empty out the journal by first flushing everything pinning existing + * journal entries, then force a brand new empty journal entry to be + * written: + */ + bch_journal_flush_pins(j); + bch_journal_flush_async(j, NULL); + bch_journal_meta(j); + + cancel_delayed_work_sync(&j->write_work); + cancel_delayed_work_sync(&j->reclaim_work); +} + +void bch_dev_journal_exit(struct cache *ca) { kfree(ca->journal.buckets); kfree(ca->journal.bucket_seq); } -int bch_journal_init_cache(struct cache *ca) +int bch_dev_journal_init(struct cache *ca) { struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = @@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca) return 0; } + +void bch_fs_journal_exit(struct journal *j) +{ + unsigned order = get_order(j->entry_size_max); + + free_pages((unsigned long) j->buf[1].data, order); + free_pages((unsigned long) j->buf[0].data, order); + free_fifo(&j->pin); +} + +int bch_fs_journal_init(struct journal *j, unsigned entry_size_max) +{ + static struct lock_class_key res_key; + unsigned order = get_order(entry_size_max); + + spin_lock_init(&j->lock); + spin_lock_init(&j->pin_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); + INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); + mutex_init(&j->blacklist_lock); + INIT_LIST_HEAD(&j->seq_blacklist); + spin_lock_init(&j->devs.lock); + mutex_init(&j->reclaim_lock); + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + + j->entry_size_max = entry_size_max; + j->write_delay_ms = 100; + j->reclaim_delay_ms = 100; + + bkey_extent_init(&j->key); + + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || + !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) + return -ENOMEM; + + return 0; +} diff --git a/libbcache/journal.h b/libbcache/journal.h index 02a6e676..d3a1db0c 100644 --- a/libbcache/journal.h +++ b/libbcache/journal.h @@ -111,7 +111,6 @@ #include <linux/hash.h> #include "journal_types.h" -//#include "super-io.h" /* * Only used for holding the journal entries we read in btree_journal_read() @@ -136,6 +135,7 @@ void bch_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); +void bch_journal_flush_pins(struct journal *); struct closure; struct cache_set; @@ -330,11 +330,6 @@ static inline int bch_journal_error(struct journal *j) ? 
-EIO : 0; } -static inline bool is_journal_device(struct cache *ca) -{ - return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0; -} - static inline bool journal_flushes_device(struct cache *ca) { return true; @@ -356,9 +351,6 @@ static inline void bch_journal_set_replay_done(struct journal *j) spin_unlock(&j->lock); } -void bch_journal_free(struct journal *); -int bch_journal_alloc(struct journal *, unsigned); - ssize_t bch_journal_print_debug(struct journal *, char *); int bch_dev_journal_alloc(struct cache *); @@ -372,7 +364,10 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j) int bch_journal_move(struct cache *); -void bch_journal_free_cache(struct cache *); -int bch_journal_init_cache(struct cache *); +void bch_fs_journal_stop(struct journal *); +void bch_dev_journal_exit(struct cache *); +int bch_dev_journal_init(struct cache *); +void bch_fs_journal_exit(struct journal *); +int bch_fs_journal_init(struct journal *, unsigned); #endif /* _BCACHE_JOURNAL_H */ diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c index e40dfbca..27f5c63c 100644 --- a/libbcache/movinggc.c +++ b/libbcache/movinggc.c @@ -191,7 +191,7 @@ static void bch_moving_gc(struct cache *ca) } if (g->mark.owned_by_allocator || - g->mark.is_metadata) + g->mark.data_type != BUCKET_DATA) continue; sectors_used = bucket_sectors_used(g); @@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg) return 0; } -void bch_moving_init_cache(struct cache *ca) +void bch_moving_gc_stop(struct cache *ca) { - bch_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; + ca->moving_gc_pd.rate.rate = UINT_MAX; + bch_ratelimit_reset(&ca->moving_gc_pd.rate); + + if (ca->moving_gc_read) + kthread_stop(ca->moving_gc_read); + ca->moving_gc_read = NULL; } -int bch_moving_gc_thread_start(struct cache *ca) +int bch_moving_gc_start(struct cache *ca) { struct task_struct *t; - /* The moving gc read thread must be stopped */ - BUG_ON(ca->moving_gc_read != NULL); + BUG_ON(ca->moving_gc_read); if (ca->set->opts.nochanges) return 0; @@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca) return 0; } -void bch_moving_gc_stop(struct cache *ca) +void bch_dev_moving_gc_init(struct cache *ca) { - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; + bch_pd_controller_init(&ca->moving_gc_pd); + ca->moving_gc_pd.d_term = 0; } diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h index 5f153085..e8ae95e5 100644 --- a/libbcache/movinggc.h +++ b/libbcache/movinggc.h @@ -23,8 +23,8 @@ #define COPYGC_SECTORS_PER_ITER(ca) \ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) -void bch_moving_init_cache(struct cache *); void bch_moving_gc_stop(struct cache *); -int bch_moving_gc_thread_start(struct cache *); +int bch_moving_gc_start(struct cache *); +void bch_dev_moving_gc_init(struct cache *); #endif diff --git a/libbcache/opts.h b/libbcache/opts.h index 95184db1..9b10310d 100644 --- a/libbcache/opts.h +++ b/libbcache/opts.h @@ -86,11 +86,17 @@ enum opt_type { BCH_OPT(noreplay, 0444, NO_SB_OPT, \ s8, OPT_BOOL()) \ BCH_OPT(norecovery, 0444, NO_SB_OPT, \ - s8, OPT_BOOL()) + s8, OPT_BOOL()) \ + BCH_OPT(noexcl, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ + BCH_OPT(sb, 0444, NO_SB_OPT, \ + s64, OPT_UINT(0, S64_MAX)) \ #define BCH_OPTS() \ BCH_OPT(read_only, 0444, NO_SB_OPT, \ s8, OPT_BOOL()) \ + BCH_OPT(nostart, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ 
BCH_VISIBLE_OPTS() struct bch_opts { @@ -145,6 +151,8 @@ static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src) #undef BCH_OPT } +#define opt_defined(_opt) ((_opt) >= 0) + void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64); struct bch_opts bch_sb_opts(struct bch_sb *); diff --git a/libbcache/super-io.c b/libbcache/super-io.c index be27d3ee..f50a5ee8 100644 --- a/libbcache/super-io.c +++ b/libbcache/super-io.c @@ -10,6 +10,7 @@ #include "vstructs.h" #include <linux/backing-dev.h> +#include <linux/sort.h> static inline void __bch_sb_layout_size_assert(void) { @@ -17,7 +18,7 @@ static inline void __bch_sb_layout_size_assert(void) } struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb, - enum bch_sb_field_types type) + enum bch_sb_field_type type) { struct bch_sb_field *f; @@ -34,7 +35,7 @@ void bch_free_super(struct bcache_superblock *sb) if (sb->bio) bio_put(sb->bio); if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(sb->bdev, sb->mode); free_pages((unsigned long) sb->sb, sb->page_order); memset(sb, 0, sizeof(*sb)); @@ -74,7 +75,7 @@ static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order) return 0; } -int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s) +static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s) { u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s); u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; @@ -140,13 +141,29 @@ static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb, le32_add_cpu(&sb->u64s, u64s - old_u64s); return f; +} + +struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb, + enum bch_sb_field_type type, + unsigned u64s) +{ + struct bch_sb_field *f = bch_sb_field_get(sb->sb, type); + ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) + return NULL; + + f = __bch_sb_field_resize(sb->sb, f, u64s); + f->type = type; + return f; } struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, - struct bch_sb_field *f, + enum bch_sb_field_type type, unsigned u64s) { + struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type); ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; ssize_t d = -old_u64s + u64s; struct cache *ca; @@ -160,26 +177,15 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, for_each_cache(ca, c, i) { struct bcache_superblock *sb = &ca->disk_sb; - if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { percpu_ref_put(&ca->ref); return NULL; } } - return __bch_sb_field_resize(c->disk_sb, f, u64s); -} - -struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb, - struct bch_sb_field *f, - unsigned u64s) -{ - ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - - if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) - return NULL; - - return __bch_sb_field_resize(sb->sb, f, u64s); + f = __bch_sb_field_resize(c->disk_sb, f, u64s); + f->type = type; + return f; } static const char *validate_sb_layout(struct bch_sb_layout *layout) @@ -203,9 +209,6 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) prev_offset = le64_to_cpu(layout->sb_offset[0]); - if (prev_offset != BCH_SB_SECTOR) - return "Invalid superblock layout: doesn't have default superblock location"; - for (i = 1; i < layout->nr_superblocks; i++) { offset = le64_to_cpu(layout->sb_offset[i]); @@ -217,16 +220,70 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout) return NULL; } +static int u64_cmp(const void *_l, const void *_r) +{ + u64 l = *((const u64 *) _l), r = *((const u64 *) _r); + + return l < r ? -1 : l > r ? 1 : 0; +} + +const char *bch_validate_journal_layout(struct bch_sb *sb, + struct cache_member_cpu mi) +{ + struct bch_sb_field_journal *journal; + const char *err; + unsigned nr; + unsigned i; + u64 *b; + + journal = bch_sb_get_journal(sb); + if (!journal) + return NULL; + + nr = bch_nr_journal_buckets(journal); + if (!nr) + return NULL; + + b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); + if (!b) + return "cannot allocate memory"; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + err = "journal bucket at sector 0"; + if (!b[0]) + goto err; + + err = "journal bucket before first bucket"; + if (b[0] < mi.first_bucket) + goto err; + + err = "journal bucket past end of device"; + if (b[nr - 1] >= mi.nbuckets) + goto err; + + err = "duplicate journal buckets"; + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) + goto err; + + err = NULL; +err: + kfree(b); + return err; +} + const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *sb_mi; - struct bch_sb_field_journal *journal; struct cache_member_cpu mi; const char *err; u16 block_size; - unsigned i; switch (le64_to_cpu(sb->version)) { case BCACHE_SB_VERSION_CDEV_V4: @@ -324,14 +381,6 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx); - for (i = 0; i < sb->layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(sb->layout.sb_offset[i]); - u64 max_size = 1 << sb->layout.sb_max_size_bits; - - if (offset + max_size > mi.first_bucket * mi.bucket_size) - return "Invalid superblock: first bucket comes before end of super"; - } - if (mi.nbuckets > LONG_MAX) return "Too many buckets"; @@ -347,16 +396,9 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) mi.bucket_size * mi.nbuckets) return "Invalid superblock: device too small"; - /* Validate journal buckets: */ - journal = bch_sb_get_journal(sb); - if (journal) { - for (i = 0; i < bch_nr_journal_buckets(journal); i++) { - u64 b = le64_to_cpu(journal->buckets[i]); - - if (b < mi.first_bucket || b >= mi.nbuckets) - return "bad journal bucket"; - } - } + err = bch_validate_journal_layout(sb, mi); + if (err) + return err; return NULL; } @@ -382,19 +424,19 @@ static bool bch_is_open_cache(struct block_device *bdev) static bool bch_is_open(struct block_device *bdev) { - lockdep_assert_held(&bch_register_lock); + bool ret; + + mutex_lock(&bch_register_lock); + ret = bch_is_open_cache(bdev) || 
bch_is_open_backing_dev(bdev); + mutex_unlock(&bch_register_lock); - return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); + return ret; } -static const char *bch_blkdev_open(const char *path, void *holder, - struct bch_opts opts, - struct block_device **ret) +static const char *bch_blkdev_open(const char *path, fmode_t mode, + void *holder, struct block_device **ret) { struct block_device *bdev; - fmode_t mode = opts.nochanges > 0 - ? FMODE_READ - : FMODE_READ|FMODE_WRITE|FMODE_EXCL; const char *err; *ret = NULL; @@ -548,7 +590,7 @@ int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca) unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; int ret; - ret = bch_dev_sb_realloc(&ca->disk_sb, u64s); + ret = bch_sb_realloc(&ca->disk_sb, u64s); if (ret) return ret; @@ -567,7 +609,7 @@ static const char *read_one_super(struct bcache_superblock *sb, u64 offset) reread: bio_reset(sb->bio); sb->bio->bi_bdev = sb->bdev; - sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR; + sb->bio->bi_iter.bi_sector = offset; sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch_bio_map(sb->bio, sb->sb); @@ -610,15 +652,21 @@ const char *bch_read_super(struct bcache_superblock *sb, struct bch_opts opts, const char *path) { + u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR; struct bch_sb_layout layout; const char *err; unsigned i; - lockdep_assert_held(&bch_register_lock); - memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; + + if (!(opt_defined(opts.noexcl) && opts.noexcl)) + sb->mode |= FMODE_EXCL; - err = bch_blkdev_open(path, &sb, opts, &sb->bdev); + if (!(opt_defined(opts.nochanges) && opts.nochanges)) + sb->mode |= FMODE_WRITE; + + err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev); if (err) return err; @@ -630,11 +678,16 @@ const char *bch_read_super(struct bcache_superblock *sb, if (bch_fs_init_fault("read_super")) goto err; - err = read_one_super(sb, BCH_SB_SECTOR); + err = read_one_super(sb, offset); if (!err) goto got_super; - pr_err("error reading default super: %s", err); + if (offset != BCH_SB_SECTOR) { + pr_err("error reading superblock: %s", err); + goto err; + } + + pr_err("error reading default superblock: %s", err); /* * Error reading primary superblock - read location of backup @@ -747,6 +800,9 @@ void bch_write_super(struct cache_set *c) lockdep_assert_held(&c->sb_lock); + if (c->opts.nochanges) + return; + closure_init_stack(cl); le64_add_cpu(&c->disk_sb->seq, 1); diff --git a/libbcache/super-io.h b/libbcache/super-io.h index 665de811..ae1e8b9d 100644 --- a/libbcache/super-io.h +++ b/libbcache/super-io.h @@ -6,16 +6,35 @@ #include <asm/byteorder.h> -struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types); - -#define BCH_SB_FIELD_TYPE(_name) \ -static inline struct bch_sb_field_##_name * \ -bch_sb_get_##_name(struct bch_sb *sb) \ -{ \ - struct bch_sb_field *f = \ - bch_sb_field_get(sb, BCH_SB_FIELD_##_name); \ - \ - return container_of_or_null(f, struct bch_sb_field_##_name, field);\ +struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type); +struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *, + enum bch_sb_field_type, unsigned); +struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, + enum bch_sb_field_type, unsigned); + +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + +#define BCH_SB_FIELD_TYPE(_name) \ +static inline struct bch_sb_field_##_name * \ +bch_sb_get_##_name(struct 
bch_sb *sb) \ +{ \ + return field_to_type(bch_sb_field_get(sb, \ + BCH_SB_FIELD_##_name), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \ +{ \ + return field_to_type(bch_sb_field_resize(sb, \ + BCH_SB_FIELD_##_name, u64s), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch_fs_sb_resize_##_name(struct cache_set *c, unsigned u64s) \ +{ \ + return field_to_type(bch_fs_sb_field_resize(c, \ + BCH_SB_FIELD_##_name, u64s), _name); \ } BCH_SB_FIELD_TYPE(journal); @@ -85,14 +104,11 @@ int bch_fs_mi_update(struct cache_set *, struct bch_member *, unsigned); int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *); int bch_sb_from_cache_set(struct cache_set *, struct cache *); -struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, - struct bch_sb_field *, unsigned); -struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *, - struct bch_sb_field *, unsigned); - void bch_free_super(struct bcache_superblock *); int bch_super_realloc(struct bcache_superblock *, unsigned); +const char *bch_validate_journal_layout(struct bch_sb *, + struct cache_member_cpu); const char *bch_validate_cache_super(struct bcache_superblock *); const char *bch_read_super(struct bcache_superblock *, diff --git a/libbcache/super.c b/libbcache/super.c index fab34805..5535639c 100644 --- a/libbcache/super.c +++ b/libbcache/super.c @@ -69,7 +69,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); struct workqueue_struct *bcache_io_wq; struct crypto_shash *bch_sha256; -static void bch_dev_stop(struct cache *); +static void bch_dev_free(struct cache *); static int bch_dev_online(struct cache *); static int bch_congested_fn(void *data, int bdi_bits) @@ -92,8 +92,11 @@ static int bch_congested_fn(void *data, int bdi_bits) } } } else { - /* Writes only go to tier 0: */ - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { + /* Writes prefer fastest tier: */ + struct bch_tier *tier = READ_ONCE(c->fastest_tier); + struct cache_group *grp = tier ? 
&tier->devs : &c->cache_all; + + group_for_each_cache_rcu(ca, grp, i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -107,7 +110,7 @@ static int bch_congested_fn(void *data, int bdi_bits) return ret; } -/* Cache set RO/RW: */ +/* Filesystem RO/RW: */ /* * For startup/shutdown of RW stuff, the dependencies are: @@ -129,9 +132,7 @@ static void __bch_fs_read_only(struct cache_set *c) struct cache *ca; unsigned i; - c->tiering_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&c->tiering_pd.rate); - bch_tiering_read_stop(c); + bch_tiering_stop(c); for_each_cache(ca, c, i) bch_moving_gc_stop(ca); @@ -143,20 +144,7 @@ static void __bch_fs_read_only(struct cache_set *c) for_each_cache(ca, c, i) bch_dev_allocator_stop(ca); - /* - * Write a journal entry after flushing the btree, so we don't end up - * replaying everything we just flushed: - */ - if (test_bit(JOURNAL_STARTED, &c->journal.flags)) { - int ret; - - bch_journal_flush_async(&c->journal, NULL); - ret = bch_journal_meta(&c->journal); - BUG_ON(ret && !bch_journal_error(&c->journal)); - } - - cancel_delayed_work_sync(&c->journal.write_work); - cancel_delayed_work_sync(&c->journal.reclaim_work); + bch_fs_journal_stop(&c->journal); } static void bch_writes_disabled(struct percpu_ref *writes) @@ -167,12 +155,27 @@ static void bch_writes_disabled(struct percpu_ref *writes) wake_up(&bch_read_only_wait); } -static void bch_fs_read_only_work(struct work_struct *work) +void bch_fs_read_only(struct cache_set *c) { - struct cache_set *c = - container_of(work, struct cache_set, read_only_work); + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RW) + goto out; + + if (test_bit(BCH_FS_ERROR, &c->flags)) + goto out; - percpu_ref_put(&c->writes); + trace_fs_read_only(c); + + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: + * + * (This is really blocking new _allocations_, writes to previously + * allocated space can still happen until stopping the allocator in + * bch_dev_allocator_stop()). 
+ */ + percpu_ref_kill(&c->writes); del_timer(&c->foreground_write_wakeup); cancel_delayed_work(&c->pd_controllers_update); @@ -180,98 +183,77 @@ static void bch_fs_read_only_work(struct work_struct *work) c->foreground_write_pd.rate.rate = UINT_MAX; bch_wake_delayed_writes((unsigned long) c); - if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - /* - * If we're not doing an emergency shutdown, we want to wait on - * outstanding writes to complete so they don't see spurious - * errors due to shutting down the allocator: - */ - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + /* + * If we're not doing an emergency shutdown, we want to wait on + * outstanding writes to complete so they don't see spurious errors due + * to shutting down the allocator: + * + * If we are doing an emergency shutdown outstanding writes may + * hang until we shutdown the allocator so we don't want to wait + * on outstanding writes before shutting everything down - but + * we do need to wait on them before returning and signalling + * that going RO is complete: + */ + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || + test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); - __bch_fs_read_only(c); + __bch_fs_read_only(c); - if (!bch_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags)) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb, true); - bch_write_super(c); - mutex_unlock(&c->sb_lock); - } - } else { - /* - * If we are doing an emergency shutdown outstanding writes may - * hang until we shutdown the allocator so we don't want to wait - * on outstanding writes before shutting everything down - but - * we do need to wait on them before returning and signalling - * that going RO is complete: - */ - __bch_fs_read_only(c); + wait_event(bch_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - wait_event(bch_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + if (!bch_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags)) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb, true); + bch_write_super(c); + mutex_unlock(&c->sb_lock); } + c->state = BCH_FS_RO; bch_notify_fs_read_only(c); trace_fs_read_only_done(c); - - set_bit(BCH_FS_RO_COMPLETE, &c->flags); - wake_up(&bch_read_only_wait); +out: + mutex_unlock(&c->state_lock); } -bool bch_fs_read_only(struct cache_set *c) +static void bch_fs_read_only_work(struct work_struct *work) { - if (test_and_set_bit(BCH_FS_RO, &c->flags)) - return false; - - trace_fs_read_only(c); - - percpu_ref_get(&c->writes); + struct cache_set *c = + container_of(work, struct cache_set, read_only_work); - /* - * Block new foreground-end write operations from starting - any new - * writes will return -EROFS: - * - * (This is really blocking new _allocations_, writes to previously - * allocated space can still happen until stopping the allocator in - * bch_dev_allocator_stop()). 
- */ - percpu_ref_kill(&c->writes); + bch_fs_read_only(c); +} - queue_work(system_freezable_wq, &c->read_only_work); - return true; +static void bch_fs_read_only_async(struct cache_set *c) +{ + queue_work(system_long_wq, &c->read_only_work); } bool bch_fs_emergency_read_only(struct cache_set *c) { bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); - bch_fs_read_only(c); + bch_fs_read_only_async(c); bch_journal_halt(&c->journal); wake_up(&bch_read_only_wait); return ret; } -void bch_fs_read_only_sync(struct cache_set *c) -{ - /* so we don't race with bch_fs_read_write() */ - lockdep_assert_held(&bch_register_lock); - - bch_fs_read_only(c); - - wait_event(bch_read_only_wait, - test_bit(BCH_FS_RO_COMPLETE, &c->flags) && - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -} - -static const char *__bch_fs_read_write(struct cache_set *c) +const char *bch_fs_read_write(struct cache_set *c) { struct cache *ca; - const char *err; + const char *err = NULL; unsigned i; - lockdep_assert_held(&bch_register_lock); + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STARTING && + c->state != BCH_FS_RO) + goto out; err = "error starting allocator thread"; for_each_cache(ca, c, i) @@ -285,67 +267,43 @@ static const char *__bch_fs_read_write(struct cache_set *c) if (bch_gc_thread_start(c)) goto err; - for_each_cache(ca, c, i) { - if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) - continue; - - err = "error starting moving GC thread"; - if (bch_moving_gc_thread_start(ca)) { + err = "error starting moving GC thread"; + for_each_cache(ca, c, i) + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && + bch_moving_gc_start(ca)) { percpu_ref_put(&ca->ref); goto err; } - } err = "error starting tiering thread"; - if (bch_tiering_read_start(c)) + if (bch_tiering_start(c)) goto err; schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); - return NULL; + if (c->state != BCH_FS_STARTING) + percpu_ref_reinit(&c->writes); + + c->state = BCH_FS_RW; + err = NULL; +out: + mutex_unlock(&c->state_lock); + return err; err: __bch_fs_read_only(c); - return err; -} - -const char *bch_fs_read_write(struct cache_set *c) -{ - const char *err; - - lockdep_assert_held(&bch_register_lock); - - if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags)) - return NULL; - - err = __bch_fs_read_write(c); - if (err) - return err; - - percpu_ref_reinit(&c->writes); - - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - clear_bit(BCH_FS_EMERGENCY_RO, &c->flags); - clear_bit(BCH_FS_RO_COMPLETE, &c->flags); - clear_bit(BCH_FS_RO, &c->flags); - return NULL; + goto out; } -/* Cache set startup/shutdown: */ +/* Filesystem startup/shutdown: */ static void bch_fs_free(struct cache_set *c) { - del_timer_sync(&c->foreground_write_wakeup); - cancel_delayed_work_sync(&c->pd_controllers_update); - cancel_work_sync(&c->read_only_work); - cancel_work_sync(&c->bio_submit_work); - cancel_work_sync(&c->read_retry_work); - - bch_fs_encryption_free(c); - bch_btree_cache_free(c); - bch_journal_free(&c->journal); + bch_fs_encryption_exit(c); + bch_fs_btree_exit(c); + bch_fs_journal_exit(&c->journal); bch_io_clock_exit(&c->io_clock[WRITE]); bch_io_clock_exit(&c->io_clock[READ]); - bch_compress_free(c); + bch_fs_compress_exit(c); bch_fs_blockdev_exit(c); bdi_destroy(&c->bdi); lg_lock_free(&c->bucket_stats_lock); @@ -372,6 +330,52 @@ static void bch_fs_free(struct cache_set *c) module_put(THIS_MODULE); } +static void bch_fs_exit(struct cache_set *c) +{ + unsigned i; + + del_timer_sync(&c->foreground_write_wakeup); + 
cancel_delayed_work_sync(&c->pd_controllers_update); + cancel_work_sync(&c->read_only_work); + cancel_work_sync(&c->bio_submit_work); + cancel_work_sync(&c->read_retry_work); + + for (i = 0; i < c->sb.nr_devices; i++) + if (c->cache[i]) + bch_dev_free(c->cache[i]); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void bch_fs_offline(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + mutex_lock(&bch_register_lock); + list_del(&c->list); + mutex_unlock(&bch_register_lock); + + if (c->kobj.state_in_sysfs) + kobject_del(&c->kobj); + + for_each_cache(ca, c, i) + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + + bch_fs_debug_exit(c); + bch_fs_chardev_exit(c); + + bch_cache_accounting_destroy(&c->accounting); + + kobject_put(&c->time_stats); + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + + __bch_fs_read_only(c); +} + /* * should be __bch_fs_stop4 - block devices are closed, now we can finally * free it @@ -379,15 +383,9 @@ static void bch_fs_free(struct cache_set *c) void bch_fs_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - struct completion *stop_completion = c->stop_completion; bch_notify_fs_stopped(c); - bch_info(c, "stopped"); - bch_fs_free(c); - - if (stop_completion) - complete(stop_completion); } /* @@ -396,18 +394,8 @@ void bch_fs_release(struct kobject *kobj) static void __bch_fs_stop3(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); - struct cache *ca; - unsigned i; - mutex_lock(&bch_register_lock); - for_each_cache(ca, c, i) - bch_dev_stop(ca); - - list_del(&c->list); - mutex_unlock(&bch_register_lock); - - closure_debug_destroy(&c->cl); - kobject_put(&c->kobj); + bch_fs_exit(c); } /* @@ -418,28 +406,14 @@ static void __bch_fs_stop2(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - bch_debug_exit_cache_set(c); - bch_fs_chardev_exit(c); - - if (c->kobj.state_in_sysfs) - kobject_del(&c->kobj); - - bch_cache_accounting_destroy(&c->accounting); - - kobject_put(&c->time_stats); - kobject_put(&c->opts_dir); - kobject_put(&c->internal); - - mutex_lock(&bch_register_lock); - bch_fs_read_only_sync(c); - mutex_unlock(&bch_register_lock); + bch_fs_offline(c); closure_return(cl); } /* - * First phase of the shutdown process that's kicked off by bch_fs_stop(); we - * haven't waited for anything to stop yet, we're just punting to process + * First phase of the shutdown process that's kicked off by bch_fs_stop_async(); + * we haven't waited for anything to stop yet, we're just punting to process * context to shut down block devices: */ static void __bch_fs_stop1(struct closure *cl) @@ -451,29 +425,42 @@ static void __bch_fs_stop1(struct closure *cl) continue_at(cl, __bch_fs_stop2, system_wq); } -void bch_fs_stop(struct cache_set *c) +void bch_fs_stop_async(struct cache_set *c) { - if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags)) + mutex_lock(&c->state_lock); + if (c->state != BCH_FS_STOPPING) { + c->state = BCH_FS_STOPPING; closure_queue(&c->caching); + } + mutex_unlock(&c->state_lock); } -void bch_fs_stop_sync(struct cache_set *c) +void bch_fs_stop(struct cache_set *c) { - DECLARE_COMPLETION_ONSTACK(complete); + mutex_lock(&c->state_lock); + BUG_ON(c->state == BCH_FS_STOPPING); + c->state = BCH_FS_STOPPING; + mutex_unlock(&c->state_lock); + + bch_blockdevs_stop(c); + + closure_sync(&c->caching); + closure_debug_destroy(&c->caching); + + bch_fs_offline(c); - c->stop_completion = &complete; - bch_fs_stop(c); 
closure_put(&c->cl); + closure_sync(&c->cl); - /* Killable? */ - wait_for_completion(&complete); + bch_fs_exit(c); + kobject_put(&c->kobj); } /* Stop, detaching from backing devices: */ void bch_fs_detach(struct cache_set *c) { if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags)) - bch_fs_stop(c); + bch_fs_stop_async(c); } static unsigned bch_fs_nr_devices(struct cache_set *c) @@ -520,6 +507,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->minor = -1; + mutex_init(&c->state_lock); mutex_init(&c->sb_lock); INIT_RADIX_TREE(&c->devices, GFP_KERNEL); mutex_init(&c->btree_cache_lock); @@ -534,8 +522,8 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BCH_TIME_STATS() #undef BCH_TIME_STAT - bch_open_buckets_init(c); - bch_tiering_init_cache_set(c); + bch_fs_allocator_init(c); + bch_fs_tiering_init(c); INIT_LIST_HEAD(&c->list); INIT_LIST_HEAD(&c->cached_devs); @@ -636,10 +624,10 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch_fs_blockdev_init(c) || bch_io_clock_init(&c->io_clock[READ]) || bch_io_clock_init(&c->io_clock[WRITE]) || - bch_journal_alloc(&c->journal, journal_entry_bytes) || - bch_btree_cache_alloc(c) || + bch_fs_journal_init(&c->journal, journal_entry_bytes) || + bch_fs_btree_init(c) || bch_fs_encryption_init(c) || - bch_compress_init(c) || + bch_fs_compress_init(c) || bch_check_set_has_compressed_data(c, c->opts.compression)) goto err; @@ -664,6 +652,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) closure_init(&c->caching, &c->cl); set_closure_fn(&c->caching, __bch_fs_stop1, system_wq); + closure_get(&c->cl); continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq); return c; err: @@ -671,7 +660,20 @@ err: return NULL; } -static int bch_fs_online(struct cache_set *c) +static struct cache_set *bch_fs_lookup(uuid_le uuid) +{ + struct cache_set *c; + + lockdep_assert_held(&bch_register_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) + return c; + + return NULL; +} + +static const char *__bch_fs_online(struct cache_set *c) { struct cache *ca; unsigned i; @@ -680,31 +682,58 @@ static int bch_fs_online(struct cache_set *c) lockdep_assert_held(&bch_register_lock); if (!list_empty(&c->list)) - return 0; + return NULL; - list_add(&c->list, &bch_fs_list); + if (bch_fs_lookup(c->sb.uuid)) + return "filesystem UUID already open"; ret = bch_fs_chardev_init(c); if (ret) - return ret; + return "error creating character device"; + + bch_fs_debug_init(c); if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || kobject_add(&c->internal, &c->kobj, "internal") || kobject_add(&c->opts_dir, &c->kobj, "options") || kobject_add(&c->time_stats, &c->kobj, "time_stats") || bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) - return -1; + return "error creating sysfs objects"; for_each_cache(ca, c, i) if (bch_dev_online(ca)) { percpu_ref_put(&ca->ref); - return -1; + return "error creating sysfs objects"; } + mutex_lock(&c->state_lock); + + if (bch_blockdev_volumes_start(c)) { + mutex_unlock(&c->state_lock); + return "can't bring up blockdev volumes"; + } + + bch_attach_backing_devs(c); + + mutex_unlock(&c->state_lock); + + list_add(&c->list, &bch_fs_list); + return 0; } -static const char *bch_fs_start(struct cache_set *c) +static const char *bch_fs_online(struct cache_set *c) +{ + const char *err; + + mutex_lock(&bch_register_lock); + err = __bch_fs_online(c); + mutex_unlock(&bch_register_lock); 
+ + return err; +} + +static const char *__bch_fs_start(struct cache_set *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; @@ -715,11 +744,7 @@ static const char *bch_fs_start(struct cache_set *c) struct jset *j; int ret = -EINVAL; - lockdep_assert_held(&bch_register_lock); - BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags)); - - /* We don't want bch_fatal_error() to free underneath us */ - closure_get(&c->caching); + BUG_ON(c->state != BCH_FS_STARTING); /* * Make sure that each cache object's mi is up to date before @@ -826,6 +851,16 @@ static const char *bch_fs_start(struct cache_set *c) bch_notice(c, "initializing new filesystem"); + bch_initial_gc(c, NULL); + + err = "error starting allocator thread"; + for_each_cache(ca, c, i) + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && + bch_dev_allocator_start(ca)) { + percpu_ref_put(&ca->ref); + goto err; + } + err = "unable to allocate journal buckets"; for_each_cache(ca, c, i) if (bch_dev_journal_alloc(ca)) { @@ -833,8 +868,6 @@ static const char *bch_fs_start(struct cache_set *c) goto err; } - bch_initial_gc(c, NULL); - /* * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: @@ -842,14 +875,6 @@ static const char *bch_fs_start(struct cache_set *c) bch_journal_start(c); bch_journal_set_replay_done(&c->journal); - err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->ref); - goto err; - } - err = "cannot allocate new btree root"; for (id = 0; id < BTREE_ID_NR; id++) if (bch_btree_root_alloc(c, id, &cl)) { @@ -877,10 +902,14 @@ static const char *bch_fs_start(struct cache_set *c) goto err; } recovery_done: + err = "dynamic fault"; + if (bch_fs_init_fault("fs_start")) + goto err; + if (c->opts.read_only) { - bch_fs_read_only_sync(c); + bch_fs_read_only(c); } else { - err = __bch_fs_read_write(c); + err = bch_fs_read_write(c); if (err) goto err; } @@ -901,27 +930,9 @@ recovery_done: bch_write_super(c); mutex_unlock(&c->sb_lock); - err = "dynamic fault"; - if (bch_fs_init_fault("fs_start")) - goto err; - - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err; - - err = "can't bring up blockdev volumes"; - if (bch_blockdev_volumes_start(c)) - goto err; - - bch_debug_init_cache_set(c); - set_bit(BCH_FS_RUNNING, &c->flags); - bch_attach_backing_devs(c); - - bch_notify_fs_read_write(c); err = NULL; out: bch_journal_entries_free(&journal); - closure_put(&c->caching); return err; err: switch (ret) { @@ -955,6 +966,11 @@ err: goto out; } +const char *bch_fs_start(struct cache_set *c) +{ + return __bch_fs_start(c) ?: bch_fs_online(c); +} + static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c) { struct bch_sb_field_members *sb_mi; @@ -999,7 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c) return NULL; } -/* Cache device */ +/* Device startup/shutdown, ro/rw: */ bool bch_dev_read_only(struct cache *ca) { @@ -1009,14 +1025,14 @@ bool bch_dev_read_only(struct cache *ca) bdevname(ca->disk_sb.bdev, buf); - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) return false; if (!bch_dev_may_remove(ca)) { bch_err(c, "required member %s going RO, forcing fs RO", buf); - bch_fs_read_only_sync(c); + bch_fs_read_only(c); } trace_bcache_cache_read_only(ca); @@ -1053,7 +1069,7 @@ bool bch_dev_read_only(struct cache *ca) static 
const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) { - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) return NULL; @@ -1066,12 +1082,11 @@ static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) if (bch_dev_allocator_start(ca)) return "error starting allocator thread"; - if (bch_moving_gc_thread_start(ca)) + if (bch_moving_gc_start(ca)) return "error starting moving GC thread"; - bch_dev_group_add(&c->journal.devs, ca); - - wake_up_process(c->tiering_read); + if (bch_tiering_start(c)) + return "error starting tiering thread"; bch_notify_dev_read_write(ca); trace_bcache_cache_read_write_done(ca); @@ -1099,22 +1114,15 @@ const char *bch_dev_read_write(struct cache *ca) return NULL; } -/* - * bch_dev_stop has already returned, so we no longer hold the register - * lock at the point this is called. - */ - void bch_dev_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); - percpu_ref_exit(&ca->ref); kfree(ca); } -static void bch_dev_free_work(struct work_struct *work) +static void bch_dev_free(struct cache *ca) { - struct cache *ca = container_of(work, struct cache, free_work); struct cache_set *c = ca->set; unsigned i; @@ -1131,15 +1139,7 @@ static void bch_dev_free_work(struct work_struct *work) kobject_del(&ca->kobj); bch_free_super(&ca->disk_sb); - - /* - * bch_dev_stop can be called in the middle of initialization - * of the struct cache object. - * As such, not all the sub-structures may be initialized. - * However, they were zeroed when the object was allocated. - */ - - bch_journal_free_cache(ca); + bch_dev_journal_exit(ca); free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); free_percpu(ca->bucket_stats_percpu); @@ -1155,12 +1155,20 @@ static void bch_dev_free_work(struct work_struct *work) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); + percpu_ref_exit(&ca->ref); kobject_put(&ca->kobj); if (c) kobject_put(&c->kobj); } +static void bch_dev_free_work(struct work_struct *work) +{ + struct cache *ca = container_of(work, struct cache, free_work); + + bch_dev_free(ca); +} + static void bch_dev_percpu_ref_release(struct percpu_ref *ref) { struct cache *ca = container_of(ref, struct cache, ref); @@ -1193,12 +1201,10 @@ static void bch_dev_stop(struct cache *ca) { struct cache_set *c = ca->set; - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->state_lock); - if (c) { - BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); - rcu_assign_pointer(c->cache[ca->dev_idx], NULL); - } + BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); + rcu_assign_pointer(c->cache[ca->dev_idx], NULL); call_rcu(&ca->free_rcu, bch_dev_free_rcu); } @@ -1281,7 +1287,8 @@ static void bch_dev_remove_work(struct work_struct *work) */ closure_get(&c->cl); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); + bch_dev_stop(ca); /* @@ -1290,8 +1297,6 @@ static void bch_dev_remove_work(struct work_struct *work) */ synchronize_rcu(); - lockdep_assert_held(&bch_register_lock); - /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: @@ -1301,23 +1306,20 @@ static void bch_dev_remove_work(struct work_struct *work) memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); bch_write_super(c); - mutex_unlock(&c->sb_lock); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); closure_put(&c->cl); } -bool 
bch_dev_remove(struct cache *ca, bool force) +static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force) { - mutex_lock(&bch_register_lock); - if (test_bit(BCH_DEV_REMOVING, &ca->flags)) return false; if (!bch_dev_may_remove(ca)) { - bch_err(ca->set, "Can't remove last device in tier %u", - ca->mi.tier); + bch_err(ca->set, "Can't remove last RW device"); bch_notify_dev_remove_failed(ca); return false; } @@ -1327,23 +1329,32 @@ bool bch_dev_remove(struct cache *ca, bool force) if (force) set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags); + set_bit(BCH_DEV_REMOVING, &ca->flags); bch_notify_dev_removing(ca); - mutex_unlock(&bch_register_lock); - /* Migrate the data and finish removal asynchronously: */ queue_work(system_long_wq, &ca->remove_work); return true; } +bool bch_dev_remove(struct cache *ca, bool force) +{ + struct cache_set *c = ca->set; + bool ret; + + mutex_lock(&c->state_lock); + ret = __bch_dev_remove(c, ca, force); + mutex_unlock(&c->state_lock); + + return ret; +} + static int bch_dev_online(struct cache *ca) { char buf[12]; - lockdep_assert_held(&bch_register_lock); - sprintf(buf, "cache%u", ca->dev_idx); if (kobject_add(&ca->kobj, @@ -1386,7 +1397,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, kobject_init(&ca->kobj, &bch_dev_ktype); spin_lock_init(&ca->self.lock); - ca->self.nr_devices = 1; + ca->self.nr = 1; rcu_assign_pointer(ca->self.d[0].dev, ca); ca->dev_idx = sb->sb->dev_idx; @@ -1395,10 +1406,11 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->prio_buckets_lock); mutex_init(&ca->heap_lock); - bch_moving_init_cache(ca); + bch_dev_moving_gc_init(ca); ca->disk_sb = *sb; - ca->disk_sb.bdev->bd_holder = ca; + if (sb->mode & FMODE_EXCL) + ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work); @@ -1444,7 +1456,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio)) || !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_journal_init_cache(ca)) + bch_dev_journal_init(ca)) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -1482,7 +1494,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, err = "error creating kobject"; if (c->kobj.state_in_sysfs && bch_dev_online(ca)) - goto err; + pr_warn("error creating sysfs objects"); if (ret) *ret = ca; @@ -1490,49 +1502,34 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, kobject_put(&ca->kobj); return NULL; err: - bch_dev_stop(ca); + bch_dev_free(ca); return err; } -static struct cache_set *bch_fs_lookup(uuid_le uuid) -{ - struct cache_set *c; - - lockdep_assert_held(&bch_register_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) - return c; - - return NULL; -} - int bch_dev_add(struct cache_set *c, const char *path) { struct bcache_superblock sb; const char *err; struct cache *ca; - struct bch_sb_field *f; struct bch_sb_field_members *mi, *dev_mi; struct bch_member saved_mi; unsigned dev_idx, nr_devices, u64s; int ret = -EINVAL; - mutex_lock(&bch_register_lock); - err = bch_read_super(&sb, c->opts, path); if (err) - goto err_unlock_register; + return -EINVAL; err = bch_validate_cache_super(&sb); if (err) - goto err_unlock_register; - - mutex_lock(&c->sb_lock); + return -EINVAL; err = bch_dev_may_add(sb.sb, c); if (err) - goto err_unlock; + return 
-EINVAL; + + mutex_lock(&c->state_lock); + mutex_lock(&c->sb_lock); /* * Preserve the old cache member information (esp. tier) @@ -1571,17 +1568,14 @@ have_slot: sizeof(struct bch_member) * nr_devices) / sizeof(u64); err = "no space in superblock for member info"; - f = bch_fs_sb_field_resize(c, &mi->field, u64s); - if (!f) + mi = bch_fs_sb_resize_members(c, u64s); + if (!mi) goto err_unlock; - mi = container_of(f, struct bch_sb_field_members, field); - - f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s); - if (!f) + dev_mi = bch_sb_resize_members(&sb, u64s); + if (!dev_mi) goto err_unlock; - dev_mi = container_of(f, struct bch_sb_field_members, field); memcpy(dev_mi, mi, u64s * sizeof(u64)); dev_mi->members[dev_idx] = saved_mi; @@ -1619,14 +1613,13 @@ have_slot: kobject_put(&ca->kobj); mutex_unlock(&c->sb_lock); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); return 0; err_put: bch_dev_stop(ca); err_unlock: mutex_unlock(&c->sb_lock); -err_unlock_register: - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); bch_free_super(&sb); bch_err(c, "Unable to add device: %s", err); @@ -1639,11 +1632,8 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, const char *err; struct cache_set *c = NULL; struct bcache_superblock *sb; - uuid_le uuid; unsigned i; - memset(&uuid, 0, sizeof(uuid_le)); - if (!nr_devices) return "need at least one device"; @@ -1655,60 +1645,49 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, if (!sb) goto err; - /* - * bch_read_super() needs to happen under register_lock, so that the - * exclusive open is atomic with adding the new cache set to the list of - * cache sets: - */ - mutex_lock(&bch_register_lock); - for (i = 0; i < nr_devices; i++) { err = bch_read_super(&sb[i], opts, devices[i]); if (err) - goto err_unlock; + goto err; err = "attempting to register backing device"; if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version))) - goto err_unlock; + goto err; err = bch_validate_cache_super(&sb[i]); if (err) - goto err_unlock; + goto err; } - err = "cache set already registered"; - if (bch_fs_lookup(sb->sb->uuid)) - goto err_unlock; - err = "cannot allocate memory"; c = bch_fs_alloc(sb[0].sb, opts); if (!c) - goto err_unlock; + goto err; for (i = 0; i < nr_devices; i++) { err = bch_dev_alloc(&sb[i], c, NULL); if (err) - goto err_unlock; + goto err; } err = "insufficient devices"; if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c)) - goto err_unlock; + goto err; - err = bch_fs_start(c); - if (err) - goto err_unlock; + if (!c->opts.nostart) { + err = __bch_fs_start(c); + if (err) + goto err; + } - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err_unlock; + err = bch_fs_online(c); + if (err) + goto err; - if (ret) { - closure_get(&c->cl); + if (ret) *ret = c; - } - - mutex_unlock(&bch_register_lock); + else + closure_put(&c->cl); err = NULL; out: @@ -1717,20 +1696,18 @@ out: if (err) c = NULL; return err; -err_unlock: +err: if (c) bch_fs_stop(c); - mutex_unlock(&bch_register_lock); -err: + for (i = 0; i < nr_devices; i++) bch_free_super(&sb[i]); goto out; } static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, - struct bch_opts opts) + struct bch_opts opts) { - char name[BDEVNAME_SIZE]; const char *err; struct cache_set *c; bool allocated_cache_set = false; @@ -1739,17 +1716,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) return err; - bdevname(sb->bdev, name); - + mutex_lock(&bch_register_lock); c = 
bch_fs_lookup(sb->sb->uuid); if (c) { + closure_get(&c->cl); + err = bch_dev_in_fs(sb->sb, c); if (err) - return err; + goto err; } else { c = bch_fs_alloc(sb->sb, opts); + err = "cannot allocate memory"; if (!c) - return "cannot allocate memory"; + goto err; allocated_cache_set = true; } @@ -1758,21 +1737,29 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (err) goto err; - if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) { - err = bch_fs_start(c); + if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) && + !c->opts.nostart) { + err = __bch_fs_start(c); if (err) goto err; - } else { - err = "error creating kobject"; - if (bch_fs_online(c)) - goto err; } - bch_info(c, "started"); + err = __bch_fs_online(c); + if (err) + goto err; + + closure_put(&c->cl); + mutex_unlock(&bch_register_lock); + return NULL; err: + mutex_unlock(&bch_register_lock); + if (allocated_cache_set) bch_fs_stop(c); + else if (c) + closure_put(&c->cl); + return err; } @@ -1782,20 +1769,20 @@ const char *bch_fs_open_incremental(const char *path) struct bch_opts opts = bch_opts_empty(); const char *err; - mutex_lock(&bch_register_lock); - err = bch_read_super(&sb, opts, path); if (err) - goto err; + return err; - if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) + if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) { + mutex_lock(&bch_register_lock); err = bch_backing_dev_register(&sb); - else + mutex_unlock(&bch_register_lock); + } else { err = __bch_fs_open_incremental(&sb, opts); + } bch_free_super(&sb); -err: - mutex_unlock(&bch_register_lock); + return err; } @@ -1854,10 +1841,10 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) pr_info("Setting all devices read only:"); list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only(c); + bch_fs_read_only_async(c); list_for_each_entry(c, &bch_fs_list, list) - bch_fs_read_only_sync(c); + bch_fs_read_only(c); mutex_unlock(&bch_register_lock); } @@ -1882,7 +1869,7 @@ kobj_attribute_write(reboot, reboot_test); static void bcache_exit(void) { bch_debug_exit(); - bch_fs_exit(); + bch_vfs_exit(); bch_blockdev_exit(); bch_chardev_exit(); if (bcache_kset) @@ -1917,7 +1904,7 @@ static int __init bcache_init(void) sysfs_create_files(&bcache_kset->kobj, files) || bch_chardev_init() || bch_blockdev_init() || - bch_fs_init() || + bch_vfs_init() || bch_debug_init()) goto err; diff --git a/libbcache/super.h b/libbcache/super.h index bcf7d983..bafd88e0 100644 --- a/libbcache/super.h +++ b/libbcache/super.h @@ -57,27 +57,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c, static inline bool bch_dev_may_remove(struct cache *ca) { struct cache_set *c = ca->set; - struct cache_group *tier = &c->cache_tiers[ca->mi.tier]; - - /* - * Right now, we can't remove the last device from a tier, - * - For tier 0, because all metadata lives in tier 0 and because - * there is no way to have foreground writes go directly to tier 1. - * - For tier 1, because the code doesn't completely support an - * empty tier 1. - */ - - /* - * Turning a device read-only removes it from the cache group, - * so there may only be one read-write device in a tier, and yet - * the device we are removing is in the same tier, so we have - * to check for identity. - * Removing the last RW device from a tier requires turning the - * whole cache set RO. 
- */ - - return tier->nr_devices != 1 || - rcu_access_pointer(tier->d[0].dev) != ca; + struct cache_group *grp = &c->cache_all; + + /* Can't remove the last RW device: */ + return grp->nr != 1 || + rcu_access_pointer(grp->d[0].dev) != ca; } void bch_dev_release(struct kobject *); @@ -89,15 +73,15 @@ int bch_dev_add(struct cache_set *, const char *); void bch_fs_detach(struct cache_set *); -bool bch_fs_read_only(struct cache_set *); bool bch_fs_emergency_read_only(struct cache_set *); -void bch_fs_read_only_sync(struct cache_set *); +void bch_fs_read_only(struct cache_set *); const char *bch_fs_read_write(struct cache_set *); void bch_fs_release(struct kobject *); +void bch_fs_stop_async(struct cache_set *); void bch_fs_stop(struct cache_set *); -void bch_fs_stop_sync(struct cache_set *); +const char *bch_fs_start(struct cache_set *); const char *bch_fs_open(char * const *, unsigned, struct bch_opts, struct cache_set **); const char *bch_fs_open_incremental(const char *path); diff --git a/libbcache/super_types.h b/libbcache/super_types.h index 41eaf0dd..69c747de 100644 --- a/libbcache/super_types.h +++ b/libbcache/super_types.h @@ -6,6 +6,7 @@ struct bcache_superblock { struct block_device *bdev; struct bio *bio; unsigned page_order; + fmode_t mode; }; #endif /* _BCACHE_SUPER_TYPES_H */ diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c index 9f45a6b0..48f9f1f6 100644 --- a/libbcache/sysfs.c +++ b/libbcache/sysfs.c @@ -22,6 +22,7 @@ #include "opts.h" #include "request.h" #include "super-io.h" +#include "tier.h" #include "writeback.h" #include <linux/blkdev.h> @@ -121,6 +122,8 @@ rw_attribute(cache_replacement_policy); rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); + +rw_attribute(tier); rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); @@ -134,7 +137,6 @@ rw_attribute(foreground_target_percent); rw_attribute(size); read_attribute(meta_replicas_have); read_attribute(data_replicas_have); -read_attribute(tier); #define BCH_DEBUG_PARAM(name, description) \ rw_attribute(name); @@ -680,7 +682,8 @@ SHOW(bch_fs) sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); sysfs_print(tiering_percent, c->tiering_percent); - sysfs_pd_controller_show(tiering, &c->tiering_pd); + + sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */ sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have); sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have); @@ -694,7 +697,7 @@ SHOW(bch_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_RUNNING, &c->flags)) + if (!bch_fs_running(c)) return -EPERM; if (attr == &sysfs_bset_tree_stats) @@ -723,7 +726,7 @@ STORE(__bch_fs) } if (attr == &sysfs_stop) { - bch_fs_stop(c); + bch_fs_stop_async(c); return size; } @@ -773,25 +776,18 @@ STORE(__bch_fs) ssize_t ret = strtoul_safe(buf, c->tiering_enabled) ?: (ssize_t) size; - if (c->tiering_read) - wake_up_process(c->tiering_read); + bch_tiering_start(c); /* issue wakeups */ return ret; } sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd); - if (attr == &sysfs_journal_flush) { - bch_journal_meta_async(&c->journal, NULL); - - return size; - } - sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); sysfs_strtoul(tiering_percent, c->tiering_percent); - sysfs_pd_controller_store(tiering, &c->tiering_pd); + sysfs_pd_controller_store(tiering, 
&c->tiers[1].pd); /* XXX */ /* Debugging: */ @@ -799,11 +795,14 @@ STORE(__bch_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_RUNNING, &c->flags)) + if (!bch_fs_running(c)) return -EPERM; - if (test_bit(BCH_FS_STOPPING, &c->flags)) - return -EINTR; + if (attr == &sysfs_journal_flush) { + bch_journal_meta_async(&c->journal, NULL); + + return size; + } if (attr == &sysfs_blockdev_volume_create) { u64 v = strtoi_h_or_return(buf); @@ -836,9 +835,9 @@ STORE(bch_fs) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); - mutex_lock(&bch_register_lock); + mutex_lock(&c->state_lock); size = __bch_fs_store(kobj, attr, buf, size); - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->state_lock); if (attr == &sysfs_add_device) { char *path = kstrdup(buf, GFP_KERNEL); @@ -1273,6 +1272,31 @@ STORE(__bch_dev) mutex_unlock(&c->sb_lock); } + if (attr == &sysfs_tier) { + unsigned prev_tier; + unsigned v = strtoul_restrict_or_return(buf, + 0, BCH_TIER_MAX - 1); + + mutex_lock(&c->sb_lock); + prev_tier = ca->mi.tier; + + if (v == ca->mi.tier) { + mutex_unlock(&c->sb_lock); + return size; + } + + mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_TIER(mi, v); + bch_write_super(c); + + bch_dev_group_remove(&c->tiers[prev_tier].devs, ca); + bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca); + mutex_unlock(&c->sb_lock); + + bch_recalc_capacity(c); + bch_tiering_start(c); + } + if (attr == &sysfs_state_rw) { char name[BDEVNAME_SIZE]; const char *err = NULL; diff --git a/libbcache/tier.c b/libbcache/tier.c index 46864594..0ab17708 100644 --- a/libbcache/tier.c +++ b/libbcache/tier.c @@ -16,8 +16,7 @@ #include <trace/events/bcache.h> struct tiering_state { - struct cache_group *tier; - unsigned tier_idx; + struct bch_tier *tier; unsigned sectors; unsigned stripe_size; unsigned dev_idx; @@ -42,7 +41,7 @@ static bool tiering_pred(struct cache_set *c, mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) if (ptr->dev < mi->nr_devices && - mi->m[ptr->dev].tier >= s->tier_idx) + mi->m[ptr->dev].tier >= s->tier->idx) replicas++; cache_member_info_put(); @@ -69,15 +68,15 @@ static void tier_next_device(struct cache_set *c, struct tiering_state *s) s->sectors = 0; s->dev_idx++; - spin_lock(&s->tier->lock); - if (s->dev_idx >= s->tier->nr_devices) + spin_lock(&s->tier->devs.lock); + if (s->dev_idx >= s->tier->devs.nr) s->dev_idx = 0; - if (s->tier->nr_devices) { - s->ca = s->tier->d[s->dev_idx].dev; + if (s->tier->devs.nr) { + s->ca = s->tier->devs.d[s->dev_idx].dev; percpu_ref_get(&s->ca->ref); } - spin_unlock(&s->tier->lock); + spin_unlock(&s->tier->devs.lock); } } @@ -103,13 +102,13 @@ static int issue_tiering_move(struct cache_set *c, * tiering_next_cache - issue a move to write an extent to the next cache * device in round robin order */ -static s64 read_tiering(struct cache_set *c, struct cache_group *tier) +static s64 read_tiering(struct cache_set *c, struct bch_tier *tier) { struct moving_context ctxt; struct tiering_state s; struct btree_iter iter; struct bkey_s_c k; - unsigned nr_devices = READ_ONCE(tier->nr_devices); + unsigned nr_devices = READ_ONCE(tier->devs.nr); int ret; if (!nr_devices) @@ -119,10 +118,9 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier) memset(&s, 0, sizeof(s)); s.tier = tier; - s.tier_idx = tier - c->cache_tiers; s.stripe_size = 2048; /* 1 mb for now */ - bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate, + bch_move_ctxt_init(&ctxt, &tier->pd.rate, nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); 
bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); @@ -164,8 +162,8 @@ next: static int bch_tiering_thread(void *arg) { - struct cache_set *c = arg; - struct cache_group *tier = &c->cache_tiers[1]; + struct bch_tier *tier = arg; + struct cache_set *c = container_of(tier, struct cache_set, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct cache *ca; u64 tier_capacity, available_sectors; @@ -176,20 +174,20 @@ static int bch_tiering_thread(void *arg) while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - tier->nr_devices)) + tier->devs.nr)) break; while (1) { - struct cache_group *faster_tier; + struct bch_tier *faster_tier; last = atomic_long_read(&clock->now); tier_capacity = available_sectors = 0; rcu_read_lock(); - for (faster_tier = c->cache_tiers; + for (faster_tier = c->tiers; faster_tier != tier; faster_tier++) { - group_for_each_cache_rcu(ca, faster_tier, i) { + group_for_each_cache_rcu(ca, &faster_tier->devs, i) { tier_capacity += (ca->mi.nbuckets - ca->mi.first_bucket) << ca->bucket_bits; @@ -216,32 +214,73 @@ static int bch_tiering_thread(void *arg) return 0; } -void bch_tiering_init_cache_set(struct cache_set *c) +static void __bch_tiering_stop(struct bch_tier *tier) { - bch_pd_controller_init(&c->tiering_pd); + tier->pd.rate.rate = UINT_MAX; + bch_ratelimit_reset(&tier->pd.rate); + + if (tier->migrate) + kthread_stop(tier->migrate); + + tier->migrate = NULL; +} + +void bch_tiering_stop(struct cache_set *c) +{ + struct bch_tier *tier; + + for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) + __bch_tiering_stop(tier); +} + +static int __bch_tiering_start(struct bch_tier *tier) +{ + if (!tier->migrate) { + struct task_struct *p = + kthread_create(bch_tiering_thread, tier, + "bch_tier[%u]", tier->idx); + if (IS_ERR(p)) + return PTR_ERR(p); + + tier->migrate = p; + } + + wake_up_process(tier->migrate); + return 0; } -int bch_tiering_read_start(struct cache_set *c) +int bch_tiering_start(struct cache_set *c) { - struct task_struct *t; + struct bch_tier *tier; + bool have_faster_tier = false; if (c->opts.nochanges) return 0; - t = kthread_create(bch_tiering_thread, c, "bch_tier_read"); - if (IS_ERR(t)) - return PTR_ERR(t); + for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) { + if (!tier->devs.nr) + continue; - c->tiering_read = t; - wake_up_process(c->tiering_read); + if (have_faster_tier) { + int ret = __bch_tiering_start(tier); + if (ret) + return ret; + } else { + __bch_tiering_stop(tier); + } + + have_faster_tier = true; + } return 0; } -void bch_tiering_read_stop(struct cache_set *c) +void bch_fs_tiering_init(struct cache_set *c) { - if (!IS_ERR_OR_NULL(c->tiering_read)) { - kthread_stop(c->tiering_read); - c->tiering_read = NULL; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) { + c->tiers[i].idx = i; + bch_pd_controller_init(&c->tiers[i].pd); } } diff --git a/libbcache/tier.h b/libbcache/tier.h index 89c2bffd..b53e83d9 100644 --- a/libbcache/tier.h +++ b/libbcache/tier.h @@ -1,8 +1,8 @@ #ifndef _BCACHE_TIER_H #define _BCACHE_TIER_H -void bch_tiering_init_cache_set(struct cache_set *); -int bch_tiering_read_start(struct cache_set *); -void bch_tiering_read_stop(struct cache_set *); +void bch_tiering_stop(struct cache_set *); +int bch_tiering_start(struct cache_set *); +void bch_fs_tiering_init(struct cache_set *); #endif |
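
For reference, the bch_validate_journal_layout() introduced in super-io.c above sorts the superblock's journal bucket list and then validates it in one linear pass: reject a bucket at sector 0, any bucket outside [first_bucket, nbuckets), and duplicates among adjacent sorted entries. Below is a minimal standalone sketch of that same check, assuming plain libc types and names in place of the kernel helpers (qsort/u64_cmp stand in for the sort.h call); it is illustrative only, not the kernel code from the commit:

    /*
     * Simplified userspace restatement of the journal-layout validation:
     * copy and sort the bucket numbers, then bounds- and duplicate-check them.
     */
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static int u64_cmp(const void *l, const void *r)
    {
            uint64_t a = *(const uint64_t *) l, b = *(const uint64_t *) r;

            return a < b ? -1 : a > b ? 1 : 0;
    }

    const char *validate_journal_buckets(const uint64_t *buckets, unsigned nr,
                                         uint64_t first_bucket, uint64_t nbuckets)
    {
            const char *err = NULL;
            uint64_t *b;
            unsigned i;

            if (!nr)
                    return NULL;            /* no journal buckets is legal */

            b = malloc(nr * sizeof(*b));
            if (!b)
                    return "cannot allocate memory";

            memcpy(b, buckets, nr * sizeof(*b));
            qsort(b, nr, sizeof(*b), u64_cmp);

            if (!b[0])
                    err = "journal bucket at sector 0";
            else if (b[0] < first_bucket)
                    err = "journal bucket before first bucket";
            else if (b[nr - 1] >= nbuckets)
                    err = "journal bucket past end of device";
            else
                    for (i = 0; i + 1 < nr; i++)
                            if (b[i] == b[i + 1]) {
                                    err = "duplicate journal buckets";
                                    break;
                            }

            free(b);
            return err;
    }

Sorting first is what lets the range and duplicate checks run as a single scan over adjacent elements, which is also why the commit adds the <linux/sort.h> include to super-io.c.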