author | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-09 08:27:30 -0900 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-09 09:14:11 -0900 |
commit | ac1b32acb4ca8c59c0e4911a8d3b27fd72dc54af | (patch) |
tree | e73a6ea5ddb6f3ece6a3e6c069ffa9ecc5e1ee44 | /libbcache/super.c |
parent | a17f7bcec7ed810a247c24e56229af8f43a9a6ae | (diff) |
cmd_device_fail
Add a command for setting a device as failed, update bcache sources
Diffstat (limited to 'libbcache/super.c')
-rw-r--r-- | libbcache/super.c | 474 |
1 file changed, 231 insertions, 243 deletions
diff --git a/libbcache/super.c b/libbcache/super.c
index 5535639c..d2863e62 100644
--- a/libbcache/super.c
+++ b/libbcache/super.c
@@ -616,7 +616,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			c->sb.btree_node_size,
 			BCH_ENCODED_EXTENT_MAX) /
 			PAGE_SECTORS, 0) ||
-	    !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
+	    !(c->bucket_stats_percpu = alloc_percpu(struct bch_fs_usage)) ||
 	    lg_lock_init(&c->bucket_stats_lock) ||
 	    mempool_init_page_pool(&c->btree_bounce_pool, 1,
 			ilog2(btree_pages(c))) ||
@@ -1015,104 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
 	return NULL;
 }
 
-/* Device startup/shutdown, ro/rw: */
-
-bool bch_dev_read_only(struct cache *ca)
-{
-	struct cache_set *c = ca->set;
-	struct bch_sb_field_members *mi;
-	char buf[BDEVNAME_SIZE];
-
-	bdevname(ca->disk_sb.bdev, buf);
-
-	lockdep_assert_held(&c->state_lock);
-
-	if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
-		return false;
-
-	if (!bch_dev_may_remove(ca)) {
-		bch_err(c, "required member %s going RO, forcing fs RO", buf);
-		bch_fs_read_only(c);
-	}
-
-	trace_bcache_cache_read_only(ca);
-
-	bch_moving_gc_stop(ca);
-
-	/*
-	 * This stops new data writes (e.g. to existing open data
-	 * buckets) and then waits for all existing writes to
-	 * complete.
-	 */
-	bch_dev_allocator_stop(ca);
-
-	bch_dev_group_remove(&c->journal.devs, ca);
-
-	/*
-	 * Device data write barrier -- no non-meta-data writes should
-	 * occur after this point. However, writes to btree buckets,
-	 * journal buckets, and the superblock can still occur.
-	 */
-	trace_bcache_cache_read_only_done(ca);
-
-	bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
-	bch_notify_dev_read_only(ca);
-
-	mutex_lock(&c->sb_lock);
-	mi = bch_sb_get_members(c->disk_sb);
-	SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
-			     BCH_MEMBER_STATE_RO);
-	bch_write_super(c);
-	mutex_unlock(&c->sb_lock);
-	return true;
-}
-
-static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
-{
-	lockdep_assert_held(&c->state_lock);
-
-	if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
-		return NULL;
-
-	if (test_bit(BCH_DEV_REMOVING, &ca->flags))
-		return "removing";
-
-	trace_bcache_cache_read_write(ca);
-
-	if (bch_dev_allocator_start(ca))
-		return "error starting allocator thread";
-
-	if (bch_moving_gc_start(ca))
-		return "error starting moving GC thread";
-
-	if (bch_tiering_start(c))
-		return "error starting tiering thread";
-
-	bch_notify_dev_read_write(ca);
-	trace_bcache_cache_read_write_done(ca);
-
-	return NULL;
-}
-
-const char *bch_dev_read_write(struct cache *ca)
-{
-	struct cache_set *c = ca->set;
-	struct bch_sb_field_members *mi;
-	const char *err;
-
-	err = __bch_dev_read_write(c, ca);
-	if (err)
-		return err;
-
-	mutex_lock(&c->sb_lock);
-	mi = bch_sb_get_members(c->disk_sb);
-	SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
-			     BCH_MEMBER_STATE_ACTIVE);
-	bch_write_super(c);
-	mutex_unlock(&c->sb_lock);
-
-	return NULL;
-}
+/* Device startup/shutdown: */
 
 void bch_dev_release(struct kobject *kobj)
 {
@@ -1209,148 +1112,6 @@ static void bch_dev_stop(struct cache *ca)
 	call_rcu(&ca->free_rcu, bch_dev_free_rcu);
 }
 
-static void bch_dev_remove_work(struct work_struct *work)
-{
-	struct cache *ca = container_of(work, struct cache, remove_work);
-	struct bch_sb_field_members *mi;
-	struct cache_set *c = ca->set;
-	char name[BDEVNAME_SIZE];
-	bool force = test_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
-	unsigned dev_idx = ca->dev_idx;
-
-	bdevname(ca->disk_sb.bdev, name);
-
-	/*
-	 * Device should already be RO, now migrate data off:
-	 *
-	 * XXX: locking is sketchy, bch_dev_read_write() has to check
-	 * BCH_DEV_REMOVING bit
-	 */
-	if (!ca->mi.has_data) {
-		/* Nothing to do: */
-	} else if (!bch_move_data_off_device(ca)) {
-		mutex_lock(&c->sb_lock);
-		mi = bch_sb_get_members(c->disk_sb);
-		SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
-
-		bch_write_super(c);
-		mutex_unlock(&c->sb_lock);
-	} else if (force) {
-		bch_flag_data_bad(ca);
-
-		mutex_lock(&c->sb_lock);
-		mi = bch_sb_get_members(c->disk_sb);
-		SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
-
-		bch_write_super(c);
-		mutex_unlock(&c->sb_lock);
-	} else {
-		bch_err(c, "Remove of %s failed, unable to migrate data off",
-			name);
-		clear_bit(BCH_DEV_REMOVING, &ca->flags);
-		return;
-	}
-
-	/* Now metadata: */
-
-	if (!ca->mi.has_metadata) {
-		/* Nothing to do: */
-	} else if (!bch_move_meta_data_off_device(ca)) {
-		mutex_lock(&c->sb_lock);
-		mi = bch_sb_get_members(c->disk_sb);
-		SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
-
-		bch_write_super(c);
-		mutex_unlock(&c->sb_lock);
-	} else {
-		bch_err(c, "Remove of %s failed, unable to migrate metadata off",
-			name);
-		clear_bit(BCH_DEV_REMOVING, &ca->flags);
-		return;
-	}
-
-	/*
-	 * Ok, really doing the remove:
-	 * Drop device's prio pointer before removing it from superblock:
-	 */
-	bch_notify_dev_removed(ca);
-
-	spin_lock(&c->journal.lock);
-	c->journal.prio_buckets[dev_idx] = 0;
-	spin_unlock(&c->journal.lock);
-
-	bch_journal_meta(&c->journal);
-
-	/*
-	 * Stop device before removing it from the cache set's list of devices -
-	 * and get our own ref on cache set since ca is going away:
-	 */
-	closure_get(&c->cl);
-
-	mutex_lock(&c->state_lock);
-
-	bch_dev_stop(ca);
-
-	/*
-	 * RCU barrier between dropping between c->cache and dropping from
-	 * member info:
-	 */
-	synchronize_rcu();
-
-	/*
-	 * Free this device's slot in the bch_member array - all pointers to
-	 * this device must be gone:
-	 */
-	mutex_lock(&c->sb_lock);
-	mi = bch_sb_get_members(c->disk_sb);
-	memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
-
-	bch_write_super(c);
-
-	mutex_unlock(&c->sb_lock);
-	mutex_unlock(&c->state_lock);
-
-	closure_put(&c->cl);
-}
-
-static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force)
-{
-	if (test_bit(BCH_DEV_REMOVING, &ca->flags))
-		return false;
-
-	if (!bch_dev_may_remove(ca)) {
-		bch_err(ca->set, "Can't remove last RW device");
-		bch_notify_dev_remove_failed(ca);
-		return false;
-	}
-
-	/* First, go RO before we try to migrate data off: */
-	bch_dev_read_only(ca);
-
-	if (force)
-		set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
-
-	set_bit(BCH_DEV_REMOVING, &ca->flags);
-	bch_notify_dev_removing(ca);
-
-	/* Migrate the data and finish removal asynchronously: */
-
-	queue_work(system_long_wq, &ca->remove_work);
-	return true;
-}
-
-bool bch_dev_remove(struct cache *ca, bool force)
-{
-	struct cache_set *c = ca->set;
-	bool ret;
-
-	mutex_lock(&c->state_lock);
-	ret = __bch_dev_remove(c, ca, force);
-	mutex_unlock(&c->state_lock);
-
-	return ret;
-}
-
 static int bch_dev_online(struct cache *ca)
 {
 	char buf[12];
@@ -1402,7 +1163,6 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
 	ca->dev_idx = sb->sb->dev_idx;
 
 	INIT_WORK(&ca->free_work, bch_dev_free_work);
-	INIT_WORK(&ca->remove_work, bch_dev_remove_work);
 	spin_lock_init(&ca->freelist_lock);
 	spin_lock_init(&ca->prio_buckets_lock);
 	mutex_init(&ca->heap_lock);
@@ -1451,7 +1211,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
 	    !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
 					 2, GFP_KERNEL)) ||
 	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
-	    !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
+	    !(ca->bucket_stats_percpu = alloc_percpu(struct bch_dev_usage)) ||
 	    !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
 	    bioset_init(&ca->replica_set, 4,
 			offsetof(struct bch_write_bio, bio)) ||
@@ -1506,6 +1266,232 @@ err:
 	return err;
 }
 
+/* Device management: */
+
+static void __bch_dev_read_only(struct cache_set *c, struct cache *ca)
+{
+	bch_moving_gc_stop(ca);
+
+	/*
+	 * This stops new data writes (e.g. to existing open data
+	 * buckets) and then waits for all existing writes to
+	 * complete.
+	 */
+	bch_dev_allocator_stop(ca);
+
+	bch_dev_group_remove(&c->journal.devs, ca);
+}
+
+static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
+{
+	lockdep_assert_held(&c->state_lock);
+
+	if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
+		return NULL;
+
+	trace_bcache_cache_read_write(ca);
+
+	if (bch_dev_allocator_start(ca))
+		return "error starting allocator thread";
+
+	if (bch_moving_gc_start(ca))
+		return "error starting moving GC thread";
+
+	if (bch_tiering_start(c))
+		return "error starting tiering thread";
+
+	bch_notify_dev_read_write(ca);
+	trace_bcache_cache_read_write_done(ca);
+
+	return NULL;
+}
+
+bool bch_dev_state_allowed(struct cache_set *c, struct cache *ca,
+			   enum bch_member_state new_state, int flags)
+{
+	lockdep_assert_held(&c->state_lock);
+
+	if (new_state == BCH_MEMBER_STATE_ACTIVE)
+		return true;
+
+	if (ca->mi.has_data &&
+	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+		return false;
+
+	if (ca->mi.has_data &&
+	    c->sb.data_replicas_have <= 1 &&
+	    !(flags & BCH_FORCE_IF_DATA_LOST))
+		return false;
+
+	if (ca->mi.has_metadata &&
+	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
+		return false;
+
+	if (ca->mi.has_metadata &&
+	    c->sb.meta_replicas_have <= 1 &&
+	    !(flags & BCH_FORCE_IF_METADATA_LOST))
+		return false;
+
+	return true;
+}
+
+int __bch_dev_set_state(struct cache_set *c, struct cache *ca,
+			enum bch_member_state new_state, int flags)
+{
+	struct bch_sb_field_members *mi;
+	char buf[BDEVNAME_SIZE];
+
+	if (ca->mi.state == new_state)
+		return 0;
+
+	if (!bch_dev_state_allowed(c, ca, new_state, flags))
+		return -EINVAL;
+
+	if (new_state == BCH_MEMBER_STATE_ACTIVE) {
+		if (__bch_dev_read_write(c, ca))
+			return -ENOMEM;
+	} else {
+		__bch_dev_read_only(c, ca);
+	}
+
+	bch_notice(c, "%s %s",
+		   bdevname(ca->disk_sb.bdev, buf),
+		   bch_dev_state[new_state]);
+
+	mutex_lock(&c->sb_lock);
+	mi = bch_sb_get_members(c->disk_sb);
+	SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
+	bch_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
+
+int bch_dev_set_state(struct cache_set *c, struct cache *ca,
+		      enum bch_member_state new_state, int flags)
+{
+	int ret;
+
+	mutex_lock(&c->state_lock);
+	ret = __bch_dev_set_state(c, ca, new_state, flags);
+	mutex_unlock(&c->state_lock);
+
+	return ret;
+}
+
+#if 0
+int bch_dev_migrate_from(struct cache_set *c, struct cache *ca)
+{
+	/* First, go RO before we try to migrate data off: */
+	ret = bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, flags);
+	if (ret)
+		return ret;
+
+	bch_notify_dev_removing(ca);
+
+	/* Migrate data, metadata off device: */
+
+	ret = bch_move_data_off_device(ca);
+	if (ret && !(flags & BCH_FORCE_IF_DATA_LOST)) {
+		bch_err(c, "Remove of %s failed, unable to migrate data off",
+			name);
+		return ret;
+	}
+
+	if (ret)
+		ret = bch_flag_data_bad(ca);
+	if (ret) {
+		bch_err(c, "Remove of %s failed, unable to migrate data off",
+			name);
+		return ret;
+	}
+
+	ret = bch_move_metadata_off_device(ca);
+	if (ret)
+		return ret;
+}
+#endif
+
+/* Device add/removal: */
+
+static int __bch_dev_remove(struct cache_set *c, struct cache *ca, int flags)
+{
+	struct bch_sb_field_members *mi;
+	char name[BDEVNAME_SIZE];
+	unsigned dev_idx = ca->dev_idx;
+	int ret;
+
+	bdevname(ca->disk_sb.bdev, name);
+
+	if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
+		bch_err(ca->set, "Cannot remove RW device");
+		bch_notify_dev_remove_failed(ca);
+		return -EINVAL;
+	}
+
+	if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+		bch_err(ca->set, "Cannot remove %s without losing data", name);
+		bch_notify_dev_remove_failed(ca);
+		return -EINVAL;
+	}
+
+	/*
+	 * XXX: verify that dev_idx is really not in use anymore, anywhere
+	 *
+	 * flag_data_bad() does not check btree pointers
+	 */
+	ret = bch_flag_data_bad(ca);
+	if (ret) {
+		bch_err(c, "Remove of %s failed", name);
+		return ret;
+	}
+
+	/*
+	 * Ok, really doing the remove:
+	 * Drop device's prio pointer before removing it from superblock:
+	 */
+	bch_notify_dev_removed(ca);
+
+	spin_lock(&c->journal.lock);
+	c->journal.prio_buckets[dev_idx] = 0;
+	spin_unlock(&c->journal.lock);
+
+	bch_journal_meta(&c->journal);
+
+	bch_dev_stop(ca);
+
+	/*
+	 * RCU barrier between dropping between c->cache and dropping from
+	 * member info:
+	 */
+	synchronize_rcu();
+
+	/*
+	 * Free this device's slot in the bch_member array - all pointers to
+	 * this device must be gone:
+	 */
+	mutex_lock(&c->sb_lock);
+	mi = bch_sb_get_members(c->disk_sb);
+	memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
+
+	bch_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
+
+int bch_dev_remove(struct cache_set *c, struct cache *ca, int flags)
+{
+	int ret;
+
+	mutex_lock(&c->state_lock);
+	ret = __bch_dev_remove(c, ca, flags);
+	mutex_unlock(&c->state_lock);
+
+	return ret;
+}
+
 int bch_dev_add(struct cache_set *c, const char *path)
 {
 	struct bcache_superblock sb;
@@ -1626,6 +1612,8 @@ err_unlock:
 	return ret ?: -EINVAL;
 }
 
+/* Filesystem open: */
+
 const char *bch_fs_open(char * const *devices, unsigned nr_devices,
 			struct bch_opts opts, struct cache_set **ret)
 {
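Usage note (not part of this commit): with the old bch_dev_read_only()/bch_dev_remove() paths replaced by the state helpers above, failing a device reduces to requesting BCH_MEMBER_STATE_FAILED with whatever BCH_FORCE_IF_* flags the caller is willing to accept. The sketch below is an illustrative assumption of how that call path could look; bch_dev_set_state(), bch_dev_state_allowed(), BCH_MEMBER_STATE_FAILED and the force flags are taken from the diff, while bch_dev_fail() and the bch_dev_lookup() helper are hypothetical names, not code from this tree.

```c
/*
 * Hypothetical sketch, not part of this commit: routing a "device fail"
 * request through the helpers added in libbcache/super.c above.
 * bch_dev_fail() and bch_dev_lookup() are assumed names for illustration;
 * the declarations for the bch_* helpers come from the in-tree headers.
 */
static int bch_dev_fail(struct cache_set *c, unsigned dev_idx, int flags)
{
	struct cache *ca;

	/* Assumed helper: resolve a member index to its struct cache. */
	ca = bch_dev_lookup(c, dev_idx);
	if (!ca)
		return -ENOENT;

	/*
	 * bch_dev_set_state() (from the diff) takes c->state_lock, rejects
	 * the transition unless bch_dev_state_allowed() says the force
	 * flags permit it, stops the allocator and moving-GC threads via
	 * __bch_dev_read_only(), and persists the new member state in the
	 * superblock.
	 */
	return bch_dev_set_state(c, ca, BCH_MEMBER_STATE_FAILED, flags);
}
```

A caller prepared to run degraded would pass e.g. BCH_FORCE_IF_DATA_DEGRADED|BCH_FORCE_IF_METADATA_DEGRADED; with flags of 0, bch_dev_state_allowed() refuses the transition for any device that still holds data or metadata. The cmd_device_fail command named in the subject line is added outside libbcache/super.c, so its plumbing down to these helpers is not visible in this diffstat-limited view.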