From 38b0bd0cda07d34ad6f145fce675ead74739c44e Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 6 Sep 2018 18:33:38 +0200
Subject: dm raid: fix reshape race on small devices

Loading a new mapping table, the dm-raid target's constructor
retrieves the volatile reshaping state from the raid superblocks.

When the new table is activated in a following resume, the actual
reshape position is retrieved.  The reshape driven by the previous
mapping can already have finished on small and/or fast devices thus
updating raid superblocks about the new raid layout.

This causes the actual array state (e.g. stripe size reshape finished)
to be inconsistent with the one in the new mapping, causing hangs with
left behind devices.

This race does not occur with usual raid device sizes but with small
ones (e.g. those created by the lvm2 test suite).

Fix by no longer transferring stale/inconsistent raid_set state during
preresume.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 48 +-----------------------------------------------
 1 file changed, 1 insertion(+), 47 deletions(-)

(limited to 'drivers/md/dm-raid.c')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index cae689de75fd..d8406e0b4540 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -29,9 +29,6 @@
  */
 #define	MIN_RAID456_JOURNAL_SPACE (4*2048)
 
-/* Global list of all raid sets */
-static LIST_HEAD(raid_sets);
-
 static bool devices_handle_discard_safely = false;
 
 /*
@@ -227,7 +224,6 @@ struct rs_layout {
 
 struct raid_set {
 	struct dm_target *ti;
-	struct list_head list;
 
 	uint32_t stripe_cache_entries;
 	unsigned long ctr_flags;
@@ -273,19 +269,6 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
 	mddev->new_chunk_sectors = l->new_chunk_sectors;
 }
 
-/* Find any raid_set in active slot for @rs on global list */
-static struct raid_set *rs_find_active(struct raid_set *rs)
-{
-	struct raid_set *r;
-	struct mapped_device *md = dm_table_get_md(rs->ti->table);
-
-	list_for_each_entry(r, &raid_sets, list)
-		if (r != rs && dm_table_get_md(r->ti->table) == md)
-			return r;
-
-	return NULL;
-}
-
 /* raid10 algorithms (i.e. formats) */
 #define	ALGORITHM_RAID10_DEFAULT	0
 #define	ALGORITHM_RAID10_NEAR		1
@@ -764,7 +747,6 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 
 	mddev_init(&rs->md);
 
-	INIT_LIST_HEAD(&rs->list);
 	rs->raid_disks = raid_devs;
 	rs->delta_disks = 0;
 
@@ -782,9 +764,6 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 	for (i = 0; i < raid_devs; i++)
 		md_rdev_init(&rs->dev[i].rdev);
 
-	/* Add @rs to global list. */
-	list_add(&rs->list, &raid_sets);
-
 	/*
 	 * Remaining items to be initialized by further RAID params:
 	 *  rs->md.persistent
@@ -797,7 +776,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 	return rs;
 }
 
-/* Free all @rs allocations and remove it from global list. */
+/* Free all @rs allocations */
 static void raid_set_free(struct raid_set *rs)
 {
 	int i;
@@ -815,8 +794,6 @@ static void raid_set_free(struct raid_set *rs)
 			dm_put_device(rs->ti, rs->dev[i].data_dev);
 	}
 
-	list_del(&rs->list);
-
 	kfree(rs);
 }
 
@@ -3947,29 +3924,6 @@ static int raid_preresume(struct dm_target *ti)
 	if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
 		return 0;
 
-	if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
-		struct raid_set *rs_active = rs_find_active(rs);
-
-		if (rs_active) {
-			/*
-			 * In case no rebuilds have been requested
-			 * and an active table slot exists, copy
-			 * current resynchonization completed and
-			 * reshape position pointers across from
-			 * suspended raid set in the active slot.
-			 *
-			 * This resumes the new mapping at current
-			 * offsets to continue recover/reshape without
-			 * necessarily redoing a raid set partially or
-			 * causing data corruption in case of a reshape.
-			 */
-			if (rs_active->md.curr_resync_completed != MaxSector)
-				mddev->curr_resync_completed = rs_active->md.curr_resync_completed;
-			if (rs_active->md.reshape_position != MaxSector)
-				mddev->reshape_position = rs_active->md.reshape_position;
-		}
-	}
-
 	/*
 	 * The superblocks need to be updated on disk if the
 	 * array is new or new devices got added (thus zeroed
-- 
cgit v1.2.3


From 644e2537fdc77baeeefc829524937bca64329f82 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 6 Sep 2018 18:33:39 +0200
Subject: dm raid: fix stripe adding reshape deadlock

When initiating a stripe adding reshape, a deadlock between
md_stop_writes() waiting for the sync thread to stop and the running
sync thread waiting for inactive stripes occurs (this frequently happens
on single-core but rarely on multi-core systems).

Fix this deadlock by setting MD_RECOVERY_WAIT to have the main MD
resynchronization thread worker (md_do_sync()) bail out when initiating
the reshape via constructor arguments.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'drivers/md/dm-raid.c')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index d8406e0b4540..9129c5e0c280 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3869,14 +3869,13 @@ static int rs_start_reshape(struct raid_set *rs)
 	struct mddev *mddev = &rs->md;
 	struct md_personality *pers = mddev->pers;
 
+	/* Don't allow the sync thread to work until the table gets reloaded. */
+	set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
+
 	r = rs_setup_reshape(rs);
 	if (r)
 		return r;
 
-	/* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
-	if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
-		mddev_resume(mddev);
-
 	/*
 	 * Check any reshape constraints enforced by the personalility
 	 *
@@ -3900,10 +3899,6 @@ static int rs_start_reshape(struct raid_set *rs)
 		}
 	}
 
-	/* Suspend because a resume will happen in raid_resume() */
-	set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
-	mddev_suspend(mddev);
-
 	/*
 	 * Now reshape got set up, update superblocks to
 	 * reflect the fact so that a table reload will
-- 
cgit v1.2.3


From c44a5ee803d2b7ed8c2e6ce24a5c4dd60778886e Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 6 Sep 2018 18:33:40 +0200
Subject: dm raid: fix rebuild of specific devices by updating superblock

Update superblock when particular devices are requested via rebuild
(e.g. lvconvert --replace ...) to avoid spurious failure with the "New
device injected into existing raid set without 'delta_disks' or
'rebuild' parameter specified" error message.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/md/dm-raid.c')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9129c5e0c280..6d961db8760e 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3126,6 +3126,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		rs_set_new(rs);
 	} else if (rs_is_recovering(rs)) {
+		/* Rebuild particular devices */
+		if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
+			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+			rs_setup_recovery(rs, MaxSector);
+		}
 		/* A recovering raid set may be resized */
 		; /* skip setup rs */
 	} else if (rs_is_reshaping(rs)) {
-- 
cgit v1.2.3


From 36a240a706d43383bbdd377522501ddd2e5771f6 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 6 Sep 2018 22:54:29 +0200
Subject: dm raid: fix RAID leg rebuild errors

On fast devices such as NVMe, a flaw in rs_get_progress() results in
false target status output when userspace lvm2 requests leg rebuilds
(symptom of the failure is device health chars 'aaaaaaaa' instead of
expected 'aAaAAAAA' causing lvm2 to fail).

The correct sync action state definitions already exist in
decipher_sync_action() so fix rs_get_progress() to use it.

Change decipher_sync_action() to return an enum rather than a string for
the sync states and call it from rs_get_progress().  Introduce
sync_str() to translate from enum to the string that is needed by
raid_status().

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 80 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 34 deletions(-)

(limited to 'drivers/md/dm-raid.c')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6d961db8760e..fceeb962f43b 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3332,32 +3332,53 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_SUBMITTED;
 }
 
-/* Return string describing the current sync action of @mddev */
-static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery)
+/* Return sync state string for @state */
+enum sync_state { st_frozen, st_reshape, st_resync, st_check, st_repair, st_recover, st_idle };
+static const char *sync_str(enum sync_state state)
+{
+	/* Has to be in above sync_state order! */
+	static const char *sync_strs[] = {
+		"frozen",
+		"reshape",
+		"resync",
+		"check",
+		"repair",
+		"recover",
+		"idle"
+	};
+
+	return __within_range(state, 0, ARRAY_SIZE(sync_strs) - 1) ? sync_strs[state] : "undef";
+};
+
+/* Return enum sync_state for @mddev derived from @recovery flags */
+static const enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long recovery)
 {
 	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
-		return "frozen";
+		return st_frozen;
 
-	/* The MD sync thread can be done with io but still be running */
+	/* The MD sync thread can be done with io or be interrupted but still be running */
 	if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
 	    (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
 	     (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
 		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
-			return "reshape";
+			return st_reshape;
 
 		if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
 			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
-				return "resync";
-			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
-				return "check";
-			return "repair";
+				return st_resync;
+			if (test_bit(MD_RECOVERY_CHECK, &recovery))
+				return st_check;
+			return st_repair;
 		}
 
 		if (test_bit(MD_RECOVERY_RECOVER, &recovery))
-			return "recover";
+			return st_recover;
+
+		if (mddev->reshape_position != MaxSector)
+			return st_reshape;
 	}
 
-	return "idle";
+	return st_idle;
 }
 
 /*
@@ -3391,6 +3412,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
 				sector_t resync_max_sectors)
 {
 	sector_t r;
+	enum sync_state state;
 	struct mddev *mddev = &rs->md;
 
 	clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
@@ -3401,20 +3423,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
 		set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
 
 	} else {
-		if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) &&
-		    !test_bit(MD_RECOVERY_INTR, &recovery) &&
-		    (test_bit(MD_RECOVERY_NEEDED, &recovery) ||
-		     test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
-		     test_bit(MD_RECOVERY_RUNNING, &recovery)))
-			r = mddev->curr_resync_completed;
-		else
+		state = decipher_sync_action(mddev, recovery);
+
+		if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
 			r = mddev->recovery_cp;
+		else
+			r = mddev->curr_resync_completed;
 
-		if (r >= resync_max_sectors &&
-		    (!test_bit(MD_RECOVERY_REQUESTED, &recovery) ||
-		     (!test_bit(MD_RECOVERY_FROZEN, &recovery) &&
-		      !test_bit(MD_RECOVERY_NEEDED, &recovery) &&
-		      !test_bit(MD_RECOVERY_RUNNING, &recovery)))) {
+		if (state == st_idle && r >= resync_max_sectors) {
 			/*
 			 * Sync complete.
 			 */
@@ -3422,24 +3438,20 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
 			if (test_bit(MD_RECOVERY_RECOVER, &recovery))
 				set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
 
-		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) {
+		} else if (state == st_recover)
 			/*
 			 * In case we are recovering, the array is not in sync
 			 * and health chars should show the recovering legs.
 			 */
 			;
-
-		} else if (test_bit(MD_RECOVERY_SYNC, &recovery) &&
-			   !test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
+		else if (state == st_resync)
 			/*
 			 * If "resync" is occurring, the raid set
 			 * is or may be out of sync hence the health
 			 * characters shall be 'a'.
 			 */
 			set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
-
-		} else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) &&
-			   !test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
+		else if (state == st_reshape)
 			/*
 			 * If "reshape" is occurring, the raid set
 			 * is or may be out of sync hence the health
@@ -3447,7 +3459,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
 			 */
 			set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
 
-		} else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
+		else if (state == st_check || state == st_repair)
 			/*
 			 * If "check" or "repair" is occurring, the raid set has
 			 * undergone an initial sync and the health characters
@@ -3455,12 +3467,12 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
 			 */
 			set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
 
-		} else {
+		else {
 			struct md_rdev *rdev;
 
 			/*
 			 * We are idle and recovery is needed, prevent 'A' chars race
-			 * caused by components still set to in-sync by constrcuctor.
+			 * caused by components still set to in-sync by constructor.
 			 */
 			if (test_bit(MD_RECOVERY_NEEDED, &recovery))
 				set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
@@ -3524,7 +3536,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		progress = rs_get_progress(rs, recovery, resync_max_sectors);
 		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
 				    atomic64_read(&mddev->resync_mismatches) : 0;
-		sync_action = decipher_sync_action(&rs->md, recovery);
+		sync_action = sync_str(decipher_sync_action(&rs->md, recovery));
 
 		/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
 		for (i = 0; i < rs->raid_disks; i++)
-- 
cgit v1.2.3


From 5380c05b682991a6818c3755d450a3e87eeac0e5 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 6 Sep 2018 14:02:54 -0400
Subject: dm raid: bump target version, update comments and documentation

Bump target version to reflect the documented fixes are available.
Also fix some code comments (typos and clarity).

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt |  4 ++++
 drivers/md/dm-raid.c                    | 10 ++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'drivers/md/dm-raid.c')

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 390c145f01d7..52a719b49afd 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -348,3 +348,7 @@ Version History
 1.13.1  Fix deadlock caused by early md_stop_writes().  Also fix size an
 	state races.
 1.13.2  Fix raid redundancy validation and avoid keeping raid set frozen
+1.14.0  Fix reshape race on small devices.  Fix stripe adding reshape
+	deadlock/potential data corruption.  Update superblock when
+	specific devices are requested via rebuild.  Fix RAID leg
+	rebuild errors.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index fceeb962f43b..5ba067fa0c72 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2018 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -2626,7 +2626,7 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
 		return 0;
 	}
 
-	/* HM FIXME: get InSync raid_dev? */
+	/* HM FIXME: get In_Sync raid_dev? */
 	rdev = &rs->dev[0].rdev;
 
 	if (rs->delta_disks < 0) {
@@ -3224,6 +3224,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	/* Start raid set read-only and assumed clean to change in raid_resume() */
 	rs->md.ro = 1;
 	rs->md.in_sync = 1;
+
+	/* Keep array frozen */
 	set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 
 	/* Has to be held on running the array */
@@ -3247,7 +3249,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
-	/* If raid4/5/6 journal mode explictely requested (only possible with journal dev) -> set it */
+	/* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
 	if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
 		r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
 		if (r) {
@@ -4012,7 +4014,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 13, 2},
+	.version = {1, 14, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
-- 
cgit v1.2.3