From 13864515f7bf6cabd60e63c62e09d311386ae1f1 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Sat, 28 Jun 2008 08:31:19 +1000 Subject: linear: correct disk numbering error check From: "Nikanth Karthikesan" Correct the disk numbering problem check. Signed-off-by: Nikanth Karthikesan Signed-off-by: Neil Brown --- drivers/md/linear.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 10748240cb2f..ec921f58fbb8 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -126,7 +126,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; - if (j < 0 || j > raid_disks || disk->rdev) { + if (j < 0 || j >= raid_disks || disk->rdev) { printk("linear: disk numbering problem. Aborting!\n"); goto out; } -- cgit v1.2.3 From 0e13fe23a00ad88c737d91d94a050707c6139ce4 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:20 +1000 Subject: use bio_endio instead of a call to bi_end_io Turn calls to bi->bi_end_io() into bio_endio(). Apparently bio_endio does exactly the same error processing as is hardcoded at these places. bio_endio() avoids recursion (or will soon), so it should be used. Signed-off-by: Mikulas Patocka Signed-off-by: Neil Brown --- drivers/md/raid5.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54c8ee28fcc4..214b44122822 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -115,9 +115,7 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); bi = return_bi; } } @@ -3700,9 +3698,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); } return 0; } @@ -4005,12 +4001,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) spin_lock_irq(&conf->device_lock); remaining = --raid_bio->bi_phys_segments; spin_unlock_irq(&conf->device_lock); - if (remaining == 0) { - - raid_bio->bi_end_io(raid_bio, - test_bit(BIO_UPTODATE, &raid_bio->bi_flags) - ? 0 : -EIO); - } + if (remaining == 0) + bio_endio(raid_bio, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; -- cgit v1.2.3 From a0da84f35b25875870270d16b6eccda4884d61a7 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:22 +1000 Subject: Improve setting of "events_cleared" for write-intent bitmaps. When an array is degraded, bits in the write-intent bitmap are not cleared, so that if the missing device is re-added, it can be synced by only updating those parts of the device that have changed since it was removed. To enable this, an 'events_cleared' value is stored. It is the event counter for the array the last time that any bits were cleared. Sometimes - if a device disappears from an array while it is 'clean' - the events_cleared value gets updated incorrectly (there are subtle ordering issues between updating events in the main metadata and the bitmap metadata) resulting in the missing device appearing to require a full resync when it is re-added. With this patch, we update events_cleared precisely when we are about to clear a bit in the bitmap.
We record events_cleared when we clear the bit internally, and copy that to the superblock which is written out before the bit on storage. This makes it more "obviously correct". We also need to update events_cleared when the event_count is going backwards (as happens on a dirty->clean transition of a non-degraded array). Thanks to Mike Snitzer for identifying this problem and testing early "fixes". Cc: "Mike Snitzer" Signed-off-by: Neil Brown --- drivers/md/bitmap.c | 29 ++++++++++++++++++++++++----- include/linux/raid/bitmap.h | 1 + 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index b26927ce889c..dedba16d42f7 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -454,8 +454,11 @@ void bitmap_update_sb(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); sb->events = cpu_to_le64(bitmap->mddev->events); - if (!bitmap->mddev->degraded) - sb->events_cleared = cpu_to_le64(bitmap->mddev->events); + if (bitmap->mddev->events < bitmap->events_cleared) { + /* rocking back to read-only */ + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->events_cleared); + } kunmap_atomic(sb, KM_USER0); write_page(bitmap, bitmap->sb_page, 1); } @@ -1085,9 +1088,19 @@ void bitmap_daemon_work(struct bitmap *bitmap) } else spin_unlock_irqrestore(&bitmap->lock, flags); lastpage = page; -/* - printk("bitmap clean at page %lu\n", j); -*/ + + /* We are possibly going to clear some bits, so make + * sure that events_cleared is up-to-date. + */ + if (bitmap->need_sync) { + bitmap_super_t *sb; + bitmap->need_sync = 0; + sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb->events_cleared = + cpu_to_le64(bitmap->events_cleared); + kunmap_atomic(sb, KM_USER0); + write_page(bitmap, bitmap->sb_page, 1); + } spin_lock_irqsave(&bitmap->lock, flags); clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } @@ -1257,6 +1270,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto return; } + if (success && + bitmap->events_cleared < bitmap->mddev->events) { + bitmap->events_cleared = bitmap->mddev->events; + bitmap->need_sync = 1; + } + if (!success && ! (*bmc & NEEDED_MASK)) *bmc |= NEEDED_MASK; diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 78bfdea24a8e..e98900671ca9 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -221,6 +221,7 @@ struct bitmap { unsigned long syncchunk; __u64 events_cleared; + int need_sync; /* bitmap spinlock */ spinlock_t lock; -- cgit v1.2.3 From 5e96ee65c8bd629ce093da67a066d3946468298a Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:24 +1000 Subject: Allow setting start point for requested check/repair This makes it possible to just resync a small part of an array. e.g. if a drive reports that it has questionable sectors, a 'repair' of just the region covering those sectors will cause them to be read and, if there is an error, re-written with correct data. 
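A small user-space sketch of the intended use (illustrative only, not part of the patch; the device name is hypothetical, values are in 512-byte sectors, and sync_min must be a multiple of the chunk size):

	#include <stdio.h>

	/* write a value to an md sysfs attribute */
	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%s\n", val);
		return fclose(f);
	}

	int main(void)
	{
		/* sync_action must currently be 'idle'; sync_min can only
		 * be changed while no resync is running */
		if (write_str("/sys/block/md0/md/sync_min", "1000000") ||
		    write_str("/sys/block/md0/md/sync_max", "1100000"))
			return 1;
		return write_str("/sys/block/md0/md/sync_action", "repair") ? 1 : 0;
	}

When the requested range has been processed, resync_min and resync_max are reset (see the 'skip:' hunk below).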
Signed-off-by: Neil Brown --- drivers/md/md.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/raid/md_k.h | 2 ++ 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2580ac1b9b0f..261322722c19 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -278,6 +278,7 @@ static mddev_t * mddev_find(dev_t unit) init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); new->reshape_position = MaxSector; + new->resync_min = 0; new->resync_max = MaxSector; new->level = LEVEL_NONE; @@ -3074,6 +3075,36 @@ sync_completed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); +static ssize_t +min_sync_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_min); +} +static ssize_t +min_sync_store(mddev_t *mddev, const char *buf, size_t len) +{ + unsigned long long min; + if (strict_strtoull(buf, 10, &min)) + return -EINVAL; + if (min > mddev->resync_max) + return -EINVAL; + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + /* Must be a multiple of chunk_size */ + if (mddev->chunk_size) { + if (min & (sector_t)((mddev->chunk_size>>9)-1)) + return -EINVAL; + } + mddev->resync_min = min; + + return len; +} + +static struct md_sysfs_entry md_min_sync = +__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); + static ssize_t max_sync_show(mddev_t *mddev, char *page) { @@ -3089,9 +3120,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { - char *ep; - unsigned long long max = simple_strtoull(buf, &ep, 10); - if (ep == buf || (*ep != 0 && *ep != '\n')) + unsigned long long max; + if (strict_strtoull(buf, 10, &max)) + return -EINVAL; + if (max < mddev->resync_min) return -EINVAL; if (max < mddev->resync_max && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -3222,6 +3254,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, + &md_min_sync.attr, &md_max_sync.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, @@ -3777,6 +3810,7 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->size = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; + mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; mddev->external = 0; @@ -5625,9 +5659,11 @@ void md_do_sync(mddev_t *mddev) max_sectors = mddev->resync_max_sectors; mddev->resync_mismatches = 0; /* we don't use the checkpoint if there's a bitmap */ - if (!mddev->bitmap && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + j = mddev->resync_min; + else if (!mddev->bitmap) j = mddev->recovery_cp; + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->size << 1; else { @@ -5796,6 +5832,7 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; + mddev->resync_min = 0; mddev->resync_max = MaxSector; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 3dea9f545c8f..780e0613e6d5 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -227,6 +227,8 @@ struct mddev_s atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ 
sector_t resync_max; /* resync should pause * when it gets here */ -- cgit v1.2.3 From f48ed538386cb41559282d989354e8f5d442d71c Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:26 +1000 Subject: Close race in md_probe There is a possible race in md_probe. If two threads call md_probe for the same device, then one could exit (having checked that ->gendisk exists) before the other has called kobject_init_and_add, thus returning an incomplete kobj which will cause problems when we try to add children to it. So extend the range of protection of disks_mutex slightly to avoid this possibility. Signed-off-by: Neil Brown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 261322722c19..97852099defd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3359,9 +3359,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) disk->queue = mddev->queue; add_disk(disk); mddev->gendisk = disk; - mutex_unlock(&disks_mutex); error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, "%s", "md"); + mutex_unlock(&disks_mutex); if (error) printk(KERN_WARNING "md: cannot register %s/md - name in use\n", disk->disk_name); -- cgit v1.2.3 From 1a0fd497733bd029a7d5f2e5c69b1dff715b7792 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:27 +1000 Subject: Don't try to make md arrays dirty if that is not meaningful. Array personalities such as 'raid0' and 'linear' have no redundancy, and so marking them as 'clean' or 'dirty' is not meaningful. So always allow write requests without requiring a superblock update. Such array types are detected by ->sync_request being NULL. If it is not possible to send a sync request we don't need a 'dirty' flag, because all a dirty flag does is trigger some sync_requests. Signed-off-by: Neil Brown --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 97852099defd..9e3ce432e37e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5536,6 +5536,8 @@ void md_allow_write(mddev_t *mddev) return; if (mddev->ro) return; + if (!mddev->pers->sync_request) + return; spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { -- cgit v1.2.3 From 8ed0a5216a0238f53b482ec88ce4aeed4b9f0da1 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:29 +1000 Subject: Enable setting of 'offset' and 'size' of a hot-added spare. offset_store and rdev_size_store allow control of the region of a device which is to be used in an md/raid array. They only allow these values to be set when an array is being assembled, as changing them on an active array could be dangerous. However when adding a spare device to an array, we might need to set the offset and size before starting recovery. So allow these values to be set also if "->raid_disk < 0", which indicates that the device is still a spare.
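For example, user-space could position and size the spare before recovery starts (an illustrative sketch only; paths and numbers are hypothetical, offset is in sectors, size in kilobytes):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static void set_attr(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);
		if (fd >= 0) {
			write(fd, val, strlen(val));
			close(fd);
		}
	}

	int main(void)
	{
		/* sdb1 has been added to md0 but is still a spare
		 * (raid_disk < 0), so these writes are now allowed */
		set_attr("/sys/block/md0/md/dev-sdb1/offset", "2048");
		set_attr("/sys/block/md0/md/dev-sdb1/size", "1048576");
		return 0;
	}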
Signed-off-by: Neil Brown --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9e3ce432e37e..3b5cd4ef54f1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1984,7 +1984,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) unsigned long long offset = simple_strtoull(buf, &e, 10); if (e==buf || (*e && *e != '\n')) return -EINVAL; - if (rdev->mddev->pers) + if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; if (rdev->size && rdev->mddev->external) /* Must set offset before size, so overlap checks @@ -2023,7 +2023,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (e==buf || (*e && *e != '\n')) return -EINVAL; - if (my_mddev->pers) + if (my_mddev->pers && rdev->raid_disk >= 0) return -EBUSY; rdev->size = size; if (size > oldsize && rdev->mddev->external) { -- cgit v1.2.3 From 6c2fce2ef6b4821c21b5c42c7207cb9cf8c87eda Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:31 +1000 Subject: Support adding a spare to a live md array with external metadata. i.e. extend the 'md/dev-XXX/slot' attribute so that you can tell a device to fill a vacant slot in an md array. Signed-off-by: Neil Brown --- drivers/md/md.c | 42 +++++++++++++++++++++++++++++++++++++++--- drivers/md/multipath.c | 7 ++++++- drivers/md/raid1.c | 7 ++++++- drivers/md/raid10.c | 10 ++++++++-- drivers/md/raid5.c | 10 ++++++++-- 5 files changed, 67 insertions(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3b5cd4ef54f1..5d6fac1fd39e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) slot = -1; else if (e==buf || (*e && *e!= '\n')) return -EINVAL; - if (rdev->mddev->pers) { + if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating * with the personality with ->hot_*_disk. * failed/spare devices. This normally happens automatically, * but not when the metadata is externally managed. */ - if (slot != -1) - return -EBUSY; if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ sysfs_remove_link(&rdev->mddev->kobj, nm); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); + } else if (rdev->mddev->pers) { + mdk_rdev_t *rdev2; + struct list_head *tmp; + /* Activating a spare .. or possibly reactivating + * if we every get bitmaps working here. + */ + + if (rdev->raid_disk != -1) + return -EBUSY; + + if (rdev->mddev->pers->hot_add_disk == NULL) + return -EINVAL; + + rdev_for_each(rdev2, tmp, rdev->mddev) + if (rdev2->raid_disk == slot) + return -EEXIST; + + rdev->raid_disk = slot; + if (test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = slot; + else + rdev->saved_raid_disk = -1; + err = rdev->mddev->pers-> + hot_add_disk(rdev->mddev, rdev); + if (err != 1) { + rdev->raid_disk = -1; + if (err == 0) + return -EEXIST; + return err; + } + sprintf(nm, "rd%d", rdev->raid_disk); + if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) + printk(KERN_WARNING + "md: cannot register " + "%s for %s\n", + nm, mdname(rdev->mddev)); + + /* don't wakeup anyone, leave that to userspace.
*/ } else { if (slot >= rdev->mddev->raid_disks) return -ENOSPC; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index e968116e0de9..4a1d714c048e 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int found = 0; int path; struct multipath_info *p; + int first = 0; + int last = mddev->raid_disks - 1; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; print_multipath_conf(conf); - for (path=0; pathraid_disks; path++) + for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { q = rdev->bdev->bd_disk->queue; blk_queue_stack_limits(mddev->queue, q); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c610b947218a..d32fc559ff05 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int found = 0; int mirror = 0; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; - for (mirror=0; mirror < mddev->raid_disks; mirror++) + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + for (mirror = first; mirror <= last; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a71277b640ab..50ad8d2ae0e8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1116,6 +1116,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int found = 0; int mirror; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is @@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if (!enough(conf)) return 0; + if (rdev->raid_disk) + first = last = rdev->raid_disk; + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else - mirror = 0; - for ( ; mirror < mddev->raid_disks; mirror++) + mirror = first; + for ( ; mirror <= last ; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 214b44122822..002f33b1ae00 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4607,21 +4607,27 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int found = 0; int disk; struct disk_info *p; + int first = 0; + int last = conf->raid_disks - 1; if (mddev->degraded > conf->max_degraded) /* no point adding a device */ return 0; + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + /* * find the disk ... but prefer rdev->saved_raid_disk * if possible. */ if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) disk = rdev->saved_raid_disk; else - disk = 0; - for ( ; disk < conf->raid_disks; disk++) + disk = first; + for ( ; disk <= last ; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; -- cgit v1.2.3 From 199050ea1ff2270174ee525b73bc4c3323098897 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:33 +1000 Subject: rationalise return value for ->hot_add_disk method. For all array types but linear, ->hot_add_disk returns 1 on success, 0 on failure. For linear, it returns 0 on success and -errno on failure. 
This doesn't cause a functional problem because the ->hot_add_disk function of linear is used quite differently to the others. However it is confusing. So convert all to return 0 for success or -errno on failure and fix call sites to match. Signed-off-by: Neil Brown --- drivers/md/md.c | 7 +++---- drivers/md/multipath.c | 8 +++++--- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 10 +++++----- drivers/md/raid5.c | 10 +++++----- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5d6fac1fd39e..45e255d4916f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1977,10 +1977,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; err = rdev->mddev->pers-> hot_add_disk(rdev->mddev, rdev); - if (err != 1) { + if (err) { rdev->raid_disk = -1; - if (err == 0) - return -EEXIST; return err; } sprintf(nm, "rd%d", rdev->raid_disk); @@ -5920,7 +5918,8 @@ static int remove_and_add_spares(mddev_t *mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; - if (mddev->pers->hot_add_disk(mddev,rdev)) { + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 4a1d714c048e..541cbe3414bd 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -281,7 +281,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { multipath_conf_t *conf = mddev->private; struct request_queue *q; - int found = 0; + int err = -EEXIST; int path; struct multipath_info *p; int first = 0; @@ -312,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rdev->raid_disk = path; set_bit(In_sync, &rdev->flags); rcu_assign_pointer(p->rdev, rdev); - found = 1; + err = 0; + break; } print_multipath_conf(conf); - return found; + + return err; } static int multipath_remove_disk(mddev_t *mddev, int number) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d32fc559ff05..f05d5983efb6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1100,7 +1100,7 @@ static int raid1_spare_active(mddev_t *mddev) static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror = 0; mirror_info_t *p; int first = 0; @@ -1124,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ @@ -1135,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) } print_conf(conf); - return found; + return err; } static int raid1_remove_disk(mddev_t *mddev, int number) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 50ad8d2ae0e8..df08a9fa3a1f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1113,7 +1113,7 @@ static int raid10_spare_active(mddev_t *mddev) static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror; mirror_info_t *p; int first = 0; @@ -1123,9 +1123,9 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ - return 0; + return -EBUSY; if (!enough(conf)) - return 0; + return -EINVAL; if (rdev->raid_disk) first = last = rdev->raid_disk; @@ 
-1151,7 +1151,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); @@ -1159,7 +1159,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) } print_conf(conf); - return found; + return err; } static int raid10_remove_disk(mddev_t *mddev, int number) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 002f33b1ae00..8c4e6149daea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4604,7 +4604,7 @@ abort: static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { raid5_conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int disk; struct disk_info *p; int first = 0; @@ -4612,7 +4612,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->degraded > conf->max_degraded) /* no point adding a device */ - return 0; + return -EINVAL; if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -4631,14 +4631,14 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - found = 1; + err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } print_raid5_conf(conf); - return found; + return err; } static int raid5_resize(mddev_t *mddev, sector_t sectors) @@ -4739,7 +4739,7 @@ static int raid5_start_reshape(mddev_t *mddev) rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev)) { + if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; set_bit(In_sync, &rdev->flags); added_devices++; -- cgit v1.2.3 From c7d0c941ae7f82940a13f785be70dc3097d96687 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:34 +1000 Subject: Don't reject HOT_REMOVE_DISK request for an array that is not yet started. There is really no need for this test here, and there are valid cases for selectively removing devices from an array that it not actually active. Signed-off-by: Neil Brown --- drivers/md/md.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 45e255d4916f..1442761ac98e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4300,9 +4300,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev; - if (!mddev->pers) - return -ENODEV; - rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; -- cgit v1.2.3 From 0fd62b861eac7d2dea9b7e939953b20f37186ea1 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:36 +1000 Subject: Make sure all changes to md/array_state are notified. Changes in md/array_state could be of interest to a monitoring program. So make sure all changes trigger a notification. Exceptions: changing active_idle to active is not reported because it is frequent and not interesting. changing active to active_idle is only reported on arrays with externally managed metadata, as it is not interesting otherwise. 
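A monitor would use the standard sysfs poll pattern on this file: read the current value, wait in poll(2) for POLLPRI, then seek back and re-read after each wake-up. A minimal sketch (the device name is hypothetical):

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[64];
		int fd = open("/sys/block/md0/md/array_state", O_RDONLY);
		if (fd < 0)
			return 1;
		for (;;) {
			struct pollfd pfd = { .fd = fd, .events = POLLPRI };
			ssize_t n;

			lseek(fd, 0, SEEK_SET);
			n = read(fd, buf, sizeof(buf) - 1); /* consume current value */
			if (n > 0) {
				buf[n] = '\0';
				printf("array_state: %s", buf);
			}
			poll(&pfd, 1, -1); /* woken by sysfs_notify() */
		}
	}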
Signed-off-by: Neil Brown --- Documentation/md.txt | 5 +++++ drivers/md/md.c | 29 ++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/Documentation/md.txt b/Documentation/md.txt index a8b430627473..dca97ba4944a 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -236,6 +236,11 @@ All md devices contain: writing the word for the desired state, however some states cannot be explicitly set, and some transitions are not allowed. + Select/poll works on this file. All changes except between + active_idle and active (which can be frequent and are not + very interesting) are notified. active->active_idle is + reported if the metadata is externally managed. + clear No devices, no size, no level Writing is equivalent to STOP_ARRAY ioctl diff --git a/drivers/md/md.c b/drivers/md/md.c index 1442761ac98e..5b9d4fe4e6e4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2716,8 +2716,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) } if (err) return err; - else + else { + sysfs_notify(&mddev->kobj, NULL, "array_state"); return len; + } } static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); @@ -3408,7 +3410,11 @@ static void md_safemode_timeout(unsigned long data) { mddev_t *mddev = (mddev_t *) data; - mddev->safemode = 1; + if (!atomic_read(&mddev->writes_pending)) { + mddev->safemode = 1; + if (mddev->external) + sysfs_notify(&mddev->kobj, NULL, "array_state"); + } md_wakeup_thread(mddev->thread); } @@ -3675,6 +3681,7 @@ static int do_md_run(mddev_t * mddev) mddev->changed = 1; md_new_event(mddev); + sysfs_notify(&mddev->kobj, NULL, "array_state"); kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); return 0; } @@ -3709,6 +3716,8 @@ static int restart_array(mddev_t *mddev) md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); err = 0; + sysfs_notify(&mddev->kobj, NULL, "array_state"); + } else err = -EINVAL; @@ -3879,6 +3888,7 @@ static int do_md_stop(mddev_t * mddev, int mode) mdname(mddev)); err = 0; md_new_event(mddev); + sysfs_notify(&mddev->kobj, NULL, "array_state"); out: return err; } @@ -4876,8 +4886,9 @@ static int md_ioctl(struct inode *inode, struct file *file, mddev->ro && mddev->pers) { if (mddev->ro == 2) { mddev->ro = 0; - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + sysfs_notify(&mddev->kobj, NULL, "array_state"); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); } else { err = -EROFS; @@ -5516,6 +5527,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) */ void md_write_start(mddev_t *mddev, struct bio *bi) { + int did_change = 0; if (bio_data_dir(bi) != WRITE) return; @@ -5526,6 +5538,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); + did_change = 1; } atomic_inc(&mddev->writes_pending); if (mddev->safemode == 1) @@ -5536,10 +5549,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); md_wakeup_thread(mddev->thread); + did_change = 1; } spin_unlock_irq(&mddev->write_lock); - sysfs_notify(&mddev->kobj, NULL, "array_state"); } + if (did_change) + sysfs_notify(&mddev->kobj, NULL, "array_state"); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); @@ -5991,18 +6006,22 @@ void 
md_check_recovery(mddev_t *mddev) int spares = 0; if (!mddev->external) { + int did_change = 0; spin_lock_irq(&mddev->write_lock); if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; + did_change = 1; if (mddev->persistent) set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); + if (did_change) + sysfs_notify(&mddev->kobj, NULL, "array_state"); } if (mddev->flags) -- cgit v1.2.3 From 72a23c211e4587859d5bf61ac4962d76e593fb02 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:41 +1000 Subject: Make sure all changes to md/sync_action are notified. When the 'resync' thread starts or stops, when we explicitly set sync_action, or when we determine that there is definitely nothing to do, we notify sync_action. To stop "sync_action" from occasionally showing the wrong value, we introduce a new flag - MD_RECOVERY_RECOVER - to say that a recovery is probably needed or happening, and we make sure that we set MD_RECOVERY_RUNNING before clearing MD_RECOVERY_NEEDED. Signed-off-by: Neil Brown --- Documentation/md.txt | 6 ++++++ drivers/md/md.c | 34 ++++++++++++++++++++++++++++------ include/linux/raid/md_k.h | 2 ++ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/Documentation/md.txt b/Documentation/md.txt index dca97ba4944a..c05bfb55659e 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -386,6 +386,12 @@ also have 'check' and 'repair' will start the appropriate process providing the current state is 'idle'. + This file responds to select/poll. Any important change in the value + triggers a poll event. Sometimes the value will briefly be + "recover" if a recovery seems to be needed, but cannot be + achieved. In that case, the transition to "recover" isn't + notified, but the transition away is.
+ mismatch_count When performing 'check' and 'repair', and possibly when performing 'resync', md will count the number of errors that are diff --git a/drivers/md/md.c b/drivers/md/md.c index 5b9d4fe4e6e4..c26dcad8a3ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); } EXPORT_SYMBOL_GPL(md_new_event); @@ -2936,7 +2935,7 @@ action_show(mddev_t *mddev, char *page) type = "check"; else type = "repair"; - } else + } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) type = "recover"; } return sprintf(page, "%s\n", type); @@ -2958,9 +2957,12 @@ action_store(mddev_t *mddev, const char *page, size_t len) } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; - else if (cmd_match(page, "resync") || cmd_match(page, "recover")) + else if (cmd_match(page, "resync")) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + else if (cmd_match(page, "recover")) { + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - else if (cmd_match(page, "reshape")) { + } else if (cmd_match(page, "reshape")) { int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; @@ -2977,6 +2979,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + sysfs_notify(&mddev->kobj, NULL, "sync_action"); return len; } @@ -3682,6 +3685,7 @@ static int do_md_run(mddev_t * mddev) mddev->changed = 1; md_new_event(mddev); sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify(&mddev->kobj, NULL, "sync_action"); kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); return 0; } @@ -4252,6 +4256,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) export_rdev(rdev); md_update_sb(mddev, 1); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); return err; @@ -5105,6 +5111,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -6055,13 +6063,18 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify(&mddev->kobj, NULL, "sync_action"); md_new_event(mddev); goto unlock; } + /* Set RUNNING before clearing NEEDED to avoid + * any transients in the value of "sync_action". 
+ */ + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); /* Clear some bits that don't mean anything, but * might be left set */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); @@ -6079,17 +6092,19 @@ void md_check_recovery(mddev_t *mddev) /* Cannot proceed */ goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if ((spares = remove_and_add_spares(mddev))) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if (mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) /* nothing to be done ... */ goto unlock; if (mddev->pers->sync_request) { - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (spares && mddev->bitmap && ! mddev->bitmap->file) { /* We are adding a device or devices to an array * which has the bitmap stored on all devices. @@ -6108,9 +6123,16 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; } else md_wakeup_thread(mddev->sync_thread); + sysfs_notify(&mddev->kobj, NULL, "sync_action"); md_new_event(mddev); } unlock: + if (!mddev->sync_thread) { + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, + &mddev->recovery)) + sysfs_notify(&mddev->kobj, NULL, "sync_action"); + } mddev_unlock(mddev); } } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 780e0613e6d5..62aa9c9a6ddc 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -188,6 +188,7 @@ struct mddev_s * NEEDED: we might need to start a resync/recover * RUNNING: a thread is running, or about to be started * SYNC: actually doing a resync, not a recovery + * RECOVER: doing recovery, or need to try it. * INTR: resync needs to be aborted for some reason * DONE: thread is done and is waiting to be reaped * REQUEST: user-space has requested a sync (used with SYNC) @@ -198,6 +199,7 @@ struct mddev_s */ #define MD_RECOVERY_RUNNING 0 #define MD_RECOVERY_SYNC 1 +#define MD_RECOVERY_RECOVER 2 #define MD_RECOVERY_INTR 3 #define MD_RECOVERY_DONE 4 #define MD_RECOVERY_NEEDED 5 -- cgit v1.2.3 From a99ac97113d5bc25ddc4d17f404c2024ac6c57f9 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:43 +1000 Subject: Make sure all changes to md/degraded are notified. When a device fails, when a spare is activated, when an array is reshaped, or when an array is started, the extent to which the array is degraded can change. Signed-off-by: Neil Brown --- Documentation/md.txt | 7 +++++++ drivers/md/md.c | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/md.txt b/Documentation/md.txt index c05bfb55659e..eb6e69e3732e 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -392,6 +392,13 @@ also have achieved. In that case, the transition to "recover" isn't notified, but the transition away is. + degraded + This contains a count of the number of devices by which the + arrays is degraded. So an optimal array with show '0'. A + single failed/missing drive will show '1', etc. + This file responds to select/poll, any increase or decrease + in the count of missing devices will trigger an event. 
+ mismatch_count When performing 'check' and 'repair', and possibly when performing 'resync', md will count the number of errors that are diff --git a/drivers/md/md.c b/drivers/md/md.c index c26dcad8a3ac..60d4cad88c20 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2969,6 +2969,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) err = mddev->pers->start_reshape(mddev); if (err) return err; + sysfs_notify(&mddev->kobj, NULL, "degraded"); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -3686,6 +3687,7 @@ static int do_md_run(mddev_t * mddev) md_new_event(mddev); sysfs_notify(&mddev->kobj, NULL, "array_state"); sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify(&mddev->kobj, NULL, "degraded"); kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); return 0; } @@ -6049,7 +6051,9 @@ void md_check_recovery(mddev_t *mddev) if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* success...*/ /* activate any spares */ - mddev->pers->spare_active(mddev); + if (mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, + "degraded"); } md_update_sb(mddev, 1); -- cgit v1.2.3 From 526647320e696f434647f38421a6ecf65b859c43 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:44 +1000 Subject: Make sure all changes to md/dev-XX/state are notified The important state change happens during an interrupt in md_error. So just set a flag there and call sysfs_notify later in process context. Signed-off-by: Neil Brown --- Documentation/md.txt | 10 ++++++++++ drivers/md/md.c | 14 +++++++++++++- include/linux/raid/md_k.h | 3 +++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Documentation/md.txt b/Documentation/md.txt index eb6e69e3732e..e06cc59437e4 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -297,6 +297,10 @@ Each directory contains: writemostly - device will only be subject to read requests if there are no other options. This applies only to raid1 arrays. + blocked - device has failed, metadata is "external", + and the failure hasn't been acknowledged yet. + Writes that would write to this device if + it were not faulty are blocked. spare - device is working, but not a full member. This includes spares that are in the process of being recovered to @@ -306,6 +310,12 @@ Each directory contains: Writing "remove" removes the device from the array. Writing "writemostly" sets the writemostly flag. Writing "-writemostly" clears the writemostly flag. + Writing "blocked" sets the "blocked" flag. + Writing "-blocked" clear the "blocked" flag and allows writes + to complete. + + This file responds to select/poll. Any change to 'faulty' + or 'blocked' causes an event. errors An approximate count of read errors that have been detected on diff --git a/drivers/md/md.c b/drivers/md/md.c index 60d4cad88c20..dc99d95a1b6d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1886,6 +1886,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) err = 0; } + if (!err) + sysfs_notify(&rdev->kobj, NULL, "state"); return err ? 
err : len; } static struct rdev_sysfs_entry rdev_state = @@ -1979,7 +1981,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (err) { rdev->raid_disk = -1; return err; - } + } else + sysfs_notify(&rdev->kobj, NULL, "state"); sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) printk(KERN_WARNING @@ -1996,6 +1999,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); + sysfs_notify(&rdev->kobj, NULL, "state"); } return len; } @@ -3525,6 +3529,7 @@ static int do_md_run(mddev_t * mddev) return -EINVAL; } } + sysfs_notify(&rdev->kobj, NULL, "state"); } md_probe(mddev->unit, NULL, NULL); @@ -4256,6 +4261,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) } if (err) export_rdev(rdev); + else + sysfs_notify(&rdev->kobj, NULL, "state"); md_update_sb(mddev, 1); if (mddev->degraded) @@ -5115,6 +5122,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(StateChanged, &rdev->flags); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -6037,6 +6045,10 @@ void md_check_recovery(mddev_t *mddev) if (mddev->flags) md_update_sb(mddev, 0); + rdev_for_each(rdev, rtmp, mddev) + if (test_and_clear_bit(StateChanged, &rdev->flags)) + sysfs_notify(&rdev->kobj, NULL, "state"); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 62aa9c9a6ddc..df30c4395875 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -87,6 +87,9 @@ struct mdk_rdev_s #define Blocked 8 /* An error occured on an externally * managed array, don't allow writes * until it is cleared */ +#define StateChanged 9 /* Faulty or Blocked has changed during + * interrupt, so it needs to be + * notified by the thread */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ -- cgit v1.2.3 From 0cd17fec983b6bca505eecee1af33138687220b6 Mon Sep 17 00:00:00 2001 From: Chris Webb Date: Sat, 28 Jun 2008 08:31:46 +1000 Subject: Support changing rdev size on running arrays. From: Chris Webb Allow /sys/block/mdX/md/rdY/size to change on running arrays, moving the superblock if necessary for this metadata version. We prevent the available space from shrinking to less than the used size, and allow it to be set to zero to fill all the available space on the underlying device. 
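For instance, every member of a running array could be grown to use all available space like this (an illustrative sketch; device names are hypothetical, sizes are in kilobytes, and 0 means "use all available space"):

	#include <stdio.h>

	int main(void)
	{
		const char *devs[] = { "dev-sda1", "dev-sdb1" };
		char path[128];
		unsigned int i;

		for (i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
			FILE *f;
			snprintf(path, sizeof(path),
				 "/sys/block/md0/md/%s/size", devs[i]);
			f = fopen(path, "w");
			if (!f)
				return 1;
			fprintf(f, "0\n"); /* fill the underlying device */
			fclose(f);
		}
		return 0;
	}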
Signed-off-by: Chris Webb Signed-off-by: Neil Brown --- drivers/md/md.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 87 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index dc99d95a1b6d..df1230af02cd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -658,11 +658,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) */ struct super_type { - char *name; - struct module *owner; - int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); - int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); - void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); + char *name; + struct module *owner; + int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, + int minor_version); + int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); + void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); + unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, + unsigned long long size); }; /* @@ -1003,6 +1006,27 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->sb_csum = calc_sb_csum(sb); } +/* + * rdev_size_change for 0.90.0 + */ +static unsigned long long +super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size) +{ + if (size && size < rdev->mddev->size) + return 0; /* component must fit device */ + size *= 2; /* convert to sectors */ + if (rdev->mddev->bitmap_offset) + return 0; /* can't move bitmap */ + rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + if (!size || size > rdev->sb_offset*2) + size = rdev->sb_offset*2; + md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size, + rdev->sb_page); + md_super_wait(rdev->mddev); + return size/2; /* kB for sysfs */ +} + + /* * version 1 superblock */ @@ -1328,21 +1352,59 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->sb_csum = calc_sb_1_csum(sb); } +static unsigned long long +super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size) +{ + struct mdp_superblock_1 *sb; + unsigned long long max_size; + if (size && size < rdev->mddev->size) + return 0; /* component must fit device */ + size *= 2; /* convert to sectors */ + if (rdev->sb_offset < rdev->data_offset/2) { + /* minor versions 1 and 2; superblock before data */ + max_size = (rdev->bdev->bd_inode->i_size >> 9); + max_size -= rdev->data_offset; + if (!size || size > max_size) + size = max_size; + } else if (rdev->mddev->bitmap_offset) { + /* minor version 0 with bitmap we can't move */ + return 0; + } else { + /* minor version 0; superblock after data */ + sector_t sb_offset; + sb_offset = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; + sb_offset &= ~(sector_t)(4*2 - 1); + max_size = rdev->size*2 + sb_offset - rdev->sb_offset*2; + if (!size || size > max_size) + size = max_size; + rdev->sb_offset = sb_offset/2; + } + sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); + sb->data_size = cpu_to_le64(size); + sb->super_offset = rdev->sb_offset*2; + sb->sb_csum = calc_sb_1_csum(sb); + md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size, + rdev->sb_page); + md_super_wait(rdev->mddev); + return size/2; /* kB for sysfs */ +} static struct super_type super_types[] = { [0] = { .name = "0.90.0", .owner = THIS_MODULE, - .load_super = super_90_load, - .validate_super = super_90_validate, - .sync_super = super_90_sync, + .load_super = super_90_load, + .validate_super = super_90_validate, + .sync_super = super_90_sync, + .rdev_size_change = super_90_rdev_size_change, }, [1] = { .name = "md-1", .owner = THIS_MODULE, - .load_super = super_1_load, 
- .validate_super = super_1_validate, - .sync_super = super_1_sync, + .load_super = super_1_load, + .validate_super = super_1_validate, + .sync_super = super_1_sync, + .rdev_size_change = super_1_rdev_size_change, }, }; @@ -2060,8 +2122,20 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (e==buf || (*e && *e != '\n')) return -EINVAL; - if (my_mddev->pers && rdev->raid_disk >= 0) - return -EBUSY; + if (my_mddev->pers && rdev->raid_disk >= 0) { + if (rdev->mddev->persistent) { + size = super_types[rdev->mddev->major_version]. + rdev_size_change(rdev, size); + if (!size) + return -EBUSY; + } else if (!size) { + size = (rdev->bdev->bd_inode->i_size >> 10); + size -= rdev->data_offset/2; + } + if (size < rdev->mddev->size) + return -EINVAL; /* component must fit device */ + } + rdev->size = size; if (size > oldsize && rdev->mddev->external) { /* need to check that all other rdevs with the same ->bdev -- cgit v1.2.3 From b203886edbcaac3ca427cf4dbcb50b18bdb346fd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:50 +1000 Subject: md: kill STRIPE_OP_MOD_DMA in raid5 offload From: Dan Williams This micro-optimization allowed the raid code to skip a re-read of the parity block after checking parity. It took advantage of the fact that xor-offload-engines have their own internal result buffer and can check parity without writing to memory. Remove it for the following reasons: 1/ It is a layering violation for MD to need to manage the DMA and non-DMA paths within async_xor_zero_sum 2/ Bad precedent to toggle the 'ops' flags outside the lock 3/ Hard to realize a performance gain as reads will not need an updated parity block and writes will dirty it anyways. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 10 ---------- include/linux/raid/raid5.h | 2 -- 2 files changed, 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8c4e6149daea..60e61d2464b5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -837,15 +837,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && - sh->ops.zero_sum_result == 0) - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -873,11 +868,6 @@ static void ops_run_check(struct stripe_head *sh) tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); - if (tx) - set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - else - clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - atomic_inc(&sh->count); tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, ops_complete_check, sh); diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index f0827d31ae6f..4ecae31a3dcb 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -267,10 +267,8 @@ struct r6_state { /* modifiers to the base operations * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back - * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check */ #define STRIPE_OP_MOD_REPAIR_PD 7 -#define STRIPE_OP_MOD_DMA_CHECK 8 /* * Plugging: -- cgit v1.2.3 From 2b7497f0e0a0b9cf21d822e427d5399b2056501a Mon Sep 17 00:00:00 
2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:52 +1000 Subject: md: kill STRIPE_OP_IO flag From: Dan Williams The R5_Want{Read,Write} flags already gate i/o. So, this flag is superfluous and we can unconditionally call ops_run_io(). Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 32 +++++--------------------------- include/linux/raid/raid5.h | 1 - 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 60e61d2464b5..cac97080b278 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -373,8 +373,6 @@ static unsigned long get_stripe_work(struct stripe_head *sh) test_and_ack_op(STRIPE_OP_BIODRAIN, pending); test_and_ack_op(STRIPE_OP_POSTXOR, pending); test_and_ack_op(STRIPE_OP_CHECK, pending); - if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) - ack++; sh->ops.count -= ack; if (unlikely(sh->ops.count < 0)) { @@ -399,7 +397,6 @@ static void ops_run_io(struct stripe_head *sh) might_sleep(); - set_bit(STRIPE_IO_STARTED, &sh->state); for (i = disks; i--; ) { int rw; struct bio *bi; @@ -433,6 +430,8 @@ static void ops_run_io(struct stripe_head *sh) test_bit(STRIPE_EXPAND_READY, &sh->state)) md_sync_acct(rdev->bdev, STRIPE_SECTORS); + set_bit(STRIPE_IO_STARTED, &sh->state); + bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, @@ -900,9 +899,6 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) if (test_bit(STRIPE_OP_CHECK, &pending)) ops_run_check(sh); - if (test_bit(STRIPE_OP_IO, &pending)) - ops_run_io(sh); - if (overlap_clear) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2013,8 +2009,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, */ set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); @@ -2208,9 +2202,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2234,9 +2225,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2444,8 +2432,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); s->locked++; @@ -2801,9 +2787,6 @@ static void handle_stripe5(struct stripe_head *sh) (i == sh->pd_idx || dev->written)) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; if (prexor) continue; if (!test_bit(R5_Insync, &dev->flags) || @@ -2857,16 +2840,12 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[s.failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; 
set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); s.locked++; } @@ -2884,13 +2863,10 @@ static void handle_stripe5(struct stripe_head *sh) clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - for (i = conf->raid_disks; i--; ) { + for (i = conf->raid_disks; i--; ) set_bit(R5_Wantwrite, &sh->dev[i].flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; - } } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && @@ -2926,6 +2902,8 @@ static void handle_stripe5(struct stripe_head *sh) if (pending) raid5_run_ops(sh, pending); + ops_run_io(sh); + return_io(return_bi); } diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 4ecae31a3dcb..1301195abf4b 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -263,7 +263,6 @@ struct r6_state { #define STRIPE_OP_BIODRAIN 3 #define STRIPE_OP_POSTXOR 4 #define STRIPE_OP_CHECK 5 -#define STRIPE_OP_IO 6 /* modifiers to the base operations * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back -- cgit v1.2.3 From c4e5ac0a22e664eecf29249553cf16c2433f5f25 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:53 +1000 Subject: md: use stripe_head_state in ops_run_io() From: Dan Williams In handle_stripe after taking sh->lock we sample some bits into 's' (struct stripe_head_state): s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); Use these values from 's' in ops_run_io() rather than re-sampling the bits. This ensures a consistent snapshot (as seen under sh->lock) is used. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cac97080b278..c4ef3071c290 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -390,7 +390,7 @@ raid5_end_read_request(struct bio *bi, int error); static void raid5_end_write_request(struct bio *bi, int error); -static void ops_run_io(struct stripe_head *sh) +static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { raid5_conf_t *conf = sh->raid_conf; int i, disks = sh->disks; @@ -425,9 +425,7 @@ static void ops_run_io(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (test_bit(STRIPE_SYNCING, &sh->state) || - test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || - test_bit(STRIPE_EXPAND_READY, &sh->state)) + if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -2902,10 +2900,9 @@ static void handle_stripe5(struct stripe_head *sh) if (pending) raid5_run_ops(sh, pending); - ops_run_io(sh); + ops_run_io(sh, &s); return_io(return_bi); - } static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) -- cgit v1.2.3 From f0e43bcdebf709d747a3effb210aff1941e819ab Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:55 +1000 Subject: md: unify raid5/6 i/o submission From: Dan Williams Let the raid6 path call ops_run_io to get pending i/o submitted. 
Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 63 ++---------------------------------------------------- 1 file changed, 2 insertions(+), 61 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c4ef3071c290..6f3dd12dd3a4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3114,68 +3114,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - return_io(return_bi); - - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else - continue; - - set_bit(STRIPE_IO_STARTED, &sh->state); - - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (s.syncing || s.expanding || s.expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); + ops_run_io(sh, &s); - bi->bi_bdev = rdev->bdev; - pr_debug("for %llu schedule op %ld on disc %d\n", - (unsigned long long)sh->sector, bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - generic_make_request(bi); - } else { - if (rw == WRITE) - set_bit(STRIPE_DEGRADED, &sh->state); - pr_debug("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } - } + return_io(return_bi); } static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) -- cgit v1.2.3 From ecc65c9b3f9b9d740a5deade3d85b39be56401b6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:57 +1000 Subject: md: replace STRIPE_OP_CHECK with 'check_states' From: Dan Williams The STRIPE_OP_* flags record the state of stripe operations which are performed outside the stripe lock. Their use in indicating which operations need to be run is straightforward; however, interpolating what the next state of the stripe should be based on a given combination of these flags is not straightforward, and has led to bugs. An easier-to-read implementation with minimal degrees of freedom is needed. Towards this goal, this patch introduces explicit states to replace what was previously interpolated from the STRIPE_OP_* flags. For now this only converts the handle_parity_checks5 path, removing a user of the ops.{pending,ack,complete,count} fields of struct stripe_operations. This conversion also found a remaining issue with the current code. There is a small window for a drive to fail between when we schedule a repair and when the parity calculation for that repair completes. When this happens we will write back to 'failed_num' when we really want to write back to 'pd_idx'.
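For illustration, a minimal user-space sketch of the explicit-state approach (transitions simplified; step() and parity_ok are hypothetical, not the kernel implementation). The point is that the next action falls out of a switch on one variable instead of being interpolated from several flag bits.

#include <stdio.h>

enum check_states {
	check_state_idle = 0,
	check_state_run,		/* parity check in flight */
	check_state_check_result,	/* check result ready */
	check_state_compute_run,	/* repair in flight */
	check_state_compute_result,	/* repaired parity ready */
};

static enum check_states step(enum check_states st, int parity_ok)
{
	switch (st) {
	case check_state_idle:
		return check_state_run;			/* schedule a check */
	case check_state_run:
		return check_state_check_result;	/* async completion */
	case check_state_check_result:
		return parity_ok ? check_state_idle
				 : check_state_compute_run;
	case check_state_compute_run:
		return check_state_compute_result;	/* async completion */
	case check_state_compute_result:
		return check_state_idle;		/* write parity back */
	}
	return check_state_idle;
}

int main(void)
{
	enum check_states st = check_state_idle;
	int i;

	/* walk one full repair cycle for a stripe whose parity is bad */
	for (i = 0; i < 5; i++) {
		printf("check_state = %d\n", st);
		st = step(st, 0);
	}
	return 0;
}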
Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 172 ++++++++++++++++++++++----------------------- include/linux/raid/raid5.h | 46 ++++++++++-- 2 files changed, 123 insertions(+), 95 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6f3dd12dd3a4..544e1600f208 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -605,7 +605,11 @@ static void ops_complete_compute5(void *stripe_head_ref) set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; + else + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -838,7 +842,7 @@ static void ops_complete_check(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -870,7 +874,8 @@ static void ops_run_check(struct stripe_head *sh) ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) +static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, + unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -880,7 +885,8 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) + if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) || + test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) tx = ops_run_compute5(sh, pending); if (test_bit(STRIPE_OP_PREXOR, &pending)) @@ -894,7 +900,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) if (test_bit(STRIPE_OP_POSTXOR, &pending)) ops_run_postxor(sh, tx, pending); - if (test_bit(STRIPE_OP_CHECK, &pending)) + if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); if (overlap_clear) @@ -1961,8 +1967,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, /* don't schedule compute operations or reads on the parity block while * a check is in flight */ - if ((disk_idx == sh->pd_idx) && - test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + if (disk_idx == sh->pd_idx && sh->check_state) return ~0; /* is the data in this block needed, and can we get it? */ @@ -1983,9 +1988,8 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * 3/ We hold off parity block re-reads until check operations * have quiesced. */ - if ((s->uptodate == disks - 1) && - (s->failed && disk_idx == s->failed_num) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + if ((s->uptodate == disks - 1) && !sh->check_state && + (s->failed && disk_idx == s->failed_num)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; @@ -2021,12 +2025,8 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, { int i; - /* Clear completed compute operations. 
Parity recovery - * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled - * later on in this routine - */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + /* Clear completed compute operations */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) { clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); @@ -2350,90 +2350,85 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { - int canceled_check = 0; + struct r5dev *dev = NULL; set_bit(STRIPE_HANDLE, &sh->state); - /* complete a check operation */ - if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ if (s->failed == 0) { - if (sh->ops.zero_sum_result == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += - STRIPE_SECTORS; - if (test_bit( - MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - set_bit(STRIPE_OP_COMPUTE_BLK, - &sh->ops.pending); - set_bit(STRIPE_OP_MOD_REPAIR_PD, - &sh->ops.pending); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.count++; - s->uptodate++; - } - } - } else - canceled_check = 1; /* STRIPE_INSYNC is not set */ - } - - /* start a new check operation if there are no failures, the stripe is - * not insync, and a repair is not in flight - */ - if (s->failed == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; s->uptodate--; + break; } - } - - /* check if we can clear a parity disk reconstruct */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - + dev = &sh->dev[s->failed_num]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; - /* Wait for check parity and compute block operations to complete - * before write-back. 
If a failure occurred while the check operation - * was in flight we need to cycle this stripe through handle_stripe - * since the parity block may not be uptodate - */ - if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { - struct r5dev *dev; /* either failed parity check, or recovery is happening */ - if (s->failed == 0) - s->failed_num = sh->pd_idx; - dev = &sh->dev[s->failed_num]; BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(s->uptodate != disks); set_bit(R5_LOCKED, &dev->flags); + s->locked++; set_bit(R5_Wantwrite, &dev->flags); clear_bit(STRIPE_DEGRADED, &sh->state); - s->locked++; set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@ -2807,7 +2802,7 @@ static void handle_stripe5(struct stripe_head *sh) * block. */ if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + !sh->check_state) handle_issuing_new_write_requests5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe @@ -2815,11 +2810,10 @@ static void handle_stripe5(struct stripe_head *sh) * data is available. The parity check is held off while parity * dependent operations are in flight. */ - if ((s.syncing && s.locked == 0 && + if (sh->check_state || + (s.syncing && s.locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_INSYNC, &sh->state)) || - test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) + !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks5(conf, sh, &s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { @@ -2897,8 +2891,8 @@ static void handle_stripe5(struct stripe_head *sh) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - if (pending) - raid5_run_ops(sh, pending); + if (pending || s.ops_request) + raid5_run_ops(sh, pending, s.ops_request); ops_run_io(sh, &s); diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 1301195abf4b..2c96d5fd54bf 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -158,6 +158,41 @@ * the compute block completes. 
*/ +/* + * Operations state - intermediate states that are visible outside of sh->lock + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_drain_result, + reconstruct_state_result, +}; + struct stripe_head { struct hlist_node hash; struct list_head lru; /* inactive_list or handle_list */ @@ -169,6 +204,7 @@ struct stripe_head { spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ + enum check_states check_state; /* stripe_operations * @pending - pending ops flags (set for request->issue->complete) * @ack - submitted ops flags (set for issue->complete) @@ -202,6 +238,7 @@ struct stripe_head_state { int locked, uptodate, to_read, to_write, failed, written; int to_fill, compute, req_compute, non_overwrite; int failed_num; + unsigned long ops_request; }; /* r6_state - extra state data only relevant to r6 */ @@ -254,8 +291,10 @@ struct r6_state { #define STRIPE_EXPAND_READY 11 #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ +#define STRIPE_BIOFILL_RUN 14 +#define STRIPE_COMPUTE_RUN 15 /* - * Operations flags (in issue order) + * Operation request flags */ #define STRIPE_OP_BIOFILL 0 #define STRIPE_OP_COMPUTE_BLK 1 @@ -264,11 +303,6 @@ struct r6_state { #define STRIPE_OP_POSTXOR 4 #define STRIPE_OP_CHECK 5 -/* modifiers to the base operations - * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back - */ -#define STRIPE_OP_MOD_REPAIR_PD 7 - /* * Plugging: * -- cgit v1.2.3 From 83de75cc92be599850e5ef3928e07cd840833499 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:58 +1000 Subject: md: replace STRIPE_OP_BIOFILL with STRIPE_BIOFILL_RUN From: Dan Williams Track the state of read operations (copying data from the stripe cache to bio buffers outside the lock) with a state flag. Reduce the scope of the STRIPE_OP_BIOFILL flag to only tracking whether a biofill operation has been requested via the ops_request field of struct stripe_head_state. This is another step towards the removal of ops.{pending,ack,complete,count}, i.e. STRIPE_OP_BIOFILL only requests an operation and does not track the state of the operation. 
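For illustration, a minimal user-space sketch of the request/run split (bit values and names here are assumptions, not the kernel's): the per-pass ops_request word only asks for work, while a RUN bit in the long-lived state word holds off new requests until the async completion clears it.

#include <stdio.h>

#define STRIPE_BIOFILL_RUN (1UL << 14)	/* long-lived: op in flight */
#define STRIPE_OP_BIOFILL  (1UL << 0)	/* per-pass: op requested */

static void handle_pass(unsigned long *state, int to_fill)
{
	unsigned long ops_request = 0;	/* rebuilt on every pass */

	if (to_fill && !(*state & STRIPE_BIOFILL_RUN)) {
		ops_request |= STRIPE_OP_BIOFILL;	/* request it... */
		*state |= STRIPE_BIOFILL_RUN;		/* ...and mark it running */
	}
	if (ops_request & STRIPE_OP_BIOFILL)
		printf("run biofill\n");
}

static void complete_biofill(unsigned long *state)
{
	*state &= ~STRIPE_BIOFILL_RUN;	/* completion re-enables requests */
}

int main(void)
{
	unsigned long state = 0;

	handle_pass(&state, 1);		/* issues biofill */
	handle_pass(&state, 1);		/* held off: still running */
	complete_biofill(&state);
	handle_pass(&state, 1);		/* may issue again */
	return 0;
}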
Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 544e1600f208..b9c0a32a4f95 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -523,38 +523,34 @@ static void ops_complete_biofill(void *stripe_head_ref) (unsigned long long)sh->sector); /* clear completed biofills */ + spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* acknowledge completion of a biofill operation */ /* and check if we need to reply to a read request, * new R5_Wantfill requests are held off until - * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) + * !STRIPE_BIOFILL_RUN */ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi, *rbi2; - /* The access to dev->read is outside of the - * spin_lock_irq(&conf->device_lock), but is protected - * by the STRIPE_OP_BIOFILL pending bit - */ BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); if (--rbi->bi_phys_segments == 0) { rbi->bi_next = return_bi; return_bi = rbi; } - spin_unlock_irq(&conf->device_lock); rbi = rbi2; } } } - set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + spin_unlock_irq(&conf->device_lock); + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -880,7 +876,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; - if (test_bit(STRIPE_OP_BIOFILL, &pending)) { + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } @@ -2630,15 +2626,8 @@ static void handle_stripe5(struct stripe_head *sh) s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - /* clean-up completed biofill operations */ - if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); - } + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@ -2652,10 +2641,10 @@ static void handle_stripe5(struct stripe_head *sh) /* maybe we can request a biofill operation * * new wantfill requests are only permitted while - * STRIPE_OP_BIOFILL is clear + * ops_complete_biofill is guaranteed to be inactive */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ @@ -2699,8 +2688,10 @@ static void handle_stripe5(struct stripe_head *sh) goto unlock; } - if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) - sh->ops.count++; + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", -- cgit v1.2.3 From 976ea8d475675da6e86bd434328814ccbf5ae641 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:32:03 +1000 Subject: md: replace STRIPE_OP_COMPUTE_BLK with 
STRIPE_COMPUTE_RUN From: Dan Williams Track the state of compute operations (recalculating a block from all the other blocks in a stripe) with a state flag. This reduces the scope of the STRIPE_OP_COMPUTE_BLK flag to only tracking whether a compute operation has been requested via the ops_request field of struct stripe_head_state. Note that the compute operation performed in the course of a 'repair' operation (check the parity block, recalculate it and write it back if the check result is not zero) is tracked separately with the 'check_state' variable. Compute operations are held off while a 'check' is in progress, and by moving this check out to handle_issuing_new_read_requests5, the helper routine __handle_issuing_new_read_requests5 can be simplified. This is another step towards the removal of ops.{pending,ack,complete,count}, i.e. STRIPE_OP_COMPUTE_BLK only requests an operation and does not track the state of the operation. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 65 +++++++++++++++--------------------------------------- 1 file changed, 18 insertions(+), 47 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b9c0a32a4f95..835046bf384e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -604,8 +604,6 @@ static void ops_complete_compute5(void *stripe_head_ref) clear_bit(STRIPE_COMPUTE_RUN, &sh->state); if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; - else - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -881,8 +879,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) || - test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) tx = ops_run_compute5(sh, pending); @@ -1960,12 +1957,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; - /* don't schedule compute operations or reads on the parity block while - * a check is in flight - */ - if (disk_idx == sh->pd_idx && sh->check_state) - return ~0; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || @@ -1974,23 +1965,16 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, (failed_dev->toread || (failed_dev->towrite && !test_bit(R5_OVERWRITE, &failed_dev->flags) ))))) { - /* 1/ We would like to get this block, possibly by computing it, - * but we might not be able to. - * - * 2/ Since parity check operations potentially make the parity - * block !uptodate it will need to be refreshed before any - * compute operations on data disks are scheduled. - * - * 3/ We hold off parity block re-reads until check operations - * have quiesced. 
+ /* We would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync */ - if ((s->uptodate == disks - 1) && !sh->check_state && + if ((s->uptodate == disks - 1) && (s->failed && disk_idx == s->failed_num)) { - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; s->req_compute = 1; - sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will @@ -1999,12 +1983,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, */ s->uptodate++; return 0; /* uptodate + compute == disks */ - } else if ((s->uptodate < disks - 1) && - test_bit(R5_Insync, &dev->flags)) { - /* Note: we hold off compute operations while checks are - * in flight, but we still prefer 'compute' over 'read' - * hence we only read if (uptodate < * disks-1) - */ + } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -2021,20 +2000,13 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, { int i; - /* Clear completed compute operations */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write */ - if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { for (i = disks; i--; ) if (__handle_issuing_new_read_requests5( sh, s, i, disks) == 0) @@ -2236,10 +2208,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ - if ((s->req_compute || - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && - (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) s->locked += handle_write_operations5(sh, rcw == 0, 0); } @@ -2410,6 +2381,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(STRIPE_INSYNC, &sh->state); else { sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &sh->dev[sh->pd_idx].flags); @@ -2725,8 +2697,7 @@ static void handle_stripe5(struct stripe_head *sh) * or to load a block that is being partially written. 
*/ if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || - test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) handle_issuing_new_read_requests5(sh, &s, disks); /* Now we check to see if any write operations have recently @@ -2803,7 +2774,7 @@ */ if (sh->check_state || (s.syncing && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks5(conf, sh, &s, disks); @@ -2869,7 +2840,7 @@ } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, NULL); if (sh->ops.count) @@ -3089,7 +3060,7 @@ } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, &r6s); unlock: -- cgit v1.2.3 From 600aa10993012ff2dd5617720dac081e4f992017 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:32:05 +1000 Subject: md: replace STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} with 'reconstruct_states' From: Dan Williams Track the state of reconstruct operations (recalculating the parity block, usually due to incoming writes, or as part of array expansion). This reduces the scope of the STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} flags to only tracking whether a reconstruct operation has been requested via the ops_request field of struct stripe_head_state. This is the final step in the removal of ops.{pending,ack,complete,count}, i.e. the STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} flags only request an operation and do not track the state of the operation. 
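For illustration, a minimal user-space sketch with simplified transitions (schedule_reconstruction and ops_complete are stand-ins, not the kernel functions): one enum field replaces the pending/ack/complete bitmasks plus the count.

#include <stdio.h>

enum reconstruct_states {
	reconstruct_state_idle = 0,
	reconstruct_state_drain_run,	/* write in flight */
	reconstruct_state_run,		/* expand in flight */
	reconstruct_state_drain_result,	/* write finished */
	reconstruct_state_result,	/* expand finished */
};

struct stripe { enum reconstruct_states reconstruct_state; };

static void schedule_reconstruction(struct stripe *sh, int expand)
{
	sh->reconstruct_state = expand ? reconstruct_state_run
				       : reconstruct_state_drain_run;
}

/* async completion callback */
static void ops_complete(struct stripe *sh)
{
	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else
		sh->reconstruct_state = reconstruct_state_result;
}

int main(void)
{
	struct stripe sh = { reconstruct_state_idle };

	schedule_reconstruction(&sh, 0);	/* a write */
	ops_complete(&sh);
	if (sh.reconstruct_state == reconstruct_state_drain_result)
		printf("write back drained blocks, then go idle\n");
	sh.reconstruct_state = reconstruct_state_idle;
	return 0;
}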
Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 204 ++++++++++++++------------------------------- include/linux/raid/raid5.h | 9 +- 2 files changed, 63 insertions(+), 150 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 835046bf384e..b9159367491a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -122,6 +122,13 @@ static void return_io(struct bio *return_bi) static void print_raid5_conf (raid5_conf_t *conf); +static int stripe_operations_active(struct stripe_head *sh) +{ + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); +} + static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { @@ -141,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { - BUG_ON(sh->ops.pending); + BUG_ON(stripe_operations_active(sh)); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -243,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + BUG_ON(stripe_operations_active(sh)); CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", @@ -344,47 +351,6 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } -/* test_and_ack_op() ensures that we only dequeue an operation once */ -#define test_and_ack_op(op, pend) \ -do { \ - if (test_bit(op, &sh->ops.pending) && \ - !test_bit(op, &sh->ops.complete)) { \ - if (test_and_set_bit(op, &sh->ops.ack)) \ - clear_bit(op, &pend); \ - else \ - ack++; \ - } else \ - clear_bit(op, &pend); \ -} while (0) - -/* find new work to run, do not resubmit work that is already - * in flight - */ -static unsigned long get_stripe_work(struct stripe_head *sh) -{ - unsigned long pending; - int ack = 0; - - pending = sh->ops.pending; - - test_and_ack_op(STRIPE_OP_BIOFILL, pending); - test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); - test_and_ack_op(STRIPE_OP_PREXOR, pending); - test_and_ack_op(STRIPE_OP_BIODRAIN, pending); - test_and_ack_op(STRIPE_OP_POSTXOR, pending); - test_and_ack_op(STRIPE_OP_CHECK, pending); - - sh->ops.count -= ack; - if (unlikely(sh->ops.count < 0)) { - printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " - "ops.complete: %#lx\n", pending, sh->ops.pending, - sh->ops.ack, sh->ops.complete); - BUG(); - } - - return pending; -} - static void raid5_end_read_request(struct bio *bi, int error); static void @@ -609,7 +575,7 @@ static void ops_complete_compute5(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, unsigned long pending) +ops_run_compute5(struct stripe_head *sh, unsigned long ops_request) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -640,7 +606,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) ops_complete_compute5, sh); /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) async_tx_ack(tx); return tx; @@ -652,8 +618,6 @@ static void ops_complete_prexor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", 
__func__, (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); } static struct dma_async_tx_descriptor * @@ -686,7 +650,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) static struct dma_async_tx_descriptor * ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) + unsigned long ops_request) { int disks = sh->disks; int pd_idx = sh->pd_idx, i; @@ -694,7 +658,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, /* check if prexor is active which means only process blocks * that are part of a read-modify-write (Wantprexor) */ - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request); pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -744,7 +708,7 @@ static void ops_complete_postxor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + sh->reconstruct_state = reconstruct_state_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -763,16 +727,14 @@ static void ops_complete_write(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); } - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - + sh->reconstruct_state = reconstruct_state_drain_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) + unsigned long ops_request) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -780,7 +742,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request); unsigned long flags; dma_async_tx_callback callback; @@ -807,7 +769,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? + callback = test_bit(STRIPE_OP_BIODRAIN, &ops_request) ? 
ops_complete_write : ops_complete_postxor; /* 1/ if we prexor'd then the dest is reused as a source @@ -868,8 +830,7 @@ static void ops_run_check(struct stripe_head *sh) ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, - unsigned long ops_request) +static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -880,18 +841,18 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, } if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) - tx = ops_run_compute5(sh, pending); + tx = ops_run_compute5(sh, ops_request); - if (test_bit(STRIPE_OP_PREXOR, &pending)) + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); - if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { - tx = ops_run_biodrain(sh, tx, pending); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx, ops_request); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &pending)) - ops_run_postxor(sh, tx, pending); + if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) + ops_run_postxor(sh, tx, ops_request); if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); @@ -1684,11 +1645,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } } -static int -handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +static void +handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; - int locked = 0; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1696,12 +1657,12 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) * stripe cache */ if (!expand) { - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - sh->ops.count++; - } + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - sh->ops.count++; + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1710,21 +1671,20 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) set_bit(R5_LOCKED, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } - if (locked + 1 == disks) + if (s->locked + 1 == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - - sh->ops.count += 3; + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1742,7 +1702,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) set_bit(R5_Wantprexor, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } } @@ -1752,13 +1712,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, 
&sh->dev[pd_idx].flags); - locked++; + s->locked++; - pr_debug("%s: stripe %llu locked: %d pending: %lx\n", + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, - locked, sh->ops.pending); - - return locked; + s->locked, s->ops_request); } /* @@ -2005,8 +1963,7 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, * midst of changing due to a write */ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + !sh->reconstruct_state) { for (i = disks; i--; ) if (__handle_issuing_new_read_requests5( sh, s, i, disks) == 0) @@ -2211,7 +2168,7 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) - s->locked += handle_write_operations5(sh, rcw == 0, 0); + handle_write_operations5(sh, s, rcw == 0, 0); } static void handle_issuing_new_write_requests6(raid5_conf_t *conf, @@ -2581,15 +2538,14 @@ static void handle_stripe5(struct stripe_head *sh) struct bio *return_bi = NULL; struct stripe_head_state s; struct r5dev *dev; - unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; int prexor; memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " - "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, - sh->ops.pending, sh->ops.ack, sh->ops.complete); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " + "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->check_state, + sh->reconstruct_state); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@ -2703,34 +2659,12 @@ static void handle_stripe5(struct stripe_head *sh) /* Now we check to see if any write operations have recently * completed */ - - /* leave prexor set until postxor is done, allows us to distinguish - * a rmw from a rcw during biodrain - */ prexor = 0; - if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - prexor = 1; - clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - + if (sh->reconstruct_state == reconstruct_state_drain_result) { + sh->reconstruct_state = reconstruct_state_idle; for (i = disks; i--; ) - clear_bit(R5_Wantprexor, &sh->dev[i].flags); - } - - /* if only POSTXOR is set then this is an 'expand' postxor */ - if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + prexor += test_and_clear_bit(R5_Wantprexor, + &sh->dev[i].flags); /* All the 'written' buffers and the parity block are ready to * be written back to disk @@ -2763,8 +2697,7 @@ static void handle_stripe5(struct stripe_head *sh) * 2/ A 'check' operation is in flight, as it may clobber the parity * block. 
*/ - if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && - !sh->check_state) + if (s.to_write && !sh->reconstruct_state && !sh->check_state) handle_issuing_new_write_requests5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe @@ -2805,18 +2738,10 @@ static void handle_stripe5(struct stripe_head *sh) } } - /* Finish postxor operations initiated by the expansion - * process - */ - if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && - !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { - + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - for (i = conf->raid_disks; i--; ) set_bit(R5_Wantwrite, &sh->dev[i].flags); set_bit(R5_LOCKED, &dev->flags); @@ -2824,15 +2749,13 @@ static void handle_stripe5(struct stripe_head *sh) } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + !sh->reconstruct_state) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - s.locked += handle_write_operations5(sh, 1, 1); - } else if (s.expanded && - s.locked == 0 && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + handle_write_operations5(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2843,9 +2766,6 @@ static void handle_stripe5(struct stripe_head *sh) !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, NULL); - if (sh->ops.count) - pending = get_stripe_work(sh); - unlock: spin_unlock(&sh->lock); @@ -2853,8 +2773,8 @@ static void handle_stripe5(struct stripe_head *sh) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - if (pending || s.ops_request) - raid5_run_ops(sh, pending, s.ops_request); + if (s.ops_request) + raid5_run_ops(sh, s.ops_request); ops_run_io(sh, &s); diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 2c96d5fd54bf..5f3e674b87dd 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -205,19 +205,12 @@ struct stripe_head { int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ enum check_states check_state; + enum reconstruct_states reconstruct_state; /* stripe_operations - * @pending - pending ops flags (set for request->issue->complete) - * @ack - submitted ops flags (set for issue->complete) - * @complete - completed ops flags (set for complete) * @target - STRIPE_OP_COMPUTE_BLK target - * @count - raid5_runs_ops is set to run when this is non-zero */ struct stripe_operations { - unsigned long pending; - unsigned long ack; - unsigned long complete; int target; - int count; u32 zero_sum_result; } ops; struct r5dev { -- cgit v1.2.3 From d8ee0728b5b30d7a6f62c399a95e953616d31f23 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:32:06 +1000 Subject: md: replace R5_WantPrexor with R5_WantDrain, add 'prexor' reconstruct_states From: Dan Williams Currently ops_run_biodrain and other locations have extra logic to determine which blocks are processed in the prexor 
and non-prexor cases. This can be eliminated if handle_write_operations5 flags the blocks to be processed in all cases via R5_Wantdrain. The presence of the prexor operation is tracked in sh->reconstruct_state. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 89 +++++++++++++++------------------------------- include/linux/raid/raid5.h | 6 ++-- 2 files changed, 32 insertions(+), 63 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b9159367491a..c71246061c0e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -637,7 +637,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) + if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@ -649,16 +649,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } static struct dma_async_tx_descriptor * -ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long ops_request) +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - int pd_idx = sh->pd_idx, i; - - /* check if prexor is active which means only process blocks - * that are part of a read-modify-write (Wantprexor) - */ - int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request); + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -666,20 +660,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; struct bio *chosen; - int towrite; - towrite = 0; - if (prexor) { /* rmw */ - if (dev->towrite && - test_bit(R5_Wantprexor, &dev->flags)) - towrite = 1; - } else { /* rcw */ - if (i != pd_idx && dev->towrite && - test_bit(R5_LOCKED, &dev->flags)) - towrite = 1; - } - - if (towrite) { + if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; spin_lock(&sh->lock); @@ -702,18 +684,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } static void ops_complete_postxor(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - sh->reconstruct_state = reconstruct_state_result; - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void ops_complete_write(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; @@ -727,14 +697,21 @@ static void ops_complete_write(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); } - sh->reconstruct_state = reconstruct_state_drain_result; + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } + set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long ops_request) +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -742,9 +719,8 @@ 
ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request); + int prexor = 0; unsigned long flags; - dma_async_tx_callback callback; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -752,7 +728,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (prexor) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -768,10 +745,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } } - /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &ops_request) ? - ops_complete_write : ops_complete_postxor; - /* 1/ if we prexor'd then the dest is reused as a source * 2/ if we did not prexor then we are redoing the parity * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST @@ -785,10 +758,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, if (unlikely(count == 1)) { flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } static void ops_complete_check(void *stripe_head_ref) @@ -847,12 +820,12 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) tx = ops_run_prexor(sh, tx); if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { - tx = ops_run_biodrain(sh, tx, ops_request); + tx = ops_run_biodrain(sh, tx); overlap_clear++; } if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) - ops_run_postxor(sh, tx, ops_request); + ops_run_postxor(sh, tx); if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); @@ -1669,6 +1642,7 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, if (dev->towrite) { set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); s->locked++; @@ -1681,7 +1655,7 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - sh->reconstruct_state = reconstruct_state_drain_run; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; set_bit(STRIPE_OP_PREXOR, &s->ops_request); set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); set_bit(STRIPE_OP_POSTXOR, &s->ops_request); @@ -1691,15 +1665,10 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, if (i == pd_idx) continue; - /* For a read-modify write there may be blocks that are - * locked for reading while others are ready to be - * written so we distinguish these blocks by the - * R5_Wantprexor bit - */ if (dev->towrite && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - set_bit(R5_Wantprexor, &dev->flags); + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); s->locked++; @@ -2660,11 +2629,11 @@ static void handle_stripe5(struct stripe_head 
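For illustration, a minimal user-space sketch of the chain-ownership point (struct tx, tx_ack and run_ops are hypothetical stand-ins for the async_tx API): only the function that sees the whole chain can know whether a handle must stay alive for a dependent operation.

#include <stdbool.h>
#include <stdio.h>

struct tx { bool acked; };

static struct tx *run_compute(struct tx *t)
{
	printf("compute block\n");
	return t;	/* handle kept alive for a possible dependent op */
}

static void run_postxor(struct tx *t)
{
	printf("postxor depends on compute\n");
	t->acked = true;
}

static void tx_ack(struct tx *t)
{
	t->acked = true;
	printf("ack: chain terminates here\n");
}

static void run_ops(bool want_compute, bool want_postxor)
{
	struct tx chain = { false };
	struct tx *tx = NULL;

	if (want_compute) {
		tx = run_compute(&chain);
		/* the chain builder, not the op itself, ends the chain */
		if (tx && !want_postxor)
			tx_ack(tx);
	}
	if (want_postxor && tx)
		run_postxor(tx);
}

int main(void)
{
	run_ops(true, false);	/* compute satisfies a read: ack at once */
	run_ops(true, true);	/* compute feeds postxor: keep the handle */
	return 0;
}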
*sh) * completed */ prexor = 0; - if (sh->reconstruct_state == reconstruct_state_drain_result) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) + prexor = 1; + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { sh->reconstruct_state = reconstruct_state_idle; - for (i = disks; i--; ) - prexor += test_and_clear_bit(R5_Wantprexor, - &sh->dev[i].flags); /* All the 'written' buffers and the parity block are ready to * be written back to disk diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 5f3e674b87dd..3b2672792457 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -187,8 +187,10 @@ enum check_states { */ enum reconstruct_states { reconstruct_state_idle = 0, + reconstruct_state_prexor_drain_run, /* prexor-write */ reconstruct_state_drain_run, /* write */ reconstruct_state_run, /* expand */ + reconstruct_state_prexor_drain_result, reconstruct_state_drain_result, reconstruct_state_result, }; @@ -258,9 +260,7 @@ struct r6_state { #define R5_Wantfill 12 /* dev->toread contains a bio that needs * filling */ -#define R5_Wantprexor 13 /* distinguish blocks ready for rmw from - * other "towrites" - */ +#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ /* * Write method */ -- cgit v1.2.3 From 7b3a871ed995270268a481404454ceafe1a87478 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:32:09 +1000 Subject: md: handle operation chaining in raid5_run_ops From: Dan Williams Neil said: > At the end of ops_run_compute5 you have: > /* ack now if postxor is not set to be run */ > if (tx && !test_bit(STRIPE_OP_POSTXOR, &s->ops_run)) > async_tx_ack(tx); > > It looks odd having that test there. Would it fit in raid5_run_ops > better? The intended global interpretation is that raid5_run_ops can build a chain of xor and memcpy operations. When MD registers the compute-xor it tells async_tx to keep the operation handle around so that another item in the dependency chain can be submitted. If we are just computing a block to satisfy a read then we can terminate the chain immediately. raid5_run_ops gives a better context for this test since it cares about the entire chain. 
Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c71246061c0e..456c3c2c961d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -574,8 +574,7 @@ static void ops_complete_compute5(void *stripe_head_ref) release_stripe(sh); } -static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, unsigned long ops_request) +static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -605,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long ops_request) ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute5, sh); - /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) - async_tx_ack(tx); - return tx; } @@ -813,8 +808,12 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) - tx = ops_run_compute5(sh, ops_request); + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + tx = ops_run_compute5(sh); + /* terminate the chain if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + async_tx_ack(tx); + } if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); -- cgit v1.2.3 From 1fe797e67fb07d605b82300934d0de67068a0aca Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 09:16:30 +1000 Subject: md: rationalize raid5 function names From: Dan Williams Commit a4456856 refactored some of the deep code paths in raid5.c into separate functions. The names chosen at the time do not consistently indicate what is going to happen to the stripe. So, update the names, and since a stripe is a cache element use cache semantics like fill, dirty, and clean. (also, fix up the indentation in fetch_block5) Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 76 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 456c3c2c961d..442622067cae 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1618,7 +1618,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } static void -handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, +schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; @@ -1783,7 +1783,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) } static void -handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, +handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi) { @@ -1874,23 +1874,28 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, md_wakeup_thread(conf->mddev->thread); } -/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks - * to process +/* fetch_block5 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. 
+ * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill5 to continue */ -static int __handle_issuing_new_read_requests5(struct stripe_head *sh, - struct stripe_head_state *s, int disk_idx, int disks) +static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || (s->failed && - (failed_dev->toread || (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags) - ))))) { + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed && + (failed_dev->toread || + (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { /* We would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -1908,7 +1913,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * subsequent operation. */ s->uptodate++; - return 0; /* uptodate + compute == disks */ + return 1; /* uptodate + compute == disks */ } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); @@ -1918,10 +1923,13 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, } } - return ~0; + return 0; } -static void handle_issuing_new_read_requests5(struct stripe_head *sh, +/** + * handle_stripe_fill5 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; @@ -1931,16 +1939,14 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, * midst of changing due to a write */ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !sh->reconstruct_state) { + !sh->reconstruct_state) for (i = disks; i--; ) - if (__handle_issuing_new_read_requests5( - sh, s, i, disks) == 0) + if (fetch_block5(sh, s, i, disks)) break; - } set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_issuing_new_read_requests6(struct stripe_head *sh, +static void handle_stripe_fill6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -1999,12 +2005,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, } -/* handle_completed_write_requests +/* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. 
*/ -static void handle_completed_write_requests(raid5_conf_t *conf, +static void handle_stripe_clean_event(raid5_conf_t *conf, struct stripe_head *sh, int disks, struct bio **return_bi) { int i; @@ -2049,7 +2055,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, md_wakeup_thread(conf->mddev->thread); } -static void handle_issuing_new_write_requests5(raid5_conf_t *conf, +static void handle_stripe_dirtying5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { int rmw = 0, rcw = 0, i; @@ -2136,10 +2142,10 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) - handle_write_operations5(sh, s, rcw == 0, 0); + schedule_reconstruction5(sh, s, rcw == 0, 0); } -static void handle_issuing_new_write_requests6(raid5_conf_t *conf, +static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2597,8 +2603,7 @@ static void handle_stripe5(struct stripe_head *sh) * need to be failed */ if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 1 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -2614,7 +2619,7 @@ static void handle_stripe5(struct stripe_head *sh) !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) || (s.failed == 1 && s.failed_num == sh->pd_idx))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -2622,7 +2627,7 @@ static void handle_stripe5(struct stripe_head *sh) */ if (s.to_read || s.non_overwrite || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) - handle_issuing_new_read_requests5(sh, &s, disks); + handle_stripe_fill5(sh, &s, disks); /* Now we check to see if any write operations have recently * completed @@ -2666,7 +2671,7 @@ static void handle_stripe5(struct stripe_head *sh) * block. 
*/ if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_issuing_new_write_requests5(conf, sh, &s, disks); + handle_stripe_dirtying5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -2722,7 +2727,7 @@ static void handle_stripe5(struct stripe_head *sh) sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - handle_write_operations5(sh, &s, 1, 1); + schedule_reconstruction5(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -2854,8 +2859,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) * might need to be failed */ if (s.failed > 2 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 2 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -2880,7 +2884,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -2888,11 +2892,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) */ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || (s.syncing && (s.uptodate < disks)) || s.expanding) - handle_issuing_new_read_requests6(sh, &s, &r6s, disks); + handle_stripe_fill6(sh, &s, &r6s, disks); /* now to consider writing and what else, if anything should be read */ if (s.to_write) - handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); + handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough -- cgit v1.2.3
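In summary, the renames map handle_write_operations5 to schedule_reconstruction5, handle_requests_to_failed_array to handle_failed_stripe, __handle_issuing_new_read_requests5 to fetch_block5, handle_issuing_new_read_requests5/6 to handle_stripe_fill5/6, handle_completed_write_requests to handle_stripe_clean_event, and handle_issuing_new_write_requests5/6 to handle_stripe_dirtying5/6. A condensed sketch of handle_stripe5's resulting flow under the cache-style names follows; the guard conditions are abbreviated from the hunks above, and written_safe() is a hypothetical stand-in for the full "parity and written blocks are uptodate and unlocked" test, not a function in the patch:

	/* fail pending requests once too many members are gone */
	if (s.failed > 1 && s.to_read + s.to_write + s.written)
		handle_failed_stripe(conf, sh, &s, disks, &return_bi);

	/* written blocks whose parity is safely on disk can be returned */
	if (s.written && written_safe(sh, &s))
		handle_stripe_clean_event(conf, sh, disks, &return_bi);

	/* read or compute whatever is needed to satisfy reads/sync/expand */
	if (s.to_read || s.non_overwrite || s.syncing || s.expanding)
		handle_stripe_fill5(sh, &s, disks);

	/* choose rmw vs rcw for new writes, then queue the parity work
	 * (handle_stripe_dirtying5 calls schedule_reconstruction5) */
	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
		handle_stripe_dirtying5(conf, sh, &s, disks);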