From 46d5b785621ad10a373e292f9101ccfc626466e0 Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:02 +1100
Subject: raid5: use flex_array for scribble data

Use flex_array for scribble data. Next patch will batch several stripes
together, so scribble data should be able to cover several stripes, so this
patch also allocates scribble data for stripes across a chunk.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 983e18a83db1..1d0f241d7d3b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -458,15 +458,11 @@ struct r5conf {
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */
-		void		*scribble;   /* space for constructing buffer
+		struct flex_array *scribble;   /* space for constructing buffer
 					      * lists and performing address
 					      * conversions
 					      */
 	} __percpu *percpu;
-	size_t			scribble_len; /* size of scribble region must be
-					       * associated with conf to handle
-					       * cpu hotplug while reshaping
-					       */
 #ifdef CONFIG_HOTPLUG_CPU
 	struct notifier_block	cpu_notify;
 #endif
-- 
cgit v1.2.3


From da41ba65972532a04f73927c903029a7ec3bc2ed Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:03 +1100
Subject: raid5: add a new flag to track if a stripe can be batched

A freshly new stripe with write request can be batched. Any time the stripe is
handled or new read is queued, the flag will be cleared.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 12 +++++++++---
 drivers/md/raid5.h |  1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7fb510e54548..49b0f23dbad2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -555,6 +555,7 @@ retry:
 		goto retry;
 	insert_hash(conf, sh);
 	sh->cpu = smp_processor_id();
+	set_bit(STRIPE_BATCH_READY, &sh->state);
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -2645,7 +2646,8 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
  * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
  */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
+			  int forwrite, int previous)
 {
 	struct bio **bip;
 	struct r5conf *conf = sh->raid_conf;
@@ -2678,6 +2680,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	if (!forwrite || previous)
+		clear_bit(STRIPE_BATCH_READY, &sh->state);
+
 	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
 	if (*bip)
 		bi->bi_next = *bip;
@@ -3824,6 +3829,7 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
+	clear_bit(STRIPE_BATCH_READY, &sh->state);
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 		spin_lock(&sh->stripe_lock);
 		/* Cannot process 'sync' concurrently with 'discard' */
@@ -4793,7 +4799,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			}
 
 			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-			    !add_stripe_bio(sh, bi, dd_idx, rw)) {
+			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
 				/* Stripe is busy expanding or
 				 * add failed due to overlap.  Flush everything
 				 * and wait a while
@@ -5206,7 +5212,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 			return handled;
 		}
 
-		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
 			release_stripe(sh);
 			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1d0f241d7d3b..37644e3d5293 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -327,6 +327,7 @@ enum {
 	STRIPE_ON_UNPLUG_LIST,
 	STRIPE_DISCARD,
 	STRIPE_ON_RELEASE_LIST,
+	STRIPE_BATCH_READY,
 };
 
 /*
-- 
cgit v1.2.3


From 7a87f43405e91ca12b8770eb689dd9886f217091 Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:03 +1100
Subject: raid5: track overwrite disk count

Track overwrite disk count, so we can know if a stripe is a full stripe write.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 14 +++++++++++++-
 drivers/md/raid5.h |  4 ++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 49b0f23dbad2..e801c6669c6d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -553,6 +553,7 @@ retry:
 	}
 	if (read_seqcount_retry(&conf->gen_lock, seq))
 		goto retry;
+	sh->overwrite_disks = 0;
 	insert_hash(conf, sh);
 	sh->cpu = smp_processor_id();
 	set_bit(STRIPE_BATCH_READY, &sh->state);
@@ -710,6 +711,12 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 	return sh;
 }
 
+static bool is_full_stripe_write(struct stripe_head *sh)
+{
+	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
+	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
+}
+
 /* Determine if 'data_offset' or 'new_data_offset' should be used
  * in this stripe_head.
  */
@@ -1413,6 +1420,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
+			sh->overwrite_disks = 0;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->stripe_lock);
@@ -2700,7 +2708,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 				sector = bio_end_sector(bi);
 		}
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
-			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+			if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
+				sh->overwrite_disks++;
 	}
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
@@ -2772,6 +2781,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		/* fail all writes first */
 		bi = sh->dev[i].towrite;
 		sh->dev[i].towrite = NULL;
+		sh->overwrite_disks = 0;
 		spin_unlock_irq(&sh->stripe_lock);
 		if (bi)
 			bitmap_end = 1;
@@ -4630,12 +4640,14 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		}
 		set_bit(STRIPE_DISCARD, &sh->state);
 		finish_wait(&conf->wait_for_overlap, &w);
+		sh->overwrite_disks = 0;
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
 				continue;
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
 			raid5_inc_bi_active_stripes(bi);
+			sh->overwrite_disks++;
 		}
 		spin_unlock_irq(&sh->stripe_lock);
 		if (conf->mddev->bitmap) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 37644e3d5293..4cc1a48127c7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -210,6 +210,10 @@ struct stripe_head {
 	atomic_t		count;	      /* nr of active thread/requests */
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;		/* disks in stripe */
+	int			overwrite_disks; /* total overwrite disks in stripe,
+						  * this is only checked when stripe
+						  * has STRIPE_BATCH_READY
+						  */
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
-- 
cgit v1.2.3


From 59fc630b8b5f9f21c8ce3ba153341c107dce1b0c Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:03 +1100
Subject: RAID5: batch adjacent full stripe write

stripe cache is 4k size. Even adjacent full stripe writes are handled in 4k
unit. Idealy we should use big size for adjacent full stripe writes. Bigger
stripe cache size means less stripes runing in the state machine so can reduce
cpu overhead. And also bigger size can cause bigger IO size dispatched to under
layer disks.

With below patch, we will automatically batch adjacent full stripe write
together. Such stripes will be added to the batch list. Only the first stripe
of the list will be put to handle_list and so run handle_stripe(). Some steps
of handle_stripe() are extended to cover all stripes of the list, including
ops_run_io, ops_run_biodrain and so on. With this patch, we have less stripes
running in handle_stripe() and we send IO of whole stripe list together to
increase IO size.

Stripes added to a batch list have some limitations. A batch list can only
include full stripe write and can't cross chunk boundary to make sure stripes
have the same parity disks. Stripes in a batch list must be in the same state
(no written, toread and so on). If a stripe is in a batch list, all new
read/write to add_stripe_bio will be blocked to overlap conflict till the batch
list is handled. The limitations will make sure stripes in a batch list be in
exactly the same state in the life circly.

I did test running 160k randwrite in a RAID5 array with 32k chunk size and 6
PCIe SSD. This patch improves around 30% performance and IO size to under layer
disk is exactly 32k. I also run a 4k randwrite test in the same array to make
sure the performance isn't changed with the patch.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 357 +++++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/raid5.h |   4 +
 2 files changed, 336 insertions(+), 25 deletions(-)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e801c6669c6d..717189e74243 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -526,6 +526,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 	BUG_ON(atomic_read(&sh->count) != 0);
 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 	BUG_ON(stripe_operations_active(sh));
+	BUG_ON(sh->batch_head);
 
 	pr_debug("init_stripe called, stripe %llu\n",
 		(unsigned long long)sector);
@@ -717,6 +718,124 @@ static bool is_full_stripe_write(struct stripe_head *sh)
 	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
 }
 
+static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+	local_irq_disable();
+	if (sh1 > sh2) {
+		spin_lock(&sh2->stripe_lock);
+		spin_lock_nested(&sh1->stripe_lock, 1);
+	} else {
+		spin_lock(&sh1->stripe_lock);
+		spin_lock_nested(&sh2->stripe_lock, 1);
+	}
+}
+
+static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+	spin_unlock(&sh1->stripe_lock);
+	spin_unlock(&sh2->stripe_lock);
+	local_irq_enable();
+}
+
+/* Only freshly new full stripe normal write stripe can be added to a batch list */
+static bool stripe_can_batch(struct stripe_head *sh)
+{
+	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+		is_full_stripe_write(sh);
+}
+
+/* we only do back search */
+static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+{
+	struct stripe_head *head;
+	sector_t head_sector, tmp_sec;
+	int hash;
+	int dd_idx;
+
+	if (!stripe_can_batch(sh))
+		return;
+	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
+	tmp_sec = sh->sector;
+	if (!sector_div(tmp_sec, conf->chunk_sectors))
+		return;
+	head_sector = sh->sector - STRIPE_SECTORS;
+
+	hash = stripe_hash_locks_hash(head_sector);
+	spin_lock_irq(conf->hash_locks + hash);
+	head = __find_stripe(conf, head_sector, conf->generation);
+	if (head && !atomic_inc_not_zero(&head->count)) {
+		spin_lock(&conf->device_lock);
+		if (!atomic_read(&head->count)) {
+			if (!test_bit(STRIPE_HANDLE, &head->state))
+				atomic_inc(&conf->active_stripes);
+			BUG_ON(list_empty(&head->lru) &&
+			       !test_bit(STRIPE_EXPANDING, &head->state));
+			list_del_init(&head->lru);
+			if (head->group) {
+				head->group->stripes_cnt--;
+				head->group = NULL;
+			}
+		}
+		atomic_inc(&head->count);
+		spin_unlock(&conf->device_lock);
+	}
+	spin_unlock_irq(conf->hash_locks + hash);
+
+	if (!head)
+		return;
+	if (!stripe_can_batch(head))
+		goto out;
+
+	lock_two_stripes(head, sh);
+	/* clear_batch_ready clear the flag */
+	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
+		goto unlock_out;
+
+	if (sh->batch_head)
+		goto unlock_out;
+
+	dd_idx = 0;
+	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+		dd_idx++;
+	if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
+		goto unlock_out;
+
+	if (head->batch_head) {
+		spin_lock(&head->batch_head->batch_lock);
+		/* This batch list is already running */
+		if (!stripe_can_batch(head)) {
+			spin_unlock(&head->batch_head->batch_lock);
+			goto unlock_out;
+		}
+
+		/*
+		 * at this point, head's BATCH_READY could be cleared, but we
+		 * can still add the stripe to batch list
+		 */
+		list_add(&sh->batch_list, &head->batch_list);
+		spin_unlock(&head->batch_head->batch_lock);
+
+		sh->batch_head = head->batch_head;
+	} else {
+		head->batch_head = head;
+		sh->batch_head = head->batch_head;
+		spin_lock(&head->batch_lock);
+		list_add_tail(&sh->batch_list, &head->batch_list);
+		spin_unlock(&head->batch_lock);
+	}
+
+	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		if (atomic_dec_return(&conf->preread_active_stripes)
+		    < IO_THRESHOLD)
+			md_wakeup_thread(conf->mddev->thread);
+
+	atomic_inc(&sh->count);
+unlock_out:
+	unlock_two_stripes(head, sh);
+out:
+	release_stripe(head);
+}
+
 /* Determine if 'data_offset' or 'new_data_offset' should be used
  * in this stripe_head.
  */
@@ -747,6 +866,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	struct r5conf *conf = sh->raid_conf;
 	int i, disks = sh->disks;
+	struct stripe_head *head_sh = sh;
 
 	might_sleep();
 
@@ -755,6 +875,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		int replace_only = 0;
 		struct bio *bi, *rbi;
 		struct md_rdev *rdev, *rrdev = NULL;
+
+		sh = head_sh;
 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
 				rw = WRITE_FUA;
@@ -773,6 +895,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
 			rw |= REQ_SYNC;
 
+again:
 		bi = &sh->dev[i].req;
 		rbi = &sh->dev[i].rreq; /* For writing to replacement */
 
@@ -791,7 +914,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				/* We raced and saw duplicates */
 				rrdev = NULL;
 		} else {
-			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
+			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
 				rdev = rrdev;
 			rrdev = NULL;
 		}
@@ -862,13 +985,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				__func__, (unsigned long long)sh->sector,
 				bi->bi_rw, i);
 			atomic_inc(&sh->count);
+			if (sh != head_sh)
+				atomic_inc(&head_sh->count);
 			if (use_new_offset(conf, sh))
 				bi->bi_iter.bi_sector = (sh->sector
 						 + rdev->new_data_offset);
 			else
 				bi->bi_iter.bi_sector = (sh->sector
 						 + rdev->data_offset);
-			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
 				bi->bi_rw |= REQ_NOMERGE;
 
 			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
@@ -912,6 +1037,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				__func__, (unsigned long long)sh->sector,
 				rbi->bi_rw, i);
 			atomic_inc(&sh->count);
+			if (sh != head_sh)
+				atomic_inc(&head_sh->count);
 			if (use_new_offset(conf, sh))
 				rbi->bi_iter.bi_sector = (sh->sector
 						  + rrdev->new_data_offset);
@@ -945,6 +1072,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
+
+		if (!head_sh->batch_head)
+			continue;
+		sh = list_first_entry(&sh->batch_list, struct stripe_head,
+				      batch_list);
+		if (sh != head_sh)
+			goto again;
 	}
 }
 
@@ -1060,6 +1194,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 	struct async_submit_ctl submit;
 	int i;
 
+	BUG_ON(sh->batch_head);
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
@@ -1148,6 +1283,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 	struct async_submit_ctl submit;
 	int i;
 
+	BUG_ON(sh->batch_head);
+
 	pr_debug("%s: stripe %llu block: %d\n",
 		__func__, (unsigned long long)sh->sector, target);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
@@ -1214,6 +1351,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 	int i;
 	int count;
 
+	BUG_ON(sh->batch_head);
 	if (sh->ops.target < 0)
 		target = sh->ops.target2;
 	else if (sh->ops.target2 < 0)
@@ -1272,6 +1410,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 	struct page **blocks = to_addr_page(percpu, 0);
 	struct async_submit_ctl submit;
 
+	BUG_ON(sh->batch_head);
 	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
 		 __func__, (unsigned long long)sh->sector, target, target2);
 	BUG_ON(target < 0 || target2 < 0);
@@ -1384,6 +1523,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	/* existing parity data subtracted */
 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 
+	BUG_ON(sh->batch_head);
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
@@ -1406,17 +1546,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
 	int i;
+	struct stripe_head *head_sh = sh;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
+		struct r5dev *dev;
 		struct bio *chosen;
 
-		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
+		sh = head_sh;
+		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
 			struct bio *wbi;
 
+again:
+			dev = &sh->dev[i];
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
@@ -1445,6 +1589,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 				}
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
+
+			if (head_sh->batch_head) {
+				sh = list_first_entry(&sh->batch_list,
+						      struct stripe_head,
+						      batch_list);
+				if (sh == head_sh)
+					continue;
+				goto again;
+			}
 		}
 	}
 
@@ -1500,12 +1653,15 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		     struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
-	struct page **xor_srcs = to_addr_page(percpu, 0);
+	struct page **xor_srcs;
 	struct async_submit_ctl submit;
-	int count = 0, pd_idx = sh->pd_idx, i;
+	int count, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
 	int prexor = 0;
 	unsigned long flags;
+	int j = 0;
+	struct stripe_head *head_sh = sh;
+	int last_stripe;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -1522,15 +1678,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		ops_complete_reconstruct(sh);
 		return;
 	}
+again:
+	count = 0;
+	xor_srcs = to_addr_page(percpu, j);
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
-	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
 		prexor = 1;
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (dev->written)
+			if (head_sh->dev[i].written)
 				xor_srcs[count++] = dev->page;
 		}
 	} else {
@@ -1547,17 +1706,32 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
 	 * for the synchronous xor case
 	 */
-	flags = ASYNC_TX_ACK |
-		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
-
-	atomic_inc(&sh->count);
+	last_stripe = !head_sh->batch_head ||
+		list_first_entry(&sh->batch_list,
+				 struct stripe_head, batch_list) == head_sh;
+	if (last_stripe) {
+		flags = ASYNC_TX_ACK |
+			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
+
+		atomic_inc(&head_sh->count);
+		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
+				  to_addr_conv(sh, percpu, j));
+	} else {
+		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
+		init_async_submit(&submit, flags, tx, NULL, NULL,
+				  to_addr_conv(sh, percpu, j));
+	}
 
-	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
-			  to_addr_conv(sh, percpu, 0));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
 		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+	if (!last_stripe) {
+		j++;
+		sh = list_first_entry(&sh->batch_list, struct stripe_head,
+				      batch_list);
+		goto again;
+	}
 }
 
 static void
@@ -1565,8 +1739,10 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 		     struct dma_async_tx_descriptor *tx)
 {
 	struct async_submit_ctl submit;
-	struct page **blocks = to_addr_page(percpu, 0);
-	int count, i;
+	struct page **blocks;
+	int count, i, j = 0;
+	struct stripe_head *head_sh = sh;
+	int last_stripe;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
@@ -1584,13 +1760,27 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 		return;
 	}
 
+again:
+	blocks = to_addr_page(percpu, j);
 	count = set_syndrome_sources(blocks, sh);
-
-	atomic_inc(&sh->count);
-
-	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
-			  sh, to_addr_conv(sh, percpu, 0));
+	last_stripe = !head_sh->batch_head ||
+		list_first_entry(&sh->batch_list,
+				 struct stripe_head, batch_list) == head_sh;
+
+	if (last_stripe) {
+		atomic_inc(&head_sh->count);
+		init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+				  head_sh, to_addr_conv(sh, percpu, j));
+	} else
+		init_async_submit(&submit, 0, tx, NULL, NULL,
+				  to_addr_conv(sh, percpu, j));
 	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
+	if (!last_stripe) {
+		j++;
+		sh = list_first_entry(&sh->batch_list, struct stripe_head,
+				      batch_list);
+		goto again;
+	}
 }
 
 static void ops_complete_check(void *stripe_head_ref)
@@ -1620,6 +1810,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	BUG_ON(sh->batch_head);
 	count = 0;
 	xor_dest = sh->dev[pd_idx].page;
 	xor_srcs[count++] = xor_dest;
@@ -1648,6 +1839,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
 		(unsigned long long)sh->sector, checkp);
 
+	BUG_ON(sh->batch_head);
 	count = set_syndrome_sources(srcs, sh);
 	if (!checkp)
 		srcs[count] = NULL;
@@ -1715,7 +1907,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			BUG();
 	}
 
-	if (overlap_clear)
+	if (overlap_clear && !sh->batch_head)
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_and_clear_bit(R5_Overlap, &dev->flags))
@@ -1745,6 +1937,10 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
 	atomic_set(&sh->count, 1);
 	atomic_inc(&conf->active_stripes);
 	INIT_LIST_HEAD(&sh->lru);
+
+	spin_lock_init(&sh->batch_lock);
+	INIT_LIST_HEAD(&sh->batch_list);
+	sh->batch_head = NULL;
 	release_stripe(sh);
 	return 1;
 }
@@ -2188,6 +2384,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
+
+	if (sh->batch_head && sh != sh->batch_head)
+		release_stripe(sh->batch_head);
 }
 
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
@@ -2674,6 +2873,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	 * protect it.
 	 */
 	spin_lock_irq(&sh->stripe_lock);
+	/* Don't allow new IO added to stripes in batch list */
+	if (sh->batch_head)
+		goto overlap;
 	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
 		if (*bip == NULL)
@@ -2723,6 +2925,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		sh->bm_seq = conf->seq_flush+1;
 		set_bit(STRIPE_BIT_DELAY, &sh->state);
 	}
+
+	if (stripe_can_batch(sh))
+		stripe_add_to_batch_list(conf, sh);
 	return 1;
 
  overlap:
@@ -2755,6 +2960,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct bio **return_bi)
 {
 	int i;
+	BUG_ON(sh->batch_head);
 	for (i = disks; i--; ) {
 		struct bio *bi;
 		int bitmap_end = 0;
@@ -2870,6 +3076,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
 	int abort = 0;
 	int i;
 
+	BUG_ON(sh->batch_head);
 	clear_bit(STRIPE_SYNCING, &sh->state);
 	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
 		wake_up(&conf->wait_for_overlap);
@@ -3100,6 +3307,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
 {
 	int i;
 
+	BUG_ON(sh->batch_head);
 	/* look for blocks to read/compute, skip this if a compute
 	 * is already in flight, or if the stripe contents are in the
 	 * midst of changing due to a write
@@ -3123,6 +3331,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	int i;
 	struct r5dev *dev;
 	int discard_pending = 0;
+	struct stripe_head *head_sh = sh;
+	bool do_endio = false;
+	int wakeup_nr = 0;
 
 	for (i = disks; i--; )
 		if (sh->dev[i].written) {
@@ -3138,8 +3349,11 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 					clear_bit(R5_UPTODATE, &dev->flags);
 				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
 					WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
-					dev->page = dev->orig_page;
 				}
+				do_endio = true;
+
+returnbi:
+				dev->page = dev->orig_page;
 				wbi = dev->written;
 				dev->written = NULL;
 				while (wbi && wbi->bi_iter.bi_sector <
@@ -3156,6 +3370,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 						STRIPE_SECTORS,
 					 !test_bit(STRIPE_DEGRADED, &sh->state),
 						0);
+				if (head_sh->batch_head) {
+					sh = list_first_entry(&sh->batch_list,
+							      struct stripe_head,
+							      batch_list);
+					if (sh != head_sh) {
+						dev = &sh->dev[i];
+						goto returnbi;
+					}
+				}
+				sh = head_sh;
+				dev = &sh->dev[i];
 			} else if (test_bit(R5_Discard, &dev->flags))
 				discard_pending = 1;
 			WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
@@ -3177,8 +3402,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 		 * will be reinitialized
 		 */
 		spin_lock_irq(&conf->device_lock);
+unhash:
 		remove_hash(sh);
+		if (head_sh->batch_head) {
+			sh = list_first_entry(&sh->batch_list,
+					      struct stripe_head, batch_list);
+			if (sh != head_sh)
+					goto unhash;
+		}
 		spin_unlock_irq(&conf->device_lock);
+		sh = head_sh;
+
 		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
 			set_bit(STRIPE_HANDLE, &sh->state);
 
@@ -3187,6 +3421,39 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
 		if (atomic_dec_and_test(&conf->pending_full_writes))
 			md_wakeup_thread(conf->mddev->thread);
+
+	if (!head_sh->batch_head || !do_endio)
+		return;
+	for (i = 0; i < head_sh->disks; i++) {
+		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+			wakeup_nr++;
+	}
+	while (!list_empty(&head_sh->batch_list)) {
+		int i;
+		sh = list_first_entry(&head_sh->batch_list,
+				      struct stripe_head, batch_list);
+		list_del_init(&sh->batch_list);
+
+		sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
+			(1 << STRIPE_PREREAD_ACTIVE)));
+		sh->check_state = head_sh->check_state;
+		sh->reconstruct_state = head_sh->reconstruct_state;
+		for (i = 0; i < sh->disks; i++) {
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wakeup_nr++;
+			sh->dev[i].flags = head_sh->dev[i].flags;
+		}
+
+		spin_lock_irq(&sh->stripe_lock);
+		sh->batch_head = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
+		release_stripe(sh);
+	}
+
+	spin_lock_irq(&head_sh->stripe_lock);
+	head_sh->batch_head = NULL;
+	spin_unlock_irq(&head_sh->stripe_lock);
+	wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3326,6 +3593,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 {
 	struct r5dev *dev = NULL;
 
+	BUG_ON(sh->batch_head);
 	set_bit(STRIPE_HANDLE, &sh->state);
 
 	switch (sh->check_state) {
@@ -3416,6 +3684,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 	int qd_idx = sh->qd_idx;
 	struct r5dev *dev;
 
+	BUG_ON(sh->batch_head);
 	set_bit(STRIPE_HANDLE, &sh->state);
 
 	BUG_ON(s->failed > 2);
@@ -3579,6 +3848,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
 	 * copy some of them into a target stripe for expand.
 	 */
 	struct dma_async_tx_descriptor *tx = NULL;
+	BUG_ON(sh->batch_head);
 	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	for (i = 0; i < sh->disks; i++)
 		if (i != sh->pd_idx && i != sh->qd_idx) {
@@ -3822,6 +4092,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 	rcu_read_unlock();
 }
 
+static int clear_batch_ready(struct stripe_head *sh)
+{
+	struct stripe_head *tmp;
+	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
+		return 0;
+	spin_lock(&sh->stripe_lock);
+	if (!sh->batch_head) {
+		spin_unlock(&sh->stripe_lock);
+		return 0;
+	}
+
+	/*
+	 * this stripe could be added to a batch list before we check
+	 * BATCH_READY, skips it
+	 */
+	if (sh->batch_head != sh) {
+		spin_unlock(&sh->stripe_lock);
+		return 1;
+	}
+	spin_lock(&sh->batch_lock);
+	list_for_each_entry(tmp, &sh->batch_list, batch_list)
+		clear_bit(STRIPE_BATCH_READY, &tmp->state);
+	spin_unlock(&sh->batch_lock);
+	spin_unlock(&sh->stripe_lock);
+
+	/*
+	 * BATCH_READY is cleared, no new stripes can be added.
+	 * batch_list can be accessed without lock
+	 */
+	return 0;
+}
+
 static void handle_stripe(struct stripe_head *sh)
 {
 	struct stripe_head_state s;
@@ -3839,7 +4141,11 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
-	clear_bit(STRIPE_BATCH_READY, &sh->state);
+	if (clear_batch_ready(sh) ) {
+		clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
+		return;
+	}
+
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 		spin_lock(&sh->stripe_lock);
 		/* Cannot process 'sync' concurrently with 'discard' */
@@ -4824,7 +5130,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			}
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
-			if ((bi->bi_rw & REQ_SYNC) &&
+			if ((!sh->batch_head || sh == sh->batch_head) &&
+			    (bi->bi_rw & REQ_SYNC) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			release_stripe_plug(mddev, sh);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cc1a48127c7..c8d0004dca8f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -219,6 +219,10 @@ struct stripe_head {
 	spinlock_t		stripe_lock;
 	int			cpu;
 	struct r5worker_group	*group;
+
+	struct stripe_head	*batch_head; /* protected by stripe lock */
+	spinlock_t		batch_lock; /* only header's lock is useful */
+	struct list_head	batch_list; /* protected by head's batch lock*/
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
-- 
cgit v1.2.3


From 72ac733015bbdc0356ba3e92c52137a265910a91 Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:03 +1100
Subject: raid5: handle io error of batch list

If io error happens in any stripe of a batch list, the batch list will be
split, then normal process will run for the stripes in the list.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.h |  1 +
 2 files changed, 49 insertions(+)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 717189e74243..54f3cb312b42 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1070,6 +1070,9 @@ again:
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			if (sh->batch_head)
+				set_bit(STRIPE_BATCH_ERR,
+					&sh->batch_head->state);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
 
@@ -2380,6 +2383,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
+	if (sh->batch_head && !uptodate)
+		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
 	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -4124,6 +4130,46 @@ static int clear_batch_ready(struct stripe_head *sh)
 	return 0;
 }
 
+static void check_break_stripe_batch_list(struct stripe_head *sh)
+{
+	struct stripe_head *head_sh, *next;
+	int i;
+
+	if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+		return;
+
+	head_sh = sh;
+	do {
+		sh = list_first_entry(&sh->batch_list,
+				      struct stripe_head, batch_list);
+		BUG_ON(sh == head_sh);
+	} while (!test_bit(STRIPE_DEGRADED, &sh->state));
+
+	while (sh != head_sh) {
+		next = list_first_entry(&sh->batch_list,
+					struct stripe_head, batch_list);
+		list_del_init(&sh->batch_list);
+
+		sh->state = head_sh->state & ~((1 << STRIPE_ACTIVE) |
+					       (1 << STRIPE_PREREAD_ACTIVE) |
+					       (1 << STRIPE_DEGRADED));
+		sh->check_state = head_sh->check_state;
+		sh->reconstruct_state = head_sh->reconstruct_state;
+		for (i = 0; i < sh->disks; i++)
+			sh->dev[i].flags = head_sh->dev[i].flags &
+				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
+
+		spin_lock_irq(&sh->stripe_lock);
+		sh->batch_head = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+		release_stripe(sh);
+
+		sh = next;
+	}
+}
+
 static void handle_stripe(struct stripe_head *sh)
 {
 	struct stripe_head_state s;
@@ -4146,6 +4192,8 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
+	check_break_stripe_batch_list(sh);
+
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 		spin_lock(&sh->stripe_lock);
 		/* Cannot process 'sync' concurrently with 'discard' */
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index c8d0004dca8f..cf3562e99440 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -336,6 +336,7 @@ enum {
 	STRIPE_DISCARD,
 	STRIPE_ON_RELEASE_LIST,
 	STRIPE_BATCH_READY,
+	STRIPE_BATCH_ERR,
 };
 
 /*
-- 
cgit v1.2.3


From dabc4ec6ba72418ebca6bf1884f344bba40c8709 Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:04 +1100
Subject: raid5: handle expansion/resync case with stripe batching

expansion/resync can grab a stripe when the stripe is in batch list. Since all
stripes in batch list must be in the same state, we can't allow some stripes
run into expansion/resync. So we delay expansion/resync for stripe in batch
list.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 24 ++++++++++++++++--------
 drivers/md/raid5.h |  5 +++++
 2 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 54f3cb312b42..3ae097d50b51 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3440,8 +3440,10 @@ unhash:
 				      struct stripe_head, batch_list);
 		list_del_init(&sh->batch_list);
 
-		sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
-			(1 << STRIPE_PREREAD_ACTIVE)));
+		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
+						 (1 << STRIPE_PREREAD_ACTIVE) |
+						 STRIPE_EXPAND_SYNC_FLAG));
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
 		for (i = 0; i < sh->disks; i++) {
@@ -3453,6 +3455,8 @@ unhash:
 		spin_lock_irq(&sh->stripe_lock);
 		sh->batch_head = NULL;
 		spin_unlock_irq(&sh->stripe_lock);
+		if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+			set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 	}
 
@@ -3460,6 +3464,8 @@ unhash:
 	head_sh->batch_head = NULL;
 	spin_unlock_irq(&head_sh->stripe_lock);
 	wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+		set_bit(STRIPE_HANDLE, &head_sh->state);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3927,8 +3933,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 	memset(s, 0, sizeof(*s));
 
-	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
 	s->failed_num[0] = -1;
 	s->failed_num[1] = -1;
 
@@ -4150,9 +4156,11 @@ static void check_break_stripe_batch_list(struct stripe_head *sh)
 					struct stripe_head, batch_list);
 		list_del_init(&sh->batch_list);
 
-		sh->state = head_sh->state & ~((1 << STRIPE_ACTIVE) |
-					       (1 << STRIPE_PREREAD_ACTIVE) |
-					       (1 << STRIPE_DEGRADED));
+		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
+						 (1 << STRIPE_PREREAD_ACTIVE) |
+						 (1 << STRIPE_DEGRADED) |
+						 STRIPE_EXPAND_SYNC_FLAG));
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
 		for (i = 0; i < sh->disks; i++)
@@ -4194,7 +4202,7 @@ static void handle_stripe(struct stripe_head *sh)
 
 	check_break_stripe_batch_list(sh);
 
-	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
 		/* Cannot process 'sync' concurrently with 'discard' */
 		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cf3562e99440..ee65ed844d3f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -339,6 +339,11 @@ enum {
 	STRIPE_BATCH_ERR,
 };
 
+#define STRIPE_EXPAND_SYNC_FLAG \
+	((1 << STRIPE_EXPAND_SOURCE) |\
+	(1 << STRIPE_EXPAND_READY) |\
+	(1 << STRIPE_EXPANDING) |\
+	(1 << STRIPE_SYNC_REQUESTED))
 /*
  * Operation request flags
  */
-- 
cgit v1.2.3


From 584acdd49cd2472ca0f5a06adbe979db82d0b4af Mon Sep 17 00:00:00 2001
From: Markus Stockhausen <stockhausen@collogia.de>
Date: Mon, 15 Dec 2014 12:57:05 +1100
Subject: md/raid5: activate raid6 rmw feature

Glue it altogehter. The raid6 rmw path should work the same as the
already existing raid5 logic. So emulate the prexor handling/flags
and split functions as needed.

1) Enable xor_syndrome() in the async layer.

2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. Xor the syndrome
at the start of a rmw run as we did it before for the single parity.

3) Take care of rmw run in ops_run_reconstruct6(). Again process only
the changed pages to get syndrome back into sync.

4) Enhance set_syndrome_sources() to fill NULL pages if we are in a rmw
run. The lower layers will calculate start & end pages from that and
call the xor_syndrome() correspondingly.

5) Adapt the several places where we ignored Q handling up to now.

Performance numbers for a single E5630 system with a mix of 10 7200k
desktop/server disks. 300 seconds random write with 8 threads onto a
3,2TB (10*400GB) RAID6 64K chunk without spare (group_thread_cnt=4)

bsize   rmw_level=1   rmw_level=0   rmw_level=1   rmw_level=0
        skip_copy=1   skip_copy=1   skip_copy=0   skip_copy=0
   4K      115 KB/s      141 KB/s      165 KB/s      140 KB/s
   8K      225 KB/s      275 KB/s      324 KB/s      274 KB/s
  16K      434 KB/s      536 KB/s      640 KB/s      534 KB/s
  32K      751 KB/s    1,051 KB/s    1,234 KB/s    1,045 KB/s
  64K    1,339 KB/s    1,958 KB/s    2,282 KB/s    1,962 KB/s
 128K    2,673 KB/s    3,862 KB/s    4,113 KB/s    3,898 KB/s
 256K    7,685 KB/s    7,539 KB/s    7,557 KB/s    7,638 KB/s
 512K   19,556 KB/s   19,558 KB/s   19,652 KB/s   19,688 Kb/s

Signed-off-by: Markus Stockhausen <stockhausen@collogia.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 crypto/async_tx/async_pq.c |  19 +++++++--
 drivers/md/raid5.c         | 104 +++++++++++++++++++++++++++++++++------------
 drivers/md/raid5.h         |  19 ++++++++-
 include/linux/async_tx.h   |   3 ++
 4 files changed, 115 insertions(+), 30 deletions(-)

(limited to 'drivers/md/raid5.h')

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index d05327caf69d..5d355e0c2633 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 {
 	void **srcs;
 	int i;
+	int start = -1, stop = disks - 3;
 
 	if (submit->scribble)
 		srcs = submit->scribble;
@@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 		if (blocks[i] == NULL) {
 			BUG_ON(i > disks - 3); /* P or Q can't be zero */
 			srcs[i] = (void*)raid6_empty_zero_page;
-		} else
+		} else {
 			srcs[i] = page_address(blocks[i]) + offset;
+			if (i < disks - 2) {
+				stop = i;
+				if (start == -1)
+					start = i;
+			}
+		}
 	}
-	raid6_call.gen_syndrome(disks, len, srcs);
+	if (submit->flags & ASYNC_TX_PQ_XOR_DST) {
+		BUG_ON(!raid6_call.xor_syndrome);
+		if (start >= 0)
+			raid6_call.xor_syndrome(disks, start, stop, len, srcs);
+	} else
+		raid6_call.gen_syndrome(disks, len, srcs);
 	async_tx_sync_epilog(submit);
 }
 
@@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
 	if (device)
 		unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
 
-	if (unmap &&
+	/* XORing P/Q is only implemented in software */
+	if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) &&
 	    (src_cnt <= dma_maxpq(device, 0) ||
 	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
 	    is_dma_pq_aligned(device, offset, 0, len)) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3ae097d50b51..c82ce1fd8723 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
  * destination buffer is recorded in srcs[count] and the Q destination
  * is recorded in srcs[count+1]].
  */
-static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+static int set_syndrome_sources(struct page **srcs,
+				struct stripe_head *sh,
+				int srctype)
 {
 	int disks = sh->disks;
 	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
 	i = d0_idx;
 	do {
 		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+		struct r5dev *dev = &sh->dev[i];
 
-		srcs[slot] = sh->dev[i].page;
+		if (i == sh->qd_idx || i == sh->pd_idx ||
+		    (srctype == SYNDROME_SRC_ALL) ||
+		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
+		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		    (srctype == SYNDROME_SRC_WRITTEN &&
+		     dev->written))
+			srcs[slot] = sh->dev[i].page;
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
 
@@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 	atomic_inc(&sh->count);
 
 	if (target == qd_idx) {
-		count = set_syndrome_sources(blocks, sh);
+		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
 		blocks[count] = NULL; /* regenerating p is not necessary */
 		BUG_ON(blocks[count+1] != dest); /* q should already be set */
 		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
 				       &submit);
 
-			count = set_syndrome_sources(blocks, sh);
+			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
 			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
 					  ops_complete_compute, sh,
 					  to_addr_conv(sh, percpu, 0));
@@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
-	       struct dma_async_tx_descriptor *tx)
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
 	struct page **xor_srcs = to_addr_page(percpu, 0);
@@ -1544,6 +1553,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	return tx;
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
+{
+	struct page **blocks = to_addr_page(percpu, 0);
+	int count;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu\n", __func__,
+		(unsigned long long)sh->sector);
+
+	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
+
+	return tx;
+}
+
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
@@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 	int count, i, j = 0;
 	struct stripe_head *head_sh = sh;
 	int last_stripe;
+	int synflags;
+	unsigned long txflags;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
@@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 
 again:
 	blocks = to_addr_page(percpu, j);
-	count = set_syndrome_sources(blocks, sh);
+
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		synflags = SYNDROME_SRC_WRITTEN;
+		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+	} else {
+		synflags = SYNDROME_SRC_ALL;
+		txflags = ASYNC_TX_ACK;
+	}
+
+	count = set_syndrome_sources(blocks, sh, synflags);
 	last_stripe = !head_sh->batch_head ||
 		list_first_entry(&sh->batch_list,
 				 struct stripe_head, batch_list) == head_sh;
 
 	if (last_stripe) {
 		atomic_inc(&head_sh->count);
-		init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
 				  head_sh, to_addr_conv(sh, percpu, j));
 	} else
 		init_async_submit(&submit, 0, tx, NULL, NULL,
@@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 		(unsigned long long)sh->sector, checkp);
 
 	BUG_ON(sh->batch_head);
-	count = set_syndrome_sources(srcs, sh);
+	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
 	if (!checkp)
 		srcs[count] = NULL;
 
@@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
-	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-		tx = ops_run_prexor(sh, percpu, tx);
+	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+		if (level < 6)
+			tx = ops_run_prexor5(sh, percpu, tx);
+		else
+			tx = ops_run_prexor6(sh, percpu, tx);
+	}
 
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
@@ -2770,7 +2814,7 @@ static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			 int rcw, int expand)
 {
-	int i, pd_idx = sh->pd_idx, disks = sh->disks;
+	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
 	struct r5conf *conf = sh->raid_conf;
 	int level = conf->level;
 
@@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
 				atomic_inc(&conf->pending_full_writes);
 	} else {
-		BUG_ON(level == 6);
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+		BUG_ON(level == 6 &&
+			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+			   test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (i == pd_idx)
+			if (i == pd_idx || i == qd_idx)
 				continue;
 
 			if (dev->towrite &&
@@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
 
-	/* RAID6 requires 'rcw' in current implementation.
-	 * Otherwise, check whether resync is now happening or should start.
+	/* Check whether resync is now happening or should start.
 	 * If yes, then the array is dirty (after unclean shutdown or
 	 * initial creation), so parity in some stripes might be inconsistent.
 	 * In this case, we need to always do reconstruct-write, to ensure
 	 * that in case of drive failure or read-error correction, we
 	 * generate correct data from the parity.
 	 */
-	if (conf->max_degraded == 2 ||
+	if (conf->rmw_level == PARITY_DISABLE_RMW ||
 	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
 	     s->failed == 0)) {
 		/* Calculate the real rcw later - for now make it
 		 * look like rcw is cheaper
 		 */
 		rcw = 1; rmw = 2;
-		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
-			 conf->max_degraded, (unsigned long long)recovery_cp,
+		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+			 conf->rmw_level, (unsigned long long)recovery_cp,
 			 (unsigned long long)sh->sector);
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 				rmw += 2*disks;  /* cannot read it */
 		}
 		/* Would I have to read this buffer for reconstruct_write */
-		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		    test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		(unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
-	if (rmw < rcw && rmw > 0) {
+	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
 		/* prefer read-modify-write, but need to get some data */
 		if (conf->mddev->queue)
 			blk_add_trace_msg(conf->mddev->queue,
@@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx) &&
+			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			    test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			}
 		}
 	}
-	if (rcw <= rmw && rcw > 0) {
+	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
 		/* want reconstruct write, but need to get some data */
 		int qread =0;
 		rcw = 0;
@@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	}
 
 	conf->level = mddev->new_level;
-	if (conf->level == 6)
+	if (conf->level == 6) {
 		conf->max_degraded = 2;
-	else
+		if (raid6_call.xor_syndrome)
+			conf->rmw_level = PARITY_ENABLE_RMW;
+		else
+			conf->rmw_level = PARITY_DISABLE_RMW;
+	} else {
 		conf->max_degraded = 1;
+		conf->rmw_level = PARITY_ENABLE_RMW;
+	}
 	conf->algorithm = mddev->new_layout;
 	conf->reshape_progress = mddev->reshape_position;
 	if (conf->reshape_progress != MaxSector) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ee65ed844d3f..57fef9ba36fa 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -355,6 +355,23 @@ enum {
 	STRIPE_OP_RECONSTRUCT,
 	STRIPE_OP_CHECK,
 };
+
+/*
+ * RAID parity calculation preferences
+ */
+enum {
+	PARITY_DISABLE_RMW = 0,
+	PARITY_ENABLE_RMW,
+};
+
+/*
+ * Pages requested from set_syndrome_sources()
+ */
+enum {
+	SYNDROME_SRC_ALL,
+	SYNDROME_SRC_WANT_DRAIN,
+	SYNDROME_SRC_WRITTEN,
+};
 /*
  * Plugging:
  *
@@ -411,7 +428,7 @@ struct r5conf {
 	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
 	struct mddev		*mddev;
 	int			chunk_sectors;
-	int			level, algorithm;
+	int			level, algorithm, rmw_level;
 	int			max_degraded;
 	int			raid_disks;
 	int			max_nr_stripes;
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 179b38ffd351..388574ea38ed 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -60,12 +60,15 @@ struct dma_chan_ref {
  * dependency chain
  * @ASYNC_TX_FENCE: specify that the next operation in the dependency
  * chain uses this operation's result as an input
+ * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the
+ * input data. Required for rmw case.
  */
 enum async_tx_flags {
 	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
 	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
 	ASYNC_TX_ACK		 = (1 << 2),
 	ASYNC_TX_FENCE		 = (1 << 3),
+	ASYNC_TX_PQ_XOR_DST	 = (1 << 4),
 };
 
 /**
-- 
cgit v1.2.3


From d06f191f8ecaef4d524e765fdb455f96392fbd42 Mon Sep 17 00:00:00 2001
From: Markus Stockhausen <stockhausen@collogia.de>
Date: Mon, 15 Dec 2014 12:57:05 +1100
Subject: md/raid5: introduce configuration option rmw_level

Depending on the available coding we allow optimized rmw logic for write
operations. To support easier testing this patch allows manual control
of the rmw/rcw descision through the interface /sys/block/mdX/md/rmw_level.

The configuration can handle three levels of control.

rmw_level=0: Disable rmw for all RAID types. Hardware assisted P/Q
calculation has no implementation path yet to factor in/out chunks of
a syndrome. Enforcing this level can be benefical for slow CPUs with
hardware syndrome support and fast SSDs.

rmw_level=1: Estimate rmw IOs and rcw IOs. Execute rmw only if we will
save IOs. This equals the "old" unpatched behaviour and will be the
default.

rmw_level=2: Execute rmw even if calculated IOs for rmw and rcw are
equal. We might have higher CPU consumption because of calculating the
parity twice but it can be benefical otherwise. E.g. RAID4 with fast
dedicated parity disk/SSD. The option is implemented just to be
forward-looking and will ONLY work with this patch!

Signed-off-by: Markus Stockhausen <stockhausen@collogia.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.h |  1 +
 2 files changed, 45 insertions(+)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c82ce1fd8723..f78b1964543b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5879,6 +5879,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
 				raid5_show_stripe_cache_size,
 				raid5_store_stripe_cache_size);
 
+static ssize_t
+raid5_show_rmw_level(struct mddev  *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->rmw_level);
+	else
+		return 0;
+}
+
+static ssize_t
+raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+
+	if (!conf)
+		return -ENODEV;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+
+	if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
+		return -EINVAL;
+
+	if (new != PARITY_DISABLE_RMW &&
+	    new != PARITY_ENABLE_RMW &&
+	    new != PARITY_PREFER_RMW)
+		return -EINVAL;
+
+	conf->rmw_level = new;
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
+			 raid5_show_rmw_level,
+			 raid5_store_rmw_level);
+
+
 static ssize_t
 raid5_show_preread_threshold(struct mddev *mddev, char *page)
 {
@@ -6065,6 +6108,7 @@ static struct attribute *raid5_attrs[] =  {
 	&raid5_preread_bypass_threshold.attr,
 	&raid5_group_thread_cnt.attr,
 	&raid5_skip_copy.attr,
+	&raid5_rmw_level.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 57fef9ba36fa..6614ac5ffc0e 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -362,6 +362,7 @@ enum {
 enum {
 	PARITY_DISABLE_RMW = 0,
 	PARITY_ENABLE_RMW,
+	PARITY_PREFER_RMW,
 };
 
 /*
-- 
cgit v1.2.3


From 5423399a84ee1d92d29d763029ed40e4905cf50f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 26 Feb 2015 12:21:04 +1100
Subject: md/raid5: change ->inactive_blocked to a bit-flag.

This allows us to easily add more (atomic) flags.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 13 ++++++++-----
 drivers/md/raid5.h |  9 ++++++---
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 78ac7dc853c7..b7cd32e7f29e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -672,20 +672,23 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 				    *(conf->hash_locks + hash));
 		sh = __find_stripe(conf, sector, conf->generation - previous);
 		if (!sh) {
-			if (!conf->inactive_blocked)
+			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 				sh = get_free_stripe(conf, hash);
 			if (noblock && sh == NULL)
 				break;
 			if (!sh) {
-				conf->inactive_blocked = 1;
+				set_bit(R5_INACTIVE_BLOCKED,
+					&conf->cache_state);
 				wait_event_lock_irq(
 					conf->wait_for_stripe,
 					!list_empty(conf->inactive_list + hash) &&
 					(atomic_read(&conf->active_stripes)
 					 < (conf->max_nr_stripes * 3 / 4)
-					 || !conf->inactive_blocked),
+					 || !test_bit(R5_INACTIVE_BLOCKED,
+						      &conf->cache_state)),
 					*(conf->hash_locks + hash));
-				conf->inactive_blocked = 0;
+				clear_bit(R5_INACTIVE_BLOCKED,
+					  &conf->cache_state);
 			} else {
 				init_stripe(sh, sector, previous);
 				atomic_inc(&sh->count);
@@ -4602,7 +4605,7 @@ static int raid5_congested(struct mddev *mddev, int bits)
 	 * how busy the stripe_cache is
 	 */
 
-	if (conf->inactive_blocked)
+	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 		return 1;
 	if (conf->quiesce)
 		return 1;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6614ac5ffc0e..ebe4e24bc14d 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -509,9 +509,11 @@ struct r5conf {
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
-	int			inactive_blocked;	/* release of inactive stripes blocked,
-							 * waiting for 25% to be free
-							 */
+	unsigned long		cache_state;
+#define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
+					 * waiting for 25% to be free
+					 */
+
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
 	struct disk_info	*disks;
@@ -526,6 +528,7 @@ struct r5conf {
 	int			worker_cnt_per_group;
 };
 
+
 /*
  * Our supported algorithms
  */
-- 
cgit v1.2.3


From edbe83ab4c27ea6669eb57adb5ed7eaec1118ceb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 26 Feb 2015 12:47:56 +1100
Subject: md/raid5: allow the stripe_cache to grow and shrink.

The default setting of 256 stripe_heads is probably
much too small for many configurations.  So it is best to make it
auto-configure.

Shrinking the cache under memory pressure is easy.  The only
interesting part here is that we put a fairly high cost
('seeks') on shrinking the cache as the cost is greater than
just having to read more data, it reduces parallelism.

Growing the cache on demand needs to be done carefully.  If we allow
fast growth, that can upset memory balance as lots of dirty memory can
quickly turn into lots of memory queued in the stripe_cache.
It is important for the raid5 block device to appear congested to
allow write-throttling to work.

So we only add stripes slowly. We set a flag when an allocation
fails because all stripes are in use, allocate at a convenient
time when that flag is set, and don't allow it to be set again
until at least one stripe_head has been released for re-use.

This means that a spurt of requests will only cause one stripe_head
to be allocated, but a steady stream of requests will slowly
increase the cache size - until memory pressure puts it back again.

It could take hours to reach a steady state.

The value written to, and displayed in, stripe_cache_size is
used as a minimum.  The cache can grow above this and shrink back
down to it.  The actual size is not directly visible, though it can
be deduced to some extent by watching stripe_cache_active.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------
 drivers/md/raid5.h | 11 ++++++++-
 2 files changed, 71 insertions(+), 8 deletions(-)

(limited to 'drivers/md/raid5.h')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b7cd32e7f29e..9716319cc477 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -672,8 +672,13 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 				    *(conf->hash_locks + hash));
 		sh = __find_stripe(conf, sector, conf->generation - previous);
 		if (!sh) {
-			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
 				sh = get_free_stripe(conf, hash);
+				if (!sh && llist_empty(&conf->released_stripes) &&
+				    !test_bit(R5_DID_ALLOC, &conf->cache_state))
+					set_bit(R5_ALLOC_MORE,
+						&conf->cache_state);
+			}
 			if (noblock && sh == NULL)
 				break;
 			if (!sh) {
@@ -5761,6 +5766,8 @@ static void raid5d(struct md_thread *thread)
 		int batch_size, released;
 
 		released = release_stripe_list(conf, conf->temp_inactive_list);
+		if (released)
+			clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
 		if (
 		    !list_empty(&conf->bitmap_list)) {
@@ -5799,6 +5806,13 @@ static void raid5d(struct md_thread *thread)
 	pr_debug("%d stripes handled\n", handled);
 
 	spin_unlock_irq(&conf->device_lock);
+	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+		grow_one_stripe(conf, __GFP_NOWARN);
+		/* Set flag even if allocation failed.  This helps
+		 * slow down allocation requests when mem is short
+		 */
+		set_bit(R5_DID_ALLOC, &conf->cache_state);
+	}
 
 	async_tx_issue_pending_all();
 	blk_finish_plug(&plug);
@@ -5814,7 +5828,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
 	spin_lock(&mddev->lock);
 	conf = mddev->private;
 	if (conf)
-		ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+		ret = sprintf(page, "%d\n", conf->min_nr_stripes);
 	spin_unlock(&mddev->lock);
 	return ret;
 }
@@ -5828,10 +5842,12 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 	if (size <= 16 || size > 32768)
 		return -EINVAL;
 
+	conf->min_nr_stripes = size;
 	while (size < conf->max_nr_stripes &&
 	       drop_one_stripe(conf))
 		;
 
+
 	err = md_allow_write(mddev);
 	if (err)
 		return err;
@@ -5947,7 +5963,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
 	conf = mddev->private;
 	if (!conf)
 		err = -ENODEV;
-	else if (new > conf->max_nr_stripes)
+	else if (new > conf->min_nr_stripes)
 		err = -EINVAL;
 	else
 		conf->bypass_threshold = new;
@@ -6228,6 +6244,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	if (conf->shrinker.seeks)
+		unregister_shrinker(&conf->shrinker);
 	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
@@ -6295,6 +6313,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
 	return err;
 }
 
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+				      struct shrink_control *sc)
+{
+	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+	int ret = 0;
+	while (ret < sc->nr_to_scan) {
+		if (drop_one_stripe(conf) == 0)
+			return SHRINK_STOP;
+		ret++;
+	}
+	return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+	if (conf->max_nr_stripes < conf->min_nr_stripes)
+		/* unlikely, but not impossible */
+		return 0;
+	return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
 	struct r5conf *conf;
@@ -6445,10 +6487,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		conf->prev_algo = mddev->layout;
 	}
 
-	memory = NR_STRIPES * (sizeof(struct stripe_head) +
+	conf->min_nr_stripes = NR_STRIPES;
+	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-	if (grow_stripes(conf, NR_STRIPES)) {
+	if (grow_stripes(conf, conf->min_nr_stripes)) {
 		printk(KERN_ERR
 		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
 		       mdname(mddev), memory);
@@ -6456,6 +6499,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	} else
 		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
 		       mdname(mddev), memory);
+	/*
+	 * Losing a stripe head costs more than the time to refill it,
+	 * it reduces the queue depth and so can hurt throughput.
+	 * So set it rather large, scaled by number of devices.
+	 */
+	conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+	conf->shrinker.scan_objects = raid5_cache_scan;
+	conf->shrinker.count_objects = raid5_cache_count;
+	conf->shrinker.batch = 128;
+	conf->shrinker.flags = 0;
+	register_shrinker(&conf->shrinker);
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
 	conf->thread = md_register_thread(raid5d, mddev, pers_name);
@@ -7097,9 +7151,9 @@ static int check_stripe_cache(struct mddev *mddev)
 	 */
 	struct r5conf *conf = mddev->private;
 	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-	    > conf->max_nr_stripes ||
+	    > conf->min_nr_stripes ||
 	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-	    > conf->max_nr_stripes) {
+	    > conf->min_nr_stripes) {
 		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
 		       mdname(mddev),
 		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ebe4e24bc14d..7dc0dd86074b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -433,6 +433,7 @@ struct r5conf {
 	int			max_degraded;
 	int			raid_disks;
 	int			max_nr_stripes;
+	int			min_nr_stripes;
 
 	/* reshape_progress is the leading edge of a 'reshape'
 	 * It has value MaxSector when no reshape is happening
@@ -513,7 +514,15 @@ struct r5conf {
 #define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
 					 * waiting for 25% to be free
 					 */
-
+#define R5_ALLOC_MORE		2	/* It might help to allocate another
+					 * stripe.
+					 */
+#define R5_DID_ALLOC		4	/* A stripe was allocated, don't allocate
+					 * more until at least one has been
+					 * released.  This avoids flooding
+					 * the cache.
+					 */
+	struct shrinker		shrinker;
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
 	struct disk_info	*disks;
-- 
cgit v1.2.3