From 46d5b785621ad10a373e292f9101ccfc626466e0 Mon Sep 17 00:00:00 2001 From: "shli@kernel.org" Date: Mon, 15 Dec 2014 12:57:02 +1100 Subject: raid5: use flex_array for scribble data Use flex_array for scribble data. Next patch will batch several stripes together, so scribble data should be able to cover several stripes, so this patch also allocates scribble data for stripes across a chunk. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 983e18a83db1..1d0f241d7d3b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -458,15 +458,11 @@ struct r5conf { /* per cpu variables */ struct raid5_percpu { struct page *spare_page; /* Used when checking P/Q in raid6 */ - void *scribble; /* space for constructing buffer + struct flex_array *scribble; /* space for constructing buffer * lists and performing address * conversions */ } __percpu *percpu; - size_t scribble_len; /* size of scribble region must be - * associated with conf to handle - * cpu hotplug while reshaping - */ #ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif -- cgit v1.2.3 From da41ba65972532a04f73927c903029a7ec3bc2ed Mon Sep 17 00:00:00 2001 From: "shli@kernel.org" Date: Mon, 15 Dec 2014 12:57:03 +1100 Subject: raid5: add a new flag to track if a stripe can be batched A freshly new stripe with write request can be batched. Any time the stripe is handled or new read is queued, the flag will be cleared. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 12 +++++++++--- drivers/md/raid5.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7fb510e54548..49b0f23dbad2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -555,6 +555,7 @@ retry: goto retry; insert_hash(conf, sh); sh->cpu = smp_processor_id(); + set_bit(STRIPE_BATCH_READY, &sh->state); } static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, @@ -2645,7 +2646,8 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, * toread/towrite point to the first in a chain. * The bi_next chain must be in order. */ -static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) +static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, + int forwrite, int previous) { struct bio **bip; struct r5conf *conf = sh->raid_conf; @@ -2678,6 +2680,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; + if (!forwrite || previous) + clear_bit(STRIPE_BATCH_READY, &sh->state); + BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); if (*bip) bi->bi_next = *bip; @@ -3824,6 +3829,7 @@ static void handle_stripe(struct stripe_head *sh) return; } + clear_bit(STRIPE_BATCH_READY, &sh->state); if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { spin_lock(&sh->stripe_lock); /* Cannot process 'sync' concurrently with 'discard' */ @@ -4793,7 +4799,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) } if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw)) { + !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or * add failed due to overlap. Flush everything * and wait a while @@ -5206,7 +5212,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) return handled; } - if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { + if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { release_stripe(sh); raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 1d0f241d7d3b..37644e3d5293 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -327,6 +327,7 @@ enum { STRIPE_ON_UNPLUG_LIST, STRIPE_DISCARD, STRIPE_ON_RELEASE_LIST, + STRIPE_BATCH_READY, }; /* -- cgit v1.2.3 From 7a87f43405e91ca12b8770eb689dd9886f217091 Mon Sep 17 00:00:00 2001 From: "shli@kernel.org" Date: Mon, 15 Dec 2014 12:57:03 +1100 Subject: raid5: track overwrite disk count Track overwrite disk count, so we can know if a stripe is a full stripe write. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 14 +++++++++++++- drivers/md/raid5.h | 4 ++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 49b0f23dbad2..e801c6669c6d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -553,6 +553,7 @@ retry: } if (read_seqcount_retry(&conf->gen_lock, seq)) goto retry; + sh->overwrite_disks = 0; insert_hash(conf, sh); sh->cpu = smp_processor_id(); set_bit(STRIPE_BATCH_READY, &sh->state); @@ -710,6 +711,12 @@ get_active_stripe(struct r5conf *conf, sector_t sector, return sh; } +static bool is_full_stripe_write(struct stripe_head *sh) +{ + BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); + return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); +} + /* Determine if 'data_offset' or 'new_data_offset' should be used * in this stripe_head. */ @@ -1413,6 +1420,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) spin_lock_irq(&sh->stripe_lock); chosen = dev->towrite; dev->towrite = NULL; + sh->overwrite_disks = 0; BUG_ON(dev->written); wbi = dev->written = chosen; spin_unlock_irq(&sh->stripe_lock); @@ -2700,7 +2708,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, sector = bio_end_sector(bi); } if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) - set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); + if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) + sh->overwrite_disks++; } pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", @@ -2772,6 +2781,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; + sh->overwrite_disks = 0; spin_unlock_irq(&sh->stripe_lock); if (bi) bitmap_end = 1; @@ -4630,12 +4640,14 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } set_bit(STRIPE_DISCARD, &sh->state); finish_wait(&conf->wait_for_overlap, &w); + sh->overwrite_disks = 0; for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) continue; sh->dev[d].towrite = bi; set_bit(R5_OVERWRITE, &sh->dev[d].flags); raid5_inc_bi_active_stripes(bi); + sh->overwrite_disks++; } spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap) { diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 37644e3d5293..4cc1a48127c7 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -210,6 +210,10 @@ struct stripe_head { atomic_t count; /* nr of active thread/requests */ int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ + int overwrite_disks; /* total overwrite disks in stripe, + * this is only checked when stripe + * has STRIPE_BATCH_READY + */ enum check_states check_state; enum reconstruct_states reconstruct_state; spinlock_t stripe_lock; -- cgit v1.2.3 From 59fc630b8b5f9f21c8ce3ba153341c107dce1b0c Mon Sep 17 00:00:00 2001 From: "shli@kernel.org" Date: Mon, 15 Dec 2014 12:57:03 +1100 Subject: RAID5: batch adjacent full stripe write stripe cache is 4k size. Even adjacent full stripe writes are handled in 4k unit. Idealy we should use big size for adjacent full stripe writes. Bigger stripe cache size means less stripes runing in the state machine so can reduce cpu overhead. And also bigger size can cause bigger IO size dispatched to under layer disks. With below patch, we will automatically batch adjacent full stripe write together. Such stripes will be added to the batch list. Only the first stripe of the list will be put to handle_list and so run handle_stripe(). Some steps of handle_stripe() are extended to cover all stripes of the list, including ops_run_io, ops_run_biodrain and so on. With this patch, we have less stripes running in handle_stripe() and we send IO of whole stripe list together to increase IO size. Stripes added to a batch list have some limitations. A batch list can only include full stripe write and can't cross chunk boundary to make sure stripes have the same parity disks. Stripes in a batch list must be in the same state (no written, toread and so on). If a stripe is in a batch list, all new read/write to add_stripe_bio will be blocked to overlap conflict till the batch list is handled. The limitations will make sure stripes in a batch list be in exactly the same state in the life circly. I did test running 160k randwrite in a RAID5 array with 32k chunk size and 6 PCIe SSD. This patch improves around 30% performance and IO size to under layer disk is exactly 32k. I also run a 4k randwrite test in the same array to make sure the performance isn't changed with the patch. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 357 +++++++++++++++++++++++++++++++++++++++++++++++++---- drivers/md/raid5.h | 4 + 2 files changed, 336 insertions(+), 25 deletions(-) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e801c6669c6d..717189e74243 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -526,6 +526,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); BUG_ON(stripe_operations_active(sh)); + BUG_ON(sh->batch_head); pr_debug("init_stripe called, stripe %llu\n", (unsigned long long)sector); @@ -717,6 +718,124 @@ static bool is_full_stripe_write(struct stripe_head *sh) return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); } +static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) +{ + local_irq_disable(); + if (sh1 > sh2) { + spin_lock(&sh2->stripe_lock); + spin_lock_nested(&sh1->stripe_lock, 1); + } else { + spin_lock(&sh1->stripe_lock); + spin_lock_nested(&sh2->stripe_lock, 1); + } +} + +static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) +{ + spin_unlock(&sh1->stripe_lock); + spin_unlock(&sh2->stripe_lock); + local_irq_enable(); +} + +/* Only freshly new full stripe normal write stripe can be added to a batch list */ +static bool stripe_can_batch(struct stripe_head *sh) +{ + return test_bit(STRIPE_BATCH_READY, &sh->state) && + is_full_stripe_write(sh); +} + +/* we only do back search */ +static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) +{ + struct stripe_head *head; + sector_t head_sector, tmp_sec; + int hash; + int dd_idx; + + if (!stripe_can_batch(sh)) + return; + /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ + tmp_sec = sh->sector; + if (!sector_div(tmp_sec, conf->chunk_sectors)) + return; + head_sector = sh->sector - STRIPE_SECTORS; + + hash = stripe_hash_locks_hash(head_sector); + spin_lock_irq(conf->hash_locks + hash); + head = __find_stripe(conf, head_sector, conf->generation); + if (head && !atomic_inc_not_zero(&head->count)) { + spin_lock(&conf->device_lock); + if (!atomic_read(&head->count)) { + if (!test_bit(STRIPE_HANDLE, &head->state)) + atomic_inc(&conf->active_stripes); + BUG_ON(list_empty(&head->lru) && + !test_bit(STRIPE_EXPANDING, &head->state)); + list_del_init(&head->lru); + if (head->group) { + head->group->stripes_cnt--; + head->group = NULL; + } + } + atomic_inc(&head->count); + spin_unlock(&conf->device_lock); + } + spin_unlock_irq(conf->hash_locks + hash); + + if (!head) + return; + if (!stripe_can_batch(head)) + goto out; + + lock_two_stripes(head, sh); + /* clear_batch_ready clear the flag */ + if (!stripe_can_batch(head) || !stripe_can_batch(sh)) + goto unlock_out; + + if (sh->batch_head) + goto unlock_out; + + dd_idx = 0; + while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + dd_idx++; + if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw) + goto unlock_out; + + if (head->batch_head) { + spin_lock(&head->batch_head->batch_lock); + /* This batch list is already running */ + if (!stripe_can_batch(head)) { + spin_unlock(&head->batch_head->batch_lock); + goto unlock_out; + } + + /* + * at this point, head's BATCH_READY could be cleared, but we + * can still add the stripe to batch list + */ + list_add(&sh->batch_list, &head->batch_list); + spin_unlock(&head->batch_head->batch_lock); + + sh->batch_head = head->batch_head; + } else { + head->batch_head = head; + sh->batch_head = head->batch_head; + spin_lock(&head->batch_lock); + list_add_tail(&sh->batch_list, &head->batch_list); + spin_unlock(&head->batch_lock); + } + + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + + atomic_inc(&sh->count); +unlock_out: + unlock_two_stripes(head, sh); +out: + release_stripe(head); +} + /* Determine if 'data_offset' or 'new_data_offset' should be used * in this stripe_head. */ @@ -747,6 +866,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { struct r5conf *conf = sh->raid_conf; int i, disks = sh->disks; + struct stripe_head *head_sh = sh; might_sleep(); @@ -755,6 +875,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) int replace_only = 0; struct bio *bi, *rbi; struct md_rdev *rdev, *rrdev = NULL; + + sh = head_sh; if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) rw = WRITE_FUA; @@ -773,6 +895,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) rw |= REQ_SYNC; +again: bi = &sh->dev[i].req; rbi = &sh->dev[i].rreq; /* For writing to replacement */ @@ -791,7 +914,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) /* We raced and saw duplicates */ rrdev = NULL; } else { - if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) + if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) rdev = rrdev; rrdev = NULL; } @@ -862,13 +985,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) __func__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); + if (sh != head_sh) + atomic_inc(&head_sh->count); if (use_new_offset(conf, sh)) bi->bi_iter.bi_sector = (sh->sector + rdev->new_data_offset); else bi->bi_iter.bi_sector = (sh->sector + rdev->data_offset); - if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) bi->bi_rw |= REQ_NOMERGE; if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) @@ -912,6 +1037,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) __func__, (unsigned long long)sh->sector, rbi->bi_rw, i); atomic_inc(&sh->count); + if (sh != head_sh) + atomic_inc(&head_sh->count); if (use_new_offset(conf, sh)) rbi->bi_iter.bi_sector = (sh->sector + rrdev->new_data_offset); @@ -945,6 +1072,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); } + + if (!head_sh->batch_head) + continue; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + if (sh != head_sh) + goto again; } } @@ -1060,6 +1194,7 @@ static void ops_run_biofill(struct stripe_head *sh) struct async_submit_ctl submit; int i; + BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1148,6 +1283,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) struct async_submit_ctl submit; int i; + BUG_ON(sh->batch_head); + pr_debug("%s: stripe %llu block: %d\n", __func__, (unsigned long long)sh->sector, target); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); @@ -1214,6 +1351,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) int i; int count; + BUG_ON(sh->batch_head); if (sh->ops.target < 0) target = sh->ops.target2; else if (sh->ops.target2 < 0) @@ -1272,6 +1410,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) struct page **blocks = to_addr_page(percpu, 0); struct async_submit_ctl submit; + BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu block1: %d block2: %d\n", __func__, (unsigned long long)sh->sector, target, target2); BUG_ON(target < 0 || target2 < 0); @@ -1384,6 +1523,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1406,17 +1546,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; int i; + struct stripe_head *head_sh = sh; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; + struct r5dev *dev; struct bio *chosen; - if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { + sh = head_sh; + if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { struct bio *wbi; +again: + dev = &sh->dev[i]; spin_lock_irq(&sh->stripe_lock); chosen = dev->towrite; dev->towrite = NULL; @@ -1445,6 +1589,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } wbi = r5_next_bio(wbi, dev->sector); } + + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, + batch_list); + if (sh == head_sh) + continue; + goto again; + } } } @@ -1500,12 +1653,15 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - struct page **xor_srcs = to_addr_page(percpu, 0); + struct page **xor_srcs; struct async_submit_ctl submit; - int count = 0, pd_idx = sh->pd_idx, i; + int count, pd_idx = sh->pd_idx, i; struct page *xor_dest; int prexor = 0; unsigned long flags; + int j = 0; + struct stripe_head *head_sh = sh; + int last_stripe; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1522,15 +1678,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, ops_complete_reconstruct(sh); return; } +again: + count = 0; + xor_srcs = to_addr_page(percpu, j); /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->written) + if (head_sh->dev[i].written) xor_srcs[count++] = dev->page; } } else { @@ -1547,17 +1706,32 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST * for the synchronous xor case */ - flags = ASYNC_TX_ACK | - (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); - - atomic_inc(&sh->count); + last_stripe = !head_sh->batch_head || + list_first_entry(&sh->batch_list, + struct stripe_head, batch_list) == head_sh; + if (last_stripe) { + flags = ASYNC_TX_ACK | + (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); + + atomic_inc(&head_sh->count); + init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, + to_addr_conv(sh, percpu, j)); + } else { + flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; + init_async_submit(&submit, flags, tx, NULL, NULL, + to_addr_conv(sh, percpu, j)); + } - init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, - to_addr_conv(sh, percpu, 0)); if (unlikely(count == 1)) tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + if (!last_stripe) { + j++; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + goto again; + } } static void @@ -1565,8 +1739,10 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { struct async_submit_ctl submit; - struct page **blocks = to_addr_page(percpu, 0); - int count, i; + struct page **blocks; + int count, i, j = 0; + struct stripe_head *head_sh = sh; + int last_stripe; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1584,13 +1760,27 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, return; } +again: + blocks = to_addr_page(percpu, j); count = set_syndrome_sources(blocks, sh); - - atomic_inc(&sh->count); - - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, - sh, to_addr_conv(sh, percpu, 0)); + last_stripe = !head_sh->batch_head || + list_first_entry(&sh->batch_list, + struct stripe_head, batch_list) == head_sh; + + if (last_stripe) { + atomic_inc(&head_sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, + head_sh, to_addr_conv(sh, percpu, j)); + } else + init_async_submit(&submit, 0, tx, NULL, NULL, + to_addr_conv(sh, percpu, j)); async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + if (!last_stripe) { + j++; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + goto again; + } } static void ops_complete_check(void *stripe_head_ref) @@ -1620,6 +1810,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + BUG_ON(sh->batch_head); count = 0; xor_dest = sh->dev[pd_idx].page; xor_srcs[count++] = xor_dest; @@ -1648,6 +1839,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu pr_debug("%s: stripe %llu checkp: %d\n", __func__, (unsigned long long)sh->sector, checkp); + BUG_ON(sh->batch_head); count = set_syndrome_sources(srcs, sh); if (!checkp) srcs[count] = NULL; @@ -1715,7 +1907,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) BUG(); } - if (overlap_clear) + if (overlap_clear && !sh->batch_head) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_Overlap, &dev->flags)) @@ -1745,6 +1937,10 @@ static int grow_one_stripe(struct r5conf *conf, int hash) atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); INIT_LIST_HEAD(&sh->lru); + + spin_lock_init(&sh->batch_lock); + INIT_LIST_HEAD(&sh->batch_list); + sh->batch_head = NULL; release_stripe(sh); return 1; } @@ -2188,6 +2384,9 @@ static void raid5_end_write_request(struct bio *bi, int error) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); + + if (sh->batch_head && sh != sh->batch_head) + release_stripe(sh->batch_head); } static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); @@ -2674,6 +2873,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, * protect it. */ spin_lock_irq(&sh->stripe_lock); + /* Don't allow new IO added to stripes in batch list */ + if (sh->batch_head) + goto overlap; if (forwrite) { bip = &sh->dev[dd_idx].towrite; if (*bip == NULL) @@ -2723,6 +2925,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, sh->bm_seq = conf->seq_flush+1; set_bit(STRIPE_BIT_DELAY, &sh->state); } + + if (stripe_can_batch(sh)) + stripe_add_to_batch_list(conf, sh); return 1; overlap: @@ -2755,6 +2960,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio **return_bi) { int i; + BUG_ON(sh->batch_head); for (i = disks; i--; ) { struct bio *bi; int bitmap_end = 0; @@ -2870,6 +3076,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, int abort = 0; int i; + BUG_ON(sh->batch_head); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) wake_up(&conf->wait_for_overlap); @@ -3100,6 +3307,7 @@ static void handle_stripe_fill(struct stripe_head *sh, { int i; + BUG_ON(sh->batch_head); /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write @@ -3123,6 +3331,9 @@ static void handle_stripe_clean_event(struct r5conf *conf, int i; struct r5dev *dev; int discard_pending = 0; + struct stripe_head *head_sh = sh; + bool do_endio = false; + int wakeup_nr = 0; for (i = disks; i--; ) if (sh->dev[i].written) { @@ -3138,8 +3349,11 @@ static void handle_stripe_clean_event(struct r5conf *conf, clear_bit(R5_UPTODATE, &dev->flags); if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); - dev->page = dev->orig_page; } + do_endio = true; + +returnbi: + dev->page = dev->orig_page; wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < @@ -3156,6 +3370,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), 0); + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, + batch_list); + if (sh != head_sh) { + dev = &sh->dev[i]; + goto returnbi; + } + } + sh = head_sh; + dev = &sh->dev[i]; } else if (test_bit(R5_Discard, &dev->flags)) discard_pending = 1; WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); @@ -3177,8 +3402,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, * will be reinitialized */ spin_lock_irq(&conf->device_lock); +unhash: remove_hash(sh); + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, batch_list); + if (sh != head_sh) + goto unhash; + } spin_unlock_irq(&conf->device_lock); + sh = head_sh; + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) set_bit(STRIPE_HANDLE, &sh->state); @@ -3187,6 +3421,39 @@ static void handle_stripe_clean_event(struct r5conf *conf, if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); + + if (!head_sh->batch_head || !do_endio) + return; + for (i = 0; i < head_sh->disks; i++) { + if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) + wakeup_nr++; + } + while (!list_empty(&head_sh->batch_list)) { + int i; + sh = list_first_entry(&head_sh->batch_list, + struct stripe_head, batch_list); + list_del_init(&sh->batch_list); + + sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) | + (1 << STRIPE_PREREAD_ACTIVE))); + sh->check_state = head_sh->check_state; + sh->reconstruct_state = head_sh->reconstruct_state; + for (i = 0; i < sh->disks; i++) { + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wakeup_nr++; + sh->dev[i].flags = head_sh->dev[i].flags; + } + + spin_lock_irq(&sh->stripe_lock); + sh->batch_head = NULL; + spin_unlock_irq(&sh->stripe_lock); + release_stripe(sh); + } + + spin_lock_irq(&head_sh->stripe_lock); + head_sh->batch_head = NULL; + spin_unlock_irq(&head_sh->stripe_lock); + wake_up_nr(&conf->wait_for_overlap, wakeup_nr); } static void handle_stripe_dirtying(struct r5conf *conf, @@ -3326,6 +3593,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, { struct r5dev *dev = NULL; + BUG_ON(sh->batch_head); set_bit(STRIPE_HANDLE, &sh->state); switch (sh->check_state) { @@ -3416,6 +3684,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, int qd_idx = sh->qd_idx; struct r5dev *dev; + BUG_ON(sh->batch_head); set_bit(STRIPE_HANDLE, &sh->state); BUG_ON(s->failed > 2); @@ -3579,6 +3848,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) * copy some of them into a target stripe for expand. */ struct dma_async_tx_descriptor *tx = NULL; + BUG_ON(sh->batch_head); clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) if (i != sh->pd_idx && i != sh->qd_idx) { @@ -3822,6 +4092,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) rcu_read_unlock(); } +static int clear_batch_ready(struct stripe_head *sh) +{ + struct stripe_head *tmp; + if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) + return 0; + spin_lock(&sh->stripe_lock); + if (!sh->batch_head) { + spin_unlock(&sh->stripe_lock); + return 0; + } + + /* + * this stripe could be added to a batch list before we check + * BATCH_READY, skips it + */ + if (sh->batch_head != sh) { + spin_unlock(&sh->stripe_lock); + return 1; + } + spin_lock(&sh->batch_lock); + list_for_each_entry(tmp, &sh->batch_list, batch_list) + clear_bit(STRIPE_BATCH_READY, &tmp->state); + spin_unlock(&sh->batch_lock); + spin_unlock(&sh->stripe_lock); + + /* + * BATCH_READY is cleared, no new stripes can be added. + * batch_list can be accessed without lock + */ + return 0; +} + static void handle_stripe(struct stripe_head *sh) { struct stripe_head_state s; @@ -3839,7 +4141,11 @@ static void handle_stripe(struct stripe_head *sh) return; } - clear_bit(STRIPE_BATCH_READY, &sh->state); + if (clear_batch_ready(sh) ) { + clear_bit_unlock(STRIPE_ACTIVE, &sh->state); + return; + } + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { spin_lock(&sh->stripe_lock); /* Cannot process 'sync' concurrently with 'discard' */ @@ -4824,7 +5130,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) } set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - if ((bi->bi_rw & REQ_SYNC) && + if ((!sh->batch_head || sh == sh->batch_head) && + (bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); release_stripe_plug(mddev, sh); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4cc1a48127c7..c8d0004dca8f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -219,6 +219,10 @@ struct stripe_head { spinlock_t stripe_lock; int cpu; struct r5worker_group *group; + + struct stripe_head *batch_head; /* protected by stripe lock */ + spinlock_t batch_lock; /* only header's lock is useful */ + struct list_head batch_list; /* protected by head's batch lock*/ /** * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target -- cgit v1.2.3 From 72ac733015bbdc0356ba3e92c52137a265910a91 Mon Sep 17 00:00:00 2001 From: "shli@kernel.org" Date: Mon, 15 Dec 2014 12:57:03 +1100 Subject: raid5: handle io error of batch list If io error happens in any stripe of a batch list, the batch list will be split, then normal process will run for the stripes in the list. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/raid5.h | 1 + 2 files changed, 49 insertions(+) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 717189e74243..54f3cb312b42 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1070,6 +1070,9 @@ again: pr_debug("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); + if (sh->batch_head) + set_bit(STRIPE_BATCH_ERR, + &sh->batch_head->state); set_bit(STRIPE_HANDLE, &sh->state); } @@ -2380,6 +2383,9 @@ static void raid5_end_write_request(struct bio *bi, int error) } rdev_dec_pending(rdev, conf->mddev); + if (sh->batch_head && !uptodate) + set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); + if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); @@ -4124,6 +4130,46 @@ static int clear_batch_ready(struct stripe_head *sh) return 0; } +static void check_break_stripe_batch_list(struct stripe_head *sh) +{ + struct stripe_head *head_sh, *next; + int i; + + if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) + return; + + head_sh = sh; + do { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, batch_list); + BUG_ON(sh == head_sh); + } while (!test_bit(STRIPE_DEGRADED, &sh->state)); + + while (sh != head_sh) { + next = list_first_entry(&sh->batch_list, + struct stripe_head, batch_list); + list_del_init(&sh->batch_list); + + sh->state = head_sh->state & ~((1 << STRIPE_ACTIVE) | + (1 << STRIPE_PREREAD_ACTIVE) | + (1 << STRIPE_DEGRADED)); + sh->check_state = head_sh->check_state; + sh->reconstruct_state = head_sh->reconstruct_state; + for (i = 0; i < sh->disks; i++) + sh->dev[i].flags = head_sh->dev[i].flags & + (~((1 << R5_WriteError) | (1 << R5_Overlap))); + + spin_lock_irq(&sh->stripe_lock); + sh->batch_head = NULL; + spin_unlock_irq(&sh->stripe_lock); + + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + + sh = next; + } +} + static void handle_stripe(struct stripe_head *sh) { struct stripe_head_state s; @@ -4146,6 +4192,8 @@ static void handle_stripe(struct stripe_head *sh) return; } + check_break_stripe_batch_list(sh); + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { spin_lock(&sh->stripe_lock); /* Cannot process 'sync' concurrently with 'discard' */ diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index c8d0004dca8f..cf3562e99440 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -336,6 +336,7 @@ enum { STRIPE_DISCARD, STRIPE_ON_RELEASE_LIST, STRIPE_BATCH_READY, + STRIPE_BATCH_ERR, }; /* -- cgit v1.2.3 From dabc4ec6ba72418ebca6bf1884f344bba40c8709 Mon Sep 17 00:00:00 2001 From: "shli@kernel.org" Date: Mon, 15 Dec 2014 12:57:04 +1100 Subject: raid5: handle expansion/resync case with stripe batching expansion/resync can grab a stripe when the stripe is in batch list. Since all stripes in batch list must be in the same state, we can't allow some stripes run into expansion/resync. So we delay expansion/resync for stripe in batch list. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 24 ++++++++++++++++-------- drivers/md/raid5.h | 5 +++++ 2 files changed, 21 insertions(+), 8 deletions(-) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54f3cb312b42..3ae097d50b51 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3440,8 +3440,10 @@ unhash: struct stripe_head, batch_list); list_del_init(&sh->batch_list); - sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) | - (1 << STRIPE_PREREAD_ACTIVE))); + set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, + head_sh->state & ~((1 << STRIPE_ACTIVE) | + (1 << STRIPE_PREREAD_ACTIVE) | + STRIPE_EXPAND_SYNC_FLAG)); sh->check_state = head_sh->check_state; sh->reconstruct_state = head_sh->reconstruct_state; for (i = 0; i < sh->disks; i++) { @@ -3453,6 +3455,8 @@ unhash: spin_lock_irq(&sh->stripe_lock); sh->batch_head = NULL; spin_unlock_irq(&sh->stripe_lock); + if (sh->state & STRIPE_EXPAND_SYNC_FLAG) + set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -3460,6 +3464,8 @@ unhash: head_sh->batch_head = NULL; spin_unlock_irq(&head_sh->stripe_lock); wake_up_nr(&conf->wait_for_overlap, wakeup_nr); + if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) + set_bit(STRIPE_HANDLE, &head_sh->state); } static void handle_stripe_dirtying(struct r5conf *conf, @@ -3927,8 +3933,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) memset(s, 0, sizeof(*s)); - s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; + s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; s->failed_num[0] = -1; s->failed_num[1] = -1; @@ -4150,9 +4156,11 @@ static void check_break_stripe_batch_list(struct stripe_head *sh) struct stripe_head, batch_list); list_del_init(&sh->batch_list); - sh->state = head_sh->state & ~((1 << STRIPE_ACTIVE) | - (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED)); + set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, + head_sh->state & ~((1 << STRIPE_ACTIVE) | + (1 << STRIPE_PREREAD_ACTIVE) | + (1 << STRIPE_DEGRADED) | + STRIPE_EXPAND_SYNC_FLAG)); sh->check_state = head_sh->check_state; sh->reconstruct_state = head_sh->reconstruct_state; for (i = 0; i < sh->disks; i++) @@ -4194,7 +4202,7 @@ static void handle_stripe(struct stripe_head *sh) check_break_stripe_batch_list(sh); - if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { spin_lock(&sh->stripe_lock); /* Cannot process 'sync' concurrently with 'discard' */ if (!test_bit(STRIPE_DISCARD, &sh->state) && diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index cf3562e99440..ee65ed844d3f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -339,6 +339,11 @@ enum { STRIPE_BATCH_ERR, }; +#define STRIPE_EXPAND_SYNC_FLAG \ + ((1 << STRIPE_EXPAND_SOURCE) |\ + (1 << STRIPE_EXPAND_READY) |\ + (1 << STRIPE_EXPANDING) |\ + (1 << STRIPE_SYNC_REQUESTED)) /* * Operation request flags */ -- cgit v1.2.3 From 584acdd49cd2472ca0f5a06adbe979db82d0b4af Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:05 +1100 Subject: md/raid5: activate raid6 rmw feature Glue it altogehter. The raid6 rmw path should work the same as the already existing raid5 logic. So emulate the prexor handling/flags and split functions as needed. 1) Enable xor_syndrome() in the async layer. 2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. Xor the syndrome at the start of a rmw run as we did it before for the single parity. 3) Take care of rmw run in ops_run_reconstruct6(). Again process only the changed pages to get syndrome back into sync. 4) Enhance set_syndrome_sources() to fill NULL pages if we are in a rmw run. The lower layers will calculate start & end pages from that and call the xor_syndrome() correspondingly. 5) Adapt the several places where we ignored Q handling up to now. Performance numbers for a single E5630 system with a mix of 10 7200k desktop/server disks. 300 seconds random write with 8 threads onto a 3,2TB (10*400GB) RAID6 64K chunk without spare (group_thread_cnt=4) bsize rmw_level=1 rmw_level=0 rmw_level=1 rmw_level=0 skip_copy=1 skip_copy=1 skip_copy=0 skip_copy=0 4K 115 KB/s 141 KB/s 165 KB/s 140 KB/s 8K 225 KB/s 275 KB/s 324 KB/s 274 KB/s 16K 434 KB/s 536 KB/s 640 KB/s 534 KB/s 32K 751 KB/s 1,051 KB/s 1,234 KB/s 1,045 KB/s 64K 1,339 KB/s 1,958 KB/s 2,282 KB/s 1,962 KB/s 128K 2,673 KB/s 3,862 KB/s 4,113 KB/s 3,898 KB/s 256K 7,685 KB/s 7,539 KB/s 7,557 KB/s 7,638 KB/s 512K 19,556 KB/s 19,558 KB/s 19,652 KB/s 19,688 Kb/s Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- crypto/async_tx/async_pq.c | 19 +++++++-- drivers/md/raid5.c | 104 +++++++++++++++++++++++++++++++++------------ drivers/md/raid5.h | 19 ++++++++- include/linux/async_tx.h | 3 ++ 4 files changed, 115 insertions(+), 30 deletions(-) (limited to 'drivers/md/raid5.h') diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index d05327caf69d..5d355e0c2633 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, { void **srcs; int i; + int start = -1, stop = disks - 3; if (submit->scribble) srcs = submit->scribble; @@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, if (blocks[i] == NULL) { BUG_ON(i > disks - 3); /* P or Q can't be zero */ srcs[i] = (void*)raid6_empty_zero_page; - } else + } else { srcs[i] = page_address(blocks[i]) + offset; + if (i < disks - 2) { + stop = i; + if (start == -1) + start = i; + } + } } - raid6_call.gen_syndrome(disks, len, srcs); + if (submit->flags & ASYNC_TX_PQ_XOR_DST) { + BUG_ON(!raid6_call.xor_syndrome); + if (start >= 0) + raid6_call.xor_syndrome(disks, start, stop, len, srcs); + } else + raid6_call.gen_syndrome(disks, len, srcs); async_tx_sync_epilog(submit); } @@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks, if (device) unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO); - if (unmap && + /* XORing P/Q is only implemented in software */ + if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) && (src_cnt <= dma_maxpq(device, 0) || dma_maxpq(device, DMA_PREP_CONTINUE) > 0) && is_dma_pq_aligned(device, offset, 0, len)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3ae097d50b51..c82ce1fd8723 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) * destination buffer is recorded in srcs[count] and the Q destination * is recorded in srcs[count+1]]. */ -static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) +static int set_syndrome_sources(struct page **srcs, + struct stripe_head *sh, + int srctype) { int disks = sh->disks; int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); @@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + struct r5dev *dev = &sh->dev[i]; - srcs[slot] = sh->dev[i].page; + if (i == sh->qd_idx || i == sh->pd_idx || + (srctype == SYNDROME_SRC_ALL) || + (srctype == SYNDROME_SRC_WANT_DRAIN && + test_bit(R5_Wantdrain, &dev->flags)) || + (srctype == SYNDROME_SRC_WRITTEN && + dev->written)) + srcs[slot] = sh->dev[i].page; i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) atomic_inc(&sh->count); if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh); + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); blocks[count] = NULL; /* regenerating p is not necessary */ BUG_ON(blocks[count+1] != dest); /* q should already be set */ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, @@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); - count = set_syndrome_sources(blocks, sh); + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); init_async_submit(&submit, ASYNC_TX_FENCE, tx, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); @@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) +ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); @@ -1544,6 +1553,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, return tx; } +static struct dma_async_tx_descriptor * +ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct page **blocks = to_addr_page(percpu, 0); + int count; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); + tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + + return tx; +} + static struct dma_async_tx_descriptor * ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { @@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, int count, i, j = 0; struct stripe_head *head_sh = sh; int last_stripe; + int synflags; + unsigned long txflags; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, again: blocks = to_addr_page(percpu, j); - count = set_syndrome_sources(blocks, sh); + + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + synflags = SYNDROME_SRC_WRITTEN; + txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; + } else { + synflags = SYNDROME_SRC_ALL; + txflags = ASYNC_TX_ACK; + } + + count = set_syndrome_sources(blocks, sh, synflags); last_stripe = !head_sh->batch_head || list_first_entry(&sh->batch_list, struct stripe_head, batch_list) == head_sh; if (last_stripe) { atomic_inc(&head_sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, + init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, head_sh, to_addr_conv(sh, percpu, j)); } else init_async_submit(&submit, 0, tx, NULL, NULL, @@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu (unsigned long long)sh->sector, checkp); BUG_ON(sh->batch_head); - count = set_syndrome_sources(srcs, sh); + count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); if (!checkp) srcs[count] = NULL; @@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) async_tx_ack(tx); } - if (test_bit(STRIPE_OP_PREXOR, &ops_request)) - tx = ops_run_prexor(sh, percpu, tx); + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { + if (level < 6) + tx = ops_run_prexor5(sh, percpu, tx); + else + tx = ops_run_prexor6(sh, percpu, tx); + } if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); @@ -2770,7 +2814,7 @@ static void schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { - int i, pd_idx = sh->pd_idx, disks = sh->disks; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; struct r5conf *conf = sh->raid_conf; int level = conf->level; @@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&conf->pending_full_writes); } else { - BUG_ON(level == 6); BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + BUG_ON(level == 6 && + (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (i == pd_idx) + if (i == pd_idx || i == qd_idx) continue; if (dev->towrite && @@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf, int rmw = 0, rcw = 0, i; sector_t recovery_cp = conf->mddev->recovery_cp; - /* RAID6 requires 'rcw' in current implementation. - * Otherwise, check whether resync is now happening or should start. + /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or * initial creation), so parity in some stripes might be inconsistent. * In this case, we need to always do reconstruct-write, to ensure * that in case of drive failure or read-error correction, we * generate correct data from the parity. */ - if (conf->max_degraded == 2 || + if (conf->rmw_level == PARITY_DISABLE_RMW || (recovery_cp < MaxSector && sh->sector >= recovery_cp && s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; - pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", - conf->max_degraded, (unsigned long long)recovery_cp, + pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)recovery_cp, (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { @@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, rmw += 2*disks; /* cannot read it */ } /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { @@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) { + if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ if (conf->mddev->queue) blk_add_trace_msg(conf->mddev->queue, @@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && @@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } - if (rcw <= rmw && rcw > 0) { + if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { /* want reconstruct write, but need to get some data */ int qread =0; rcw = 0; @@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) } conf->level = mddev->new_level; - if (conf->level == 6) + if (conf->level == 6) { conf->max_degraded = 2; - else + if (raid6_call.xor_syndrome) + conf->rmw_level = PARITY_ENABLE_RMW; + else + conf->rmw_level = PARITY_DISABLE_RMW; + } else { conf->max_degraded = 1; + conf->rmw_level = PARITY_ENABLE_RMW; + } conf->algorithm = mddev->new_layout; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ee65ed844d3f..57fef9ba36fa 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -355,6 +355,23 @@ enum { STRIPE_OP_RECONSTRUCT, STRIPE_OP_CHECK, }; + +/* + * RAID parity calculation preferences + */ +enum { + PARITY_DISABLE_RMW = 0, + PARITY_ENABLE_RMW, +}; + +/* + * Pages requested from set_syndrome_sources() + */ +enum { + SYNDROME_SRC_ALL, + SYNDROME_SRC_WANT_DRAIN, + SYNDROME_SRC_WRITTEN, +}; /* * Plugging: * @@ -411,7 +428,7 @@ struct r5conf { spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; struct mddev *mddev; int chunk_sectors; - int level, algorithm; + int level, algorithm, rmw_level; int max_degraded; int raid_disks; int max_nr_stripes; diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 179b38ffd351..388574ea38ed 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -60,12 +60,15 @@ struct dma_chan_ref { * dependency chain * @ASYNC_TX_FENCE: specify that the next operation in the dependency * chain uses this operation's result as an input + * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the + * input data. Required for rmw case. */ enum async_tx_flags { ASYNC_TX_XOR_ZERO_DST = (1 << 0), ASYNC_TX_XOR_DROP_DST = (1 << 1), ASYNC_TX_ACK = (1 << 2), ASYNC_TX_FENCE = (1 << 3), + ASYNC_TX_PQ_XOR_DST = (1 << 4), }; /** -- cgit v1.2.3 From d06f191f8ecaef4d524e765fdb455f96392fbd42 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:05 +1100 Subject: md/raid5: introduce configuration option rmw_level Depending on the available coding we allow optimized rmw logic for write operations. To support easier testing this patch allows manual control of the rmw/rcw descision through the interface /sys/block/mdX/md/rmw_level. The configuration can handle three levels of control. rmw_level=0: Disable rmw for all RAID types. Hardware assisted P/Q calculation has no implementation path yet to factor in/out chunks of a syndrome. Enforcing this level can be benefical for slow CPUs with hardware syndrome support and fast SSDs. rmw_level=1: Estimate rmw IOs and rcw IOs. Execute rmw only if we will save IOs. This equals the "old" unpatched behaviour and will be the default. rmw_level=2: Execute rmw even if calculated IOs for rmw and rcw are equal. We might have higher CPU consumption because of calculating the parity twice but it can be benefical otherwise. E.g. RAID4 with fast dedicated parity disk/SSD. The option is implemented just to be forward-looking and will ONLY work with this patch! Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- drivers/md/raid5.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ drivers/md/raid5.h | 1 + 2 files changed, 45 insertions(+) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c82ce1fd8723..f78b1964543b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5879,6 +5879,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, raid5_show_stripe_cache_size, raid5_store_stripe_cache_size); +static ssize_t +raid5_show_rmw_level(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->rmw_level); + else + return 0; +} + +static ssize_t +raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + + if (!conf) + return -ENODEV; + + if (len >= PAGE_SIZE) + return -EINVAL; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) + return -EINVAL; + + if (new != PARITY_DISABLE_RMW && + new != PARITY_ENABLE_RMW && + new != PARITY_PREFER_RMW) + return -EINVAL; + + conf->rmw_level = new; + return len; +} + +static struct md_sysfs_entry +raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, + raid5_show_rmw_level, + raid5_store_rmw_level); + + static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) { @@ -6065,6 +6108,7 @@ static struct attribute *raid5_attrs[] = { &raid5_preread_bypass_threshold.attr, &raid5_group_thread_cnt.attr, &raid5_skip_copy.attr, + &raid5_rmw_level.attr, NULL, }; static struct attribute_group raid5_attrs_group = { diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 57fef9ba36fa..6614ac5ffc0e 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -362,6 +362,7 @@ enum { enum { PARITY_DISABLE_RMW = 0, PARITY_ENABLE_RMW, + PARITY_PREFER_RMW, }; /* -- cgit v1.2.3 From 5423399a84ee1d92d29d763029ed40e4905cf50f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 26 Feb 2015 12:21:04 +1100 Subject: md/raid5: change ->inactive_blocked to a bit-flag. This allows us to easily add more (atomic) flags. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 13 ++++++++----- drivers/md/raid5.h | 9 ++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 78ac7dc853c7..b7cd32e7f29e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -672,20 +672,23 @@ get_active_stripe(struct r5conf *conf, sector_t sector, *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { - if (!conf->inactive_blocked) + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) sh = get_free_stripe(conf, hash); if (noblock && sh == NULL) break; if (!sh) { - conf->inactive_blocked = 1; + set_bit(R5_INACTIVE_BLOCKED, + &conf->cache_state); wait_event_lock_irq( conf->wait_for_stripe, !list_empty(conf->inactive_list + hash) && (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes * 3 / 4) - || !conf->inactive_blocked), + || !test_bit(R5_INACTIVE_BLOCKED, + &conf->cache_state)), *(conf->hash_locks + hash)); - conf->inactive_blocked = 0; + clear_bit(R5_INACTIVE_BLOCKED, + &conf->cache_state); } else { init_stripe(sh, sector, previous); atomic_inc(&sh->count); @@ -4602,7 +4605,7 @@ static int raid5_congested(struct mddev *mddev, int bits) * how busy the stripe_cache is */ - if (conf->inactive_blocked) + if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) return 1; if (conf->quiesce) return 1; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 6614ac5ffc0e..ebe4e24bc14d 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -509,9 +509,11 @@ struct r5conf { struct llist_head released_stripes; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; - int inactive_blocked; /* release of inactive stripes blocked, - * waiting for 25% to be free - */ + unsigned long cache_state; +#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; @@ -526,6 +528,7 @@ struct r5conf { int worker_cnt_per_group; }; + /* * Our supported algorithms */ -- cgit v1.2.3 From edbe83ab4c27ea6669eb57adb5ed7eaec1118ceb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 26 Feb 2015 12:47:56 +1100 Subject: md/raid5: allow the stripe_cache to grow and shrink. The default setting of 256 stripe_heads is probably much too small for many configurations. So it is best to make it auto-configure. Shrinking the cache under memory pressure is easy. The only interesting part here is that we put a fairly high cost ('seeks') on shrinking the cache as the cost is greater than just having to read more data, it reduces parallelism. Growing the cache on demand needs to be done carefully. If we allow fast growth, that can upset memory balance as lots of dirty memory can quickly turn into lots of memory queued in the stripe_cache. It is important for the raid5 block device to appear congested to allow write-throttling to work. So we only add stripes slowly. We set a flag when an allocation fails because all stripes are in use, allocate at a convenient time when that flag is set, and don't allow it to be set again until at least one stripe_head has been released for re-use. This means that a spurt of requests will only cause one stripe_head to be allocated, but a steady stream of requests will slowly increase the cache size - until memory pressure puts it back again. It could take hours to reach a steady state. The value written to, and displayed in, stripe_cache_size is used as a minimum. The cache can grow above this and shrink back down to it. The actual size is not directly visible, though it can be deduced to some extent by watching stripe_cache_active. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------ drivers/md/raid5.h | 11 ++++++++- 2 files changed, 71 insertions(+), 8 deletions(-) (limited to 'drivers/md/raid5.h') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b7cd32e7f29e..9716319cc477 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -672,8 +672,13 @@ get_active_stripe(struct r5conf *conf, sector_t sector, *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { - if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { sh = get_free_stripe(conf, hash); + if (!sh && llist_empty(&conf->released_stripes) && + !test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, + &conf->cache_state); + } if (noblock && sh == NULL) break; if (!sh) { @@ -5761,6 +5766,8 @@ static void raid5d(struct md_thread *thread) int batch_size, released; released = release_stripe_list(conf, conf->temp_inactive_list); + if (released) + clear_bit(R5_DID_ALLOC, &conf->cache_state); if ( !list_empty(&conf->bitmap_list)) { @@ -5799,6 +5806,13 @@ static void raid5d(struct md_thread *thread) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); + if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) { + grow_one_stripe(conf, __GFP_NOWARN); + /* Set flag even if allocation failed. This helps + * slow down allocation requests when mem is short + */ + set_bit(R5_DID_ALLOC, &conf->cache_state); + } async_tx_issue_pending_all(); blk_finish_plug(&plug); @@ -5814,7 +5828,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page) spin_lock(&mddev->lock); conf = mddev->private; if (conf) - ret = sprintf(page, "%d\n", conf->max_nr_stripes); + ret = sprintf(page, "%d\n", conf->min_nr_stripes); spin_unlock(&mddev->lock); return ret; } @@ -5828,10 +5842,12 @@ raid5_set_cache_size(struct mddev *mddev, int size) if (size <= 16 || size > 32768) return -EINVAL; + conf->min_nr_stripes = size; while (size < conf->max_nr_stripes && drop_one_stripe(conf)) ; + err = md_allow_write(mddev); if (err) return err; @@ -5947,7 +5963,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) conf = mddev->private; if (!conf) err = -ENODEV; - else if (new > conf->max_nr_stripes) + else if (new > conf->min_nr_stripes) err = -EINVAL; else conf->bypass_threshold = new; @@ -6228,6 +6244,8 @@ static void raid5_free_percpu(struct r5conf *conf) static void free_conf(struct r5conf *conf) { + if (conf->shrinker.seeks) + unregister_shrinker(&conf->shrinker); free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); @@ -6295,6 +6313,30 @@ static int raid5_alloc_percpu(struct r5conf *conf) return err; } +static unsigned long raid5_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + int ret = 0; + while (ret < sc->nr_to_scan) { + if (drop_one_stripe(conf) == 0) + return SHRINK_STOP; + ret++; + } + return ret; +} + +static unsigned long raid5_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + + if (conf->max_nr_stripes < conf->min_nr_stripes) + /* unlikely, but not impossible */ + return 0; + return conf->max_nr_stripes - conf->min_nr_stripes; +} + static struct r5conf *setup_conf(struct mddev *mddev) { struct r5conf *conf; @@ -6445,10 +6487,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->prev_algo = mddev->layout; } - memory = NR_STRIPES * (sizeof(struct stripe_head) + + conf->min_nr_stripes = NR_STRIPES; + memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); - if (grow_stripes(conf, NR_STRIPES)) { + if (grow_stripes(conf, conf->min_nr_stripes)) { printk(KERN_ERR "md/raid:%s: couldn't allocate %dkB for buffers\n", mdname(mddev), memory); @@ -6456,6 +6499,17 @@ static struct r5conf *setup_conf(struct mddev *mddev) } else printk(KERN_INFO "md/raid:%s: allocated %dkB\n", mdname(mddev), memory); + /* + * Losing a stripe head costs more than the time to refill it, + * it reduces the queue depth and so can hurt throughput. + * So set it rather large, scaled by number of devices. + */ + conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; + conf->shrinker.scan_objects = raid5_cache_scan; + conf->shrinker.count_objects = raid5_cache_count; + conf->shrinker.batch = 128; + conf->shrinker.flags = 0; + register_shrinker(&conf->shrinker); sprintf(pers_name, "raid%d", mddev->new_level); conf->thread = md_register_thread(raid5d, mddev, pers_name); @@ -7097,9 +7151,9 @@ static int check_stripe_cache(struct mddev *mddev) */ struct r5conf *conf = mddev->private; if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 - > conf->max_nr_stripes || + > conf->min_nr_stripes || ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 - > conf->max_nr_stripes) { + > conf->min_nr_stripes) { printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ebe4e24bc14d..7dc0dd86074b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -433,6 +433,7 @@ struct r5conf { int max_degraded; int raid_disks; int max_nr_stripes; + int min_nr_stripes; /* reshape_progress is the leading edge of a 'reshape' * It has value MaxSector when no reshape is happening @@ -513,7 +514,15 @@ struct r5conf { #define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, * waiting for 25% to be free */ - +#define R5_ALLOC_MORE 2 /* It might help to allocate another + * stripe. + */ +#define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate + * more until at least one has been + * released. This avoids flooding + * the cache. + */ + struct shrinker shrinker; int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; -- cgit v1.2.3