From 5cfb22a1f83e4f04c0a4df89b60053a077222e2b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 11:46:53 +1000 Subject: md/raid5: prefer replacing failed devices over want-replacement devices. If a RAID5 has both a failed device and a device marked as 'WantReplacement', then we should preferentially replace the failed device. However the current code replaces whichever is found first. So split into 2 loops, check fail failed/missing first, and only check for WantReplacement if nothing is failed or missing. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d26767246d26..95fcbbf3d6c9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5465,10 +5465,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) - disk = rdev->saved_raid_disk; - else - disk = first; - for ( ; disk <= last ; disk++) { + first = rdev->saved_raid_disk; + + for (disk = first; disk <= last; disk++) { p = conf->disks + disk; if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); @@ -5477,8 +5476,11 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); - break; + goto out; } + } + for (disk = first; disk <= last; disk++) { + p = conf->disks + disk; if (test_bit(WantReplacement, &p->rdev->flags) && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); @@ -5490,6 +5492,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } +out: print_raid5_conf(conf); return err; } -- cgit v1.2.3 From 6c0544e255dd6582a9899572e120fb55d9f672a4 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 12 Jun 2012 08:31:10 +0800 Subject: md/raid5: Do not add data_offset before call to is_badblock In chunk_aligned_read() we are adding data_offset before calling is_badblock. But is_badblock also adds data_offset, so that is bad. So move the addition of data_offset to after the call to is_badblock. This bug was introduced by commit 31c176ecdf3563140e639 md/raid5: avoid reading from known bad blocks. which first appeared in 3.0. So that patch is suitable for any -stable kernel from 3.0.y onwards. However it will need minor revision for most of those (as the comment didn't appear until recently). Cc: stable@vger.kernel.org Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 95fcbbf3d6c9..9567a9c83a1f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3881,8 +3881,6 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); - /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_sector += rdev->data_offset; if (!bio_fits_rdev(align_bi) || is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, @@ -3893,6 +3891,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) return 0; } + /* No reshape active, so we can trust rdev->data_offset */ + align_bi->bi_sector += rdev->data_offset; + spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, -- cgit v1.2.3 From 1850753d2e6d9ca7856581ca5d3cf09521e6a5d7 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 3 Jul 2012 12:11:54 +1000 Subject: md/raid5: In ops_run_io, inc nr_pending before calling md_wait_for_blocked_rdev In ops_run_io(), the call to md_wait_for_blocked_rdev will decrement nr_pending so we lose the reference we hold on the rdev. So atomic_inc it first to maintain the reference. This bug was introduced by commit 73e92e51b7969ef5477d md/raid5. Don't write to known bad block on doubtful devices. which appeared in 3.0, so patch is suitable for stable kernels since then. Cc: stable@vger.kernel.org Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9567a9c83a1f..befadb41a11f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -606,6 +606,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) * a chance*/ md_check_recovery(conf->mddev); } + /* + * Because md_wait_for_blocked_rdev + * will dec nr_pending, we must + * increment it first. + */ + atomic_inc(&rdev->nr_pending); md_wait_for_blocked_rdev(rdev, conf->mddev); } else { /* Acknowledged bad block - skip the write */ -- cgit v1.2.3 From 5f066c632fcfd2a33f2eb7077c15c630e9f5ea5b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 12:13:29 +1000 Subject: md/raid5: fix refcount problem when blocked_rdev is set. commit 43220aa0f22cd3ce5b30246d50ccd696d119edea md/raid5: fix a hang on device failure. fixed a hang, but introduced a refcounting in-balance so that if the presence of bad-blocks ever caused an rdev to be 'blocked' we would increment the refcount on the rdev and never decrement it. So added the needed rdev_dec_pending when md_wait_for_blocked_rdev is not called. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index befadb41a11f..62b6b3a83abf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3588,8 +3588,18 @@ static void handle_stripe(struct stripe_head *sh) finish: /* wait for this device to become unblocked */ - if (conf->mddev->external && unlikely(s.blocked_rdev)) - md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); + if (unlikely(s.blocked_rdev)) { + if (conf->mddev->external) + md_wait_for_blocked_rdev(s.blocked_rdev, + conf->mddev); + else + /* Internal metadata will immediately + * be written by raid5d, so we don't + * need to wait here. + */ + rdev_dec_pending(s.blocked_rdev, + conf->mddev); + } if (s.handle_bad_blocks) for (i = disks; i--; ) { -- cgit v1.2.3 From 0232605d987d8230b254aa139805bbb56a7ca30c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 15:56:52 +1000 Subject: md: make 'name' arg to md_register_thread non-optional. Having the 'name' arg optional and defaulting to the current personality name is no necessary and leads to errors, as when changing the level of an array we can end up using the name of the old level instead of the new one. So make it non-optional and always explicitly pass the name of the level that the array will be. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- drivers/md/multipath.c | 3 ++- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 4 +++- 5 files changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 973aa8459e98..c601c4be77c7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6751,7 +6751,7 @@ struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev thread->tsk = kthread_run(md_thread, thread, "%s_%s", mdname(thread->mddev), - name ?: mddev->pers->name); + name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 9339e67fcc79..61a1833ebaf3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -474,7 +474,8 @@ static int multipath_run (struct mddev *mddev) } { - mddev->thread = md_register_thread(multipathd, mddev, NULL); + mddev->thread = md_register_thread(multipathd, mddev, + "multipath"); if (!mddev->thread) { printk(KERN_ERR "multipath: couldn't allocate thread" " for %s\n", mdname(mddev)); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a9c7981ddd24..39b2a8aa3b23 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2621,7 +2621,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, NULL); + conf->thread = md_register_thread(raid1d, mddev, "raid1"); if (!conf->thread) { printk(KERN_ERR "md/raid1:%s: couldn't allocate thread\n", diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ae73e29298b2..edc1088a1320 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3427,7 +3427,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); - conf->thread = md_register_thread(raid10d, mddev, NULL); + conf->thread = md_register_thread(raid10d, mddev, "raid10"); if (!conf->thread) goto out; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 62b6b3a83abf..a5135e595866 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4840,6 +4840,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) int raid_disk, memory, max_disks; struct md_rdev *rdev; struct disk_info *disk; + char pers_name[6]; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -4963,7 +4964,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) printk(KERN_INFO "md/raid:%s: allocated %dkB\n", mdname(mddev), memory); - conf->thread = md_register_thread(raid5d, mddev, NULL); + sprintf(pers_name, "raid%d", mddev->new_level); + conf->thread = md_register_thread(raid5d, mddev, pers_name); if (!conf->thread) { printk(KERN_ERR "md/raid:%s: couldn't allocate thread.\n", -- cgit v1.2.3 From 2e8ac30312973dd20e6807365349ecb1c7e0ea45 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 3 Jul 2012 15:57:02 +1000 Subject: md/raid456: When read error cannot be recovered, record bad block We may not be able to fix a bad block if: - the array is degraded - the over-write fails. In these cases we currently eject the device, but we should record a bad block if possible. Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5135e595866..51169ecd7787 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1743,6 +1743,7 @@ static void raid5_end_read_request(struct bio * bi, int error) } else { const char *bdn = bdevname(rdev->bdev, b); int retry = 0; + int set_bad = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); @@ -1754,7 +1755,8 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (conf->mddev->degraded >= conf->max_degraded) + else if (conf->mddev->degraded >= conf->max_degraded) { + set_bad = 1; printk_ratelimited( KERN_WARNING "md/raid:%s: read error not correctable " @@ -1762,8 +1764,9 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) + } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { /* Oh, no!!! */ + set_bad = 1; printk_ratelimited( KERN_WARNING "md/raid:%s: read error NOT corrected!! " @@ -1771,7 +1774,7 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), (unsigned long long)s, bdn); - else if (atomic_read(&rdev->read_errors) + } else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING "md/raid:%s: Too many read errors, failing device %s.\n", @@ -1783,7 +1786,11 @@ static void raid5_end_read_request(struct bio * bi, int error) else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - md_error(conf->mddev, rdev); + if (!(set_bad + && test_bit(In_sync, &rdev->flags) + && rdev_set_badblocks( + rdev, sh->sector, STRIPE_SECTORS, 0))) + md_error(conf->mddev, rdev); } } rdev_dec_pending(rdev, conf->mddev); -- cgit v1.2.3 From fab363b5ff502d1b39ddcfec04271f5858d9f26e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 3 Jul 2012 15:57:19 +1000 Subject: raid5: delayed stripe fix There isn't locking setting STRIPE_DELAYED and STRIPE_PREREAD_ACTIVE bits, but the two bits have relationship. A delayed stripe can be moved to hold list only when preread active stripe count is below IO_THRESHOLD. If a stripe has both the bits set, such stripe will be in delayed list and preread count not 0, which will make such stripe never leave delayed list. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 51169ecd7787..7245a9df35a9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -196,12 +196,14 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state)) + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { + clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); } -- cgit v1.2.3 From b357f04a67c2aeee828b240863cd3f21d6cb3179 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 3 Jul 2012 17:45:31 +1000 Subject: md: fix up plugging (again). The value returned by "mddev_check_plug" is only valid until the next 'schedule' as that will unplug things. This could happen at any call to mempool_alloc. So just calling mddev_check_plug at the start doesn't really make sense. So call it just before, or just after, queuing things for the thread. As the action that happens at unplug is to wake the thread, this makes lots of sense. If we cannot add a plug (which requires a small GFP_ATOMIC alloc) we wake thread immediately. RAID5 is a bit different. Requests are queued for the thread and the thread is woken by release_stripe. So we don't need to wake the thread on failure. However the thread doesn't perform certain actions when there is any active plug, so it is important to install a plug before waking the thread. So for RAID5 we install the plug *before* queuing the request and waking the thread. Without this patch it is possible for raid1 or raid10 to queue a request without then waking the thread, resulting in the array locking up. Also change raid10 to only flush_pending_write when there are not active plugs, just like raid1. This patch is suitable for 3.0 or later. I plan to submit it to -stable, but I'll like to let it spend a few weeks in mainline first to be sure it is completely safe. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 7 ++----- drivers/md/raid10.c | 12 ++++++------ drivers/md/raid5.c | 6 +----- 3 files changed, 9 insertions(+), 16 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34b4665cb0b6..8c2754f835ef 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -883,7 +883,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; - int plugged; int first_clone; int sectors_handled; int max_sectors; @@ -1034,7 +1033,6 @@ read_again: * the bad blocks. Each set of writes gets it's own r1bio * with a set of bios attached. */ - plugged = mddev_check_plugged(mddev); disks = conf->raid_disks * 2; retry_write: @@ -1191,6 +1189,8 @@ read_again: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, * as it could result in the bio being freed. @@ -1213,9 +1213,6 @@ read_again: /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index edc1088a1320..acf5a828c7e1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1039,7 +1039,6 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); unsigned long flags; struct md_rdev *blocked_rdev; - int plugged; int sectors_handled; int max_sectors; int sectors; @@ -1239,7 +1238,6 @@ read_again: * of r10_bios is recored in bio->bi_phys_segments just as with * the read case. */ - plugged = mddev_check_plugged(mddev); r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); @@ -1396,6 +1394,8 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev, 0, 0)) + md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) continue; @@ -1423,6 +1423,8 @@ retry_write: bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } /* Don't remove the bias on 'remaining' (one_write_done) until @@ -1448,9 +1450,6 @@ retry_write: /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync || !mddev->bitmap || !plugged) - md_wakeup_thread(mddev->thread); } static void status(struct seq_file *seq, struct mddev *mddev) @@ -2661,7 +2660,8 @@ static void raid10d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - flush_pending_writes(conf); + if (atomic_read(&mddev->plug_cnt) == 0) + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7245a9df35a9..04348d76bb30 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3997,7 +3997,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) struct stripe_head *sh; const int rw = bio_data_dir(bi); int remaining; - int plugged; if (unlikely(bi->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bi); @@ -4016,7 +4015,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - plugged = mddev_check_plugged(mddev); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int previous; @@ -4118,6 +4116,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); + mddev_check_plugged(mddev); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -4125,10 +4124,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); break; } - } - if (!plugged) - md_wakeup_thread(mddev->thread); spin_lock_irq(&conf->device_lock); remaining = raid5_dec_bi_phys_segments(bi); -- cgit v1.2.3