author    | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-02-14 12:44:16 +1100
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-02-14 12:44:16 +1100
commit    | cae7962ff06b2aa8637b49e97901090ac728064a (patch)
tree      | 4894ba134c887d1b274e63a79909d6baf04e5519 /drivers/md
parent    | a902eda6f6f03a3b74cb7fca7011d14a5836f71f (diff)
parent    | 106007154abfada940c979284570448094126eb2 (diff)
Merge remote-tracking branch 'md/for-next'
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-raid.c | 123
-rw-r--r-- | drivers/md/md.c      |   8
-rw-r--r-- | drivers/md/md.h      |   8
-rw-r--r-- | drivers/md/raid1.c   |   7
-rw-r--r-- | drivers/md/raid10.c  |  96
-rw-r--r-- | drivers/md/raid10.h  |   5
-rw-r--r-- | drivers/md/raid5.c   | 408
-rw-r--r-- | drivers/md/raid5.h   |  19

8 files changed, 607 insertions(+), 67 deletions(-)
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a3e115b62186..9d6bf19ce6e8 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
 	{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
 };
 
+static char *raid10_md_layout_to_format(int layout)
+{
+	/*
+	 * Bit 16 and 17 stand for "offset" and "use_far_sets"
+	 * Refer to MD's raid10.c for details
+	 */
+	if ((layout & 0x10000) && (layout & 0x20000))
+		return "offset";
+
+	if ((layout & 0xFF) > 1)
+		return "near";
+
+	return "far";
+}
+
 static unsigned raid10_md_layout_to_copies(int layout)
 {
-	return layout & 0xFF;
+	if ((layout & 0xFF) > 1)
+		return layout & 0xFF;
+	return (layout >> 8) & 0xFF;
 }
 
 static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
-	/* 1 "far" copy, and 'copies' "near" copies */
-	return (1 << 8) | (copies & 0xFF);
+	unsigned n = 1, f = 1;
+
+	if (!strcmp("near", format))
+		n = copies;
+	else
+		f = copies;
+
+	if (!strcmp("offset", format))
+		return 0x30000 | (f << 8) | n;
+
+	if (!strcmp("far", format))
+		return 0x20000 | (f << 8) | n;
+
+	return (f << 8) | n;
 }
 
 static struct raid_type *get_raid_type(char *name)
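The three helpers above pack the whole RAID10 geometry into one 32-bit layout word: near copies in the low byte, far copies in the second byte, bit 16 for "offset", bit 17 for "use_far_sets". A standalone user-space sketch (not part of the patch) that mirrors raid10_format_to_md_layout():

```c
/* Illustrative only: same bit packing as raid10_format_to_md_layout(). */
#include <stdio.h>
#include <string.h>

static int format_to_layout(const char *format, unsigned copies)
{
	unsigned n = 1, f = 1;

	/* "near" counts go in the low byte, far/offset counts in byte 1 */
	if (!strcmp("near", format))
		n = copies;
	else
		f = copies;
	if (!strcmp("offset", format))
		return 0x30000 | (f << 8) | n;	/* bits 17+16 set */
	if (!strcmp("far", format))
		return 0x20000 | (f << 8) | n;	/* bit 17 only */
	return (f << 8) | n;
}

int main(void)
{
	printf("near/2:   0x%X\n", format_to_layout("near", 2));   /* 0x102 */
	printf("far/2:    0x%X\n", format_to_layout("far", 2));    /* 0x20201 */
	printf("offset/2: 0x%X\n", format_to_layout("offset", 2)); /* 0x30201 */
	return 0;
}
```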
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
 	unsigned rebuilds_per_group, copies, d;
+	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 * as long as the failed devices occur in different mirror
 		 * groups (i.e. different stripes).
 		 *
-		 * Right now, we only allow for "near" copies.  When other
-		 * formats are added, we will have to check those too.
-		 *
 		 * When checking "near" format, make sure no adjacent devices
 		 * have failed beyond what can be handled.  In addition to the
 		 * simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 *          A    A    B    B    C
 		 *          C    D    D    E    E
 		 */
-		for (i = 0; i < rs->md.raid_disks * copies; i++) {
-			if (!(i % copies))
+		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+			for (i = 0; i < rs->md.raid_disks * copies; i++) {
+				if (!(i % copies))
+					rebuilds_per_group = 0;
+				d = i % rs->md.raid_disks;
+				if ((!rs->dev[d].rdev.sb_page ||
+				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+				    (++rebuilds_per_group >= copies))
+					goto too_many;
+			}
+			break;
+		}
+
+		/*
+		 * When checking "far" and "offset" formats, we need to ensure
+		 * that the device that holds its copy is not also dead or
+		 * being rebuilt.  (Note that "far" and "offset" formats only
+		 * support two copies right now.  These formats also only ever
+		 * use the 'use_far_sets' variant.)
+		 *
+		 * This check is somewhat complicated by the need to account
+		 * for arrays that are not a multiple of (far) copies.  This
+		 * results in the need to treat the last (potentially larger)
+		 * set differently.
+		 */
+		group_size = (rs->md.raid_disks / copies);
+		last_group_start = (rs->md.raid_disks / group_size) - 1;
+		last_group_start *= group_size;
+
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (!(i % copies) && !(i > last_group_start))
 				rebuilds_per_group = 0;
-			d = i % rs->md.raid_disks;
-			if ((!rs->dev[d].rdev.sb_page ||
-			     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+			if ((!rs->dev[i].rdev.sb_page ||
+			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 			    (++rebuilds_per_group >= copies))
-				goto too_many;
+					goto too_many;
 		}
 		break;
 	default:
@@ -433,7 +487,7 @@ too_many:
 *
 *  RAID10-only options:
 *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
- *    [raid10_format <near>]            Layout algorithm.  (Default: near)
+ *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
 */
static int parse_raid_params(struct raid_set *rs, char **argv,
			     unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
 				return -EINVAL;
 			}
-			if (strcmp("near", argv[i])) {
+			if (strcmp("near", argv[i]) &&
+			    strcmp("far", argv[i]) &&
+			    strcmp("offset", argv[i])) {
 				rs->ti->error = "Invalid 'raid10_format' value given";
 				return -EINVAL;
 			}
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			return -EINVAL;
 		}
 
+		/*
+		 * If the format is not "near", we only support
+		 * two copies at the moment.
+		 */
+		if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+			rs->ti->error = "Too many copies for given RAID10 format.";
+			return -EINVAL;
+		}
+
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
 		sector_div(sectors_per_dev, rs->md.raid_disks);
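The "last, potentially larger set" arithmetic in validate_raid_redundancy() is easiest to see with concrete numbers. A worked example (illustrative only) for an array whose disk count is not a multiple of the copy count:

```c
/* 5 disks, 2 far copies: sets come out as {0,1} and {2,3,4}. */
#include <stdio.h>

int main(void)
{
	unsigned raid_disks = 5, copies = 2;
	unsigned group_size = raid_disks / copies;			/* 2 */
	unsigned last_group_start = (raid_disks / group_size) - 1;	/* 1 */

	last_group_start *= group_size;					/* 2 */
	for (unsigned i = 0; i < raid_disks; i++)
		printf("dev %u -> %s set\n", i,
		       i >= last_group_start ? "last (larger)" : "regular");
	return 0;
}
```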
(device count change)"); return -EINVAL; } @@ -1329,7 +1407,8 @@ static int raid_status(struct dm_target *ti, status_type_t type, raid10_md_layout_to_copies(rs->md.layout)); if (rs->print_flags & DMPF_RAID10_FORMAT) - DMEMIT(" raid10_format near"); + DMEMIT(" raid10_format %s", + raid10_md_layout_to_format(rs->md.layout)); DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { @@ -1420,6 +1499,10 @@ static struct target_type raid_target = { static int __init dm_raid_init(void) { + DMINFO("Loading target version %u.%u.%u", + raid_target.version[0], + raid_target.version[1], + raid_target.version[2]); return dm_register_target(&raid_target); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 1e634a68541e..8b557d2b6f99 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -607,9 +607,9 @@ static struct mddev * mddev_find(dev_t unit) goto retry; } -static inline int mddev_lock(struct mddev * mddev) +int md_queue_misc_work(struct work_struct *work) { - return mutex_lock_interruptible(&mddev->reconfig_mutex); + return queue_work(md_misc_wq, work); } static inline int mddev_is_locked(struct mddev *mddev) @@ -624,7 +624,7 @@ static inline int mddev_trylock(struct mddev * mddev) static struct attribute_group md_redundancy_group; -static void mddev_unlock(struct mddev * mddev) +void mddev_unlock(struct mddev * mddev) { if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as @@ -8631,6 +8631,8 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); +EXPORT_SYMBOL(mddev_unlock); +EXPORT_SYMBOL(md_queue_misc_work); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); diff --git a/drivers/md/md.h b/drivers/md/md.h index eca59c3074ef..2afa1c34d35b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -612,4 +612,12 @@ static inline int mddev_check_plugged(struct mddev *mddev) return !!blk_check_plugged(md_unplug, mddev, sizeof(struct blk_plug_cb)); } + +static inline int mddev_lock(struct mddev * mddev) +{ + return mutex_lock_interruptible(&mddev->reconfig_mutex); +} +extern void mddev_unlock(struct mddev *mddev); +extern int md_queue_misc_work(struct work_struct *work); + #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d5bddfc4010e..6e5d5a5f9cb4 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1000,6 +1000,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1301,7 +1302,8 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; + mbio->bi_rw = + WRITE | do_flush_fua | do_sync | do_discard | do_same; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); @@ -2818,6 +2820,9 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + if (mddev->queue) + blk_queue_max_write_same_sectors(mddev->queue, + mddev->chunk_sectors); rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d48249c03b..61ed150bd0cf 100644 --- a/drivers/md/raid10.c +++ 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..61ed150bd0cf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout )
+ *    use_far_sets (stored in bit 17 of layout )
 *
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize.  Each device
+ * is divided into far_copies sections.  In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive).  The starting device for each section is offset
+ * near_copies from the starting device of the previous section.  Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive.  near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true.  In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size.  The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array.  This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
 */
 
/*
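The set-confined rotation described in the comment can be modeled outside the kernel. A sketch (illustrative only) for a 5-disk array with far_copies = 2, near_copies = 1, so the sets are {0,1} and the larger last set {2,3,4}; the arithmetic mirrors what the __raid10_find_phys() hunk below adds:

```c
#include <stdio.h>

int main(void)
{
	int raid_disks = 5, far_set_size = 2, near_copies = 1;
	int last_start = (raid_disks / far_set_size - 1) * far_set_size; /* 2 */
	int last_size = far_set_size + raid_disks % far_set_size;        /* 3 */

	for (int d = 0; d < raid_disks; d++) {
		int set = d / far_set_size;
		int c = d + near_copies;	/* shift by near_copies ... */

		if ((raid_disks % far_set_size) && c > last_start) {
			c -= last_start;	/* ... wrapping inside the */
			c %= last_size;		/* last, larger set */
			c += last_start;
		} else {
			c %= far_set_size;	/* ... or inside the regular set */
			c += far_set_size * set;
		}
		printf("far copy of chunk on dev %d lives on dev %d\n", d, c);
	}
	return 0;	/* prints 0->1, 1->0, 2->3, 3->4, 4->2 */
}
```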
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	sector_t stripe;
 	int dev;
 	int slot = 0;
+	int last_far_set_start, last_far_set_size;
+
+	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+	last_far_set_start *= geo->far_set_size;
+
+	last_far_set_size = geo->far_set_size;
+	last_far_set_size += (geo->raid_disks % geo->far_set_size);
 
 	/* now calculate first sector/dev */
 	chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	/* and calculate all the others */
 	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
+		int set;
 		sector_t s = sector;
-		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
+		r10bio->devs[slot].addr = s;
 		slot++;
 
 		for (f = 1; f < geo->far_copies; f++) {
+			set = d / geo->far_set_size;
 			d += geo->near_copies;
-			if (d >= geo->raid_disks)
-				d -= geo->raid_disks;
+
+			if ((geo->raid_disks % geo->far_set_size) &&
+			    (d > last_far_set_start)) {
+				d -= last_far_set_start;
+				d %= last_far_set_size;
+				d += last_far_set_start;
+			} else {
+				d %= geo->far_set_size;
+				d += geo->far_set_size * set;
+			}
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	 * or recovery, so reshape isn't happening
 	 */
 	struct geom *geo = &conf->geo;
+	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+	int far_set_size = geo->far_set_size;
+	int last_far_set_start;
+
+	if (geo->raid_disks % geo->far_set_size) {
+		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+		last_far_set_start *= geo->far_set_size;
+
+		if (dev >= last_far_set_start) {
+			far_set_size = geo->far_set_size;
+			far_set_size += (geo->raid_disks % geo->far_set_size);
+			far_set_start = last_far_set_start;
+		}
+	}
 
 	offset = sector & geo->chunk_mask;
 	if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 		chunk = sector >> geo->chunk_shift;
 		fc = sector_div(chunk, geo->far_copies);
 		dev -= fc * geo->near_copies;
-		if (dev < 0)
-			dev += geo->raid_disks;
+		if (dev < far_set_start)
+			dev += far_set_size;
 	} else {
 		while (sector >= geo->stride) {
 			sector -= geo->stride;
-			if (dev < geo->near_copies)
-				dev += geo->raid_disks - geo->near_copies;
+			if (dev < (geo->near_copies + far_set_start))
+				dev += far_set_size - geo->near_copies;
 			else
 				dev -= geo->near_copies;
 		}
@@ -1105,6 +1151,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -1460,7 +1507,8 @@ retry_write:
 							      rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1550,8 @@ retry_write:
 						   r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
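The layout word is decoded by setup_geo() in the next hunk; bit 17 selects the per-set geometry and anything above bit 17 is now rejected. A standalone decode sketch (illustrative values):

```c
#include <stdio.h>

int main(void)
{
	int layout = 0x20201, disks = 10;	/* "far", 2 copies, far sets */
	int nc = layout & 0xff;			/* near copies: 1 */
	int fc = (layout >> 8) & 0xff;		/* far copies:  2 */
	int fo = layout & (1 << 16);		/* far_offset:  0 */
	int far_set_size = (layout & (1 << 17)) ? disks / fc : disks;

	if (layout >> 18)			/* bits above 17 are invalid */
		return 1;
	printf("nc=%d fc=%d fo=%d far_set_size=%d\n", nc, fc, fo, far_set_size);
	return 0;				/* nc=1 fc=2 fo=0 far_set_size=5 */
}
```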
@@ -3436,7 +3485,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 		disks = mddev->raid_disks + mddev->delta_disks;
 		break;
 	}
-	if (layout >> 17)
+	if (layout >> 18)
 		return -1;
 	if (chunk < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(chunk))
@@ -3448,6 +3497,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 	geo->near_copies = nc;
 	geo->far_copies = fc;
 	geo->far_offset = fo;
+	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
 	geo->chunk_mask = chunk - 1;
 	geo->chunk_shift = ffz(~chunk);
 	return nc*fc;
@@ -3569,6 +3619,8 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf602345..157d69e83ff4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
 					       * far_offset, in which case it is
 					       * 1 stripe.
 					       */
+		int	far_set_size;	      /* The number of devices in a set,
+					       * where a 'set' are devices that
+					       * contain far/offset copies of
+					       * each other.
+					       */
 		int	chunk_shift;	      /* shift from chunks to sectors */
 		sector_t chunk_mask;
 	} prev, geo;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9ab506df42da..77c4a585fb82 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -198,6 +198,21 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct raid5_percpu *percpu;
+	int i, orphaned = 1;
+
+	percpu = per_cpu_ptr(conf->percpu, sh->cpu);
+	for_each_cpu(i, &percpu->handle_threads) {
+		md_wakeup_thread(conf->aux_threads[i]->thread);
+		orphaned = 0;
+	}
+	if (orphaned)
+		md_wakeup_thread(conf->mddev->thread);
+}
+
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
 	BUG_ON(!list_empty(&sh->lru));
@@ -210,9 +225,19 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 			   sh->bm_seq - conf->seq_write > 0)
 			list_add_tail(&sh->lru, &conf->bitmap_list);
 		else {
+			int cpu = sh->cpu;
+			struct raid5_percpu *percpu;
+			if (!cpu_online(cpu)) {
+				cpu = cpumask_any(cpu_online_mask);
+				sh->cpu = cpu;
+			}
+			percpu = per_cpu_ptr(conf->percpu, cpu);
+
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
-			list_add_tail(&sh->lru, &conf->handle_list);
+			list_add_tail(&sh->lru, &percpu->handle_list);
+			raid5_wakeup_stripe_thread(sh);
+			return;
 		}
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -357,6 +382,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 		raid5_build_block(sh, i, previous);
 	}
 	insert_hash(conf, sh);
+	sh->cpu = smp_processor_id();
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
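The dispatch policy added above: each stripe is queued on the handle list of its home CPU, and any auxiliary thread registered for that CPU is woken; if no aux thread covers it, the stripe is "orphaned" and the main raid5d thread is woken instead. A toy model (not kernel code):

```c
#include <stdio.h>

#define NCPUS 4

int main(void)
{
	/* handle_threads[cpu]: bitmask of aux threads servicing that cpu */
	unsigned handle_threads[NCPUS] = { 0x1, 0x1, 0x2, 0x0 };

	for (int cpu = 0; cpu < NCPUS; cpu++) {
		if (!handle_threads[cpu]) {
			printf("cpu %d: orphaned -> wake main raid5d\n", cpu);
			continue;
		}
		for (int t = 0; t < 8; t++)
			if (handle_threads[cpu] & (1u << t))
				printf("cpu %d: wake aux thread %d\n", cpu, t);
	}
	return 0;
}
```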
@@ -3759,12 +3785,19 @@ static void raid5_activate_delayed(struct r5conf *conf)
 		while (!list_empty(&conf->delayed_list)) {
 			struct list_head *l = conf->delayed_list.next;
 			struct stripe_head *sh;
+			int cpu;
 			sh = list_entry(l, struct stripe_head, lru);
 			list_del_init(l);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
+			cpu = sh->cpu;
+			if (!cpu_online(cpu)) {
+				cpu = cpumask_any(cpu_online_mask);
+				sh->cpu = cpu;
+			}
+			raid5_wakeup_stripe_thread(sh);
 		}
 	}
 }
@@ -4041,18 +4074,29 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 *    head of the hold_list has changed, i.e. the head was promoted to the
 *    handle_list.
 */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
-{
-	struct stripe_head *sh;
-
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf,
+						 cpumask_t *mask)
+{
+	struct stripe_head *sh = NULL, *tmp;
+	struct list_head *handle_list = NULL;
+	int cpu;
+
+	/* Should we take action to avoid starvation of latter CPUs ? */
+	for_each_cpu(cpu, mask) {
+		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+		if (!list_empty(&percpu->handle_list)) {
+			handle_list = &percpu->handle_list;
+			break;
+		}
+	}
 	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
 		  __func__,
-		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  !handle_list ? "empty" : "busy",
 		  list_empty(&conf->hold_list) ? "empty" : "busy",
 		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
 
-	if (!list_empty(&conf->handle_list)) {
-		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+	if (handle_list) {
+		sh = list_entry(handle_list->next, typeof(*sh), lru);
 
 		if (list_empty(&conf->hold_list))
 			conf->bypass_count = 0;
@@ -4070,12 +4114,23 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
 		   ((conf->bypass_threshold &&
 		     conf->bypass_count > conf->bypass_threshold) ||
 		    atomic_read(&conf->pending_full_writes) == 0)) {
-		sh = list_entry(conf->hold_list.next,
-				typeof(*sh), lru);
-		conf->bypass_count -= conf->bypass_threshold;
-		if (conf->bypass_count < 0)
-			conf->bypass_count = 0;
-	} else
+
+		list_for_each_entry(tmp, &conf->hold_list, lru) {
+			if (cpumask_test_cpu(tmp->cpu, mask) ||
+			    !cpu_online(tmp->cpu)) {
+				sh = tmp;
+				break;
+			}
+		}
+
+		if (sh) {
+			conf->bypass_count -= conf->bypass_threshold;
+			if (conf->bypass_count < 0)
+				conf->bypass_count = 0;
+		}
+	}
+
+	if (!sh)
 		return NULL;
 
 	list_del_init(&sh->lru);
@@ -4757,13 +4812,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 }
 
 #define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, cpumask_t *mask)
 {
 	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
 	int i, batch_size = 0;
 
 	while (batch_size < MAX_STRIPE_BATCH &&
-			(sh = __get_priority_stripe(conf)) != NULL)
+			(sh = __get_priority_stripe(conf, mask)) != NULL)
 		batch[batch_size++] = sh;
 
 	if (batch_size == 0)
@@ -4781,6 +4836,35 @@ static int handle_active_stripes(struct r5conf *conf)
 	return batch_size;
 }
 
+static void raid5auxd(struct md_thread *thread)
+{
+	struct mddev *mddev = thread->mddev;
+	struct r5conf *conf = mddev->private;
+	struct blk_plug plug;
+	int handled;
+	struct raid5_auxth *auxth = thread->private;
+
+	pr_debug("+++ raid5auxd active\n");
+
+	blk_start_plug(&plug);
+	handled = 0;
+	spin_lock_irq(&conf->device_lock);
+	while (1) {
+		int batch_size;
+
+		batch_size = handle_active_stripes(conf, &auxth->work_mask);
+		if (!batch_size)
+			break;
+		handled += batch_size;
+	}
+	pr_debug("%d stripes handled\n", handled);
+
+	spin_unlock_irq(&conf->device_lock);
+	blk_finish_plug(&plug);
+
+	pr_debug("--- raid5auxd inactive\n");
+}
+
 /*
 * This is our raid5 kernel thread.
 *
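Note the filter __get_priority_stripe() now applies to the shared hold_list: a thread may only take a stripe whose home CPU is in its work mask, or whose home CPU has gone offline (so nothing is stranded). A small illustration (made-up values):

```c
#include <stdio.h>

struct stripe { int cpu; };

int main(void)
{
	struct stripe hold[] = { {2}, {5}, {0} };
	unsigned work_mask = (1u << 0) | (1u << 1);	/* thread covers cpus 0-1 */
	unsigned online = 0x0F;				/* cpus 0-3 online */

	for (unsigned i = 0; i < sizeof(hold) / sizeof(hold[0]); i++) {
		int cpu = hold[i].cpu;
		int ok = (work_mask & (1u << cpu)) || !(online & (1u << cpu));

		printf("stripe on cpu %d: %s\n", cpu, ok ? "take" : "skip");
		if (ok)
			break;	/* the kernel loop stops at the first match */
	}
	return 0;	/* cpu 2: skip; cpu 5 (offline): take */
}
```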
@@ -4828,7 +4912,7 @@ static void raid5d(struct md_thread *thread)
 			handled++;
 		}
 
-		batch_size = handle_active_stripes(conf);
+		batch_size = handle_active_stripes(conf, &conf->work_mask);
 		if (!batch_size)
 			break;
 		handled += batch_size;
@@ -4957,10 +5041,270 @@ stripe_cache_active_show(struct mddev *mddev, char *page)
 static struct md_sysfs_entry
 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 
+static void raid5_update_threads_handle_mask(struct mddev *mddev)
+{
+	int cpu, i;
+	struct raid5_percpu *percpu;
+	struct r5conf *conf = mddev->private;
+
+	for_each_online_cpu(cpu) {
+		percpu = per_cpu_ptr(conf->percpu, cpu);
+		cpumask_clear(&percpu->handle_threads);
+	}
+	cpumask_copy(&conf->work_mask, cpu_online_mask);
+
+	for (i = 0; i < conf->aux_thread_num; i++) {
+		cpumask_t *work_mask = &conf->aux_threads[i]->work_mask;
+
+		for_each_cpu(cpu, work_mask) {
+			percpu = per_cpu_ptr(conf->percpu, cpu);
+			cpumask_set_cpu(i, &percpu->handle_threads);
+		}
+		cpumask_andnot(&conf->work_mask, &conf->work_mask,
+				work_mask);
+	}
+}
+
+struct raid5_auxth_sysfs {
+	struct attribute attr;
+	ssize_t (*show)(struct mddev *, struct raid5_auxth *, char *);
+	ssize_t (*store)(struct mddev *, struct raid5_auxth *,
+		const char *, size_t);
+};
+
+static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
+	struct raid5_auxth *thread, char *page)
+{
+	if (!mddev->private)
+		return 0;
+	return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
+}
+
+static ssize_t
+raid5_store_thread_cpulist(struct mddev *mddev, struct raid5_auxth *thread,
+	const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	cpumask_var_t mask;
+
+	if (!conf)
+		return -ENODEV;
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (cpulist_parse(page, mask)) {
+		free_cpumask_var(mask);
+		return -EINVAL;
+	}
+
+	get_online_cpus();
+	spin_lock_irq(&conf->device_lock);
+	cpumask_copy(&thread->work_mask, mask);
+	raid5_update_threads_handle_mask(mddev);
+	spin_unlock_irq(&conf->device_lock);
+	put_online_cpus();
+	set_cpus_allowed_ptr(thread->thread->tsk, mask);
+
+	free_cpumask_var(mask);
+	return len;
+}
+
+static struct raid5_auxth_sysfs thread_cpulist =
+__ATTR(cpulist, S_IRUGO|S_IWUSR, raid5_show_thread_cpulist,
+	raid5_store_thread_cpulist);
+
+static struct attribute *auxth_attrs[] = {
+	&thread_cpulist.attr,
+	NULL,
+};
+
+static ssize_t
+raid5_auxth_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct raid5_auxth_sysfs *entry = container_of(attr,
+		struct raid5_auxth_sysfs, attr);
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	struct mddev *mddev = thread->thread->mddev;
+	ssize_t ret;
+
+	if (!entry->show)
+		return -EIO;
+	mddev_lock(mddev);
+	ret = entry->show(mddev, thread, page);
+	mddev_unlock(mddev);
+	return ret;
+}
+
+static ssize_t
+raid5_auxth_attr_store(struct kobject *kobj, struct attribute *attr,
+	const char *page, size_t length)
+{
+	struct raid5_auxth_sysfs *entry = container_of(attr,
+		struct raid5_auxth_sysfs, attr);
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	struct mddev *mddev = thread->thread->mddev;
+	ssize_t ret;
+
+	if (!entry->store)
+		return -EIO;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	mddev_lock(mddev);
+	ret = entry->store(mddev, thread, page, length);
+	mddev_unlock(mddev);
+	return ret;
+}
+
+static void raid5_auxth_release(struct kobject *kobj)
+{
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	kfree(thread);
+}
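The show/store dispatch above recovers the enclosing objects from the raw kobject/attribute pointers sysfs hands back. A user-space rendering of that container_of() pattern (illustrative struct names, not the patch's):

```c
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct attribute { const char *name; };
struct auxth_attr { struct attribute attr; int mode; };

int main(void)
{
	struct auxth_attr a = { { "cpulist" }, 0644 };
	struct attribute *raw = &a.attr;	/* what the callback receives */
	struct auxth_attr *entry = container_of(raw, struct auxth_attr, attr);

	printf("%s 0%o\n", entry->attr.name, entry->mode);
	return 0;
}
```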
+
+static const struct sysfs_ops raid5_auxth_sysfsops = {
+	.show = raid5_auxth_attr_show,
+	.store = raid5_auxth_attr_store,
+};
+static struct kobj_type raid5_auxth_ktype = {
+	.release = raid5_auxth_release,
+	.sysfs_ops = &raid5_auxth_sysfsops,
+	.default_attrs = auxth_attrs,
+};
+
+static ssize_t
+raid5_show_auxthread_number(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->aux_thread_num);
+	else
+		return 0;
+}
+
+static void raid5_auxth_delete(struct work_struct *ws)
+{
+	struct raid5_auxth *thread = container_of(ws, struct raid5_auxth,
+		del_work);
+
+	kobject_del(&thread->kobj);
+	kobject_put(&thread->kobj);
+}
+
+static void __free_aux_thread(struct mddev *mddev, struct raid5_auxth *thread)
+{
+	md_unregister_thread(&thread->thread);
+	INIT_WORK(&thread->del_work, raid5_auxth_delete);
+	kobject_get(&thread->kobj);
+	md_queue_misc_work(&thread->del_work);
+}
+
+static struct raid5_auxth *__create_aux_thread(struct mddev *mddev, int i)
+{
+	struct raid5_auxth *thread;
+	char name[10];
+
+	thread = kzalloc(sizeof(*thread), GFP_KERNEL);
+	if (!thread)
+		return NULL;
+	snprintf(name, 10, "aux%d", i);
+	thread->thread = md_register_thread(raid5auxd, mddev, name);
+	if (!thread->thread) {
+		kfree(thread);
+		return NULL;
+	}
+	thread->thread->private = thread;
+
+	cpumask_copy(&thread->work_mask, cpu_online_mask);
+
+	if (kobject_init_and_add(&thread->kobj, &raid5_auxth_ktype,
+	    &mddev->kobj, "auxth%d", i)) {
+		md_unregister_thread(&thread->thread);
+		kfree(thread);
+		return NULL;
+	}
+	return thread;
+}
+
+static ssize_t
+raid5_store_auxthread_number(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+	int i;
+	struct raid5_auxth **threads;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+
+	if (new == conf->aux_thread_num)
+		return len;
+
+	/* There is no point creating more threads than cpu number */
+	if (new > num_online_cpus())
+		return -EINVAL;
+
+	if (new > conf->aux_thread_num) {
+		threads = kzalloc(sizeof(struct raid5_auxth *) * new,
+				GFP_KERNEL);
+		if (!threads)
+			return -ENOMEM;
+
+		i = conf->aux_thread_num;
+		while (i < new) {
+			threads[i] = __create_aux_thread(mddev, i);
+			if (!threads[i])
+				goto error;
+
+			i++;
+		}
+		memcpy(threads, conf->aux_threads,
+			sizeof(struct raid5_auxth *) * conf->aux_thread_num);
+		get_online_cpus();
+		spin_lock_irq(&conf->device_lock);
+		kfree(conf->aux_threads);
+		conf->aux_threads = threads;
+		conf->aux_thread_num = new;
+		raid5_update_threads_handle_mask(mddev);
+		spin_unlock_irq(&conf->device_lock);
+		put_online_cpus();
+	} else {
+		int old = conf->aux_thread_num;
+
+		get_online_cpus();
+		spin_lock_irq(&conf->device_lock);
+		conf->aux_thread_num = new;
+		raid5_update_threads_handle_mask(mddev);
+		spin_unlock_irq(&conf->device_lock);
+		put_online_cpus();
+		for (i = new; i < old; i++)
+			__free_aux_thread(mddev, conf->aux_threads[i]);
+	}
+
+	return len;
+error:
+	while (--i >= conf->aux_thread_num)
+		__free_aux_thread(mddev, threads[i]);
+	kfree(threads);
+	return -ENOMEM;
+}
+
+static struct md_sysfs_entry
+raid5_auxthread_number = __ATTR(auxthread_number, S_IRUGO|S_IWUSR,
+				raid5_show_auxthread_number,
+				raid5_store_auxthread_number);
+
 static struct attribute *raid5_attrs[] = {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
 	&raid5_preread_bypass_threshold.attr,
+	&raid5_auxthread_number.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
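The grow path in raid5_store_auxthread_number() fully builds the larger array (new threads first, old pointers copied in afterwards) and only then publishes it under device_lock, so readers never see a half-filled table. A minimal sketch of that copy-then-swap pattern (assumed simplification, no locking shown):

```c
#include <stdlib.h>
#include <string.h>

struct auxth { int id; };

static struct auxth **grow(struct auxth **old, int old_n, int new_n)
{
	struct auxth **t = calloc(new_n, sizeof(*t));

	if (!t)
		return NULL;
	for (int i = old_n; i < new_n; i++) {	/* create the new entries */
		t[i] = calloc(1, sizeof(**t));
		if (!t[i]) {			/* unwind on failure */
			while (--i >= old_n)
				free(t[i]);
			free(t);
			return NULL;
		}
		t[i]->id = i;
	}
	memcpy(t, old, sizeof(*t) * old_n);	/* carry over old entries */
	return t;				/* caller swaps, frees old */
}

int main(void)
{
	struct auxth first = { 0 };
	struct auxth *old[] = { &first };
	struct auxth **bigger = grow(old, 1, 3);

	return bigger ? 0 : 1;
}
```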
@@ -5008,6 +5352,7 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	kfree(conf->aux_threads);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
 	kfree(conf->disks);
@@ -5020,7 +5365,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 			      void *hcpu)
 {
 	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
-	long cpu = (long)hcpu;
+	long cpu = (long)hcpu, anycpu;
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
 	switch (action) {
@@ -5039,9 +5384,17 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 			       __func__, cpu);
 			return notifier_from_errno(-ENOMEM);
 		}
+		INIT_LIST_HEAD(&(percpu->handle_list));
+		cpumask_clear(&percpu->handle_threads);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+		spin_lock_irq(&conf->device_lock);
+		anycpu = cpumask_any(cpu_online_mask);
+		list_splice_tail_init(&percpu->handle_list,
+			&per_cpu_ptr(conf->percpu, anycpu)->handle_list);
+		spin_unlock_irq(&conf->device_lock);
+
 		safe_put_page(percpu->spare_page);
 		kfree(percpu->scribble);
 		percpu->spare_page = NULL;
@@ -5050,6 +5403,10 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 	default:
 		break;
 	}
+
+	spin_lock_irq(&conf->device_lock);
+	raid5_update_threads_handle_mask(conf->mddev);
+	spin_unlock_irq(&conf->device_lock);
 	return NOTIFY_OK;
 }
 #endif
@@ -5070,20 +5427,24 @@ static int raid5_alloc_percpu(struct r5conf *conf)
 	get_online_cpus();
 	err = 0;
 	for_each_present_cpu(cpu) {
+		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
 		if (conf->level == 6) {
 			spare_page = alloc_page(GFP_KERNEL);
 			if (!spare_page) {
 				err = -ENOMEM;
 				break;
 			}
-			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+			percpu->spare_page = spare_page;
 		}
 		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
 		if (!scribble) {
 			err = -ENOMEM;
 			break;
 		}
-		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
+		percpu->scribble = scribble;
+		INIT_LIST_HEAD(&percpu->handle_list);
+		cpumask_clear(&percpu->handle_threads);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
 	conf->cpu_notify.notifier_call = raid456_cpu_notify;
@@ -5139,7 +5500,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
-	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
@@ -5150,6 +5510,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->bypass_threshold = BYPASS_THRESHOLD;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	cpumask_copy(&conf->work_mask, cpu_online_mask);
+
 	conf->raid_disks = mddev->raid_disks;
 	if (mddev->reshape_position == MaxSector)
 		conf->previous_raid_disks = mddev->raid_disks;
@@ -5606,6 +5968,10 @@ abort:
 static int stop(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
+	int i;
+
+	for (i = 0; i < conf->aux_thread_num; i++)
+		__free_aux_thread(mddev, conf->aux_threads[i]);
 
 	md_unregister_thread(&mddev->thread);
 	if (mddev->queue)
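The CPU_DEAD handling in raid456_cpu_notify() above splices a dead CPU's pending stripes onto any CPU that is still online. A toy model of that splice (illustrative counts only):

```c
#include <stdio.h>

#define NCPUS 4

int main(void)
{
	int pending[NCPUS] = { 3, 0, 7, 2 };
	int online[NCPUS] = { 1, 1, 0, 1 };	/* cpu 2 just went away */

	for (int cpu = 0; cpu < NCPUS; cpu++) {
		if (online[cpu] || !pending[cpu])
			continue;
		for (int to = 0; to < NCPUS; to++) {
			if (!online[to])
				continue;
			/* stands in for cpumask_any(cpu_online_mask) */
			printf("splice %d stripes: cpu %d -> cpu %d\n",
			       pending[cpu], cpu, to);
			pending[to] += pending[cpu];
			pending[cpu] = 0;
			break;
		}
	}
	return 0;
}
```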
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1fd..2afd8358556b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -211,6 +211,7 @@ struct stripe_head {
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
+	int			cpu;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -365,6 +366,14 @@ struct disk_info {
 	struct md_rdev	*rdev, *replacement;
 };
 
+struct raid5_auxth {
+	struct md_thread	*thread;
+	/* which CPUs should the auxiliary thread handle stripes from */
+	cpumask_t		work_mask;
+	struct kobject		kobj;
+	struct work_struct	del_work;
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	struct mddev		*mddev;
@@ -433,6 +442,12 @@ struct r5conf {
 					      * lists and performing address
 					      * conversions
 					      */
+		struct list_head handle_list;
+		cpumask_t	handle_threads; /* Which threads can the CPU's
+						 * stripes be handled. It really
+						 * is a bitmap to aux_threads[],
+						 * but has max bits NR_CPUS
+						 */
 	} __percpu *percpu;
 	size_t			scribble_len; /* size of scribble region must be
 					      * associated with conf to handle
@@ -460,6 +475,10 @@ struct r5conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread	*thread;
+	int			aux_thread_num;
+	struct raid5_auxth	**aux_threads;
+	/* which CPUs should raid5d thread handle stripes from */
+	cpumask_t		work_mask;
 };
 
 /*
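These struct additions tie the whole scheme together: each CPU carries its own handle_list plus a bitmap of aux threads that service it, and conf->work_mask is whatever the aux threads leave for the main raid5d thread. A sketch of the invariant maintained by raid5_update_threads_handle_mask() earlier in raid5.c (illustrative masks):

```c
#include <stdio.h>

int main(void)
{
	unsigned online = 0x0F;			/* cpus 0-3 online */
	unsigned aux_masks[] = { 0x3, 0x4 };	/* aux0: cpus 0-1, aux1: cpu 2 */
	unsigned work_mask = online;

	for (unsigned i = 0; i < 2; i++)
		work_mask &= ~aux_masks[i];	/* cpumask_andnot() */
	printf("raid5d handles cpu mask 0x%X\n", work_mask);	/* 0x8 */
	return 0;
}
```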