author     Stephen Rothwell <sfr@canb.auug.org.au>  2013-02-14 12:44:16 +1100
committer  Stephen Rothwell <sfr@canb.auug.org.au>  2013-02-14 12:44:16 +1100
commit     cae7962ff06b2aa8637b49e97901090ac728064a (patch)
tree       4894ba134c887d1b274e63a79909d6baf04e5519 /drivers/md
parent     a902eda6f6f03a3b74cb7fca7011d14a5836f71f (diff)
parent     106007154abfada940c979284570448094126eb2 (diff)
Merge remote-tracking branch 'md/for-next'
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-raid.c  | 123
-rw-r--r--  drivers/md/md.c       |   8
-rw-r--r--  drivers/md/md.h       |   8
-rw-r--r--  drivers/md/raid1.c    |   7
-rw-r--r--  drivers/md/raid10.c   |  96
-rw-r--r--  drivers/md/raid10.h   |   5
-rw-r--r--  drivers/md/raid5.c    | 408
-rw-r--r--  drivers/md/raid5.h    |  19
8 files changed, 607 insertions(+), 67 deletions(-)
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a3e115b62186..9d6bf19ce6e8 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};
+static char *raid10_md_layout_to_format(int layout)
+{
+ /*
+ * Bit 16 and 17 stand for "offset" and "use_far_sets"
+ * Refer to MD's raid10.c for details
+ */
+ if ((layout & 0x10000) && (layout & 0x20000))
+ return "offset";
+
+ if ((layout & 0xFF) > 1)
+ return "near";
+
+ return "far";
+}
+
static unsigned raid10_md_layout_to_copies(int layout)
{
- return layout & 0xFF;
+ if ((layout & 0xFF) > 1)
+ return layout & 0xFF;
+ return (layout >> 8) & 0xFF;
}
static int raid10_format_to_md_layout(char *format, unsigned copies)
{
- /* 1 "far" copy, and 'copies' "near" copies */
- return (1 << 8) | (copies & 0xFF);
+ unsigned n = 1, f = 1;
+
+ if (!strcmp("near", format))
+ n = copies;
+ else
+ f = copies;
+
+ if (!strcmp("offset", format))
+ return 0x30000 | (f << 8) | n;
+
+ if (!strcmp("far", format))
+ return 0x20000 | (f << 8) | n;
+
+ return (f << 8) | n;
}
static struct raid_type *get_raid_type(char *name)
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
{
unsigned i, rebuild_cnt = 0;
unsigned rebuilds_per_group, copies, d;
+ unsigned group_size, last_group_start;
for (i = 0; i < rs->md.raid_disks; i++)
if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
* as long as the failed devices occur in different mirror
* groups (i.e. different stripes).
*
- * Right now, we only allow for "near" copies. When other
- * formats are added, we will have to check those too.
- *
* When checking "near" format, make sure no adjacent devices
* have failed beyond what can be handled. In addition to the
* simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
* A A B B C
* C D D E E
*/
- for (i = 0; i < rs->md.raid_disks * copies; i++) {
- if (!(i % copies))
+ if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+ for (i = 0; i < rs->md.raid_disks * copies; i++) {
+ if (!(i % copies))
+ rebuilds_per_group = 0;
+ d = i % rs->md.raid_disks;
+ if ((!rs->dev[d].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+ (++rebuilds_per_group >= copies))
+ goto too_many;
+ }
+ break;
+ }
+
+ /*
+ * When checking "far" and "offset" formats, we need to ensure
+ * that the device that holds its copy is not also dead or
+ * being rebuilt. (Note that "far" and "offset" formats only
+ * support two copies right now. These formats also only ever
+ * use the 'use_far_sets' variant.)
+ *
+ * This check is somewhat complicated by the need to account
+ * for arrays that are not a multiple of (far) copies. This
+ * results in the need to treat the last (potentially larger)
+ * set differently.
+ */
+ group_size = (rs->md.raid_disks / copies);
+ last_group_start = (rs->md.raid_disks / group_size) - 1;
+ last_group_start *= group_size;
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (!(i % copies) && !(i > last_group_start))
rebuilds_per_group = 0;
- d = i % rs->md.raid_disks;
- if ((!rs->dev[d].rdev.sb_page ||
- !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+ if ((!rs->dev[i].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
(++rebuilds_per_group >= copies))
- goto too_many;
+ goto too_many;
}
break;
default:
@@ -433,7 +487,7 @@ too_many:
*
* RAID10-only options:
* [raid10_copies <# copies>] Number of copies. (Default: 2)
- * [raid10_format <near>] Layout algorithm. (Default: near)
+ * [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
*/
static int parse_raid_params(struct raid_set *rs, char **argv,
unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
return -EINVAL;
}
- if (strcmp("near", argv[i])) {
+ if (strcmp("near", argv[i]) &&
+ strcmp("far", argv[i]) &&
+ strcmp("offset", argv[i])) {
rs->ti->error = "Invalid 'raid10_format' value given";
return -EINVAL;
}
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
+ /*
+ * If the format is not "near", we only support
+ * two copies at the moment.
+ */
+ if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+ rs->ti->error = "Too many copies for given RAID10 format.";
+ return -EINVAL;
+ }
+
/* (Len * #mirrors) / #devices */
sectors_per_dev = rs->ti->len * raid10_copies;
sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
/*
* Reshaping is not currently allowed
*/
- if ((le32_to_cpu(sb->level) != mddev->level) ||
- (le32_to_cpu(sb->layout) != mddev->layout) ||
- (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
- DMERR("Reshaping arrays not yet supported.");
+ if (le32_to_cpu(sb->level) != mddev->level) {
+ DMERR("Reshaping arrays not yet supported. (RAID level change)");
+ return -EINVAL;
+ }
+ if (le32_to_cpu(sb->layout) != mddev->layout) {
+ DMERR("Reshaping arrays not yet supported. (RAID layout change)");
+ DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+ DMERR(" Old layout: %s w/ %d copies",
+ raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+ raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+ DMERR(" New layout: %s w/ %d copies",
+ raid10_md_layout_to_format(mddev->layout),
+ raid10_md_layout_to_copies(mddev->layout));
+ return -EINVAL;
+ }
+ if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+ DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
return -EINVAL;
}
/* We can only change the number of devices in RAID1 right now */
if ((rs->raid_type->level != 1) &&
(le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
- DMERR("Reshaping arrays not yet supported.");
+ DMERR("Reshaping arrays not yet supported. (device count change)");
return -EINVAL;
}
@@ -1329,7 +1407,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
raid10_md_layout_to_copies(rs->md.layout));
if (rs->print_flags & DMPF_RAID10_FORMAT)
- DMEMIT(" raid10_format near");
+ DMEMIT(" raid10_format %s",
+ raid10_md_layout_to_format(rs->md.layout));
DMEMIT(" %d", rs->md.raid_disks);
for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1420,6 +1499,10 @@ static struct target_type raid_target = {
static int __init dm_raid_init(void)
{
+ DMINFO("Loading target version %u.%u.%u",
+ raid_target.version[0],
+ raid_target.version[1],
+ raid_target.version[2]);
return dm_register_target(&raid_target);
}
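
The dm-raid changes above all revolve around how MD packs a raid10 geometry into a single 32-bit "layout" word: near copies in the low byte, far copies in the second byte, bit 16 for "offset" and bit 17 for "use_far_sets". The following is a minimal userspace sketch, not part of the patch, that mirrors raid10_format_to_md_layout() so the resulting layout values can be inspected; the helper and program names are made up for illustration.

#include <stdio.h>
#include <string.h>

/* Mirrors raid10_format_to_md_layout() from the dm-raid.c hunk above. */
static unsigned format_to_layout(const char *format, unsigned copies)
{
	unsigned n = 1, f = 1;

	if (!strcmp("near", format))
		n = copies;
	else
		f = copies;

	if (!strcmp("offset", format))
		return 0x30000 | (f << 8) | n;	/* far_offset + use_far_sets */
	if (!strcmp("far", format))
		return 0x20000 | (f << 8) | n;	/* use_far_sets only */
	return (f << 8) | n;			/* plain "near" */
}

int main(void)
{
	const char *formats[] = { "near", "far", "offset" };
	int i;

	for (i = 0; i < 3; i++)
		printf("%-6s copies=2 -> layout 0x%05x\n",
		       formats[i], format_to_layout(formats[i], 2));
	return 0;
}

With two copies this prints 0x00102 for "near", 0x20201 for "far" and 0x30201 for "offset", which is exactly what raid10_md_layout_to_copies() and raid10_md_layout_to_format() decode again on the status and validation paths.
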
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1e634a68541e..8b557d2b6f99 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -607,9 +607,9 @@ static struct mddev * mddev_find(dev_t unit)
goto retry;
}
-static inline int mddev_lock(struct mddev * mddev)
+int md_queue_misc_work(struct work_struct *work)
{
- return mutex_lock_interruptible(&mddev->reconfig_mutex);
+ return queue_work(md_misc_wq, work);
}
static inline int mddev_is_locked(struct mddev *mddev)
@@ -624,7 +624,7 @@ static inline int mddev_trylock(struct mddev * mddev)
static struct attribute_group md_redundancy_group;
-static void mddev_unlock(struct mddev * mddev)
+void mddev_unlock(struct mddev * mddev)
{
if (mddev->to_remove) {
/* These cannot be removed under reconfig_mutex as
@@ -8631,6 +8631,8 @@ EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
+EXPORT_SYMBOL(mddev_unlock);
+EXPORT_SYMBOL(md_queue_misc_work);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index eca59c3074ef..2afa1c34d35b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -612,4 +612,12 @@ static inline int mddev_check_plugged(struct mddev *mddev)
return !!blk_check_plugged(md_unplug, mddev,
sizeof(struct blk_plug_cb));
}
+
+static inline int mddev_lock(struct mddev * mddev)
+{
+ return mutex_lock_interruptible(&mddev->reconfig_mutex);
+}
+extern void mddev_unlock(struct mddev *mddev);
+extern int md_queue_misc_work(struct work_struct *work);
+
#endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010e..6e5d5a5f9cb4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1000,6 +1000,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
@@ -1301,7 +1302,8 @@ read_again:
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
+ mbio->bi_rw =
+ WRITE | do_flush_fua | do_sync | do_discard | do_same;
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
@@ -2818,6 +2820,9 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
+ if (mddev->queue)
+ blk_queue_max_write_same_sectors(mddev->queue,
+ mddev->chunk_sectors);
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..61ed150bd0cf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
* near_copies (stored in low byte of layout)
* far_copies (stored in second byte of layout)
* far_offset (stored in bit 16 of layout )
+ * use_far_sets (stored in bit 17 of layout )
*
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize. Each device
+ * is divided into far_copies sections. In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive). The starting device for each section is offset
+ * near_copies from the starting device of the previous section. Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive. near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
*
* If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true. In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size. The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array. This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ * A B C D A B C D E
+ * ... ...
+ * D A B C E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ * [A B] [C D] [A B] [C D E]
+ * |...| |...| |...| | ... |
+ * [B A] [D C] [B A] [E C D]
*/
/*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
sector_t stripe;
int dev;
int slot = 0;
+ int last_far_set_start, last_far_set_size;
+
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ last_far_set_size = geo->far_set_size;
+ last_far_set_size += (geo->raid_disks % geo->far_set_size);
/* now calculate first sector/dev */
chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
/* and calculate all the others */
for (n = 0; n < geo->near_copies; n++) {
int d = dev;
+ int set;
sector_t s = sector;
- r10bio->devs[slot].addr = sector;
r10bio->devs[slot].devnum = d;
+ r10bio->devs[slot].addr = s;
slot++;
for (f = 1; f < geo->far_copies; f++) {
+ set = d / geo->far_set_size;
d += geo->near_copies;
- if (d >= geo->raid_disks)
- d -= geo->raid_disks;
+
+ if ((geo->raid_disks % geo->far_set_size) &&
+ (d > last_far_set_start)) {
+ d -= last_far_set_start;
+ d %= last_far_set_size;
+ d += last_far_set_start;
+ } else {
+ d %= geo->far_set_size;
+ d += geo->far_set_size * set;
+ }
s += geo->stride;
r10bio->devs[slot].devnum = d;
r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* or recovery, so reshape isn't happening
*/
struct geom *geo = &conf->geo;
+ int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+ int far_set_size = geo->far_set_size;
+ int last_far_set_start;
+
+ if (geo->raid_disks % geo->far_set_size) {
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ if (dev >= last_far_set_start) {
+ far_set_size = geo->far_set_size;
+ far_set_size += (geo->raid_disks % geo->far_set_size);
+ far_set_start = last_far_set_start;
+ }
+ }
offset = sector & geo->chunk_mask;
if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
chunk = sector >> geo->chunk_shift;
fc = sector_div(chunk, geo->far_copies);
dev -= fc * geo->near_copies;
- if (dev < 0)
- dev += geo->raid_disks;
+ if (dev < far_set_start)
+ dev += far_set_size;
} else {
while (sector >= geo->stride) {
sector -= geo->stride;
- if (dev < geo->near_copies)
- dev += geo->raid_disks - geo->near_copies;
+ if (dev < (geo->near_copies + far_set_start))
+ dev += far_set_size - geo->near_copies;
else
dev -= geo->near_copies;
}
@@ -1105,6 +1151,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
@@ -1460,7 +1507,8 @@ retry_write:
rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1550,8 @@ retry_write:
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3485,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
disks = mddev->raid_disks + mddev->delta_disks;
break;
}
- if (layout >> 17)
+ if (layout >> 18)
return -1;
if (chunk < (PAGE_SIZE >> 9) ||
!is_power_of_2(chunk))
@@ -3448,6 +3497,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
geo->near_copies = nc;
geo->far_copies = fc;
geo->far_offset = fo;
+ geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
geo->chunk_mask = chunk - 1;
geo->chunk_shift = ffz(~chunk);
return nc*fc;
@@ -3569,6 +3619,8 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue,
+ mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
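
The new 'use_far_sets' handling in __raid10_find_phys() and raid10_find_virt() is the subtle part of the raid10.c changes: far/offset copies now rotate inside a set of (near_copies * far_copies) devices, with the last set absorbing any remainder when raid_disks is not a multiple of the set size. Below is a small userspace sketch, not part of the patch, that reproduces just that device rotation; the function name and the 5-disk example geometry are illustrative only.

#include <stdio.h>

/* Where does the next far copy of a chunk starting on 'dev' land?
 * Mirrors the far-set arithmetic added to __raid10_find_phys() above. */
static int far_copy_dev(int dev, int near_copies, int far_set_size,
			int raid_disks)
{
	int last_start = (raid_disks / far_set_size - 1) * far_set_size;
	int last_size  = far_set_size + raid_disks % far_set_size;
	int set = dev / far_set_size;
	int d = dev + near_copies;

	if ((raid_disks % far_set_size) && d > last_start) {
		/* uneven array: rotate within the (larger) last set */
		d -= last_start;
		d %= last_size;
		d += last_start;
	} else {
		/* rotate within this device's own set */
		d %= far_set_size;
		d += far_set_size * set;
	}
	return d;
}

int main(void)
{
	int dev;

	/* 5 devices, "far" layout, 2 copies: far_set_size = 5 / 2 = 2,
	 * so the sets are [0 1] and [2 3 4]. */
	for (dev = 0; dev < 5; dev++)
		printf("chunk on dev %d -> far copy on dev %d\n",
		       dev, far_copy_dev(dev, 1, 2, 5));
	return 0;
}

For the 5-disk case this yields 0->1, 1->0, 2->3, 3->4 and 4->2, matching the "[A B] [C D E]" / "[B A] [E C D]" illustration in the updated raid10.c comment block.
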
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf602345..157d69e83ff4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
* far_offset, in which case it is
* 1 stripe.
*/
+ int far_set_size; /* The number of devices in a set,
+ * where a 'set' are devices that
+ * contain far/offset copies of
+ * each other.
+ */
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask;
} prev, geo;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9ab506df42da..77c4a585fb82 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -198,6 +198,21 @@ static int stripe_operations_active(struct stripe_head *sh)
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+ struct r5conf *conf = sh->raid_conf;
+ struct raid5_percpu *percpu;
+ int i, orphaned = 1;
+
+ percpu = per_cpu_ptr(conf->percpu, sh->cpu);
+ for_each_cpu(i, &percpu->handle_threads) {
+ md_wakeup_thread(conf->aux_threads[i]->thread);
+ orphaned = 0;
+ }
+ if (orphaned)
+ md_wakeup_thread(conf->mddev->thread);
+}
+
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
BUG_ON(!list_empty(&sh->lru));
@@ -210,9 +225,19 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
+ int cpu = sh->cpu;
+ struct raid5_percpu *percpu;
+ if (!cpu_online(cpu)) {
+ cpu = cpumask_any(cpu_online_mask);
+ sh->cpu = cpu;
+ }
+ percpu = per_cpu_ptr(conf->percpu, cpu);
+
clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
- list_add_tail(&sh->lru, &conf->handle_list);
+ list_add_tail(&sh->lru, &percpu->handle_list);
+ raid5_wakeup_stripe_thread(sh);
+ return;
}
md_wakeup_thread(conf->mddev->thread);
} else {
@@ -357,6 +382,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
raid5_build_block(sh, i, previous);
}
insert_hash(conf, sh);
+ sh->cpu = smp_processor_id();
}
static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -3759,12 +3785,19 @@ static void raid5_activate_delayed(struct r5conf *conf)
while (!list_empty(&conf->delayed_list)) {
struct list_head *l = conf->delayed_list.next;
struct stripe_head *sh;
+ int cpu;
sh = list_entry(l, struct stripe_head, lru);
list_del_init(l);
clear_bit(STRIPE_DELAYED, &sh->state);
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
list_add_tail(&sh->lru, &conf->hold_list);
+ cpu = sh->cpu;
+ if (!cpu_online(cpu)) {
+ cpu = cpumask_any(cpu_online_mask);
+ sh->cpu = cpu;
+ }
+ raid5_wakeup_stripe_thread(sh);
}
}
}
@@ -4041,18 +4074,29 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
* head of the hold_list has changed, i.e. the head was promoted to the
* handle_list.
*/
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
-{
- struct stripe_head *sh;
-
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf,
+ cpumask_t *mask)
+{
+ struct stripe_head *sh = NULL, *tmp;
+ struct list_head *handle_list = NULL;
+ int cpu;
+
+ /* Should we take action to avoid starvation of latter CPUs ? */
+ for_each_cpu(cpu, mask) {
+ struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+ if (!list_empty(&percpu->handle_list)) {
+ handle_list = &percpu->handle_list;
+ break;
+ }
+ }
pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
__func__,
- list_empty(&conf->handle_list) ? "empty" : "busy",
+ !handle_list ? "empty" : "busy",
list_empty(&conf->hold_list) ? "empty" : "busy",
atomic_read(&conf->pending_full_writes), conf->bypass_count);
- if (!list_empty(&conf->handle_list)) {
- sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+ if (handle_list) {
+ sh = list_entry(handle_list->next, typeof(*sh), lru);
if (list_empty(&conf->hold_list))
conf->bypass_count = 0;
@@ -4070,12 +4114,23 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
((conf->bypass_threshold &&
conf->bypass_count > conf->bypass_threshold) ||
atomic_read(&conf->pending_full_writes) == 0)) {
- sh = list_entry(conf->hold_list.next,
- typeof(*sh), lru);
- conf->bypass_count -= conf->bypass_threshold;
- if (conf->bypass_count < 0)
- conf->bypass_count = 0;
- } else
+
+ list_for_each_entry(tmp, &conf->hold_list, lru) {
+ if (cpumask_test_cpu(tmp->cpu, mask) ||
+ !cpu_online(tmp->cpu)) {
+ sh = tmp;
+ break;
+ }
+ }
+
+ if (sh) {
+ conf->bypass_count -= conf->bypass_threshold;
+ if (conf->bypass_count < 0)
+ conf->bypass_count = 0;
+ }
+ }
+
+ if (!sh)
return NULL;
list_del_init(&sh->lru);
@@ -4757,13 +4812,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
}
#define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, cpumask_t *mask)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
int i, batch_size = 0;
while (batch_size < MAX_STRIPE_BATCH &&
- (sh = __get_priority_stripe(conf)) != NULL)
+ (sh = __get_priority_stripe(conf, mask)) != NULL)
batch[batch_size++] = sh;
if (batch_size == 0)
@@ -4781,6 +4836,35 @@ static int handle_active_stripes(struct r5conf *conf)
return batch_size;
}
+static void raid5auxd(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct r5conf *conf = mddev->private;
+ struct blk_plug plug;
+ int handled;
+ struct raid5_auxth *auxth = thread->private;
+
+ pr_debug("+++ raid5auxd active\n");
+
+ blk_start_plug(&plug);
+ handled = 0;
+ spin_lock_irq(&conf->device_lock);
+ while (1) {
+ int batch_size;
+
+ batch_size = handle_active_stripes(conf, &auxth->work_mask);
+ if (!batch_size)
+ break;
+ handled += batch_size;
+ }
+ pr_debug("%d stripes handled\n", handled);
+
+ spin_unlock_irq(&conf->device_lock);
+ blk_finish_plug(&plug);
+
+ pr_debug("--- raid5auxd inactive\n");
+}
+
/*
* This is our raid5 kernel thread.
*
@@ -4828,7 +4912,7 @@ static void raid5d(struct md_thread *thread)
handled++;
}
- batch_size = handle_active_stripes(conf);
+ batch_size = handle_active_stripes(conf, &conf->work_mask);
if (!batch_size)
break;
handled += batch_size;
@@ -4957,10 +5041,270 @@ stripe_cache_active_show(struct mddev *mddev, char *page)
static struct md_sysfs_entry
raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
+static void raid5_update_threads_handle_mask(struct mddev *mddev)
+{
+ int cpu, i;
+ struct raid5_percpu *percpu;
+ struct r5conf *conf = mddev->private;
+
+ for_each_online_cpu(cpu) {
+ percpu = per_cpu_ptr(conf->percpu, cpu);
+ cpumask_clear(&percpu->handle_threads);
+ }
+ cpumask_copy(&conf->work_mask, cpu_online_mask);
+
+ for (i = 0; i < conf->aux_thread_num; i++) {
+ cpumask_t *work_mask = &conf->aux_threads[i]->work_mask;
+ for_each_cpu(cpu, work_mask) {
+ percpu = per_cpu_ptr(conf->percpu, cpu);
+ cpumask_set_cpu(i, &percpu->handle_threads);
+ }
+ cpumask_andnot(&conf->work_mask, &conf->work_mask,
+ work_mask);
+ }
+}
+
+struct raid5_auxth_sysfs {
+ struct attribute attr;
+ ssize_t (*show)(struct mddev *, struct raid5_auxth *, char *);
+ ssize_t (*store)(struct mddev *, struct raid5_auxth *,
+ const char *, size_t);
+};
+
+static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
+ struct raid5_auxth *thread, char *page)
+{
+ if (!mddev->private)
+ return 0;
+ return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
+}
+
+static ssize_t
+raid5_store_thread_cpulist(struct mddev *mddev, struct raid5_auxth *thread,
+ const char *page, size_t len)
+{
+ struct r5conf *conf = mddev->private;
+ cpumask_var_t mask;
+
+ if (!conf)
+ return -ENODEV;
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (cpulist_parse(page, mask)) {
+ free_cpumask_var(mask);
+ return -EINVAL;
+ }
+
+ get_online_cpus();
+ spin_lock_irq(&conf->device_lock);
+ cpumask_copy(&thread->work_mask, mask);
+ raid5_update_threads_handle_mask(mddev);
+ spin_unlock_irq(&conf->device_lock);
+ put_online_cpus();
+ set_cpus_allowed_ptr(thread->thread->tsk, mask);
+
+ free_cpumask_var(mask);
+ return len;
+}
+
+static struct raid5_auxth_sysfs thread_cpulist =
+__ATTR(cpulist, S_IRUGO|S_IWUSR, raid5_show_thread_cpulist,
+ raid5_store_thread_cpulist);
+
+static struct attribute *auxth_attrs[] = {
+ &thread_cpulist.attr,
+ NULL,
+};
+
+static ssize_t
+raid5_auxth_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+ struct raid5_auxth_sysfs *entry = container_of(attr,
+ struct raid5_auxth_sysfs, attr);
+ struct raid5_auxth *thread = container_of(kobj,
+ struct raid5_auxth, kobj);
+ struct mddev *mddev = thread->thread->mddev;
+ ssize_t ret;
+
+ if (!entry->show)
+ return -EIO;
+ mddev_lock(mddev);
+ ret = entry->show(mddev, thread, page);
+ mddev_unlock(mddev);
+ return ret;
+}
+
+static ssize_t
+raid5_auxth_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *page, size_t length)
+{
+ struct raid5_auxth_sysfs *entry = container_of(attr,
+ struct raid5_auxth_sysfs, attr);
+ struct raid5_auxth *thread = container_of(kobj,
+ struct raid5_auxth, kobj);
+ struct mddev *mddev = thread->thread->mddev;
+ ssize_t ret;
+
+ if (!entry->store)
+ return -EIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ mddev_lock(mddev);
+ ret = entry->store(mddev, thread, page, length);
+ mddev_unlock(mddev);
+ return ret;
+}
+
+static void raid5_auxth_release(struct kobject *kobj)
+{
+ struct raid5_auxth *thread = container_of(kobj,
+ struct raid5_auxth, kobj);
+ kfree(thread);
+}
+
+static const struct sysfs_ops raid5_auxth_sysfsops = {
+ .show = raid5_auxth_attr_show,
+ .store = raid5_auxth_attr_store,
+};
+static struct kobj_type raid5_auxth_ktype = {
+ .release = raid5_auxth_release,
+ .sysfs_ops = &raid5_auxth_sysfsops,
+ .default_attrs = auxth_attrs,
+};
+
+static ssize_t
+raid5_show_auxthread_number(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf = mddev->private;
+ if (conf)
+ return sprintf(page, "%d\n", conf->aux_thread_num);
+ else
+ return 0;
+}
+
+static void raid5_auxth_delete(struct work_struct *ws)
+{
+ struct raid5_auxth *thread = container_of(ws, struct raid5_auxth,
+ del_work);
+
+ kobject_del(&thread->kobj);
+ kobject_put(&thread->kobj);
+}
+
+static void __free_aux_thread(struct mddev *mddev, struct raid5_auxth *thread)
+{
+ md_unregister_thread(&thread->thread);
+ INIT_WORK(&thread->del_work, raid5_auxth_delete);
+ kobject_get(&thread->kobj);
+ md_queue_misc_work(&thread->del_work);
+}
+
+static struct raid5_auxth *__create_aux_thread(struct mddev *mddev, int i)
+{
+ struct raid5_auxth *thread;
+ char name[10];
+
+ thread = kzalloc(sizeof(*thread), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+ snprintf(name, 10, "aux%d", i);
+ thread->thread = md_register_thread(raid5auxd, mddev, name);
+ if (!thread->thread) {
+ kfree(thread);
+ return NULL;
+ }
+ thread->thread->private = thread;
+
+ cpumask_copy(&thread->work_mask, cpu_online_mask);
+
+ if (kobject_init_and_add(&thread->kobj, &raid5_auxth_ktype,
+ &mddev->kobj, "auxth%d", i)) {
+ md_unregister_thread(&thread->thread);
+ kfree(thread);
+ return NULL;
+ }
+ return thread;
+}
+
+static ssize_t
+raid5_store_auxthread_number(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf = mddev->private;
+ unsigned long new;
+ int i;
+ struct raid5_auxth **threads;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (!conf)
+ return -ENODEV;
+
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+
+ if (new == conf->aux_thread_num)
+ return len;
+
+ /* There is no point creating more threads than cpu number */
+ if (new > num_online_cpus())
+ return -EINVAL;
+
+ if (new > conf->aux_thread_num) {
+ threads = kzalloc(sizeof(struct raid5_auxth *) * new,
+ GFP_KERNEL);
+ if (!threads)
+ return -ENOMEM;
+
+ i = conf->aux_thread_num;
+ while (i < new) {
+ threads[i] = __create_aux_thread(mddev, i);
+ if (!threads[i])
+ goto error;
+
+ i++;
+ }
+ memcpy(threads, conf->aux_threads,
+ sizeof(struct raid5_auxth *) * conf->aux_thread_num);
+ get_online_cpus();
+ spin_lock_irq(&conf->device_lock);
+ kfree(conf->aux_threads);
+ conf->aux_threads = threads;
+ conf->aux_thread_num = new;
+ raid5_update_threads_handle_mask(mddev);
+ spin_unlock_irq(&conf->device_lock);
+ put_online_cpus();
+ } else {
+ int old = conf->aux_thread_num;
+
+ get_online_cpus();
+ spin_lock_irq(&conf->device_lock);
+ conf->aux_thread_num = new;
+ raid5_update_threads_handle_mask(mddev);
+ spin_unlock_irq(&conf->device_lock);
+ put_online_cpus();
+ for (i = new; i < old; i++)
+ __free_aux_thread(mddev, conf->aux_threads[i]);
+ }
+
+ return len;
+error:
+ while (--i >= conf->aux_thread_num)
+ __free_aux_thread(mddev, threads[i]);
+ kfree(threads);
+ return -ENOMEM;
+}
+
+static struct md_sysfs_entry
+raid5_auxthread_number = __ATTR(auxthread_number, S_IRUGO|S_IWUSR,
+ raid5_show_auxthread_number,
+ raid5_store_auxthread_number);
+
static struct attribute *raid5_attrs[] = {
&raid5_stripecache_size.attr,
&raid5_stripecache_active.attr,
&raid5_preread_bypass_threshold.attr,
+ &raid5_auxthread_number.attr,
NULL,
};
static struct attribute_group raid5_attrs_group = {
@@ -5008,6 +5352,7 @@ static void raid5_free_percpu(struct r5conf *conf)
static void free_conf(struct r5conf *conf)
{
+ kfree(conf->aux_threads);
shrink_stripes(conf);
raid5_free_percpu(conf);
kfree(conf->disks);
@@ -5020,7 +5365,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
- long cpu = (long)hcpu;
+ long cpu = (long)hcpu, anycpu;
struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
switch (action) {
@@ -5039,9 +5384,17 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
__func__, cpu);
return notifier_from_errno(-ENOMEM);
}
+ INIT_LIST_HEAD(&(percpu->handle_list));
+ cpumask_clear(&percpu->handle_threads);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
+ spin_lock_irq(&conf->device_lock);
+ anycpu = cpumask_any(cpu_online_mask);
+ list_splice_tail_init(&percpu->handle_list,
+ &per_cpu_ptr(conf->percpu, anycpu)->handle_list);
+ spin_unlock_irq(&conf->device_lock);
+
safe_put_page(percpu->spare_page);
kfree(percpu->scribble);
percpu->spare_page = NULL;
@@ -5050,6 +5403,10 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
default:
break;
}
+
+ spin_lock_irq(&conf->device_lock);
+ raid5_update_threads_handle_mask(conf->mddev);
+ spin_unlock_irq(&conf->device_lock);
return NOTIFY_OK;
}
#endif
@@ -5070,20 +5427,24 @@ static int raid5_alloc_percpu(struct r5conf *conf)
get_online_cpus();
err = 0;
for_each_present_cpu(cpu) {
+ struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
if (conf->level == 6) {
spare_page = alloc_page(GFP_KERNEL);
if (!spare_page) {
err = -ENOMEM;
break;
}
- per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+ percpu->spare_page = spare_page;
}
scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
if (!scribble) {
err = -ENOMEM;
break;
}
- per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
+ percpu->scribble = scribble;
+ INIT_LIST_HEAD(&percpu->handle_list);
+ cpumask_clear(&percpu->handle_threads);
}
#ifdef CONFIG_HOTPLUG_CPU
conf->cpu_notify.notifier_call = raid456_cpu_notify;
@@ -5139,7 +5500,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
- INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
@@ -5150,6 +5510,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->bypass_threshold = BYPASS_THRESHOLD;
conf->recovery_disabled = mddev->recovery_disabled - 1;
+ cpumask_copy(&conf->work_mask, cpu_online_mask);
+
conf->raid_disks = mddev->raid_disks;
if (mddev->reshape_position == MaxSector)
conf->previous_raid_disks = mddev->raid_disks;
@@ -5606,6 +5968,10 @@ abort:
static int stop(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
+ int i;
+
+ for (i = 0; i < conf->aux_thread_num; i++)
+ __free_aux_thread(mddev, conf->aux_threads[i]);
md_unregister_thread(&mddev->thread);
if (mddev->queue)
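
The raid5.c side of the merge moves stripe handling onto per-CPU lists and optional auxiliary threads: do_release_stripe() queues a stripe on the handle list of the CPU it was initialised on, and raid5_wakeup_stripe_thread() wakes every auxiliary thread whose cpulist covers that CPU, falling back to the main raid5d thread when no aux thread claims it ("orphaned"). The toy userspace model below, not part of the patch, shows only that dispatch policy; the sizes and the coverage table are invented for illustration.

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4
#define NR_AUX  2

/* aux_covers[t][c]: does aux thread t handle stripes queued on CPU c? */
static const bool aux_covers[NR_AUX][NR_CPUS] = {
	{ true,  true,  false, false },	/* aux0 bound to CPUs 0-1 */
	{ false, false, true,  false },	/* aux1 bound to CPU 2    */
};

/* Models raid5_wakeup_stripe_thread(): wake every covering aux thread,
 * or the main raid5d thread if the CPU is "orphaned". */
static void wake_for_cpu(int cpu)
{
	int t, woken = 0;

	for (t = 0; t < NR_AUX; t++)
		if (aux_covers[t][cpu]) {
			printf("stripe on CPU %d -> wake aux%d\n", cpu, t);
			woken++;
		}
	if (!woken)
		printf("stripe on CPU %d -> wake raid5d\n", cpu);
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		wake_for_cpu(cpu);
	return 0;
}

At runtime the split is tuned through the new sysfs knobs added above: writing a count to the array's auxthread_number attribute creates the auxiliary threads, and each thread exposes a cpulist attribute (under its auxth<N> kobject, which sits below the array's md sysfs directory) to choose which CPUs' stripes it services.
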
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1fd..2afd8358556b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -211,6 +211,7 @@ struct stripe_head {
enum check_states check_state;
enum reconstruct_states reconstruct_state;
spinlock_t stripe_lock;
+ int cpu;
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@@ -365,6 +366,14 @@ struct disk_info {
struct md_rdev *rdev, *replacement;
};
+struct raid5_auxth {
+ struct md_thread *thread;
+ /* which CPUs should the auxiliary thread handle stripes from */
+ cpumask_t work_mask;
+ struct kobject kobj;
+ struct work_struct del_work;
+};
+
struct r5conf {
struct hlist_head *stripe_hashtbl;
struct mddev *mddev;
@@ -433,6 +442,12 @@ struct r5conf {
* lists and performing address
* conversions
*/
+ struct list_head handle_list;
+ cpumask_t handle_threads; /* Which threads can the CPU's
+ * stripes be handled. It really
+ * is a bitmap to aux_threads[],
+ * but has max bits NR_CPUS
+ */
} __percpu *percpu;
size_t scribble_len; /* size of scribble region must be
* associated with conf to handle
@@ -460,6 +475,10 @@ struct r5conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;
+ int aux_thread_num;
+ struct raid5_auxth **aux_threads;
+ /* which CPUs should raid5d thread handle stripes from */
+ cpumask_t work_mask;
};
/*