Diffstat (limited to 'drivers/md'): 54 files changed, 3135 insertions(+), 3900 deletions(-)
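Among the larger changes in the diff below, md's bitmap setup is split into two phases: bitmap_create() now only allocates the in-memory structures, while the new bitmap_load() reads the on-disk bitmap (or queries a dm dirty log), arms the daemon timeout and writes the superblock back. A minimal sketch of the resulting call order, assuming a hypothetical caller in place of the md core:

/* Sketch only: example_start_array() is hypothetical; bitmap_create(),
 * bitmap_load() and bitmap_destroy() are the interfaces shown in the
 * bitmap.c and bitmap.h hunks below. */
static int example_start_array(mddev_t *mddev)
{
	int err;

	err = bitmap_create(mddev);	/* allocate structures only */
	if (err)
		return err;

	/* ... the personality's run() method would be invoked here ... */

	err = bitmap_load(mddev);	/* read bits from disk or dirty log,
					 * start the daemon, update the sb */
	if (err)
		bitmap_destroy(mddev);	/* assumed cleanup on failure */

	return err;
}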
diff --git a/drivers/md/.gitignore b/drivers/md/.gitignore deleted file mode 100644 index a7afec6b19c6..000000000000 --- a/drivers/md/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -mktables -raid6altivec*.c -raid6int*.c -raid6tables.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index acb3a4e404ff..bf1a95e31559 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -100,8 +100,8 @@ config MD_RAID1 If unsure, say Y. config MD_RAID10 - tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" - depends on BLK_DEV_MD && EXPERIMENTAL + tristate "RAID-10 (mirrored striping) mode" + depends on BLK_DEV_MD ---help--- RAID-10 provides a combination of striping (RAID-0) and mirroring (RAID-1) with easier configuration and more flexible @@ -121,7 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD - select MD_RAID6_PQ + select RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR select ASYNC_PQ @@ -165,22 +165,6 @@ config MULTICORE_RAID456 If unsure, say N. -config MD_RAID6_PQ - tristate - -config ASYNC_RAID6_TEST - tristate "Self test for hardware accelerated raid6 recovery" - depends on MD_RAID6_PQ - select ASYNC_RAID6_RECOV - ---help--- - This is a one-shot self test that permutes through the - recovery of all the possible two disk failure scenarios for a - N-disk array. Recovery is performed with the asynchronous - raid6 recovery routines, and will optionally use an offload - engine if one is available. - - If unsure, say N. - config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index e355e7f6a536..5e3aac41919d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -12,13 +12,6 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o md-mod-y += md.o bitmap.o raid456-y += raid5.o -raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ - raid6int1.o raid6int2.o raid6int4.o \ - raid6int8.o raid6int16.o raid6int32.o \ - raid6altivec1.o raid6altivec2.o raid6altivec4.o \ - raid6altivec8.o \ - raid6mmx.o raid6sse1.o raid6sse2.o -hostprogs-y += mktables # Note: link order is important. 
All raid personalities # and must come before md.o, as they each initialise @@ -29,7 +22,6 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o -obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o @@ -45,75 +37,6 @@ obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o -quiet_cmd_unroll = UNROLL $@ - cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \ - < $< > $@ || ( rm -f $@ && exit 1 ) - -ifeq ($(CONFIG_ALTIVEC),y) -altivec_flags := -maltivec -mabi=altivec -endif - ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif - -targets += raid6int1.c -$(obj)/raid6int1.c: UNROLL := 1 -$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += raid6int2.c -$(obj)/raid6int2.c: UNROLL := 2 -$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += raid6int4.c -$(obj)/raid6int4.c: UNROLL := 4 -$(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += raid6int8.c -$(obj)/raid6int8.c: UNROLL := 8 -$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += raid6int16.c -$(obj)/raid6int16.c: UNROLL := 16 -$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -targets += raid6int32.c -$(obj)/raid6int32.c: UNROLL := 32 -$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec1.o += $(altivec_flags) -targets += raid6altivec1.c -$(obj)/raid6altivec1.c: UNROLL := 1 -$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec2.o += $(altivec_flags) -targets += raid6altivec2.c -$(obj)/raid6altivec2.c: UNROLL := 2 -$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec4.o += $(altivec_flags) -targets += raid6altivec4.c -$(obj)/raid6altivec4.c: UNROLL := 4 -$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec8.o += $(altivec_flags) -targets += raid6altivec8.c -$(obj)/raid6altivec8.c: UNROLL := 8 -$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE - $(call if_changed,unroll) - -quiet_cmd_mktable = TABLE $@ - cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) - -targets += raid6tables.c -$(obj)/raid6tables.c: $(obj)/mktables FORCE - $(call if_changed,mktable) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 26ac8aad0b19..e4fb58db5454 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -13,7 +13,6 @@ * Still to do: * * flush after percent set rather than just time based. (maybe both). - * wait if count gets too high, wake when it drops to half. */ #include <linux/blkdev.h> @@ -30,6 +29,7 @@ #include "md.h" #include "bitmap.h" +#include <linux/dm-dirty-log.h> /* debug macros */ #define DEBUG 0 @@ -51,9 +51,6 @@ #define INJECT_FATAL_FAULT_3 0 /* undef */ #endif -//#define DPRINTK PRINTK /* set this NULL to avoid verbose debug output */ -#define DPRINTK(x...) do { } while(0) - #ifndef PRINTK # if DEBUG > 0 # define PRINTK(x...) 
printk(KERN_DEBUG x) @@ -62,12 +59,11 @@ # endif #endif -static inline char * bmname(struct bitmap *bitmap) +static inline char *bmname(struct bitmap *bitmap) { return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } - /* * just a placeholder - calls kmalloc for bitmap pages */ @@ -78,7 +74,7 @@ static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) #ifdef INJECT_FAULTS_1 page = NULL; #else - page = kmalloc(PAGE_SIZE, GFP_NOIO); + page = kzalloc(PAGE_SIZE, GFP_NOIO); #endif if (!page) printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); @@ -107,7 +103,8 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) * if we find our page, we increment the page's refcount so that it stays * allocated while we're using it */ -static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create) +static int bitmap_checkpage(struct bitmap *bitmap, + unsigned long page, int create) __releases(bitmap->lock) __acquires(bitmap->lock) { @@ -121,7 +118,6 @@ __acquires(bitmap->lock) return -EINVAL; } - if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -131,43 +127,34 @@ __acquires(bitmap->lock) if (!create) return -ENOENT; - spin_unlock_irq(&bitmap->lock); - /* this page has not been allocated yet */ - if ((mappage = bitmap_alloc_page(bitmap)) == NULL) { + spin_unlock_irq(&bitmap->lock); + mappage = bitmap_alloc_page(bitmap); + spin_lock_irq(&bitmap->lock); + + if (mappage == NULL) { PRINTK("%s: bitmap map page allocation failed, hijacking\n", bmname(bitmap)); /* failed - set the hijacked flag so that we can use the * pointer as a counter */ - spin_lock_irq(&bitmap->lock); if (!bitmap->bp[page].map) bitmap->bp[page].hijacked = 1; - goto out; - } - - /* got a page */ - - spin_lock_irq(&bitmap->lock); - - /* recheck the page */ - - if (bitmap->bp[page].map || bitmap->bp[page].hijacked) { + } else if (bitmap->bp[page].map || + bitmap->bp[page].hijacked) { /* somebody beat us to getting the page */ bitmap_free_page(bitmap, mappage); return 0; - } + } else { - /* no page was in place and we have one, so install it */ + /* no page was in place and we have one, so install it */ - memset(mappage, 0, PAGE_SIZE); - bitmap->bp[page].map = mappage; - bitmap->missing_pages--; -out: + bitmap->bp[page].map = mappage; + bitmap->missing_pages--; + } return 0; } - /* if page is completely empty, put it back on the free list, or dealloc it */ /* if page was hijacked, unmark the flag so it might get alloced next time */ /* Note: lock should be held when calling this */ @@ -183,26 +170,15 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ bitmap->bp[page].hijacked = 0; bitmap->bp[page].map = NULL; - return; + } else { + /* normal case, free the page */ + ptr = bitmap->bp[page].map; + bitmap->bp[page].map = NULL; + bitmap->missing_pages++; + bitmap_free_page(bitmap, ptr); } - - /* normal case, free the page */ - -#if 0 -/* actually ... let's not. 
We will probably need the page again exactly when - * memory is tight and we are flusing to disk - */ - return; -#else - ptr = bitmap->bp[page].map; - bitmap->bp[page].map = NULL; - bitmap->missing_pages++; - bitmap_free_page(bitmap, ptr); - return; -#endif } - /* * bitmap file handling - read and write the bitmap file and its superblock */ @@ -220,11 +196,14 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, mdk_rdev_t *rdev; sector_t target; + int did_alloc = 0; - if (!page) + if (!page) { page = alloc_page(GFP_KERNEL); - if (!page) - return ERR_PTR(-ENOMEM); + if (!page) + return ERR_PTR(-ENOMEM); + did_alloc = 1; + } list_for_each_entry(rdev, &mddev->disks, same_set) { if (! test_bit(In_sync, &rdev->flags) @@ -242,6 +221,8 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, return page; } } + if (did_alloc) + put_page(page); return ERR_PTR(-EIO); } @@ -286,49 +267,51 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) mddev_t *mddev = bitmap->mddev; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - int size = PAGE_SIZE; - loff_t offset = mddev->bitmap_info.offset; - if (page->index == bitmap->file_pages-1) - size = roundup(bitmap->last_page_size, - bdev_logical_block_size(rdev->bdev)); - /* Just make sure we aren't corrupting data or - * metadata - */ - if (mddev->external) { - /* Bitmap could be anywhere. */ - if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) > - rdev->data_offset && - rdev->sb_start + offset < - rdev->data_offset + mddev->dev_sectors + - (PAGE_SIZE/512)) - goto bad_alignment; - } else if (offset < 0) { - /* DATA BITMAP METADATA */ - if (offset - + (long)(page->index * (PAGE_SIZE/512)) - + size/512 > 0) - /* bitmap runs in to metadata */ - goto bad_alignment; - if (rdev->data_offset + mddev->dev_sectors - > rdev->sb_start + offset) - /* data runs in to bitmap */ - goto bad_alignment; - } else if (rdev->sb_start < rdev->data_offset) { - /* METADATA BITMAP DATA */ - if (rdev->sb_start - + offset - + page->index*(PAGE_SIZE/512) + size/512 - > rdev->data_offset) - /* bitmap runs in to data */ - goto bad_alignment; - } else { - /* DATA METADATA BITMAP - no problems */ - } - md_super_write(mddev, rdev, - rdev->sb_start + offset - + page->index * (PAGE_SIZE/512), - size, - page); + int size = PAGE_SIZE; + loff_t offset = mddev->bitmap_info.offset; + if (page->index == bitmap->file_pages-1) + size = roundup(bitmap->last_page_size, + bdev_logical_block_size(rdev->bdev)); + /* Just make sure we aren't corrupting data or + * metadata + */ + if (mddev->external) { + /* Bitmap could be anywhere. 
*/ + if (rdev->sb_start + offset + (page->index + * (PAGE_SIZE/512)) + > rdev->data_offset + && + rdev->sb_start + offset + < (rdev->data_offset + mddev->dev_sectors + + (PAGE_SIZE/512))) + goto bad_alignment; + } else if (offset < 0) { + /* DATA BITMAP METADATA */ + if (offset + + (long)(page->index * (PAGE_SIZE/512)) + + size/512 > 0) + /* bitmap runs in to metadata */ + goto bad_alignment; + if (rdev->data_offset + mddev->dev_sectors + > rdev->sb_start + offset) + /* data runs in to bitmap */ + goto bad_alignment; + } else if (rdev->sb_start < rdev->data_offset) { + /* METADATA BITMAP DATA */ + if (rdev->sb_start + + offset + + page->index*(PAGE_SIZE/512) + size/512 + > rdev->data_offset) + /* bitmap runs in to data */ + goto bad_alignment; + } else { + /* DATA METADATA BITMAP - no problems */ + } + md_super_write(mddev, rdev, + rdev->sb_start + offset + + page->index * (PAGE_SIZE/512), + size, + page); } if (wait) @@ -364,10 +347,9 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) bh = bh->b_this_page; } - if (wait) { + if (wait) wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); - } } if (bitmap->flags & BITMAP_WRITE_ERROR) bitmap_file_kick(bitmap); @@ -424,7 +406,7 @@ static struct page *read_page(struct file *file, unsigned long index, struct buffer_head *bh; sector_t block; - PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE, + PRINTK("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); page = alloc_page(GFP_KERNEL); @@ -478,7 +460,7 @@ static struct page *read_page(struct file *file, unsigned long index, } out: if (IS_ERR(page)) - printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", + printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT, PTR_ERR(page)); @@ -505,7 +487,7 @@ void bitmap_update_sb(struct bitmap *bitmap) return; } spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) { /* rocking back to read-only */ @@ -526,7 +508,7 @@ void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->sb_page) return; - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); @@ -575,7 +557,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) return err; } - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -661,14 +643,17 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, return 0; } spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page, KM_USER0); old = le32_to_cpu(sb->state) & bits; switch (op) { - case MASK_SET: sb->state |= cpu_to_le32(bits); - break; - case MASK_UNSET: sb->state &= cpu_to_le32(~bits); - break; - default: BUG(); + case MASK_SET: + sb->state |= cpu_to_le32(bits); + break; + case MASK_UNSET: + sb->state &= cpu_to_le32(~bits); + break; + default: + BUG(); } kunmap_atomic(sb, 
KM_USER0); return old; @@ -710,12 +695,14 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon static inline struct page *filemap_get_page(struct bitmap *bitmap, unsigned long chunk) { - if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; + if (bitmap->filemap == NULL) + return NULL; + if (file_page_index(bitmap, chunk) >= bitmap->file_pages) + return NULL; return bitmap->filemap[file_page_index(bitmap, chunk) - file_page_index(bitmap, 0)]; } - static void bitmap_file_unmap(struct bitmap *bitmap) { struct page **map, *sb_page; @@ -766,7 +753,6 @@ static void bitmap_file_put(struct bitmap *bitmap) } } - /* * bitmap_file_kick - if an error occurs while manipulating the bitmap file * then it is no longer reliable, so we stop using it and we mark the file @@ -785,7 +771,6 @@ static void bitmap_file_kick(struct bitmap *bitmap) ptr = d_path(&bitmap->file->f_path, path, PAGE_SIZE); - printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", bmname(bitmap), IS_ERR(ptr) ? "" : ptr); @@ -803,27 +788,36 @@ static void bitmap_file_kick(struct bitmap *bitmap) } enum bitmap_page_attr { - BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced - BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared - BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced + BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ + BITMAP_PAGE_CLEAN = 1, /* there are bits that might need to be cleared */ + BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ }; static inline void set_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - __set_bit((page->index<<2) + attr, bitmap->filemap_attr); + if (page) + __set_bit((page->index<<2) + attr, bitmap->filemap_attr); + else + __set_bit(attr, &bitmap->logattrs); } static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); + if (page) + __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); + else + __clear_bit(attr, &bitmap->logattrs); } static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - return test_bit((page->index<<2) + attr, bitmap->filemap_attr); + if (page) + return test_bit((page->index<<2) + attr, bitmap->filemap_attr); + else + return test_bit(attr, &bitmap->logattrs); } /* @@ -836,30 +830,32 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; - struct page *page; + struct page *page = NULL; void *kaddr; unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); if (!bitmap->filemap) { - return; - } - - page = filemap_get_page(bitmap, chunk); - if (!page) return; - bit = file_page_offset(bitmap, chunk); + struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; + if (log) + log->type->mark_region(log, chunk); + } else { - /* set the bit */ - kaddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) - set_bit(bit, kaddr); - else - ext2_set_bit(bit, kaddr); - kunmap_atomic(kaddr, KM_USER0); - PRINTK("set file bit %lu page %lu\n", bit, page->index); + page = filemap_get_page(bitmap, chunk); + if (!page) + return; + bit = file_page_offset(bitmap, chunk); + /* set the bit */ + kaddr = kmap_atomic(page, KM_USER0); + if (bitmap->flags & BITMAP_HOSTENDIAN) + set_bit(bit, kaddr); + 
else + ext2_set_bit(bit, kaddr); + kunmap_atomic(kaddr, KM_USER0); + PRINTK("set file bit %lu page %lu\n", bit, page->index); + } /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); - } /* this gets called when the md device is ready to unplug its underlying @@ -874,6 +870,16 @@ void bitmap_unplug(struct bitmap *bitmap) if (!bitmap) return; + if (!bitmap->filemap) { + /* Must be using a dirty_log */ + struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; + dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs); + need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs); + if (dirty || need_write) + if (log->type->flush(log)) + bitmap->flags |= BITMAP_WRITE_ERROR; + goto out; + } /* look at each page to see if there are any set bits that need to be * flushed out to disk */ @@ -892,7 +898,7 @@ void bitmap_unplug(struct bitmap *bitmap) wait = 1; spin_unlock_irqrestore(&bitmap->lock, flags); - if (dirty | need_write) + if (dirty || need_write) write_page(bitmap, page, 0); } if (wait) { /* if any writes were performed, we need to wait on them */ @@ -902,9 +908,11 @@ void bitmap_unplug(struct bitmap *bitmap) else md_super_wait(bitmap->mddev); } +out: if (bitmap->flags & BITMAP_WRITE_ERROR) bitmap_file_kick(bitmap); } +EXPORT_SYMBOL(bitmap_unplug); static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize @@ -943,12 +951,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) printk(KERN_INFO "%s: bitmap file is out of date, doing full " "recovery\n", bmname(bitmap)); - bytes = (chunks + 7) / 8; + bytes = DIV_ROUND_UP(bitmap->chunks, 8); if (!bitmap->mddev->bitmap_info.external) bytes += sizeof(bitmap_super_t); - - num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); if (file && i_size_read(file->f_mapping->host) < bytes) { printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", @@ -966,7 +973,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ bitmap->filemap_attr = kzalloc( - roundup( DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), GFP_KERNEL); if (!bitmap->filemap_attr) goto err; @@ -993,10 +1000,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) page = bitmap->sb_page; offset = sizeof(bitmap_super_t); if (!file) - read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index, count); + page = read_sb_page( + bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + page, + index, count); } else if (file) { page = read_page(file, index, bitmap, count); offset = 0; @@ -1021,7 +1029,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (outofdate) { /* * if bitmap is out of date, dirty the - * whole page and write it out + * whole page and write it out */ paddr = kmap_atomic(page, KM_USER0); memset(paddr + offset, 0xff, @@ -1052,7 +1060,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) } } - /* everything went OK */ + /* everything went OK */ ret = 0; bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); @@ -1080,21 +1088,16 @@ void bitmap_write_all(struct bitmap *bitmap) */ int i; - for (i=0; i < bitmap->file_pages; i++) + for (i = 0; i < bitmap->file_pages; i++) 
set_page_attr(bitmap, bitmap->filemap[i], BITMAP_PAGE_NEEDWRITE); } - static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) { sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); unsigned long page = chunk >> PAGE_COUNTER_SHIFT; bitmap->bp[page].count += inc; -/* - if (page == 0) printk("count page 0, offset %llu: %d gives %d\n", - (unsigned long long)offset, inc, bitmap->bp[page].count); -*/ bitmap_checkfree(bitmap, page); } static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, @@ -1114,6 +1117,7 @@ void bitmap_daemon_work(mddev_t *mddev) struct page *page = NULL, *lastpage = NULL; int blocks; void *paddr; + struct dm_dirty_log *log = mddev->bitmap_info.log; /* Use a mutex to guard daemon_work against * bitmap_destroy. @@ -1138,11 +1142,12 @@ void bitmap_daemon_work(mddev_t *mddev) spin_lock_irqsave(&bitmap->lock, flags); for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; - if (!bitmap->filemap) - /* error or shutdown */ - break; - - page = filemap_get_page(bitmap, j); + if (!bitmap->filemap) { + if (!log) + /* error or shutdown */ + break; + } else + page = filemap_get_page(bitmap, j); if (page != lastpage) { /* skip this page unless it's marked as needing cleaning */ @@ -1197,14 +1202,11 @@ void bitmap_daemon_work(mddev_t *mddev) (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), &blocks, 0); if (bmc) { -/* - if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc); -*/ if (*bmc) bitmap->allclean = 0; if (*bmc == 2) { - *bmc=1; /* maybe clear the bit next time */ + *bmc = 1; /* maybe clear the bit next time */ set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } else if (*bmc == 1 && !bitmap->need_sync) { /* we can clear the bit */ @@ -1214,14 +1216,17 @@ void bitmap_daemon_work(mddev_t *mddev) -1); /* clear the bit */ - paddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(bitmap, j), - paddr); - else - ext2_clear_bit(file_page_offset(bitmap, j), - paddr); - kunmap_atomic(paddr, KM_USER0); + if (page) { + paddr = kmap_atomic(page, KM_USER0); + if (bitmap->flags & BITMAP_HOSTENDIAN) + clear_bit(file_page_offset(bitmap, j), + paddr); + else + ext2_clear_bit(file_page_offset(bitmap, j), + paddr); + kunmap_atomic(paddr, KM_USER0); + } else + log->type->clear_region(log, j); } } else j |= PAGE_COUNTER_MASK; @@ -1229,12 +1234,16 @@ void bitmap_daemon_work(mddev_t *mddev) spin_unlock_irqrestore(&bitmap->lock, flags); /* now sync the final page */ - if (lastpage != NULL) { + if (lastpage != NULL || log != NULL) { spin_lock_irqsave(&bitmap->lock, flags); if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); + if (lastpage) + write_page(bitmap, lastpage, 0); + else + if (log->type->flush(log)) + bitmap->flags |= BITMAP_WRITE_ERROR; } else { set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); @@ -1243,7 +1252,7 @@ void bitmap_daemon_work(mddev_t *mddev) done: if (bitmap->allclean == 0) - bitmap->mddev->thread->timeout = + bitmap->mddev->thread->timeout = bitmap->mddev->bitmap_info.daemon_sleep; mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1262,39 +1271,48 @@ __acquires(bitmap->lock) unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; sector_t csize; + int err; + + err = bitmap_checkpage(bitmap, page, create); - if 
(bitmap_checkpage(bitmap, page, create) < 0) { + if (bitmap->bp[page].hijacked || + bitmap->bp[page].map == NULL) + csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + + PAGE_COUNTER_SHIFT - 1); + else csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); - *blocks = csize - (offset & (csize- 1)); + *blocks = csize - (offset & (csize - 1)); + + if (err < 0) return NULL; - } + /* now locked ... */ if (bitmap->bp[page].hijacked) { /* hijacked pointer */ /* should we use the first or second counter field * of the hijacked pointer? */ int hi = (pageoff > PAGE_COUNTER_MASK); - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + - PAGE_COUNTER_SHIFT - 1); - *blocks = csize - (offset & (csize- 1)); return &((bitmap_counter_t *) &bitmap->bp[page].map)[hi]; - } else { /* page is allocated */ - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); - *blocks = csize - (offset & (csize- 1)); + } else /* page is allocated */ return (bitmap_counter_t *) &(bitmap->bp[page].map[pageoff]); - } } int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) { - if (!bitmap) return 0; + if (!bitmap) + return 0; if (behind) { + int bw; atomic_inc(&bitmap->behind_writes); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", - atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + bw, bitmap->max_write_behind); } while (sectors) { @@ -1317,17 +1335,16 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect prepare_to_wait(&bitmap->overflow_wait, &__wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&bitmap->lock); - blk_unplug(bitmap->mddev->queue); + md_unplug(bitmap->mddev); schedule(); finish_wait(&bitmap->overflow_wait, &__wait); continue; } - switch(*bmc) { + switch (*bmc) { case 0: bitmap_file_set_bit(bitmap, offset); - bitmap_count_page(bitmap,offset, 1); - blk_plug_device_unlocked(bitmap->mddev->queue); + bitmap_count_page(bitmap, offset, 1); /* fall through */ case 1: *bmc = 2; @@ -1340,18 +1357,22 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect offset += blocks; if (sectors > blocks) sectors -= blocks; - else sectors = 0; + else + sectors = 0; } bitmap->allclean = 0; return 0; } +EXPORT_SYMBOL(bitmap_startwrite); void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int success, int behind) { - if (!bitmap) return; + if (!bitmap) + return; if (behind) { - atomic_dec(&bitmap->behind_writes); + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); } @@ -1375,7 +1396,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; - sysfs_notify_dirent(bitmap->sysfs_can_clear); + sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); } if (!success && ! 
(*bmc & NEEDED_MASK)) @@ -1385,18 +1406,22 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto wake_up(&bitmap->overflow_wait); (*bmc)--; - if (*bmc <= 2) { + if (*bmc <= 2) set_page_attr(bitmap, - filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), + filemap_get_page( + bitmap, + offset >> CHUNK_BLOCK_SHIFT(bitmap)), BITMAP_PAGE_CLEAN); - } + spin_unlock_irqrestore(&bitmap->lock, flags); offset += blocks; if (sectors > blocks) sectors -= blocks; - else sectors = 0; + else + sectors = 0; } } +EXPORT_SYMBOL(bitmap_endwrite); static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded) @@ -1449,14 +1474,14 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, } return rv; } +EXPORT_SYMBOL(bitmap_start_sync); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) { bitmap_counter_t *bmc; unsigned long flags; -/* - if (offset == 0) printk("bitmap_end_sync 0 (%d)\n", aborted); -*/ if (bitmap == NULL) { + + if (bitmap == NULL) { *blocks = 1024; return; } @@ -1465,26 +1490,23 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int ab if (bmc == NULL) goto unlock; /* locked */ -/* - if (offset == 0) printk("bitmap_end sync found 0x%x, blocks %d\n", *bmc, *blocks); -*/ if (RESYNC(*bmc)) { *bmc &= ~RESYNC_MASK; if (!NEEDED(*bmc) && aborted) *bmc |= NEEDED_MASK; else { - if (*bmc <= 2) { + if (*bmc <= 2) set_page_attr(bitmap, filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), BITMAP_PAGE_CLEAN); - } } } unlock: spin_unlock_irqrestore(&bitmap->lock, flags); bitmap->allclean = 0; } +EXPORT_SYMBOL(bitmap_end_sync); void bitmap_close_sync(struct bitmap *bitmap) { @@ -1501,6 +1523,7 @@ void bitmap_close_sync(struct bitmap *bitmap) sector += blocks; } } +EXPORT_SYMBOL(bitmap_close_sync); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) { @@ -1530,6 +1553,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->last_end_sync = jiffies; sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); } +EXPORT_SYMBOL(bitmap_cond_end_sync); static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { @@ -1546,9 +1570,9 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n spin_unlock_irq(&bitmap->lock); return; } - if (! *bmc) { + if (!*bmc) { struct page *page; - *bmc = 1 | (needed?NEEDED_MASK:0); + *bmc = 1 | (needed ? 
NEEDED_MASK : 0); bitmap_count_page(bitmap, offset, 1); page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); @@ -1657,15 +1681,17 @@ int bitmap_create(mddev_t *mddev) unsigned long pages; struct file *file = mddev->bitmap_info.file; int err; - sector_t start; - struct sysfs_dirent *bm; + struct sysfs_dirent *bm = NULL; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); - if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ + if (!file + && !mddev->bitmap_info.offset + && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */ return 0; BUG_ON(file && mddev->bitmap_info.offset); + BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log); bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) @@ -1675,12 +1701,14 @@ int bitmap_create(mddev_t *mddev) atomic_set(&bitmap->pending_writes, 0); init_waitqueue_head(&bitmap->write_wait); init_waitqueue_head(&bitmap->overflow_wait); + init_waitqueue_head(&bitmap->behind_wait); bitmap->mddev = mddev; - bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); + if (mddev->kobj.sd) + bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); if (bm) { - bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); + bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear"); sysfs_put(bm); } else bitmap->sysfs_can_clear = NULL; @@ -1692,7 +1720,7 @@ int bitmap_create(mddev_t *mddev) * and bypass the page cache, we must sync the file * first. */ - vfs_fsync(file, file->f_dentry, 1); + vfs_fsync(file, 1); } /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ if (!mddev->bitmap_info.external) @@ -1712,9 +1740,9 @@ int bitmap_create(mddev_t *mddev) bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); /* now that chunksize and chunkshift are set, we can use these macros */ - chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> + chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> CHUNK_BLOCK_SHIFT(bitmap); - pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; + pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; BUG_ON(!pages); @@ -1734,27 +1762,11 @@ int bitmap_create(mddev_t *mddev) if (!bitmap->bp) goto error; - /* now that we have some pages available, initialize the in-memory - * bitmap from the on-disk bitmap */ - start = 0; - if (mddev->degraded == 0 - || bitmap->events_cleared == mddev->events) - /* no need to keep dirty bits to optimise a re-add of a missing device */ - start = mddev->recovery_cp; - err = bitmap_init_from_disk(bitmap, start); - - if (err) - goto error; - printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", pages, bmname(bitmap)); mddev->bitmap = bitmap; - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; - md_wakeup_thread(mddev->thread); - - bitmap_update_sb(bitmap); return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; @@ -1763,15 +1775,69 @@ int bitmap_create(mddev_t *mddev) return err; } +int bitmap_load(mddev_t *mddev) +{ + int err = 0; + sector_t sector = 0; + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + goto out; + + /* Clear out old bitmap info first: Either there is none, or we + * are resuming after someone else has possibly changed things, + * so we should forget old cached info. + * All chunks should be clean, but some might need_sync. 
+ */ + while (sector < mddev->resync_max_sectors) { + int blocks; + bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + bitmap_close_sync(bitmap); + + if (mddev->bitmap_info.log) { + unsigned long i; + struct dm_dirty_log *log = mddev->bitmap_info.log; + for (i = 0; i < bitmap->chunks; i++) + if (!log->type->in_sync(log, i, 1)) + bitmap_set_memory_bits(bitmap, + (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), + 1); + } else { + sector_t start = 0; + if (mddev->degraded == 0 + || bitmap->events_cleared == mddev->events) + /* no need to keep dirty bits to optimise a + * re-add of a missing device */ + start = mddev->recovery_cp; + + err = bitmap_init_from_disk(bitmap, start); + } + if (err) + goto out; + + mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + md_wakeup_thread(mddev->thread); + + bitmap_update_sb(bitmap); + + if (bitmap->flags & BITMAP_WRITE_ERROR) + err = -EIO; +out: + return err; +} +EXPORT_SYMBOL_GPL(bitmap_load); + static ssize_t location_show(mddev_t *mddev, char *page) { ssize_t len; - if (mddev->bitmap_info.file) { + if (mddev->bitmap_info.file) len = sprintf(page, "file"); - } else if (mddev->bitmap_info.offset) { + else if (mddev->bitmap_info.offset) len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); - } else + else len = sprintf(page, "none"); len += sprintf(page+len, "\n"); return len; @@ -1860,7 +1926,7 @@ timeout_show(mddev_t *mddev, char *page) ssize_t len; unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; - + len = sprintf(page, "%lu", secs); if (jifs) len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); @@ -2006,6 +2072,27 @@ static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len) static struct md_sysfs_entry bitmap_can_clear = __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); +static ssize_t +behind_writes_used_show(mddev_t *mddev, char *page) +{ + if (mddev->bitmap == NULL) + return sprintf(page, "0\n"); + return sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); +} + +static ssize_t +behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap) + mddev->bitmap->behind_writes_used = 0; + return len; +} + +static struct md_sysfs_entry max_backlog_used = +__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, + behind_writes_used_show, behind_writes_used_reset); + static struct attribute *md_bitmap_attrs[] = { &bitmap_location.attr, &bitmap_timeout.attr, @@ -2013,6 +2100,7 @@ static struct attribute *md_bitmap_attrs[] = { &bitmap_chunksize.attr, &bitmap_metadata.attr, &bitmap_can_clear.attr, + &max_backlog_used.attr, NULL }; struct attribute_group md_bitmap_group = { @@ -2020,12 +2108,3 @@ struct attribute_group md_bitmap_group = { .attrs = md_bitmap_attrs, }; - -/* the bitmap API -- for raid personalities */ -EXPORT_SYMBOL(bitmap_startwrite); -EXPORT_SYMBOL(bitmap_endwrite); -EXPORT_SYMBOL(bitmap_start_sync); -EXPORT_SYMBOL(bitmap_end_sync); -EXPORT_SYMBOL(bitmap_unplug); -EXPORT_SYMBOL(bitmap_close_sync); -EXPORT_SYMBOL(bitmap_cond_end_sync); diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index cb821d76d1b4..e872a7bad6b8 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -222,11 +222,16 @@ struct bitmap { unsigned long file_pages; /* number of pages in the file */ int last_page_size; /* bytes in the last page */ + unsigned long logattrs; /* used when filemap_attr doesn't exist + * because we are working with a dirty_log + */ + unsigned long flags; int allclean; 
atomic_t behind_writes; + unsigned long behind_writes_used; /* highest actual value at runtime */ /* * the bitmap daemon - periodically wakes up and sweeps the bitmap @@ -239,14 +244,17 @@ struct bitmap { atomic_t pending_writes; /* pending writes to the bitmap file */ wait_queue_head_t write_wait; wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; struct sysfs_dirent *sysfs_can_clear; + }; /* the bitmap API */ /* these are used only by md/bitmap */ int bitmap_create(mddev_t *mddev); +int bitmap_load(mddev_t *mddev); void bitmap_flush(mddev_t *mddev); void bitmap_destroy(mddev_t *mddev); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a93637223c8d..d5b0e4c0e702 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -107,11 +107,10 @@ struct crypt_config { struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; - /* - * crypto related data - */ + char *cipher; + char *cipher_mode; + struct crypt_iv_operations *iv_gen_ops; - char *iv_mode; union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; @@ -135,8 +134,6 @@ struct crypt_config { unsigned int dmreq_start; struct ablkcipher_request *req; - char cipher[CRYPTO_MAX_ALG_NAME]; - char chainmode[CRYPTO_MAX_ALG_NAME]; struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; @@ -999,82 +996,135 @@ static int crypt_wipe_key(struct crypt_config *cc) return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); } -/* - * Construct an encryption mapping: - * <cipher> <key> <iv_offset> <dev_path> <start> - */ -static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +static void crypt_dtr(struct dm_target *ti) { - struct crypt_config *cc; - struct crypto_ablkcipher *tfm; - char *tmp; - char *cipher; - char *chainmode; - char *ivmode; - char *ivopts; - unsigned int key_size; - unsigned long long tmpll; + struct crypt_config *cc = ti->private; - if (argc != 5) { - ti->error = "Not enough arguments"; + ti->private = NULL; + + if (!cc) + return; + + if (cc->io_queue) + destroy_workqueue(cc->io_queue); + if (cc->crypt_queue) + destroy_workqueue(cc->crypt_queue); + + if (cc->bs) + bioset_free(cc->bs); + + if (cc->page_pool) + mempool_destroy(cc->page_pool); + if (cc->req_pool) + mempool_destroy(cc->req_pool); + if (cc->io_pool) + mempool_destroy(cc->io_pool); + + if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) + cc->iv_gen_ops->dtr(cc); + + if (cc->tfm && !IS_ERR(cc->tfm)) + crypto_free_ablkcipher(cc->tfm); + + if (cc->dev) + dm_put_device(ti, cc->dev); + + kzfree(cc->cipher); + kzfree(cc->cipher_mode); + + /* Must zero key material before freeing */ + kzfree(cc); +} + +static int crypt_ctr_cipher(struct dm_target *ti, + char *cipher_in, char *key) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts; + char *cipher_api = NULL; + int ret = -EINVAL; + + /* Convert to crypto api definition? 
*/ + if (strchr(cipher_in, '(')) { + ti->error = "Bad cipher specification"; return -EINVAL; } - tmp = argv[0]; + /* + * Legacy dm-crypt cipher specification + * cipher-mode-iv:ivopts + */ + tmp = cipher_in; cipher = strsep(&tmp, "-"); + + cc->cipher = kstrdup(cipher, GFP_KERNEL); + if (!cc->cipher) + goto bad_mem; + + if (tmp) { + cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); + if (!cc->cipher_mode) + goto bad_mem; + } + chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); if (tmp) - DMWARN("Unexpected additional cipher options"); - - key_size = strlen(argv[1]) >> 1; - - cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); - if (cc == NULL) { - ti->error = - "Cannot allocate transparent encryption context"; - return -ENOMEM; - } + DMWARN("Ignoring unexpected additional cipher options"); - /* Compatibility mode for old dm-crypt cipher strings */ - if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { + /* Compatibility mode for old dm-crypt mappings */ + if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { + kfree(cc->cipher_mode); + cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); chainmode = "cbc"; ivmode = "plain"; } if (strcmp(chainmode, "ecb") && !ivmode) { - ti->error = "This chaining mode requires an IV mechanism"; - goto bad_cipher; + ti->error = "IV mechanism required"; + return -EINVAL; } - if (snprintf(cc->cipher, CRYPTO_MAX_ALG_NAME, "%s(%s)", - chainmode, cipher) >= CRYPTO_MAX_ALG_NAME) { - ti->error = "Chain mode + cipher name is too long"; - goto bad_cipher; + cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); + if (!cipher_api) + goto bad_mem; + + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "%s(%s)", chainmode, cipher); + if (ret < 0) { + kfree(cipher_api); + goto bad_mem; } - tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); - if (IS_ERR(tfm)) { + /* Allocate cipher */ + cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); + if (IS_ERR(cc->tfm)) { + ret = PTR_ERR(cc->tfm); ti->error = "Error allocating crypto tfm"; - goto bad_cipher; + goto bad; } - strcpy(cc->cipher, cipher); - strcpy(cc->chainmode, chainmode); - cc->tfm = tfm; - - if (crypt_set_key(cc, argv[1]) < 0) { + /* Initialize and set key */ + ret = crypt_set_key(cc, key); + if (ret < 0) { ti->error = "Error decoding and setting key"; - goto bad_ivmode; + goto bad; } - /* - * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". - * See comments at iv code - */ + /* Initialize IV */ + cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); + if (cc->iv_size) + /* at least a 64 bit sector number should fit in our buffer */ + cc->iv_size = max(cc->iv_size, + (unsigned int)(sizeof(u64) / sizeof(u8))); + else if (ivmode) { + DMWARN("Selected cipher does not support IVs"); + ivmode = NULL; + } + /* Choose ivmode, see comments at iv code. 
*/ if (ivmode == NULL) cc->iv_gen_ops = NULL; else if (strcmp(ivmode, "plain") == 0) @@ -1088,160 +1138,138 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; else { + ret = -EINVAL; ti->error = "Invalid IV mode"; - goto bad_ivmode; + goto bad; } - if (cc->iv_gen_ops && cc->iv_gen_ops->ctr && - cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) - goto bad_ivmode; - - if (cc->iv_gen_ops && cc->iv_gen_ops->init && - cc->iv_gen_ops->init(cc) < 0) { - ti->error = "Error initialising IV"; - goto bad_slab_pool; + /* Allocate IV */ + if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { + ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); + if (ret < 0) { + ti->error = "Error creating IV"; + goto bad; + } } - cc->iv_size = crypto_ablkcipher_ivsize(tfm); - if (cc->iv_size) - /* at least a 64 bit sector number should fit in our buffer */ - cc->iv_size = max(cc->iv_size, - (unsigned int)(sizeof(u64) / sizeof(u8))); - else { - if (cc->iv_gen_ops) { - DMWARN("Selected cipher does not support IVs"); - if (cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - cc->iv_gen_ops = NULL; + /* Initialize IV (set keys for ESSIV etc) */ + if (cc->iv_gen_ops && cc->iv_gen_ops->init) { + ret = cc->iv_gen_ops->init(cc); + if (ret < 0) { + ti->error = "Error initialising IV"; + goto bad; } } + ret = 0; +bad: + kfree(cipher_api); + return ret; + +bad_mem: + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; +} + +/* + * Construct an encryption mapping: + * <cipher> <key> <iv_offset> <dev_path> <start> + */ +static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc; + unsigned int key_size; + unsigned long long tmpll; + int ret; + + if (argc != 5) { + ti->error = "Not enough arguments"; + return -EINVAL; + } + + key_size = strlen(argv[1]) >> 1; + + cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + if (!cc) { + ti->error = "Cannot allocate encryption context"; + return -ENOMEM; + } + + ti->private = cc; + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; + + ret = -ENOMEM; cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); if (!cc->io_pool) { ti->error = "Cannot allocate crypt io mempool"; - goto bad_slab_pool; + goto bad; } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + sizeof(struct dm_crypt_request) + cc->iv_size); if (!cc->req_pool) { ti->error = "Cannot allocate crypt request mempool"; - goto bad_req_pool; + goto bad; } cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; - goto bad_page_pool; + goto bad; } cc->bs = bioset_create(MIN_IOS, 0); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; - goto bad_bs; + goto bad; } + ret = -EINVAL; if (sscanf(argv[2], "%llu", &tmpll) != 1) { ti->error = "Invalid iv_offset sector"; - goto bad_device; + goto bad; } cc->iv_offset = tmpll; + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { + ti->error = "Device lookup failed"; + goto bad; + } + if (sscanf(argv[4], "%llu", &tmpll) 
!= 1) { ti->error = "Invalid device sector"; - goto bad_device; + goto bad; } cc->start = tmpll; - if (dm_get_device(ti, argv[3], cc->start, ti->len, - dm_table_get_mode(ti->table), &cc->dev)) { - ti->error = "Device lookup failed"; - goto bad_device; - } - - if (ivmode && cc->iv_gen_ops) { - if (ivopts) - *(ivopts - 1) = ':'; - cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); - if (!cc->iv_mode) { - ti->error = "Error kmallocing iv_mode string"; - goto bad_ivmode_string; - } - strcpy(cc->iv_mode, ivmode); - } else - cc->iv_mode = NULL; - + ret = -ENOMEM; cc->io_queue = create_singlethread_workqueue("kcryptd_io"); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; - goto bad_io_queue; + goto bad; } cc->crypt_queue = create_singlethread_workqueue("kcryptd"); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; - goto bad_crypt_queue; + goto bad; } ti->num_flush_requests = 1; - ti->private = cc; return 0; -bad_crypt_queue: - destroy_workqueue(cc->io_queue); -bad_io_queue: - kfree(cc->iv_mode); -bad_ivmode_string: - dm_put_device(ti, cc->dev); -bad_device: - bioset_free(cc->bs); -bad_bs: - mempool_destroy(cc->page_pool); -bad_page_pool: - mempool_destroy(cc->req_pool); -bad_req_pool: - mempool_destroy(cc->io_pool); -bad_slab_pool: - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); -bad_ivmode: - crypto_free_ablkcipher(tfm); -bad_cipher: - /* Must zero key material before freeing */ - kzfree(cc); - return -EINVAL; -} - -static void crypt_dtr(struct dm_target *ti) -{ - struct crypt_config *cc = (struct crypt_config *) ti->private; - - destroy_workqueue(cc->io_queue); - destroy_workqueue(cc->crypt_queue); - - if (cc->req) - mempool_free(cc->req, cc->req_pool); - - bioset_free(cc->bs); - mempool_destroy(cc->page_pool); - mempool_destroy(cc->req_pool); - mempool_destroy(cc->io_pool); - - kfree(cc->iv_mode); - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - crypto_free_ablkcipher(cc->tfm); - dm_put_device(ti, cc->dev); - - /* Must zero key material before freeing */ - kzfree(cc); +bad: + crypt_dtr(ti); + return ret; } static int crypt_map(struct dm_target *ti, struct bio *bio, @@ -1250,13 +1278,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, struct dm_crypt_io *io; struct crypt_config *cc; - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { cc = ti->private; bio->bi_bdev = cc->dev->bdev; return DM_MAPIO_REMAPPED; } - io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); + io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); if (bio_data_dir(io->base_bio) == READ) kcryptd_queue_io(io); @@ -1269,7 +1297,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, static int crypt_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { - struct crypt_config *cc = (struct crypt_config *) ti->private; + struct crypt_config *cc = ti->private; unsigned int sz = 0; switch (type) { @@ -1278,11 +1306,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->iv_mode) - DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode, - cc->iv_mode); + if (cc->cipher_mode) + DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); else - DMEMIT("%s-%s ", cc->cipher, cc->chainmode); + DMEMIT("%s ", cc->cipher); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) @@ -1379,7 +1406,7 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return max_size; bvm->bi_bdev = 
cc->dev->bdev; - bvm->bi_sector = cc->start + bvm->bi_sector - ti->begin; + bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector); return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index ebe7381f47c8..baa11912cc94 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -156,8 +156,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - if (dm_get_device(ti, argv[0], dc->start_read, ti->len, - dm_table_get_mode(ti->table), &dc->dev_read)) { + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &dc->dev_read)) { ti->error = "Device lookup failed"; goto bad; } @@ -177,8 +177,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_dev_read; } - if (dm_get_device(ti, argv[3], dc->start_write, ti->len, - dm_table_get_mode(ti->table), &dc->dev_write)) { + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), + &dc->dev_write)) { ti->error = "Write device lookup failed"; goto bad_dev_read; } @@ -198,6 +198,7 @@ out: atomic_set(&dc->may_delay, 1); ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = dc; return 0; @@ -281,14 +282,13 @@ static int delay_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = dc->dev_write->bdev; if (bio_sectors(bio)) bio->bi_sector = dc->start_write + - (bio->bi_sector - ti->begin); + dm_target_offset(ti, bio->bi_sector); return delay_bio(dc, dc->write_delay, bio); } bio->bi_bdev = dc->dev_read->bdev; - bio->bi_sector = dc->start_read + - (bio->bi_sector - ti->begin); + bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); return delay_bio(dc, dc->read_delay, bio); } diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 2b7907b6dd09..0bdb201c2c2a 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -173,7 +173,9 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, /* Validate the chunk size against the device block size */ if (chunk_size % - (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) { + (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) || + chunk_size % + (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index e8dfa06af3ba..0b2536247cf5 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -126,8 +126,9 @@ struct dm_exception_store { }; /* - * Obtain the cow device used by a given snapshot. + * Obtain the origin or cow device used by a given snapshot. 
*/ +struct dm_dev *dm_snap_origin(struct dm_snapshot *snap); struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); /* diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 10f457ca6af2..136d4f71a116 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -31,7 +31,6 @@ struct dm_io_client { */ struct io { unsigned long error_bits; - unsigned long eopnotsupp_bits; atomic_t count; struct task_struct *sleeper; struct dm_io_client *client; @@ -130,11 +129,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, *---------------------------------------------------------------*/ static void dec_count(struct io *io, unsigned int region, int error) { - if (error) { + if (error) set_bit(region, &io->error_bits); - if (error == -EOPNOTSUPP) - set_bit(region, &io->eopnotsupp_bits); - } if (atomic_dec_and_test(&io->count)) { if (io->sleeper) @@ -310,8 +306,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, sector_t remaining = where->count; /* - * where->count may be zero if rw holds a write barrier and we - * need to send a zero-sized barrier. + * where->count may be zero if rw holds a flush and we need to + * send a zero-sized flush. */ do { /* @@ -356,7 +352,7 @@ static void dispatch_io(int rw, unsigned int num_regions, BUG_ON(num_regions > DM_IO_MAX_REGIONS); if (sync) - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; /* * For multiple regions we need to be careful to rewind @@ -364,7 +360,7 @@ static void dispatch_io(int rw, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count || (rw & (1 << BIO_RW_BARRIER))) + if (where[i].count || (rw & REQ_FLUSH)) do_region(rw, i, where + i, dp, io); } @@ -393,9 +389,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, return -EIO; } -retry: io->error_bits = 0; - io->eopnotsupp_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = current; io->client = client; @@ -412,11 +406,6 @@ retry: } set_current_state(TASK_RUNNING); - if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { - rw &= ~(1 << BIO_RW_BARRIER); - goto retry; - } - if (error_bits) *error_bits = io->error_bits; @@ -437,7 +426,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; - io->eopnotsupp_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = NULL; io->client = client; @@ -479,8 +467,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp) * New collapsed (a)synchronous interface. * * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in - * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to + * the queue with blk_unplug() some time later or set REQ_SYNC in +io_req->bi_rw. If you fail to do one of these, the IO will be submitted to * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. 
*/ int dm_io(struct dm_io_request *io_req, unsigned num_regions, diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 1d669322b27c..4b54618b4159 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -249,54 +249,66 @@ static void __hash_remove(struct hash_cell *hc) static void dm_hash_remove_all(int keep_open_devices) { - int i, dev_skipped, dev_removed; + int i, dev_skipped; struct hash_cell *hc; - struct list_head *tmp, *n; + struct mapped_device *md; + +retry: + dev_skipped = 0; down_write(&_hash_lock); -retry: - dev_skipped = dev_removed = 0; for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_safe (tmp, n, _name_buckets + i) { - hc = list_entry(tmp, struct hash_cell, name_list); + list_for_each_entry(hc, _name_buckets + i, name_list) { + md = hc->md; + dm_get(md); - if (keep_open_devices && - dm_lock_for_deletion(hc->md)) { + if (keep_open_devices && dm_lock_for_deletion(md)) { + dm_put(md); dev_skipped++; continue; } + __hash_remove(hc); - dev_removed = 1; - } - } - /* - * Some mapped devices may be using other mapped devices, so if any - * still exist, repeat until we make no further progress. - */ - if (dev_skipped) { - if (dev_removed) - goto retry; + up_write(&_hash_lock); - DMWARN("remove_all left %d open device(s)", dev_skipped); + dm_put(md); + if (likely(keep_open_devices)) + dm_destroy(md); + else + dm_destroy_immediate(md); + + /* + * Some mapped devices may be using other mapped + * devices, so repeat until we make no further + * progress. If a new mapped device is created + * here it will also get removed. + */ + goto retry; + } } up_write(&_hash_lock); + + if (dev_skipped) + DMWARN("remove_all left %d open device(s)", dev_skipped); } -static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) +static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, + const char *new) { char *new_name, *old_name; struct hash_cell *hc; struct dm_table *table; + struct mapped_device *md; /* * duplicate new. */ new_name = kstrdup(new, GFP_KERNEL); if (!new_name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -305,24 +317,24 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) */ hc = __get_name_cell(new); if (hc) { - DMWARN("asked to rename to an already existing name %s -> %s", - old, new); + DMWARN("asked to rename to an already-existing name %s -> %s", + param->name, new); dm_put(hc->md); up_write(&_hash_lock); kfree(new_name); - return -EBUSY; + return ERR_PTR(-EBUSY); } /* * Is there such a device as 'old' ? */ - hc = __get_name_cell(old); + hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non existent device %s -> %s", - old, new); + DMWARN("asked to rename a non-existent device %s -> %s", + param->name, new); up_write(&_hash_lock); kfree(new_name); - return -ENXIO; + return ERR_PTR(-ENXIO); } /* @@ -344,12 +356,14 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) dm_table_put(table); } - dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); + if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; - dm_put(hc->md); + md = hc->md; up_write(&_hash_lock); kfree(old_name); - return 0; + + return md; } /*----------------------------------------------------------------- @@ -571,7 +585,7 @@ static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, * Fills in a dm_ioctl structure, ready for sending back to * userland. 
*/ -static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) +static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) { struct gendisk *disk = dm_disk(md); struct dm_table *table; @@ -615,8 +629,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) dm_table_put(table); } } - - return 0; } static int dev_create(struct dm_ioctl *param, size_t param_size) @@ -638,15 +650,17 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); if (r) { dm_put(md); + dm_destroy(md); return r; } param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(md, param); + __dev_status(md, param); + dm_put(md); - return r; + return 0; } /* @@ -736,10 +750,11 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) __hash_remove(hc); up_write(&_hash_lock); - dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; dm_put(md); - param->data_size = 0; + dm_destroy(md); return 0; } @@ -760,6 +775,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; char *new_name = (char *) param + param->data_start; + struct mapped_device *md; if (new_name < param->data || invalid_str(new_name, (void *) param + param_size) || @@ -772,8 +788,14 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) if (r) return r; - param->data_size = 0; - return dm_hash_rename(param->event_nr, param->name, new_name); + md = dm_hash_rename(param, new_name); + if (IS_ERR(md)) + return PTR_ERR(md); + + __dev_status(md, param); + dm_put(md); + + return 0; } static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) @@ -814,8 +836,6 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) geometry.start = indata[3]; r = dm_set_geometry(md, &geometry); - if (!r) - r = __dev_status(md, param); param->data_size = 0; @@ -839,13 +859,17 @@ static int do_suspend(struct dm_ioctl *param) if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended_md(md)) + if (!dm_suspended_md(md)) { r = dm_suspend(md, suspend_flags); + if (r) + goto out; + } - if (!r) - r = __dev_status(md, param); + __dev_status(md, param); +out: dm_put(md); + return r; } @@ -897,16 +921,17 @@ static int do_resume(struct dm_ioctl *param) set_disk_ro(dm_disk(md), 1); } - if (dm_suspended_md(md)) + if (dm_suspended_md(md)) { r = dm_resume(md); + if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + } if (old_map) dm_table_destroy(old_map); - if (!r) { - dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); - r = __dev_status(md, param); - } + if (!r) + __dev_status(md, param); dm_put(md); return r; @@ -930,16 +955,16 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size) */ static int dev_status(struct dm_ioctl *param, size_t param_size) { - int r; struct mapped_device *md; md = find_device(param); if (!md) return -ENXIO; - r = __dev_status(md, param); + __dev_status(md, param); dm_put(md); - return r; + + return 0; } /* @@ -1014,7 +1039,7 @@ static void retrieve_status(struct dm_table *table, */ static int dev_wait(struct dm_ioctl *param, size_t param_size) { - int r; + int r = 0; struct mapped_device *md; struct dm_table *table; @@ -1035,9 +1060,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) * changed to trigger the event, so we may as well tell * 
him and save an ioctl. */ - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1045,8 +1068,9 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } - out: +out: dm_put(md); + return r; } @@ -1107,28 +1131,9 @@ static int populate_table(struct dm_table *table, next = spec->next; } - r = dm_table_set_type(table); - if (r) { - DMWARN("unable to set table type"); - return r; - } - return dm_table_complete(table); } -static int table_prealloc_integrity(struct dm_table *t, - struct mapped_device *md) -{ - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd; - - list_for_each_entry(dd, devices, list) - if (bdev_get_integrity(dd->dm_dev.bdev)) - return blk_integrity_register(dm_disk(md), NULL); - - return 0; -} - static int table_load(struct dm_ioctl *param, size_t param_size) { int r; @@ -1150,21 +1155,30 @@ static int table_load(struct dm_ioctl *param, size_t param_size) goto out; } - r = table_prealloc_integrity(t, md); - if (r) { - DMERR("%s: could not register integrity profile.", - dm_device_name(md)); + /* Protect md->type and md->queue against concurrent table loads. */ + dm_lock_md_type(md); + if (dm_get_md_type(md) == DM_TYPE_NONE) + /* Initial table load: acquire type of table. */ + dm_set_md_type(md, dm_table_get_type(t)); + else if (dm_get_md_type(md) != dm_table_get_type(t)) { + DMWARN("can't change device type after initial table load."); dm_table_destroy(t); + dm_unlock_md_type(md); + r = -EINVAL; goto out; } - r = dm_table_alloc_md_mempools(t); + /* setup md->queue to reflect md's type (may block) */ + r = dm_setup_md_queue(md); if (r) { - DMWARN("unable to allocate mempools for this table"); + DMWARN("unable to set up device queue for new table."); dm_table_destroy(t); + dm_unlock_md_type(md); goto out; } + dm_unlock_md_type(md); + /* stage inactive table */ down_write(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { @@ -1181,7 +1195,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) up_write(&_hash_lock); param->flags |= DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(md, param); + __dev_status(md, param); out: dm_put(md); @@ -1191,7 +1205,6 @@ out: static int table_clear(struct dm_ioctl *param, size_t param_size) { - int r; struct hash_cell *hc; struct mapped_device *md; @@ -1211,11 +1224,12 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(hc->md, param); + __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); dm_put(md); - return r; + + return 0; } /* @@ -1260,7 +1274,6 @@ static void retrieve_deps(struct dm_table *table, static int table_deps(struct dm_ioctl *param, size_t param_size) { - int r = 0; struct mapped_device *md; struct dm_table *table; @@ -1268,9 +1281,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1278,9 +1289,9 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } - out: dm_put(md); - return r; + + return 0; } /* @@ -1289,7 +1300,6 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) */ static int table_status(struct dm_ioctl *param, size_t param_size) { - int r; struct mapped_device *md; struct dm_table *table; @@ -1297,9 +1307,7 @@ static int 
table_status(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); table = dm_get_live_or_inactive_table(md, param); if (table) { @@ -1307,9 +1315,9 @@ static int table_status(struct dm_ioctl *param, size_t param_size) dm_table_put(table); } -out: dm_put(md); - return r; + + return 0; } /* @@ -1328,10 +1336,6 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; - if (tmsg < (struct dm_target_msg *) param->data || invalid_str(tmsg->message, (void *) param + param_size)) { DMWARN("Invalid target message parameters."); @@ -1476,6 +1480,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) { /* Always clear this flag */ param->flags &= ~DM_BUFFER_FULL_FLAG; + param->flags &= ~DM_UEVENT_GENERATED_FLAG; /* Ignores parameters */ if (cmd == DM_REMOVE_ALL_CMD || @@ -1587,18 +1592,23 @@ static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) #endif static const struct file_operations _ctl_fops = { + .open = nonseekable_open, .unlocked_ioctl = dm_ctl_ioctl, .compat_ioctl = dm_compat_ctl_ioctl, .owner = THIS_MODULE, + .llseek = noop_llseek, }; static struct miscdevice _dm_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = MAPPER_CTRL_MINOR, .name = DM_NAME, - .nodename = "mapper/control", + .nodename = DM_DIR "/" DM_CONTROL_NODE, .fops = &_ctl_fops }; +MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR); +MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE); + /* * Create misc character device and link to DM_DIR/control. */ diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index addf83475040..d8587bac5682 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -345,7 +345,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG), + .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 82f7d6e6b1ea..3921e3bb43c1 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -47,13 +47,13 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) } lc->start = tmp; - if (dm_get_device(ti, argv[0], lc->start, ti->len, - dm_table_get_mode(ti->table), &lc->dev)) { + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) { ti->error = "dm-linear: Device lookup failed"; goto bad; } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = lc; return 0; @@ -74,7 +74,7 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { struct linear_c *lc = ti->private; - return lc->start + (bi_sector - ti->begin); + return lc->start + dm_target_offset(ti, bi_sector); } static void linear_map_bio(struct dm_target *ti, struct bio *bio) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 7ac2c1450d10..1ed0094f064b 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -5,6 +5,7 @@ */ #include <linux/bio.h> +#include <linux/slab.h> #include <linux/dm-dirty-log.h> #include <linux/device-mapper.h> #include <linux/dm-log-userspace.h> diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 54abf9e303b7..075cbcf8a9f5 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ 
b/drivers/md/dm-log-userspace-transfer.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/module.h> +#include <linux/slab.h> #include <net/sock.h> #include <linux/workqueue.h> #include <linux/connector.h> @@ -172,11 +173,15 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, { int r = 0; size_t dummy = 0; - int overhead_size = - sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); + int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); struct dm_ulog_request *tfr = prealloced_ulog_tfr; struct receiving_pkg pkg; + /* + * Given the space needed to hold the 'struct cn_msg' and + * 'struct dm_ulog_request' - do we have enough payload + * space remaining? + */ if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { DMINFO("Size of tfr exceeds preallocated size"); return -EINVAL; @@ -191,7 +196,7 @@ resend: */ mutex_lock(&dm_ulog_lock); - memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); + memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 7035582786fb..33420e68d153 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc) .count = 0, }; - lc->io_req.bi_rw = WRITE_BARRIER; + lc->io_req.bi_rw = WRITE_FLUSH; return dm_io(&lc->io_req, 1, &null_location, NULL); } @@ -543,8 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, return -EINVAL; } - r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, - FMODE_READ | FMODE_WRITE, &dev); + r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev); if (r) return r; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e81345a1d08f..487ecda90ad4 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -69,6 +69,7 @@ struct multipath { struct list_head priority_groups; unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; @@ -95,8 +96,6 @@ struct multipath { mempool_t *mpio_pool; struct mutex work_mutex; - - unsigned suspended; /* Don't create new I/O internally when set. 
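Worth spelling out from the dm_consult_userspace() hunk above: the old overhead_size used sizeof(struct dm_ulog_request *), i.e. the size of a pointer, so the overhead was under-counted and the payload-space check could accept requests that do not actually fit in the preallocated buffer; the fix measures the structure itself. The difference in a tiny stand-alone illustration (struct example is a stand-in, not a dm type):

#include <stdio.h>

struct example {
        char payload[64];
};

int main(void)
{
        printf("%zu\n", sizeof(struct example *)); /* 4 or 8: size of a pointer */
        printf("%zu\n", sizeof(struct example));   /* 64: size of the structure */
        return 0;
}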
*/ }; /* @@ -202,6 +201,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m->queue_io = 1; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); + init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { @@ -235,6 +235,21 @@ static void free_multipath(struct multipath *m) * Path selection *-----------------------------------------------*/ +static void __pg_init_all_paths(struct multipath *m) +{ + struct pgpath *pgpath; + + m->pg_init_count++; + m->pg_init_required = 0; + list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { + /* Skip failed paths */ + if (!pgpath->is_active) + continue; + if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + m->pg_init_in_progress++; + } +} + static void __switch_pg(struct multipath *m, struct pgpath *pgpath) { m->current_pg = pgpath->pg; @@ -439,7 +454,7 @@ static void process_queued_ios(struct work_struct *work) { struct multipath *m = container_of(work, struct multipath, process_queued_ios); - struct pgpath *pgpath = NULL, *tmp; + struct pgpath *pgpath = NULL; unsigned must_queue = 1; unsigned long flags; @@ -457,14 +472,9 @@ static void process_queued_ios(struct work_struct *work) (!pgpath && !m->queue_if_no_path)) must_queue = 0; - if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { - m->pg_init_count++; - m->pg_init_required = 0; - list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { - if (queue_work(kmpath_handlerd, &tmp->activate_path)) - m->pg_init_in_progress++; - } - } + if (m->pg_init_required && !m->pg_init_in_progress && pgpath) + __pg_init_all_paths(m); + out: spin_unlock_irqrestore(&m->lock, flags); if (!must_queue) @@ -597,8 +607,8 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, if (!p) return ERR_PTR(-ENOMEM); - r = dm_get_device(ti, shift(as), ti->begin, ti->len, - dm_table_get_mode(ti->table), &p->path.dev); + r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), + &p->path.dev); if (r) { ti->error = "error getting device"; goto bad; @@ -696,6 +706,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, if (as->argc < nr_params) { ti->error = "not enough path parameters"; + r = -EINVAL; goto bad; } @@ -882,6 +893,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; return 0; @@ -890,9 +902,34 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, return r; } -static void flush_multipath_work(void) +static void multipath_wait_for_pg_init_completion(struct multipath *m) +{ + DECLARE_WAITQUEUE(wait, current); + unsigned long flags; + + add_wait_queue(&m->pg_init_wait, &wait); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock_irqsave(&m->lock, flags); + if (!m->pg_init_in_progress) { + spin_unlock_irqrestore(&m->lock, flags); + break; + } + spin_unlock_irqrestore(&m->lock, flags); + + io_schedule(); + } + set_current_state(TASK_RUNNING); + + remove_wait_queue(&m->pg_init_wait, &wait); +} + +static void flush_multipath_work(struct multipath *m) { flush_workqueue(kmpath_handlerd); + multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); flush_scheduled_work(); } @@ -901,7 +938,7 @@ static void multipath_dtr(struct dm_target *ti) { struct multipath *m = ti->private; - flush_multipath_work(); + flush_multipath_work(m); free_multipath(m); } @@ -1128,8 +1165,7 @@ static 
int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) static void pg_init_done(void *data, int errors) { - struct dm_path *path = data; - struct pgpath *pgpath = path_to_pgpath(path); + struct pgpath *pgpath = data; struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; @@ -1143,8 +1179,8 @@ static void pg_init_done(void *data, int errors) errors = 0; break; } - DMERR("Cannot failover device because scsi_dh_%s was not " - "loaded.", m->hw_handler_name); + DMERR("Could not failover the device: Handler scsi_dh_%s " + "Error %d.", m->hw_handler_name, errors); /* * Fail path for now, so we do not ping pong */ @@ -1181,14 +1217,24 @@ static void pg_init_done(void *data, int errors) m->current_pgpath = NULL; m->current_pg = NULL; } - } else if (!m->pg_init_required) { - m->queue_io = 0; + } else if (!m->pg_init_required) pg->bypassed = 0; - } - m->pg_init_in_progress--; - if (!m->pg_init_in_progress) - queue_work(kmultipathd, &m->process_queued_ios); + if (--m->pg_init_in_progress) + /* Activations of other paths are still on going */ + goto out; + + if (!m->pg_init_required) + m->queue_io = 0; + + queue_work(kmultipathd, &m->process_queued_ios); + + /* + * Wake up any thread waiting to suspend. + */ + wake_up(&m->pg_init_wait); + +out: spin_unlock_irqrestore(&m->lock, flags); } @@ -1198,7 +1244,7 @@ static void activate_path(struct work_struct *work) container_of(work, struct pgpath, activate_path); scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), - pg_init_done, &pgpath->path); + pg_init_done, pgpath); } /* @@ -1227,6 +1273,15 @@ static int do_end_io(struct multipath *m, struct request *clone, if (error == -EOPNOTSUPP) return error; + if (clone->cmd_flags & REQ_DISCARD) + /* + * Pass all discard request failures up. + * FIXME: only fail_path if the discard failed due to a + * transport problem. This requires precise understanding + * of the underlying failure (e.g. the SCSI sense). 
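The pg_init synchronisation added above boils down to a counter plus a waitqueue: pg_init_done() drops pg_init_in_progress and wakes pg_init_wait once it reaches zero, while the suspend path sleeps until that happens. A stripped-down sketch of that handshake (hypothetical names; the real code also samples the counter under m->lock and uses io_schedule()):

#include <linux/wait.h>

struct pginit_demo {
        wait_queue_head_t wait;
        unsigned in_progress;
};

static void pginit_demo_init(struct pginit_demo *d, unsigned count)
{
        init_waitqueue_head(&d->wait);
        d->in_progress = count;
}

/* One path activation finished (cf. pg_init_done()). */
static void pginit_demo_done(struct pginit_demo *d)
{
        if (!--d->in_progress)
                wake_up(&d->wait);      /* last completion releases the waiter */
}

/* Suspend-side wait (cf. multipath_wait_for_pg_init_completion()). */
static void pginit_demo_wait(struct pginit_demo *d)
{
        wait_event(d->wait, !d->in_progress);
}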
+ */ + return error; + if (mpio->pgpath) fail_path(mpio->pgpath); @@ -1276,8 +1331,7 @@ static void multipath_postsuspend(struct dm_target *ti) struct multipath *m = ti->private; mutex_lock(&m->work_mutex); - m->suspended = 1; - flush_multipath_work(); + flush_multipath_work(m); mutex_unlock(&m->work_mutex); } @@ -1289,10 +1343,6 @@ static void multipath_resume(struct dm_target *ti) struct multipath *m = (struct multipath *) ti->private; unsigned long flags; - mutex_lock(&m->work_mutex); - m->suspended = 0; - mutex_unlock(&m->work_mutex); - spin_lock_irqsave(&m->lock, flags); m->queue_if_no_path = m->saved_queue_if_no_path; spin_unlock_irqrestore(&m->lock, flags); @@ -1428,11 +1478,6 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) mutex_lock(&m->work_mutex); - if (m->suspended) { - r = -EBUSY; - goto out; - } - if (dm_suspended(ti)) { r = -EBUSY; goto out; @@ -1471,8 +1516,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) goto out; } - r = dm_get_device(ti, argv[1], ti->begin, ti->len, - dm_table_get_mode(ti->table), &dev); + r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); if (r) { DMWARN("message: error getting device %s", argv[1]); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ad779bd13aec..19a59b041c27 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -259,7 +259,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_region io[ms->nr_mirrors]; struct mirror *m; struct dm_io_request io_req = { - .bi_rw = WRITE_BARRIER, + .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.bvec = NULL, .client = ms->io_client, @@ -445,7 +445,7 @@ static sector_t map_sector(struct mirror *m, struct bio *bio) { if (unlikely(!bio->bi_size)) return 0; - return m->offset + (bio->bi_sector - m->ms->ti->begin); + return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); } static void map_bio(struct mirror *m, struct bio *bio) @@ -465,9 +465,17 @@ static void map_region(struct dm_io_region *io, struct mirror *m, static void hold_bio(struct mirror_set *ms, struct bio *bio) { /* - * If device is suspended, complete the bio. + * Lock is required to avoid race condition during suspend + * process. */ + spin_lock_irq(&ms->lock); + if (atomic_read(&ms->suspend)) { + spin_unlock_irq(&ms->lock); + + /* + * If device is suspended, complete the bio. + */ if (dm_noflush_suspending(ms->ti)) bio_endio(bio, DM_ENDIO_REQUEUE); else @@ -478,7 +486,6 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio) /* * Hold bio until the suspend is complete. */ - spin_lock_irq(&ms->lock); bio_list_add(&ms->holds, bio); spin_unlock_irq(&ms->lock); } @@ -622,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct dm_io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { - .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), + .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), .mem.type = DM_IO_BVEC, .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, .notify.fn = write_callback, @@ -663,7 +670,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { bio_list_add(&sync, bio); continue; } @@ -724,7 +731,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) /* * Dispatch io. 
*/ - if (unlikely(ms->log_failure)) { + if (unlikely(ms->log_failure) && errors_handled(ms)) { spin_lock_irq(&ms->lock); bio_list_merge(&ms->failures, &sync); spin_unlock_irq(&ms->lock); @@ -737,9 +744,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) dm_rh_delay(ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - if (unlikely(ms->leg_failure) && errors_handled(ms)) - hold_bio(ms, bio); - else { + if (unlikely(ms->leg_failure) && errors_handled(ms)) { + spin_lock_irq(&ms->lock); + bio_list_add(&ms->failures, bio); + spin_unlock_irq(&ms->lock); + wakeup_mirrord(ms); + } else { map_bio(get_default_mirror(ms), bio); generic_make_request(bio); } @@ -917,8 +927,7 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, return -EINVAL; } - if (dm_get_device(ti, argv[0], offset, ti->len, - dm_table_get_mode(ti->table), + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ms->mirror[mirror].dev)) { ti->error = "Device lookup failure"; return -ENXIO; @@ -1194,7 +1203,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * We need to dec pending if this was a write. */ if (rw == WRITE) { - if (likely(!bio_empty_barrier(bio))) + if (!(bio->bi_rw & REQ_FLUSH)) dm_rh_dec(ms->rh, map_context->ll); return error; } @@ -1202,7 +1211,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, if (error == -EOPNOTSUPP) goto out; - if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) + if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) goto out; if (unlikely(error)) { @@ -1259,6 +1268,20 @@ static void mirror_presuspend(struct dm_target *ti) atomic_set(&ms->suspend, 1); /* + * Process bios in the hold list to start recovery waiting + * for bios in the hold list. After the process, no bio has + * a chance to be added in the hold list because ms->suspend + * is set. + */ + spin_lock_irq(&ms->lock); + holds = ms->holds; + bio_list_init(&ms->holds); + spin_unlock_irq(&ms->lock); + + while ((bio = bio_list_pop(&holds))) + hold_bio(ms, bio); + + /* * We must finish up all the work that we've * generated (i.e. recovery work). */ @@ -1278,22 +1301,6 @@ static void mirror_presuspend(struct dm_target *ti) * we know that all of our I/O has been pushed. */ flush_workqueue(ms->kmirrord_wq); - - /* - * Now set ms->suspend is set and the workqueue flushed, no more - * entries can be added to ms->hold list, so process it. - * - * Bios can still arrive concurrently with or after this - * presuspend function, but they cannot join the hold list - * because ms->suspend is set. - */ - spin_lock_irq(&ms->lock); - holds = ms->holds; - bio_list_init(&ms->holds); - spin_unlock_irq(&ms->lock); - - while ((bio = bio_list_pop(&holds))) - hold_bio(ms, bio); } static void mirror_postsuspend(struct dm_target *ti) diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 5f19ceb6fe91..dad011aed0c9 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -11,6 +11,7 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include "dm.h" @@ -80,9 +81,9 @@ struct dm_region_hash { struct list_head failed_recovered_regions; /* - * If there was a barrier failure no regions can be marked clean. + * If there was a flush failure no regions can be marked clean. 
*/ - int barrier_failure; + int flush_failure; void *context; sector_t target_begin; @@ -216,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create( INIT_LIST_HEAD(&rh->quiesced_regions); INIT_LIST_HEAD(&rh->recovered_regions); INIT_LIST_HEAD(&rh->failed_recovered_regions); - rh->barrier_failure = 0; + rh->flush_failure = 0; rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, sizeof(struct dm_region)); @@ -398,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) region_t region = dm_rh_bio_to_region(rh, bio); int recovering = 0; - if (bio_empty_barrier(bio)) { - rh->barrier_failure = 1; + if (bio->bi_rw & REQ_FLUSH) { + rh->flush_failure = 1; return; } @@ -523,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio_empty_barrier(bio)) + if (bio->bi_rw & REQ_FLUSH) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } @@ -554,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) */ /* do nothing for DM_RH_NOSYNC */ - if (unlikely(rh->barrier_failure)) { + if (unlikely(rh->flush_failure)) { /* - * If a write barrier failed some time ago, we + * If a write flush failed some time ago, we * don't know whether or not this write made it * to the disk, so we must resync the device. */ @@ -660,10 +661,9 @@ void dm_rh_recovery_end(struct dm_region *reg, int success) spin_lock_irq(&rh->region_lock); if (success) list_add(®->list, ®->rh->recovered_regions); - else { - reg->state = DM_RH_NOSYNC; + else list_add(®->list, ®->rh->failed_recovered_regions); - } + spin_unlock_irq(&rh->region_lock); rh->wakeup_workers(rh->context); diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index cfa668f46c40..9c6c2e47ad62 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -11,6 +11,8 @@ #include "dm.h" #include "dm-path-selector.h" +#include <linux/slab.h> + #define DM_MSG_PREFIX "multipath service-time" #define ST_MIN_IO 1 #define ST_MAX_RELATIVE_THROUGHPUT 100 diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 7d08879689ac..2129cdb115dc 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -254,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, * Issue the synchronous I/O from a different thread * to avoid generic_make_request recursion. */ - INIT_WORK(&req.work, do_metadata); + INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); flush_workqueue(ps->metadata_wq); @@ -266,7 +266,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ static chunk_t area_location(struct pstore *ps, chunk_t area) { - return 1 + ((ps->exceptions_per_area + 1) * area); + return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); } /* @@ -687,7 +687,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. */ - if (ps->valid && area_io(ps, WRITE_BARRIER)) + if (ps->valid && area_io(ps, WRITE_FLUSH_FUA)) ps->valid = 0; /* @@ -780,8 +780,8 @@ static int persistent_commit_merge(struct dm_exception_store *store, * ps->current_area does not get reduced by prepare_merge() until * after commit_merge() has removed the nr_merged previous exceptions. 
*/ - ps->next_free = (area_location(ps, ps->current_area) - 1) + - (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS; + ps->next_free = area_location(ps, ps->current_area) + + ps->current_committed + 1; return 0; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ee8eb283650d..53cf79d8bcbc 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -83,10 +83,10 @@ struct dm_snapshot { /* Whether or not owning mapped_device is suspended */ int suspended; - mempool_t *pending_pool; - atomic_t pending_exceptions_count; + mempool_t *pending_pool; + struct dm_exception_table pending; struct dm_exception_table complete; @@ -96,6 +96,11 @@ struct dm_snapshot { */ spinlock_t pe_lock; + /* Chunks with outstanding reads */ + spinlock_t tracked_chunk_lock; + mempool_t *tracked_chunk_pool; + struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; + /* The on disk metadata handler */ struct dm_exception_store *store; @@ -105,10 +110,12 @@ struct dm_snapshot { struct bio_list queued_bios; struct work_struct queued_bios_work; - /* Chunks with outstanding reads */ - mempool_t *tracked_chunk_pool; - spinlock_t tracked_chunk_lock; - struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; + /* Wait for events based on state_bits */ + unsigned long state_bits; + + /* Range of chunks currently being merged. */ + chunk_t first_merging_chunk; + int num_merging_chunks; /* * The merge operation failed if this flag is set. @@ -125,13 +132,6 @@ struct dm_snapshot { */ int merge_failed; - /* Wait for events based on state_bits */ - unsigned long state_bits; - - /* Range of chunks currently being merged. */ - chunk_t first_merging_chunk; - int num_merging_chunks; - /* * Incoming bios that overlap with chunks being merged must wait * for them to be committed. @@ -148,6 +148,12 @@ struct dm_snapshot { #define RUNNING_MERGE 0 #define SHUTDOWN_MERGE 1 +struct dm_dev *dm_snap_origin(struct dm_snapshot *s) +{ + return s->origin; +} +EXPORT_SYMBOL(dm_snap_origin); + struct dm_dev *dm_snap_cow(struct dm_snapshot *s) { return s->cow; @@ -700,8 +706,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) return 0; } -#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) - /* * Return a minimum chunk size of all snapshots that have the specified origin. * Return zero if the origin has no snapshots. 
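The dm-snap-persistent arithmetic above is a pure simplification rather than a behaviour change, assuming NUM_SNAPSHOT_HDR_CHUNKS is 1 (which is what the old literal "1 +" in area_location() encoded). Writing e = exceptions_per_area, a = current_area, c = current_committed and H = NUM_SNAPSHOT_HDR_CHUNKS:

old: next_free = (old_area_location - 1) + (c + 1) + H = (1 + (e + 1)*a - 1) + (c + 1) + H = (e + 1)*a + c + 1 + H
new: next_free = new_area_location + c + 1 = (H + (e + 1)*a) + c + 1 = (e + 1)*a + c + 1 + H

Both forms land on the same chunk; the new one simply folds the header offset into area_location().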
@@ -1065,10 +1069,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) origin_mode = FMODE_WRITE; } - origin_path = argv[0]; - argv++; - argc--; - s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) { ti->error = "Cannot allocate snapshot context private " @@ -1077,12 +1077,21 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + origin_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, origin_path, origin_mode, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad_origin; + } + cow_path = argv[0]; argv++; argc--; - r = dm_get_device(ti, cow_path, 0, 0, - FMODE_READ | FMODE_WRITE, &s->cow); + r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow); if (r) { ti->error = "Cannot get COW device"; goto bad_cow; @@ -1098,12 +1107,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv += args_used; argc -= args_used; - r = dm_get_device(ti, origin_path, 0, ti->len, origin_mode, &s->origin); - if (r) { - ti->error = "Cannot get origin device"; - goto bad_origin; - } - s->ti = ti; s->valid = 1; s->active = 0; @@ -1213,15 +1216,15 @@ bad_kcopyd: dm_exception_table_exit(&s->complete, exception_cache); bad_hash_tables: - dm_put_device(ti, s->origin); - -bad_origin: dm_exception_store_destroy(s->store); bad_store: dm_put_device(ti, s->cow); bad_cow: + dm_put_device(ti, s->origin); + +bad_origin: kfree(s); bad: @@ -1315,12 +1318,12 @@ static void snapshot_dtr(struct dm_target *ti) mempool_destroy(s->pending_pool); - dm_put_device(ti, s->origin); - dm_exception_store_destroy(s->store); dm_put_device(ti, s->cow); + dm_put_device(ti, s->origin); + kfree(s); } @@ -1582,7 +1585,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; - if (unlikely(bio_empty_barrier(bio))) { + if (bio->bi_rw & REQ_FLUSH) { bio->bi_bdev = s->cow->bdev; return DM_MAPIO_REMAPPED; } @@ -1686,8 +1689,8 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, int r = DM_MAPIO_REMAPPED; chunk_t chunk; - if (unlikely(bio_empty_barrier(bio))) { - if (!map_context->flush_request) + if (bio->bi_rw & REQ_FLUSH) { + if (!map_context->target_request_nr) bio->bi_bdev = s->origin->bdev; else bio->bi_bdev = s->cow->bdev; @@ -1900,8 +1903,14 @@ static int snapshot_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dm_snapshot *snap = ti->private; + int r; + + r = fn(ti, snap->origin, 0, ti->len, data); + + if (!r) + r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); - return fn(ti, snap->origin, 0, ti->len, data); + return r; } @@ -2100,8 +2109,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - r = dm_get_device(ti, argv[0], 0, ti->len, - dm_table_get_mode(ti->table), &dev); + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); if (r) { ti->error = "Cannot get target device"; return r; @@ -2125,7 +2133,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio, struct dm_dev *dev = ti->private; bio->bi_bdev = dev->bdev; - if (unlikely(bio_empty_barrier(bio))) + if (bio->bi_rw & REQ_FLUSH) return DM_MAPIO_REMAPPED; /* Only tell snapshots if this is a write */ @@ -2161,6 +2169,21 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, return 0; } +static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct 
dm_dev *dev = ti->private; + struct request_queue *q = bdev_get_queue(dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = dev->bdev; + bvm->bi_sector = bvm->bi_sector; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static int origin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -2178,6 +2201,7 @@ static struct target_type origin_target = { .map = origin_map, .resume = origin_resume, .status = origin_status, + .merge = origin_merge, .iterate_devices = origin_iterate_devices, }; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e0efc1adcaff..f0371b4c4fbf 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -25,6 +25,8 @@ struct stripe { struct stripe_c { uint32_t stripes; + int stripes_shift; + sector_t stripes_mask; /* The size of this target / num. stripes */ sector_t stripe_width; @@ -80,8 +82,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, if (sscanf(argv[1], "%llu", &start) != 1) return -EINVAL; - if (dm_get_device(ti, argv[0], start, sc->stripe_width, - dm_table_get_mode(ti->table), + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &sc->stripe[stripe].dev)) return -ENXIO; @@ -110,7 +111,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) } stripes = simple_strtoul(argv[0], &end, 10); - if (*end) { + if (!stripes || *end) { ti->error = "Invalid stripe count"; return -EINVAL; } @@ -163,16 +164,22 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; - sc->stripes = stripes; sc->stripe_width = width; + + if (stripes & (stripes - 1)) + sc->stripes_shift = -1; + else { + sc->stripes_shift = ffs(stripes) - 1; + sc->stripes_mask = ((sector_t) stripes) - 1; + } + ti->split_io = chunk_size; ti->num_flush_requests = stripes; + ti->num_discard_requests = stripes; + sc->chunk_shift = ffs(chunk_size) - 1; sc->chunk_mask = ((sector_t) chunk_size) - 1; - for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) - chunk_size >>= 1; - sc->chunk_shift--; /* * Get the stripe destinations. 
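The stripe_ctr() hunk above caches a shift and a mask whenever the stripe count is a power of two (stripes & (stripes - 1) == 0), so the hot mapping path can avoid sector_div(). The reasoning as a stand-alone sketch, not patch code; plain % and / are used here for clarity where the driver uses sector_div():

#include <linux/bitops.h>

static void split_chunk(unsigned long long chunk, unsigned int stripes,
                        unsigned int *stripe, unsigned long long *chunk_in_stripe)
{
        if (stripes & (stripes - 1)) {
                /* Not a power of two: a real division is unavoidable. */
                *stripe = chunk % stripes;
                *chunk_in_stripe = chunk / stripes;
        } else {
                /* Power of two: modulo is a mask, division is a shift. */
                unsigned int shift = ffs(stripes) - 1;  /* log2(stripes) */

                *stripe = chunk & (stripes - 1);
                *chunk_in_stripe = chunk >> shift;
        }
}

For example, with stripes = 4 and chunk = 10: 10 & 3 = 2 and 10 >> 2 = 2, exactly what 10 % 4 and 10 / 4 give.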
@@ -208,26 +215,79 @@ static void stripe_dtr(struct dm_target *ti) kfree(sc); } +static void stripe_map_sector(struct stripe_c *sc, sector_t sector, + uint32_t *stripe, sector_t *result) +{ + sector_t offset = dm_target_offset(sc->ti, sector); + sector_t chunk = offset >> sc->chunk_shift; + + if (sc->stripes_shift < 0) + *stripe = sector_div(chunk, sc->stripes); + else { + *stripe = chunk & sc->stripes_mask; + chunk >>= sc->stripes_shift; + } + + *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); +} + +static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, + uint32_t target_stripe, sector_t *result) +{ + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, result); + if (stripe == target_stripe) + return; + *result &= ~sc->chunk_mask; /* round down */ + if (target_stripe < stripe) + *result += sc->chunk_mask + 1; /* next chunk */ +} + +static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, + uint32_t target_stripe) +{ + sector_t begin, end; + + stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); + stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), + target_stripe, &end); + if (begin < end) { + bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; + bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; + bio->bi_size = to_bytes(end - begin); + return DM_MAPIO_REMAPPED; + } else { + /* The range doesn't map to the target stripe */ + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } +} + static int stripe_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - struct stripe_c *sc = (struct stripe_c *) ti->private; - sector_t offset, chunk; + struct stripe_c *sc = ti->private; uint32_t stripe; + unsigned target_request_nr; - if (unlikely(bio_empty_barrier(bio))) { - BUG_ON(map_context->flush_request >= sc->stripes); - bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; + if (bio->bi_rw & REQ_FLUSH) { + target_request_nr = map_context->target_request_nr; + BUG_ON(target_request_nr >= sc->stripes); + bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; return DM_MAPIO_REMAPPED; } + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + target_request_nr = map_context->target_request_nr; + BUG_ON(target_request_nr >= sc->stripes); + return stripe_map_discard(sc, bio, target_request_nr); + } - offset = bio->bi_sector - ti->begin; - chunk = offset >> sc->chunk_shift; - stripe = sector_div(chunk, sc->stripes); + stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); + bio->bi_sector += sc->stripe[stripe].physical_start; bio->bi_bdev = sc->stripe[stripe].dev->bdev; - bio->bi_sector = sc->stripe[stripe].physical_start + - (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); + return DM_MAPIO_REMAPPED; } @@ -285,7 +345,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, if (!error) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) + if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) return error; if (error == -EOPNOTSUPP) diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index f53392df7b97..84d2b91e4efb 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -75,25 +75,17 @@ static struct attribute *dm_attrs[] = { NULL, }; -static struct sysfs_ops dm_sysfs_ops = { +static const struct sysfs_ops dm_sysfs_ops = { .show = dm_attr_show, }; /* - * The sysfs structure is embedded in md struct, nothing to do here - */ -static void dm_sysfs_release(struct kobject *kobj) -{ -} - -/* * dm 
kobject is embedded in mapped_device structure * no need to define release function here */ static struct kobj_type dm_ktype = { .sysfs_ops = &dm_sysfs_ops, .default_attrs = dm_attrs, - .release = dm_sysfs_release }; /* diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4b22feb01a0c..90267f8d64ee 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -54,6 +54,8 @@ struct dm_table { sector_t *highs; struct dm_target *targets; + unsigned discards_supported:1; + /* * Indicates the rw permissions for the new logical * device. This should be a combination of FMODE_READ @@ -203,6 +205,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); atomic_set(&t->holders, 0); + t->discards_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -245,7 +248,7 @@ void dm_table_destroy(struct dm_table *t) msleep(1); smp_mb(); - /* free the indexes (see dm_table_complete) */ + /* free the indexes */ if (t->depth >= 2) vfree(t->index[t->depth - 2]); @@ -429,8 +432,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, * it's already present. */ static int __table_get_device(struct dm_table *t, struct dm_target *ti, - const char *path, sector_t start, sector_t len, - fmode_t mode, struct dm_dev **result) + const char *path, fmode_t mode, struct dm_dev **result) { int r; dev_t uninitialized_var(dev); @@ -484,11 +486,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, return 0; } -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -527,11 +524,10 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, } EXPORT_SYMBOL_GPL(dm_set_device_limits); -int dm_get_device(struct dm_target *ti, const char *path, sector_t start, - sector_t len, fmode_t mode, struct dm_dev **result) +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) { - return __table_get_device(ti->table, ti, path, - start, len, mode, result); + return __table_get_device(ti->table, ti, path, mode, result); } @@ -772,6 +768,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + if (!tgt->num_discard_requests) + t->discards_supported = 0; + return 0; bad: @@ -780,7 +779,7 @@ int dm_table_add_target(struct dm_table *t, const char *type, return r; } -int dm_table_set_type(struct dm_table *t) +static int dm_table_set_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0; @@ -902,7 +901,7 @@ static int setup_indexes(struct dm_table *t) /* * Builds the btree to index the map. */ -int dm_table_complete(struct dm_table *t) +static int dm_table_build_index(struct dm_table *t) { int r = 0; unsigned int leaf_nodes; @@ -921,6 +920,55 @@ int dm_table_complete(struct dm_table *t) return r; } +/* + * Register the mapped device for blk_integrity support if + * the underlying devices support it. 
+ */ +static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd; + + list_for_each_entry(dd, devices, list) + if (bdev_get_integrity(dd->dm_dev.bdev)) + return blk_integrity_register(dm_disk(md), NULL); + + return 0; +} + +/* + * Prepares the table for use by building the indices, + * setting the type, and allocating mempools. + */ +int dm_table_complete(struct dm_table *t) +{ + int r; + + r = dm_table_set_type(t); + if (r) { + DMERR("unable to set table type"); + return r; + } + + r = dm_table_build_index(t); + if (r) { + DMERR("unable to build btrees"); + return r; + } + + r = dm_table_prealloc_integrity(t, t->md); + if (r) { + DMERR("could not register integrity profile."); + return r; + } + + r = dm_table_alloc_md_mempools(t); + if (r) + DMERR("unable to allocate mempools"); + + return r; +} + static DEFINE_MUTEX(_event_lock); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context) @@ -1088,6 +1136,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); + if (!dm_table_supports_discards(t)) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + dm_table_set_integrity(t); /* @@ -1231,11 +1284,42 @@ void dm_table_unplug_all(struct dm_table *t) struct mapped_device *dm_table_get_md(struct dm_table *t) { - dm_get(t->md); - return t->md; } +static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_discard(q); +} + +bool dm_table_supports_discards(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + if (!t->discards_supported) + return 0; + + /* + * Ensure that at least one underlying device supports discards. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting discard must provide. 
+ */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_discard_capable, NULL)) + return 1; + } + + return 0; +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 04feccf2a997..8da366cf381c 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -10,7 +10,6 @@ #include <linux/init.h> #include <linux/kmod.h> #include <linux/bio.h> -#include <linux/slab.h> #define DM_MSG_PREFIX "target" @@ -114,6 +113,11 @@ void dm_unregister_target(struct target_type *tt) */ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { + /* + * Return error for discards instead of -EOPNOTSUPP + */ + tt->num_discard_requests = 1; + return 0; } diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c index c7c555a8c7b2..6b1e3b61b25e 100644 --- a/drivers/md/dm-uevent.c +++ b/drivers/md/dm-uevent.c @@ -187,7 +187,7 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { DMERR("%s: Invalid event_type %d", __func__, event_type); - goto out; + return; } event = dm_build_path_uevent(md, ti, @@ -195,12 +195,9 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, _dm_uevent_type_names[event_type].name, path, nr_valid_paths); if (IS_ERR(event)) - goto out; + return; dm_uevent_add(md, &event->elist); - -out: - dm_put(md); } EXPORT_SYMBOL_GPL(dm_path_uevent); diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index bbc97030c0c2..cc2b3cb81946 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -22,6 +22,11 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } + /* + * Silently drop discards, avoiding -EOPNOTSUPP. + */ + ti->num_discard_requests = 1; + return 0; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3167480b532c..7cb1352f7e7a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/hdreg.h> +#include <linux/delay.h> #include <trace/events/block.h> @@ -31,6 +32,7 @@ #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" #define DM_COOKIE_LENGTH 24 +static DEFINE_MUTEX(dm_mutex); static const char *_name = DM_NAME; static unsigned int major = 0; @@ -108,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 -#define DMF_QUEUE_IO_TO_THREAD 6 /* * Work processed by per-device workqueue. @@ -123,6 +124,10 @@ struct mapped_device { unsigned long flags; struct request_queue *queue; + unsigned type; + /* Protect queue and type against concurrent access. */ + struct mutex type_lock; + struct gendisk *disk; char name[16]; @@ -138,24 +143,9 @@ struct mapped_device { spinlock_t deferred_lock; /* - * An error from the barrier request currently being processed. - */ - int barrier_error; - - /* - * Protect barrier_error from concurrent endio processing - * in request-based dm. - */ - spinlock_t barrier_error_lock; - - /* - * Processing queue (flush/barriers) + * Processing queue (flush) */ struct workqueue_struct *wq; - struct work_struct barrier_work; - - /* A pointer to the currently processing pre/post flush request */ - struct request *flush_request; /* * The current mapping. 
@@ -194,8 +184,8 @@ struct mapped_device { /* sysfs handle */ struct kobject kobj; - /* zero-length barrier that will be cloned and submitted to targets */ - struct bio barrier_bio; + /* zero-length flush that will be cloned and submitted to targets */ + struct bio flush_bio; }; /* @@ -338,6 +328,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; + mutex_lock(&dm_mutex); spin_lock(&_minor_lock); md = bdev->bd_disk->private_data; @@ -355,6 +346,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) out: spin_unlock(&_minor_lock); + mutex_unlock(&dm_mutex); return md ? 0 : -ENXIO; } @@ -362,8 +354,12 @@ out: static int dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; + + mutex_lock(&dm_mutex); atomic_dec(&md->open_count); dm_put(md); + mutex_unlock(&dm_mutex); + return 0; } @@ -500,7 +496,7 @@ static void end_io_acct(struct dm_io *io) /* * After this is decremented the bio must not be touched if it is - * a barrier. + * a flush. */ dm_disk(md)->part0.in_flight[rw] = pending = atomic_dec_return(&md->pending[rw]); @@ -516,16 +512,12 @@ static void end_io_acct(struct dm_io *io) */ static void queue_io(struct mapped_device *md, struct bio *bio) { - down_write(&md->io_lock); + unsigned long flags; - spin_lock_irq(&md->deferred_lock); + spin_lock_irqsave(&md->deferred_lock, flags); bio_list_add(&md->deferred, bio); - spin_unlock_irq(&md->deferred_lock); - - if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) - queue_work(md->wq, &md->work); - - up_write(&md->io_lock); + spin_unlock_irqrestore(&md->deferred_lock, flags); + queue_work(md->wq, &md->work); } /* @@ -613,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error) * Target requested pushing back the I/O. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md)) { - if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER)) - bio_list_add_head(&md->deferred, - io->bio); - } else + if (__noflush_suspending(md)) + bio_list_add_head(&md->deferred, io->bio); + else /* noflush suspend was interrupted. */ io->error = -EIO; spin_unlock_irqrestore(&md->deferred_lock, flags); @@ -625,27 +615,24 @@ static void dec_pending(struct dm_io *io, int error) io_error = io->error; bio = io->bio; + end_io_acct(io); + free_io(md, io); + + if (io_error == DM_ENDIO_REQUEUE) + return; - if (bio_rw_flagged(bio, BIO_RW_BARRIER)) { + if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { /* - * There can be just one barrier request so we use - * a per-device variable for error reporting. - * Note that you can't touch the bio after end_io_acct + * Preflush done for flush with data, reissue + * without REQ_FLUSH. */ - if (!md->barrier_error && io_error != -EOPNOTSUPP) - md->barrier_error = io_error; - end_io_acct(io); + bio->bi_rw &= ~REQ_FLUSH; + queue_io(md, bio); } else { - end_io_acct(io); - - if (io_error != DM_ENDIO_REQUEUE) { - trace_block_bio_complete(md->queue, bio); - - bio_endio(bio, io_error); - } + /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio); + bio_endio(bio, io_error); } - - free_io(md, io); } } @@ -737,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error) blk_update_request(tio->orig, 0, nr_bytes); } -static void store_barrier_error(struct mapped_device *md, int error) -{ - unsigned long flags; - - spin_lock_irqsave(&md->barrier_error_lock, flags); - /* - * Basically, the first error is taken, but: - * -EOPNOTSUPP supersedes any I/O error. 
- * Requeue request supersedes any I/O error but -EOPNOTSUPP. - */ - if (!md->barrier_error || error == -EOPNOTSUPP || - (md->barrier_error != -EOPNOTSUPP && - error == DM_ENDIO_REQUEUE)) - md->barrier_error = error; - spin_unlock_irqrestore(&md->barrier_error_lock, flags); -} - /* * Don't touch any member of the md after calling this function because * the md may be freed in dm_put() at the end of this function. @@ -791,13 +761,11 @@ static void free_rq_clone(struct request *clone) static void dm_end_request(struct request *clone, int error) { int rw = rq_data_dir(clone); - int run_queue = 1; - bool is_barrier = blk_barrier_rq(clone); struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; struct request *rq = tio->orig; - if (blk_pc_request(rq) && !is_barrier) { + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { rq->errors = clone->errors; rq->resid_len = clone->resid_len; @@ -811,15 +779,8 @@ static void dm_end_request(struct request *clone, int error) } free_rq_clone(clone); - - if (unlikely(is_barrier)) { - if (unlikely(error)) - store_barrier_error(md, error); - run_queue = 0; - } else - blk_end_request_all(rq, error); - - rq_completed(md, rw, run_queue); + blk_end_request_all(rq, error); + rq_completed(md, rw, true); } static void dm_unprep_request(struct request *rq) @@ -844,16 +805,6 @@ void dm_requeue_unmapped_request(struct request *clone) struct request_queue *q = rq->q; unsigned long flags; - if (unlikely(blk_barrier_rq(clone))) { - /* - * Barrier clones share an original request. - * Leave it to dm_end_request(), which handles this special - * case. - */ - dm_end_request(clone, DM_ENDIO_REQUEUE); - return; - } - dm_unprep_request(rq); spin_lock_irqsave(q->queue_lock, flags); @@ -943,19 +894,6 @@ static void dm_complete_request(struct request *clone, int error) struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; - if (unlikely(blk_barrier_rq(clone))) { - /* - * Barrier clones share an original request. So can't use - * softirq_done with the original. - * Pass the clone to dm_done() directly in this special case. - * It is safe (even if clone->q->queue_lock is held here) - * because there is no I/O dispatching during the completion - * of barrier clone. - */ - dm_done(clone, error, true); - return; - } - tio->error = error; rq->completion_data = clone; blk_complete_request(rq); @@ -972,17 +910,6 @@ void dm_kill_unmapped_request(struct request *clone, int error) struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; - if (unlikely(blk_barrier_rq(clone))) { - /* - * Barrier clones share an original request. - * Leave it to dm_end_request(), which handles this special - * case. - */ - BUG_ON(error > 0); - dm_end_request(clone, error); - return; - } - rq->cmd_flags |= REQ_FAILED; dm_complete_request(clone, error); } @@ -1012,17 +939,27 @@ static void end_clone_request(struct request *clone, int error) dm_complete_request(clone, error); } -static sector_t max_io_len(struct mapped_device *md, - sector_t sector, struct dm_target *ti) +/* + * Return maximum size of I/O possible at the supplied sector up to the current + * target boundary. 
+ */ +static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) { - sector_t offset = sector - ti->begin; - sector_t len = ti->len - offset; + sector_t target_offset = dm_target_offset(ti, sector); + + return ti->len - target_offset; +} + +static sector_t max_io_len(sector_t sector, struct dm_target *ti) +{ + sector_t len = max_io_len_target_boundary(sector, ti); /* * Does the target need to split even further ? */ if (ti->split_io) { sector_t boundary; + sector_t offset = dm_target_offset(ti, sector); boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) - offset; if (len > boundary) @@ -1091,7 +1028,7 @@ static void dm_bio_destructor(struct bio *bio) } /* - * Creates a little bio that is just does part of a bvec. + * Creates a little bio that just does part of a bvec. */ static struct bio *split_bvec(struct bio *bio, sector_t sector, unsigned short idx, unsigned int offset, @@ -1106,7 +1043,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, clone->bi_sector = sector; clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER); + clone->bi_rw = bio->bi_rw; clone->bi_vcnt = 1; clone->bi_size = to_bytes(len); clone->bi_io_vec->bv_offset = offset; @@ -1133,7 +1070,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); __bio_clone(clone, bio); - clone->bi_rw &= ~(1 << BIO_RW_BARRIER); clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; @@ -1164,32 +1100,91 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, return tio; } -static void __flush_target(struct clone_info *ci, struct dm_target *ti, - unsigned flush_nr) +static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, + unsigned request_nr, sector_t len) { struct dm_target_io *tio = alloc_tio(ci, ti); struct bio *clone; - tio->info.flush_request = flush_nr; + tio->info.target_request_nr = request_nr; - clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); + /* + * Discard requests require the bio's inline iovecs be initialized. + * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush + * and discard, so no need for concern about wasted bvec allocations. + */ + clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); __bio_clone(clone, ci->bio); clone->bi_destructor = dm_bio_destructor; + if (len) { + clone->bi_sector = ci->sector; + clone->bi_size = to_bytes(len); + } __map_bio(ti, clone, tio); } -static int __clone_and_map_empty_barrier(struct clone_info *ci) +static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, + unsigned num_requests, sector_t len) +{ + unsigned request_nr; + + for (request_nr = 0; request_nr < num_requests; request_nr++) + __issue_target_request(ci, ti, request_nr, len); +} + +static int __clone_and_map_empty_flush(struct clone_info *ci) { - unsigned target_nr = 0, flush_nr; + unsigned target_nr = 0; struct dm_target *ti; + BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) - for (flush_nr = 0; flush_nr < ti->num_flush_requests; - flush_nr++) - __flush_target(ci, ti, flush_nr); + __issue_target_requests(ci, ti, ti->num_flush_requests, 0); + return 0; +} + +/* + * Perform all io with a single clone. 
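A quick worked example of the split_io clamp in max_io_len() above, assuming split_io is a power of two as the mask arithmetic requires: with ti->split_io = 8 and dm_target_offset(ti, sector) = 21, boundary = ((21 + 8) & ~7) - 21 = (29 & ~7) - 21 = 24 - 21 = 3. So even if max_io_len_target_boundary() says the target has plenty of room left, the I/O is trimmed to 3 sectors so that it ends exactly on the next 8-sector split boundary.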
+ */ +static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) +{ + struct bio *clone, *bio = ci->bio; + struct dm_target_io *tio; + + tio = alloc_tio(ci, ti); + clone = clone_bio(bio, ci->sector, ci->idx, + bio->bi_vcnt - ci->idx, ci->sector_count, + ci->md->bs); + __map_bio(ti, clone, tio); ci->sector_count = 0; +} + +static int __clone_and_map_discard(struct clone_info *ci) +{ + struct dm_target *ti; + sector_t len; + + do { + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; + + /* + * Even though the device advertised discard support, + * reconfiguration might have changed that since the + * check was performed. + */ + if (!ti->num_discard_requests) + return -EOPNOTSUPP; + + len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + + __issue_target_requests(ci, ti, ti->num_discard_requests, len); + + ci->sector += len; + } while (ci->sector_count -= len); return 0; } @@ -1201,30 +1196,21 @@ static int __clone_and_map(struct clone_info *ci) sector_t len = 0, max; struct dm_target_io *tio; - if (unlikely(bio_empty_barrier(bio))) - return __clone_and_map_empty_barrier(ci); + if (unlikely(bio->bi_rw & REQ_DISCARD)) + return __clone_and_map_discard(ci); ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->md, ci->sector, ti); - - /* - * Allocate a target io object. - */ - tio = alloc_tio(ci, ti); + max = max_io_len(ci->sector, ti); if (ci->sector_count <= max) { /* * Optimise for the simple case where we can do all of * the remaining io with a single clone. */ - clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count, - ci->md->bs); - __map_bio(ti, clone, tio); - ci->sector_count = 0; + __clone_and_map_simple(ci, ti); } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { /* @@ -1245,6 +1231,7 @@ static int __clone_and_map(struct clone_info *ci) len += bv_len; } + tio = alloc_tio(ci, ti); clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, ci->md->bs); __map_bio(ti, clone, tio); @@ -1267,13 +1254,12 @@ static int __clone_and_map(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->md, ci->sector, ti); - - tio = alloc_tio(ci, ti); + max = max_io_len(ci->sector, ti); } len = min(remaining, max); + tio = alloc_tio(ci, ti); clone = split_bvec(bio, ci->sector, ci->idx, bv->bv_offset + offset, len, ci->md->bs); @@ -1301,16 +1287,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.map = dm_get_live_table(md); if (unlikely(!ci.map)) { - if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) - bio_io_error(bio); - else - if (!md->barrier_error) - md->barrier_error = -EIO; + bio_io_error(bio); return; } ci.md = md; - ci.bio = bio; ci.io = alloc_io(md); ci.io->error = 0; atomic_set(&ci.io->io_count, 1); @@ -1318,14 +1299,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.io->md = md; spin_lock_init(&ci.io->endio_lock); ci.sector = bio->bi_sector; - ci.sector_count = bio_sectors(bio); - if (unlikely(bio_empty_barrier(bio))) - ci.sector_count = 1; ci.idx = bio->bi_idx; start_io_acct(ci.io); - while (ci.sector_count && !error) - error = __clone_and_map(&ci); + if (bio->bi_rw & REQ_FLUSH) { + ci.bio = &ci.md->flush_bio; + ci.sector_count = 0; + error = __clone_and_map_empty_flush(&ci); + /* dec_pending submits any data associated with flush */ + } else { + ci.bio = bio; + ci.sector_count = bio_sectors(bio); + while 
(ci.sector_count && !error) + error = __clone_and_map(&ci); + } /* drop the extra reference count */ dec_pending(ci.io, error); @@ -1355,7 +1342,7 @@ static int dm_merge_bvec(struct request_queue *q, /* * Find maximum amount of I/O that won't need splitting */ - max_sectors = min(max_io_len(md, bvm->bi_sector, ti), + max_sectors = min(max_io_len(bvm->bi_sector, ti), (sector_t) BIO_MAX_SECTORS); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; if (max_size < 0) @@ -1409,22 +1396,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio) part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); part_stat_unlock(); - /* - * If we're suspended or the thread is processing barriers - * we have to queue this io for later. - */ - if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || - unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { + /* if we're suspended, we have to queue this io for later */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { up_read(&md->io_lock); - if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && - bio_rw(bio) == READA) { + if (bio_rw(bio) != READA) + queue_io(md, bio); + else bio_io_error(bio); - return 0; - } - - queue_io(md, bio); - return 0; } @@ -1455,25 +1434,6 @@ static int dm_request(struct request_queue *q, struct bio *bio) return _dm_request(q, bio); } -/* - * Mark this request as flush request, so that dm_request_fn() can - * recognize. - */ -static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq) -{ - rq->cmd_type = REQ_TYPE_LINUX_BLOCK; - rq->cmd[0] = REQ_LB_OP_FLUSH; -} - -static bool dm_rq_is_flush_request(struct request *rq) -{ - if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK && - rq->cmd[0] == REQ_LB_OP_FLUSH) - return true; - else - return false; -} - void dm_dispatch_request(struct request *rq) { int r; @@ -1521,22 +1481,15 @@ static int setup_clone(struct request *clone, struct request *rq, { int r; - if (dm_rq_is_flush_request(rq)) { - blk_rq_init(NULL, clone); - clone->cmd_type = REQ_TYPE_FS; - clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); - } else { - r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, - dm_rq_bio_constructor, tio); - if (r) - return r; - - clone->cmd = rq->cmd; - clone->cmd_len = rq->cmd_len; - clone->sense = rq->sense; - clone->buffer = rq->buffer; - } + r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + dm_rq_bio_constructor, tio); + if (r) + return r; + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; + clone->buffer = rq->buffer; clone->end_io = end_clone_request; clone->end_io_data = tio; @@ -1577,9 +1530,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) struct mapped_device *md = q->queuedata; struct request *clone; - if (unlikely(dm_rq_is_flush_request(rq))) - return BLKPREP_OK; - if (unlikely(rq->special)) { DMWARN("Already has something in rq->special."); return BLKPREP_KILL; @@ -1595,10 +1545,15 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_OK; } -static void map_request(struct dm_target *ti, struct request *clone, - struct mapped_device *md) +/* + * Returns: + * 0 : the request has been processed (not requeued) + * !0 : the request has been requeued + */ +static int map_request(struct dm_target *ti, struct request *clone, + struct mapped_device *md) { - int r; + int r, requeued = 0; struct dm_rq_target_io *tio = clone->end_io_data; /* @@ -1625,6 +1580,7 @@ static void map_request(struct dm_target *ti, struct request *clone, case 
DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ dm_requeue_unmapped_request(clone); + requeued = 1; break; default: if (r > 0) { @@ -1636,6 +1592,8 @@ static void map_request(struct dm_target *ti, struct request *clone, dm_kill_unmapped_request(clone, r); break; } + + return requeued; } /* @@ -1648,6 +1606,7 @@ static void dm_request_fn(struct request_queue *q) struct dm_table *map = dm_get_live_table(md); struct dm_target *ti; struct request *rq, *clone; + sector_t pos; /* * For suspend, check blk_queue_stopped() and increment @@ -1660,15 +1619,14 @@ static void dm_request_fn(struct request_queue *q) if (!rq) goto plug_and_out; - if (unlikely(dm_rq_is_flush_request(rq))) { - BUG_ON(md->flush_request); - md->flush_request = rq; - blk_start_request(rq); - queue_work(md->wq, &md->barrier_work); - goto out; - } + /* always use block 0 to find the target for flushes for now */ + pos = 0; + if (!(rq->cmd_flags & REQ_FLUSH)) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); + BUG_ON(!dm_target_is_valid(ti)); - ti = dm_table_find_target(map, blk_rq_pos(rq)); if (ti->type->busy && ti->type->busy(ti)) goto plug_and_out; @@ -1677,12 +1635,17 @@ static void dm_request_fn(struct request_queue *q) atomic_inc(&md->pending[rq_data_dir(clone)]); spin_unlock(q->queue_lock); - map_request(ti, clone, md); + if (map_request(ti, clone, md)) + goto requeued; + spin_lock_irq(q->queue_lock); } goto out; +requeued: + spin_lock_irq(q->queue_lock); + plug_and_out: if (!elv_queue_empty(q)) /* Some requests still remain, retry later */ @@ -1834,7 +1797,29 @@ out: static const struct block_device_operations dm_blk_dops; static void dm_wq_work(struct work_struct *work); -static void dm_rq_barrier_work(struct work_struct *work); + +static void dm_init_md_queue(struct mapped_device *md) +{ + /* + * Request-based dm devices cannot be stacked on top of bio-based dm + * devices. The type of this dm device has not been decided yet. + * The type is decided at the first table loading time. + * To prevent problematic device stacking, clear the queue flag + * for request stacking support until then. + * + * This queue is new, so no concurrency on the queue_flags. + */ + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); + + md->queue->queuedata = md; + md->queue->backing_dev_info.congested_fn = dm_any_congested; + md->queue->backing_dev_info.congested_data = md; + blk_queue_make_request(md->queue, dm_request); + blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); + md->queue->unplug_fn = dm_unplug_all; + blk_queue_merge_bvec(md->queue, dm_merge_bvec); + blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); +} /* * Allocate and initialise a blank device with a given minor. @@ -1861,10 +1846,11 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_minor; + md->type = DM_TYPE_NONE; init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); + mutex_init(&md->type_lock); spin_lock_init(&md->deferred_lock); - spin_lock_init(&md->barrier_error_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); @@ -1873,34 +1859,11 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->uevent_list); spin_lock_init(&md->uevent_lock); - md->queue = blk_init_queue(dm_request_fn, NULL); + md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) goto bad_queue; - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. 
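The new return contract of map_request() (0 = dispatched, non-zero = requeued) exists so that dm_request_fn() can re-take the queue lock on the requeue path before it stops pulling requests. Below is a minimal userspace model of that loop, assuming a toy queue protected by a pthread mutex; model_map_request(), the "every third request requeues" rule and all other names are invented for illustration and are not dm code.

/* Minimal userspace model of the dm_request_fn() dispatch loop after this
 * patch: the queue lock is dropped around map_request(), and the return
 * value tells the caller the request was requeued so it can re-take the
 * lock and stop pulling further requests.
 */
#include <pthread.h>
#include <stdio.h>

struct model_queue {
	pthread_mutex_t lock;
	int pending;          /* requests waiting to be dispatched */
};

/* Pretend mapping fails (and requeues) for every third request. */
static int model_map_request(int rq)
{
	return (rq % 3 == 0) ? 1 /* requeued */ : 0 /* dispatched */;
}

static void model_request_fn(struct model_queue *q)
{
	pthread_mutex_lock(&q->lock);
	while (q->pending > 0) {
		int rq = q->pending;

		q->pending--;                     /* "blk_start_request()" */
		pthread_mutex_unlock(&q->lock);   /* map without the lock held */

		if (model_map_request(rq)) {
			/* requeued: put it back under the lock and stop */
			pthread_mutex_lock(&q->lock);
			q->pending++;
			break;
		}
		pthread_mutex_lock(&q->lock);     /* keep draining */
	}
	pthread_mutex_unlock(&q->lock);
	printf("left pending: %d\n", q->pending);
}

int main(void)
{
	struct model_queue q = { PTHREAD_MUTEX_INITIALIZER, 5 };

	model_request_fn(&q);   /* stops once request 3 gets requeued: prints 3 */
	return 0;
}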
The type of this dm device has not been decided yet, - * although we initialized the queue using blk_init_queue(). - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - md->saved_make_request_fn = md->queue->make_request_fn; - md->queue->queuedata = md; - md->queue->backing_dev_info.congested_fn = dm_any_congested; - md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); - blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); - md->queue->unplug_fn = dm_unplug_all; - blk_queue_merge_bvec(md->queue, dm_merge_bvec); - blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_prep_fn); - blk_queue_lld_busy(md->queue, dm_lld_busy); - blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, - dm_rq_prepare_flush); + dm_init_md_queue(md); md->disk = alloc_disk(1); if (!md->disk) @@ -1910,7 +1873,6 @@ static struct mapped_device *alloc_dev(int minor) atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); - INIT_WORK(&md->barrier_work, dm_rq_barrier_work); init_waitqueue_head(&md->eventq); md->disk->major = _major; @@ -1930,6 +1892,10 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad_bdev; + bio_init(&md->flush_bio); + md->flush_bio.bi_bdev = md->bdev; + md->flush_bio.bi_rw = WRITE_FLUSH; + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -2115,6 +2081,71 @@ int dm_create(int minor, struct mapped_device **result) return 0; } +/* + * Functions to manage md->type. + * All are required to hold md->type_lock. + */ +void dm_lock_md_type(struct mapped_device *md) +{ + mutex_lock(&md->type_lock); +} + +void dm_unlock_md_type(struct mapped_device *md) +{ + mutex_unlock(&md->type_lock); +} + +void dm_set_md_type(struct mapped_device *md, unsigned type) +{ + md->type = type; +} + +unsigned dm_get_md_type(struct mapped_device *md) +{ + return md->type; +} + +/* + * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
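A rough userspace sketch of how the md->type helpers above are intended to be used: the type starts as DM_TYPE_NONE, is fixed under type_lock at the first table load, and the queue is then initialised to match. The model_* names, the error convention and the way the consistency check is expressed are assumptions for this sketch; the real decision is made by the ioctl table-load path, not shown here.

/* Userspace sketch of the md->type handshake; all names are invented. */
#include <pthread.h>
#include <stdio.h>

enum model_type { MODEL_TYPE_NONE, MODEL_TYPE_BIO_BASED, MODEL_TYPE_REQUEST_BASED };

struct model_md {
	pthread_mutex_t type_lock;
	enum model_type type;
	int queue_initialised;   /* stands in for the elevator check */
};

static int model_setup_md_queue(struct model_md *md)
{
	if (md->type == MODEL_TYPE_REQUEST_BASED && !md->queue_initialised) {
		/* corresponds to blk_init_allocated_queue() + elevator setup */
		md->queue_initialised = 1;
	}
	return 0;
}

/* First table load: take the lock, fix the type, set up the queue. */
static int model_first_table_load(struct model_md *md, enum model_type table_type)
{
	int r = 0;

	pthread_mutex_lock(&md->type_lock);
	if (md->type == MODEL_TYPE_NONE)
		md->type = table_type;                 /* decided exactly once */
	else if (md->type != table_type)
		r = -1;                                /* type may not change */
	if (!r)
		r = model_setup_md_queue(md);
	pthread_mutex_unlock(&md->type_lock);
	return r;
}

int main(void)
{
	struct model_md md = { PTHREAD_MUTEX_INITIALIZER, MODEL_TYPE_NONE, 0 };

	printf("%d\n", model_first_table_load(&md, MODEL_TYPE_REQUEST_BASED)); /* 0 */
	printf("%d\n", model_first_table_load(&md, MODEL_TYPE_BIO_BASED));     /* -1 */
	return 0;
}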
+ */ +static int dm_init_request_based_queue(struct mapped_device *md) +{ + struct request_queue *q = NULL; + + if (md->queue->elevator) + return 1; + + /* Fully initialize the queue */ + q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); + if (!q) + return 0; + + md->queue = q; + md->saved_make_request_fn = md->queue->make_request_fn; + dm_init_md_queue(md); + blk_queue_softirq_done(md->queue, dm_softirq_done); + blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_lld_busy(md->queue, dm_lld_busy); + + elv_register_queue(md->queue); + + return 1; +} + +/* + * Setup the DM device's queue based on md's type + */ +int dm_setup_md_queue(struct mapped_device *md) +{ + if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && + !dm_init_request_based_queue(md)) { + DMWARN("Cannot initialize queue for request-based mapped device"); + return -EINVAL; + } + + return 0; +} + static struct mapped_device *dm_find_md(dev_t dev) { struct mapped_device *md; @@ -2128,6 +2159,7 @@ static struct mapped_device *dm_find_md(dev_t dev) md = idr_find(&_minor_idr, minor); if (md && (md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || + dm_deleting_md(md) || test_bit(DMF_FREEING, &md->flags))) { md = NULL; goto out; @@ -2162,6 +2194,7 @@ void dm_set_mdptr(struct mapped_device *md, void *ptr) void dm_get(struct mapped_device *md) { atomic_inc(&md->holders); + BUG_ON(test_bit(DMF_FREEING, &md->flags)); } const char *dm_device_name(struct mapped_device *md) @@ -2170,27 +2203,55 @@ const char *dm_device_name(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_device_name); -void dm_put(struct mapped_device *md) +static void __dm_destroy(struct mapped_device *md, bool wait) { struct dm_table *map; - BUG_ON(test_bit(DMF_FREEING, &md->flags)); + might_sleep(); - if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { - map = dm_get_live_table(md); - idr_replace(&_minor_idr, MINOR_ALLOCED, - MINOR(disk_devt(dm_disk(md)))); - set_bit(DMF_FREEING, &md->flags); - spin_unlock(&_minor_lock); - if (!dm_suspended_md(md)) { - dm_table_presuspend_targets(map); - dm_table_postsuspend_targets(map); - } - dm_sysfs_exit(md); - dm_table_put(map); - dm_table_destroy(__unbind(md)); - free_dev(md); + spin_lock(&_minor_lock); + map = dm_get_live_table(md); + idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); + set_bit(DMF_FREEING, &md->flags); + spin_unlock(&_minor_lock); + + if (!dm_suspended_md(md)) { + dm_table_presuspend_targets(map); + dm_table_postsuspend_targets(map); } + + /* + * Rare, but there may be I/O requests still going to complete, + * for example. Wait for all references to disappear. + * No one should increment the reference count of the mapped_device, + * after the mapped_device state becomes DMF_FREEING. + */ + if (wait) + while (atomic_read(&md->holders)) + msleep(1); + else if (atomic_read(&md->holders)) + DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", + dm_device_name(md), atomic_read(&md->holders)); + + dm_sysfs_exit(md); + dm_table_put(map); + dm_table_destroy(__unbind(md)); + free_dev(md); +} + +void dm_destroy(struct mapped_device *md) +{ + __dm_destroy(md, true); +} + +void dm_destroy_immediate(struct mapped_device *md) +{ + __dm_destroy(md, false); +} + +void dm_put(struct mapped_device *md) +{ + atomic_dec(&md->holders); } EXPORT_SYMBOL_GPL(dm_put); @@ -2225,38 +2286,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) return r; } -static void dm_flush(struct mapped_device *md) -{ - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); - - bio_init(&md->barrier_bio); - md->barrier_bio.bi_bdev = md->bdev; - md->barrier_bio.bi_rw = WRITE_BARRIER; - __split_and_process_bio(md, &md->barrier_bio); - - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); -} - -static void process_barrier(struct mapped_device *md, struct bio *bio) -{ - md->barrier_error = 0; - - dm_flush(md); - - if (!bio_empty_barrier(bio)) { - __split_and_process_bio(md, bio); - dm_flush(md); - } - - if (md->barrier_error != DM_ENDIO_REQUEUE) - bio_endio(bio, md->barrier_error); - else { - spin_lock_irq(&md->deferred_lock); - bio_list_add_head(&md->deferred, bio); - spin_unlock_irq(&md->deferred_lock); - } -} - /* * Process the deferred bios */ @@ -2266,33 +2295,27 @@ static void dm_wq_work(struct work_struct *work) work); struct bio *c; - down_write(&md->io_lock); + down_read(&md->io_lock); while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { spin_lock_irq(&md->deferred_lock); c = bio_list_pop(&md->deferred); spin_unlock_irq(&md->deferred_lock); - if (!c) { - clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); + if (!c) break; - } - up_write(&md->io_lock); + up_read(&md->io_lock); if (dm_request_based(md)) generic_make_request(c); - else { - if (bio_rw_flagged(c, BIO_RW_BARRIER)) - process_barrier(md, c); - else - __split_and_process_bio(md, c); - } + else + __split_and_process_bio(md, c); - down_write(&md->io_lock); + down_read(&md->io_lock); } - up_write(&md->io_lock); + up_read(&md->io_lock); } static void dm_queue_flush(struct mapped_device *md) @@ -2302,73 +2325,6 @@ static void dm_queue_flush(struct mapped_device *md) queue_work(md->wq, &md->work); } -static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr) -{ - struct dm_rq_target_io *tio = clone->end_io_data; - - tio->info.flush_request = flush_nr; -} - -/* Issue barrier requests to targets and wait for their completion. 
*/ -static int dm_rq_barrier(struct mapped_device *md) -{ - int i, j; - struct dm_table *map = dm_get_live_table(md); - unsigned num_targets = dm_table_get_num_targets(map); - struct dm_target *ti; - struct request *clone; - - md->barrier_error = 0; - - for (i = 0; i < num_targets; i++) { - ti = dm_table_get_target(map, i); - for (j = 0; j < ti->num_flush_requests; j++) { - clone = clone_rq(md->flush_request, md, GFP_NOIO); - dm_rq_set_flush_nr(clone, j); - atomic_inc(&md->pending[rq_data_dir(clone)]); - map_request(ti, clone, md); - } - } - - dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); - dm_table_put(map); - - return md->barrier_error; -} - -static void dm_rq_barrier_work(struct work_struct *work) -{ - int error; - struct mapped_device *md = container_of(work, struct mapped_device, - barrier_work); - struct request_queue *q = md->queue; - struct request *rq; - unsigned long flags; - - /* - * Hold the md reference here and leave it at the last part so that - * the md can't be deleted by device opener when the barrier request - * completes. - */ - dm_get(md); - - error = dm_rq_barrier(md); - - rq = md->flush_request; - md->flush_request = NULL; - - if (error == DM_ENDIO_REQUEUE) { - spin_lock_irqsave(q->queue_lock, flags); - blk_requeue_request(q, rq); - spin_unlock_irqrestore(q->queue_lock, flags); - } else - blk_end_request_all(rq, error); - - blk_run_queue(q); - - dm_put(md); -} - /* * Swap in a new table, returning the old one for the caller to destroy. */ @@ -2390,13 +2346,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) goto out; } - /* cannot change the device type, once a table is bound */ - if (md->map && - (dm_table_get_type(md->map) != dm_table_get_type(table))) { - DMWARN("can't change the device type after a table is bound"); - goto out; - } - map = __bind(md, table, &limits); out: @@ -2498,23 +2447,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * * To get all processes out of __split_and_process_bio in dm_request, * we take the write lock. To prevent any process from reentering - * __split_and_process_bio from dm_request, we set - * DMF_QUEUE_IO_TO_THREAD. - * - * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND - * and call flush_workqueue(md->wq). flush_workqueue will wait until - * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any - * further calls to __split_and_process_bio from dm_wq_work. + * __split_and_process_bio from dm_request and quiesce the thread + * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call + * flush_workqueue(md->wq). */ down_write(&md->io_lock); set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); up_write(&md->io_lock); /* - * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which - * can be kicked until md->queue is stopped. So stop md->queue before - * flushing md->wq. + * Stop md->queue before flushing md->wq in case request-based + * dm defers requests to md->wq from md->queue. */ if (dm_request_based(md)) stop_queue(md->queue); @@ -2605,18 +2548,19 @@ out: /*----------------------------------------------------------------- * Event notification. 
*---------------------------------------------------------------*/ -void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, +int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned cookie) { char udev_cookie[DM_COOKIE_LENGTH]; char *envp[] = { udev_cookie, NULL }; if (!cookie) - kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); else { snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", DM_COOKIE_ENV_VAR_NAME, cookie); - kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); + return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, + action, envp); } } @@ -2686,23 +2630,13 @@ int dm_suspended_md(struct mapped_device *md) int dm_suspended(struct dm_target *ti) { - struct mapped_device *md = dm_table_get_md(ti->table); - int r = dm_suspended_md(md); - - dm_put(md); - - return r; + return dm_suspended_md(dm_table_get_md(ti->table)); } EXPORT_SYMBOL_GPL(dm_suspended); int dm_noflush_suspending(struct dm_target *ti) { - struct mapped_device *md = dm_table_get_md(ti->table); - int r = __noflush_suspending(md); - - dm_put(md); - - return r; + return __noflush_suspending(dm_table_get_md(ti->table)); } EXPORT_SYMBOL_GPL(dm_noflush_suspending); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 8dadaa5bc396..0c2dd5f4af76 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -59,13 +59,20 @@ void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); int dm_table_any_busy_target(struct dm_table *t); -int dm_table_set_type(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_discards(struct dm_table *t); int dm_table_alloc_md_mempools(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); +void dm_lock_md_type(struct mapped_device *md); +void dm_unlock_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, unsigned type); +unsigned dm_get_md_type(struct mapped_device *md); + +int dm_setup_md_queue(struct mapped_device *md); + /* * To check the return value from dm_table_find_target(). 
*/ @@ -122,11 +129,16 @@ void dm_linear_exit(void); int dm_stripe_init(void); void dm_stripe_exit(void); +/* + * mapped_device operations + */ +void dm_destroy(struct mapped_device *md); +void dm_destroy_immediate(struct mapped_device *md); int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md); -void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, - unsigned cookie); +int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie); int dm_io_init(void); void dm_io_exit(void); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 713acd02ab39..1a8987884614 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -64,6 +64,7 @@ #define MaxFault 50 #include <linux/blkdev.h> #include <linux/raid/md_u.h> +#include <linux/slab.h> #include "md.h" #include <linux/seq_file.h> @@ -168,10 +169,9 @@ static void add_sector(conf_t *conf, sector_t start, int mode) conf->nfaults = n+1; } -static int make_request(struct request_queue *q, struct bio *bio) +static int make_request(mddev_t *mddev, struct bio *bio) { - mddev_t *mddev = q->queuedata; - conf_t *conf = (conf_t*)mddev->private; + conf_t *conf = mddev->private; int failit = 0; if (bio_data_dir(bio) == WRITE) { @@ -224,7 +224,7 @@ static int make_request(struct request_queue *q, struct bio *bio) static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = (conf_t*)mddev->private; + conf_t *conf = mddev->private; int n; if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) @@ -327,7 +327,7 @@ static int run(mddev_t *mddev) static int stop(mddev_t *mddev) { - conf_t *conf = (conf_t *)mddev->private; + conf_t *conf = mddev->private; kfree(conf); mddev->private = NULL; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 00435bd20699..8a2f767f26d8 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -19,6 +19,7 @@ #include <linux/blkdev.h> #include <linux/raid/md_u.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include "md.h" #include "linear.h" @@ -158,7 +159,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) sector_t sectors; if (j < 0 || j >= raid_disks || disk->rdev) { - printk("linear: disk numbering problem. Aborting!\n"); + printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); goto out; } @@ -172,19 +174,22 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + * violating it, so limit max_segments to 1 lying within + * a single page. */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } conf->array_sectors += rdev->sectors; cnt++; } if (cnt != raid_disks) { - printk("linear: not enough drives present. Aborting!\n"); + printk(KERN_ERR "md/linear:%s: not enough drives present. 
Aborting!\n", + mdname(mddev)); goto out; } @@ -279,29 +284,21 @@ static int linear_stop (mddev_t *mddev) rcu_barrier(); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); + mddev->private = NULL; return 0; } -static int linear_make_request (struct request_queue *q, struct bio *bio) +static int linear_make_request (mddev_t *mddev, struct bio *bio) { - const int rw = bio_data_dir(bio); - mddev_t *mddev = q->queuedata; dev_info_t *tmp_dev; sector_t start_sector; - int cpu; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - rcu_read_lock(); tmp_dev = which_dev(mddev, bio->bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; @@ -311,12 +308,14 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) || (bio->bi_sector < start_sector))) { char b[BDEVNAME_SIZE]; - printk("linear_make_request: Sector %llu out of bounds on " - "dev %s: %llu sectors, offset %llu\n", - (unsigned long long)bio->bi_sector, - bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->rdev->sectors, - (unsigned long long)start_sector); + printk(KERN_ERR + "md/linear:%s: make_request: Sector %llu out of bounds on " + "dev %s: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_sector, + bdevname(tmp_dev->rdev->bdev, b), + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); rcu_read_unlock(); bio_io_error(bio); return 0; @@ -333,9 +332,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) bp = bio_split(bio, end_sector - bio->bi_sector); - if (linear_make_request(q, &bp->bio1)) + if (linear_make_request(mddev, &bp->bio1)) generic_make_request(&bp->bio1); - if (linear_make_request(q, &bp->bio2)) + if (linear_make_request(mddev, &bp->bio2)) generic_make_request(&bp->bio2); bio_pair_release(bp); return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index dd3dfe42d5a9..225815197a3d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -36,6 +36,7 @@ #include <linux/blkdev.h> #include <linux/sysctl.h> #include <linux/seq_file.h> +#include <linux/mutex.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/poll.h> #include <linux/ctype.h> @@ -49,12 +50,14 @@ #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> +#include <linux/slab.h> #include "md.h" #include "bitmap.h" #define DEBUG 0 #define dprintk(x...) 
((void)(DEBUG && printk(x))) +static DEFINE_MUTEX(md_mutex); #ifndef MODULE static void autostart_arrays(int part); @@ -214,19 +217,22 @@ static DEFINE_SPINLOCK(all_mddevs_lock); */ static int md_make_request(struct request_queue *q, struct bio *bio) { + const int rw = bio_data_dir(bio); mddev_t *mddev = q->queuedata; int rv; + int cpu; + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return 0; } rcu_read_lock(); - if (mddev->suspended || mddev->barrier) { + if (mddev->suspended) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended && !mddev->barrier) + if (!mddev->suspended) break; rcu_read_unlock(); schedule(); @@ -236,72 +242,70 @@ static int md_make_request(struct request_queue *q, struct bio *bio) } atomic_inc(&mddev->active_io); rcu_read_unlock(); - rv = mddev->pers->make_request(q, bio); + + rv = mddev->pers->make_request(mddev, bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) wake_up(&mddev->sb_wait); return rv; } -static void mddev_suspend(mddev_t *mddev) +/* mddev_suspend makes sure no new requests are submitted + * to the device, and that any requests that have been submitted + * are completely handled. + * Once ->stop is called and completes, the module will be completely + * unused. + */ +void mddev_suspend(mddev_t *mddev) { BUG_ON(mddev->suspended); mddev->suspended = 1; synchronize_rcu(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); - md_unregister_thread(mddev->thread); - mddev->thread = NULL; - /* we now know that no code is executing in the personality module, - * except possibly the tail end of a ->bi_end_io function, but that - * is certain to complete before the module has a chance to get - * unloaded - */ } +EXPORT_SYMBOL_GPL(mddev_suspend); -static void mddev_resume(mddev_t *mddev) +void mddev_resume(mddev_t *mddev) { mddev->suspended = 0; wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 0); } +EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(mddev_t *mddev, int bits) { - if (mddev->barrier) - return 1; return mddev->suspended; } EXPORT_SYMBOL(mddev_congested); /* - * Generic barrier handling for md + * Generic flush handling for md */ -#define POST_REQUEST_BARRIER ((void*)1) - -static void md_end_barrier(struct bio *bio, int err) +static void md_end_flush(struct bio *bio, int err) { mdk_rdev_t *rdev = bio->bi_private; mddev_t *mddev = rdev->mddev; - if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) - set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { - if (mddev->barrier == POST_REQUEST_BARRIER) { - /* This was a post-request barrier */ - mddev->barrier = NULL; - wake_up(&mddev->sb_wait); - } else - /* The pre-request barrier has finished */ - schedule_work(&mddev->barrier_work); + /* The pre-request flush has finished */ + schedule_work(&mddev->flush_work); } bio_put(bio); } -static void submit_barriers(mddev_t *mddev) +static void submit_flushes(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -318,60 +322,101 @@ static void submit_barriers(mddev_t *mddev) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); bi = bio_alloc(GFP_KERNEL, 0); - bi->bi_end_io = md_end_barrier; + bi->bi_end_io = md_end_flush; bi->bi_private = rdev; 
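The suspend handshake in md_make_request()/mddev_suspend() above boils down to: writers enter only while not suspended and bump active_io, while the suspender first flips the flag and then waits for active_io to drain to zero. The sketch below models that in userspace with a mutex and condition variable standing in for the RCU grace period and the sb_wait waitqueue; every name is invented and the concurrency primitives are deliberately simplified.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct model_mddev {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	bool suspended;
	int active_io;
};

/* md_make_request() side: block while suspended, then account the I/O. */
static void model_make_request(struct model_mddev *m)
{
	pthread_mutex_lock(&m->lock);
	while (m->suspended)
		pthread_cond_wait(&m->wait, &m->lock);
	m->active_io++;
	pthread_mutex_unlock(&m->lock);

	/* ... personality handles the bio here ... */

	pthread_mutex_lock(&m->lock);
	if (--m->active_io == 0)
		pthread_cond_broadcast(&m->wait);   /* wake a waiting suspend */
	pthread_mutex_unlock(&m->lock);
}

/* mddev_suspend() side: stop new entries, then wait for in-flight I/O. */
static void model_suspend(struct model_mddev *m)
{
	pthread_mutex_lock(&m->lock);
	m->suspended = true;
	while (m->active_io > 0)
		pthread_cond_wait(&m->wait, &m->lock);
	pthread_mutex_unlock(&m->lock);
	printf("quiesced\n");
}

static void model_resume(struct model_mddev *m)
{
	pthread_mutex_lock(&m->lock);
	m->suspended = false;
	pthread_cond_broadcast(&m->wait);           /* release blocked writers */
	pthread_mutex_unlock(&m->lock);
}

int main(void)
{
	struct model_mddev m = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false, 0
	};

	model_make_request(&m);
	model_suspend(&m);
	model_resume(&m);
	model_make_request(&m);
	return 0;
}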
bi->bi_bdev = rdev->bdev; atomic_inc(&mddev->flush_pending); - submit_bio(WRITE_BARRIER, bi); + submit_bio(WRITE_FLUSH, bi); rcu_read_lock(); rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); } -static void md_submit_barrier(struct work_struct *ws) +static void md_submit_flush_data(struct work_struct *ws) { - mddev_t *mddev = container_of(ws, mddev_t, barrier_work); - struct bio *bio = mddev->barrier; + mddev_t *mddev = container_of(ws, mddev_t, flush_work); + struct bio *bio = mddev->flush_bio; atomic_set(&mddev->flush_pending, 1); - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - bio_endio(bio, -EOPNOTSUPP); - else if (bio->bi_size == 0) + if (bio->bi_size == 0) /* an empty barrier - all done */ bio_endio(bio, 0); else { - bio->bi_rw &= ~(1<<BIO_RW_BARRIER); - if (mddev->pers->make_request(mddev->queue, bio)) + bio->bi_rw &= ~REQ_FLUSH; + if (mddev->pers->make_request(mddev, bio)) generic_make_request(bio); - mddev->barrier = POST_REQUEST_BARRIER; - submit_barriers(mddev); } if (atomic_dec_and_test(&mddev->flush_pending)) { - mddev->barrier = NULL; + mddev->flush_bio = NULL; wake_up(&mddev->sb_wait); } } -void md_barrier_request(mddev_t *mddev, struct bio *bio) +void md_flush_request(mddev_t *mddev, struct bio *bio) { spin_lock_irq(&mddev->write_lock); wait_event_lock_irq(mddev->sb_wait, - !mddev->barrier, + !mddev->flush_bio, mddev->write_lock, /*nothing*/); - mddev->barrier = bio; + mddev->flush_bio = bio; spin_unlock_irq(&mddev->write_lock); atomic_set(&mddev->flush_pending, 1); - INIT_WORK(&mddev->barrier_work, md_submit_barrier); + INIT_WORK(&mddev->flush_work, md_submit_flush_data); - submit_barriers(mddev); + submit_flushes(mddev); if (atomic_dec_and_test(&mddev->flush_pending)) - schedule_work(&mddev->barrier_work); + schedule_work(&mddev->flush_work); +} +EXPORT_SYMBOL(md_flush_request); + +/* Support for plugging. 
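The flush_pending counter used by md_flush_request()/submit_flushes() starts at 1 so that the follow-up work cannot fire until every per-device flush has been issued and that initial reference is dropped. Here is a self-contained userspace model of that bias pattern; threads stand in for bio completions and all names are invented.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int flush_pending;

static void flush_done(void)            /* models schedule_work(&flush_work) */
{
	printf("all device flushes finished, submitting data\n");
}

static void *device_flush(void *arg)    /* models md_end_flush() */
{
	(void)arg;
	usleep(1000);                   /* pretend the flush took a while */
	if (atomic_fetch_sub(&flush_pending, 1) == 1)
		flush_done();
	return NULL;
}

int main(void)
{
	pthread_t tid[3];
	int i;

	atomic_store(&flush_pending, 1);          /* the initial bias */

	for (i = 0; i < 3; i++) {                 /* models submit_flushes() */
		atomic_fetch_add(&flush_pending, 1);
		pthread_create(&tid[i], NULL, device_flush, NULL);
	}

	/* Only now drop the bias; without it a fast completion above could
	 * have hit zero before all flushes were even issued. */
	if (atomic_fetch_sub(&flush_pending, 1) == 1)
		flush_done();

	for (i = 0; i < 3; i++)
		pthread_join(tid[i], NULL);
	return 0;
}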
+ * This mirrors the plugging support in request_queue, but does not + * require having a whole queue + */ +static void plugger_work(struct work_struct *work) +{ + struct plug_handle *plug = + container_of(work, struct plug_handle, unplug_work); + plug->unplug_fn(plug); } -EXPORT_SYMBOL(md_barrier_request); +static void plugger_timeout(unsigned long data) +{ + struct plug_handle *plug = (void *)data; + kblockd_schedule_work(NULL, &plug->unplug_work); +} +void plugger_init(struct plug_handle *plug, + void (*unplug_fn)(struct plug_handle *)) +{ + plug->unplug_flag = 0; + plug->unplug_fn = unplug_fn; + init_timer(&plug->unplug_timer); + plug->unplug_timer.function = plugger_timeout; + plug->unplug_timer.data = (unsigned long)plug; + INIT_WORK(&plug->unplug_work, plugger_work); +} +EXPORT_SYMBOL_GPL(plugger_init); + +void plugger_set_plug(struct plug_handle *plug) +{ + if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) + mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); +} +EXPORT_SYMBOL_GPL(plugger_set_plug); + +int plugger_remove_plug(struct plug_handle *plug) +{ + if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { + del_timer(&plug->unplug_timer); + return 1; + } else + return 0; +} +EXPORT_SYMBOL_GPL(plugger_remove_plug); + static inline mddev_t *mddev_get(mddev_t *mddev) { @@ -405,6 +450,28 @@ static void mddev_put(mddev_t *mddev) spin_unlock(&all_mddevs_lock); } +void mddev_init(mddev_t *mddev) +{ + mutex_init(&mddev->open_mutex); + mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->bitmap_info.mutex); + INIT_LIST_HEAD(&mddev->disks); + INIT_LIST_HEAD(&mddev->all_mddevs); + init_timer(&mddev->safemode_timer); + atomic_set(&mddev->active, 1); + atomic_set(&mddev->openers, 0); + atomic_set(&mddev->active_io, 0); + spin_lock_init(&mddev->write_lock); + atomic_set(&mddev->flush_pending, 0); + init_waitqueue_head(&mddev->sb_wait); + init_waitqueue_head(&mddev->recovery_wait); + mddev->reshape_position = MaxSector; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->level = LEVEL_NONE; +} +EXPORT_SYMBOL_GPL(mddev_init); + static mddev_t * mddev_find(dev_t unit) { mddev_t *mddev, *new = NULL; @@ -471,23 +538,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; - mutex_init(&new->open_mutex); - mutex_init(&new->reconfig_mutex); - mutex_init(&new->bitmap_info.mutex); - INIT_LIST_HEAD(&new->disks); - INIT_LIST_HEAD(&new->all_mddevs); - init_timer(&new->safemode_timer); - atomic_set(&new->active, 1); - atomic_set(&new->openers, 0); - atomic_set(&new->active_io, 0); - spin_lock_init(&new->write_lock); - atomic_set(&new->flush_pending, 0); - init_waitqueue_head(&new->sb_wait); - init_waitqueue_head(&new->recovery_wait); - new->reshape_position = MaxSector; - new->resync_min = 0; - new->resync_max = MaxSector; - new->level = LEVEL_NONE; + mddev_init(new); goto retry; } @@ -507,9 +558,42 @@ static inline int mddev_trylock(mddev_t * mddev) return mutex_trylock(&mddev->reconfig_mutex); } -static inline void mddev_unlock(mddev_t * mddev) +static struct attribute_group md_redundancy_group; + +static void mddev_unlock(mddev_t * mddev) { - mutex_unlock(&mddev->reconfig_mutex); + if (mddev->to_remove) { + /* These cannot be removed under reconfig_mutex as + * an access to the files will try to take reconfig_mutex + * while holding the file unremovable, which leads to + * a deadlock. 
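The plug_handle helpers introduced above coalesce unplug work: only the first plugger_set_plug() after a clear arms the deferred unplug, and plugger_remove_plug() reports whether it actually cancelled one. A minimal userspace model of that test-and-set behaviour, with the timer and work queue collapsed into a counter and a direct call; the names are invented for the sketch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct model_plug {
	atomic_bool plugged;
	int timers_armed;
	void (*unplug_fn)(struct model_plug *plug);
};

static void model_plugger_set_plug(struct model_plug *plug)
{
	/* test_and_set_bit(): only the first caller arms the timer */
	if (!atomic_exchange(&plug->plugged, true))
		plug->timers_armed++;
}

static int model_plugger_remove_plug(struct model_plug *plug)
{
	/* test_and_clear_bit(): report whether we actually cancelled a plug */
	return atomic_exchange(&plug->plugged, false) ? 1 : 0;
}

static void model_unplug(struct model_plug *plug)
{
	(void)plug;
	printf("unplug: flushing queued work\n");
}

int main(void)
{
	struct model_plug plug = { false, 0, model_unplug };

	/* many submissions, one armed "timer" */
	model_plugger_set_plug(&plug);
	model_plugger_set_plug(&plug);
	model_plugger_set_plug(&plug);
	printf("timers armed: %d\n", plug.timers_armed);   /* 1 */

	if (model_plugger_remove_plug(&plug))
		plug.unplug_fn(&plug);                     /* explicit unplug */

	model_plugger_set_plug(&plug);                     /* can re-arm now */
	printf("timers armed: %d\n", plug.timers_armed);   /* 2 */
	return 0;
}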
+ * So hold set sysfs_active while the remove in happeing, + * and anything else which might set ->to_remove or my + * otherwise change the sysfs namespace will fail with + * -EBUSY if sysfs_active is still set. + * We set sysfs_active under reconfig_mutex and elsewhere + * test it under the same mutex to ensure its correct value + * is seen. + */ + struct attribute_group *to_remove = mddev->to_remove; + mddev->to_remove = NULL; + mddev->sysfs_active = 1; + mutex_unlock(&mddev->reconfig_mutex); + + if (mddev->kobj.sd) { + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } + } + mddev->sysfs_active = 0; + } else + mutex_unlock(&mddev->reconfig_mutex); md_wakeup_thread(mddev->thread); } @@ -598,31 +682,6 @@ static void super_written(struct bio *bio, int error) bio_put(bio); } -static void super_written_barrier(struct bio *bio, int error) -{ - struct bio *bio2 = bio->bi_private; - mdk_rdev_t *rdev = bio2->bi_private; - mddev_t *mddev = rdev->mddev; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && - error == -EOPNOTSUPP) { - unsigned long flags; - /* barriers don't appear to be supported :-( */ - set_bit(BarriersNotsupp, &rdev->flags); - mddev->barriers_work = 0; - spin_lock_irqsave(&mddev->write_lock, flags); - bio2->bi_next = mddev->biolist; - mddev->biolist = bio2; - spin_unlock_irqrestore(&mddev->write_lock, flags); - wake_up(&mddev->sb_wait); - bio_put(bio); - } else { - bio_put(bio2); - bio->bi_private = rdev; - super_written(bio, error); - } -} - void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page) { @@ -631,51 +690,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error - * - * As we might need to resubmit the request if BIO_RW_BARRIER - * causes ENOTSUPP, we allocate a spare bio... */ struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); bio->bi_bdev = rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio->bi_rw = rw; atomic_inc(&mddev->pending_writes); - if (!test_bit(BarriersNotsupp, &rdev->flags)) { - struct bio *rbio; - rw |= (1<<BIO_RW_BARRIER); - rbio = bio_clone(bio, GFP_NOIO); - rbio->bi_private = bio; - rbio->bi_end_io = super_written_barrier; - submit_bio(rw, rbio); - } else - submit_bio(rw, bio); + submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA, + bio); } void md_super_wait(mddev_t *mddev) { - /* wait for all superblock writes that were scheduled to complete. 
- * if any had to be retried (due to BARRIER problems), retry them - */ + /* wait for all superblock writes that were scheduled to complete */ DEFINE_WAIT(wq); for(;;) { prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); if (atomic_read(&mddev->pending_writes)==0) break; - while (mddev->biolist) { - struct bio *bio; - spin_lock_irq(&mddev->write_lock); - bio = mddev->biolist; - mddev->biolist = bio->bi_next ; - bio->bi_next = NULL; - spin_unlock_irq(&mddev->write_lock); - submit_bio(bio->bi_rw, bio); - } schedule(); } finish_wait(&mddev->sb_wait, &wq); @@ -693,7 +729,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct completion event; int ret; - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; bio->bi_bdev = bdev; bio->bi_sector = sector; @@ -972,7 +1008,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; @@ -1028,10 +1063,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->bitmap_info.default_offset; } else if (mddev->pers == NULL) { - /* Insist on good event counter while assembling */ + /* Insist on good event counter while assembling, except + * for spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (sb->disks[rdev->desc_nr].state & ( + (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. @@ -1384,7 +1422,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; @@ -1427,10 +1464,14 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } } else if (mddev->pers == NULL) { - /* Insist of good event counter while assembling */ + /* Insist of good event counter while assembling, except for + * spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old. 
@@ -1538,7 +1579,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; - } + } else + max_dev = le32_to_cpu(sb->max_dev); + for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1761,11 +1804,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) goto fail; ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { - kobject_del(&rdev->kobj); - goto fail; - } - rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); + if (sysfs_create_link(&rdev->kobj, ko, "block")) + /* failure here is OK */; + rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); @@ -2033,20 +2074,10 @@ static void sync_sbs(mddev_t * mddev, int nospares) * with the rest of the array) */ mdk_rdev_t *rdev; - - /* First make sure individual recovery_offsets are correct */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - mddev->curr_resync_completed > rdev->recovery_offset) - rdev->recovery_offset = mddev->curr_resync_completed; - - } list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && - (rdev->sb_events&1)==0 && rdev->sb_events+1 == mddev->events)) { /* Don't update this superblock */ rdev->sb_loaded = 2; @@ -2064,13 +2095,27 @@ static void md_update_sb(mddev_t * mddev, int force_change) int sync_req; int nospares = 0; - mddev->utime = get_seconds(); - if (mddev->external) - return; repeat: + /* First make sure individual recovery_offsets are correct */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } + if (!mddev->persistent) { + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_DEVS, &mddev->flags); + wake_up(&mddev->sb_wait); + return; + } + spin_lock_irq(&mddev->write_lock); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + mddev->utime = get_seconds(); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) @@ -2099,22 +2144,14 @@ repeat: * and 'events' is odd, we can roll back to the previous clean state */ if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) - && (mddev->events & 1) - && mddev->events != 1) + && mddev->can_decrease_events + && mddev->events != 1) { mddev->events--; - else { + mddev->can_decrease_events = 0; + } else { /* otherwise we have to go forward and ... */ mddev->events ++; - if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. if the array isn't clean, an 'even' event must also go - * to spares. 
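The event-count bookkeeping in md_update_sb() now lets a clean, spare-less update roll the counter back by one, but only when the previous bump was itself optional, which is what can_decrease_events records. A standalone sketch of that rule follows; field and function names are invented, and the in_sync/recovery_cp test is reduced to two booleans.

#include <stdbool.h>
#include <stdio.h>

struct model_md {
	unsigned long long events;
	bool can_decrease_events;
	bool in_sync;
	bool resync_complete;
};

static void model_update_events(struct model_md *md, bool nospares)
{
	if (nospares && md->in_sync && md->resync_complete &&
	    md->can_decrease_events && md->events != 1) {
		/* undo a previous optional bump instead of dirtying spares */
		md->events--;
		md->can_decrease_events = false;
	} else {
		md->events++;
		/* the bump may be rolled back later only if it was optional */
		md->can_decrease_events = nospares;
	}
}

int main(void)
{
	struct model_md md = { 10, false, true, true };

	model_update_events(&md, true);   /* 11, rollback now allowed */
	model_update_events(&md, true);   /* back to 10 */
	model_update_events(&md, false);  /* real change: 11, no rollback */
	printf("events=%llu can_decrease=%d\n",
	       md.events, md.can_decrease_events);
	return 0;
}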
*/ - if ((mddev->events&1)==0) - nospares = 0; - } else { - /* otherwise an 'odd' event must go to spares */ - if ((mddev->events&1)) - nospares = 0; - } + mddev->can_decrease_events = nospares; } if (!mddev->events) { @@ -2126,19 +2163,6 @@ repeat: MD_BUG(); mddev->events --; } - - /* - * do not write anything to disk if using - * nonpersistent superblocks - */ - if (!mddev->persistent) { - if (!mddev->external) - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - - spin_unlock_irq(&mddev->write_lock); - wake_up(&mddev->sb_wait); - return; - } sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); @@ -2292,8 +2316,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(In_sync, &rdev->flags); err = 0; } - if (!err && rdev->sysfs_state) - sysfs_notify_dirent(rdev->sysfs_state); + if (!err) + sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = @@ -2358,6 +2382,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) return err; sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&rdev->mddev->kobj, nm); + rdev->raid_disk = -1; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { @@ -2387,14 +2412,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) rdev->raid_disk = -1; return err; } else - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(rdev->mddev)); - + /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks) @@ -2404,7 +2425,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } return len; } @@ -2642,7 +2663,7 @@ static void rdev_free(struct kobject *ko) mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); kfree(rdev); } -static struct sysfs_ops rdev_sysfs_ops = { +static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; @@ -2652,6 +2673,24 @@ static struct kobj_type rdev_ktype = { .default_attrs = rdev_default_attrs, }; +void md_rdev_init(mdk_rdev_t *rdev) +{ + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; + rdev->last_read_error.tv_sec = 0; + rdev->last_read_error.tv_nsec = 0; + atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); + atomic_set(&rdev->corrected_errors, 0); + + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); +} +EXPORT_SYMBOL_GPL(md_rdev_init); /* * Import a device. 
If 'super_format' >= 0, then sanity check the superblock * @@ -2675,6 +2714,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return ERR_PTR(-ENOMEM); } + md_rdev_init(rdev); if ((err = alloc_disk_sb(rdev))) goto abort_free; @@ -2684,18 +2724,6 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj, &rdev_ktype); - rdev->desc_nr = -1; - rdev->saved_raid_disk = -1; - rdev->raid_disk = -1; - rdev->flags = 0; - rdev->data_offset = 0; - rdev->sb_events = 0; - rdev->last_read_error.tv_sec = 0; - rdev->last_read_error.tv_nsec = 0; - atomic_set(&rdev->nr_pending, 0); - atomic_set(&rdev->read_errors, 0); - atomic_set(&rdev->corrected_errors, 0); - size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; if (!size) { printk(KERN_WARNING @@ -2724,9 +2752,6 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi } } - INIT_LIST_HEAD(&rdev->same_set); - init_waitqueue_head(&rdev->blocked_wait); - return rdev; abort_free: @@ -2773,8 +2798,9 @@ static void analyze_sbs(mddev_t * mddev) i = 0; rdev_for_each(rdev, tmp, mddev) { - if (rdev->desc_nr >= mddev->max_disks || - i > mddev->max_disks) { + if (mddev->max_disks && + (rdev->desc_nr >= mddev->max_disks || + i > mddev->max_disks)) { printk(KERN_WARNING "md: %s: %s: only %d devices permitted\n", mdname(mddev), bdevname(rdev->bdev, b), @@ -2890,9 +2916,10 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { - char level[16]; + char clevel[16]; ssize_t rv = len; struct mdk_personality *pers; + long level; void *priv; mdk_rdev_t *rdev; @@ -2915,7 +2942,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) * - new personality will access other array. */ - if (mddev->sync_thread || mddev->reshape_position != MaxSector) + if (mddev->sync_thread || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) return -EBUSY; if (!mddev->pers->quiesce) { @@ -2925,19 +2954,22 @@ level_store(mddev_t *mddev, const char *buf, size_t len) } /* Now find the new personality */ - if (len == 0 || len >= sizeof(level)) + if (len == 0 || len >= sizeof(clevel)) return -EINVAL; - strncpy(level, buf, len); - if (level[len-1] == '\n') + strncpy(clevel, buf, len); + if (clevel[len-1] == '\n') len--; - level[len] = 0; + clevel[len] = 0; + if (strict_strtol(clevel, 10, &level)) + level = LEVEL_NONE; - request_module("md-%s", level); + if (request_module("md-%s", clevel) != 0) + request_module("md-level-%s", clevel); spin_lock(&pers_lock); - pers = find_pers(LEVEL_NONE, level); + pers = find_pers(level, clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); - printk(KERN_WARNING "md: personality %s not loaded\n", level); + printk(KERN_WARNING "md: personality %s not loaded\n", clevel); return -EINVAL; } spin_unlock(&pers_lock); @@ -2950,10 +2982,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (!pers->takeover) { module_put(pers->owner); printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", - mdname(mddev), level); + mdname(mddev), clevel); return -EINVAL; } + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->new_raid_disk = rdev->raid_disk; + /* ->takeover must set new_* and/or delta_disks * if it succeeds, and may set them when it fails. 
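level_store() now keeps the written string as clevel and separately tries to interpret it as a numeric level, falling back to LEVEL_NONE, so find_pers() can match either form before the md-%s / md-level-%s module requests. Below is a userspace approximation of just that parsing step; the helper names and the LEVEL_NONE value used here are assumptions for the sketch, not the kernel definitions.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MODEL_LEVEL_NONE (-1000000L)

/* Rough userspace stand-in for strict_strtol(): whole string must parse. */
static int model_strtol(const char *s, long *res)
{
	char *end;

	errno = 0;
	*res = strtol(s, &end, 10);
	if (errno || end == s || *end != '\0')
		return -EINVAL;
	return 0;
}

static long model_parse_level(const char *buf, char *clevel, size_t clen)
{
	long level;
	size_t len = strlen(buf);

	if (len == 0 || len >= clen)
		return MODEL_LEVEL_NONE;
	strcpy(clevel, buf);
	if (clevel[len - 1] == '\n')
		clevel[len - 1] = '\0';

	if (model_strtol(clevel, &level))
		level = MODEL_LEVEL_NONE;   /* e.g. "raid5", "linear" */
	return level;
}

int main(void)
{
	char clevel[16];

	printf("%ld\n", model_parse_level("6\n", clevel, sizeof(clevel))); /* 6 */
	printf("%ld %s\n",
	       model_parse_level("raid10\n", clevel, sizeof(clevel)),
	       clevel);                     /* MODEL_LEVEL_NONE raid10 */
	return 0;
}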
*/ @@ -2966,20 +3001,73 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->delta_disks = 0; module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", - mdname(mddev), level); + mdname(mddev), clevel); return PTR_ERR(priv); } /* Looks like we have a winner */ mddev_suspend(mddev); mddev->pers->stop(mddev); - module_put(mddev->pers->owner); - /* Invalidate devices that are now superfluous */ - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= mddev->raid_disks) { - rdev->raid_disk = -1; + + if (mddev->pers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); + } + if (mddev->pers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + + if (mddev->pers->sync_request == NULL && + mddev->external) { + /* We are converting from a no-redundancy array + * to a redundancy array and metadata is managed + * externally so we need to be sure that writes + * won't block due to a need to transition + * clean->dirty + * until external management is started. + */ + mddev->in_sync = 0; + mddev->safemode_delay = 0; + mddev->safemode = 0; + } + + list_for_each_entry(rdev, &mddev->disks, same_set) { + char nm[20]; + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk > mddev->raid_disks) + rdev->new_raid_disk = -1; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + rdev->raid_disk = rdev->new_raid_disk; + if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); + else { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) + printk("md: cannot register %s for %s after level change\n", + nm, mdname(mddev)); } + } + + module_put(mddev->pers->owner); mddev->pers = pers; mddev->private = priv; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); @@ -2987,11 +3075,20 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; + if (mddev->pers->sync_request == NULL) { + /* this is now an array without redundancy, so + * it must always be in_sync + */ + mddev->in_sync = 1; + del_timer_sync(&mddev->safemode_timer); + } pers->run(mddev); mddev_resume(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + sysfs_notify(&mddev->kobj, NULL, "level"); + md_new_event(mddev); return rv; } @@ -3211,7 +3308,7 @@ array_state_show(mddev_t *mddev, char *page) case 0: if (mddev->in_sync) st = clean; - else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) st = write_pending; else if (mddev->safemode) st = active_idle; @@ -3230,6 +3327,7 @@ array_state_show(mddev_t *mddev, char *page) } static int do_md_stop(mddev_t * mddev, int ro, int is_open); +static int md_set_readonly(mddev_t * mddev, int is_open); static int 
do_md_run(mddev_t * mddev); static int restart_array(mddev_t *mddev); @@ -3260,7 +3358,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; /* not supported yet */ case readonly: if (mddev->pers) - err = do_md_stop(mddev, 1, 0); + err = md_set_readonly(mddev, 0); else { mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); @@ -3270,7 +3368,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case read_auto: if (mddev->pers) { if (mddev->ro == 0) - err = do_md_stop(mddev, 1, 0); + err = md_set_readonly(mddev, 0); else if (mddev->ro == 1) err = restart_array(mddev); if (err == 0) { @@ -3291,9 +3389,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) mddev->in_sync = 1; if (mddev->safemode == 1) mddev->safemode = 0; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, - &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } err = 0; } else @@ -3305,8 +3401,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - if (mddev->external) - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -3323,7 +3418,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (err) return err; else { - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); return len; } } @@ -3621,7 +3716,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); return len; } @@ -4059,7 +4154,7 @@ static void md_free(struct kobject *ko) kfree(mddev); } -static struct sysfs_ops md_sysfs_ops = { +static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; @@ -4075,13 +4170,6 @@ static void mddev_delayed_delete(struct work_struct *ws) { mddev_t *mddev = container_of(ws, mddev_t, del_work); - if (mddev->private == &md_redundancy_group) { - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; - mddev->private = NULL; - } sysfs_remove_group(&mddev->kobj, &md_bitmap_group); kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); @@ -4174,13 +4262,14 @@ static int md_alloc(dev_t dev, char *name) disk->disk_name); error = 0; } - if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_bitmap_group)) printk(KERN_DEBUG "pointless warning\n"); abort: mutex_unlock(&disks_mutex); - if (!error) { + if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); - mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); } mddev_put(mddev); return error; @@ -4218,18 +4307,17 @@ static void md_safemode_timeout(unsigned long data) if (!atomic_read(&mddev->writes_pending)) { mddev->safemode = 1; if (mddev->external) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } md_wakeup_thread(mddev->thread); } static int start_dirty_degraded; -static int do_md_run(mddev_t * mddev) +int md_run(mddev_t *mddev) { int err; mdk_rdev_t *rdev; - struct gendisk *disk; struct mdk_personality *pers; if (list_empty(&mddev->disks)) @@ -4238,6 +4326,9 @@ static int do_md_run(mddev_t * mddev) if (mddev->pers) return -EBUSY; + 
/* Cannot run until previous stop completes properly */ + if (mddev->sysfs_active) + return -EBUSY; /* * Analyze all RAID superblock(s) @@ -4284,14 +4375,9 @@ static int do_md_run(mddev_t * mddev) return -EINVAL; } } - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } - md_probe(mddev->unit, NULL, NULL); - disk = mddev->gendisk; - if (!disk) - return -ENOMEM; - spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -4354,7 +4440,6 @@ static int do_md_run(mddev_t * mddev) /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; - mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; if (start_readonly && mddev->ro == 0) @@ -4388,11 +4473,12 @@ static int do_md_run(mddev_t * mddev) return err; } if (mddev->pers->sync_request) { - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING "md: cannot register extra attributes for %s\n", mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; @@ -4410,8 +4496,7 @@ static int do_md_run(mddev_t * mddev) char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk("md: cannot register %s for %s\n", - nm, mdname(mddev)); + /* failure here is OK */; } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -4419,21 +4504,35 @@ static int do_md_run(mddev_t * mddev) if (mddev->flags) md_update_sb(mddev, 0); - set_capacity(disk, mddev->array_sectors); - md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - revalidate_disk(mddev->gendisk); - mddev->changed = 1; md_new_event(mddev); - sysfs_notify_dirent(mddev->sysfs_state); - if (mddev->sysfs_action) - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); sysfs_notify(&mddev->kobj, NULL, "degraded"); - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); return 0; } +EXPORT_SYMBOL_GPL(md_run); + +static int do_md_run(mddev_t *mddev) +{ + int err; + + err = md_run(mddev); + if (err) + goto out; + err = bitmap_load(mddev); + if (err) { + bitmap_destroy(mddev); + goto out; + } + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); +out: + return err; +} static int restart_array(mddev_t *mddev) { @@ -4455,7 +4554,7 @@ static int restart_array(mddev_t *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } @@ -4485,89 +4584,157 @@ void restore_bitmap_write_access(struct file *file) spin_unlock(&inode->i_lock); } +static void md_clean(mddev_t *mddev) +{ + mddev->array_sectors = 0; + mddev->external_size = 0; + mddev->dev_sectors = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->ro 
= 0; + mddev->metadata_type[0] = 0; + mddev->chunk_sectors = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->can_decrease_events = 0; + mddev->delta_disks = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + mddev->curr_resync = 0; + mddev->resync_mismatches = 0; + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->degraded = 0; + mddev->safemode = 0; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; + mddev->plug = NULL; +} + +void md_stop_writes(mddev_t *mddev) +{ + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + } + + del_timer_sync(&mddev->safemode_timer); + + bitmap_flush(mddev); + md_super_wait(mddev); + + if (!mddev->in_sync || mddev->flags) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev, 1); + } +} +EXPORT_SYMBOL_GPL(md_stop_writes); + +void md_stop(mddev_t *mddev) +{ + mddev->pers->stop(mddev); + if (mddev->pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(mddev->pers->owner); + mddev->pers = NULL; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +} +EXPORT_SYMBOL_GPL(md_stop); + +static int md_set_readonly(mddev_t *mddev, int is_open) +{ + int err = 0; + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > is_open) { + printk("md: %s still in use.\n",mdname(mddev)); + err = -EBUSY; + goto out; + } + if (mddev->pers) { + md_stop_writes(mddev); + + err = -ENXIO; + if (mddev->ro==1) + goto out; + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_state); + err = 0; + } +out: + mutex_unlock(&mddev->open_mutex); + return err; +} + /* mode: * 0 - completely stop and dis-assemble array - * 1 - switch to readonly * 2 - stop but do not disassemble array */ static int do_md_stop(mddev_t * mddev, int mode, int is_open) { - int err = 0; struct gendisk *disk = mddev->gendisk; mdk_rdev_t *rdev; mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open) { + if (atomic_read(&mddev->openers) > is_open || + mddev->sysfs_active) { printk("md: %s still in use.\n",mdname(mddev)); - err = -EBUSY; - } else if (mddev->pers) { + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - } + if (mddev->pers) { + if (mddev->ro) + set_disk_ro(disk, 0); - del_timer_sync(&mddev->safemode_timer); + md_stop_writes(mddev); + md_stop(mddev); + mddev->queue->merge_bvec_fn = NULL; + mddev->queue->unplug_fn = NULL; + mddev->queue->backing_dev_info.congested_fn = NULL; - switch(mode) { - case 1: /* readonly */ - err = -ENXIO; - if (mddev->ro==1) - goto out; - mddev->ro = 1; - break; - case 0: /* disassemble */ - case 2: /* stop */ - bitmap_flush(mddev); - md_super_wait(mddev); - if (mddev->ro) - set_disk_ro(disk, 0); + /* tell userspace to handle 'inactive' */ + sysfs_notify_dirent_safe(mddev->sysfs_state); - 
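do_md_stop() is being decomposed above into md_stop_writes(), md_stop() and the static md_clean(); a minimal sketch, assuming an external caller (such as a dm target) that owns no md gendisk, of how the two exported helpers are meant to compose:

/* Sketch only: teardown order implied by the reworked do_md_stop() below. */
static void example_array_teardown(mddev_t *mddev)
{
	md_stop_writes(mddev);	/* stop the sync thread, flush the bitmap,
				 * write out a clean superblock if needed */
	md_stop(mddev);		/* call the personality's ->stop() and drop
				 * its module reference */
	/* md_clean() stays static, so resetting the mddev fields remains
	 * do_md_stop()'s job for the full "mode == 0" stop. */
}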
mddev->pers->stop(mddev); - mddev->queue->merge_bvec_fn = NULL; - mddev->queue->unplug_fn = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; - module_put(mddev->pers->owner); - if (mddev->pers->sync_request) - mddev->private = &md_redundancy_group; - mddev->pers = NULL; - /* tell userspace to handle 'inactive' */ - sysfs_notify_dirent(mddev->sysfs_state); - - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } - set_capacity(disk, 0); - mddev->changed = 1; + set_capacity(disk, 0); + mutex_unlock(&mddev->open_mutex); + revalidate_disk(disk); - if (mddev->ro) - mddev->ro = 0; - } - if (!mddev->in_sync || mddev->flags) { - /* mark array as shutdown cleanly */ - mddev->in_sync = 1; - md_update_sb(mddev, 1); - } - if (mode == 1) - set_disk_ro(disk, 1); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - err = 0; - } -out: - mutex_unlock(&mddev->open_mutex); - if (err) - return err; + if (mddev->ro) + mddev->ro = 0; + } else + mutex_unlock(&mddev->open_mutex); /* * Free resources if final stop */ if (mode == 0) { - printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); @@ -4578,62 +4745,17 @@ out: } mddev->bitmap_info.offset = 0; - /* make sure all md_delayed_delete calls have finished */ - flush_scheduled_work(); - export_array(mddev); - mddev->array_sectors = 0; - mddev->external_size = 0; - mddev->dev_sectors = 0; - mddev->raid_disks = 0; - mddev->recovery_cp = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - mddev->reshape_position = MaxSector; - mddev->external = 0; - mddev->persistent = 0; - mddev->level = LEVEL_NONE; - mddev->clevel[0] = 0; - mddev->flags = 0; - mddev->ro = 0; - mddev->metadata_type[0] = 0; - mddev->chunk_sectors = 0; - mddev->ctime = mddev->utime = 0; - mddev->layout = 0; - mddev->max_disks = 0; - mddev->events = 0; - mddev->delta_disks = 0; - mddev->new_level = LEVEL_NONE; - mddev->new_layout = 0; - mddev->new_chunk_sectors = 0; - mddev->curr_resync = 0; - mddev->resync_mismatches = 0; - mddev->suspend_lo = mddev->suspend_hi = 0; - mddev->sync_speed_min = mddev->sync_speed_max = 0; - mddev->recovery = 0; - mddev->in_sync = 0; - mddev->changed = 0; - mddev->degraded = 0; - mddev->barriers_work = 0; - mddev->safemode = 0; - mddev->bitmap_info.offset = 0; - mddev->bitmap_info.default_offset = 0; - mddev->bitmap_info.chunksize = 0; - mddev->bitmap_info.daemon_sleep = 0; - mddev->bitmap_info.max_write_behind = 0; + md_clean(mddev); kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; - - } else if (mddev->pers) - printk(KERN_INFO "md: %s switched to read-only mode.\n", - mdname(mddev)); - err = 0; + } blk_integrity_unregister(disk); md_new_event(mddev); - sysfs_notify_dirent(mddev->sysfs_state); - return err; + sysfs_notify_dirent_safe(mddev->sysfs_state); + return 0; } #ifndef MODULE @@ -4994,7 +5116,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (err) export_rdev(rdev); else - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); md_update_sb(mddev, 1); if (mddev->degraded) @@ -5187,8 +5309,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd) err = 0; if (mddev->pers) { 
mddev->pers->quiesce(mddev, 1); - if (fd >= 0) + if (fd >= 0) { err = bitmap_create(mddev); + if (!err) + err = bitmap_load(mddev); + } if (fd < 0 || err) { bitmap_destroy(mddev); fd = -1; /* make sure to put the file */ @@ -5346,7 +5471,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) if (mddev->pers->check_reshape == NULL) return -EINVAL; if (raid_disks <= 0 || - raid_disks >= mddev->max_disks) + (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; @@ -5437,6 +5562,8 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); if (rv) bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); @@ -5483,7 +5610,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) geo->heads = 2; geo->sectors = 4; - geo->cylinders = get_capacity(mddev->gendisk) / 8; + geo->cylinders = mddev->array_sectors / 8; return 0; } @@ -5493,6 +5620,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, int err = 0; void __user *argp = (void __user *)arg; mddev_t *mddev = NULL; + int ro; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -5625,9 +5753,37 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto done_unlock; case STOP_ARRAY_RO: - err = do_md_stop(mddev, 1, 1); + err = md_set_readonly(mddev, 1); goto done_unlock; + case BLKROSET: + if (get_user(ro, (int __user *)(arg))) { + err = -EFAULT; + goto done_unlock; + } + err = -EINVAL; + + /* if the bdev is going readonly the value of mddev->ro + * does not matter, no writes are coming + */ + if (ro) + goto done_unlock; + + /* are we are already prepared for writes? */ + if (mddev->ro != 1) + goto done_unlock; + + /* transitioning to readauto need only happen for + * arrays that call md_write_start + */ + if (mddev->pers) { + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); + } + } + goto done_unlock; } /* @@ -5640,7 +5796,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { if (mddev->ro == 2) { mddev->ro = 0; - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } else { @@ -5729,6 +5885,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) mddev_t *mddev = mddev_find(bdev->bd_dev); int err; + mutex_lock(&md_mutex); if (mddev->gendisk != bdev->bd_disk) { /* we are racing with mddev_put which is discarding this * bd_disk. 
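The new BLKROSET branch above depends on the three-valued mddev->ro, which this hunk never spells out; the summary and the user-space trigger below are assumptions based on how the values are used here.

/*
 * Assumed meaning of mddev->ro in the BLKROSET handling above:
 *   0 - array is read-write
 *   1 - array is read-only
 *   2 - "read-auto": reads are served, and the first write
 *       (via md_write_start()) switches the array back to read-write
 */
#include <sys/ioctl.h>
#include <linux/fs.h>

/* Hypothetical user-space helper: clearing the block-device read-only flag
 * on an md device now lands in md_ioctl() -> restart_array() above. */
static int md_clear_readonly(int md_fd)
{
	int ro = 0;				/* 0 = make writable */
	return ioctl(md_fd, BLKROSET, &ro);
}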
@@ -5737,6 +5894,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) /* Wait until bdev->bd_disk is definitely gone */ flush_scheduled_work(); /* Then retry the open from the top */ + mutex_unlock(&md_mutex); return -ERESTARTSYS; } BUG_ON(mddev != bdev->bd_disk->private_data); @@ -5748,8 +5906,9 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); - check_disk_change(bdev); + check_disk_size_change(mddev->gendisk, bdev); out: + mutex_unlock(&md_mutex); return err; } @@ -5758,26 +5917,13 @@ static int md_release(struct gendisk *disk, fmode_t mode) mddev_t *mddev = disk->private_data; BUG_ON(!mddev); + mutex_lock(&md_mutex); atomic_dec(&mddev->openers); mddev_put(mddev); + mutex_unlock(&md_mutex); return 0; } - -static int md_media_changed(struct gendisk *disk) -{ - mddev_t *mddev = disk->private_data; - - return mddev->changed; -} - -static int md_revalidate(struct gendisk *disk) -{ - mddev_t *mddev = disk->private_data; - - mddev->changed = 0; - return 0; -} static const struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -5788,8 +5934,6 @@ static const struct block_device_operations md_fops = .compat_ioctl = md_compat_ioctl, #endif .getgeo = md_getgeo, - .media_changed = md_media_changed, - .revalidate_disk= md_revalidate, }; static int md_thread(void * arg) @@ -5903,10 +6047,12 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(StateChanged, &rdev->flags); + sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + if (mddev->event_work.func) + schedule_work(&mddev->event_work); md_new_event_inintr(mddev); } @@ -6358,15 +6504,15 @@ void md_write_start(mddev_t *mddev, struct bio *bi) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); md_wakeup_thread(mddev->thread); did_change = 1; } spin_unlock_irq(&mddev->write_lock); } if (did_change) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } @@ -6402,22 +6548,31 @@ int md_allow_write(mddev_t *mddev) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock_irq(&mddev->write_lock); - if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; else return 0; } EXPORT_SYMBOL_GPL(md_allow_write); +void md_unplug(mddev_t *mddev) +{ + if (mddev->queue) + blk_unplug(mddev->queue); + if (mddev->plug) + mddev->plug->unplug_fn(mddev->plug); +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) void md_do_sync(mddev_t *mddev) @@ -6596,7 +6751,7 @@ void md_do_sync(mddev_t *mddev) >= mddev->resync_max - mddev->curr_resync_completed )) { /* time to update curr_resync_completed */ - blk_unplug(mddev->queue); + md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = 
@@ -6673,7 +6828,7 @@ void md_do_sync(mddev_t *mddev) * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ - blk_unplug(mddev->queue); + md_unplug(mddev); cond_resched(); currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 @@ -6692,7 +6847,7 @@ void md_do_sync(mddev_t *mddev) * this also signals 'finished resyncing' to md_stop */ out: - blk_unplug(mddev->queue); + md_unplug(mddev); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); @@ -6717,6 +6872,7 @@ void md_do_sync(mddev_t *mddev) rcu_read_lock(); list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) @@ -6793,10 +6949,7 @@ static int remove_and_add_spares(mddev_t *mddev) sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(mddev)); + /* failure here is OK */; spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -6852,7 +7005,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags && !mddev->external) || + (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || @@ -6882,24 +7035,18 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; did_change = 1; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); if (did_change) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } if (mddev->flags) md_update_sb(mddev, 0); - list_for_each_entry(rdev, &mddev->disks, same_set) - if (test_and_clear_bit(StateChanged, &rdev->flags)) - sysfs_notify_dirent(rdev->sysfs_state); - - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { /* resync/recovery still happening */ @@ -6933,7 +7080,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); goto unlock; } @@ -6995,7 +7142,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; } else md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); } unlock: @@ -7004,7 +7151,7 @@ void md_check_recovery(mddev_t *mddev) if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) if (mddev->sysfs_action) - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); } mddev_unlock(mddev); } @@ -7012,7 +7159,7 @@ void md_check_recovery(mddev_t *mddev) void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, !test_bit(Blocked, &rdev->flags), msecs_to_jiffies(5000)); @@ -7036,7 +7183,7 @@ static int md_notify_reboot(struct notifier_block 
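The reworked wakeup test in md_check_recovery() above masks out MD_CHANGE_PENDING; a short worked example, using the bit numbers visible in the md.h hunk below:

/*
 * With MD_CHANGE_DEVS = 0, MD_CHANGE_CLEAN = 1, MD_CHANGE_PENDING = 2:
 *
 *   flags == (1 << MD_CHANGE_PENDING)
 *       -> flags & ~(1 << MD_CHANGE_PENDING) == 0
 *       -> a pending clean->active switch alone no longer counts as work here
 *
 *   flags == (1 << MD_CHANGE_DEVS) | (1 << MD_CHANGE_PENDING)
 *       -> masked value == (1 << MD_CHANGE_DEVS) != 0
 *       -> a device change still drives md_update_sb() as before
 */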
*this, * appears to still be in use. Hence * the '100'. */ - do_md_stop(mddev, 1, 100); + md_set_readonly(mddev, 100); mddev_unlock(mddev); } /* diff --git a/drivers/md/md.h b/drivers/md/md.h index 8e4c75c00d46..112a2c32db0c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -29,6 +29,26 @@ typedef struct mddev_s mddev_t; typedef struct mdk_rdev_s mdk_rdev_t; +/* generic plugging support - like that provided with request_queue, + * but does not require a request_queue + */ +struct plug_handle { + void (*unplug_fn)(struct plug_handle *); + struct timer_list unplug_timer; + struct work_struct unplug_work; + unsigned long unplug_flag; +}; +#define PLUGGED_FLAG 1 +void plugger_init(struct plug_handle *plug, + void (*unplug_fn)(struct plug_handle *)); +void plugger_set_plug(struct plug_handle *plug); +int plugger_remove_plug(struct plug_handle *plug); +static inline void plugger_flush(struct plug_handle *plug) +{ + del_timer_sync(&plug->unplug_timer); + cancel_work_sync(&plug->unplug_work); +} + /* * MD's 'extended' device */ @@ -67,20 +87,19 @@ struct mdk_rdev_s #define Faulty 1 /* device is known to have a fault */ #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ -#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ #define AllReserved 6 /* If whole device is reserved for * one array */ #define AutoDetected 7 /* added by auto-detect */ #define Blocked 8 /* An error occured on an externally * managed array, don't allow writes * until it is cleared */ -#define StateChanged 9 /* Faulty or Blocked has changed during - * interrupt, so it needs to be - * notified by the thread */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ + int new_raid_disk; /* role that the device will have in + * the array after a level-change completes. + */ int saved_raid_disk; /* role that device used to have in the * array and could again if we did a partial * resync from the bitmap @@ -120,11 +139,15 @@ struct mddev_s unsigned long flags; #define MD_CHANGE_DEVS 0 /* Some device status has changed */ #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ -#define MD_CHANGE_PENDING 2 /* superblock update in progress */ +#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ int suspended; atomic_t active_io; int ro; + int sysfs_active; /* set when sysfs deletes + * are happening, so run/ + * takeover/stop are not safe + */ struct gendisk *gendisk; @@ -153,6 +176,12 @@ struct mddev_s int external_size; /* size managed * externally */ __u64 events; + /* If the last 'event' was simply a clean->dirty transition, and + * we didn't write it to the spares, then it is safe and simple + * to just decrement the event count on a dirty->clean transition. + * So we record that possibility here. + */ + int can_decrease_events; char uuid[16]; @@ -240,17 +269,9 @@ struct mddev_s atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ - int changed; /* true if we might need to reread partition info */ int degraded; /* whether md should consider * adding a spare */ - int barriers_work; /* initialised to true, cleared as soon - * as a barrier request to slave - * fails. 
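plugger_init(), plugger_set_plug() and plugger_remove_plug() are only declared in the header above, so the snippet below is an illustrative guess at how a personality wires up the generic plug_handle, not the in-tree implementation.

/* Illustrative only: names and any behaviour beyond the declarations above
 * are assumptions. */
static void demo_unplug(struct plug_handle *plug)
{
	/* kick whatever work was held back while the array was plugged */
}

static void demo_setup(struct plug_handle *plug)
{
	plugger_init(plug, demo_unplug);	/* presumably initialises the
						 * unplug_timer/unplug_work that
						 * plugger_flush() cancels */
	plugger_set_plug(plug);			/* mark plugged via PLUGGED_FLAG;
						 * demo_unplug() runs on unplug */
}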
Only supported - */ - struct bio *biolist; /* bios that need to be retried - * because BIO_RW_BARRIER is not supported - */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; @@ -279,9 +300,6 @@ struct mddev_s atomic_t writes_pending; struct request_queue *queue; /* for plugging ... */ - atomic_t write_behind; /* outstanding async IO */ - unsigned int max_write_behind; /* 0 = sync */ - struct bitmap *bitmap; /* the bitmap for the device */ struct { struct file *file; /* the bitmap file */ @@ -295,9 +313,14 @@ struct mddev_s * hot-adding a bitmap. It should * eventually be settable by sysfs. */ + /* When md is serving under dm, it might use a + * dirty_log to store the bits. + */ + struct dm_dirty_log *log; + struct mutex mutex; unsigned long chunksize; - unsigned long daemon_sleep; /* how many seconds between updates? */ + unsigned long daemon_sleep; /* how many jiffies between updates? */ unsigned long max_write_behind; /* write-behind mode */ int external; } bitmap_info; @@ -305,16 +328,17 @@ struct mddev_s atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; - /* Generic barrier handling. - * If there is a pending barrier request, all other - * writes are blocked while the devices are flushed. - * The last to finish a flush schedules a worker to - * submit the barrier request (without the barrier flag), - * then submit more flush requests. + struct attribute_group *to_remove; + struct plug_handle *plug; /* if used by personality */ + + /* Generic flush handling. + * The last to finish preflush schedules a worker to submit + * the rest of the request (without the REQ_FLUSH flag). */ - struct bio *barrier; + struct bio *flush_bio; atomic_t flush_pending; - struct work_struct barrier_work; + struct work_struct flush_work; + struct work_struct event_work; /* used by dm to report failure event */ }; @@ -336,7 +360,7 @@ struct mdk_personality int level; struct list_head list; struct module *owner; - int (*make_request)(struct request_queue *q, struct bio *bio); + int (*make_request)(mddev_t *mddev, struct bio *bio); int (*run)(mddev_t *mddev); int (*stop)(mddev_t *mddev); void (*status)(struct seq_file *seq, mddev_t *mddev); @@ -379,6 +403,18 @@ struct md_sysfs_entry { }; extern struct attribute_group md_bitmap_group; +static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) +{ + if (sd) + return sysfs_get_dirent(sd, NULL, name); + return sd; +} +static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) +{ + if (sd) + sysfs_notify_dirent(sd); +} + static inline char * mdname (mddev_t * mddev) { return mddev->gendisk ? 
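md_flush_request() itself is not part of this hunk; the sequence below is pieced together from the flush_bio/flush_pending/flush_work fields and the comment above them, and should be read as an assumption.

/*
 * Assumed REQ_FLUSH flow that replaces the old barrier machinery:
 *  1. a personality's make_request() sees (bio->bi_rw & REQ_FLUSH) and hands
 *     the bio to md_flush_request() (see the multipath and raid0 hunks below);
 *  2. md issues a preflush to each member device, tracking completions in
 *     mddev->flush_pending while the original bio waits in mddev->flush_bio;
 *  3. the last preflush completion schedules mddev->flush_work, which
 *     resubmits flush_bio without REQ_FLUSH so the payload proceeds normally.
 */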
mddev->gendisk->disk_name : "mdX"; @@ -455,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); extern int mddev_congested(mddev_t *mddev, int bits); -extern void md_barrier_request(mddev_t *mddev, struct bio *bio); +extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); @@ -471,5 +507,14 @@ extern int md_integrity_register(mddev_t *mddev); extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void restore_bitmap_write_access(struct file *file); +extern void md_unplug(mddev_t *mddev); + +extern void mddev_init(mddev_t *mddev); +extern int md_run(mddev_t *mddev); +extern void md_stop(mddev_t *mddev); +extern void md_stop_writes(mddev_t *mddev); +extern void md_rdev_init(mdk_rdev_t *rdev); +extern void mddev_suspend(mddev_t *mddev); +extern void mddev_resume(mddev_t *mddev); #endif /* _MD_MD_H */ diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c deleted file mode 100644 index 3b1500843bba..000000000000 --- a/drivers/md/mktables.c +++ /dev/null @@ -1,132 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * mktables.c - * - * Make RAID-6 tables. This is a host user space program to be run at - * compile time. - */ - -#include <stdio.h> -#include <string.h> -#include <inttypes.h> -#include <stdlib.h> -#include <time.h> - -static uint8_t gfmul(uint8_t a, uint8_t b) -{ - uint8_t v = 0; - - while (b) { - if (b & 1) - v ^= a; - a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); - b >>= 1; - } - - return v; -} - -static uint8_t gfpow(uint8_t a, int b) -{ - uint8_t v = 1; - - b %= 255; - if (b < 0) - b += 255; - - while (b) { - if (b & 1) - v = gfmul(v, a); - a = gfmul(a, a); - b >>= 1; - } - - return v; -} - -int main(int argc, char *argv[]) -{ - int i, j, k; - uint8_t v; - uint8_t exptbl[256], invtbl[256]; - - printf("#include <linux/raid/pq.h>\n"); - - /* Compute multiplication table */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfmul[256][256] =\n" - "{\n"); - for (i = 0; i < 256; i++) { - printf("\t{\n"); - for (j = 0; j < 256; j += 8) { - printf("\t\t"); - for (k = 0; k < 8; k++) - printf("0x%02x,%c", gfmul(i, j + k), - (k == 7) ? '\n' : ' '); - } - printf("\t},\n"); - } - printf("};\n"); - printf("#ifdef __KERNEL__\n"); - printf("EXPORT_SYMBOL(raid6_gfmul);\n"); - printf("#endif\n"); - - /* Compute power-of-2 table (exponent) */ - v = 1; - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfexp[256] =\n" "{\n"); - for (i = 0; i < 256; i += 8) { - printf("\t"); - for (j = 0; j < 8; j++) { - exptbl[i + j] = v; - printf("0x%02x,%c", v, (j == 7) ? 
'\n' : ' '); - v = gfmul(v, 2); - if (v == 1) - v = 0; /* For entry 255, not a real entry */ - } - } - printf("};\n"); - printf("#ifdef __KERNEL__\n"); - printf("EXPORT_SYMBOL(raid6_gfexp);\n"); - printf("#endif\n"); - - /* Compute inverse table x^-1 == x^254 */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfinv[256] =\n" "{\n"); - for (i = 0; i < 256; i += 8) { - printf("\t"); - for (j = 0; j < 8; j++) { - invtbl[i + j] = v = gfpow(i + j, 254); - printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); - } - } - printf("};\n"); - printf("#ifdef __KERNEL__\n"); - printf("EXPORT_SYMBOL(raid6_gfinv);\n"); - printf("#endif\n"); - - /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfexi[256] =\n" "{\n"); - for (i = 0; i < 256; i += 8) { - printf("\t"); - for (j = 0; j < 8; j++) - printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], - (j == 7) ? '\n' : ' '); - } - printf("};\n"); - printf("#ifdef __KERNEL__\n"); - printf("EXPORT_SYMBOL(raid6_gfexi);\n"); - printf("#endif\n"); - - return 0; -} diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 32a662fc55c9..6d7ddf32ef2e 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/raid/md_u.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include "md.h" #include "multipath.h" @@ -84,13 +85,13 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) static void multipath_end_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); + struct multipath_bh *mp_bh = bio->bi_private; multipath_conf_t *conf = mp_bh->mddev->private; mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; if (uptodate) multipath_end_bh_io(mp_bh, 0); - else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) { + else if (!(bio->bi_rw & REQ_RAHEAD)) { /* * oops, IO error: */ @@ -135,17 +136,14 @@ static void multipath_unplug(struct request_queue *q) } -static int multipath_make_request (struct request_queue *q, struct bio * bio) +static int multipath_make_request(mddev_t *mddev, struct bio * bio) { - mddev_t *mddev = q->queuedata; multipath_conf_t *conf = mddev->private; struct multipath_bh * mp_bh; struct multipath_info *multipath; - const int rw = bio_data_dir(bio); - int cpu; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } @@ -154,12 +152,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) mp_bh->master_bio = bio; mp_bh->mddev = mddev; - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - mp_bh->path = multipath_map(conf); if (mp_bh->path < 0) { bio_endio(bio, -EIO); @@ -171,7 +163,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) mp_bh->bio = *bio; mp_bh->bio.bi_sector += multipath->rdev->data_offset; mp_bh->bio.bi_bdev = multipath->rdev->bdev; - mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); + mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; generic_make_request(&mp_bh->bio); @@ -301,14 +293,16 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rdev->data_offset << 9); /* as we don't honour 
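For the GF(2^8) helpers in the mktables.c file deleted above, a small self-contained example of the same arithmetic (user-space, builds with any C compiler):

#include <stdint.h>
#include <stdio.h>

/* Same multiply as the deleted gfmul(): GF(2^8) with the RAID-6 polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d), reduced via the 0x1d XOR. */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t v = 0;

	while (b) {
		if (b & 1)
			v ^= a;
		a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
		b >>= 1;
	}
	return v;
}

int main(void)
{
	/* Worked example: 0x80 * 0x02 carries out of bit 7, so the product is
	 * exactly the reduction term 0x1d. */
	printf("0x80 * 0x02 = 0x%02x\n", gfmul(0x80, 0x02));	/* prints 0x1d */
	return 0;
}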
merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + * violating it, so limit ->max_segments to one, lying + * within a single page. * (Note: it is very unlikely that a device with * merge_bvec_fn will be involved in multipath.) */ - if (q->merge_bvec_fn && - queue_max_sectors(q) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (q->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } conf->working_disks++; mddev->degraded--; @@ -404,7 +398,7 @@ static void multipathd (mddev_t *mddev) *bio = *(mp_bh->master_bio); bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; - bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); + bio->bi_rw |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; generic_make_request(bio); @@ -476,9 +470,11 @@ static int multipath_run (mddev_t *mddev) /* as we don't honour merge_bvec_fn, we must never risk * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } if (!test_bit(Faulty, &rdev->flags)) conf->working_disks++; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 77605cdceaf1..a39f4c355e55 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -20,17 +20,20 @@ #include <linux/blkdev.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include "md.h" #include "raid0.h" +#include "raid5.h" static void raid0_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; raid0_conf_t *conf = mddev->private; mdk_rdev_t **devlist = conf->devlist; + int raid_disks = conf->strip_zone[0].nb_dev; int i; - for (i=0; i<mddev->raid_disks; i++) { + for (i=0; i < raid_disks; i++) { struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); blk_unplug(r_queue); @@ -42,12 +45,13 @@ static int raid0_congested(void *data, int bits) mddev_t *mddev = data; raid0_conf_t *conf = mddev->private; mdk_rdev_t **devlist = conf->devlist; + int raid_disks = conf->strip_zone[0].nb_dev; int i, ret = 0; if (mddev_congested(mddev, bits)) return 1; - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + for (i = 0; i < raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); @@ -65,16 +69,17 @@ static void dump_zones(mddev_t *mddev) sector_t zone_start = 0; char b[BDEVNAME_SIZE]; raid0_conf_t *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; printk(KERN_INFO "******* %s configuration *********\n", mdname(mddev)); h = 0; for (j = 0; j < conf->nr_strip_zones; j++) { printk(KERN_INFO "zone%d=[", j); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - printk("%s/", - bdevname(conf->devlist[j*mddev->raid_disks + printk(KERN_CONT "%s/", + bdevname(conf->devlist[j*raid_disks + k]->bdev, b)); - printk("]\n"); + printk(KERN_CONT "]\n"); zone_size = conf->strip_zone[j].zone_end - zone_start; printk(KERN_INFO " zone offset=%llukb " @@ -87,7 +92,7 @@ static void dump_zones(mddev_t *mddev) printk(KERN_INFO 
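On the max_sectors to max_segments substitution above (the same change recurs in the raid0 and raid1 hunks), my reading of why the new limits preserve the old guarantee:

/*
 * blk_queue_max_segments(q, 1) plus
 * blk_queue_segment_boundary(q, PAGE_CACHE_SIZE - 1) means every request is
 * a single segment that cannot cross a page boundary, i.e. at most one page.
 * A one-page request can never violate a member device's merge_bvec_fn -
 * the same guarantee the old PAGE_SIZE>>9 max_sectors cap provided - so the
 * stacking driver still never has to honour merge_bvec_fn itself.
 */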
"**********************************\n\n"); } -static int create_strip_zones(mddev_t *mddev) +static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) { int i, c, err; sector_t curr_zone_end, sectors; @@ -100,8 +105,9 @@ static int create_strip_zones(mddev_t *mddev) if (!conf) return -ENOMEM; list_for_each_entry(rdev1, &mddev->disks, same_set) { - printk(KERN_INFO "raid0: looking at %s\n", - bdevname(rdev1->bdev,b)); + printk(KERN_INFO "md/raid0:%s: looking at %s\n", + mdname(mddev), + bdevname(rdev1->bdev, b)); c = 0; /* round size to chunk_size */ @@ -110,14 +116,16 @@ static int create_strip_zones(mddev_t *mddev) rdev1->sectors = sectors * mddev->chunk_sectors; list_for_each_entry(rdev2, &mddev->disks, same_set) { - printk(KERN_INFO "raid0: comparing %s(%llu)", + printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)", + mdname(mddev), bdevname(rdev1->bdev,b), (unsigned long long)rdev1->sectors); - printk(KERN_INFO " with %s(%llu)\n", + printk(KERN_CONT " with %s(%llu)\n", bdevname(rdev2->bdev,b), (unsigned long long)rdev2->sectors); if (rdev2 == rdev1) { - printk(KERN_INFO "raid0: END\n"); + printk(KERN_INFO "md/raid0:%s: END\n", + mdname(mddev)); break; } if (rdev2->sectors == rdev1->sectors) { @@ -125,20 +133,24 @@ static int create_strip_zones(mddev_t *mddev) * Not unique, don't count it as a new * group */ - printk(KERN_INFO "raid0: EQUAL\n"); + printk(KERN_INFO "md/raid0:%s: EQUAL\n", + mdname(mddev)); c = 1; break; } - printk(KERN_INFO "raid0: NOT EQUAL\n"); + printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n", + mdname(mddev)); } if (!c) { - printk(KERN_INFO "raid0: ==> UNIQUE\n"); + printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n", + mdname(mddev)); conf->nr_strip_zones++; - printk(KERN_INFO "raid0: %d zones\n", - conf->nr_strip_zones); + printk(KERN_INFO "md/raid0:%s: %d zones\n", + mdname(mddev), conf->nr_strip_zones); } } - printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); + printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n", + mdname(mddev), conf->nr_strip_zones); err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); @@ -161,14 +173,20 @@ static int create_strip_zones(mddev_t *mddev) list_for_each_entry(rdev1, &mddev->disks, same_set) { int j = rdev1->raid_disk; + if (mddev->level == 10) { + /* taking over a raid10-n2 array */ + j /= 2; + rdev1->new_raid_disk = j; + } + if (j < 0 || j >= mddev->raid_disks) { - printk(KERN_ERR "raid0: bad disk number %d - " - "aborting!\n", j); + printk(KERN_ERR "md/raid0:%s: bad disk number %d - " + "aborting!\n", mdname(mddev), j); goto abort; } if (dev[j]) { - printk(KERN_ERR "raid0: multiple devices for %d - " - "aborting!\n", j); + printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " + "aborting!\n", mdname(mddev), j); goto abort; } dev[j] = rdev1; @@ -176,21 +194,22 @@ static int create_strip_zones(mddev_t *mddev) disk_stack_limits(mddev->gendisk, rdev1->bdev, rdev1->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + * violating it, so limit ->max_segments to 1, lying within + * a single page. 
*/ - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && - queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - + if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; } if (cnt != mddev->raid_disks) { - printk(KERN_ERR "raid0: too few disks (%d of %d) - " - "aborting!\n", cnt, mddev->raid_disks); + printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " + "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); goto abort; } zone->nb_dev = cnt; @@ -206,39 +225,44 @@ static int create_strip_zones(mddev_t *mddev) zone = conf->strip_zone + i; dev = conf->devlist + i * mddev->raid_disks; - printk(KERN_INFO "raid0: zone %d\n", i); + printk(KERN_INFO "md/raid0:%s: zone %d\n", + mdname(mddev), i); zone->dev_start = smallest->sectors; smallest = NULL; c = 0; for (j=0; j<cnt; j++) { rdev = conf->devlist[j]; - printk(KERN_INFO "raid0: checking %s ...", - bdevname(rdev->bdev, b)); + printk(KERN_INFO "md/raid0:%s: checking %s ...", + mdname(mddev), + bdevname(rdev->bdev, b)); if (rdev->sectors <= zone->dev_start) { - printk(KERN_INFO " nope.\n"); + printk(KERN_CONT " nope.\n"); continue; } - printk(KERN_INFO " contained as device %d\n", c); + printk(KERN_CONT " contained as device %d\n", c); dev[c] = rdev; c++; if (!smallest || rdev->sectors < smallest->sectors) { smallest = rdev; - printk(KERN_INFO " (%llu) is smallest!.\n", - (unsigned long long)rdev->sectors); + printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!.\n", + mdname(mddev), + (unsigned long long)rdev->sectors); } } zone->nb_dev = c; sectors = (smallest->sectors - zone->dev_start) * c; - printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", - zone->nb_dev, (unsigned long long)sectors); + printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", + mdname(mddev), + zone->nb_dev, (unsigned long long)sectors); curr_zone_end += sectors; zone->zone_end = curr_zone_end; - printk(KERN_INFO "raid0: current zone start: %llu\n", - (unsigned long long)smallest->sectors); + printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n", + mdname(mddev), + (unsigned long long)smallest->sectors); } mddev->queue->unplug_fn = raid0_unplug; mddev->queue->backing_dev_info.congested_fn = raid0_congested; @@ -249,7 +273,7 @@ static int create_strip_zones(mddev_t *mddev) * chunk size is a multiple of that sector size */ if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { - printk(KERN_ERR "%s chunk_size of %d not valid\n", + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", mdname(mddev), mddev->chunk_sectors << 9); goto abort; @@ -259,14 +283,15 @@ static int create_strip_zones(mddev_t *mddev) blk_queue_io_opt(mddev->queue, (mddev->chunk_sectors << 9) * mddev->raid_disks); - printk(KERN_INFO "raid0: done.\n"); - mddev->private = conf; + printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev)); + *private_conf = conf; + return 0; abort: kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - mddev->private = NULL; + *private_conf = NULL; return err; } @@ -317,26 +342,34 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) static int raid0_run(mddev_t *mddev) { + raid0_conf_t *conf; int ret; if (mddev->chunk_sectors == 0) { - printk(KERN_ERR "md/raid0: chunk size must be set.\n"); + printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", + 
mdname(mddev)); return -EINVAL; } if (md_check_no_bitmap(mddev)) return -EINVAL; - blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); mddev->queue->queue_lock = &mddev->queue->__queue_lock; - ret = create_strip_zones(mddev); - if (ret < 0) - return ret; + /* if private is not null, we are here after takeover */ + if (mddev->private == NULL) { + ret = create_strip_zones(mddev, &conf); + if (ret < 0) + return ret; + mddev->private = conf; + } + conf = mddev->private; /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); - printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", - (unsigned long long)mddev->array_sectors); + printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", + mdname(mddev), + (unsigned long long)mddev->array_sectors); /* calculate the max read-ahead size. * For read-ahead of large files to be effective, we need to * readahead at least twice a whole stripe. i.e. number of devices @@ -400,6 +433,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, unsigned int sect_in_chunk; sector_t chunk; raid0_conf_t *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; unsigned int chunk_sects = mddev->chunk_sectors; if (is_power_of_2(chunk_sects)) { @@ -422,7 +456,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, * + the position in the chunk */ *sector_offset = (chunk * chunk_sects) + sect_in_chunk; - return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks + return conf->devlist[(zone - conf->strip_zone)*raid_disks + sector_div(sector, zone->nb_dev)]; } @@ -442,27 +476,18 @@ static inline int is_io_in_chunk_boundary(mddev_t *mddev, } } -static int raid0_make_request(struct request_queue *q, struct bio *bio) +static int raid0_make_request(mddev_t *mddev, struct bio *bio) { - mddev_t *mddev = q->queuedata; unsigned int chunk_sects; sector_t sector_offset; struct strip_zone *zone; mdk_rdev_t *tmp_dev; - const int rw = bio_data_dir(bio); - int cpu; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - chunk_sects = mddev->chunk_sectors; if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { sector_t sector = bio->bi_sector; @@ -480,9 +505,9 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) else bp = bio_split(bio, chunk_sects - sector_div(sector, chunk_sects)); - if (raid0_make_request(q, &bp->bio1)) + if (raid0_make_request(mddev, &bp->bio1)) generic_make_request(&bp->bio1); - if (raid0_make_request(q, &bp->bio2)) + if (raid0_make_request(mddev, &bp->bio2)) generic_make_request(&bp->bio2); bio_pair_release(bp); @@ -502,9 +527,10 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) return 1; bad_map: - printk("raid0_make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", chunk_sects / 2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + printk("md/raid0:%s: make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", + mdname(mddev), chunk_sects / 2, + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); bio_io_error(bio); return 0; @@ -517,6 +543,7 @@ static void 
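A worked example (numbers invented) for the chunk-boundary split in raid0_make_request() above, assuming a power-of-two chunk size:

/*
 * chunk_sects = 128 (64 KiB chunks), bio->bi_sector = 100, 60 sectors long:
 *   100 + 60 = 160 crosses the chunk boundary at sector 128, so
 *   bio_split(bio, chunk_sects - (sector & (chunk_sects - 1)))
 *     = bio_split(bio, 128 - (100 & 127)) = bio_split(bio, 28)
 *   giving bio1 = sectors 100..127 (28 sectors) and bio2 = sectors 128..159
 *   (32 sectors); raid0_make_request() is then re-entered once per half.
 */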
raid0_status(struct seq_file *seq, mddev_t *mddev) int j, k, h; char b[BDEVNAME_SIZE]; raid0_conf_t *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; sector_t zone_size; sector_t zone_start = 0; @@ -527,7 +554,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, "=["); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) seq_printf(seq, "%s/", bdevname( - conf->devlist[j*mddev->raid_disks + k] + conf->devlist[j*raid_disks + k] ->bdev, b)); zone_size = conf->strip_zone[j].zone_end - zone_start; @@ -542,6 +569,109 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) return; } +static void *raid0_takeover_raid45(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + raid0_conf_t *priv_conf; + + if (mddev->degraded != 1) { + printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", + mdname(mddev), + mddev->degraded); + return ERR_PTR(-EINVAL); + } + + list_for_each_entry(rdev, &mddev->disks, same_set) { + /* check slot number for a disk */ + if (rdev->raid_disk == mddev->raid_disks-1) { + printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks--; + mddev->delta_disks = -1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + +static void *raid0_takeover_raid10(mddev_t *mddev) +{ + raid0_conf_t *priv_conf; + + /* Check layout: + * - far_copies must be 1 + * - near_copies must be 2 + * - disks number must be even + * - all mirrors must be already degraded + */ + if (mddev->layout != ((1 << 8) + 2)) { + printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", + mdname(mddev), + mddev->layout); + return ERR_PTR(-EINVAL); + } + if (mddev->raid_disks & 1) { + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + if (mddev->degraded != (mddev->raid_disks>>1)) { + printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = - mddev->raid_disks / 2; + mddev->raid_disks += mddev->delta_disks; + mddev->degraded = 0; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + +static void *raid0_takeover(mddev_t *mddev) +{ + /* raid0 can take over: + * raid4 - if all data disks are active. 
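A worked example (array shape invented) for the raid10 takeover checks above; the layout decoding is inferred from the far_copies/near_copies requirements listed in the comment:

/*
 * mddev->layout == (1 << 8) + 2 == 0x102: low byte = near_copies (2),
 * next byte = far_copies (1) - the only raid10 geometry raid0 can absorb.
 *
 * Example: a 4-disk raid10-n2 with one disk of each mirror pair missing:
 *   mddev->degraded == 2 == raid_disks >> 1   -> check passes
 *   delta_disks = -4/2 = -2, raid_disks becomes 2
 * so the takeover yields a 2-disk raid0 over the surviving halves.
 */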
+ * raid5 - providing it is Raid4 layout and one disk is faulty + * raid10 - assuming we have all necessary active disks + */ + if (mddev->level == 4) + return raid0_takeover_raid45(mddev); + + if (mddev->level == 5) { + if (mddev->layout == ALGORITHM_PARITY_N) + return raid0_takeover_raid45(mddev); + + printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", + mdname(mddev), ALGORITHM_PARITY_N); + } + + if (mddev->level == 10) + return raid0_takeover_raid10(mddev); + + return ERR_PTR(-EINVAL); +} + +static void raid0_quiesce(mddev_t *mddev, int state) +{ +} + static struct mdk_personality raid0_personality= { .name = "raid0", @@ -552,6 +682,8 @@ static struct mdk_personality raid0_personality= .stop = raid0_stop, .status = raid0_status, .size = raid0_size, + .takeover = raid0_takeover, + .quiesce = raid0_quiesce, }; static int __init raid0_init (void) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 859bd3ffe435..378a25894c57 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -31,6 +31,7 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include <linux/slab.h> #include <linux/delay.h> #include <linux/blkdev.h> #include <linux/seq_file.h> @@ -262,7 +263,7 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio) static void raid1_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int mirror; conf_t *conf = r1_bio->mddev->private; @@ -296,7 +297,8 @@ static void raid1_end_read_request(struct bio *bio, int error) */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) - printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", + printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); reschedule_retry(r1_bio); } @@ -307,7 +309,7 @@ static void raid1_end_read_request(struct bio *bio, int error) static void raid1_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); conf_t *conf = r1_bio->mddev->private; struct bio *to_put = NULL; @@ -317,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error) if (r1_bio->bios[mirror] == bio) break; - if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { - set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); - set_bit(R1BIO_BarrierRetry, &r1_bio->state); - r1_bio->mddev->barriers_work = 0; - /* Don't rdev_dec_pending in this branch - keep it for the retry */ - } else { + /* + * 'one mirror IO has finished' event handler: + */ + r1_bio->bios[mirror] = NULL; + to_put = bio; + if (!uptodate) { + md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } else /* - * this branch is our 'one mirror IO has finished' event handler: + * Set R1BIO_Uptodate in our master bio, so that we + * will return a good error code for to the higher + * levels even if IO on some other mirrored buffer + * fails. + * + * The 'master' represents the composite IO operation + * to user-side. So if something waits for IO, then it + * will wait for the 'master' bio. 
*/ - r1_bio->bios[mirror] = NULL; - to_put = bio; - if (!uptodate) { - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } else - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. - */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - - update_head_pos(mirror, r1_bio); - - if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) - atomic_dec(&r1_bio->behind_remaining); - - /* In behind mode, we ACK the master bio once the I/O has safely - * reached all non-writemostly disks. Setting the Returned bit - * ensures that this gets done only once -- we don't ever want to - * return -EIO here, instead we'll wait */ - - if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && - test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* Maybe we can return now */ - if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { - struct bio *mbio = r1_bio->master_bio; - PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - (mbio->bi_size >> 9) - 1); - bio_endio(mbio, 0); - } + set_bit(R1BIO_Uptodate, &r1_bio->state); + + update_head_pos(mirror, r1_bio); + + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* + * In behind mode, we ACK the master bio once the I/O + * has safely reached all non-writemostly + * disks. Setting the Returned bit ensures that this + * gets done only once -- we don't ever want to return + * -EIO here, instead we'll wait + */ + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, 0); } } - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + /* - * * Let's see if all mirrored write operations have finished * already. 
*/ if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + safe_put_page(bio->bi_io_vec[i].bv_page); } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); + md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); } if (to_put) @@ -417,7 +410,7 @@ static void raid1_end_write_request(struct bio *bio, int error) */ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { - const unsigned long this_sector = r1_bio->sector; + const sector_t this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; int wonly_disk = -1; const int sectors = r1_bio->sectors; @@ -433,7 +426,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) retry: if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { - /* Choose the first operation device, for consistancy */ + /* Choose the first operational device, for consistancy */ new_disk = 0; for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); @@ -773,9 +766,8 @@ do_sync_io: return NULL; } -static int make_request(struct request_queue *q, struct bio * bio) +static int make_request(mddev_t *mddev, struct bio * bio) { - mddev_t *mddev = q->queuedata; conf_t *conf = mddev->private; mirror_info_t *mirror; r1bio_t *r1_bio; @@ -786,18 +778,14 @@ static int make_request(struct request_queue *q, struct bio * bio) struct bio_list bl; struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); - const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); - int cpu; - bool do_barriers; + const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); mdk_rdev_t *blocked_rdev; /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. - * We test barriers_work *after* md_write_start as md_write_start - * may cause the first superblock write, and that will check out - * if barriers work. */ md_write_start(mddev, bio); /* wait on superblock update early */ @@ -821,24 +809,11 @@ static int make_request(struct request_queue *q, struct bio * bio) } finish_wait(&conf->wait_barrier, &w); } - if (unlikely(!mddev->barriers_work && - bio_rw_flagged(bio, BIO_RW_BARRIER))) { - if (rw == WRITE) - md_write_end(mddev); - bio_endio(bio, -EOPNOTSUPP); - return 0; - } wait_barrier(conf); bitmap = mddev->bitmap; - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - /* * make_request() can abort the operation when READA is being * used and no empty request is available. 
@@ -865,6 +840,15 @@ static int make_request(struct request_queue *q, struct bio * bio) } mirror = conf->mirrors + rdisk; + if (test_bit(WriteMostly, &mirror->rdev->flags) && + bitmap) { + /* Reading from a write-mostly device must + * take care not to over-take any writes + * that are 'behind' + */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } r1_bio->read_disk = rdisk; read_bio = bio_clone(bio, GFP_NOIO); @@ -874,7 +858,7 @@ static int make_request(struct request_queue *q, struct bio * bio) read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio; generic_make_request(read_bio); @@ -911,9 +895,10 @@ static int make_request(struct request_queue *q, struct bio * bio) if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); r1_bio->bios[i] = NULL; - } else + } else { r1_bio->bios[i] = bio; - targets++; + targets++; + } } else r1_bio->bios[i] = NULL; } @@ -941,20 +926,20 @@ static int make_request(struct request_queue *q, struct bio * bio) set_bit(R1BIO_Degraded, &r1_bio->state); } - /* do behind I/O ? */ + /* do behind I/O ? + * Not if there are too many, or cannot allocate memory, + * or a reader on WriteMostly is waiting for behind writes + * to flush */ if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && + !waitqueue_active(&bitmap->behind_wait) && (behind_pages = alloc_behind_pages(bio)) != NULL) set_bit(R1BIO_BehindIO, &r1_bio->state); atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->behind_remaining, 0); - do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER); - if (do_barriers) - set_bit(R1BIO_Barrier, &r1_bio->state); - bio_list_init(&bl); for (i = 0; i < disks; i++) { struct bio *mbio; @@ -967,8 +952,7 @@ static int make_request(struct request_queue *q, struct bio * bio) mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) | - (do_sync << BIO_RW_SYNCIO); + mbio->bi_rw = WRITE | do_flush_fua | do_sync; mbio->bi_private = r1_bio; if (behind_pages) { @@ -1069,21 +1053,22 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" - "raid1: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" + KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) { int i; - printk("RAID1 conf printout:\n"); + printk(KERN_DEBUG "RAID1 conf printout:\n"); if (!conf) { - printk("(!conf)\n"); + printk(KERN_DEBUG "(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); @@ -1091,7 +1076,7 @@ static void print_conf(conf_t *conf) char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev) - printk(" disk %d, wo:%d, o:%d, dev:%s\n", + printk(KERN_DEBUG " 
disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), bdevname(rdev->bdev,b)); @@ -1112,6 +1097,8 @@ static int raid1_spare_active(mddev_t *mddev) { int i; conf_t *conf = mddev->private; + int count = 0; + unsigned long flags; /* * Find all failed disks within the RAID1 configuration @@ -1123,15 +1110,16 @@ static int raid1_spare_active(mddev_t *mddev) if (rdev && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + count++; + sysfs_notify_dirent(rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); print_conf(conf); - return 0; + return count; } @@ -1152,13 +1140,17 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + /* as we don't honour merge_bvec_fn, we must + * never risk violating it, so limit + * ->max_segments to one lying with a single + * page, as a one page request is never in + * violation. */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } p->head_position = 0; rdev->raid_disk = mirror; @@ -1218,7 +1210,7 @@ abort: static void end_sync_read(struct bio *bio, int error) { - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; int i; for (i=r1_bio->mddev->raid_disks; i--; ) @@ -1241,7 +1233,7 @@ static void end_sync_read(struct bio *bio, int error) static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + r1bio_t *r1_bio = bio->bi_private; mddev_t *mddev = r1_bio->mddev; conf_t *conf = mddev->private; int i; @@ -1448,9 +1440,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) char b[BDEVNAME_SIZE]; /* Cannot read from anywhere, array is toast */ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" " for block %llu\n", - bdevname(bio->bi_bdev,b), + mdname(mddev), + bdevname(bio->bi_bdev, b), (unsigned long long)r1_bio->sector); md_done_sync(mddev, r1_bio->sectors, 0); put_buf(r1_bio); @@ -1572,7 +1565,7 @@ static void fix_read_error(conf_t *conf, int read_disk, else { atomic_add(s, &rdev->corrected_errors); printk(KERN_INFO - "raid1:%s: read error corrected " + "md/raid1:%s: read error corrected " "(%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect + @@ -1618,42 +1611,6 @@ static void raid1d(mddev_t *mddev) if (test_bit(R1BIO_IsSync, &r1_bio->state)) { sync_request_write(mddev, r1_bio); unplug = 1; - } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { - /* some requests in the r1bio were BIO_RW_BARRIER - * requests which failed with -EOPNOTSUPP. Hohumm.. - * Better resubmit without the barrier. 
- * We know which devices to resubmit for, because - * all others have had their bios[] entry cleared. - * We already have a nr_pending reference on these rdevs. - */ - int i; - const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); - clear_bit(R1BIO_BarrierRetry, &r1_bio->state); - clear_bit(R1BIO_Barrier, &r1_bio->state); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) - atomic_inc(&r1_bio->remaining); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) { - struct bio_vec *bvec; - int j; - - bio = bio_clone(r1_bio->master_bio, GFP_NOIO); - /* copy pages from the failed bio, as - * this might be a write-behind device */ - __bio_for_each_segment(bvec, bio, j, 0) - bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; - bio_put(r1_bio->bios[i]); - bio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; - bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE | - (do_sync << BIO_RW_SYNCIO); - bio->bi_private = r1_bio; - r1_bio->bios[i] = bio; - generic_make_request(bio); - } } else { int disk; @@ -1677,13 +1634,14 @@ static void raid1d(mddev_t *mddev) bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { - printk(KERN_ALERT "raid1: %s: unrecoverable I/O" + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" " read error for block %llu\n", + mdname(mddev), bdevname(bio->bi_bdev,b), (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { - const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); + const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; r1_bio->bios[r1_bio->read_disk] = mddev->ro ? IO_BLOCKED : NULL; r1_bio->read_disk = disk; @@ -1692,14 +1650,15 @@ static void raid1d(mddev_t *mddev) r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; if (printk_ratelimit()) - printk(KERN_ERR "raid1: %s: redirecting sector %llu to" - " another mirror\n", - bdevname(rdev->bdev,b), - (unsigned long long)r1_bio->sector); + printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" + " other mirror: %s\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev,b)); bio->bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + bio->bi_rw = READ | do_sync; bio->bi_private = r1_bio; unplug = 1; generic_make_request(bio); @@ -1750,13 +1709,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int still_degraded = 0; if (!conf->r1buf_pool) - { -/* - printk("sync start - bitmap %p\n", mddev->bitmap); -*/ if (init_resync(conf)) return 0; - } max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { @@ -1827,7 +1781,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* take from bio_init */ bio->bi_next = NULL; + bio->bi_flags &= ~(BIO_POOL_MASK-1); bio->bi_flags |= 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; bio->bi_rw = READ; bio->bi_vcnt = 0; bio->bi_idx = 0; @@ -1900,7 +1856,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; BUG_ON(sync_blocks < (PAGE_SIZE>>9)); - if (len > (sync_blocks<<9)) + if ((len >> 9) > sync_blocks) len = sync_blocks<<9; } @@ -2037,7 +1993,7 @@ static conf_t *setup_conf(mddev_t *mddev) err = -EIO; if (conf->last_used < 0) { - printk(KERN_ERR "raid1: no operational mirrors for 
%s\n", + printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", mdname(mddev)); goto abort; } @@ -2045,7 +2001,7 @@ static conf_t *setup_conf(mddev_t *mddev) conf->thread = md_register_thread(raid1d, mddev, NULL); if (!conf->thread) { printk(KERN_ERR - "raid1: couldn't allocate thread for %s\n", + "md/raid1:%s: couldn't allocate thread\n", mdname(mddev)); goto abort; } @@ -2071,12 +2027,12 @@ static int run(mddev_t *mddev) mdk_rdev_t *rdev; if (mddev->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", + printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", mdname(mddev), mddev->level); return -EIO; } if (mddev->reshape_position != MaxSector) { - printk("raid1: %s: reshape_position set but not supported\n", + printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", mdname(mddev)); return -EIO; } @@ -2098,12 +2054,14 @@ static int run(mddev_t *mddev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + * violating it, so limit ->max_segments to 1 lying within + * a single page, as a one page request is never in violation. */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } } mddev->degraded = 0; @@ -2117,11 +2075,11 @@ static int run(mddev_t *mddev) mddev->recovery_cp = MaxSector; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "raid1: %s is not clean" + printk(KERN_NOTICE "md/raid1:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); printk(KERN_INFO - "raid1: raid set %s active with %d out of %d mirrors\n", + "md/raid1:%s: active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, mddev->raid_disks); @@ -2145,15 +2103,14 @@ static int stop(mddev_t *mddev) { conf_t *conf = mddev->private; struct bitmap *bitmap = mddev->bitmap; - int behind_wait = 0; /* wait for behind writes to complete */ - while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - behind_wait++; - printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); /* wait a second */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); /* need to kick something here to make sure I/O goes? 
*/ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); } raise_barrier(conf); @@ -2184,7 +2141,6 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { @@ -2279,9 +2235,9 @@ static int raid1_reshape(mddev_t *mddev) if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) printk(KERN_WARNING - "md/raid1: cannot register " - "%s for %s\n", - nm, mdname(mddev)); + "md/raid1:%s: cannot register " + "%s\n", + mdname(mddev), nm); } if (rdev) newmirrors[d2++].rdev = rdev; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 5f2d443ae28a..adf8cfd73313 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -117,8 +117,6 @@ struct r1bio_s { #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 #define R1BIO_BehindIO 3 -#define R1BIO_Barrier 4 -#define R1BIO_BarrierRetry 5 /* For write-behind requests, we call bi_end_io when * the last non-write-behind device completes, providing * any write was successful. Otherwise we call when diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d119b7b75e71..f0d082f749be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,11 +18,13 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include <linux/slab.h> #include <linux/delay.h> #include <linux/blkdev.h> #include <linux/seq_file.h> #include "md.h" #include "raid10.h" +#include "raid0.h" #include "bitmap.h" /* @@ -254,7 +256,7 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) static void raid10_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + r10bio_t *r10_bio = bio->bi_private; int slot, dev; conf_t *conf = r10_bio->mddev->private; @@ -284,7 +286,8 @@ static void raid10_end_read_request(struct bio *bio, int error) */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) - printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", + printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); reschedule_retry(r10_bio); } @@ -295,7 +298,7 @@ static void raid10_end_read_request(struct bio *bio, int error) static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + r10bio_t *r10_bio = bio->bi_private; int slot, dev; conf_t *conf = r10_bio->mddev->private; @@ -493,7 +496,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, */ static int read_balance(conf_t *conf, r10bio_t *r10_bio) { - const unsigned long this_sector = r10_bio->sector; + const sector_t this_sector = r10_bio->sector; int disk, slot, nslot; const int sectors = r10_bio->sectors; sector_t new_distance, current_distance; @@ -600,7 +603,7 @@ static void unplug_slaves(mddev_t *mddev) int i; rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { + for (i=0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { struct request_queue *r_queue = bdev_get_queue(rdev->bdev); @@ -634,7 +637,7 @@ static int raid10_congested(void *data, int bits) if (mddev_congested(mddev, 
bits)) return 1; rcu_read_lock(); - for (i = 0; i < mddev->raid_disks && ret == 0; i++) { + for (i = 0; i < conf->raid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -787,24 +790,23 @@ static void unfreeze_array(conf_t *conf) spin_unlock_irq(&conf->resync_lock); } -static int make_request(struct request_queue *q, struct bio * bio) +static int make_request(mddev_t *mddev, struct bio * bio) { - mddev_t *mddev = q->queuedata; conf_t *conf = mddev->private; mirror_info_t *mirror; r10bio_t *r10_bio; struct bio *read_bio; - int cpu; int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); - const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); + const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_fua = (bio->bi_rw & REQ_FUA); struct bio_list bl; unsigned long flags; mdk_rdev_t *blocked_rdev; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - md_barrier_request(mddev, bio); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); return 0; } @@ -824,16 +826,34 @@ static int make_request(struct request_queue *q, struct bio * bio) */ bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); - if (make_request(q, &bp->bio1)) + + /* Each of these 'make_request' calls will call 'wait_barrier'. + * If the first succeeds but the second blocks due to the resync + * thread raising the barrier, we will deadlock because the + * IO to the underlying device will be queued in generic_make_request + * and will never complete, so will never reduce nr_pending. + * So increment nr_waiting here so no new raise_barriers will + * succeed, and so the second wait_barrier cannot block. 
+ */ + spin_lock_irq(&conf->resync_lock); + conf->nr_waiting++; + spin_unlock_irq(&conf->resync_lock); + + if (make_request(mddev, &bp->bio1)) generic_make_request(&bp->bio1); - if (make_request(q, &bp->bio2)) + if (make_request(mddev, &bp->bio2)) generic_make_request(&bp->bio2); + spin_lock_irq(&conf->resync_lock); + conf->nr_waiting--; + wake_up(&conf->wait_barrier); + spin_unlock_irq(&conf->resync_lock); + bio_pair_release(bp); return 0; bad_map: - printk("raid10_make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", chunk_sects/2, + printk("md/raid10:%s: make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, (unsigned long long)bio->bi_sector, bio->bi_size >> 10); bio_io_error(bio); @@ -849,12 +869,6 @@ static int make_request(struct request_queue *q, struct bio * bio) */ wait_barrier(conf); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); - part_stat_unlock(); - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; @@ -884,7 +898,7 @@ static int make_request(struct request_queue *q, struct bio * bio) mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; generic_make_request(read_bio); @@ -952,7 +966,7 @@ static int make_request(struct request_queue *q, struct bio * bio) conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO); + mbio->bi_rw = WRITE | do_sync | do_fua; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1038,9 +1052,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" - "raid10: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" + KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -1048,19 +1063,19 @@ static void print_conf(conf_t *conf) int i; mirror_info_t *tmp; - printk("RAID10 conf printout:\n"); + printk(KERN_DEBUG "RAID10 conf printout:\n"); if (!conf) { - printk("(!conf)\n"); + printk(KERN_DEBUG "(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->mirrors + i; if (tmp->rdev) - printk(" disk %d, wo:%d, o:%d, dev:%s\n", + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); @@ -1102,6 +1117,8 @@ static int raid10_spare_active(mddev_t *mddev) int i; conf_t *conf = mddev->private; mirror_info_t *tmp; + int count = 0; + unsigned long flags; /* * Find all non-in_sync disks within the RAID10 configuration @@ -1112,15 +1129,16 @@ static int raid10_spare_active(mddev_t *mddev) if 
(tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + count++; + sysfs_notify_dirent(tmp->rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); print_conf(conf); - return 0; + return count; } @@ -1131,7 +1149,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int mirror; mirror_info_t *p; int first = 0; - int last = mddev->raid_disks - 1; + int last = conf->raid_disks - 1; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is @@ -1155,13 +1173,17 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + /* as we don't honour merge_bvec_fn, we must + * never risk violating it, so limit + * ->max_segments to one lying with a single + * page, as a one page request is never in + * violation. */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } p->head_position = 0; rdev->raid_disk = mirror; @@ -1219,7 +1241,7 @@ abort: static void end_sync_read(struct bio *bio, int error) { - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + r10bio_t *r10_bio = bio->bi_private; conf_t *conf = r10_bio->mddev->private; int i,d; @@ -1256,7 +1278,7 @@ static void end_sync_read(struct bio *bio, int error) static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + r10bio_t *r10_bio = bio->bi_private; mddev_t *mddev = r10_bio->mddev; conf_t *conf = mddev->private; int i,d; @@ -1482,14 +1504,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) int sectors = r10_bio->sectors; mdk_rdev_t*rdev; int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + int d = r10_bio->devs[r10_bio->read_slot].devnum; rcu_read_lock(); - { - int d = r10_bio->devs[r10_bio->read_slot].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev) { /* If rdev is not NULL */ char b[BDEVNAME_SIZE]; int cur_read_error_count = 0; - rdev = rcu_dereference(conf->mirrors[d].rdev); bdevname(rdev->bdev, b); if (test_bit(Faulty, &rdev->flags)) { @@ -1505,13 +1527,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) if (cur_read_error_count > max_read_errors) { rcu_read_unlock(); printk(KERN_NOTICE - "raid10: %s: Raid device exceeded " + "md/raid10:%s: %s: Raid device exceeded " "read_error threshold " "[cur %d:max %d]\n", + mdname(mddev), b, cur_read_error_count, max_read_errors); printk(KERN_NOTICE - "raid10: %s: Failing raid " - "device\n", b); + "md/raid10:%s: %s: Failing raid " + "device\n", mdname(mddev), b); md_error(mddev, conf->mirrors[d].rdev); return; } @@ -1529,7 +1552,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); do { - int d = r10_bio->devs[sl].devnum; + d = 
r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) { @@ -1563,7 +1586,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); while (sl != r10_bio->read_slot) { char b[BDEVNAME_SIZE]; - int d; + if (sl==0) sl = conf->copies; sl--; @@ -1581,15 +1604,16 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE - "raid10:%s: read correction " + "md/raid10:%s: read correction " "write failed" " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect+ rdev->data_offset), bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "raid10:%s: failing " + printk(KERN_NOTICE "md/raid10:%s: %s: failing " "drive\n", + mdname(mddev), bdevname(rdev->bdev, b)); md_error(mddev, rdev); } @@ -1599,7 +1623,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) } sl = start; while (sl != r10_bio->read_slot) { - int d; + if (sl==0) sl = conf->copies; sl--; @@ -1617,20 +1641,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) READ) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE - "raid10:%s: unable to read back " + "md/raid10:%s: unable to read back " "corrected sectors" " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect+ rdev->data_offset), bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "raid10:%s: failing drive\n", + printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", + mdname(mddev), bdevname(rdev->bdev, b)); md_error(mddev, rdev); } else { printk(KERN_INFO - "raid10:%s: read error corrected" + "md/raid10:%s: read error corrected" " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect+ @@ -1705,19 +1730,21 @@ static void raid10d(mddev_t *mddev) mddev->ro ? 
IO_BLOCKED : NULL; mirror = read_balance(conf, r10_bio); if (mirror == -1) { - printk(KERN_ALERT "raid10: %s: unrecoverable I/O" + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" " read error for block %llu\n", + mdname(mddev), bdevname(bio->bi_bdev,b), (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); bio_put(bio); } else { - const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO); + const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); bio_put(bio); rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) - printk(KERN_ERR "raid10: %s: redirecting sector %llu to" + printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" " another mirror\n", + mdname(mddev), bdevname(rdev->bdev,b), (unsigned long long)r10_bio->sector); bio = bio_clone(r10_bio->master_bio, GFP_NOIO); @@ -1725,7 +1752,7 @@ static void raid10d(mddev_t *mddev) bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); + bio->bi_rw = READ | do_sync; bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; unplug = 1; @@ -1975,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i r10_bio = rb2; if (!test_and_set_bit(MD_RECOVERY_INTR, &mddev->recovery)) - printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", + printk(KERN_INFO "md/raid10:%s: insufficient " + "working devices for recovery.\n", mdname(mddev)); break; } @@ -2135,9 +2163,9 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) conf_t *conf = mddev->private; if (!raid_disks) - raid_disks = mddev->raid_disks; + raid_disks = conf->raid_disks; if (!sectors) - sectors = mddev->dev_sectors; + sectors = conf->dev_sectors; size = sectors >> conf->chunk_shift; sector_div(size, conf->far_copies); @@ -2147,62 +2175,61 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) return size << conf->chunk_shift; } -static int run(mddev_t *mddev) + +static conf_t *setup_conf(mddev_t *mddev) { - conf_t *conf; - int i, disk_idx, chunk_size; - mirror_info_t *disk; - mdk_rdev_t *rdev; + conf_t *conf = NULL; int nc, fc, fo; sector_t stride, size; + int err = -EINVAL; - if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || - !is_power_of_2(mddev->chunk_sectors)) { - printk(KERN_ERR "md/raid10: chunk size must be " - "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); - return -EINVAL; + if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || + !is_power_of_2(mddev->new_chunk_sectors)) { + printk(KERN_ERR "md/raid10:%s: chunk size must be " + "at least PAGE_SIZE(%ld) and be a power of 2.\n", + mdname(mddev), PAGE_SIZE); + goto out; } - nc = mddev->layout & 255; - fc = (mddev->layout >> 8) & 255; - fo = mddev->layout & (1<<16); + nc = mddev->new_layout & 255; + fc = (mddev->new_layout >> 8) & 255; + fo = mddev->new_layout & (1<<16); + if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || - (mddev->layout >> 17)) { - printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", - mdname(mddev), mddev->layout); + (mddev->new_layout >> 17)) { + printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", + mdname(mddev), mddev->new_layout); goto out; } - /* - * copy the already verified devices into our private RAID10 - * bookkeeping area. 
[whatever we allocate in run(), - * should be freed in stop()] - */ + + err = -ENOMEM; conf = kzalloc(sizeof(conf_t), GFP_KERNEL); - mddev->private = conf; - if (!conf) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); + if (!conf) goto out; - } + conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, - GFP_KERNEL); - if (!conf->mirrors) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out_free_conf; - } + GFP_KERNEL); + if (!conf->mirrors) + goto out; conf->tmppage = alloc_page(GFP_KERNEL); if (!conf->tmppage) - goto out_free_conf; + goto out; + conf->raid_disks = mddev->raid_disks; conf->near_copies = nc; conf->far_copies = fc; conf->copies = nc*fc; conf->far_offset = fo; - conf->chunk_mask = mddev->chunk_sectors - 1; - conf->chunk_shift = ffz(~mddev->chunk_sectors); + conf->chunk_mask = mddev->new_chunk_sectors - 1; + conf->chunk_shift = ffz(~mddev->new_chunk_sectors); + + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, + r10bio_pool_free, conf); + if (!conf->r10bio_pool) + goto out; + size = mddev->dev_sectors >> conf->chunk_shift; sector_div(size, fc); size = size * conf->raid_disks; @@ -2216,7 +2243,8 @@ static int run(mddev_t *mddev) */ stride += conf->raid_disks - 1; sector_div(stride, conf->raid_disks); - mddev->dev_sectors = stride << conf->chunk_shift; + + conf->dev_sectors = stride << conf->chunk_shift; if (fo) stride = 1; @@ -2224,18 +2252,62 @@ static int run(mddev_t *mddev) sector_div(stride, fc); conf->stride = stride << conf->chunk_shift; - conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, - r10bio_pool_free, conf); - if (!conf->r10bio_pool) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out_free_conf; - } - conf->mddev = mddev; spin_lock_init(&conf->device_lock); + INIT_LIST_HEAD(&conf->retry_list); + + spin_lock_init(&conf->resync_lock); + init_waitqueue_head(&conf->wait_barrier); + + conf->thread = md_register_thread(raid10d, mddev, NULL); + if (!conf->thread) + goto out; + + conf->mddev = mddev; + return conf; + + out: + printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", + mdname(mddev)); + if (conf) { + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf); + } + return ERR_PTR(err); +} + +static int run(mddev_t *mddev) +{ + conf_t *conf; + int i, disk_idx, chunk_size; + mirror_info_t *disk; + mdk_rdev_t *rdev; + sector_t size; + + /* + * copy the already verified devices into our private RAID10 + * bookkeeping area. 
[whatever we allocate in run(), + * should be freed in stop()] + */ + + if (mddev->private == NULL) { + conf = setup_conf(mddev); + if (IS_ERR(conf)) + return PTR_ERR(conf); + mddev->private = conf; + } + conf = mddev->private; + if (!conf) + goto out; + mddev->queue->queue_lock = &conf->device_lock; + mddev->thread = conf->thread; + conf->thread = NULL; + chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); if (conf->raid_disks % conf->near_copies) @@ -2246,7 +2318,7 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; - if (disk_idx >= mddev->raid_disks + if (disk_idx >= conf->raid_disks || disk_idx < 0) continue; disk = conf->mirrors + disk_idx; @@ -2255,23 +2327,20 @@ static int run(mddev_t *mddev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + * violating it, so limit max_segments to 1 lying + * within a single page. */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } disk->head_position = 0; } - INIT_LIST_HEAD(&conf->retry_list); - - spin_lock_init(&conf->resync_lock); - init_waitqueue_head(&conf->wait_barrier); - /* need to check that every block has at least one working mirror */ if (!enough(conf)) { - printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", + printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", mdname(mddev)); goto out_free_conf; } @@ -2290,28 +2359,21 @@ static int run(mddev_t *mddev) } } - - mddev->thread = md_register_thread(raid10d, mddev, NULL); - if (!mddev->thread) { - printk(KERN_ERR - "raid10: couldn't allocate thread for %s\n", - mdname(mddev)); - goto out_free_conf; - } - if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "raid10: %s is not clean" + printk(KERN_NOTICE "md/raid10:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); printk(KERN_INFO - "raid10: raid set %s active with %d out of %d devices\n", - mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks); + "md/raid10:%s: active with %d out of %d devices\n", + mdname(mddev), conf->raid_disks - mddev->degraded, + conf->raid_disks); /* * Ok, everything is just fine now */ - md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); - mddev->resync_max_sectors = raid10_size(mddev, 0, 0); + mddev->dev_sectors = conf->dev_sectors; + size = raid10_size(mddev, 0, 0); + md_set_array_sectors(mddev, size); + mddev->resync_max_sectors = size; mddev->queue->unplug_fn = raid10_unplug; mddev->queue->backing_dev_info.congested_fn = raid10_congested; @@ -2329,7 +2391,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - if (conf->near_copies < mddev->raid_disks) + if (conf->near_copies < conf->raid_disks) blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); md_integrity_register(mddev); return 0; @@ -2341,6 +2403,7 @@ out_free_conf: kfree(conf->mirrors); kfree(conf); mddev->private = NULL; + md_unregister_thread(mddev->thread); out: return -EIO; } @@ -2377,6 +2440,57 @@ static void raid10_quiesce(mddev_t *mddev, int state) } } +static void 
*raid10_takeover_raid0(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + conf_t *conf; + + if (mddev->degraded > 0) { + printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 10; + /* new layout: far_copies = 1, near_copies = 2 */ + mddev->new_layout = (1<<8) + 2; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = mddev->raid_disks; + mddev->raid_disks *= 2; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + conf = setup_conf(mddev); + if (!IS_ERR(conf)) + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) + rdev->new_raid_disk = rdev->raid_disk * 2; + + return conf; +} + +static void *raid10_takeover(mddev_t *mddev) +{ + struct raid0_private_data *raid0_priv; + + /* raid10 can take over: + * raid0 - providing it has only two drives + */ + if (mddev->level == 0) { + /* for raid0 takeover only one zone is supported */ + raid0_priv = mddev->private; + if (raid0_priv->nr_strip_zones > 1) { + printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" + " with more than one zone.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + return raid10_takeover_raid0(mddev); + } + return ERR_PTR(-EINVAL); +} + static struct mdk_personality raid10_personality = { .name = "raid10", @@ -2393,6 +2507,7 @@ static struct mdk_personality raid10_personality = .sync_request = sync_request, .quiesce = raid10_quiesce, .size = raid10_size, + .takeover = raid10_takeover, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 59cd1efb8d30..2316ac2e8e21 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -33,6 +33,8 @@ struct r10_private_data_s { * 1 stripe. */ + sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ + int chunk_shift; /* shift from chunks to sectors */ sector_t chunk_mask; @@ -57,6 +59,11 @@ struct r10_private_data_s { mempool_t *r10bio_pool; mempool_t *r10buf_pool; struct page *tmppage; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. 
+ */ + struct mdk_thread_s *thread; }; typedef struct r10_private_data_s conf_t; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e84204eb12df..31140d1259dc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -50,8 +50,10 @@ #include <linux/async.h> #include <linux/seq_file.h> #include <linux/cpu.h> +#include <linux/slab.h> #include "md.h" #include "raid5.h" +#include "raid0.h" #include "bitmap.h" /* @@ -199,11 +201,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state)) { list_add_tail(&sh->lru, &conf->delayed_list); - blk_plug_device(conf->mddev->queue); + plugger_set_plug(&conf->plug); } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) { list_add_tail(&sh->lru, &conf->bitmap_list); - blk_plug_device(conf->mddev->queue); + plugger_set_plug(&conf->plug); } else { clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); @@ -275,12 +277,13 @@ out: return sh; } -static void shrink_buffers(struct stripe_head *sh, int num) +static void shrink_buffers(struct stripe_head *sh) { struct page *p; int i; + int num = sh->raid_conf->pool_size; - for (i=0; i<num ; i++) { + for (i = 0; i < num ; i++) { p = sh->dev[i].page; if (!p) continue; @@ -289,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) } } -static int grow_buffers(struct stripe_head *sh, int num) +static int grow_buffers(struct stripe_head *sh) { int i; + int num = sh->raid_conf->pool_size; - for (i=0; i<num; i++) { + for (i = 0; i < num; i++) { struct page *page; if (!(page = alloc_page(GFP_KERNEL))) { @@ -362,8 +366,74 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, return NULL; } +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int has_failed(raid5_conf_t *conf) +{ + int degraded; + int i; + if (conf->mddev->reshape_position == MaxSector) + return conf->mddev->degraded > conf->max_degraded; + + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->previous_raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If the reshape increases the number of devices, + * this is being recovered by the reshape, so + * this 'previous' section is not in_sync. + * If the number of devices is being reduced however, + * the device can only be part of the array if + * we are reverting a reshape, so this section will + * be in-sync. + */ + if (conf->raid_disks >= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (degraded > conf->max_degraded) + return 1; + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. 
+ * If reshape increases the number of devices, this + * section has already been recovered, else it + * almost certainly hasn't. + */ + if (conf->raid_disks <= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (degraded > conf->max_degraded) + return 1; + return 0; +} + static void unplug_slaves(mddev_t *mddev); -static void raid5_unplug_device(struct request_queue *q); static struct stripe_head * get_active_stripe(raid5_conf_t *conf, sector_t sector, @@ -393,7 +463,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), conf->device_lock, - raid5_unplug_device(conf->mddev->queue) + md_raid5_unplug_device(conf) ); conf->inactive_blocked = 0; } else @@ -436,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) int rw; struct bio *bi; mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { + if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) + rw = WRITE_FUA; + else + rw = WRITE; + } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else continue; @@ -961,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + if (wbi->bi_rw & REQ_FUA) + set_bit(R5_WantFUA, &dev->flags); tx = async_copy_data(1, wbi, dev->page, dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); @@ -978,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; + bool fua = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = disks; i--; ) + fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); + for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->written || i == pd_idx || i == qd_idx) + if (dev->written || i == pd_idx || i == qd_idx) { set_bit(R5_UPTODATE, &dev->flags); + if (fua) + set_bit(R5_WantFUA, &dev->flags); + } } if (sh->reconstruct_state == reconstruct_state_drain_run) @@ -1238,19 +1320,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; - int disks = max(conf->raid_disks, conf->previous_raid_disks); sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); if (!sh) return 0; - memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); + memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&sh->ops.wait_for_ops); #endif - if (grow_buffers(sh, disks)) { - shrink_buffers(sh, disks); + if (grow_buffers(sh)) { + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); return 0; } @@ -1267,10 +1348,14 @@ static int grow_stripes(raid5_conf_t *conf, int num) struct kmem_cache *sc; int devs = max(conf->raid_disks, conf->previous_raid_disks); - sprintf(conf->cache_name[0], - "raid%d-%s", conf->level, mdname(conf->mddev)); - sprintf(conf->cache_name[1], - "raid%d-%s-alt", conf->level, mdname(conf->mddev)); + if (conf->mddev->gendisk) + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + else + sprintf(conf->cache_name[0], + "raid%d-%p", conf->level, conf->mddev); + sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); + conf->active_name = 0; sc = 
kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), @@ -1466,7 +1551,7 @@ static int drop_one_stripe(raid5_conf_t *conf) if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); - shrink_buffers(sh, conf->pool_size); + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); return 1; @@ -1508,7 +1593,7 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk_rl(KERN_INFO "raid5:%s: read error corrected" + printk_rl(KERN_INFO "md/raid:%s: read error corrected" " (%lu sectors at %llu on %s)\n", mdname(conf->mddev), STRIPE_SECTORS, (unsigned long long)(sh->sector @@ -1526,9 +1611,9 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); - if (conf->mddev->degraded) + if (conf->mddev->degraded >= conf->max_degraded) printk_rl(KERN_WARNING - "raid5:%s: read error not correctable " + "md/raid:%s: read error not correctable " "(sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)(sh->sector @@ -1537,7 +1622,7 @@ static void raid5_end_read_request(struct bio * bi, int error) else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ printk_rl(KERN_WARNING - "raid5:%s: read error NOT corrected!! " + "md/raid:%s: read error NOT corrected!! " "(sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)(sh->sector @@ -1546,7 +1631,7 @@ static void raid5_end_read_request(struct bio * bi, int error) else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING - "raid5:%s: Too many read errors, failing device %s.\n", + "md/raid:%s: Too many read errors, failing device %s.\n", mdname(conf->mddev), bdn); else retry = 1; @@ -1618,8 +1703,8 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - pr_debug("raid5: error called\n"); + raid5_conf_t *conf = mddev->private; + pr_debug("raid456: error called\n"); if (!test_bit(Faulty, &rdev->flags)) { set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1635,9 +1720,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); printk(KERN_ALERT - "raid5: Disk failure on %s, disabling device.\n" - "raid5: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + "md/raid:%s: Disk failure on %s, disabling device.\n" + KERN_ALERT + "md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), + bdevname(rdev->bdev, b), + mdname(mddev), + conf->raid_disks - mddev->degraded); } } @@ -1649,8 +1738,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, int previous, int *dd_idx, struct stripe_head *sh) { - long stripe; - unsigned long chunk_number; + sector_t stripe, stripe2; + sector_t chunk_number; unsigned int chunk_offset; int pd_idx, qd_idx; int ddf_layout = 0; @@ -1670,18 +1759,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, */ chunk_offset = sector_div(r_sector, sectors_per_chunk); chunk_number = r_sector; - BUG_ON(r_sector != chunk_number); /* * Compute the stripe number */ - stripe = chunk_number / data_disks; - - /* - * Compute the data disk and parity disk indexes inside the stripe - */ - *dd_idx = chunk_number % 
data_disks; - + stripe = chunk_number; + *dd_idx = sector_div(stripe, data_disks); + stripe2 = stripe; /* * Select the parity disk based on the user selected algorithm. */ @@ -1693,21 +1777,21 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, case 5: switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - pd_idx = data_disks - stripe % raid_disks; + pd_idx = data_disks - sector_div(stripe2, raid_disks); if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_RIGHT_ASYMMETRIC: - pd_idx = stripe % raid_disks; + pd_idx = sector_div(stripe2, raid_disks); if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_LEFT_SYMMETRIC: - pd_idx = data_disks - stripe % raid_disks; + pd_idx = data_disks - sector_div(stripe2, raid_disks); *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - pd_idx = stripe % raid_disks; + pd_idx = sector_div(stripe2, raid_disks); *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; break; case ALGORITHM_PARITY_0: @@ -1718,8 +1802,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, pd_idx = data_disks; break; default: - printk(KERN_ERR "raid5: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1727,7 +1809,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - pd_idx = raid_disks - 1 - (stripe % raid_disks); + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); qd_idx = pd_idx + 1; if (pd_idx == raid_disks-1) { (*dd_idx)++; /* Q D D D P */ @@ -1736,7 +1818,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_RIGHT_ASYMMETRIC: - pd_idx = stripe % raid_disks; + pd_idx = sector_div(stripe2, raid_disks); qd_idx = pd_idx + 1; if (pd_idx == raid_disks-1) { (*dd_idx)++; /* Q D D D P */ @@ -1745,12 +1827,12 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_LEFT_SYMMETRIC: - pd_idx = raid_disks - 1 - (stripe % raid_disks); + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); qd_idx = (pd_idx + 1) % raid_disks; *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - pd_idx = stripe % raid_disks; + pd_idx = sector_div(stripe2, raid_disks); qd_idx = (pd_idx + 1) % raid_disks; *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; break; @@ -1769,7 +1851,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, /* Exactly the same as RIGHT_ASYMMETRIC, but or * of blocks for computing Q is different. 
*/ - pd_idx = stripe % raid_disks; + pd_idx = sector_div(stripe2, raid_disks); qd_idx = pd_idx + 1; if (pd_idx == raid_disks-1) { (*dd_idx)++; /* Q D D D P */ @@ -1784,7 +1866,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, * D D D P Q rather than * Q D D D P */ - pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); + stripe2 += 1; + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); qd_idx = pd_idx + 1; if (pd_idx == raid_disks-1) { (*dd_idx)++; /* Q D D D P */ @@ -1796,7 +1879,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, case ALGORITHM_ROTATING_N_CONTINUE: /* Same as left_symmetric but Q is before P */ - pd_idx = raid_disks - 1 - (stripe % raid_disks); + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); qd_idx = (pd_idx + raid_disks - 1) % raid_disks; *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; ddf_layout = 1; @@ -1804,27 +1887,27 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, case ALGORITHM_LEFT_ASYMMETRIC_6: /* RAID5 left_asymmetric, with Q on last device */ - pd_idx = data_disks - stripe % (raid_disks-1); + pd_idx = data_disks - sector_div(stripe2, raid_disks-1); if (*dd_idx >= pd_idx) (*dd_idx)++; qd_idx = raid_disks - 1; break; case ALGORITHM_RIGHT_ASYMMETRIC_6: - pd_idx = stripe % (raid_disks-1); + pd_idx = sector_div(stripe2, raid_disks-1); if (*dd_idx >= pd_idx) (*dd_idx)++; qd_idx = raid_disks - 1; break; case ALGORITHM_LEFT_SYMMETRIC_6: - pd_idx = data_disks - stripe % (raid_disks-1); + pd_idx = data_disks - sector_div(stripe2, raid_disks-1); *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); qd_idx = raid_disks - 1; break; case ALGORITHM_RIGHT_SYMMETRIC_6: - pd_idx = stripe % (raid_disks-1); + pd_idx = sector_div(stripe2, raid_disks-1); *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); qd_idx = raid_disks - 1; break; @@ -1835,10 +1918,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, qd_idx = raid_disks - 1; break; - default: - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1869,14 +1949,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) : conf->algorithm; sector_t stripe; int chunk_offset; - int chunk_number, dummy1, dd_idx = i; + sector_t chunk_number; + int dummy1, dd_idx = i; sector_t r_sector; struct stripe_head sh2; chunk_offset = sector_div(new_sector, sectors_per_chunk); stripe = new_sector; - BUG_ON(new_sector != stripe); if (i == sh->pd_idx) return 0; @@ -1901,8 +1981,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) case ALGORITHM_PARITY_N: break; default: - printk(KERN_ERR "raid5: unsupported algorithm %d\n", - algorithm); BUG(); } break; @@ -1961,21 +2039,20 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) i -= 1; break; default: - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", - algorithm); BUG(); } break; } chunk_number = stripe * data_disks + i; - r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; + r_sector = chunk_number * sectors_per_chunk + chunk_offset; check = raid5_compute_sector(conf, r_sector, previous, &dummy1, &sh2); if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx || sh2.qd_idx != sh->qd_idx) { - printk(KERN_ERR "compute_blocknr: map not correct\n"); + printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", + mdname(conf->mddev)); return 0; } return r_sector; @@ -2969,7 +3046,6 @@ static void 
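The hunks above replace every "stripe % raid_disks" with "sector_div(stripe2, raid_disks)" because the stripe number is now a 64-bit sector_t and a plain 64-bit modulo is not available on 32-bit kernels. sector_div() divides its argument in place and returns the remainder, which is why the code works on a scratch copy stripe2. A minimal userspace sketch of the idiom for the left-asymmetric RAID5 layout; sector_div_sketch() is a stand-in for the kernel macro (the real one takes the lvalue directly and is arch-optimized):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Stand-in for the kernel macro: divide *n in place, return the remainder. */
static uint32_t sector_div_sketch(sector_t *n, uint32_t base)
{
	uint32_t rem = (uint32_t)(*n % base);
	*n /= base;
	return rem;
}

/* Pick the parity disk for ALGORITHM_LEFT_ASYMMETRIC without a 64-bit '%'. */
static int left_asym_pd_idx(sector_t stripe, int raid_disks)
{
	sector_t stripe2 = stripe;	/* scratch copy, sector_div consumes it */
	int data_disks = raid_disks - 1;

	return data_disks - sector_div_sketch(&stripe2, raid_disks);
}

int main(void)
{
	sector_t stripe = 10000000000ULL;	/* would overflow a 32-bit modulo */

	printf("pd_idx = %d\n", left_asym_pd_idx(stripe, 5));
	return 0;
}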
handle_stripe5(struct stripe_head *sh) mdk_rdev_t *rdev; dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx toread %p read %p write %p " "written %p\n", i, dev->flags, dev->toread, dev->read, @@ -3006,17 +3082,27 @@ static void handle_stripe5(struct stripe_head *sh) blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* could be in-sync depending on recovery/reshape status */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { s.failed++; s.failed_num = i; - } else - set_bit(R5_Insync, &dev->flags); + } } rcu_read_unlock(); @@ -3207,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh) if (dec_preread_active) { /* We delay this until after ops_run_io so that if make_request - * is waiting on a barrier, it won't continue until the writes + * is waiting on a flush, it won't continue until the writes * have actually been submitted. */ atomic_dec(&conf->preread_active_stripes); @@ -3250,7 +3336,6 @@ static void handle_stripe6(struct stripe_head *sh) for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); @@ -3288,18 +3373,28 @@ static void handle_stripe6(struct stripe_head *sh) blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* in sync if before recovery_offset */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { if (s.failed < 2) r6s.failed_num[s.failed] = i; s.failed++; - } else - set_bit(R5_Insync, &dev->flags); + } } rcu_read_unlock(); @@ -3500,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh) if (dec_preread_active) { /* We delay this until after ops_run_io so that if make_request - * is waiting on a barrier, it won't continue until the writes + * is waiting on a flush, it won't continue until the writes * have actually been submitted. 
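In the two handle_stripe hunks above, the In_sync test is reworked so that a device which is still being recovered is not automatically counted as failed: R5_Insync is now also set when the whole stripe lies below rdev->recovery_offset, i.e. in the part of the device that has already been rebuilt, and a read error on the stripe clears it again. A hedged sketch of that per-device decision (the structure and field names are simplified stand-ins for the kernel's mdk_rdev_t, and STRIPE_SECTORS is assumed to be 8, one 4KiB stripe unit):

#include <stdbool.h>
#include <stdint.h>

#define STRIPE_SECTORS	8	/* one 4KiB stripe unit, as in raid5.h */

struct rdev_sketch {
	bool in_sync;			/* the In_sync flag */
	bool read_error;		/* R5_ReadError seen on this stripe */
	uint64_t recovery_offset;	/* sectors already recovered */
};

/* Mirrors the new R5_Insync decision for one device of one stripe. */
static bool stripe_dev_insync(const struct rdev_sketch *rdev,
			      uint64_t stripe_sector)
{
	bool insync;

	if (!rdev)
		insync = false;			/* no device at all */
	else if (rdev->in_sync)
		insync = true;			/* fully synced member */
	else
		/* still recovering: usable if this stripe is already rebuilt */
		insync = stripe_sector + STRIPE_SECTORS <= rdev->recovery_offset;

	if (insync && rdev->read_error)
		insync = false;			/* a read error overrides */

	return insync;
}

int main(void)
{
	struct rdev_sketch recovering = {
		.in_sync = false, .read_error = false, .recovery_offset = 1024
	};

	/* stripe at sector 512 is below recovery_offset, so it counts as in-sync */
	return stripe_dev_insync(&recovering, 512) ? 0 : 1;
}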
*/ atomic_dec(&conf->preread_active_stripes); @@ -3534,7 +3629,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf) list_add_tail(&sh->lru, &conf->hold_list); } } else - blk_plug_device(conf->mddev->queue); + plugger_set_plug(&conf->plug); } static void activate_bit_delay(raid5_conf_t *conf) @@ -3575,36 +3670,44 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_unlock(); } -static void raid5_unplug_device(struct request_queue *q) +void md_raid5_unplug_device(raid5_conf_t *conf) { - mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev->private; unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); - if (blk_remove_plug(q)) { + if (plugger_remove_plug(&conf->plug)) { conf->seq_flush++; raid5_activate_delayed(conf); } - md_wakeup_thread(mddev->thread); + md_wakeup_thread(conf->mddev->thread); spin_unlock_irqrestore(&conf->device_lock, flags); - unplug_slaves(mddev); + unplug_slaves(conf->mddev); } +EXPORT_SYMBOL_GPL(md_raid5_unplug_device); -static int raid5_congested(void *data, int bits) +static void raid5_unplug(struct plug_handle *plug) +{ + raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); + md_raid5_unplug_device(conf); +} + +static void raid5_unplug_queue(struct request_queue *q) +{ + mddev_t *mddev = q->queuedata; + md_raid5_unplug_device(mddev->private); +} + +int md_raid5_congested(mddev_t *mddev, int bits) { - mddev_t *mddev = data; raid5_conf_t *conf = mddev->private; /* No difference between reads and writes. Just check * how busy the stripe_cache is */ - if (mddev_congested(mddev, bits)) - return 1; if (conf->inactive_blocked) return 1; if (conf->quiesce) @@ -3614,6 +3717,15 @@ static int raid5_congested(void *data, int bits) return 0; } +EXPORT_SYMBOL_GPL(md_raid5_congested); + +static int raid5_congested(void *data, int bits) +{ + mddev_t *mddev = data; + + return mddev_congested(mddev, bits) || + md_raid5_congested(mddev, bits); +} /* We want read requests to align with chunks where possible, * but write requests don't need to. @@ -3712,10 +3824,10 @@ static void raid5_align_endio(struct bio *bi, int error) bio_put(bi); - mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; - conf = mddev->private; rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; + mddev = rdev->mddev; + conf = mddev->private; rdev_dec_pending(rdev, conf->mddev); @@ -3739,7 +3851,7 @@ static int bio_fits_rdev(struct bio *bi) if ((bi->bi_size>>9) > queue_max_sectors(q)) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > queue_max_phys_segments(q)) + if (bi->bi_phys_segments > queue_max_segments(q)) return 0; if (q->merge_bvec_fn) @@ -3752,9 +3864,8 @@ static int bio_fits_rdev(struct bio *bi) } -static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) +static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) { - mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev->private; int dd_idx; struct bio* align_bi; @@ -3869,39 +3980,26 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) return sh; } -static int make_request(struct request_queue *q, struct bio * bi) +static int make_request(mddev_t *mddev, struct bio * bi) { - mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev->private; int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); - int cpu, remaining; + int remaining; - if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { - /* Drain all pending writes. 
We only really need - * to ensure they have been submitted, but this is - * easier. - */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - md_barrier_request(mddev, bi); + if (unlikely(bi->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bi); return 0; } md_write_start(mddev, bi); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bi)); - part_stat_unlock(); - if (rw == READ && mddev->reshape_position == MaxSector && - chunk_aligned_read(q,bi)) + chunk_aligned_read(mddev,bi)) return 0; logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); @@ -3949,7 +4047,7 @@ static int make_request(struct request_queue *q, struct bio * bi) new_sector = raid5_compute_sector(conf, logical_sector, previous, &dd_idx, NULL); - pr_debug("raid5: make_request, sector %llu logical %llu\n", + pr_debug("raid456: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); @@ -4003,7 +4101,7 @@ static int make_request(struct request_queue *q, struct bio * bi) * add failed due to overlap. Flush everything * and wait a while */ - raid5_unplug_device(mddev->queue); + md_raid5_unplug_device(conf); release_stripe(sh); schedule(); goto retry; @@ -4011,7 +4109,7 @@ static int make_request(struct request_queue *q, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - if (mddev->barrier && + if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); release_stripe(sh); @@ -4034,13 +4132,6 @@ static int make_request(struct request_queue *q, struct bio * bi) bio_endio(bi, 0); } - if (mddev->barrier) { - /* We need to wait for the stripes to all be handled. - * So: wait for preread_active_stripes to drop to 0. - */ - wait_event(mddev->thread->wqueue, - atomic_read(&conf->preread_active_stripes) == 0); - } return 0; } @@ -4057,7 +4148,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * As the reads complete, handle_stripe will copy the data * into the destination stripe and release that stripe. 
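make_request() above walks each bio one stripe unit at a time: after the new REQ_FLUSH early-out, the starting sector is rounded down to a STRIPE_SECTORS boundary and every unit up to the last sector is mapped through raid5_compute_sector(). A small standalone sketch of that rounding and walk, with the per-unit mapping reduced to a printf (STRIPE_SECTORS again assumed to be 8):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

#define STRIPE_SECTORS	8	/* 4KiB units, matching STRIPE_SIZE >> 9 */

/*
 * Walk a request one stripe unit at a time, the way make_request() does,
 * calling a per-unit mapper (here just printed).
 */
static void walk_stripe_units(sector_t bi_sector, unsigned int sectors)
{
	sector_t logical = bi_sector & ~((sector_t)STRIPE_SECTORS - 1);
	sector_t last = bi_sector + sectors;

	for (; logical < last; logical += STRIPE_SECTORS)
		printf("map unit at logical sector %llu\n",
		       (unsigned long long)logical);
}

int main(void)
{
	walk_stripe_units(1000003, 17);	/* unaligned start, partial tail */
	return 0;
}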
*/ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; struct stripe_head *sh; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; @@ -4266,7 +4357,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped /* FIXME go_faster isn't used */ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; struct stripe_head *sh; sector_t max_sector = mddev->dev_sectors; int sync_blocks; @@ -4494,23 +4585,15 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page) return 0; } -static ssize_t -raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) +int +raid5_set_cache_size(mddev_t *mddev, int size) { raid5_conf_t *conf = mddev->private; - unsigned long new; int err; - if (len >= PAGE_SIZE) + if (size <= 16 || size > 32768) return -EINVAL; - if (!conf) - return -ENODEV; - - if (strict_strtoul(page, 10, &new)) - return -EINVAL; - if (new <= 16 || new > 32768) - return -EINVAL; - while (new < conf->max_nr_stripes) { + while (size < conf->max_nr_stripes) { if (drop_one_stripe(conf)) conf->max_nr_stripes--; else @@ -4519,11 +4602,32 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) err = md_allow_write(mddev); if (err) return err; - while (new > conf->max_nr_stripes) { + while (size > conf->max_nr_stripes) { if (grow_one_stripe(conf)) conf->max_nr_stripes++; else break; } + return 0; +} +EXPORT_SYMBOL(raid5_set_cache_size); + +static ssize_t +raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) +{ + raid5_conf_t *conf = mddev->private; + unsigned long new; + int err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (strict_strtoul(page, 10, &new)) + return -EINVAL; + err = raid5_set_cache_size(mddev, new); + if (err) + return err; return len; } @@ -4659,7 +4763,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, kfree(percpu->scribble); pr_err("%s: failed memory allocation for cpu%ld\n", __func__, cpu); - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); } break; case CPU_DEAD: @@ -4680,7 +4784,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf) { unsigned long cpu; struct page *spare_page; - struct raid5_percpu *allcpus; + struct raid5_percpu __percpu *allcpus; void *scribble; int err; @@ -4728,7 +4832,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (mddev->new_level != 5 && mddev->new_level != 4 && mddev->new_level != 6) { - printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", + printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", mdname(mddev), mddev->new_level); return ERR_PTR(-EIO); } @@ -4736,12 +4840,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) && !algorithm_valid_raid5(mddev->new_layout)) || (mddev->new_level == 6 && !algorithm_valid_raid6(mddev->new_layout))) { - printk(KERN_ERR "raid5: %s: layout %d not supported\n", + printk(KERN_ERR "md/raid:%s: layout %d not supported\n", mdname(mddev), mddev->new_layout); return ERR_PTR(-EIO); } if (mddev->new_level == 6 && mddev->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", mdname(mddev), mddev->raid_disks); return ERR_PTR(-EINVAL); } @@ -4749,8 +4853,8 @@ static raid5_conf_t *setup_conf(mddev_t 
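The stripe_cache_size change above splits the sysfs store routine into parsing plus a new exported setter, raid5_set_cache_size(), which converges max_nr_stripes on the requested value one stripe at a time (shrinking first, then calling md_allow_write() before growing). A sketch of that converge-by-one loop with trivial stand-ins for drop_one_stripe()/grow_one_stripe(), which in the kernel free or allocate one struct stripe_head:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct conf_sketch {
	int max_nr_stripes;
};

/* Stand-ins: the real helpers free or allocate one struct stripe_head. */
static bool drop_one_stripe(struct conf_sketch *conf)
{
	return conf->max_nr_stripes > 0;
}

static bool grow_one_stripe(struct conf_sketch *conf)
{
	(void)conf;
	return true;
}

static int set_cache_size_sketch(struct conf_sketch *conf, int size)
{
	if (size <= 16 || size > 32768)
		return -EINVAL;	/* same bounds as the sysfs attribute */

	while (size < conf->max_nr_stripes)
		if (drop_one_stripe(conf))
			conf->max_nr_stripes--;
		else
			break;
	/* the kernel calls md_allow_write() here before growing */
	while (size > conf->max_nr_stripes)
		if (grow_one_stripe(conf))
			conf->max_nr_stripes++;
		else
			break;
	return 0;
}

int main(void)
{
	struct conf_sketch conf = { .max_nr_stripes = 256 };

	set_cache_size_sketch(&conf, 1024);
	printf("max_nr_stripes = %d\n", conf.max_nr_stripes);
	return 0;
}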
*mddev) if (!mddev->new_chunk_sectors || (mddev->new_chunk_sectors << 9) % PAGE_SIZE || !is_power_of_2(mddev->new_chunk_sectors)) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - mddev->new_chunk_sectors << 9, mdname(mddev)); + printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", + mdname(mddev), mddev->new_chunk_sectors << 9); return ERR_PTR(-EINVAL); } @@ -4792,7 +4896,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (raid5_alloc_percpu(conf) != 0) goto abort; - pr_debug("raid5: run(%s) called.\n", mdname(mddev)); + pr_debug("raid456: run(%s) called.\n", mdname(mddev)); list_for_each_entry(rdev, &mddev->disks, same_set) { raid_disk = rdev->raid_disk; @@ -4805,9 +4909,9 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (test_bit(In_sync, &rdev->flags)) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "raid5: device %s operational as raid" - " disk %d\n", bdevname(rdev->bdev,b), - raid_disk); + printk(KERN_INFO "md/raid:%s: device %s operational as raid" + " disk %d\n", + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); } else /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; @@ -4831,16 +4935,17 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; if (grow_stripes(conf, conf->max_nr_stripes)) { printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); + "md/raid:%s: couldn't allocate %dkB for buffers\n", + mdname(mddev), memory); goto abort; } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); + printk(KERN_INFO "md/raid:%s: allocated %dkB\n", + mdname(mddev), memory); conf->thread = md_register_thread(raid5d, mddev, NULL); if (!conf->thread) { printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", + "md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); goto abort; } @@ -4885,13 +4990,13 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded static int run(mddev_t *mddev) { raid5_conf_t *conf; - int working_disks = 0, chunk_size; + int working_disks = 0; int dirty_parity_disks = 0; mdk_rdev_t *rdev; sector_t reshape_offset = 0; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "raid5: %s is not clean" + printk(KERN_NOTICE "md/raid:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); if (mddev->reshape_position != MaxSector) { @@ -4905,7 +5010,7 @@ static int run(mddev_t *mddev) int max_degraded = (mddev->level == 6 ? 
2 : 1); if (mddev->new_level != mddev->level) { - printk(KERN_ERR "raid5: %s: unsupported reshape " + printk(KERN_ERR "md/raid:%s: unsupported reshape " "required - aborting.\n", mdname(mddev)); return -EINVAL; @@ -4918,8 +5023,8 @@ static int run(mddev_t *mddev) here_new = mddev->reshape_position; if (sector_div(here_new, mddev->new_chunk_sectors * (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "raid5: reshape_position not " - "on a stripe boundary\n"); + printk(KERN_ERR "md/raid:%s: reshape_position not " + "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } reshape_offset = here_new * mddev->new_chunk_sectors; @@ -4940,8 +5045,9 @@ static int run(mddev_t *mddev) if ((here_new * mddev->new_chunk_sectors != here_old * mddev->chunk_sectors) || mddev->ro == 0) { - printk(KERN_ERR "raid5: in-place reshape must be started" - " in read-only mode - aborting\n"); + printk(KERN_ERR "md/raid:%s: in-place reshape must be started" + " in read-only mode - aborting\n", + mdname(mddev)); return -EINVAL; } } else if (mddev->delta_disks < 0 @@ -4950,11 +5056,13 @@ static int run(mddev_t *mddev) : (here_new * mddev->new_chunk_sectors >= here_old * mddev->chunk_sectors)) { /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for " - "auto-recovery - aborting.\n"); + printk(KERN_ERR "md/raid:%s: reshape_position too early for " + "auto-recovery - aborting.\n", + mdname(mddev)); return -EINVAL; } - printk(KERN_INFO "raid5: reshape will continue\n"); + printk(KERN_INFO "md/raid:%s: reshape will continue\n", + mdname(mddev)); /* OK, we should be able to continue; */ } else { BUG_ON(mddev->level != mddev->new_level); @@ -4981,8 +5089,10 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk < 0) continue; - if (test_bit(In_sync, &rdev->flags)) + if (test_bit(In_sync, &rdev->flags)) { working_disks++; + continue; + } /* This disc is not fully in-sync. 
However if it * just stored parity (beyond the recovery_offset), * when we don't need to be concerned about the @@ -4996,18 +5106,6 @@ static int run(mddev_t *mddev) mddev->minor_version > 90) rdev->recovery_offset = reshape_offset; - printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n", - rdev->raid_disk, working_disks, conf->prev_algo, - conf->previous_raid_disks, conf->max_degraded, - conf->algorithm, conf->raid_disks, - only_parity(rdev->raid_disk, - conf->prev_algo, - conf->previous_raid_disks, - conf->max_degraded), - only_parity(rdev->raid_disk, - conf->algorithm, - conf->raid_disks, - conf->max_degraded)); if (rdev->recovery_offset < reshape_offset) { /* We need to check old and new layout */ if (!only_parity(rdev->raid_disk, @@ -5027,8 +5125,8 @@ static int run(mddev_t *mddev) mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) - working_disks); - if (mddev->degraded > conf->max_degraded) { - printk(KERN_ERR "raid5: not enough operational devices for %s" + if (has_failed(conf)) { + printk(KERN_ERR "md/raid:%s: not enough operational devices" " (%d/%d failed)\n", mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; @@ -5042,32 +5140,32 @@ static int run(mddev_t *mddev) mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) printk(KERN_WARNING - "raid5: starting dirty degraded array: %s" - "- data corruption possible.\n", + "md/raid:%s: starting dirty degraded array" + " - data corruption possible.\n", mdname(mddev)); else { printk(KERN_ERR - "raid5: cannot start dirty degraded array for %s\n", + "md/raid:%s: cannot start dirty degraded array.\n", mdname(mddev)); goto abort; } } if (mddev->degraded == 0) - printk("raid5: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), + printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" + " devices, algorithm %d\n", mdname(mddev), conf->level, mddev->raid_disks-mddev->degraded, mddev->raid_disks, mddev->new_layout); else - printk(KERN_ALERT "raid5: raid level %d set %s active with %d" - " out of %d devices, algorithm %d\n", conf->level, - mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks, mddev->new_layout); + printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" + " out of %d devices, algorithm %d\n", + mdname(mddev), conf->level, + mddev->raid_disks - mddev->degraded, + mddev->raid_disks, mddev->new_layout); print_raid5_conf(conf); if (conf->reshape_progress != MaxSector) { - printk("...ok start reshape thread\n"); conf->reshape_safe = conf->reshape_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -5078,40 +5176,47 @@ static int run(mddev_t *mddev) "reshape"); } - /* read-ahead size must cover two whole stripes, which is - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices - */ - { - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - } /* Ok, everything is just fine now */ - if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) + if (mddev->to_remove == &raid5_attrs_group) + mddev->to_remove = NULL; + else if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) printk(KERN_WARNING "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - 
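run() keeps the rule that the queue read-ahead must cover two whole stripes, 2 * data_disks * chunk_size, but the calculation now sits inside the new "if (mddev->queue)" block so a conf built without a request queue skips all of the block-layer tuning. A worked example of the sizing, assuming a 4KiB PAGE_SIZE:

#include <stdio.h>

#define PAGE_SIZE_SKETCH	4096UL

/* Pages of read-ahead needed to cover two whole stripes. */
static unsigned long raid5_ra_pages(int raid_disks, int max_degraded,
				    unsigned int chunk_sectors)
{
	int data_disks = raid_disks - max_degraded;
	unsigned long stripe = data_disks *
		((chunk_sectors << 9) / PAGE_SIZE_SKETCH);

	return 2 * stripe;
}

int main(void)
{
	/* 6-disk RAID6 (4 data disks), 64KiB chunk = 128 sectors */
	printf("ra_pages = %lu\n", raid5_ra_pages(6, 2, 128));
	/* -> 2 * 4 * 16 = 128 pages, i.e. 512KiB of read-ahead */
	return 0;
}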
mddev->queue->queue_lock = &conf->device_lock; + plugger_init(&conf->plug, raid5_unplug); + mddev->plug = &conf->plug; + if (mddev->queue) { + int chunk_size; + /* read-ahead size must cover two whole stripes, which + * is 2 * (datadisks) * chunksize where 'n' is the + * number of raid devices + */ + int data_disks = conf->previous_raid_disks - conf->max_degraded; + int stripe = data_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); + if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - mddev->queue->unplug_fn = raid5_unplug_device; - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; + blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = raid5_congested; + mddev->queue->queue_lock = &conf->device_lock; + mddev->queue->unplug_fn = raid5_unplug_queue; - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - chunk_size = mddev->chunk_sectors << 9; - blk_queue_io_min(mddev->queue, chunk_size); - blk_queue_io_opt(mddev->queue, chunk_size * - (conf->raid_disks - conf->max_degraded)); + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks - conf->max_degraded)); - list_for_each_entry(rdev, &mddev->disks, same_set) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + list_for_each_entry(rdev, &mddev->disks, same_set) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } return 0; abort: @@ -5122,23 +5227,22 @@ abort: free_conf(conf); } mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); + printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); return -EIO; } - - static int stop(mddev_t *mddev) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; md_unregister_thread(mddev->thread); mddev->thread = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); + if (mddev->queue) + mddev->queue->backing_dev_info.congested_fn = NULL; + plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/ free_conf(conf); mddev->private = NULL; + mddev->to_remove = &raid5_attrs_group; return 0; } @@ -5179,7 +5283,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf) static void status(struct seq_file *seq, mddev_t *mddev) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + raid5_conf_t *conf = mddev->private; int i; seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, @@ -5201,21 +5305,22 @@ static void print_raid5_conf (raid5_conf_t *conf) int i; struct disk_info *tmp; - printk("RAID5 conf printout:\n"); + printk(KERN_DEBUG "RAID conf printout:\n"); if (!conf) { printk("(conf==NULL)\n"); return; } - printk(" --- rd:%d wd:%d\n", conf->raid_disks, - conf->raid_disks - conf->mddev->degraded); + printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, + conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->disks + i; if (tmp->rdev) - printk(" disk %d, o:%d, dev:%s\n", - i, !test_bit(Faulty, &tmp->rdev->flags), - 
bdevname(tmp->rdev->bdev,b)); + printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", + i, !test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev, b)); } } @@ -5224,20 +5329,24 @@ static int raid5_spare_active(mddev_t *mddev) int i; raid5_conf_t *conf = mddev->private; struct disk_info *tmp; + int count = 0; + unsigned long flags; for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + count++; + sysfs_notify_dirent(tmp->rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); print_raid5_conf(conf); - return 0; + return count; } static int raid5_remove_disk(mddev_t *mddev, int number) @@ -5263,7 +5372,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. */ if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded && + !has_failed(conf) && number < conf->raid_disks) { err = -EBUSY; goto abort; @@ -5291,7 +5400,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; - if (mddev->degraded > conf->max_degraded) + if (has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -5338,7 +5447,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) raid5_size(mddev, sectors, mddev->raid_disks)) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; @@ -5364,7 +5472,8 @@ static int check_stripe_cache(mddev_t *mddev) > conf->max_nr_stripes || ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { - printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", + printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", + mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) / STRIPE_SIZE)*4); return 0; @@ -5383,7 +5492,7 @@ static int check_reshape(mddev_t *mddev) if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; - if (mddev->degraded > conf->max_degraded) + if (has_failed(conf)) return -EINVAL; if (mddev->delta_disks < 0) { /* We might be able to shrink, but the devices must @@ -5435,7 +5544,7 @@ static int raid5_start_reshape(mddev_t *mddev) */ if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) < mddev->array_sectors) { - printk(KERN_ERR "md: %s: array size must be reduced " + printk(KERN_ERR "md/raid:%s: array size must be reduced " "before number of disks\n", mdname(mddev)); return -EINVAL; } @@ -5458,31 +5567,36 @@ static int raid5_start_reshape(mddev_t *mddev) /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. + * Don't add devices if we are reducing the number of + * devices in the array. This is because it is not possible + * to correctly record the "partially reconstructed" state of + * such devices during the reshape and confusion could result. 
*/ - list_for_each_entry(rdev, &mddev->disks, same_set) + if (mddev->delta_disks >= 0) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; - if (rdev->raid_disk >= conf->previous_raid_disks) + if (rdev->raid_disk >= conf->previous_raid_disks) { set_bit(In_sync, &rdev->flags); - else + added_devices++; + } else rdev->recovery_offset = 0; - added_devices++; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "raid5: failed to create " - " link %s for %s\n", - nm, mdname(mddev)); + /* Failure here is OK */; } else break; } + /* When a reshape changes the number of devices, ->degraded + * is measured against the larger of the pre and post number of + * devices.*/ if (mddev->delta_disks > 0) { spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) + mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) - added_devices; spin_unlock_irqrestore(&conf->device_lock, flags); } @@ -5527,7 +5641,7 @@ static void end_reshape(raid5_conf_t *conf) /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices */ - { + if (conf->mddev->queue) { int data_disks = conf->raid_disks - conf->max_degraded; int stripe = data_disks * ((conf->chunk_sectors << 9) / PAGE_SIZE); @@ -5549,7 +5663,6 @@ static void raid5_finish_reshape(mddev_t *mddev) if (mddev->delta_disks > 0) { md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; revalidate_disk(mddev->gendisk); } else { int d; @@ -5614,6 +5727,29 @@ static void raid5_quiesce(mddev_t *mddev, int state) } +static void *raid45_takeover_raid0(mddev_t *mddev, int level) +{ + struct raid0_private_data *raid0_priv = mddev->private; + + /* for raid0 takeover only one zone is supported */ + if (raid0_priv->nr_strip_zones > 1) { + printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + mddev->new_level = level; + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks += 1; + mddev->delta_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + return setup_conf(mddev); +} + + static void *raid5_takeover_raid1(mddev_t *mddev) { int chunksect; @@ -5738,12 +5874,13 @@ static int raid6_check_reshape(mddev_t *mddev) static void *raid5_takeover(mddev_t *mddev) { /* raid5 can take over: - * raid0 - if all devices are the same - make it a raid4 layout + * raid0 - if there is only one strip zone - make it a raid4 layout * raid1 - if there are two drives. We need to know the chunk size * raid4 - trivial - just use a raid4 layout. 
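raid45_takeover_raid0(), added above, converts a single-zone RAID0 by rewriting the shape parameters only: the chunk size is kept, the layout becomes ALGORITHM_PARITY_N (all data disks first, parity last), one disk is added for the parity device, and recovery_cp is forced to MaxSector so the new array is not treated as dirty; setup_conf() then builds the raid4/5 configuration. A condensed sketch of that parameter rewrite, with the mddev reduced to the fields touched here:

#include <stdint.h>
#include <stdio.h>

#define ALGORITHM_PARITY_N	5
#define MAX_SECTOR		(~(uint64_t)0)

struct mddev_sketch {
	int level, new_level;
	int layout, new_layout;
	unsigned int chunk_sectors, new_chunk_sectors;
	int raid_disks, delta_disks;
	uint64_t recovery_cp;
	int nr_strip_zones;	/* from the raid0 private data */
};

static int takeover_raid0_sketch(struct mddev_sketch *m, int level)
{
	if (m->nr_strip_zones > 1)
		return -1;	/* only single-zone raid0 can be taken over */

	m->new_level = level;			/* 4 or 5 */
	m->new_layout = ALGORITHM_PARITY_N;	/* data disks first, parity last */
	m->new_chunk_sectors = m->chunk_sectors;
	m->raid_disks += 1;			/* room for the parity disk */
	m->delta_disks = 1;
	m->recovery_cp = MAX_SECTOR;		/* don't mark the array dirty */
	return 0;
}

int main(void)
{
	struct mddev_sketch m = { .level = 0, .chunk_sectors = 128,
				  .raid_disks = 3, .nr_strip_zones = 1 };

	if (takeover_raid0_sketch(&m, 5) == 0)
		printf("raid%d with %d disks, layout %d\n",
		       m.new_level, m.raid_disks, m.new_layout);
	return 0;
}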
* raid6 - Providing it is a *_6 layout */ - + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 5); if (mddev->level == 1) return raid5_takeover_raid1(mddev); if (mddev->level == 4) { @@ -5757,6 +5894,22 @@ static void *raid5_takeover(mddev_t *mddev) return ERR_PTR(-EINVAL); } +static void *raid4_takeover(mddev_t *mddev) +{ + /* raid4 can take over: + * raid0 - if there is only one strip zone + * raid5 - if layout is right + */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 4); + if (mddev->level == 5 && + mddev->layout == ALGORITHM_PARITY_N) { + mddev->new_layout = 0; + mddev->new_level = 4; + return setup_conf(mddev); + } + return ERR_PTR(-EINVAL); +} static struct mdk_personality raid5_personality; @@ -5872,6 +6025,7 @@ static struct mdk_personality raid4_personality = .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid4_takeover, }; static int __init raid5_init(void) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index dd708359b451..2ace0582b409 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -275,6 +275,7 @@ struct r6_state { * filling */ #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +#define R5_WantFUA 14 /* Write should be FUA */ /* * Write method */ @@ -388,7 +389,7 @@ struct raid5_private_data { * two caches. */ int active_name; - char cache_name[2][20]; + char cache_name[2][32]; struct kmem_cache *slab_cache; /* for allocating stripes */ int seq_flush, seq_write; @@ -398,6 +399,9 @@ struct raid5_private_data { * (fresh device added). * Cleared when a sync completes. */ + + struct plug_handle plug; + /* per cpu variables */ struct raid5_percpu { struct page *spare_page; /* Used when checking P/Q in raid6 */ @@ -405,7 +409,7 @@ struct raid5_private_data { * lists and performing address * conversions */ - } *percpu; + } __percpu *percpu; size_t scribble_len; /* size of scribble region must be * associated with conf to handle * cpu hotplug while reshaping @@ -497,4 +501,8 @@ static inline int algorithm_is_DDF(int layout) { return layout >= 8 && layout <= 10; } + +extern int md_raid5_congested(mddev_t *mddev, int bits); +extern void md_raid5_unplug_device(raid5_conf_t *conf); +extern int raid5_set_cache_size(mddev_t *mddev, int size); #endif diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c deleted file mode 100644 index bffc61bff5ab..000000000000 --- a/drivers/md/raid6algos.c +++ /dev/null @@ -1,153 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6algos.c - * - * Algorithm list and algorithm selection for RAID-6 - */ - -#include <linux/raid/pq.h> -#ifndef __KERNEL__ -#include <sys/mman.h> -#include <stdio.h> -#else -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -EXPORT_SYMBOL(raid6_empty_zero_page); -#endif -#endif - -struct raid6_calls raid6_call; -EXPORT_SYMBOL_GPL(raid6_call); - -const struct raid6_calls * const raid6_algos[] = { - &raid6_intx1, - &raid6_intx2, - &raid6_intx4, - &raid6_intx8, -#if defined(__ia64__) - &raid6_intx16, - &raid6_intx32, -#endif -#if defined(__i386__) && !defined(__arch_um__) - &raid6_mmxx1, - &raid6_mmxx2, - &raid6_sse1x1, - &raid6_sse1x2, - &raid6_sse2x1, - &raid6_sse2x2, -#endif -#if defined(__x86_64__) && !defined(__arch_um__) - &raid6_sse2x1, - &raid6_sse2x2, - &raid6_sse2x4, -#endif -#ifdef CONFIG_ALTIVEC - &raid6_altivec1, - &raid6_altivec2, - &raid6_altivec4, - &raid6_altivec8, -#endif - NULL -}; - -#ifdef __KERNEL__ -#define RAID6_TIME_JIFFIES_LG2 4 -#else -/* Need more time to be stable in userspace */ -#define RAID6_TIME_JIFFIES_LG2 9 -#define time_before(x, y) ((x) < (y)) -#endif - -/* Try to pick the best algorithm */ -/* This code uses the gfmul table as convenient data set to abuse */ - -int __init raid6_select_algo(void) -{ - const struct raid6_calls * const * algo; - const struct raid6_calls * best; - char *syndromes; - void *dptrs[(65536/PAGE_SIZE)+2]; - int i, disks; - unsigned long perf, bestperf; - int bestprefer; - unsigned long j0, j1; - - disks = (65536/PAGE_SIZE)+2; - for ( i = 0 ; i < disks-2 ; i++ ) { - dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; - } - - /* Normal code - use a 2-page allocation to avoid D$ conflict */ - syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); - - if ( !syndromes ) { - printk("raid6: Yikes! No memory available.\n"); - return -ENOMEM; - } - - dptrs[disks-2] = syndromes; - dptrs[disks-1] = syndromes + PAGE_SIZE; - - bestperf = 0; bestprefer = 0; best = NULL; - - for ( algo = raid6_algos ; *algo ; algo++ ) { - if ( !(*algo)->valid || (*algo)->valid() ) { - perf = 0; - - preempt_disable(); - j0 = jiffies; - while ( (j1 = jiffies) == j0 ) - cpu_relax(); - while (time_before(jiffies, - j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { - (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); - perf++; - } - preempt_enable(); - - if ( (*algo)->prefer > bestprefer || - ((*algo)->prefer == bestprefer && - perf > bestperf) ) { - best = *algo; - bestprefer = best->prefer; - bestperf = perf; - } - printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, - (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); - } - } - - if (best) { - printk("raid6: using algorithm %s (%ld MB/s)\n", - best->name, - (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); - raid6_call = *best; - } else - printk("raid6: Yikes! No algorithm found!\n"); - - free_pages((unsigned long)syndromes, 1); - - return best ? 0 : -EINVAL; -} - -static void raid6_exit(void) -{ - do { } while (0); -} - -subsys_initcall(raid6_select_algo); -module_exit(raid6_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("RAID6 Q-syndrome calculations"); diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc deleted file mode 100644 index 2654d5c854be..000000000000 --- a/drivers/md/raid6altivec.uc +++ /dev/null @@ -1,130 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002-2004 H. 
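The deleted raid6algos.c shows how the Q-syndrome implementation used to be chosen: every candidate in raid6_algos[] is run in a tight loop for a fixed number of jiffies, and the winner is the entry with the highest prefer value (set for implementations with cache hints), with measured throughput breaking ties; the file is removed here as part of the same series that makes the Kconfig select RAID6_PQ instead of the local MD_RAID6_PQ library. A userspace sketch of the same selection loop over function pointers, timed with clock() instead of jiffies:

#include <stdio.h>
#include <time.h>

struct candidate {
	const char *name;
	int prefer;			/* higher wins; set for cache-hint variants */
	void (*work)(void);		/* stand-in for gen_syndrome() */
};

static void work_a(void) { volatile int x = 0; for (int i = 0; i < 1000; i++) x += i; }
static void work_b(void) { volatile int x = 0; for (int i = 0; i < 100; i++) x += i; }

/* Run each candidate for ~50ms of CPU time and keep the best. */
static const struct candidate *select_best(const struct candidate *c, int n)
{
	const struct candidate *best = NULL;
	unsigned long bestperf = 0;
	int bestprefer = 0;

	for (int i = 0; i < n; i++) {
		unsigned long perf = 0;
		clock_t end = clock() + CLOCKS_PER_SEC / 20;

		while (clock() < end) {
			c[i].work();
			perf++;
		}
		printf("%-8s %lu iterations\n", c[i].name, perf);
		if (c[i].prefer > bestprefer ||
		    (c[i].prefer == bestprefer && perf > bestperf)) {
			best = &c[i];
			bestprefer = c[i].prefer;
			bestperf = perf;
		}
	}
	return best;
}

int main(void)
{
	const struct candidate cands[] = {
		{ "intx1",  0, work_a },
		{ "sse2x1", 1, work_b },
	};
	const struct candidate *best = select_best(cands, 2);

	printf("using %s\n", best ? best->name : "none");
	return 0;
}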
Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * raid6altivec$#.c - * - * $#-way unrolled portable integer math RAID-6 instruction set - * - * This file is postprocessed using unroll.awk - * - * <benh> hpa: in process, - * you can just "steal" the vec unit with enable_kernel_altivec() (but - * bracked this with preempt_disable/enable or in a lock) - */ - -#include <linux/raid/pq.h> - -#ifdef CONFIG_ALTIVEC - -#include <altivec.h> -#ifdef __KERNEL__ -# include <asm/system.h> -# include <asm/cputable.h> -#endif - -/* - * This is the C data type to use. We use a vector of - * signed char so vec_cmpgt() will generate the right - * instruction. - */ - -typedef vector signed char unative_t; - -#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) -#define NSIZE sizeof(unative_t) - -/* - * The SHLBYTE() operation shifts each byte left by 1, *not* - * rolling over into the next byte - */ -static inline __attribute_const__ unative_t SHLBYTE(unative_t v) -{ - return vec_add(v,v); -} - -/* - * The MASK() operation returns 0xFF in any byte for which the high - * bit is 1, 0x00 for any byte for which the high bit is 0. - */ -static inline __attribute_const__ unative_t MASK(unative_t v) -{ - unative_t zv = NBYTES(0); - - /* vec_cmpgt returns a vector bool char; thus the need for the cast */ - return (unative_t)vec_cmpgt(zv, v); -} - - -/* This is noinline to make damned sure that gcc doesn't move any of the - Altivec code around the enable/disable code */ -static void noinline -raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - unative_t wd$$, wq$$, wp$$, w1$$, w2$$; - unative_t x1d = NBYTES(0x1d); - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { - wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; - for ( z = z0-1 ; z >= 0 ; z-- ) { - wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; - wp$$ = vec_xor(wp$$, wd$$); - w2$$ = MASK(wq$$); - w1$$ = SHLBYTE(wq$$); - w2$$ = vec_and(w2$$, x1d); - w1$$ = vec_xor(w1$$, w2$$); - wq$$ = vec_xor(w1$$, wd$$); - } - *(unative_t *)&p[d+NSIZE*$$] = wp$$; - *(unative_t *)&q[d+NSIZE*$$] = wq$$; - } -} - -static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - preempt_disable(); - enable_kernel_altivec(); - - raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); - - preempt_enable(); -} - -int raid6_have_altivec(void); -#if $# == 1 -int raid6_have_altivec(void) -{ - /* This assumes either all CPUs have Altivec or none does */ -# ifdef __KERNEL__ - return cpu_has_feature(CPU_FTR_ALTIVEC); -# else - return 1; -# endif -} -#endif - -const struct raid6_calls raid6_altivec$# = { - raid6_altivec$#_gen_syndrome, - raid6_have_altivec, - "altivecx$#", - 0 -}; - -#endif /* CONFIG_ALTIVEC */ diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc deleted file mode 100644 index d1e276a14fab..000000000000 --- a/drivers/md/raid6int.uc +++ /dev/null @@ -1,117 +0,0 @@ -/* -*- linux-c -*- 
------------------------------------------------------- * - * - * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * raid6int$#.c - * - * $#-way unrolled portable integer math RAID-6 instruction set - * - * This file is postprocessed using unroll.awk - */ - -#include <linux/raid/pq.h> - -/* - * This is the C data type to use - */ - -/* Change this from BITS_PER_LONG if there is something better... */ -#if BITS_PER_LONG == 64 -# define NBYTES(x) ((x) * 0x0101010101010101UL) -# define NSIZE 8 -# define NSHIFT 3 -# define NSTRING "64" -typedef u64 unative_t; -#else -# define NBYTES(x) ((x) * 0x01010101U) -# define NSIZE 4 -# define NSHIFT 2 -# define NSTRING "32" -typedef u32 unative_t; -#endif - - - -/* - * IA-64 wants insane amounts of unrolling. On other architectures that - * is just a waste of space. - */ -#if ($# <= 8) || defined(__ia64__) - - -/* - * These sub-operations are separate inlines since they can sometimes be - * specially optimized using architecture-specific hacks. - */ - -/* - * The SHLBYTE() operation shifts each byte left by 1, *not* - * rolling over into the next byte - */ -static inline __attribute_const__ unative_t SHLBYTE(unative_t v) -{ - unative_t vv; - - vv = (v << 1) & NBYTES(0xfe); - return vv; -} - -/* - * The MASK() operation returns 0xFF in any byte for which the high - * bit is 1, 0x00 for any byte for which the high bit is 0. - */ -static inline __attribute_const__ unative_t MASK(unative_t v) -{ - unative_t vv; - - vv = v & NBYTES(0x80); - vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ - return vv; -} - - -static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - unative_t wd$$, wq$$, wp$$, w1$$, w2$$; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { - wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; - for ( z = z0-1 ; z >= 0 ; z-- ) { - wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; - wp$$ ^= wd$$; - w2$$ = MASK(wq$$); - w1$$ = SHLBYTE(wq$$); - w2$$ &= NBYTES(0x1d); - w1$$ ^= w2$$; - wq$$ = w1$$ ^ wd$$; - } - *(unative_t *)&p[d+NSIZE*$$] = wp$$; - *(unative_t *)&q[d+NSIZE*$$] = wq$$; - } -} - -const struct raid6_calls raid6_intx$# = { - raid6_int$#_gen_syndrome, - NULL, /* always valid */ - "int" NSTRING "x$#", - 0 -}; - -#endif diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c deleted file mode 100644 index e7f6c13132bf..000000000000 --- a/drivers/md/raid6mmx.c +++ /dev/null @@ -1,142 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. 
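raid6int.uc above multiplies the running Q value by 2 in GF(2^8) for a whole machine word of packed bytes at once: MASK() expands each byte's top bit to 0xff or 0x00, SHLBYTE() shifts every byte left by one without carrying into its neighbour, and the 0x1d reduction is XORed in only where the mask is set. A standalone demonstration of that trick on a 64-bit word, checked against a per-byte reference:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NBYTES(x)	((uint64_t)(x) * 0x0101010101010101ULL)

/* Shift every byte left by one, without spilling into the next byte. */
static uint64_t shlbyte(uint64_t v)
{
	return (v << 1) & NBYTES(0xfe);
}

/* 0xff in every byte whose top bit was set, 0x00 elsewhere. */
static uint64_t mask(uint64_t v)
{
	uint64_t vv = v & NBYTES(0x80);

	return (vv << 1) - (vv >> 7);	/* overflow on the top bit is harmless */
}

/* Multiply 8 packed GF(2^8) values by x (i.e. by 2) in one go. */
static uint64_t gf_mul2_packed(uint64_t v)
{
	return shlbyte(v) ^ (mask(v) & NBYTES(0x1d));
}

/* Per-byte reference implementation of the same operation. */
static uint8_t gf_mul2_byte(uint8_t b)
{
	return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1d : 0));
}

int main(void)
{
	uint64_t v = 0x80f31c00a55a01ffULL;
	uint64_t packed = gf_mul2_packed(v);

	for (int i = 0; i < 8; i++)
		assert(((packed >> (8 * i)) & 0xff) ==
		       gf_mul2_byte((uint8_t)((v >> (8 * i)) & 0xff)));
	printf("packed GF(2^8) doubling matches the per-byte reference\n");
	return 0;
}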
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6mmx.c - * - * MMX implementation of RAID-6 syndrome functions - */ - -#if defined(__i386__) && !defined(__arch_um__) - -#include <linux/raid/pq.h> -#include "raid6x86.h" - -/* Shared with raid6sse1.c */ -const struct raid6_mmx_constants { - u64 x1d; -} raid6_mmx_constants = { - 0x1d1d1d1d1d1d1d1dULL, -}; - -static int raid6_have_mmx(void) -{ - /* Not really "boot_cpu" but "all_cpus" */ - return boot_cpu_has(X86_FEATURE_MMX); -} - -/* - * Plain MMX implementation - */ -static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); - asm volatile("pxor %mm5,%mm5"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 8 ) { - asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("movq %mm2,%mm4"); /* Q[0] */ - for ( z = z0-1 ; z >= 0 ; z-- ) { - asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm6,%mm2"); - asm volatile("pxor %mm6,%mm4"); - } - asm volatile("movq %%mm2,%0" : "=m" (p[d])); - asm volatile("pxor %mm2,%mm2"); - asm volatile("movq %%mm4,%0" : "=m" (q[d])); - asm volatile("pxor %mm4,%mm4"); - } - - kernel_fpu_end(); -} - -const struct raid6_calls raid6_mmxx1 = { - raid6_mmx1_gen_syndrome, - raid6_have_mmx, - "mmxx1", - 0 -}; - -/* - * Unrolled-by-2 MMX implementation - */ -static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); - asm volatile("pxor %mm5,%mm5"); /* Zero temp */ - asm volatile("pxor %mm7,%mm7"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 16 ) { - asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); - asm volatile("movq %mm2,%mm4"); /* Q[0] */ - asm volatile("movq %mm3,%mm6"); /* Q[1] */ - for ( z = z0-1 ; z >= 0 ; z-- ) { - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("pcmpgtb %mm6,%mm7"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("paddb %mm6,%mm6"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pand %mm0,%mm7"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm7,%mm6"); - asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); - asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); - asm volatile("pxor %mm5,%mm2"); - asm volatile("pxor %mm7,%mm3"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm7,%mm6"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm7,%mm7"); - } - asm volatile("movq %%mm2,%0" : "=m" (p[d])); - asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); - asm volatile("movq %%mm4,%0" : "=m" (q[d])); - asm volatile("movq %%mm6,%0" : "=m" (q[d+8])); - } - - kernel_fpu_end(); -} - -const struct raid6_calls raid6_mmxx2 = { - raid6_mmx2_gen_syndrome, - raid6_have_mmx, - "mmxx2", - 0 -}; - -#endif diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c deleted file mode 100644 
index 2609f00e0d61..000000000000 --- a/drivers/md/raid6recov.c +++ /dev/null @@ -1,132 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * raid6recov.c - * - * RAID-6 data recovery in dual failure mode. In single failure mode, - * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct - * the syndrome.) - */ - -#include <linux/raid/pq.h> - -/* Recover two failed data blocks. */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, - void **ptrs) -{ - u8 *p, *q, *dp, *dq; - u8 px, qx, db; - const u8 *pbmul; /* P multiplier table for B data */ - const u8 *qmul; /* Q multiplier table (for both) */ - - p = (u8 *)ptrs[disks-2]; - q = (u8 *)ptrs[disks-1]; - - /* Compute syndrome with zero for the missing data pages - Use the dead data pages as temporary storage for - delta p and delta q */ - dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; - ptrs[disks-2] = dp; - dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; - ptrs[disks-1] = dq; - - raid6_call.gen_syndrome(disks, bytes, ptrs); - - /* Restore pointer table */ - ptrs[faila] = dp; - ptrs[failb] = dq; - ptrs[disks-2] = p; - ptrs[disks-1] = q; - - /* Now, pick the proper data tables */ - pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; - qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; - - /* Now do it... */ - while ( bytes-- ) { - px = *p ^ *dp; - qx = qmul[*q ^ *dq]; - *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ - *dp++ = db ^ px; /* Reconstructed A */ - p++; q++; - } -} -EXPORT_SYMBOL_GPL(raid6_2data_recov); - -/* Recover failure of one data block plus the P block */ -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) -{ - u8 *p, *q, *dq; - const u8 *qmul; /* Q multiplier table */ - - p = (u8 *)ptrs[disks-2]; - q = (u8 *)ptrs[disks-1]; - - /* Compute syndrome with zero for the missing data page - Use the dead data page as temporary storage for delta q */ - dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; - ptrs[disks-1] = dq; - - raid6_call.gen_syndrome(disks, bytes, ptrs); - - /* Restore pointer table */ - ptrs[faila] = dq; - ptrs[disks-1] = q; - - /* Now, pick the proper data tables */ - qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; - - /* Now do it... */ - while ( bytes-- ) { - *p++ ^= *dq = qmul[*q ^ *dq]; - q++; dq++; - } -} -EXPORT_SYMBOL_GPL(raid6_datap_recov); - -#ifndef __KERNEL__ -/* Testing only */ - -/* Recover two failed blocks. */ -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) -{ - if ( faila > failb ) { - int tmp = faila; - faila = failb; - failb = tmp; - } - - if ( failb == disks-1 ) { - if ( faila == disks-2 ) { - /* P+Q failure. Just rebuild the syndrome. */ - raid6_call.gen_syndrome(disks, bytes, ptrs); - } else { - /* data+Q failure. Reconstruct data from P, - then rebuild syndrome. */ - /* NOT IMPLEMENTED - equivalent to RAID-5 */ - } - } else { - if ( failb == disks-2 ) { - /* data+P failure. 
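raid6_2data_recov() above rebuilds two lost data blocks by regenerating P and Q with the missing blocks zeroed (reusing the dead pages as scratch) and then solving, byte by byte, the pair of equations Da ^ Db = P ^ Pz and g^a*Da ^ g^b*Db = Q ^ Qz using precomputed multiplier tables. A small sketch with explicit GF(2^8) helpers in place of the tables; the field arithmetic here is naive and far slower than the kernel's pbmul/qmul lookups, but the algebra is the same:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Naive GF(2^8) multiply, polynomial 0x11d (the field the RAID6 code uses). */
static uint8_t gf_mul(uint8_t a, uint8_t b)
{
	uint8_t r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		b >>= 1;
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
	}
	return r;
}

static uint8_t gf_pow2(unsigned e)	/* g^e with generator g = 2 */
{
	uint8_t r = 1;

	while (e--)
		r = gf_mul(r, 2);
	return r;
}

static uint8_t gf_inv(uint8_t a)	/* brute force; the kernel uses a table */
{
	for (unsigned x = 1; x < 256; x++)
		if (gf_mul(a, (uint8_t)x) == 1)
			return (uint8_t)x;
	return 0;
}

/*
 * Recover data bytes Da (disk index fa) and Db (disk index fb) from the
 * parity byte P, the syndrome byte Q, and the same values recomputed with
 * the two missing bytes treated as zero (Pz, Qz).
 */
static void recov_2data(uint8_t P, uint8_t Q, uint8_t Pz, uint8_t Qz,
			unsigned fa, unsigned fb, uint8_t *Da, uint8_t *Db)
{
	uint8_t pd = P ^ Pz;			/* Da ^ Db */
	uint8_t qd = Q ^ Qz;			/* g^fa*Da ^ g^fb*Db */
	uint8_t ga = gf_pow2(fa), gb = gf_pow2(fb);

	*Db = gf_mul(gf_mul(ga, pd) ^ qd, gf_inv(ga ^ gb));
	*Da = pd ^ *Db;
}

int main(void)
{
	/* Three data disks with known bytes; pretend disks 0 and 2 are lost. */
	uint8_t d[3] = { 0x3c, 0xa7, 0x19 };
	uint8_t P = d[0] ^ d[1] ^ d[2];
	uint8_t Q = gf_mul(gf_pow2(0), d[0]) ^ gf_mul(gf_pow2(1), d[1]) ^
		    gf_mul(gf_pow2(2), d[2]);
	/* Syndromes with the lost bytes zeroed: only disk 1 contributes. */
	uint8_t Pz = d[1];
	uint8_t Qz = gf_mul(gf_pow2(1), d[1]);
	uint8_t Da, Db;

	recov_2data(P, Q, Pz, Qz, 0, 2, &Da, &Db);
	assert(Da == d[0] && Db == d[2]);
	printf("recovered 0x%02x and 0x%02x\n", Da, Db);
	return 0;
}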
*/ - raid6_datap_recov(disks, bytes, faila, ptrs); - } else { - /* data+data failure. */ - raid6_2data_recov(disks, bytes, faila, failb, ptrs); - } - } -} - -#endif diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c deleted file mode 100644 index b274dd5eab8f..000000000000 --- a/drivers/md/raid6sse1.c +++ /dev/null @@ -1,162 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * raid6sse1.c - * - * SSE-1/MMXEXT implementation of RAID-6 syndrome functions - * - * This is really an MMX implementation, but it requires SSE-1 or - * AMD MMXEXT for prefetch support and a few other features. The - * support for nontemporal memory accesses is enough to make this - * worthwhile as a separate implementation. - */ - -#if defined(__i386__) && !defined(__arch_um__) - -#include <linux/raid/pq.h> -#include "raid6x86.h" - -/* Defined in raid6mmx.c */ -extern const struct raid6_mmx_constants { - u64 x1d; -} raid6_mmx_constants; - -static int raid6_have_sse1_or_mmxext(void) -{ - /* Not really boot_cpu but "all_cpus" */ - return boot_cpu_has(X86_FEATURE_MMX) && - (boot_cpu_has(X86_FEATURE_XMM) || - boot_cpu_has(X86_FEATURE_MMXEXT)); -} - -/* - * Plain SSE1 implementation - */ -static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); - asm volatile("pxor %mm5,%mm5"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 8 ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); - asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); - asm volatile("movq %mm2,%mm4"); /* Q[0] */ - asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); - for ( z = z0-2 ; z >= 0 ; z-- ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm6,%mm2"); - asm volatile("pxor %mm6,%mm4"); - asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); - } - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm6,%mm2"); - asm volatile("pxor %mm6,%mm4"); - - asm volatile("movntq %%mm2,%0" : "=m" (p[d])); - asm volatile("movntq %%mm4,%0" : "=m" (q[d])); - } - - asm volatile("sfence" : : : "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse1x1 = { - raid6_sse11_gen_syndrome, - raid6_have_sse1_or_mmxext, - "sse1x1", - 1 /* Has cache hints */ -}; - -/* - * Unrolled-by-2 SSE1 implementation - */ -static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, 
*q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); - asm volatile("pxor %mm5,%mm5"); /* Zero temp */ - asm volatile("pxor %mm7,%mm7"); /* Zero temp */ - - /* We uniformly assume a single prefetch covers at least 16 bytes */ - for ( d = 0 ; d < bytes ; d += 16 ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); - asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ - asm volatile("movq %mm2,%mm4"); /* Q[0] */ - asm volatile("movq %mm3,%mm6"); /* Q[1] */ - for ( z = z0-1 ; z >= 0 ; z-- ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("pcmpgtb %mm6,%mm7"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("paddb %mm6,%mm6"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pand %mm0,%mm7"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm7,%mm6"); - asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); - asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); - asm volatile("pxor %mm5,%mm2"); - asm volatile("pxor %mm7,%mm3"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm7,%mm6"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm7,%mm7"); - } - asm volatile("movntq %%mm2,%0" : "=m" (p[d])); - asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); - asm volatile("movntq %%mm4,%0" : "=m" (q[d])); - asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); - } - - asm volatile("sfence" : :: "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse1x2 = { - raid6_sse12_gen_syndrome, - raid6_have_sse1_or_mmxext, - "sse1x2", - 1 /* Has cache hints */ -}; - -#endif diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c deleted file mode 100644 index 6ed6c6c0389f..000000000000 --- a/drivers/md/raid6sse2.c +++ /dev/null @@ -1,262 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6sse2.c - * - * SSE-2 implementation of RAID-6 syndrome functions - * - */ - -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - -#include <linux/raid/pq.h> -#include "raid6x86.h" - -static const struct raid6_sse_constants { - u64 x1d[2]; -} raid6_sse_constants __attribute__((aligned(16))) = { - { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, -}; - -static int raid6_have_sse2(void) -{ - /* Not really boot_cpu but "all_cpus" */ - return boot_cpu_has(X86_FEATURE_MMX) && - boot_cpu_has(X86_FEATURE_FXSR) && - boot_cpu_has(X86_FEATURE_XMM) && - boot_cpu_has(X86_FEATURE_XMM2); -} - -/* - * Plain SSE2 implementation - */ -static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); - asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 16 ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); - asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); - asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ - asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); - for ( z = z0-2 ; z >= 0 ; z-- ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %xmm4,%xmm5"); - asm volatile("paddb %xmm4,%xmm4"); - asm volatile("pand %xmm0,%xmm5"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm5,%xmm5"); - asm volatile("pxor %xmm6,%xmm2"); - asm volatile("pxor %xmm6,%xmm4"); - asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); - } - asm volatile("pcmpgtb %xmm4,%xmm5"); - asm volatile("paddb %xmm4,%xmm4"); - asm volatile("pand %xmm0,%xmm5"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm5,%xmm5"); - asm volatile("pxor %xmm6,%xmm2"); - asm volatile("pxor %xmm6,%xmm4"); - - asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); - asm volatile("pxor %xmm2,%xmm2"); - asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); - asm volatile("pxor %xmm4,%xmm4"); - } - - asm volatile("sfence" : : : "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse2x1 = { - raid6_sse21_gen_syndrome, - raid6_have_sse2, - "sse2x1", - 1 /* Has cache hints */ -}; - -/* - * Unrolled-by-2 SSE2 implementation - */ -static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); - asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ - asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ - - /* We uniformly assume a single prefetch covers at least 32 bytes */ - for ( d = 0 ; d < bytes ; d += 32 ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); - asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ - asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ - asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ - for ( z = z0-1 ; z >= 0 ; z-- ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb 
%xmm4,%xmm5"); - asm volatile("pcmpgtb %xmm6,%xmm7"); - asm volatile("paddb %xmm4,%xmm4"); - asm volatile("paddb %xmm6,%xmm6"); - asm volatile("pand %xmm0,%xmm5"); - asm volatile("pand %xmm0,%xmm7"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm7,%xmm6"); - asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); - asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); - asm volatile("pxor %xmm5,%xmm2"); - asm volatile("pxor %xmm7,%xmm3"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm7,%xmm6"); - asm volatile("pxor %xmm5,%xmm5"); - asm volatile("pxor %xmm7,%xmm7"); - } - asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); - asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); - asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); - asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); - } - - asm volatile("sfence" : : : "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse2x2 = { - raid6_sse22_gen_syndrome, - raid6_have_sse2, - "sse2x2", - 1 /* Has cache hints */ -}; - -#endif - -#if defined(__x86_64__) && !defined(__arch_um__) - -/* - * Unrolled-by-4 SSE2 implementation - */ -static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); - asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ - asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ - asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ - asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ - asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ - asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ - asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ - asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ - asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */ - asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ - asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ - asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 64 ) { - for ( z = z0 ; z >= 0 ; z-- ) { - /* The second prefetch seems to improve performance... 
*/ - asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); - asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); - asm volatile("pcmpgtb %xmm4,%xmm5"); - asm volatile("pcmpgtb %xmm6,%xmm7"); - asm volatile("pcmpgtb %xmm12,%xmm13"); - asm volatile("pcmpgtb %xmm14,%xmm15"); - asm volatile("paddb %xmm4,%xmm4"); - asm volatile("paddb %xmm6,%xmm6"); - asm volatile("paddb %xmm12,%xmm12"); - asm volatile("paddb %xmm14,%xmm14"); - asm volatile("pand %xmm0,%xmm5"); - asm volatile("pand %xmm0,%xmm7"); - asm volatile("pand %xmm0,%xmm13"); - asm volatile("pand %xmm0,%xmm15"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm7,%xmm6"); - asm volatile("pxor %xmm13,%xmm12"); - asm volatile("pxor %xmm15,%xmm14"); - asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); - asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); - asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); - asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); - asm volatile("pxor %xmm5,%xmm2"); - asm volatile("pxor %xmm7,%xmm3"); - asm volatile("pxor %xmm13,%xmm10"); - asm volatile("pxor %xmm15,%xmm11"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm7,%xmm6"); - asm volatile("pxor %xmm13,%xmm12"); - asm volatile("pxor %xmm15,%xmm14"); - asm volatile("pxor %xmm5,%xmm5"); - asm volatile("pxor %xmm7,%xmm7"); - asm volatile("pxor %xmm13,%xmm13"); - asm volatile("pxor %xmm15,%xmm15"); - } - asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); - asm volatile("pxor %xmm2,%xmm2"); - asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); - asm volatile("pxor %xmm3,%xmm3"); - asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); - asm volatile("pxor %xmm10,%xmm10"); - asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); - asm volatile("pxor %xmm11,%xmm11"); - asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); - asm volatile("pxor %xmm4,%xmm4"); - asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); - asm volatile("pxor %xmm6,%xmm6"); - asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); - asm volatile("pxor %xmm12,%xmm12"); - asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); - asm volatile("pxor %xmm14,%xmm14"); - } - - asm volatile("sfence" : : : "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse2x4 = { - raid6_sse24_gen_syndrome, - raid6_have_sse2, - "sse2x4", - 1 /* Has cache hints */ -}; - -#endif diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile deleted file mode 100644 index 2874cbef529d..000000000000 --- a/drivers/md/raid6test/Makefile +++ /dev/null @@ -1,75 +0,0 @@ -# -# This is a simple Makefile to test some of the RAID-6 code -# from userspace. -# - -CC = gcc -OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. 
-I ../../../include -g $(OPTFLAGS) -LD = ld -AWK = awk -AR = ar -RANLIB = ranlib - -.c.o: - $(CC) $(CFLAGS) -c -o $@ $< - -%.c: ../%.c - cp -f $< $@ - -%.uc: ../%.uc - cp -f $< $@ - -all: raid6.a raid6test - -raid6.a: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \ - raid6int32.o \ - raid6mmx.o raid6sse1.o raid6sse2.o \ - raid6altivec1.o raid6altivec2.o raid6altivec4.o raid6altivec8.o \ - raid6recov.o raid6algos.o \ - raid6tables.o - rm -f $@ - $(AR) cq $@ $^ - $(RANLIB) $@ - -raid6test: test.c raid6.a - $(CC) $(CFLAGS) -o raid6test $^ - -raid6altivec1.c: raid6altivec.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=1 < raid6altivec.uc > $@ - -raid6altivec2.c: raid6altivec.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=2 < raid6altivec.uc > $@ - -raid6altivec4.c: raid6altivec.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=4 < raid6altivec.uc > $@ - -raid6altivec8.c: raid6altivec.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=8 < raid6altivec.uc > $@ - -raid6int1.c: raid6int.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=1 < raid6int.uc > $@ - -raid6int2.c: raid6int.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=2 < raid6int.uc > $@ - -raid6int4.c: raid6int.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=4 < raid6int.uc > $@ - -raid6int8.c: raid6int.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=8 < raid6int.uc > $@ - -raid6int16.c: raid6int.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=16 < raid6int.uc > $@ - -raid6int32.c: raid6int.uc ../unroll.awk - $(AWK) ../unroll.awk -vN=32 < raid6int.uc > $@ - -raid6tables.c: mktables - ./mktables > raid6tables.c - -clean: - rm -f *.o *.a mktables mktables.c raid6int.uc raid6*.c raid6test - -spotless: clean - rm -f *~ diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c deleted file mode 100644 index 7a930318b17d..000000000000 --- a/drivers/md/raid6test/test.c +++ /dev/null @@ -1,124 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6test.c - * - * Test RAID-6 recovery with various algorithms - */ - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <linux/raid/pq.h> - -#define NDISKS 16 /* Including P and Q */ - -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -struct raid6_calls raid6_call; - -char *dataptrs[NDISKS]; -char data[NDISKS][PAGE_SIZE]; -char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; - -static void makedata(void) -{ - int i, j; - - for (i = 0; i < NDISKS; i++) { - for (j = 0; j < PAGE_SIZE; j++) - data[i][j] = rand(); - - dataptrs[i] = data[i]; - } -} - -static char disk_type(int d) -{ - switch (d) { - case NDISKS-2: - return 'P'; - case NDISKS-1: - return 'Q'; - default: - return 'D'; - } -} - -static int test_disks(int i, int j) -{ - int erra, errb; - - memset(recovi, 0xf0, PAGE_SIZE); - memset(recovj, 0xba, PAGE_SIZE); - - dataptrs[i] = recovi; - dataptrs[j] = recovj; - - raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); - - erra = memcmp(data[i], recovi, PAGE_SIZE); - errb = memcmp(data[j], recovj, PAGE_SIZE); - - if (i < NDISKS-2 && j == NDISKS-1) { - /* We don't implement the DQ failure scenario, since it's - equivalent to a RAID-5 failure (XOR, then recompute Q) */ - erra = errb = 0; - } else { - printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", - raid6_call.name, - i, disk_type(i), - j, disk_type(j), - (!erra && !errb) ? "OK" : - !erra ? "ERRB" : - !errb ? "ERRA" : "ERRAB"); - } - - dataptrs[i] = data[i]; - dataptrs[j] = data[j]; - - return erra || errb; -} - -int main(int argc, char *argv[]) -{ - const struct raid6_calls *const *algo; - int i, j; - int err = 0; - - makedata(); - - for (algo = raid6_algos; *algo; algo++) { - if (!(*algo)->valid || (*algo)->valid()) { - raid6_call = **algo; - - /* Nuke syndromes */ - memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); - - /* Generate assumed good syndrome */ - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, - (void **)&dataptrs); - - for (i = 0; i < NDISKS-1; i++) - for (j = i+1; j < NDISKS; j++) - err += test_disks(i, j); - } - printf("\n"); - } - - printf("\n"); - /* Pick the best algorithm test */ - raid6_select_algo(); - - if (err) - printf("\n*** ERRORS FOUND ***\n"); - - return err; -} diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h deleted file mode 100644 index 4c22c1568558..000000000000 --- a/drivers/md/raid6x86.h +++ /dev/null @@ -1,61 +0,0 @@ -/* ----------------------------------------------------------------------- * - * - * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6x86.h - * - * Definitions common to x86 and x86-64 RAID-6 code only - */ - -#ifndef LINUX_RAID_RAID6X86_H -#define LINUX_RAID_RAID6X86_H - -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - -#ifdef __KERNEL__ /* Real code */ - -#include <asm/i387.h> - -#else /* Dummy code for user space testing */ - -static inline void kernel_fpu_begin(void) -{ -} - -static inline void kernel_fpu_end(void) -{ -} - -#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions - * (fast save and restore) */ -#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ -#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ -#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ - -/* Should work well enough on modern CPUs for testing */ -static inline int boot_cpu_has(int flag) -{ - u32 eax = (flag >> 5) ? 0x80000001 : 1; - u32 edx; - - asm volatile("cpuid" - : "+a" (eax), "=d" (edx) - : : "ecx", "ebx"); - - return (edx >> (flag & 31)) & 1; -} - -#endif /* ndef __KERNEL__ */ - -#endif -#endif diff --git a/drivers/md/unroll.awk b/drivers/md/unroll.awk deleted file mode 100644 index c6aa03631df8..000000000000 --- a/drivers/md/unroll.awk +++ /dev/null @@ -1,20 +0,0 @@ - -# This filter requires one command line option of form -vN=n -# where n must be a decimal number. -# -# Repeat each input line containing $$ n times, replacing $$ with 0...n-1. -# Replace each $# with n, and each $* with a single $. - -BEGIN { - n = N + 0 -} -{ - if (/\$\$/) { rep = n } else { rep = 1 } - for (i = 0; i < rep; ++i) { - tmp = $0 - gsub(/\$\$/, i, tmp) - gsub(/\$\#/, n, tmp) - gsub(/\$\*/, "$", tmp) - print tmp - } -} |
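
For readers following the raid6recov.c removal above: the two-failure paths solve a small linear system over GF(2^8) (generator g = 2, reduction polynomial 0x11d), which the kernel code evaluates through the precomputed raid6_gfmul/raid6_gfexp/raid6_gfinv/raid6_gfexi tables. The userspace sketch below reproduces the same algebra for a single byte position with open-coded helpers; the helper names, the toy four-disk stripe, and the chosen failed slots are illustrative only, not kernel API.

	/*
	 * Minimal sketch of the raid6_2data_recov() algebra, assuming a
	 * 4-data-disk stripe and one byte per disk.  Build with any C99
	 * compiler; gf_mul/gf_exp/gf_inv are made-up names for this demo.
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Multiply in GF(2^8) with the RAID-6 polynomial x^8+x^4+x^3+x^2+1. */
	static uint8_t gf_mul(uint8_t a, uint8_t b)
	{
		uint8_t r = 0;

		while (b) {
			if (b & 1)
				r ^= a;
			a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
			b >>= 1;
		}
		return r;
	}

	/* g^n for the generator g = 2. */
	static uint8_t gf_exp(int n)
	{
		uint8_t r = 1;

		while (n--)
			r = gf_mul(r, 2);
		return r;
	}

	/* a^-1 = a^254 in GF(2^8), valid for a != 0. */
	static uint8_t gf_inv(uint8_t a)
	{
		uint8_t r = 1;
		int i;

		for (i = 0; i < 254; i++)
			r = gf_mul(r, a);
		return r;
	}

	int main(void)
	{
		uint8_t d[4] = { 0x12, 0x34, 0x56, 0x78 };	/* toy data disks */
		uint8_t p = 0, q = 0, p0 = 0, q0 = 0;
		int i, a = 1, b = 3;				/* the two "failed" disks */

		for (i = 0; i < 4; i++) {
			p ^= d[i];
			q ^= gf_mul(gf_exp(i), d[i]);
		}

		/* Syndromes recomputed with D_a = D_b = 0, which is what
		 * gen_syndrome() produces once the dead pages have been
		 * swapped for raid6_empty_zero_page. */
		for (i = 0; i < 4; i++) {
			uint8_t v = (i == a || i == b) ? 0 : d[i];

			p0 ^= v;
			q0 ^= gf_mul(gf_exp(i), v);
		}

		/* Solve  Da ^ Db         = P ^ P0
		 *        g^a*Da ^ g^b*Db = Q ^ Q0   for Da and Db. */
		uint8_t pd = p ^ p0, qd = q ^ q0;
		uint8_t denom = gf_inv(gf_exp(a) ^ gf_exp(b));
		uint8_t db = gf_mul(denom, gf_mul(gf_exp(a), pd) ^ qd);
		uint8_t da = db ^ pd;

		assert(da == d[a] && db == d[b]);
		printf("recovered D%d=0x%02x D%d=0x%02x\n", a, da, b, db);
		return 0;
	}

The kernel's pbmul/qmul tables fold the two multiplications above into single lookups, but the underlying equations are the same.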
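
The pcmpgtb/paddb/pand/pxor sequence repeated throughout the deleted raid6sse1.c and raid6sse2.c (and, in equivalent shift-and-mask form, in the AltiVec and unrolled-integer variants generated from the .uc templates) is a branch-free "multiply every byte by 2 in GF(2^8)" step: the signed compare against a zeroed register builds a 0xff mask in the lanes whose top bit is set, paddb doubles each byte without cross-byte carries, and the masked xor folds the reduction constant 0x1d back in. gen_syndrome() applies it once per data disk, Horner-style, so Q ends up as the xor of g^z * D_z over all data disks. A scalar sketch of that one step on eight packed bytes follows; the function name is illustrative, not taken from the kernel sources.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Bytewise GF(2^8) multiply-by-2 on eight packed bytes: the scalar
	 * counterpart of the pcmpgtb/paddb/pand/pxor step in the SSE code. */
	static uint64_t gf2_x2_bytes(uint64_t v)
	{
		uint64_t mask = v & 0x8080808080808080ULL;   /* top bit of each byte   */

		mask = (mask << 1) - (mask >> 7);            /* 0x80 -> 0xff, 0 -> 0   */
		v    = (v << 1) & 0xfefefefefefefefeULL;     /* per-byte shift left    */
		return v ^ (mask & 0x1d1d1d1d1d1d1d1dULL);   /* fold x^8 back as 0x1d  */
	}

	int main(void)
	{
		/* 0x80 * 2 = 0x1d and 0x53 * 2 = 0xa6 in GF(2^8). */
		uint64_t in  = 0x0000000000005380ULL;
		uint64_t out = gf2_x2_bytes(in);

		assert(out == 0x000000000000a61dULL);
		printf("%016llx -> %016llx\n",
		       (unsigned long long)in, (unsigned long long)out);
		return 0;
	}

The mask trick relies only on per-byte two's-complement arithmetic, which is why the same idea ports cleanly from MMX/SSE registers to plain machine words in the raid6int*.c variants.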
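
Finally, the deleted unroll.awk (driven by the Kbuild rules removed earlier and by the userspace raid6test/Makefile above) stamps the $$-parameterized .uc templates out into fixed-unroll C files, and the userspace harness was built with a plain make in drivers/md/raid6test and run as ./raid6test. The fragment below is a hypothetical template line, not a quote from the real raid6int.uc, shown only to illustrate the substitution rules the script itself documents ($$ -> 0..N-1 with the line repeated, $# -> N, $* -> a literal $):

	/* hypothetical line in a .uc template: */
	wq$$ = wp$$ = dptr[z0][d + $$*NSIZE];

	/* after expanding with N=2, e.g. awk -f unroll.awk -vN=2 < template.uc: */
	wq0 = wp0 = dptr[z0][d + 0*NSIZE];
	wq1 = wp1 = dptr[z0][d + 1*NSIZE];

Lines without $$ pass through once and unchanged, which is how the surrounding loop structure survives the unrolling.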