diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-03-19 14:41:01 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-03-19 14:41:01 +1100 |
commit | 0559139627a4e8a81d85f8260f6f40fc92307ced (patch) | |
tree | 6ef8ae0a95916c272ab48d911ea2ed640dca432f | |
parent | 8f5e3ef1bc28929926f5e786e14d8667c9c6955d (diff) | |
parent | c487d7f239b9818f245715762f248e1458db92c9 (diff) |
Merge branch 'akpm/master'
326 files changed, 5487 insertions, 3249 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 8b8c28b9864c..addb1f110e9a 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -70,6 +70,7 @@ Brief summary of control files. memory.move_charge_at_immigrate # set/show controls of moving charges memory.oom_control # set/show oom controls. memory.numa_stat # show the number of memory usage per numa node + memory.dangling_memcgs # show debugging information about dangling groups memory.kmem.limit_in_bytes # set/show hard limit for kernel memory memory.kmem.usage_in_bytes # show current kernel memory allocation @@ -577,6 +578,21 @@ unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... And we have total = file + anon + unevictable. +5.7 dangling_memcgs + +This file will only ever be present in the root cgroup, if the option +CONFIG_MEMCG_DEBUG_ASYNC_DESTROY is set. When a memcg is destroyed, the memory +consumed by it may not be immediately freed. This is because when some +extensions are used, such as swap or kernel memory, objects can outlive the +group and hold a reference to it. + +If this is the case, the dangling_memcgs file will show information about +which memcgs are still alive, and which references are still preventing them +from being freed. There is nothing wrong with that, but it is very useful when debugging, +to know where this memory is being held. This is a developer-oriented debugging +facility only, and no guarantees of interface stability will be given. The file +is read-only, and has the sole purpose of displaying information. + 6. Hierarchy support The memory controller supports a deep hierarchy and hierarchical accounting. diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 078701fdbd4d..21ad1815be8c 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -138,18 +138,39 @@ Setting this to zero disables periodic writeback altogether. 
drop_caches -Writing to this will cause the kernel to drop clean caches, dentries and -inodes from memory, causing that memory to become free. +Writing to this will cause the kernel to drop clean caches, as well as +reclaimable slab objects like dentries and inodes. Once dropped, their +memory becomes free. To free pagecache: echo 1 > /proc/sys/vm/drop_caches -To free dentries and inodes: +To free reclaimable slab objects (includes dentries and inodes): echo 2 > /proc/sys/vm/drop_caches -To free pagecache, dentries and inodes: +To free slab objects and pagecache: echo 3 > /proc/sys/vm/drop_caches -As this is a non-destructive operation and dirty objects are not freeable, the -user should run `sync' first. +This is a non-destructive operation and will not free any dirty objects. +To increase the number of objects freed by this operation, the user may run +`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the +number of dirty objects on the system and create more candidates to be +dropped. + +This file is not a means to control the growth of the various kernel caches +(inodes, dentries, pagecache, etc.). These objects are automatically +reclaimed by the kernel when memory is needed elsewhere on the system. + +Use of this file can cause performance problems. Since it discards cached +objects, it may cost a significant amount of I/O and CPU to recreate the +dropped objects, especially if they were under heavy use. Because of this, +use outside of a testing or debugging environment is not recommended. + +You may see informational messages in your kernel log when this file is +used: + + cat (1234): dropped kernel caches: 3 + +These are informational only. They do not mean that anything is wrong +with your system. 
============================================================== diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 6cf160086021..38e6278aa049 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -38,6 +38,7 @@ config ARM select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)) select HAVE_IDE if PCI || ISA || PCMCIA select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_KERNEL_XZ diff --git a/arch/arm/boot/compressed/.gitignore b/arch/arm/boot/compressed/.gitignore index f79a08efe000..47279aa96a6a 100644 --- a/arch/arm/boot/compressed/.gitignore +++ b/arch/arm/boot/compressed/.gitignore @@ -6,6 +6,7 @@ piggy.gzip piggy.lzo piggy.lzma piggy.xzkern +piggy.lz4 vmlinux vmlinux.lds diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index afed28e37ea5..37234ce5a261 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -24,6 +24,9 @@ endif AFLAGS_head.o += -DTEXT_OFFSET=$(TEXT_OFFSET) HEAD = head.o OBJS += misc.o decompress.o +ifeq ($(CONFIG_KERNEL_LZ4),y) +CFLAGS_decompress.o := -Os +endif FONTC = $(srctree)/drivers/video/console/font_acorn_8x8.c # string library code (-Os is enforced to keep it much smaller) @@ -88,6 +91,7 @@ suffix_$(CONFIG_KERNEL_GZIP) = gzip suffix_$(CONFIG_KERNEL_LZO) = lzo suffix_$(CONFIG_KERNEL_LZMA) = lzma suffix_$(CONFIG_KERNEL_XZ) = xzkern +suffix_$(CONFIG_KERNEL_LZ4) = lz4 # Borrowed libfdt files for the ATAG compatibility mode @@ -112,7 +116,7 @@ targets := vmlinux vmlinux.lds \ font.o font.c head.o misc.o $(OBJS) # Make sure files are removed during clean -extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern \ +extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern piggy.lz4 \ lib1funcs.S ashldi3.S $(libfdt) $(libfdt_hdrs) ifeq ($(CONFIG_FUNCTION_TRACER),y) diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c index 24b0475cb8bf..bd245d34952d 100644 --- 
a/arch/arm/boot/compressed/decompress.c +++ b/arch/arm/boot/compressed/decompress.c @@ -51,6 +51,10 @@ extern char * strstr(const char * s1, const char *s2); #include "../../../../lib/decompress_unxz.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x)) { return decompress(input, len, NULL, NULL, output, NULL, error); diff --git a/arch/arm/boot/compressed/piggy.lz4.S b/arch/arm/boot/compressed/piggy.lz4.S new file mode 100644 index 000000000000..3d9a575618a3 --- /dev/null +++ b/arch/arm/boot/compressed/piggy.lz4.S @@ -0,0 +1,6 @@ + .section .piggydata,#alloc + .globl input_data +input_data: + .incbin "arch/arm/boot/compressed/piggy.lz4" + .globl input_data_end +input_data_end: diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index ad722f1208a5..ad9a9f3f0322 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -99,6 +99,9 @@ void show_mem(unsigned int filter) printk("Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_bank (i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 10062ceadd1c..0c6356255fe3 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 7c7be7855638..8ed6cb1a900f 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -90,11 +90,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; 
mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 80dab509dfb0..67c59ebec899 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -47,6 +47,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); printk(KERN_INFO "Node memory in pages:\n"); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; for_each_online_pgdat(pgdat) { unsigned long present; unsigned long flags; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c2e955ee79a8..a57436e5d405 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -623,6 +623,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; printk(KERN_INFO "Node memory in pages:\n"); for_each_online_pgdat(pgdat) { unsigned long present; diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 7e5fe2790d8a..f1baadd56e82 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -158,11 +158,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 0339181bf3ac..433e75a2ee9a 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -1,5 +1,6 @@ config PARISC def_bool y + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select HAVE_IDE select HAVE_OPROFILE select HAVE_FUNCTION_TRACER if 64BIT diff --git 
a/arch/parisc/Kconfig.debug b/arch/parisc/Kconfig.debug index 7305ac8f7f5b..bc989e522a04 100644 --- a/arch/parisc/Kconfig.debug +++ b/arch/parisc/Kconfig.debug @@ -12,18 +12,4 @@ config DEBUG_RODATA portion of the kernel code won't be covered by a TLB anymore. If in doubt, say "N". -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict copy size checks" - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. - endmenu diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3ac462de53a4..cf2da13c41e6 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -697,6 +697,8 @@ void show_mem(unsigned int filter) printk(KERN_INFO "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; #ifndef CONFIG_DISCONTIGMEM i = max_mapnr; while (i-- > 0) { diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index cd915d6b093d..88693cef4f3d 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -99,8 +99,7 @@ extern unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, unsigned long flags, unsigned int psize, - int topdown, - int use_cache); + int topdown); extern unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1a6de0a7d8eb..5dc52d803ed8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -742,7 +742,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *hstate = 
hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); - return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); + return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); } #endif diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c index 67a42ed0d2fc..cb8bdbe4972f 100644 --- a/arch/powerpc/mm/mmap_64.c +++ b/arch/powerpc/mm/mmap_64.c @@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index cf9dada734b6..3e99c149271a 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -237,134 +237,112 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz #endif } +/* + * Compute which slice addr is part of; + * set *boundary_addr to the start or end boundary of that slice + * (depending on 'end' parameter); + * return boolean indicating if the slice is marked as available in the + * 'available' slice_mark. + */ +static bool slice_scan_available(unsigned long addr, + struct slice_mask available, + int end, + unsigned long *boundary_addr) +{ + unsigned long slice; + if (addr < SLICE_LOW_TOP) { + slice = GET_LOW_SLICE_INDEX(addr); + *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; + return !!(available.low_slices & (1u << slice)); + } else { + slice = GET_HIGH_SLICE_INDEX(addr); + *boundary_addr = (slice + end) ? 
+ ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; + return !!(available.high_slices & (1u << slice)); + } +} + static unsigned long slice_find_area_bottomup(struct mm_struct *mm, unsigned long len, struct slice_mask available, - int psize, int use_cache) + int psize) { - struct vm_area_struct *vma; - unsigned long start_addr, addr; - struct slice_mask mask; int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - - if (use_cache) { - if (len <= mm->cached_hole_size) { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } else - start_addr = addr = mm->free_area_cache; - } else - start_addr = addr = TASK_UNMAPPED_BASE; - -full_search: - for (;;) { - addr = _ALIGN_UP(addr, 1ul << pshift); - if ((TASK_SIZE - len) < addr) - break; - vma = find_vma(mm, addr); - BUG_ON(vma && (addr >= vma->vm_end)); - - mask = slice_range_to_mask(addr, len); - if (!slice_check_fit(mask, available)) { - if (addr < SLICE_LOW_TOP) - addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT); - else - addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT); + unsigned long addr, found, next_end; + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; + + addr = TASK_UNMAPPED_BASE; + while (addr < TASK_SIZE) { + info.low_limit = addr; + if (!slice_scan_available(addr, available, 1, &addr)) continue; + + next_slice: + /* + * At this point [info.low_limit; addr) covers + * available slices only and ends at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the next available slice. 
+ */ + if (addr >= TASK_SIZE) + addr = TASK_SIZE; + else if (slice_scan_available(addr, available, 1, &next_end)) { + addr = next_end; + goto next_slice; } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - if (use_cache) - mm->free_area_cache = addr + len; - return addr; - } - if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - } + info.high_limit = addr; - /* Make sure we didn't miss any holes */ - if (use_cache && start_addr != TASK_UNMAPPED_BASE) { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; } + return -ENOMEM; } static unsigned long slice_find_area_topdown(struct mm_struct *mm, unsigned long len, struct slice_mask available, - int psize, int use_cache) + int psize) { - struct vm_area_struct *vma; - unsigned long addr; - struct slice_mask mask; int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long addr, found, prev; + struct vm_unmapped_area_info info; - /* check if free_area_cache is useful for us */ - if (use_cache) { - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - - /* either no address requested or can't fit in requested - * address hole - */ - addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (addr > len) { - addr = _ALIGN_DOWN(addr - len, 1ul << pshift); - mask = slice_range_to_mask(addr, len); - if (slice_check_fit(mask, available) && - slice_area_is_free(mm, addr, len)) - /* remember the address as a hint for - * next time - */ - return (mm->free_area_cache = addr); - } - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; addr = mm->mmap_base; - while (addr > len) { - 
/* Go down by chunk size */ - addr = _ALIGN_DOWN(addr - len, 1ul << pshift); - - /* Check for hit with different page size */ - mask = slice_range_to_mask(addr, len); - if (!slice_check_fit(mask, available)) { - if (addr < SLICE_LOW_TOP) - addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT); - else if (addr < (1ul << SLICE_HIGH_SHIFT)) - addr = SLICE_LOW_TOP; - else - addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT); + while (addr > PAGE_SIZE) { + info.high_limit = addr; + if (!slice_scan_available(addr - 1, available, 0, &addr)) continue; - } + prev_slice: /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: + * At this point [addr; info.high_limit) covers + * available slices only and starts at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the previous available slice. */ - vma = find_vma(mm, addr); - if (!vma || (addr + len) <= vma->vm_start) { - /* remember the address as a hint for next time */ - if (use_cache) - mm->free_area_cache = addr; - return addr; + if (addr < PAGE_SIZE) + addr = PAGE_SIZE; + else if (slice_scan_available(addr - 1, available, 0, &prev)) { + addr = prev; + goto prev_slice; } + info.low_limit = addr; - /* remember the largest hole we saw so far */ - if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start; + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; } /* @@ -373,28 +351,18 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * can happen with large stack limits and large mmap() * allocations. 
*/ - addr = slice_find_area_bottomup(mm, len, available, psize, 0); - - /* - * Restore the topdown base: - */ - if (use_cache) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; - } - - return addr; + return slice_find_area_bottomup(mm, len, available, psize); } static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, struct slice_mask mask, int psize, - int topdown, int use_cache) + int topdown) { if (topdown) - return slice_find_area_topdown(mm, len, mask, psize, use_cache); + return slice_find_area_topdown(mm, len, mask, psize); else - return slice_find_area_bottomup(mm, len, mask, psize, use_cache); + return slice_find_area_bottomup(mm, len, mask, psize); } #define or_mask(dst, src) do { \ @@ -415,7 +383,7 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, unsigned long flags, unsigned int psize, - int topdown, int use_cache) + int topdown) { struct slice_mask mask = {0, 0}; struct slice_mask good_mask; @@ -430,8 +398,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, BUG_ON(mm->task_size == 0); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); - slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n", - addr, len, flags, topdown, use_cache); + slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", + addr, len, flags, topdown); if (len > mm->task_size) return -ENOMEM; @@ -503,8 +471,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Now let's see if we can find something in the existing * slices for that size */ - newaddr = slice_find_area(mm, len, good_mask, psize, topdown, - use_cache); + newaddr = slice_find_area(mm, len, good_mask, psize, topdown); if (newaddr != -ENOMEM) { /* Found within the good mask, we don't have to setup, * we thus return directly @@ -536,8 +503,7 @@ unsigned long 
slice_get_unmapped_area(unsigned long addr, unsigned long len, * anywhere in the good area. */ if (addr) { - addr = slice_find_area(mm, len, good_mask, psize, topdown, - use_cache); + addr = slice_find_area(mm, len, good_mask, psize, topdown); if (addr != -ENOMEM) { slice_dbg(" found area at 0x%lx\n", addr); return addr; @@ -547,15 +513,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Now let's see if we can find something in the existing slices * for that size plus free slices */ - addr = slice_find_area(mm, len, potential_mask, psize, topdown, - use_cache); + addr = slice_find_area(mm, len, potential_mask, psize, topdown); #ifdef CONFIG_PPC_64K_PAGES if (addr == -ENOMEM && psize == MMU_PAGE_64K) { /* retry the search with 4k-page slices included */ or_mask(potential_mask, compat_mask); addr = slice_find_area(mm, len, potential_mask, psize, - topdown, use_cache); + topdown); } #endif @@ -586,8 +551,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags) { return slice_get_unmapped_area(addr, len, flags, - current->mm->context.user_psize, - 0, 1); + current->mm->context.user_psize, 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -597,8 +561,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags) { return slice_get_unmapped_area(addr0, len, flags, - current->mm->context.user_psize, - 1, 1); + current->mm->context.user_psize, 1); } unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 68c57d38745a..0026a37e21fd 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -352,7 +352,7 @@ static unsigned long spufs_get_unmapped_area(struct file *file, /* Else, try to obtain a 64K pages slice */ return slice_get_unmapped_area(addr, len, flags, - MMU_PAGE_64K, 1, 0); + MMU_PAGE_64K, 
1); } #endif /* CONFIG_SPU_FS_64K_LS */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 830ec55d246f..6a94c397ad08 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -91,6 +91,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index fc32a2df4974..c56878e1245f 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -17,20 +17,6 @@ config STRICT_DEVMEM If you are unsure, say Y. -config DEBUG_STRICT_USER_COPY_CHECKS - def_bool n - prompt "Strict user copy size checks" - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time warnings. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. 
- config S390_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 5f7d7ba2874c..7a539f4f5e30 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/mount.h> +#include <linux/aio.h> #include <asm/ebcdic.h> #include "hypfs.h" diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 6ab0d0b5cec8..20b0e97a7df2 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -3,7 +3,6 @@ # lib-y += delay.o string.o uaccess_std.o uaccess_pt.o -obj-y += usercopy.o obj-$(CONFIG_32BIT) += div64.o qrnnd.o ucmpdi2.o mem32.o obj-$(CONFIG_64BIT) += mem64.o lib-$(CONFIG_64BIT) += uaccess_mvcos.o diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 06bafec00278..40023290ee5b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -91,11 +91,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } @@ -176,11 +174,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = s390_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = s390_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 2daaaa6eda23..51561b8b15ba 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -290,7 +290,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) sysctl_legacy_va_layout) { mm->mmap_base = TASK_UNMAPPED_BASE 
+ random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { /* We know it's 32-bit */ unsigned long task_size = STACK_TOP32; @@ -302,7 +301,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 8410065f2862..dbe119b63b48 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -45,4 +45,3 @@ obj-y += iomap.o obj-$(CONFIG_SPARC32) += atomic32.o ucmpdi2.o obj-y += ksyms.o obj-$(CONFIG_SPARC64) += PeeCeeI.o -obj-y += usercopy.o diff --git a/arch/sparc/lib/usercopy.c b/arch/sparc/lib/usercopy.c deleted file mode 100644 index 5c4284ce1c03..000000000000 --- a/arch/sparc/lib/usercopy.c +++ /dev/null @@ -1,9 +0,0 @@ -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/bug.h> - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 72c13cad3307..812a8005f72f 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -18,6 +18,7 @@ config TILE select HAVE_DEBUG_BUGVERBOSE select VIRT_TO_BUS select SYS_HYPERVISOR + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA @@ -116,13 +117,6 @@ config STRICT_DEVMEM config SMP def_bool y -# Allow checking for compile-time determined overflow errors in -# copy_from_user(). There are still unprovable places in the -# generic code as of 2.6.34, so this option is not really compatible -# with -Werror, which is more useful in general. 
-config DEBUG_COPY_FROM_USER - def_bool n - config HVC_TILE depends on TTY select HVC_DRIVER diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index 9ab078a4605d..8a082bc6bca5 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -395,7 +395,12 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -#ifdef CONFIG_DEBUG_COPY_FROM_USER +#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS +/* + * There are still unprovable places in the generic code as of 2.6.34, so this + * option is not really compatible with -Werror, which is more useful in + * general. + */ extern void copy_from_user_overflow(void) __compiletime_warning("copy_from_user() size is not provably correct"); diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c index f8d398c9ee7f..030abe3ee4f1 100644 --- a/arch/tile/lib/uaccess.c +++ b/arch/tile/lib/uaccess.c @@ -22,11 +22,3 @@ int __range_ok(unsigned long addr, unsigned long size) is_arch_mappable_range(addr, size)); } EXPORT_SYMBOL(__range_ok); - -#ifdef CONFIG_DEBUG_COPY_FROM_USER -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); -#endif diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c index f96f4cec602a..d67d91ebf63e 100644 --- a/arch/tile/mm/mmap.c +++ b/arch/tile/mm/mmap.c @@ -66,10 +66,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(mm); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index de186bde8975..644482882bae 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -66,6 +66,9 @@ void show_mem(unsigned int filter) 
printk(KERN_DEFAULT "Mem-info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_bank(i, mi) { struct membank *bank = &mi->bank[i]; unsigned int pfn1, pfn2; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 24f12c961339..75267ecba812 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -20,6 +20,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select ARCH_SUPPORTS_NUMA_BALANCING @@ -64,6 +65,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO + select HAVE_KERNEL_LZ4 select HAVE_HW_BREAKPOINT select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index b322f124ee3c..dea0da520e13 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -292,20 +292,6 @@ config OPTIMIZE_INLINING If unsure, say N. -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict copy size checks" - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - ---help--- - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, or if you run an older (pre 4.4) gcc, say N. 
- config DEBUG_NMI_SELFTEST bool "NMI Selftest" depends on DEBUG_KERNEL && X86_LOCAL_APIC diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8a84501acb1b..c275db562aa9 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,7 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -64,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE $(call if_changed,xzkern) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzo) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lz4) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_LZ4) := lz4 quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7cb56c6ca351..0319c88290a5 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -145,6 +145,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzo.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + static void scroll(void) { int i; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 03abf9b70011..0f9a4728a467 100644 --- a/arch/x86/ia32/ia32_aout.c +++ 
b/arch/x86/ia32/ia32_aout.c @@ -162,7 +162,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; - current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(current->comm)); dump.u_ar0 = offsetof(struct user32, regs); dump.signal = signr; @@ -309,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index cccd07fa5e3a..b8e9224f0b45 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -17,6 +17,8 @@ extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void e820_add_region(u64 start, u64 size, int type); +extern void e820_add_limit_region(u64 start, u64 size, int type); +extern void e820_adjust_region(u64 *start, u64 *size); extern void e820_print_map(char *who); extern int sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5f81bcefbe14..895f62e36ebb 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -44,10 +44,10 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o quiet_cmd_mkcapflags = MKCAP $@ - cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ cpufeature = $(src)/../../include/asm/cpufeature.h targets += capflags.c -$(obj)/capflags.c: $(cpufeature) 
$(src)/mkcapflags.pl FORCE +$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl deleted file mode 100644 index 091972ef49de..000000000000 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl -w -# -# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h -# - -($in, $out) = @ARGV; - -open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; -open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; - -print OUT "#ifndef _ASM_X86_CPUFEATURE_H\n"; -print OUT "#include <asm/cpufeature.h>\n"; -print OUT "#endif\n"; -print OUT "\n"; -print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; - -%features = (); -$err = 0; - -while (defined($line = <IN>)) { - if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { - $macro = $1; - $feature = "\L$2"; - $tail = $3; - if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { - $feature = "\L$1"; - } - - next if ($feature eq ''); - - if ($features{$feature}++) { - print STDERR "$in: duplicate feature name: $feature\n"; - $err++; - } - printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature; - } -} -print OUT "};\n"; - -close(IN); -close(OUT); - -if ($err) { - unlink($out); - exit(1); -} - -exit(0); diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh new file mode 100644 index 000000000000..2bf616505499 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -0,0 +1,41 @@ +#!/bin/sh +# +# Generate the x86_cap_flags[] array from include/asm/cpufeature.h +# + +IN=$1 +OUT=$2 + +TABS="$(printf '\t\t\t\t\t')" +trap 'rm "$OUT"' EXIT + +( + echo "#ifndef _ASM_X86_CPUFEATURE_H" + echo "#include <asm/cpufeature.h>" + echo "#endif" + echo "" + echo "const char * const x86_cap_flags[NCAPINTS*32] = {" + + # Iterate through any input lines starting with #define X86_FEATURE_ + sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN | + 
while read i + do + # Name is everything up to the first whitespace + NAME="$(echo "$i" | sed 's/ .*//')" + + # If the /* comment */ starts with a quote string, grab that. + VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" + [ -z "$VALUE" ] && VALUE="\"$NAME\"" + [ "$VALUE" == '""' ] && continue + + # Name is uppercase, VALUE is all lowercase + VALUE="$(echo "$VALUE" | tr A-Z a-z)" + + TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s]%.*s = %s,\n" \ + "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE" + done + echo "};" +) > $OUT + +trap - EXIT diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d32abeabbda5..0d5bb689649a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe; #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif +static u64 mem_limit = ~0ULL; /* * This function checks if any part of the range <start,end> is mapped @@ -108,7 +109,7 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) * Add a memory region to the kernel e820 map. 
*/ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, - int type) + int type, bool limited) { int x = e820x->nr_map; @@ -119,6 +120,22 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, return; } + if (limited) { + if (start >= mem_limit) { + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long)start, + (unsigned long long)(start + size - 1)); + return; + } + + if (mem_limit - start < size) { + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long)mem_limit, + (unsigned long long)(start + size - 1)); + size = mem_limit - start; + } + } + e820x->map[x].addr = start; e820x->map[x].size = size; e820x->map[x].type = type; @@ -127,7 +144,37 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(&e820, start, size, type, false); +} + +/* + * do_add_efi_memmap() calls this function(). + * + * Note: BOOT_SERVICES_{CODE,DATA} regions on some efi machines are marked + * as E820_RAM, and they are needed to be mapped. Please use e820_add_region() + * to add BOOT_SERVICES_{CODE,DATA} regions. + */ +void __init e820_add_limit_region(u64 start, u64 size, int type) +{ + /* + * efi_init() is called after finish_e820_parsing(), so we should + * check whether [start, start + size) contains address above + * mem_limit if the type is E820_RAM. + */ + __e820_add_region(&e820, start, size, type, type == E820_RAM); +} + +void __init e820_adjust_region(u64 *start, u64 *size) +{ + if (*start >= mem_limit) { + *size = 0; + return; + } + + if (mem_limit - *start < *size) + *size = mem_limit - *start; + + return; } static void __init e820_print_type(u32 type) @@ -455,8 +502,9 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, /* new range is totally covered? 
*/ if (ei->addr < start && ei_end > end) { - __e820_add_region(e820x, start, size, new_type); - __e820_add_region(e820x, end, ei_end - end, ei->type); + __e820_add_region(e820x, start, size, new_type, false); + __e820_add_region(e820x, end, ei_end - end, ei->type, + false); ei->size = start - ei->addr; real_updated_size += size; continue; @@ -469,7 +517,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, continue; __e820_add_region(e820x, final_start, final_end - final_start, - new_type); + new_type, false); real_updated_size += final_end - final_start; @@ -809,7 +857,7 @@ static int userdef __initdata; /* "mem=nopentium" disables the 4MB page tables. */ static int __init parse_memopt(char *p) { - u64 mem_size; + char *oldp; if (!p) return -EINVAL; @@ -825,11 +873,11 @@ static int __init parse_memopt(char *p) } userdef = 1; - mem_size = memparse(p, &p); + oldp = p; + mem_limit = memparse(p, &p); /* don't remove all of memory when handling "mem={invalid}" param */ - if (mem_size == 0) + if (mem_limit == 0 || p == oldp) return -EINVAL; - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; } @@ -895,6 +943,12 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { + if (mem_limit != ~0ULL) { + userdef = 1; + e820_remove_range(mem_limit, ULLONG_MAX - mem_limit, + E820_RAM, 1); + } + if (userdef) { u32 nr = e820.nr_map; diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index f0312d746402..3eb18acd0e40 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -689,9 +689,3 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } EXPORT_SYMBOL(_copy_from_user); - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 845df6835f9f..62c29a5bfe26 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ 
-115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index b0086567271c..934610802e3f 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -130,13 +130,12 @@ static int pageattr_test(void) } failed += print_split(&sa); - srandom32(100); for (i = 0; i < NTEST; i++) { - unsigned long pfn = random32() % max_pfn_mapped; + unsigned long pfn = prandom_u32() % max_pfn_mapped; addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); - len[i] = random32() % 100; + len[i] = prandom_u32() % 100; len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); if (len[i] == 0) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 5f2ecaf3f9d8..4f65e1d05119 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -316,10 +316,17 @@ static void __init do_add_efi_memmap(void) int e820_type; switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: + /* EFI_BOOT_SERVICES_{CODE,DATA} needs to be mapped */ + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + e820_add_region(start, size, e820_type); + continue; + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: case EFI_CONVENTIONAL_MEMORY: if (md->attribute & EFI_MEMORY_WB) e820_type = E820_RAM; @@ -344,7 +351,7 @@ static void __init do_add_efi_memmap(void) e820_type = E820_RESERVED; break; } - e820_add_region(start, size, e820_type); + e820_add_limit_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } @@ -451,6 +458,8 @@ void __init 
efi_free_boot_services(void) md->type != EFI_BOOT_SERVICES_DATA) continue; + e820_adjust_region(&start, &size); + /* Could not reserve boot area */ if (!size) continue; diff --git a/block/blk-core.c b/block/blk-core.c index 074b758efc42..186603b36c44 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -151,7 +151,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, int error) + unsigned int nbytes, int error, + struct batch_complete *batch) { if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); @@ -175,7 +176,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); + bio_endio_batch(bio, error, batch); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -2250,7 +2251,8 @@ EXPORT_SYMBOL(blk_fetch_request); * %false - this request doesn't have any more data * %true - this request has more data **/ -bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) +bool blk_update_request(struct request *req, int error, unsigned int nr_bytes, + struct batch_complete *batch) { int total_bytes, bio_nbytes, next_idx = 0; struct bio *bio; @@ -2306,7 +2308,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (nr_bytes >= bio->bi_size) { req->bio = bio->bi_next; nbytes = bio->bi_size; - req_bio_endio(req, bio, nbytes, error); + req_bio_endio(req, bio, nbytes, error, batch); next_idx = 0; bio_nbytes = 0; } else { @@ -2368,7 +2370,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * if the request wasn't completed, update state */ if (bio_nbytes) { - req_bio_endio(req, bio, bio_nbytes, error); + req_bio_endio(req, bio, bio_nbytes, error, batch); bio->bi_idx += next_idx; bio_iovec(bio)->bv_offset += 
nr_bytes; bio_iovec(bio)->bv_len -= nr_bytes; @@ -2405,14 +2407,15 @@ EXPORT_SYMBOL_GPL(blk_update_request); static bool blk_update_bidi_request(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes) + unsigned int bidi_bytes, + struct batch_complete *batch) { - if (blk_update_request(rq, error, nr_bytes)) + if (blk_update_request(rq, error, nr_bytes, batch)) return true; /* Bidi request must be completed as a whole */ if (unlikely(blk_bidi_rq(rq)) && - blk_update_request(rq->next_rq, error, bidi_bytes)) + blk_update_request(rq->next_rq, error, bidi_bytes, batch)) return true; if (blk_queue_add_random(rq->q)) @@ -2495,7 +2498,7 @@ static bool blk_end_bidi_request(struct request *rq, int error, struct request_queue *q = rq->q; unsigned long flags; - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, NULL)) return true; spin_lock_irqsave(q->queue_lock, flags); @@ -2521,9 +2524,10 @@ static bool blk_end_bidi_request(struct request *rq, int error, * %true - still buffers pending for this request **/ bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) + unsigned int nr_bytes, unsigned int bidi_bytes, + struct batch_complete *batch) { - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, batch)) return true; blk_finish_request(rq, error); @@ -2624,7 +2628,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_err); **/ bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { - return __blk_end_bidi_request(rq, error, nr_bytes, 0); + return __blk_end_bidi_request(rq, error, nr_bytes, 0, NULL); } EXPORT_SYMBOL(__blk_end_request); @@ -2636,7 +2640,7 @@ EXPORT_SYMBOL(__blk_end_request); * Description: * Completely finish @rq. Must be called with queue lock held. 
*/ -void __blk_end_request_all(struct request *rq, int error) +void blk_end_request_all_batch(struct request *rq, int error, struct batch_complete *batch) { bool pending; unsigned int bidi_bytes = 0; @@ -2644,10 +2648,10 @@ void __blk_end_request_all(struct request *rq, int error) if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); - pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); + pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes, batch); BUG_ON(pending); } -EXPORT_SYMBOL(__blk_end_request_all); +EXPORT_SYMBOL(blk_end_request_all_batch); /** * __blk_end_request_cur - Helper function to finish the current request chunk. diff --git a/block/blk-flush.c b/block/blk-flush.c index db8f1b507857..28785079ee23 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -316,7 +316,7 @@ void blk_insert_flush(struct request *rq) * complete the request. */ if (!policy) { - __blk_end_bidi_request(rq, 0, 0, 0); + __blk_end_bidi_request(rq, 0, 0, 0, NULL); return; } diff --git a/block/blk.h b/block/blk.h index e837b8f619b7..dc8fee6d41d6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -31,7 +31,8 @@ void blk_queue_bypass_end(struct request_queue *q); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes); + unsigned int nr_bytes, unsigned int bidi_bytes, + struct batch_complete *batch); void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); diff --git a/block/genhd.c b/block/genhd.c index 3c001fba80c7..a1ed52af5290 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || 
(!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9a87daa6f4fb..a5ffcc988f0b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -27,6 +27,7 @@ #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/times.h> +#include <linux/uio.h> #include <asm/uaccess.h> #include <scsi/scsi.h> diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c index aa2b0270ed16..4a92bac744dc 100644 --- a/crypto/async_tx/raid6test.c +++ b/crypto/async_tx/raid6test.c @@ -46,15 +46,10 @@ static void callback(void *param) static void makedata(int disks) { - int i, j; + int i; for (i = 0; i < disks; i++) { - for (j = 0; j < PAGE_SIZE/sizeof(u32); j += sizeof(u32)) { - u32 *p = page_address(data[i]) + j; - - *p = random32(); - } - + prandom_bytes(page_address(data[i]), PAGE_SIZE); dataptrs[i] = data[i]; } } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a9eccfc6079b..83c5ae0ed56b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -757,7 +757,8 @@ static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct acc rcu_read_unlock(); timeo = connect_int * HZ; - timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ + /* 28.5% random jitter */ + timeo += (prandom_u32() & 1) ? 
timeo / 7 : -timeo / 7; err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); if (err <= 0) @@ -953,7 +954,7 @@ retry: conn_warn(tconn, "Error receiving initial packet\n"); sock_release(s); randomize: - if (random32() & 1) + if (prandom_u32() & 1) goto retry; } } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 11cc9522cdd4..b84dda58c33c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -146,6 +146,9 @@ static void mtip_command_cleanup(struct driver_data *dd) struct mtip_cmd *command; struct mtip_port *port = dd->port; static int in_progress; + struct batch_complete batch; + + batch_complete_init(&batch); if (in_progress) return; @@ -161,11 +164,9 @@ static void mtip_command_cleanup(struct driver_data *dd) command = &port->commands[commandindex]; if (atomic_read(&command->active) - && (command->async_callback)) { - command->async_callback(command->async_data, - -ENODEV); - command->async_callback = NULL; - command->async_data = NULL; + && (command->bio)) { + bio_endio_batch(command->bio, -ENODEV, &batch); + command->bio = NULL; } dma_unmap_sg(&port->dd->pdev->dev, @@ -173,9 +174,10 @@ static void mtip_command_cleanup(struct driver_data *dd) command->scatter_ents, command->direction); } + up(&port->cmd_slot); } - up(&port->cmd_slot); + batch_complete(&batch); set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); in_progress = 0; @@ -564,6 +566,9 @@ static void mtip_timeout_function(unsigned long int data) unsigned int bit, group; unsigned int num_command_slots; unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; + struct batch_complete batch; + + batch_complete_init(&batch); if (unlikely(!port)) return; @@ -606,11 +611,9 @@ static void mtip_timeout_function(unsigned long int data) writel(1 << bit, port->completed[group]); /* Call the async completion callback. 
*/ - if (likely(command->async_callback)) - command->async_callback(command->async_data, - -EIO); - command->async_callback = NULL; - command->comp_func = NULL; + if (likely(command->bio)) + bio_endio_batch(command->bio, -EIO, &batch); + command->bio = NULL; /* Unmap the DMA scatter list entries */ dma_unmap_sg(&port->dd->pdev->dev, @@ -629,6 +632,8 @@ static void mtip_timeout_function(unsigned long int data) } } + batch_complete(&batch); + if (cmdto_cnt) { print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { @@ -679,7 +684,8 @@ static void mtip_timeout_function(unsigned long int data) static void mtip_async_complete(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { struct mtip_cmd *command; struct driver_data *dd = data; @@ -696,11 +702,10 @@ static void mtip_async_complete(struct mtip_port *port, } /* Upper layer callback */ - if (likely(command->async_callback)) - command->async_callback(command->async_data, cb_status); + if (likely(command->bio)) + bio_endio_batch(command->bio, cb_status, batch); - command->async_callback = NULL; - command->comp_func = NULL; + command->bio = NULL; /* Unmap the DMA scatter list entries */ dma_unmap_sg(&dd->pdev->dev, @@ -733,24 +738,22 @@ static void mtip_async_complete(struct mtip_port *port, static void mtip_completion(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { - struct mtip_cmd *command = &port->commands[tag]; struct completion *waiting = data; if (unlikely(status == PORT_IRQ_TF_ERR)) dev_warn(&port->dd->pdev->dev, "Internal command %d completed with TFE\n", tag); - command->async_callback = NULL; - command->comp_func = NULL; - complete(waiting); } static void mtip_null_completion(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { return; } @@ -779,6 +782,7 @@ static void mtip_handle_tfe(struct 
driver_data *dd) unsigned char *buf; char *fail_reason = NULL; int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0; + struct batch_complete batch; dev_warn(&dd->pdev->dev, "Taskfile error\n"); @@ -796,13 +800,14 @@ static void mtip_handle_tfe(struct driver_data *dd) atomic_inc(&cmd->active); /* active > 1 indicates error */ if (cmd->comp_data && cmd->comp_func) { cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, PORT_IRQ_TF_ERR); + cmd->comp_data, PORT_IRQ_TF_ERR, NULL); } goto handle_tfe_exit; } /* clear the tag accumulator */ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + batch_complete_init(&batch); /* Loop through all the groups */ for (group = 0; group < dd->slot_groups; group++) { @@ -829,7 +834,7 @@ static void mtip_handle_tfe(struct driver_data *dd) cmd->comp_func(port, tag, cmd->comp_data, - 0); + 0, &batch); } else { dev_err(&port->dd->pdev->dev, "Missing completion func for tag %d", @@ -842,6 +847,7 @@ static void mtip_handle_tfe(struct driver_data *dd) } } } + batch_complete(&batch); print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt); @@ -883,6 +889,7 @@ static void mtip_handle_tfe(struct driver_data *dd) /* clear the tag accumulator */ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + batch_complete_init(&batch); /* Loop through all the groups */ for (group = 0; group < dd->slot_groups; group++) { @@ -916,7 +923,7 @@ static void mtip_handle_tfe(struct driver_data *dd) if (cmd->comp_func) { cmd->comp_func(port, tag, cmd->comp_data, - -ENODATA); + -ENODATA, &batch); } continue; } @@ -946,13 +953,15 @@ static void mtip_handle_tfe(struct driver_data *dd) port, tag, cmd->comp_data, - PORT_IRQ_TF_ERR); + PORT_IRQ_TF_ERR, &batch); else dev_warn(&port->dd->pdev->dev, "Bad completion for tag %d\n", tag); } } + + batch_complete(&batch); print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); handle_tfe_exit: @@ -973,6 +982,9 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, struct driver_data *dd = port->dd; int tag, 
bit; struct mtip_cmd *command; + struct batch_complete batch; + + batch_complete_init(&batch); if (!completed) { WARN_ON_ONCE(!completed); @@ -997,7 +1009,8 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, port, tag, command->comp_data, - 0); + 0, + &batch); } else { dev_warn(&dd->pdev->dev, "Null completion " @@ -1007,13 +1020,16 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, if (mtip_check_surprise_removal( dd->pdev)) { mtip_command_cleanup(dd); - return; + goto out; } } } completed >>= 1; } +out: + batch_complete(&batch); + /* If last, re-enable interrupts */ if (atomic_dec_return(&dd->irq_workers_active) == 0) writel(0xffffffff, dd->mmio + HOST_IRQ_STAT); @@ -1034,7 +1050,7 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd->comp_data, - 0); + 0, NULL); return; } } @@ -2554,8 +2570,8 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * None */ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, - int nsect, int nents, int tag, void *callback, - void *data, int dir) + int nsect, int nents, int tag, + struct bio *bio, int dir) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; @@ -2610,12 +2626,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, command->comp_func = mtip_async_complete; command->direction = dma_dir; - /* - * Set the completion function and data for the command passed - * from the upper layer. 
- */ - command->async_data = data; - command->async_callback = callback; + command->bio = bio; /* * To prevent this command from being issued @@ -3795,7 +3806,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) bio_sectors(bio), nents, tag, - bio_endio, bio, bio_data_dir(bio)); } else diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 3bffff5f670c..af8c6f79a8d8 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -325,11 +325,9 @@ struct mtip_cmd { void (*comp_func)(struct mtip_port *port, int tag, void *data, - int status); - /* Additional callback function that may be called by comp_func() */ - void (*async_callback)(void *data, int status); - - void *async_data; /* Addl. data passed to async_callback() */ + int status, + struct batch_complete *batch); + struct bio *bio; int scatter_ents; /* Number of scatter list entries used */ diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 758f2ac878cf..deb722d63d68 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -775,7 +775,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) if (intr & ERROR_INTR) { n = fs->scount - 1 - resid / 512; if (n > 0) { - blk_update_request(req, 0, n << 9); + blk_update_request(req, 0, n << 9, NULL); fs->req_sector += n; } if (fs->retries < 5) { diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 922bcb97e23a..5ece6db9971f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -210,7 +210,8 @@ static void virtblk_bio_send_flush_work(struct work_struct *work) virtblk_bio_send_flush(vbr); } -static inline void virtblk_request_done(struct virtblk_req *vbr) +static inline void virtblk_request_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; struct request *req = vbr->req; @@ -224,11 +225,12 @@ static inline void virtblk_request_done(struct virtblk_req *vbr) 
req->errors = (error != 0); } - __blk_end_request_all(req, error); + blk_end_request_all_batch(req, error, batch); mempool_free(vbr, vblk->pool); } -static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) +static inline void virtblk_bio_flush_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; @@ -237,12 +239,13 @@ static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) INIT_WORK(&vbr->work, virtblk_bio_send_data_work); queue_work(virtblk_wq, &vbr->work); } else { - bio_endio(vbr->bio, virtblk_result(vbr)); + bio_endio_batch(vbr->bio, virtblk_result(vbr), batch); mempool_free(vbr, vblk->pool); } } -static inline void virtblk_bio_data_done(struct virtblk_req *vbr) +static inline void virtblk_bio_data_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; @@ -252,17 +255,18 @@ static inline void virtblk_bio_data_done(struct virtblk_req *vbr) INIT_WORK(&vbr->work, virtblk_bio_send_flush_work); queue_work(virtblk_wq, &vbr->work); } else { - bio_endio(vbr->bio, virtblk_result(vbr)); + bio_endio_batch(vbr->bio, virtblk_result(vbr), batch); mempool_free(vbr, vblk->pool); } } -static inline void virtblk_bio_done(struct virtblk_req *vbr) +static inline void virtblk_bio_done(struct virtblk_req *vbr, + struct batch_complete *batch) { if (unlikely(vbr->flags & VBLK_IS_FLUSH)) - virtblk_bio_flush_done(vbr); + virtblk_bio_flush_done(vbr, batch); else - virtblk_bio_data_done(vbr); + virtblk_bio_data_done(vbr, batch); } static void virtblk_done(struct virtqueue *vq) @@ -272,16 +276,19 @@ static void virtblk_done(struct virtqueue *vq) struct virtblk_req *vbr; unsigned long flags; unsigned int len; + struct batch_complete batch; + + batch_complete_init(&batch); spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); do { virtqueue_disable_cb(vq); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { if (vbr->bio) { - virtblk_bio_done(vbr); + 
virtblk_bio_done(vbr, &batch); bio_done = true; } else { - virtblk_request_done(vbr); + virtblk_request_done(vbr, &batch); req_done = true; } } @@ -291,6 +298,8 @@ static void virtblk_done(struct virtqueue *vq) blk_start_queue(vblk->disk->queue); spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); + batch_complete(&batch); + if (bio_done) wake_up(&vblk->queue_wait); } diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 2c644afbcdd4..1ccbe9482faa 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -28,6 +28,7 @@ #include <linux/pfn.h> #include <linux/export.h> #include <linux/io.h> +#include <linux/aio.h> #include <asm/uaccess.h> @@ -627,6 +628,18 @@ static ssize_t write_null(struct file *file, const char __user *buf, return count; } +static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return 0; +} + +static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return iov_length(iov, nr_segs); +} + static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) { @@ -670,6 +683,24 @@ static ssize_t read_zero(struct file *file, char __user *buf, return written ? written : -EFAULT; } +static ssize_t aio_read_zero(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + size_t written = 0; + unsigned long i; + ssize_t ret; + + for (i = 0; i < nr_segs; i++) { + ret = read_zero(iocb->ki_filp, iov[i].iov_base, iov[i].iov_len, + &pos); + if (ret < 0) + break; + written += ret; + } + + return written ? 
written : -EFAULT; +} + static int mmap_zero(struct file *file, struct vm_area_struct *vma) { #ifndef CONFIG_MMU @@ -738,6 +769,7 @@ static int open_port(struct inode *inode, struct file *filp) #define full_lseek null_lseek #define write_zero write_null #define read_full read_zero +#define aio_write_zero aio_write_null #define open_mem open_port #define open_kmem open_mem #define open_oldmem open_mem @@ -766,6 +798,8 @@ static const struct file_operations null_fops = { .llseek = null_lseek, .read = read_null, .write = write_null, + .aio_read = aio_read_null, + .aio_write = aio_write_null, .splice_write = splice_write_null, }; @@ -782,6 +816,8 @@ static const struct file_operations zero_fops = { .llseek = zero_lseek, .read = read_zero, .write = write_zero, + .aio_read = aio_read_zero, + .aio_write = aio_write_zero, .mmap = mmap_zero, }; diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 4cd392dbf115..3439f59123ed 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -410,22 +410,45 @@ static void __init dmi_dump_ids(void) printk(KERN_CONT "\n"); } -static int __init dmi_present(const char __iomem *p) +static int __init dmi_present(const char *buf) { - u8 buf[15]; + int smbios_ver; - memcpy_fromio(buf, p, 15); - if (dmi_checksum(buf, 15)) { + if (memcmp(buf, "_SM_", 4) == 0 && + buf[5] < 32 && dmi_checksum(buf, buf[5])) { + smbios_ver = (buf[6] << 8) + buf[7]; + + /* Some BIOS report weird SMBIOS version, fix that up */ + switch (smbios_ver) { + case 0x021F: + case 0x0221: + pr_debug("SMBIOS version fixup(2.%d->2.%d)\n", + smbios_ver & 0xFF, 3); + smbios_ver = 0x0203; + break; + case 0x0233: + pr_debug("SMBIOS version fixup(2.%d->2.%d)\n", 51, 6); + smbios_ver = 0x0206; + break; + } + } else { + smbios_ver = 0; + } + + buf += 16; + + if (memcmp(buf, "_DMI_", 5) == 0 && dmi_checksum(buf, 15)) { dmi_num = (buf[13] << 8) | buf[12]; dmi_len = (buf[7] << 8) | buf[6]; dmi_base = (buf[11] << 24) | (buf[10] << 16) | (buf[9] 
<< 8) | buf[8]; if (dmi_walk_early(dmi_decode) == 0) { - if (dmi_ver) + if (smbios_ver) { + dmi_ver = smbios_ver; pr_info("SMBIOS %d.%d present.\n", dmi_ver >> 8, dmi_ver & 0xFF); - else { + } else { dmi_ver = (buf[14] & 0xF0) << 4 | (buf[14] & 0x0F); pr_info("Legacy DMI %d.%d present.\n", @@ -435,40 +458,14 @@ static int __init dmi_present(const char __iomem *p) return 0; } } - dmi_ver = 0; - return 1; -} -static int __init smbios_present(const char __iomem *p) -{ - u8 buf[32]; - - memcpy_fromio(buf, p, 32); - if ((buf[5] < 32) && dmi_checksum(buf, buf[5])) { - dmi_ver = (buf[6] << 8) + buf[7]; - - /* Some BIOS report weird SMBIOS version, fix that up */ - switch (dmi_ver) { - case 0x021F: - case 0x0221: - pr_debug("SMBIOS version fixup(2.%d->2.%d)\n", - dmi_ver & 0xFF, 3); - dmi_ver = 0x0203; - break; - case 0x0233: - pr_debug("SMBIOS version fixup(2.%d->2.%d)\n", 51, 6); - dmi_ver = 0x0206; - break; - } - return memcmp(p + 16, "_DMI_", 5) || dmi_present(p + 16); - } return 1; } void __init dmi_scan_machine(void) { char __iomem *p, *q; - int rc; + char buf[32]; if (efi_enabled(EFI_CONFIG_TABLES)) { if (efi.smbios == EFI_INVALID_TABLE_ADDR) @@ -481,10 +478,10 @@ void __init dmi_scan_machine(void) p = dmi_ioremap(efi.smbios, 32); if (p == NULL) goto error; - - rc = smbios_present(p); + memcpy_fromio(buf, p, 32); dmi_iounmap(p, 32); - if (!rc) { + + if (!dmi_present(buf)) { dmi_available = 1; goto out; } @@ -499,18 +496,15 @@ void __init dmi_scan_machine(void) if (p == NULL) goto error; + memset(buf, 0, 16); for (q = p; q < p + 0x10000; q += 16) { - if (memcmp(q, "_SM_", 4) == 0 && q - p <= 0xFFE0) - rc = smbios_present(q); - else if (memcmp(q, "_DMI_", 5) == 0) - rc = dmi_present(q); - else - continue; - if (!rc) { + memcpy_fromio(buf + 16, q, 16); + if (!dmi_present(buf)) { dmi_available = 1; dmi_iounmap(p, 0x10000); goto out; } + memcpy(buf, buf + 16, 16); } dmi_iounmap(p, 0x10000); } diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c 
index 59d6b9bf204b..cf71f1d627fe 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -399,6 +399,14 @@ static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode) return; /* + * fbdev->blank can be called from irq context in case of a panic. + * Since we already have our own special panic handler which will + * restore the fbdev console mode completely, just bail out early. + */ + if (oops_in_progress) + return; + + /* * For each CRTC in this fb, turn the connectors on/off. */ drm_modeset_lock_all(dev); diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index 31f9201b2980..c40088ecf9f3 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -62,13 +62,13 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo, kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32)); if (random) { j = 0; - random_bytes = random32(); + random_bytes = prandom_u32(); for (i = 0; i < RANDOM_SIZE; i++) rarray[i] = i + skip_low; for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { if (j >= RANDOM_SIZE) { j = 0; - random_bytes = random32(); + random_bytes = prandom_u32(); } idx = (random_bytes >> (j * 2)) & 0xF; kfifo_in(fifo, diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c index f95e5df30db2..0161ae6ad629 100644 --- a/drivers/infiniband/hw/cxgb4/id_table.c +++ b/drivers/infiniband/hw/cxgb4/id_table.c @@ -54,7 +54,7 @@ u32 c4iw_id_alloc(struct c4iw_id_table *alloc) if (obj < alloc->max) { if (alloc->flags & C4IW_ID_TABLE_F_RANDOM) - alloc->last += random32() % RANDOM_SKIP; + alloc->last += prandom_u32() % RANDOM_SKIP; else alloc->last = obj + 1; if (alloc->last >= alloc->max) @@ -88,7 +88,7 @@ int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, alloc->start = start; alloc->flags = flags; if (flags & C4IW_ID_TABLE_F_RANDOM) - alloc->last = random32() % RANDOM_SKIP; + 
alloc->last = prandom_u32() % RANDOM_SKIP; else alloc->last = 0; alloc->max = num; diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index aed8afee56da..6d7f453b4d05 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -40,6 +40,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/io.h> +#include <linux/aio.h> #include <linux/jiffies.h> #include <linux/cpu.h> #include <asm/pgtable.h> diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 934792c477bc..4d599cedbb0b 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -93,7 +93,7 @@ static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, __be64 mlx4_ib_gen_node_guid(void) { #define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) - return cpu_to_be64(NODE_GUID_HI | random32()); + return cpu_to_be64(NODE_GUID_HI | prandom_u32()); } __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 4f7aa301b3b1..b56c9428f3c5 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -39,7 +39,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/io.h> -#include <linux/uio.h> +#include <linux/aio.h> #include <linux/jiffies.h> #include <asm/pgtable.h> #include <linux/delay.h> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 67b0c1d23678..249976cfb28a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -460,7 +460,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even goto err_qp; } - psn = random32() & 0xffffff; + psn = prandom_u32() & 0xffffff; ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); if (ret) goto 
err_modify; diff --git a/drivers/leds/leds-ot200.c b/drivers/leds/leds-ot200.c index ee14662ed5ce..98cae529373f 100644 --- a/drivers/leds/leds-ot200.c +++ b/drivers/leds/leds-ot200.c @@ -47,37 +47,37 @@ static struct ot200_led leds[] = { { .name = "led_1", .port = 0x49, - .mask = BIT(7), + .mask = BIT(6), }, { .name = "led_2", .port = 0x49, - .mask = BIT(6), + .mask = BIT(5), }, { .name = "led_3", .port = 0x49, - .mask = BIT(5), + .mask = BIT(4), }, { .name = "led_4", .port = 0x49, - .mask = BIT(4), + .mask = BIT(3), }, { .name = "led_5", .port = 0x49, - .mask = BIT(3), + .mask = BIT(2), }, { .name = "led_6", .port = 0x49, - .mask = BIT(2), + .mask = BIT(1), }, { .name = "led_7", .port = 0x49, - .mask = BIT(1), + .mask = BIT(0), } }; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3b62be160a6e..864baabaee25 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -686,7 +686,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, * We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ - next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); + next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!cpu->lg->pgdirs[next].pgdir) { cpu->lg->pgdirs[next].pgdir = diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7e469260fe5e..e52880f6b589 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -695,7 +695,7 @@ static void end_clone_bio(struct bio *clone, int error) * Do not use blk_end_request() here, because it may complete * the original request before the clone, and break the ordering. 
*/ - blk_update_request(tio->orig, 0, nr_bytes); + blk_update_request(tio->orig, 0, nr_bytes, NULL); } /* diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c index 5451beff183f..a60c188c2bd9 100644 --- a/drivers/message/i2o/i2o_config.c +++ b/drivers/message/i2o/i2o_config.c @@ -687,6 +687,11 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, } size = size >> 16; size *= 4; + if (size > sizeof(rmsg)) { + rcode = -EINVAL; + goto sg_list_cleanup; + } + /* Copy in the user's I2O command */ if (copy_from_user(rmsg, user_msg, size)) { rcode = -EFAULT; @@ -922,6 +927,11 @@ static int i2o_cfg_passthru(unsigned long arg) } size = size >> 16; size *= 4; + if (size > sizeof(rmsg)) { + rcode = -EFAULT; + goto sg_list_cleanup; + } + /* Copy in the user's I2O command */ if (copy_from_user(rmsg, user_msg, size)) { rcode = -EFAULT; diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 08a3cf2a7610..9290bb51a06a 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -120,8 +120,8 @@ static void mmc_should_fail_request(struct mmc_host *host, !should_fail(&host->fail_mmc_request, data->blksz * data->blocks)) return; - data->error = data_errors[random32() % ARRAY_SIZE(data_errors)]; - data->bytes_xfered = (random32() % (data->bytes_xfered >> 9)) << 9; + data->error = data_errors[prandom_u32() % ARRAY_SIZE(data_errors)]; + data->bytes_xfered = (prandom_u32() % (data->bytes_xfered >> 9)) << 9; } #else /* CONFIG_FAIL_MMC_REQUEST */ diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index 149a3a038491..5abdd4894082 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -4085,7 +4085,7 @@ static int cnic_cm_alloc_mem(struct cnic_dev *dev) if (!cp->csk_tbl) return -ENOMEM; - port_id = random32(); + port_id = prandom_u32(); port_id %= CNIC_LOCAL_PORT_RANGE; if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE, CNIC_LOCAL_PORT_MIN, 
port_id)) { @@ -4145,7 +4145,7 @@ static int cnic_cm_init_bnx2_hw(struct cnic_dev *dev) { u32 seed; - seed = random32(); + seed = prandom_u32(); cnic_ctx_wr(dev, 45, 0, seed); return 0; } diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 49b8b58fc5c6..484f77ec2ce1 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -449,7 +449,7 @@ static int transmit(struct baycom_state *bc, int cnt, unsigned char stat) if ((--bc->hdlctx.slotcnt) > 0) return 0; bc->hdlctx.slotcnt = bc->ch_params.slottime; - if ((random32() % 256) > bc->ch_params.ppersist) + if ((prandom_u32() % 256) > bc->ch_params.ppersist) return 0; } } diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index a4a3516b6bbf..3169252613fa 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -389,7 +389,7 @@ void hdlcdrv_arbitrate(struct net_device *dev, struct hdlcdrv_state *s) if ((--s->hdlctx.slotcnt) > 0) return; s->hdlctx.slotcnt = s->ch_params.slottime; - if ((random32() % 256) > s->ch_params.ppersist) + if ((prandom_u32() % 256) > s->ch_params.ppersist) return; start_tx(dev, s); } diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 4cf8f1017aad..ae3feb074f00 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -638,7 +638,7 @@ static void yam_arbitrate(struct net_device *dev) yp->slotcnt = yp->slot / 10; /* is random > persist ? 
*/ - if ((random32() % 256) > yp->pers) + if ((prandom_u32() % 256) > yp->pers) return; yam_start_tx(dev, yp); diff --git a/drivers/net/team/team_mode_random.c b/drivers/net/team/team_mode_random.c index 9eabfaa22f3e..5ca14d463ba7 100644 --- a/drivers/net/team/team_mode_random.c +++ b/drivers/net/team/team_mode_random.c @@ -18,7 +18,7 @@ static u32 random_N(unsigned int N) { - return reciprocal_divide(random32(), N); + return reciprocal_divide(prandom_u32(), N); } static bool rnd_transmit(struct team *team, struct sk_buff *skb) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c index 4166e642068b..bca31a855875 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c @@ -1118,7 +1118,7 @@ static void brcmf_p2p_afx_handler(struct work_struct *work) if (afx_hdl->is_listen && afx_hdl->my_listen_chan) /* 100ms ~ 300ms */ err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan, - 100 * (1 + (random32() % 3))); + 100 * (1 + (prandom_u32() % 3))); else err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan); diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c index dbf5b1289516..aebf66cdd71f 100644 --- a/drivers/net/wireless/mwifiex/cfg80211.c +++ b/drivers/net/wireless/mwifiex/cfg80211.c @@ -216,7 +216,7 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, mwifiex_form_mgmt_frame(skb, buf, len); mwifiex_queue_tx_pkt(priv, skb); - *cookie = random32() | 1; + *cookie = prandom_u32() | 1; cfg80211_mgmt_tx_status(wdev, *cookie, buf, len, true, GFP_ATOMIC); wiphy_dbg(wiphy, "info: management frame transmitted\n"); @@ -271,7 +271,7 @@ mwifiex_cfg80211_remain_on_channel(struct wiphy *wiphy, duration); if (!ret) { - *cookie = random32() | 1; + *cookie = prandom_u32() | 1; priv->roc_cfg.cookie = *cookie; priv->roc_cfg.chan = *chan; diff --git a/drivers/platform/x86/thinkpad_acpi.c 
b/drivers/platform/x86/thinkpad_acpi.c index 9a907567f41e..edec135b1685 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -1964,9 +1964,6 @@ struct tp_nvram_state { /* kthread for the hotkey poller */ static struct task_struct *tpacpi_hotkey_task; -/* Acquired while the poller kthread is running, use to sync start/stop */ -static struct mutex hotkey_thread_mutex; - /* * Acquire mutex to write poller control variables as an * atomic block. @@ -2462,8 +2459,6 @@ static int hotkey_kthread(void *data) unsigned int poll_freq; bool was_frozen; - mutex_lock(&hotkey_thread_mutex); - if (tpacpi_lifecycle == TPACPI_LIFE_EXITING) goto exit; @@ -2523,7 +2518,6 @@ static int hotkey_kthread(void *data) } exit: - mutex_unlock(&hotkey_thread_mutex); return 0; } @@ -2533,9 +2527,6 @@ static void hotkey_poll_stop_sync(void) if (tpacpi_hotkey_task) { kthread_stop(tpacpi_hotkey_task); tpacpi_hotkey_task = NULL; - mutex_lock(&hotkey_thread_mutex); - /* at this point, the thread did exit */ - mutex_unlock(&hotkey_thread_mutex); } } @@ -3234,7 +3225,6 @@ static int __init hotkey_init(struct ibm_init_struct *iibm) mutex_init(&hotkey_mutex); #ifdef CONFIG_THINKPAD_ACPI_HOTKEY_POLL - mutex_init(&hotkey_thread_mutex); mutex_init(&hotkey_thread_data_mutex); #endif diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 9b742d3ffb94..66385402d20e 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -259,6 +259,76 @@ void rtc_device_unregister(struct rtc_device *rtc) } EXPORT_SYMBOL_GPL(rtc_device_unregister); +static void devm_rtc_device_release(struct device *dev, void *res) +{ + struct rtc_device *rtc = *(struct rtc_device **)res; + + rtc_device_unregister(rtc); +} + +static int devm_rtc_device_match(struct device *dev, void *res, void *data) +{ + struct rtc **r = res; + + return *r == data; +} + +/** + * devm_rtc_device_register - resource managed rtc_device_register() + * @dev: the device to register + * @name: the name of the 
device + * @ops: the rtc operations structure + * @owner: the module owner + * + * @return a struct rtc on success, or an ERR_PTR on error + * + * Managed rtc_device_register(). The rtc_device returned from this function + * are automatically freed on driver detach. See rtc_device_register() + * for more information. + */ + +struct rtc_device *devm_rtc_device_register(struct device *dev, + const char *name, + const struct rtc_class_ops *ops, + struct module *owner) +{ + struct rtc_device **ptr, *rtc; + + ptr = devres_alloc(devm_rtc_device_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + rtc = rtc_device_register(name, dev, ops, owner); + if (!IS_ERR(rtc)) { + *ptr = rtc; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return rtc; +} +EXPORT_SYMBOL_GPL(devm_rtc_device_register); + +/** + * devm_rtc_device_unregister - resource managed devm_rtc_device_unregister() + * @dev: the device to unregister + * @rtc: the RTC class device to unregister + * + * Deallocated a rtc allocated with devm_rtc_device_register(). Normally this + * function will not need to be called and the resource management code will + * ensure that the resource is freed. 
+ */ +void devm_rtc_device_unregister(struct device *dev, struct rtc_device *rtc) +{ + int rc; + + rc = devres_release(dev, devm_rtc_device_release, + devm_rtc_device_match, rtc); + WARN_ON(rc); +} +EXPORT_SYMBOL_GPL(devm_rtc_device_unregister); + static int __init rtc_init(void) { rtc_class = class_create(THIS_MODULE, "rtc"); diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c index 63b17ebe90e8..76f9505ff7c5 100644 --- a/drivers/rtc/rtc-88pm80x.c +++ b/drivers/rtc/rtc-88pm80x.c @@ -312,7 +312,7 @@ static int pm80x_rtc_probe(struct platform_device *pdev) } rtc_tm_to_time(&tm, &ticks); - info->rtc_dev = rtc_device_register("88pm80x-rtc", &pdev->dev, + info->rtc_dev = devm_rtc_device_register(&pdev->dev, "88pm80x-rtc", &pm80x_rtc_ops, THIS_MODULE); if (IS_ERR(info->rtc_dev)) { ret = PTR_ERR(info->rtc_dev); @@ -346,7 +346,6 @@ static int pm80x_rtc_remove(struct platform_device *pdev) { struct pm80x_rtc_info *info = platform_get_drvdata(pdev); platform_set_drvdata(pdev, NULL); - rtc_device_unregister(info->rtc_dev); pm80x_free_irq(info->chip, info->irq, info); return 0; } diff --git a/drivers/rtc/rtc-ab3100.c b/drivers/rtc/rtc-ab3100.c index 261a07e0fb24..c2f12d370009 100644 --- a/drivers/rtc/rtc-ab3100.c +++ b/drivers/rtc/rtc-ab3100.c @@ -257,19 +257,7 @@ static struct platform_driver ab3100_rtc_driver = { .remove = __exit_p(ab3100_rtc_remove), }; -static int __init ab3100_rtc_init(void) -{ - return platform_driver_probe(&ab3100_rtc_driver, - ab3100_rtc_probe); -} - -static void __exit ab3100_rtc_exit(void) -{ - platform_driver_unregister(&ab3100_rtc_driver); -} - -module_init(ab3100_rtc_init); -module_exit(ab3100_rtc_exit); +module_platform_driver_probe(ab3100_rtc_driver, ab3100_rtc_probe); MODULE_AUTHOR("Linus Walleij <linus.walleij@stericsson.com>"); MODULE_DESCRIPTION("AB3100 RTC Driver"); diff --git a/drivers/rtc/rtc-at32ap700x.c b/drivers/rtc/rtc-at32ap700x.c index 8dd08305aae1..619c8877f2f1 100644 --- a/drivers/rtc/rtc-at32ap700x.c +++ 
b/drivers/rtc/rtc-at32ap700x.c @@ -302,17 +302,7 @@ static struct platform_driver at32_rtc_driver = { }, }; -static int __init at32_rtc_init(void) -{ - return platform_driver_probe(&at32_rtc_driver, at32_rtc_probe); -} -module_init(at32_rtc_init); - -static void __exit at32_rtc_exit(void) -{ - platform_driver_unregister(&at32_rtc_driver); -} -module_exit(at32_rtc_exit); +module_platform_driver_probe(at32_rtc_driver, at32_rtc_probe); MODULE_AUTHOR("Hans-Christian Egtvedt <hcegtvedt@atmel.com>"); MODULE_DESCRIPTION("Real time clock for AVR32 AT32AP700x"); diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 434ebc3a99dc..f63c8fee73eb 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -390,18 +390,7 @@ static struct platform_driver at91_rtc_driver = { }, }; -static int __init at91_rtc_init(void) -{ - return platform_driver_probe(&at91_rtc_driver, at91_rtc_probe); -} - -static void __exit at91_rtc_exit(void) -{ - platform_driver_unregister(&at91_rtc_driver); -} - -module_init(at91_rtc_init); -module_exit(at91_rtc_exit); +module_platform_driver_probe(at91_rtc_driver, at91_rtc_probe); MODULE_AUTHOR("Rick Bronson"); MODULE_DESCRIPTION("RTC driver for Atmel AT91RM9200"); diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c index b309da4ec745..dfd6d6638aa3 100644 --- a/drivers/rtc/rtc-au1xxx.c +++ b/drivers/rtc/rtc-au1xxx.c @@ -134,18 +134,7 @@ static struct platform_driver au1xrtc_driver = { .remove = au1xtoy_rtc_remove, }; -static int __init au1xtoy_rtc_init(void) -{ - return platform_driver_probe(&au1xrtc_driver, au1xtoy_rtc_probe); -} - -static void __exit au1xtoy_rtc_exit(void) -{ - platform_driver_unregister(&au1xrtc_driver); -} - -module_init(au1xtoy_rtc_init); -module_exit(au1xtoy_rtc_exit); +module_platform_driver_probe(au1xrtc_driver, au1xtoy_rtc_probe); MODULE_DESCRIPTION("Au1xxx TOY-counter-based RTC driver"); MODULE_AUTHOR("Manuel Lauss <manuel.lauss@gmail.com>"); diff --git 
a/drivers/rtc/rtc-coh901331.c b/drivers/rtc/rtc-coh901331.c index 2d28ec1aa1cd..bf0387f80d2d 100644 --- a/drivers/rtc/rtc-coh901331.c +++ b/drivers/rtc/rtc-coh901331.c @@ -155,7 +155,6 @@ static int __exit coh901331_remove(struct platform_device *pdev) struct coh901331_port *rtap = dev_get_drvdata(&pdev->dev); if (rtap) { - rtc_device_unregister(rtap->rtc); clk_unprepare(rtap->clk); platform_set_drvdata(pdev, NULL); } @@ -211,8 +210,8 @@ static int __init coh901331_probe(struct platform_device *pdev) clk_disable(rtap->clk); platform_set_drvdata(pdev, rtap); - rtap->rtc = rtc_device_register("coh901331", &pdev->dev, &coh901331_ops, - THIS_MODULE); + rtap->rtc = devm_rtc_device_register(&pdev->dev, "coh901331", + &coh901331_ops, THIS_MODULE); if (IS_ERR(rtap->rtc)) { ret = PTR_ERR(rtap->rtc); goto out_no_rtc; @@ -287,18 +286,7 @@ static struct platform_driver coh901331_driver = { .shutdown = coh901331_shutdown, }; -static int __init coh901331_init(void) -{ - return platform_driver_probe(&coh901331_driver, coh901331_probe); -} - -static void __exit coh901331_exit(void) -{ - platform_driver_unregister(&coh901331_driver); -} - -module_init(coh901331_init); -module_exit(coh901331_exit); +module_platform_driver_probe(coh901331_driver, coh901331_probe); MODULE_AUTHOR("Linus Walleij <linus.walleij@stericsson.com>"); MODULE_DESCRIPTION("ST-Ericsson AB COH 901 331 RTC Driver"); diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index 0dde688ca09b..c4d12d971ded 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -249,7 +249,7 @@ static int da9052_rtc_probe(struct platform_device *pdev) return ret; } - rtc->rtc = rtc_device_register(pdev->name, &pdev->dev, + rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &da9052_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) return PTR_ERR(rtc->rtc); @@ -259,9 +259,6 @@ static int da9052_rtc_probe(struct platform_device *pdev) static int da9052_rtc_remove(struct platform_device *pdev) { - struct 
da9052_rtc *rtc = pdev->dev.platform_data; - - rtc_device_unregister(rtc->rtc); platform_set_drvdata(pdev, NULL); return 0; diff --git a/drivers/rtc/rtc-da9055.c b/drivers/rtc/rtc-da9055.c index 8f0dcfedb83c..73858ca9709a 100644 --- a/drivers/rtc/rtc-da9055.c +++ b/drivers/rtc/rtc-da9055.c @@ -294,7 +294,7 @@ static int da9055_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); - rtc->rtc = rtc_device_register(pdev->name, &pdev->dev, + rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &da9055_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) { ret = PTR_ERR(rtc->rtc); @@ -317,9 +317,6 @@ err_rtc: static int da9055_rtc_remove(struct platform_device *pdev) { - struct da9055_rtc *rtc = pdev->dev.platform_data; - - rtc_device_unregister(rtc->rtc); platform_set_drvdata(pdev, NULL); return 0; diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 56b73089bb29..a55048c3e26f 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -523,7 +523,7 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, davinci_rtc); - davinci_rtc->rtc = rtc_device_register(pdev->name, &pdev->dev, + davinci_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &davinci_rtc_ops, THIS_MODULE); if (IS_ERR(davinci_rtc->rtc)) { ret = PTR_ERR(davinci_rtc->rtc); @@ -543,7 +543,7 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) 0, "davinci_rtc", davinci_rtc); if (ret < 0) { dev_err(dev, "unable to register davinci RTC interrupt\n"); - goto fail2; + goto fail1; } /* Enable interrupts */ @@ -557,14 +557,12 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) return 0; -fail2: - rtc_device_unregister(davinci_rtc->rtc); fail1: platform_set_drvdata(pdev, NULL); return ret; } -static int davinci_rtc_remove(struct platform_device *pdev) +static int __exit davinci_rtc_remove(struct platform_device *pdev) { struct davinci_rtc *davinci_rtc = platform_get_drvdata(pdev); 
@@ -572,8 +570,6 @@ static int davinci_rtc_remove(struct platform_device *pdev) rtcif_write(davinci_rtc, 0, PRTCIF_INTEN); - rtc_device_unregister(davinci_rtc->rtc); - platform_set_drvdata(pdev, NULL); return 0; @@ -581,24 +577,14 @@ static int davinci_rtc_remove(struct platform_device *pdev) static struct platform_driver davinci_rtc_driver = { .probe = davinci_rtc_probe, - .remove = davinci_rtc_remove, + .remove = __exit_p(davinci_rtc_remove), .driver = { .name = "rtc_davinci", .owner = THIS_MODULE, }, }; -static int __init rtc_init(void) -{ - return platform_driver_probe(&davinci_rtc_driver, davinci_rtc_probe); -} -module_init(rtc_init); - -static void __exit rtc_exit(void) -{ - platform_driver_unregister(&davinci_rtc_driver); -} -module_exit(rtc_exit); +module_platform_driver_probe(davinci_rtc_driver, davinci_rtc_probe); MODULE_AUTHOR("Miguel Aguilar <miguel.aguilar@ridgerun.com>"); MODULE_DESCRIPTION("Texas Instruments DaVinci PRTC Driver"); diff --git a/drivers/rtc/rtc-ds1302.c b/drivers/rtc/rtc-ds1302.c index fdbcdb289d60..7d4c2b44d603 100644 --- a/drivers/rtc/rtc-ds1302.c +++ b/drivers/rtc/rtc-ds1302.c @@ -234,7 +234,7 @@ static int __init ds1302_rtc_probe(struct platform_device *pdev) return 0; } -static int ds1302_rtc_remove(struct platform_device *pdev) +static int __exit ds1302_rtc_remove(struct platform_device *pdev) { struct rtc_device *rtc = platform_get_drvdata(pdev); @@ -249,21 +249,10 @@ static struct platform_driver ds1302_platform_driver = { .name = DRV_NAME, .owner = THIS_MODULE, }, - .remove = ds1302_rtc_remove, + .remove = __exit_p(ds1302_rtc_remove), }; -static int __init ds1302_rtc_init(void) -{ - return platform_driver_probe(&ds1302_platform_driver, ds1302_rtc_probe); -} - -static void __exit ds1302_rtc_exit(void) -{ - platform_driver_unregister(&ds1302_platform_driver); -} - -module_init(ds1302_rtc_init); -module_exit(ds1302_rtc_exit); +module_platform_driver_probe(ds1302_platform_driver, ds1302_rtc_probe); MODULE_DESCRIPTION("Dallas 
DS1302 RTC driver"); MODULE_VERSION(DRV_VERSION); diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 970a236b147a..a65621c42170 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -4,6 +4,7 @@ * Copyright (C) 2005 James Chapman (ds1337 core) * Copyright (C) 2006 David Brownell * Copyright (C) 2009 Matthias Fuchs (rx8025 support) + * Copyright (C) 2012 Bertrand Achard (nvram access fixes) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -196,7 +197,7 @@ static s32 ds1307_read_block_data_once(const struct i2c_client *client, static s32 ds1307_read_block_data(const struct i2c_client *client, u8 command, u8 length, u8 *values) { - u8 oldvalues[I2C_SMBUS_BLOCK_MAX]; + u8 oldvalues[255]; s32 ret; int tries = 0; @@ -222,7 +223,7 @@ static s32 ds1307_read_block_data(const struct i2c_client *client, u8 command, static s32 ds1307_write_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values) { - u8 currvalues[I2C_SMBUS_BLOCK_MAX]; + u8 currvalues[255]; int tries = 0; dev_dbg(&client->dev, "ds1307_write_block_data (length=%d)\n", length); @@ -250,6 +251,57 @@ static s32 ds1307_write_block_data(const struct i2c_client *client, u8 command, /*----------------------------------------------------------------------*/ +/* These RTC devices are not designed to be connected to a SMbus adapter. + SMbus limits block operations length to 32 bytes, whereas it's not + limited on I2C buses. 
As a result, accesses may exceed 32 bytes; + in that case, split them into smaller blocks */ + +static s32 ds1307_native_smbus_write_block_data(const struct i2c_client *client, + u8 command, u8 length, const u8 *values) +{ + u8 suboffset = 0; + + if (length <= I2C_SMBUS_BLOCK_MAX) + return i2c_smbus_write_i2c_block_data(client, + command, length, values); + + while (suboffset < length) { + s32 retval = i2c_smbus_write_i2c_block_data(client, + command + suboffset, + min(I2C_SMBUS_BLOCK_MAX, length - suboffset), + values + suboffset); + if (retval < 0) + return retval; + + suboffset += I2C_SMBUS_BLOCK_MAX; + } + return length; +} + +static s32 ds1307_native_smbus_read_block_data(const struct i2c_client *client, + u8 command, u8 length, u8 *values) +{ + u8 suboffset = 0; + + if (length <= I2C_SMBUS_BLOCK_MAX) + return i2c_smbus_read_i2c_block_data(client, + command, length, values); + + while (suboffset < length) { + s32 retval = i2c_smbus_read_i2c_block_data(client, + command + suboffset, + min(I2C_SMBUS_BLOCK_MAX, length - suboffset), + values + suboffset); + if (retval < 0) + return retval; + + suboffset += I2C_SMBUS_BLOCK_MAX; + } + return length; +} + +/*----------------------------------------------------------------------*/ + /* * The IRQ logic includes a "real" handler running in IRQ context just * long enough to schedule this workqueue entry. 
We need a task context @@ -646,8 +698,8 @@ static int ds1307_probe(struct i2c_client *client, buf = ds1307->regs; if (i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK)) { - ds1307->read_block_data = i2c_smbus_read_i2c_block_data; - ds1307->write_block_data = i2c_smbus_write_i2c_block_data; + ds1307->read_block_data = ds1307_native_smbus_read_block_data; + ds1307->write_block_data = ds1307_native_smbus_write_block_data; } else { ds1307->read_block_data = ds1307_read_block_data; ds1307->write_block_data = ds1307_write_block_data; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 6a3fcfe3b0e7..6ce8a997cf51 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -538,15 +538,14 @@ static int ds1511_rtc_probe(struct platform_device *pdev) } } - rtc = rtc_device_register(pdev->name, &pdev->dev, &ds1511_rtc_ops, - THIS_MODULE); + rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &ds1511_rtc_ops, + THIS_MODULE); if (IS_ERR(rtc)) return PTR_ERR(rtc); pdata->rtc = rtc; ret = sysfs_create_bin_file(&pdev->dev.kobj, &ds1511_nvram_attr); - if (ret) - rtc_device_unregister(pdata->rtc); + return ret; } @@ -555,7 +554,6 @@ static int ds1511_rtc_remove(struct platform_device *pdev) struct rtc_plat_data *pdata = platform_get_drvdata(pdev); sysfs_remove_bin_file(&pdev->dev.kobj, &ds1511_nvram_attr); - rtc_device_unregister(pdata->rtc); if (pdata->irq > 0) { /* * disable the alarm interrupt diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 25ce0621ade9..8c6c952e90b1 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -326,15 +326,14 @@ static int ds1553_rtc_probe(struct platform_device *pdev) } } - rtc = rtc_device_register(pdev->name, &pdev->dev, + rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &ds1553_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) return PTR_ERR(rtc); pdata->rtc = rtc; ret = sysfs_create_bin_file(&pdev->dev.kobj, &ds1553_nvram_attr); - if (ret) - rtc_device_unregister(rtc); + 
return ret; } @@ -343,7 +342,6 @@ static int ds1553_rtc_remove(struct platform_device *pdev) struct rtc_plat_data *pdata = platform_get_drvdata(pdev); sysfs_remove_bin_file(&pdev->dev.kobj, &ds1553_nvram_attr); - rtc_device_unregister(pdata->rtc); if (pdata->irq > 0) writeb(0, pdata->ioaddr + RTC_INTERRUPTS); return 0; diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 609c870e2cc5..eccdc62ae1c0 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -208,17 +208,14 @@ static int ds1742_rtc_probe(struct platform_device *pdev) pdata->last_jiffies = jiffies; platform_set_drvdata(pdev, pdata); - rtc = rtc_device_register(pdev->name, &pdev->dev, + rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &ds1742_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) return PTR_ERR(rtc); pdata->rtc = rtc; ret = sysfs_create_bin_file(&pdev->dev.kobj, &pdata->nvram_attr); - if (ret) { - dev_err(&pdev->dev, "creating nvram file in sysfs failed\n"); - rtc_device_unregister(rtc); - } + return ret; } @@ -227,7 +224,6 @@ static int ds1742_rtc_remove(struct platform_device *pdev) struct rtc_plat_data *pdata = platform_get_drvdata(pdev); sysfs_remove_bin_file(&pdev->dev.kobj, &pdata->nvram_attr); - rtc_device_unregister(pdata->rtc); return 0; } diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c index 1a0c37c9152b..e70666272266 100644 --- a/drivers/rtc/rtc-efi.c +++ b/drivers/rtc/rtc-efi.c @@ -218,18 +218,7 @@ static struct platform_driver efi_rtc_driver = { .remove = __exit_p(efi_rtc_remove), }; -static int __init efi_rtc_init(void) -{ - return platform_driver_probe(&efi_rtc_driver, efi_rtc_probe); -} - -static void __exit efi_rtc_exit(void) -{ - platform_driver_unregister(&efi_rtc_driver); -} - -module_init(efi_rtc_init); -module_exit(efi_rtc_exit); +module_platform_driver_probe(efi_rtc_driver, efi_rtc_probe); MODULE_AUTHOR("dann frazier <dannf@hp.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 
1a4e5e4a70cd..5807b77c444a 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -153,8 +153,8 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) pdev->dev.platform_data = ep93xx_rtc; platform_set_drvdata(pdev, ep93xx_rtc); - ep93xx_rtc->rtc = rtc_device_register(pdev->name, - &pdev->dev, &ep93xx_rtc_ops, THIS_MODULE); + ep93xx_rtc->rtc = devm_rtc_device_register(&pdev->dev, + pdev->name, &ep93xx_rtc_ops, THIS_MODULE); if (IS_ERR(ep93xx_rtc->rtc)) { err = PTR_ERR(ep93xx_rtc->rtc); goto exit; @@ -162,12 +162,10 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) err = sysfs_create_group(&pdev->dev.kobj, &ep93xx_rtc_sysfs_files); if (err) - goto fail; + goto exit; return 0; -fail: - rtc_device_unregister(ep93xx_rtc->rtc); exit: platform_set_drvdata(pdev, NULL); pdev->dev.platform_data = NULL; @@ -176,11 +174,8 @@ exit: static int ep93xx_rtc_remove(struct platform_device *pdev) { - struct ep93xx_rtc *ep93xx_rtc = platform_get_drvdata(pdev); - sysfs_remove_group(&pdev->dev.kobj, &ep93xx_rtc_sysfs_files); platform_set_drvdata(pdev, NULL); - rtc_device_unregister(ep93xx_rtc->rtc); pdev->dev.platform_data = NULL; return 0; diff --git a/drivers/rtc/rtc-generic.c b/drivers/rtc/rtc-generic.c index 98322004ad2e..0bf4530e18b0 100644 --- a/drivers/rtc/rtc-generic.c +++ b/drivers/rtc/rtc-generic.c @@ -65,18 +65,7 @@ static struct platform_driver generic_rtc_driver = { .remove = __exit_p(generic_rtc_remove), }; -static int __init generic_rtc_init(void) -{ - return platform_driver_probe(&generic_rtc_driver, generic_rtc_probe); -} - -static void __exit generic_rtc_fini(void) -{ - platform_driver_unregister(&generic_rtc_driver); -} - -module_init(generic_rtc_init); -module_exit(generic_rtc_fini); +module_platform_driver_probe(generic_rtc_driver, generic_rtc_probe); MODULE_AUTHOR("Kyle McMartin <kyle@mcmartin.ca>"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index 82aad695979e..d3a8c8e255de 100644 --- 
a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -369,7 +369,7 @@ static void dryice_work(struct work_struct *work) /* * probe for dryice rtc device */ -static int dryice_rtc_probe(struct platform_device *pdev) +static int __init dryice_rtc_probe(struct platform_device *pdev) { struct resource *res; struct imxdi_dev *imxdi; @@ -464,7 +464,7 @@ static int dryice_rtc_probe(struct platform_device *pdev) } platform_set_drvdata(pdev, imxdi); - imxdi->rtc = rtc_device_register(pdev->name, &pdev->dev, + imxdi->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &dryice_rtc_ops, THIS_MODULE); if (IS_ERR(imxdi->rtc)) { rc = PTR_ERR(imxdi->rtc); @@ -479,7 +479,7 @@ err: return rc; } -static int dryice_rtc_remove(struct platform_device *pdev) +static int __exit dryice_rtc_remove(struct platform_device *pdev) { struct imxdi_dev *imxdi = platform_get_drvdata(pdev); @@ -488,8 +488,6 @@ static int dryice_rtc_remove(struct platform_device *pdev) /* mask all interrupts */ __raw_writel(0, imxdi->ioaddr + DIER); - rtc_device_unregister(imxdi->rtc); - clk_disable_unprepare(imxdi->clk); return 0; @@ -510,21 +508,10 @@ static struct platform_driver dryice_rtc_driver = { .owner = THIS_MODULE, .of_match_table = of_match_ptr(dryice_dt_ids), }, - .remove = dryice_rtc_remove, + .remove = __exit_p(dryice_rtc_remove), }; -static int __init dryice_rtc_init(void) -{ - return platform_driver_probe(&dryice_rtc_driver, dryice_rtc_probe); -} - -static void __exit dryice_rtc_exit(void) -{ - platform_driver_unregister(&dryice_rtc_driver); -} - -module_init(dryice_rtc_init); -module_exit(dryice_rtc_exit); +module_platform_driver_probe(dryice_rtc_driver, dryice_rtc_probe); MODULE_AUTHOR("Freescale Semiconductor, Inc."); MODULE_AUTHOR("Baruch Siach <baruch@tkos.co.il>"); diff --git a/drivers/rtc/rtc-lp8788.c b/drivers/rtc/rtc-lp8788.c index 9a4631218f41..9853ac15b296 100644 --- a/drivers/rtc/rtc-lp8788.c +++ b/drivers/rtc/rtc-lp8788.c @@ -299,7 +299,7 @@ static int lp8788_rtc_probe(struct 
platform_device *pdev) device_init_wakeup(dev, 1); - rtc->rdev = rtc_device_register("lp8788_rtc", dev, + rtc->rdev = devm_rtc_device_register(dev, "lp8788_rtc", &lp8788_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rdev)) { dev_err(dev, "can not register rtc device\n"); @@ -314,9 +314,6 @@ static int lp8788_rtc_probe(struct platform_device *pdev) static int lp8788_rtc_remove(struct platform_device *pdev) { - struct lp8788_rtc *rtc = platform_get_drvdata(pdev); - - rtc_device_unregister(rtc->rdev); platform_set_drvdata(pdev, NULL); return 0; diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index 40a598332bac..787550d756e9 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -273,8 +273,8 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); - rtc->rtc = rtc_device_register(RTC_NAME, &pdev->dev, &lpc32xx_rtc_ops, - THIS_MODULE); + rtc->rtc = devm_rtc_device_register(&pdev->dev, RTC_NAME, + &lpc32xx_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) { dev_err(&pdev->dev, "Can't get RTC\n"); platform_set_drvdata(pdev, NULL); @@ -307,7 +307,6 @@ static int lpc32xx_rtc_remove(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 0); platform_set_drvdata(pdev, NULL); - rtc_device_unregister(rtc->rtc); return 0; } diff --git a/drivers/rtc/rtc-max77686.c b/drivers/rtc/rtc-max77686.c index 6b1337f9baf4..5a12b32f77ec 100644 --- a/drivers/rtc/rtc-max77686.c +++ b/drivers/rtc/rtc-max77686.c @@ -24,7 +24,7 @@ /* RTC Control Register */ #define BCD_EN_SHIFT 0 -#define BCD_EN_MASK (1 << BCD_EN_SHIFT) +#define BCD_EN_MASK (1 << BCD_EN_SHIFT) #define MODEL24_SHIFT 1 #define MODEL24_MASK (1 << MODEL24_SHIFT) /* RTC Update Register1 */ @@ -33,12 +33,12 @@ #define RTC_RBUDR_SHIFT 4 #define RTC_RBUDR_MASK (1 << RTC_RBUDR_SHIFT) /* WTSR and SMPL Register */ -#define WTSRT_SHIFT 0 -#define SMPLT_SHIFT 2 +#define WTSRT_SHIFT 0 +#define SMPLT_SHIFT 2 #define WTSR_EN_SHIFT 6 #define SMPL_EN_SHIFT 7 -#define WTSRT_MASK 
(3 << WTSRT_SHIFT) -#define SMPLT_MASK (3 << SMPLT_SHIFT) +#define WTSRT_MASK (3 << WTSRT_SHIFT) +#define SMPLT_MASK (3 << SMPLT_SHIFT) #define WTSR_EN_MASK (1 << WTSR_EN_SHIFT) #define SMPL_EN_MASK (1 << SMPL_EN_SHIFT) /* RTC Hour register */ @@ -466,7 +466,7 @@ static void max77686_rtc_enable_smpl(struct max77686_rtc_info *info, bool enable val = 0; regmap_read(info->max77686->rtc_regmap, MAX77686_WTSR_SMPL_CNTL, &val); - pr_info("%s: WTSR_SMPL(0x%02x)\n", __func__, val); + dev_info(info->dev, "%s: WTSR_SMPL(0x%02x)\n", __func__, val); } #endif /* MAX77686_RTC_WTSR_SMPL */ @@ -505,7 +505,8 @@ static int max77686_rtc_probe(struct platform_device *pdev) dev_info(&pdev->dev, "%s\n", __func__); - info = kzalloc(sizeof(struct max77686_rtc_info), GFP_KERNEL); + info = devm_kzalloc(&pdev->dev, sizeof(struct max77686_rtc_info), + GFP_KERNEL); if (!info) return -ENOMEM; @@ -519,7 +520,6 @@ static int max77686_rtc_probe(struct platform_device *pdev) ret = PTR_ERR(info->max77686->rtc_regmap); dev_err(info->max77686->dev, "Failed to allocate register map: %d\n", ret); - kfree(info); return ret; } platform_set_drvdata(pdev, info); @@ -538,8 +538,8 @@ static int max77686_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); - info->rtc_dev = rtc_device_register("max77686-rtc", &pdev->dev, - &max77686_rtc_ops, THIS_MODULE); + info->rtc_dev = devm_rtc_device_register(&pdev->dev, "max77686-rtc", + &max77686_rtc_ops, THIS_MODULE); if (IS_ERR(info->rtc_dev)) { dev_info(&pdev->dev, "%s: fail\n", __func__); @@ -555,32 +555,20 @@ static int max77686_rtc_probe(struct platform_device *pdev) goto err_rtc; info->virq = virq; - ret = request_threaded_irq(virq, NULL, max77686_rtc_alarm_irq, 0, - "rtc-alarm0", info); + ret = devm_request_threaded_irq(&pdev->dev, virq, NULL, + max77686_rtc_alarm_irq, 0, "rtc-alarm0", info); if (ret < 0) { dev_err(&pdev->dev, "Failed to request alarm IRQ: %d: %d\n", info->virq, ret); goto err_rtc; } - goto out; err_rtc: - kfree(info); - 
return ret; -out: return ret; } static int max77686_rtc_remove(struct platform_device *pdev) { - struct max77686_rtc_info *info = platform_get_drvdata(pdev); - - if (info) { - free_irq(info->virq, info); - rtc_device_unregister(info->rtc_dev); - kfree(info); - } - return 0; } @@ -594,11 +582,14 @@ static void max77686_rtc_shutdown(struct platform_device *pdev) for (i = 0; i < 3; i++) { max77686_rtc_enable_wtsr(info, false); regmap_read(info->max77686->rtc_regmap, MAX77686_WTSR_SMPL_CNTL, &val); - pr_info("%s: WTSR_SMPL reg(0x%02x)\n", __func__, val); - if (val & WTSR_EN_MASK) - pr_emerg("%s: fail to disable WTSR\n", __func__); - else { - pr_info("%s: success to disable WTSR\n", __func__); + dev_info(info->dev, "%s: WTSR_SMPL reg(0x%02x)\n", __func__, + val); + if (val & WTSR_EN_MASK) { + dev_emerg(info->dev, "%s: fail to disable WTSR\n", + __func__); + } else { + dev_info(info->dev, "%s: success to disable WTSR\n", + __func__); break; } } @@ -624,18 +615,8 @@ static struct platform_driver max77686_rtc_driver = { .id_table = rtc_id, }; -static int __init max77686_rtc_init(void) -{ - return platform_driver_register(&max77686_rtc_driver); -} -module_init(max77686_rtc_init); - -static void __exit max77686_rtc_exit(void) -{ - platform_driver_unregister(&max77686_rtc_driver); -} -module_exit(max77686_rtc_exit); +module_platform_driver(max77686_rtc_driver); MODULE_DESCRIPTION("Maxim MAX77686 RTC driver"); -MODULE_AUTHOR("<woong.byun@samsung.com>"); +MODULE_AUTHOR("Chiwoong Byun <woong.byun@samsung.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-max8907.c b/drivers/rtc/rtc-max8907.c index 31ca8faf9f05..9d62cdb83d11 100644 --- a/drivers/rtc/rtc-max8907.c +++ b/drivers/rtc/rtc-max8907.c @@ -190,7 +190,7 @@ static int max8907_rtc_probe(struct platform_device *pdev) rtc->max8907 = max8907; rtc->regmap = max8907->regmap_rtc; - rtc->rtc_dev = rtc_device_register("max8907-rtc", &pdev->dev, + rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, "max8907-rtc", 
&max8907_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc_dev)) { ret = PTR_ERR(rtc->rtc_dev); @@ -217,16 +217,11 @@ static int max8907_rtc_probe(struct platform_device *pdev) return 0; err_unregister: - rtc_device_unregister(rtc->rtc_dev); return ret; } static int max8907_rtc_remove(struct platform_device *pdev) { - struct max8907_rtc *rtc = platform_get_drvdata(pdev); - - rtc_device_unregister(rtc->rtc_dev); - return 0; } diff --git a/drivers/rtc/rtc-max8997.c b/drivers/rtc/rtc-max8997.c index 00e505b6bee3..d12acc49c822 100644 --- a/drivers/rtc/rtc-max8997.c +++ b/drivers/rtc/rtc-max8997.c @@ -479,8 +479,8 @@ static int max8997_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); - info->rtc_dev = rtc_device_register("max8997-rtc", &pdev->dev, - &max8997_rtc_ops, THIS_MODULE); + info->rtc_dev = devm_rtc_device_register(&pdev->dev, "max8997-rtc", + &max8997_rtc_ops, THIS_MODULE); if (IS_ERR(info->rtc_dev)) { ret = PTR_ERR(info->rtc_dev); @@ -507,17 +507,11 @@ static int max8997_rtc_probe(struct platform_device *pdev) return ret; err_out: - rtc_device_unregister(info->rtc_dev); return ret; } static int max8997_rtc_remove(struct platform_device *pdev) { - struct max8997_rtc_info *info = platform_get_drvdata(pdev); - - if (info) - rtc_device_unregister(info->rtc_dev); - return 0; } diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 2643d8874925..5391b154b43c 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -420,17 +420,7 @@ static struct platform_driver mc13xxx_rtc_driver = { }, }; -static int __init mc13xxx_rtc_init(void) -{ - return platform_driver_probe(&mc13xxx_rtc_driver, &mc13xxx_rtc_probe); -} -module_init(mc13xxx_rtc_init); - -static void __exit mc13xxx_rtc_exit(void) -{ - platform_driver_unregister(&mc13xxx_rtc_driver); -} -module_exit(mc13xxx_rtc_exit); +module_platform_driver_probe(mc13xxx_rtc_driver, &mc13xxx_rtc_probe); MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>"); 
MODULE_DESCRIPTION("RTC driver for Freescale MC13XXX PMIC"); diff --git a/drivers/rtc/rtc-msm6242.c b/drivers/rtc/rtc-msm6242.c index fcb113c11122..3ac1e8eca89d 100644 --- a/drivers/rtc/rtc-msm6242.c +++ b/drivers/rtc/rtc-msm6242.c @@ -252,18 +252,7 @@ static struct platform_driver msm6242_rtc_driver = { .remove = __exit_p(msm6242_rtc_remove), }; -static int __init msm6242_rtc_init(void) -{ - return platform_driver_probe(&msm6242_rtc_driver, msm6242_rtc_probe); -} - -static void __exit msm6242_rtc_fini(void) -{ - platform_driver_unregister(&msm6242_rtc_driver); -} - -module_init(msm6242_rtc_init); -module_exit(msm6242_rtc_fini); +module_platform_driver_probe(msm6242_rtc_driver, msm6242_rtc_probe); MODULE_AUTHOR("Geert Uytterhoeven <geert@linux-m68k.org>"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-mv.c b/drivers/rtc/rtc-mv.c index 8f87fec27ce7..baab802f2153 100644 --- a/drivers/rtc/rtc-mv.c +++ b/drivers/rtc/rtc-mv.c @@ -217,7 +217,7 @@ static const struct rtc_class_ops mv_rtc_alarm_ops = { .alarm_irq_enable = mv_rtc_alarm_irq_enable, }; -static int mv_rtc_probe(struct platform_device *pdev) +static int __init mv_rtc_probe(struct platform_device *pdev) { struct resource *res; struct rtc_plat_data *pdata; @@ -272,12 +272,13 @@ static int mv_rtc_probe(struct platform_device *pdev) if (pdata->irq >= 0) { device_init_wakeup(&pdev->dev, 1); - pdata->rtc = rtc_device_register(pdev->name, &pdev->dev, + pdata->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &mv_rtc_alarm_ops, THIS_MODULE); - } else - pdata->rtc = rtc_device_register(pdev->name, &pdev->dev, + } else { + pdata->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &mv_rtc_ops, THIS_MODULE); + } if (IS_ERR(pdata->rtc)) { ret = PTR_ERR(pdata->rtc); goto out; @@ -308,7 +309,6 @@ static int __exit mv_rtc_remove(struct platform_device *pdev) if (pdata->irq >= 0) device_init_wakeup(&pdev->dev, 0); - rtc_device_unregister(pdata->rtc); if (!IS_ERR(pdata->clk)) clk_disable_unprepare(pdata->clk); @@ 
-331,18 +331,7 @@ static struct platform_driver mv_rtc_driver = { }, }; -static __init int mv_init(void) -{ - return platform_driver_probe(&mv_rtc_driver, mv_rtc_probe); -} - -static __exit void mv_exit(void) -{ - platform_driver_unregister(&mv_rtc_driver); -} - -module_init(mv_init); -module_exit(mv_exit); +module_platform_driver_probe(mv_rtc_driver, mv_rtc_probe); MODULE_AUTHOR("Saeed Bishara <saeed@marvell.com>"); MODULE_DESCRIPTION("Marvell RTC driver"); diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 1c3ef7289565..13380ca3651c 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -439,7 +439,7 @@ static int mxc_rtc_probe(struct platform_device *pdev) if (pdata->irq >=0) device_init_wakeup(&pdev->dev, 1); - rtc = rtc_device_register(pdev->name, &pdev->dev, &mxc_rtc_ops, + rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &mxc_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { ret = PTR_ERR(rtc); @@ -464,8 +464,6 @@ static int mxc_rtc_remove(struct platform_device *pdev) { struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - rtc_device_unregister(pdata->rtc); - clk_disable_unprepare(pdata->clk); platform_set_drvdata(pdev, NULL); diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index a63680850fef..4d9525cc1cf4 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -222,7 +222,7 @@ static struct rtc_class_ops nuc900_rtc_ops = { .alarm_irq_enable = nuc900_alarm_irq_enable, }; -static int nuc900_rtc_probe(struct platform_device *pdev) +static int __init nuc900_rtc_probe(struct platform_device *pdev) { struct resource *res; struct nuc900_rtc *nuc900_rtc; @@ -284,7 +284,7 @@ fail1: kfree(nuc900_rtc); return err; } -static int nuc900_rtc_remove(struct platform_device *pdev) +static int __exit nuc900_rtc_remove(struct platform_device *pdev) { struct nuc900_rtc *nuc900_rtc = platform_get_drvdata(pdev); struct resource *res; @@ -304,25 +304,14 @@ static int nuc900_rtc_remove(struct platform_device *pdev) } 
static struct platform_driver nuc900_rtc_driver = { - .remove = nuc900_rtc_remove, + .remove = __exit_p(nuc900_rtc_remove), .driver = { .name = "nuc900-rtc", .owner = THIS_MODULE, }, }; -static int __init nuc900_rtc_init(void) -{ - return platform_driver_probe(&nuc900_rtc_driver, nuc900_rtc_probe); -} - -static void __exit nuc900_rtc_exit(void) -{ - platform_driver_unregister(&nuc900_rtc_driver); -} - -module_init(nuc900_rtc_init); -module_exit(nuc900_rtc_exit); +module_platform_driver_probe(nuc900_rtc_driver, nuc900_rtc_probe); MODULE_AUTHOR("Wan ZongShun <mcuos.com@gmail.com>"); MODULE_DESCRIPTION("nuc910/nuc920 RTC driver"); diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index 600971407aac..172cc5ca7489 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -550,17 +550,7 @@ static struct platform_driver omap_rtc_driver = { .id_table = omap_rtc_devtype, }; -static int __init rtc_init(void) -{ - return platform_driver_probe(&omap_rtc_driver, omap_rtc_probe); -} -module_init(rtc_init); - -static void __exit rtc_exit(void) -{ - platform_driver_unregister(&omap_rtc_driver); -} -module_exit(rtc_exit); +module_platform_driver_probe(omap_rtc_driver, omap_rtc_probe); MODULE_AUTHOR("George G. 
Davis (and others)"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c index 59c42986254e..bbc3b9efdeb2 100644 --- a/drivers/rtc/rtc-palmas.c +++ b/drivers/rtc/rtc-palmas.c @@ -264,7 +264,7 @@ static int palmas_rtc_probe(struct platform_device *pdev) palmas_rtc->irq = platform_get_irq(pdev, 0); - palmas_rtc->rtc = rtc_device_register(pdev->name, &pdev->dev, + palmas_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &palmas_rtc_ops, THIS_MODULE); if (IS_ERR(palmas_rtc->rtc)) { ret = PTR_ERR(palmas_rtc->rtc); @@ -272,14 +272,13 @@ static int palmas_rtc_probe(struct platform_device *pdev) return ret; } - ret = request_threaded_irq(palmas_rtc->irq, NULL, + ret = devm_request_threaded_irq(&pdev->dev, palmas_rtc->irq, NULL, palmas_rtc_interrupt, IRQF_TRIGGER_LOW | IRQF_ONESHOT | IRQF_EARLY_RESUME, dev_name(&pdev->dev), palmas_rtc); if (ret < 0) { dev_err(&pdev->dev, "IRQ request failed, err = %d\n", ret); - rtc_device_unregister(palmas_rtc->rtc); return ret; } @@ -289,11 +288,7 @@ static int palmas_rtc_probe(struct platform_device *pdev) static int palmas_rtc_remove(struct platform_device *pdev) { - struct palmas_rtc *palmas_rtc = platform_get_drvdata(pdev); - palmas_rtc_alarm_irq_enable(&pdev->dev, 0); - free_irq(palmas_rtc->irq, palmas_rtc); - rtc_device_unregister(palmas_rtc->rtc); return 0; } diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c index e0019cd0bf71..ce0982490e8c 100644 --- a/drivers/rtc/rtc-pcap.c +++ b/drivers/rtc/rtc-pcap.c @@ -139,7 +139,7 @@ static const struct rtc_class_ops pcap_rtc_ops = { .alarm_irq_enable = pcap_rtc_alarm_irq_enable, }; -static int pcap_rtc_probe(struct platform_device *pdev) +static int __init pcap_rtc_probe(struct platform_device *pdev) { struct pcap_rtc *pcap_rtc; int timer_irq, alarm_irq; @@ -183,7 +183,7 @@ fail_rtc: return err; } -static int pcap_rtc_remove(struct platform_device *pdev) +static int __exit pcap_rtc_remove(struct platform_device *pdev) { struct 
pcap_rtc *pcap_rtc = platform_get_drvdata(pdev); @@ -196,25 +196,14 @@ static int pcap_rtc_remove(struct platform_device *pdev) } static struct platform_driver pcap_rtc_driver = { - .remove = pcap_rtc_remove, + .remove = __exit_p(pcap_rtc_remove), .driver = { .name = "pcap-rtc", .owner = THIS_MODULE, }, }; -static int __init rtc_pcap_init(void) -{ - return platform_driver_probe(&pcap_rtc_driver, pcap_rtc_probe); -} - -static void __exit rtc_pcap_exit(void) -{ - platform_driver_unregister(&pcap_rtc_driver); -} - -module_init(rtc_pcap_init); -module_exit(rtc_pcap_exit); +module_platform_driver_probe(pcap_rtc_driver, pcap_rtc_probe); MODULE_DESCRIPTION("Motorola pcap rtc driver"); MODULE_AUTHOR("guiming zhuo <gmzhuo@gmail.com>"); diff --git a/drivers/rtc/rtc-pcf8523.c b/drivers/rtc/rtc-pcf8523.c index 889e3160e701..305c9515e5bb 100644 --- a/drivers/rtc/rtc-pcf8523.c +++ b/drivers/rtc/rtc-pcf8523.c @@ -307,7 +307,7 @@ static int pcf8523_probe(struct i2c_client *client, if (err < 0) return err; - pcf->rtc = rtc_device_register(DRIVER_NAME, &client->dev, + pcf->rtc = devm_rtc_device_register(&client->dev, DRIVER_NAME, &pcf8523_rtc_ops, THIS_MODULE); if (IS_ERR(pcf->rtc)) return PTR_ERR(pcf->rtc); @@ -319,10 +319,6 @@ static int pcf8523_probe(struct i2c_client *client, static int pcf8523_remove(struct i2c_client *client) { - struct pcf8523 *pcf = i2c_get_clientdata(client); - - rtc_device_unregister(pcf->rtc); - return 0; } diff --git a/drivers/rtc/rtc-ps3.c b/drivers/rtc/rtc-ps3.c index 968133ce1ee8..846722de0ce4 100644 --- a/drivers/rtc/rtc-ps3.c +++ b/drivers/rtc/rtc-ps3.c @@ -85,18 +85,7 @@ static struct platform_driver ps3_rtc_driver = { .remove = __exit_p(ps3_rtc_remove), }; -static int __init ps3_rtc_init(void) -{ - return platform_driver_probe(&ps3_rtc_driver, ps3_rtc_probe); -} - -static void __exit ps3_rtc_fini(void) -{ - platform_driver_unregister(&ps3_rtc_driver); -} - -module_init(ps3_rtc_init); -module_exit(ps3_rtc_fini); 
+module_platform_driver_probe(ps3_rtc_driver, ps3_rtc_probe); MODULE_AUTHOR("Sony Corporation"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index 03c85ee719a7..72e13dafa901 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -19,6 +19,7 @@ * */ +#include <linux/delay.h> #include <linux/init.h> #include <linux/platform_device.h> #include <linux/module.h> @@ -80,22 +81,29 @@ #define RYAR1 0x1c #define RTCPICR 0x34 #define PIAR 0x38 +#define PSBR_RTC 0x00 #define rtc_readl(pxa_rtc, reg) \ __raw_readl((pxa_rtc)->base + (reg)) #define rtc_writel(pxa_rtc, reg, value) \ __raw_writel((value), (pxa_rtc)->base + (reg)) +#define rtc_readl_psbr(pxa_rtc, reg) \ + __raw_readl((pxa_rtc)->base_psbr + (reg)) +#define rtc_writel_psbr(pxa_rtc, reg, value) \ + __raw_writel((value), (pxa_rtc)->base_psbr + (reg)) struct pxa_rtc { struct resource *ress; + struct resource *ress_psbr; void __iomem *base; + void __iomem *base_psbr; int irq_1Hz; int irq_Alrm; struct rtc_device *rtc; spinlock_t lock; /* Protects this structure */ }; - +static struct pxa_rtc *rtc_info; static u32 ryxr_calc(struct rtc_time *tm) { return ((tm->tm_year + 1900) << RYxR_YEAR_S) @@ -117,7 +125,7 @@ static void tm_calc(u32 rycr, u32 rdcr, struct rtc_time *tm) tm->tm_year = ((rycr & RYxR_YEAR_MASK) >> RYxR_YEAR_S) - 1900; tm->tm_mon = (((rycr & RYxR_MONTH_MASK) >> RYxR_MONTH_S)) - 1; tm->tm_mday = (rycr & RYxR_DAY_MASK); - tm->tm_wday = ((rycr & RDxR_DOW_MASK) >> RDxR_DOW_S) - 1; + tm->tm_wday = ((rdcr & RDxR_DOW_MASK) >> RDxR_DOW_S) - 1; tm->tm_hour = (rdcr & RDxR_HOUR_MASK) >> RDxR_HOUR_S; tm->tm_min = (rdcr & RDxR_MIN_MASK) >> RDxR_MIN_S; tm->tm_sec = rdcr & RDxR_SEC_MASK; @@ -175,7 +183,6 @@ static irqreturn_t pxa_rtc_irq(int irq, void *dev_id) /* enable back rtc interrupts */ rtc_writel(pxa_rtc, RTSR, rtsr & ~RTSR_TRIG_MASK); - spin_unlock(&pxa_rtc->lock); return IRQ_HANDLED; } @@ -250,12 +257,45 @@ static int pxa_rtc_read_time(struct device *dev, struct 
rtc_time *tm) static int pxa_rtc_set_time(struct device *dev, struct rtc_time *tm) { struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); - + /* sequence to wirte pxa rtc register RCNR RDCR RYCR is + *1. set PSBR[RWE] bit, take 2x32-khz to complete + *2. write to RTC register,take 2x32-khz to complete + *3. clear PSBR[RWE] bit,take 2x32-khz to complete + */ + if ((tm->tm_year < 70) || (tm->tm_year > 138)) + return -EINVAL; + rtc_writel_psbr(rtc_info, PSBR_RTC, 0x01); + udelay(100); rtc_writel(pxa_rtc, RYCR, ryxr_calc(tm)); rtc_writel(pxa_rtc, RDCR, rdxr_calc(tm)); + udelay(100); + rtc_writel_psbr(rtc_info, PSBR_RTC, 0x00); + udelay(100); + pxa_rtc_read_time(dev, tm); + dev_info(dev, "tm.year = %d, tm.month = %d, tm.day = %d\n", + tm->tm_year + 1900, tm->tm_mon, tm->tm_mday); + return 0; +} +int pxa_rtc_sync_time(unsigned int ticks) +{ + /* sequence to wirte pxa rtc register RCNR RDCR RYCR is + *1. set PSBR[RWE] bit, take 2x32-khz to complete + *2. write to RTC register,take 2x32-khz to complete + *3. 
clear PSBR[RWE] bit,take 2x32-khz to complete + */ + struct rtc_time tm; + rtc_time_to_tm(ticks, &tm); + rtc_writel_psbr(rtc_info, PSBR_RTC, 0x01); + udelay(100); + rtc_writel(rtc_info, RYCR, ryxr_calc(&tm)); + rtc_writel(rtc_info, RDCR, rdxr_calc(&tm)); + udelay(100); + rtc_writel_psbr(rtc_info, PSBR_RTC, 0x00); + udelay(100); return 0; } +EXPORT_SYMBOL(pxa_rtc_sync_time); static int pxa_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { @@ -327,7 +367,7 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) pxa_rtc = kzalloc(sizeof(struct pxa_rtc), GFP_KERNEL); if (!pxa_rtc) return -ENOMEM; - + rtc_info = pxa_rtc; spin_lock_init(&pxa_rtc->lock); platform_set_drvdata(pdev, pxa_rtc); @@ -337,6 +377,11 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) dev_err(dev, "No I/O memory resource defined\n"); goto err_ress; } + pxa_rtc->ress_psbr = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (!pxa_rtc->ress_psbr) { + dev_err(dev, "No I/O memory resource defined\n"); + goto err_ress; + } pxa_rtc->irq_1Hz = platform_get_irq(pdev, 0); if (pxa_rtc->irq_1Hz < 0) { @@ -348,7 +393,6 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) dev_err(dev, "No alarm IRQ resource defined\n"); goto err_ress; } - pxa_rtc_open(dev); ret = -ENOMEM; pxa_rtc->base = ioremap(pxa_rtc->ress->start, resource_size(pxa_rtc->ress)); @@ -357,6 +401,12 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) goto err_map; } + pxa_rtc->base_psbr = ioremap(pxa_rtc->ress_psbr->start, + resource_size(pxa_rtc->ress_psbr)); + if (!pxa_rtc->base_psbr) { + dev_err(&pdev->dev, "Unable to map pxa RTC PSBR I/O memory\n"); + goto err_map; + } /* * If the clock divider is uninitialized then reset it to the * default value to get the 1Hz clock. 
@@ -379,7 +429,7 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) } device_init_wakeup(dev, 1); - + pxa_rtc_open(dev); return 0; err_rtc_reg: @@ -452,18 +502,7 @@ static struct platform_driver pxa_rtc_driver = { }, }; -static int __init pxa_rtc_init(void) -{ - return platform_driver_probe(&pxa_rtc_driver, pxa_rtc_probe); -} - -static void __exit pxa_rtc_exit(void) -{ - platform_driver_unregister(&pxa_rtc_driver); -} - -module_init(pxa_rtc_init); -module_exit(pxa_rtc_exit); +module_platform_driver_probe(pxa_rtc_driver, pxa_rtc_probe); MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>"); MODULE_DESCRIPTION("PXA27x/PXA3xx Realtime Clock Driver (RTC)"); diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index 359da6d020b9..d25d2f6c0cad 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -294,18 +294,7 @@ static struct platform_driver rp5c01_rtc_driver = { .remove = __exit_p(rp5c01_rtc_remove), }; -static int __init rp5c01_rtc_init(void) -{ - return platform_driver_probe(&rp5c01_rtc_driver, rp5c01_rtc_probe); -} - -static void __exit rp5c01_rtc_fini(void) -{ - platform_driver_unregister(&rp5c01_rtc_driver); -} - -module_init(rp5c01_rtc_init); -module_exit(rp5c01_rtc_fini); +module_platform_driver_probe(rp5c01_rtc_driver, rp5c01_rtc_probe); MODULE_AUTHOR("Geert Uytterhoeven <geert@linux-m68k.org>"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index fb994e9ddc15..7dcf719d9b85 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -423,10 +423,7 @@ static void s3c_rtc_enable(struct platform_device *pdev, int en) static int s3c_rtc_remove(struct platform_device *dev) { - struct rtc_device *rtc = platform_get_drvdata(dev); - platform_set_drvdata(dev, NULL); - rtc_device_unregister(rtc); s3c_rtc_setaie(&dev->dev, 0); @@ -511,7 +508,7 @@ static int s3c_rtc_probe(struct platform_device *pdev) /* register RTC and exit */ - rtc = rtc_device_register("s3c", &pdev->dev, 
&s3c_rtcops, + rtc = devm_rtc_device_register(&pdev->dev, "s3c", &s3c_rtcops, THIS_MODULE); if (IS_ERR(rtc)) { @@ -574,7 +571,6 @@ static int s3c_rtc_probe(struct platform_device *pdev) err_alarm_irq: platform_set_drvdata(pdev, NULL); - rtc_device_unregister(rtc); err_nortc: s3c_rtc_enable(pdev, 0); diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index e55a7635ae5f..5f4708522f4d 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -820,18 +820,7 @@ static struct platform_driver sh_rtc_platform_driver = { .remove = __exit_p(sh_rtc_remove), }; -static int __init sh_rtc_init(void) -{ - return platform_driver_probe(&sh_rtc_platform_driver, sh_rtc_probe); -} - -static void __exit sh_rtc_exit(void) -{ - platform_driver_unregister(&sh_rtc_platform_driver); -} - -module_init(sh_rtc_init); -module_exit(sh_rtc_exit); +module_platform_driver_probe(sh_rtc_platform_driver, sh_rtc_probe); MODULE_DESCRIPTION("SuperH on-chip RTC driver"); MODULE_VERSION(DRV_VERSION); diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c index f7d90703db5e..b04f09a1df2a 100644 --- a/drivers/rtc/rtc-snvs.c +++ b/drivers/rtc/rtc-snvs.c @@ -283,7 +283,7 @@ static int snvs_rtc_probe(struct platform_device *pdev) return ret; } - data->rtc = rtc_device_register(pdev->name, &pdev->dev, + data->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &snvs_rtc_ops, THIS_MODULE); if (IS_ERR(data->rtc)) { ret = PTR_ERR(data->rtc); @@ -296,10 +296,6 @@ static int snvs_rtc_probe(struct platform_device *pdev) static int snvs_rtc_remove(struct platform_device *pdev) { - struct snvs_rtc_data *data = platform_get_drvdata(pdev); - - rtc_device_unregister(data->rtc); - return 0; } diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c index a18c3192ed40..db3ef610dd7c 100644 --- a/drivers/rtc/rtc-spear.c +++ b/drivers/rtc/rtc-spear.c @@ -400,8 +400,8 @@ static int spear_rtc_probe(struct platform_device *pdev) spin_lock_init(&config->lock); platform_set_drvdata(pdev, config); - 
config->rtc = rtc_device_register(pdev->name, &pdev->dev, - &spear_rtc_ops, THIS_MODULE); + config->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, + &spear_rtc_ops, THIS_MODULE); if (IS_ERR(config->rtc)) { dev_err(&pdev->dev, "can't register RTC device, err %ld\n", PTR_ERR(config->rtc)); @@ -427,7 +427,6 @@ static int spear_rtc_remove(struct platform_device *pdev) { struct spear_rtc_config *config = platform_get_drvdata(pdev); - rtc_device_unregister(config->rtc); spear_rtc_disable_interrupt(config); clk_disable_unprepare(config->clk); device_init_wakeup(&pdev->dev, 0); diff --git a/drivers/rtc/rtc-starfire.c b/drivers/rtc/rtc-starfire.c index 5be98bfd7ed3..db7d0765aabd 100644 --- a/drivers/rtc/rtc-starfire.c +++ b/drivers/rtc/rtc-starfire.c @@ -66,15 +66,4 @@ static struct platform_driver starfire_rtc_driver = { .remove = __exit_p(starfire_rtc_remove), }; -static int __init starfire_rtc_init(void) -{ - return platform_driver_probe(&starfire_rtc_driver, starfire_rtc_probe); -} - -static void __exit starfire_rtc_exit(void) -{ - platform_driver_unregister(&starfire_rtc_driver); -} - -module_init(starfire_rtc_init); -module_exit(starfire_rtc_exit); +module_platform_driver_probe(starfire_rtc_driver, starfire_rtc_probe); diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index 7e4a6f65cb91..af5e97e3f272 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -336,14 +336,13 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) } } - pdata->rtc = rtc_device_register(pdev->name, &pdev->dev, + pdata->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &stk17ta8_rtc_ops, THIS_MODULE); if (IS_ERR(pdata->rtc)) return PTR_ERR(pdata->rtc); ret = sysfs_create_bin_file(&pdev->dev.kobj, &stk17ta8_nvram_attr); - if (ret) - rtc_device_unregister(pdata->rtc); + return ret; } @@ -352,7 +351,6 @@ static int stk17ta8_rtc_remove(struct platform_device *pdev) struct rtc_plat_data *pdata = platform_get_drvdata(pdev); 
sysfs_remove_bin_file(&pdev->dev.kobj, &stk17ta8_nvram_attr); - rtc_device_unregister(pdata->rtc); if (pdata->irq > 0) writeb(0, pdata->ioaddr + RTC_INTERRUPTS); return 0; diff --git a/drivers/rtc/rtc-sun4v.c b/drivers/rtc/rtc-sun4v.c index 59b5c2dcb58c..7c51c499877b 100644 --- a/drivers/rtc/rtc-sun4v.c +++ b/drivers/rtc/rtc-sun4v.c @@ -106,18 +106,7 @@ static struct platform_driver sun4v_rtc_driver = { .remove = __exit_p(sun4v_rtc_remove), }; -static int __init sun4v_rtc_init(void) -{ - return platform_driver_probe(&sun4v_rtc_driver, sun4v_rtc_probe); -} - -static void __exit sun4v_rtc_exit(void) -{ - platform_driver_unregister(&sun4v_rtc_driver); -} - -module_init(sun4v_rtc_init); -module_exit(sun4v_rtc_exit); +module_platform_driver_probe(sun4v_rtc_driver, sun4v_rtc_probe); MODULE_AUTHOR("David S. Miller <davem@davemloft.net>"); MODULE_DESCRIPTION("SUN4V RTC driver"); diff --git a/drivers/rtc/rtc-tegra.c b/drivers/rtc/rtc-tegra.c index 7c033756d6b5..92e372429854 100644 --- a/drivers/rtc/rtc-tegra.c +++ b/drivers/rtc/rtc-tegra.c @@ -309,7 +309,7 @@ static const struct of_device_id tegra_rtc_dt_match[] = { }; MODULE_DEVICE_TABLE(of, tegra_rtc_dt_match); -static int tegra_rtc_probe(struct platform_device *pdev) +static int __init tegra_rtc_probe(struct platform_device *pdev) { struct tegra_rtc_info *info; struct resource *res; @@ -348,8 +348,8 @@ static int tegra_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); - info->rtc_dev = rtc_device_register( - pdev->name, &pdev->dev, &tegra_rtc_ops, THIS_MODULE); + info->rtc_dev = devm_rtc_device_register( + &pdev->dev, pdev->name, &tegra_rtc_ops, THIS_MODULE); if (IS_ERR(info->rtc_dev)) { ret = PTR_ERR(info->rtc_dev); info->rtc_dev = NULL; @@ -374,17 +374,11 @@ static int tegra_rtc_probe(struct platform_device *pdev) return 0; err_dev_unreg: - rtc_device_unregister(info->rtc_dev); - return ret; } -static int tegra_rtc_remove(struct platform_device *pdev) +static int __exit tegra_rtc_remove(struct 
platform_device *pdev) { - struct tegra_rtc_info *info = platform_get_drvdata(pdev); - - rtc_device_unregister(info->rtc_dev); - platform_set_drvdata(pdev, NULL); return 0; @@ -439,7 +433,7 @@ static void tegra_rtc_shutdown(struct platform_device *pdev) MODULE_ALIAS("platform:tegra_rtc"); static struct platform_driver tegra_rtc_driver = { - .remove = tegra_rtc_remove, + .remove = __exit_p(tegra_rtc_remove), .shutdown = tegra_rtc_shutdown, .driver = { .name = "tegra_rtc", @@ -452,17 +446,7 @@ static struct platform_driver tegra_rtc_driver = { #endif }; -static int __init tegra_rtc_init(void) -{ - return platform_driver_probe(&tegra_rtc_driver, tegra_rtc_probe); -} -module_init(tegra_rtc_init); - -static void __exit tegra_rtc_exit(void) -{ - platform_driver_unregister(&tegra_rtc_driver); -} -module_exit(tegra_rtc_exit); +module_platform_driver_probe(tegra_rtc_driver, tegra_rtc_probe); MODULE_AUTHOR("Jon Mayo <jmayo@nvidia.com>"); MODULE_DESCRIPTION("driver for Tegra internal RTC"); diff --git a/drivers/rtc/rtc-tps6586x.c b/drivers/rtc/rtc-tps6586x.c index aab4e8c93622..b6aab9f80f73 100644 --- a/drivers/rtc/rtc-tps6586x.c +++ b/drivers/rtc/rtc-tps6586x.c @@ -274,7 +274,7 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) } platform_set_drvdata(pdev, rtc); - rtc->rtc = rtc_device_register(dev_name(&pdev->dev), &pdev->dev, + rtc->rtc = devm_rtc_device_register(&pdev->dev, dev_name(&pdev->dev), &tps6586x_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) { ret = PTR_ERR(rtc->rtc); @@ -306,12 +306,10 @@ fail_rtc_register: static int tps6586x_rtc_remove(struct platform_device *pdev) { - struct tps6586x_rtc *rtc = platform_get_drvdata(pdev); struct device *tps_dev = to_tps6586x_dev(&pdev->dev); tps6586x_update(tps_dev, RTC_CTRL, 0, RTC_ENABLE | OSC_SRC_SEL | PRE_BYPASS | CL_SEL_MASK); - rtc_device_unregister(rtc->rtc); return 0; } diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c index 8bd8115329b5..ef5d199af298 100644 --- 
a/drivers/rtc/rtc-tps65910.c +++ b/drivers/rtc/rtc-tps65910.c @@ -276,7 +276,7 @@ static int tps65910_rtc_probe(struct platform_device *pdev) tps_rtc->irq = irq; device_set_wakeup_capable(&pdev->dev, 1); - tps_rtc->rtc = rtc_device_register(pdev->name, &pdev->dev, + tps_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &tps65910_rtc_ops, THIS_MODULE); if (IS_ERR(tps_rtc->rtc)) { ret = PTR_ERR(tps_rtc->rtc); @@ -295,12 +295,8 @@ static int tps65910_rtc_probe(struct platform_device *pdev) */ static int tps65910_rtc_remove(struct platform_device *pdev) { - /* leave rtc running, but disable irqs */ - struct tps65910_rtc *tps_rtc = platform_get_drvdata(pdev); - tps65910_rtc_alarm_irq_enable(&pdev->dev, 0); - rtc_device_unregister(tps_rtc->rtc); return 0; } diff --git a/drivers/rtc/rtc-tps80031.c b/drivers/rtc/rtc-tps80031.c index 9aaf8aaebae9..dc62d5fa2a29 100644 --- a/drivers/rtc/rtc-tps80031.c +++ b/drivers/rtc/rtc-tps80031.c @@ -277,7 +277,7 @@ static int tps80031_rtc_probe(struct platform_device *pdev) return ret; } - rtc->rtc = rtc_device_register(pdev->name, &pdev->dev, + rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &tps80031_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) { ret = PTR_ERR(rtc->rtc); @@ -292,7 +292,6 @@ static int tps80031_rtc_probe(struct platform_device *pdev) if (ret < 0) { dev_err(&pdev->dev, "request IRQ:%d failed, err = %d\n", rtc->irq, ret); - rtc_device_unregister(rtc->rtc); return ret; } device_set_wakeup_capable(&pdev->dev, 1); @@ -301,9 +300,6 @@ static int tps80031_rtc_probe(struct platform_device *pdev) static int tps80031_rtc_remove(struct platform_device *pdev) { - struct tps80031_rtc *rtc = platform_get_drvdata(pdev); - - rtc_device_unregister(rtc->rtc); return 0; } diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index a12bfac49d36..f9a0677e4e3b 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -268,14 +268,13 @@ static int __init tx4939_rtc_probe(struct platform_device 
*pdev) if (devm_request_irq(&pdev->dev, irq, tx4939_rtc_interrupt, 0, pdev->name, &pdev->dev) < 0) return -EBUSY; - rtc = rtc_device_register(pdev->name, &pdev->dev, + rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &tx4939_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) return PTR_ERR(rtc); pdata->rtc = rtc; ret = sysfs_create_bin_file(&pdev->dev.kobj, &tx4939_rtc_nvram_attr); - if (ret) - rtc_device_unregister(rtc); + return ret; } @@ -284,7 +283,6 @@ static int __exit tx4939_rtc_remove(struct platform_device *pdev) struct tx4939rtc_plat_data *pdata = platform_get_drvdata(pdev); sysfs_remove_bin_file(&pdev->dev.kobj, &tx4939_rtc_nvram_attr); - rtc_device_unregister(pdata->rtc); spin_lock_irq(&pdata->lock); tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP); spin_unlock_irq(&pdata->lock); @@ -299,18 +297,7 @@ static struct platform_driver tx4939_rtc_driver = { }, }; -static int __init tx4939rtc_init(void) -{ - return platform_driver_probe(&tx4939_rtc_driver, tx4939_rtc_probe); -} - -static void __exit tx4939rtc_exit(void) -{ - platform_driver_unregister(&tx4939_rtc_driver); -} - -module_init(tx4939rtc_init); -module_exit(tx4939rtc_exit); +module_platform_driver_probe(tx4939_rtc_driver, tx4939_rtc_probe); MODULE_AUTHOR("Atsushi Nemoto <anemo@mba.ocn.ne.jp>"); MODULE_DESCRIPTION("TX4939 internal RTC driver"); diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index bca5d677bc85..600798cd4d0c 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -49,18 +49,13 @@ struct v3020_chip_ops { #define V3020_RD 2 #define V3020_IO 3 -struct v3020_gpio { - const char *name; - unsigned int gpio; -}; - struct v3020 { /* MMIO access */ void __iomem *ioaddress; int leftshift; /* GPIO access */ - struct v3020_gpio *gpio; + struct gpio *gpio; struct v3020_chip_ops *ops; @@ -107,48 +102,40 @@ static struct v3020_chip_ops v3020_mmio_ops = { .write_bit = v3020_mmio_write_bit, }; -static struct v3020_gpio v3020_gpio[] = { - { "RTC CS", 0 }, - { "RTC WR", 
0 }, - { "RTC RD", 0 }, - { "RTC IO", 0 }, +static struct gpio v3020_gpio[] = { + { 0, GPIOF_OUT_INIT_HIGH, "RTC CS"}, + { 0, GPIOF_OUT_INIT_HIGH, "RTC WR"}, + { 0, GPIOF_OUT_INIT_HIGH, "RTC RD"}, + { 0, GPIOF_OUT_INIT_HIGH, "RTC IO"}, }; static int v3020_gpio_map(struct v3020 *chip, struct platform_device *pdev, struct v3020_platform_data *pdata) { - int i, err; + int err; v3020_gpio[V3020_CS].gpio = pdata->gpio_cs; v3020_gpio[V3020_WR].gpio = pdata->gpio_wr; v3020_gpio[V3020_RD].gpio = pdata->gpio_rd; v3020_gpio[V3020_IO].gpio = pdata->gpio_io; - for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) { - err = gpio_request(v3020_gpio[i].gpio, v3020_gpio[i].name); - if (err) - goto err_request; - - gpio_direction_output(v3020_gpio[i].gpio, 1); - } + err = gpio_request_array(v3020_gpio, ARRAY_SIZE(v3020_gpio)); + if (err) + goto err_request; chip->gpio = v3020_gpio; return 0; err_request: - while (--i >= 0) - gpio_free(v3020_gpio[i].gpio); + gpio_free_array(v3020_gpio, ARRAY_SIZE(v3020_gpio)); return err; } static void v3020_gpio_unmap(struct v3020 *chip) { - int i; - - for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) - gpio_free(v3020_gpio[i].gpio); + gpio_free_array(v3020_gpio, ARRAY_SIZE(v3020_gpio)); } static void v3020_gpio_write_bit(struct v3020 *chip, unsigned char bit) diff --git a/drivers/rtc/rtc-vt8500.c b/drivers/rtc/rtc-vt8500.c index a000bc0a8bff..d89efee6d29e 100644 --- a/drivers/rtc/rtc-vt8500.c +++ b/drivers/rtc/rtc-vt8500.c @@ -252,7 +252,7 @@ static int vt8500_rtc_probe(struct platform_device *pdev) writel(VT8500_RTC_CR_ENABLE, vt8500_rtc->regbase + VT8500_RTC_CR); - vt8500_rtc->rtc = rtc_device_register("vt8500-rtc", &pdev->dev, + vt8500_rtc->rtc = devm_rtc_device_register(&pdev->dev, "vt8500-rtc", &vt8500_rtc_ops, THIS_MODULE); if (IS_ERR(vt8500_rtc->rtc)) { ret = PTR_ERR(vt8500_rtc->rtc); @@ -266,13 +266,11 @@ static int vt8500_rtc_probe(struct platform_device *pdev) if (ret < 0) { dev_err(&pdev->dev, "can't get irq %i, err %d\n", vt8500_rtc->irq_alarm, 
ret); - goto err_unreg; + goto err_return; } return 0; -err_unreg: - rtc_device_unregister(vt8500_rtc->rtc); err_return: return ret; } @@ -281,8 +279,6 @@ static int vt8500_rtc_remove(struct platform_device *pdev) { struct vt8500_rtc *vt8500_rtc = platform_get_drvdata(pdev); - rtc_device_unregister(vt8500_rtc->rtc); - /* Disable alarm matching */ writel(0, vt8500_rtc->regbase + VT8500_RTC_IS); diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index 2f0ac7b30a0c..8d65b94e5a7e 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -436,7 +436,7 @@ static int wm831x_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); - wm831x_rtc->rtc = rtc_device_register("wm831x", &pdev->dev, + wm831x_rtc->rtc = devm_rtc_device_register(&pdev->dev, "wm831x", &wm831x_rtc_ops, THIS_MODULE); if (IS_ERR(wm831x_rtc->rtc)) { ret = PTR_ERR(wm831x_rtc->rtc); @@ -462,10 +462,6 @@ err: static int wm831x_rtc_remove(struct platform_device *pdev) { - struct wm831x_rtc *wm831x_rtc = platform_get_drvdata(pdev); - - rtc_device_unregister(wm831x_rtc->rtc); - return 0; } diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c index 08c3bc398da2..43bb55827f30 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -2161,7 +2161,7 @@ static void fcoe_ctlr_vn_restart(struct fcoe_ctlr *fip) if (fip->probe_tries < FIP_VN_RLIM_COUNT) { fip->probe_tries++; - wait = random32() % FIP_VN_PROBE_WAIT; + wait = prandom_u32() % FIP_VN_PROBE_WAIT; } else wait = FIP_VN_RLIM_INT; mod_timer(&fip->timer, jiffies + msecs_to_jiffies(wait)); @@ -2794,7 +2794,7 @@ static void fcoe_ctlr_vn_timeout(struct fcoe_ctlr *fip) fcoe_all_vn2vn, 0); fip->port_ka_time = jiffies + msecs_to_jiffies(FIP_VN_BEACON_INT + - (random32() % FIP_VN_BEACON_FUZZ)); + (prandom_u32() % FIP_VN_BEACON_FUZZ)); } if (time_before(fip->port_ka_time, next_time)) next_time = fip->port_ka_time; diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c 
b/drivers/scsi/lpfc/lpfc_hbadisc.c index d7096ad94d3f..bfda18467ee6 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -1732,7 +1732,7 @@ lpfc_check_pending_fcoe_event(struct lpfc_hba *phba, uint8_t unreg_fcf) * use through a sequence of @fcf_cnt eligible FCF records with equal * probability. To perform integer manunipulation of random numbers with * size unit32_t, the lower 16 bits of the 32-bit random number returned - * from random32() are taken as the random random number generated. + * from prandom_u32() are taken as the random random number generated. * * Returns true when outcome is for the newly read FCF record should be * chosen; otherwise, return false when outcome is for keeping the previously @@ -1744,7 +1744,7 @@ lpfc_sli4_new_fcf_random_select(struct lpfc_hba *phba, uint32_t fcf_cnt) uint32_t rand_num; /* Get 16-bit uniform random number */ - rand_num = (0xFFFF & random32()); + rand_num = 0xFFFF & prandom_u32(); /* Decision with probability 1/fcf_cnt */ if ((fcf_cnt * rand_num) < 0xFFFF) @@ -2380,7 +2380,7 @@ lpfc_mbx_cmpl_fcf_scan_read_fcf_rec(struct lpfc_hba *phba, LPFC_MBOXQ_t *mboxq) phba->fcf.eligible_fcf_cnt = 1; /* Seeding the random number generator for random selection */ seed = (uint32_t)(0xFFFFFFFF & jiffies); - srandom32(seed); + prandom_seed(seed); } spin_unlock_irq(&phba->hbalock); goto read_next_fcf; diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 1d82eef4e1eb..04bf7b8c98d4 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -1940,8 +1940,11 @@ qla24xx_vport_delete(struct fc_vport *fc_vport) /* No pending activities shall be there on the vha now */ if (ql2xextended_error_logging & ql_dbg_user) - msleep(random32()%10); /* Just to see if something falls on - * the net we have placed below */ + msleep(prandom_u32() % 10); + /* + * Just to see if something falls on the net we have placed + * below + */ 
BUG_ON(atomic_read(&vha->vref_count)); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9f0c46547459..df5e961484e1 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -35,6 +35,7 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/aio.h> #include <linux/errno.h> #include <linux/mtio.h> #include <linux/ioctl.h> diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c index b14a55742559..b040200a5a55 100644 --- a/drivers/staging/android/logger.c +++ b/drivers/staging/android/logger.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/vmalloc.h> +#include <linux/aio.h> #include "logger.h" #include <asm/ioctls.h> diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig index 2da6cc444c7e..f47653b623c1 100644 --- a/drivers/staging/zcache/Kconfig +++ b/drivers/staging/zcache/Kconfig @@ -1,5 +1,5 @@ config ZCACHE - bool "Dynamic compression of swap pages and clean pagecache pages" + tristate "Dynamic compression of swap pages and clean pagecache pages" depends on CRYPTO=y && SWAP=y && CLEANCACHE && FRONTSWAP select CRYPTO_LZO default n @@ -19,8 +19,8 @@ config ZCACHE_DEBUG how zcache is doing. You probably want to set this to 'N'. 
config RAMSTER - bool "Cross-machine RAM capacity sharing, aka peer-to-peer tmem" - depends on CONFIGFS_FS=y && SYSFS=y && !HIGHMEM && ZCACHE=y + tristate "Cross-machine RAM capacity sharing, aka peer-to-peer tmem" + depends on CONFIGFS_FS=y && SYSFS=y && !HIGHMEM && ZCACHE depends on NET # must ensure struct page is 8-byte aligned select HAVE_ALIGNED_STRUCT_PAGE if !64_BIT diff --git a/drivers/staging/zcache/ramster.h b/drivers/staging/zcache/ramster.h index 1b71aea2ff62..e1f91d5a0f6a 100644 --- a/drivers/staging/zcache/ramster.h +++ b/drivers/staging/zcache/ramster.h @@ -11,10 +11,14 @@ #ifndef _ZCACHE_RAMSTER_H_ #define _ZCACHE_RAMSTER_H_ +#ifdef CONFIG_RAMSTER_MODULE +#define CONFIG_RAMSTER +#endif + #ifdef CONFIG_RAMSTER #include "ramster/ramster.h" #else -static inline void ramster_init(bool x, bool y, bool z) +static inline void ramster_init(bool x, bool y, bool z, bool w) { } diff --git a/drivers/staging/zcache/ramster/nodemanager.c b/drivers/staging/zcache/ramster/nodemanager.c index c0f48158735d..2cfe93342c0d 100644 --- a/drivers/staging/zcache/ramster/nodemanager.c +++ b/drivers/staging/zcache/ramster/nodemanager.c @@ -949,7 +949,7 @@ static void __exit exit_r2nm(void) r2hb_exit(); } -static int __init init_r2nm(void) +int r2nm_init(void) { int ret = -1; @@ -986,10 +986,11 @@ out_r2hb: out: return ret; } +EXPORT_SYMBOL_GPL(r2nm_init); MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); -/* module_init(init_r2nm) */ -late_initcall(init_r2nm); -/* module_exit(exit_r2nm) */ +#ifndef CONFIG_RAMSTER_MODULE +late_initcall(r2nm_init); +#endif diff --git a/drivers/staging/zcache/ramster/ramster.c b/drivers/staging/zcache/ramster/ramster.c index bf96a1cbf7c1..4f715c791188 100644 --- a/drivers/staging/zcache/ramster/ramster.c +++ b/drivers/staging/zcache/ramster/ramster.c @@ -92,7 +92,7 @@ static ssize_t ramster_remote_page_flushes_failed; #include <linux/debugfs.h> #define zdfs debugfs_create_size_t #define zdfs64 debugfs_create_u64 -static int __init 
ramster_debugfs_init(void) +static int ramster_debugfs_init(void) { struct dentry *root = debugfs_create_dir("ramster", NULL); if (root == NULL) @@ -191,6 +191,7 @@ int ramster_do_preload_flnode(struct tmem_pool *pool) kmem_cache_free(ramster_flnode_cache, flnode); return ret; } +EXPORT_SYMBOL_GPL(ramster_do_preload_flnode); /* * Called by the message handler after a (still compressed) page has been @@ -458,6 +459,7 @@ void *ramster_pampd_free(void *pampd, struct tmem_pool *pool, } return local_pampd; } +EXPORT_SYMBOL_GPL(ramster_pampd_free); void ramster_count_foreign_pages(bool eph, int count) { @@ -489,6 +491,7 @@ void ramster_count_foreign_pages(bool eph, int count) ramster_foreign_pers_pages = c; } } +EXPORT_SYMBOL_GPL(ramster_count_foreign_pages); /* * For now, just push over a few pages every few seconds to @@ -674,7 +677,7 @@ requeue: ramster_remotify_queue_delayed_work(HZ); } -void __init ramster_remotify_init(void) +void ramster_remotify_init(void) { unsigned long n = 60UL; ramster_remotify_workqueue = @@ -849,8 +852,10 @@ static bool frontswap_selfshrinking __read_mostly; static void selfshrink_process(struct work_struct *work); static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process); +#ifndef CONFIG_RAMSTER_MODULE /* Enable/disable with kernel boot option. 
*/ static bool use_frontswap_selfshrink __initdata = true; +#endif /* * The default values for the following parameters were deemed reasonable @@ -905,6 +910,7 @@ static void frontswap_selfshrink(void) frontswap_shrink(tgt_frontswap_pages); } +#ifndef CONFIG_RAMSTER_MODULE static int __init ramster_nofrontswap_selfshrink_setup(char *s) { use_frontswap_selfshrink = false; @@ -912,6 +918,7 @@ static int __init ramster_nofrontswap_selfshrink_setup(char *s) } __setup("noselfshrink", ramster_nofrontswap_selfshrink_setup); +#endif static void selfshrink_process(struct work_struct *work) { @@ -930,6 +937,7 @@ void ramster_cpu_up(int cpu) per_cpu(ramster_remoteputmem1, cpu) = p1; per_cpu(ramster_remoteputmem2, cpu) = p2; } +EXPORT_SYMBOL_GPL(ramster_cpu_up); void ramster_cpu_down(int cpu) { @@ -945,6 +953,7 @@ void ramster_cpu_down(int cpu) kp->flnode = NULL; } } +EXPORT_SYMBOL_GPL(ramster_cpu_down); void ramster_register_pamops(struct tmem_pamops *pamops) { @@ -955,9 +964,11 @@ void ramster_register_pamops(struct tmem_pamops *pamops) pamops->repatriate = ramster_pampd_repatriate; pamops->repatriate_preload = ramster_pampd_repatriate_preload; } +EXPORT_SYMBOL_GPL(ramster_register_pamops); -void __init ramster_init(bool cleancache, bool frontswap, - bool frontswap_exclusive_gets) +void ramster_init(bool cleancache, bool frontswap, + bool frontswap_exclusive_gets, + bool frontswap_selfshrink) { int ret = 0; @@ -972,10 +983,17 @@ void __init ramster_init(bool cleancache, bool frontswap, if (ret) pr_err("ramster: can't create sysfs for ramster\n"); (void)r2net_register_handlers(); +#ifdef CONFIG_RAMSTER_MODULE + ret = r2nm_init(); + if (ret) + pr_err("ramster: can't init r2net\n"); + frontswap_selfshrinking = frontswap_selfshrink; +#else + frontswap_selfshrinking = use_frontswap_selfshrink; +#endif INIT_LIST_HEAD(&ramster_rem_op_list); ramster_flnode_cache = kmem_cache_create("ramster_flnode", sizeof(struct flushlist_node), 0, 0, NULL); - frontswap_selfshrinking = 
use_frontswap_selfshrink; if (frontswap_selfshrinking) { pr_info("ramster: Initializing frontswap selfshrink driver.\n"); schedule_delayed_work(&selfshrink_worker, @@ -983,3 +1001,4 @@ void __init ramster_init(bool cleancache, bool frontswap, } ramster_remotify_init(); } +EXPORT_SYMBOL_GPL(ramster_init); diff --git a/drivers/staging/zcache/ramster/ramster.h b/drivers/staging/zcache/ramster/ramster.h index 12ae56f09ca4..6d41a7a772e3 100644 --- a/drivers/staging/zcache/ramster/ramster.h +++ b/drivers/staging/zcache/ramster/ramster.h @@ -147,7 +147,7 @@ extern int r2net_register_handlers(void); extern int r2net_remote_target_node_set(int); extern int ramster_remotify_pageframe(bool); -extern void ramster_init(bool, bool, bool); +extern void ramster_init(bool, bool, bool, bool); extern void ramster_register_pamops(struct tmem_pamops *); extern int ramster_localify(int, struct tmem_oid *oidp, uint32_t, char *, unsigned int, void *); diff --git a/drivers/staging/zcache/ramster/ramster_nodemanager.h b/drivers/staging/zcache/ramster/ramster_nodemanager.h index 49f879d943ab..dbaae34ea613 100644 --- a/drivers/staging/zcache/ramster/ramster_nodemanager.h +++ b/drivers/staging/zcache/ramster/ramster_nodemanager.h @@ -36,4 +36,6 @@ /* host name, group name, cluster name all 64 bytes */ #define R2NM_MAX_NAME_LEN 64 /* __NEW_UTS_LEN */ +extern int r2nm_init(void); + #endif /* _RAMSTER_NODEMANAGER_H */ diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c index a2b7e03b6062..d7e51e4152eb 100644 --- a/drivers/staging/zcache/tmem.c +++ b/drivers/staging/zcache/tmem.c @@ -35,7 +35,8 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> -#ifdef CONFIG_RAMSTER +#include <linux/export.h> +#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE) #include <linux/delay.h> #endif @@ -641,6 +642,7 @@ void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp, /* note, hashbucket remains locked */ return pampd; } 
+EXPORT_SYMBOL_GPL(tmem_localify_get_pampd); void tmem_localify_finish(struct tmem_obj *obj, uint32_t index, void *pampd, void *saved_hb, bool delete) @@ -658,6 +660,7 @@ void tmem_localify_finish(struct tmem_obj *obj, uint32_t index, } spin_unlock(&hb->lock); } +EXPORT_SYMBOL_GPL(tmem_localify_finish); /* * For ramster only. Helper function to support asynchronous tmem_get. @@ -719,6 +722,7 @@ out: spin_unlock(&hb->lock); return ret; } +EXPORT_SYMBOL_GPL(tmem_replace); #endif /* diff --git a/drivers/staging/zcache/tmem.h b/drivers/staging/zcache/tmem.h index adbe5a8f28aa..d128ce290f1f 100644 --- a/drivers/staging/zcache/tmem.h +++ b/drivers/staging/zcache/tmem.h @@ -126,7 +126,7 @@ static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) TMEM_HASH_BUCKET_BITS); } -#ifdef CONFIG_RAMSTER +#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE) struct tmem_xhandle { uint8_t client_id; uint8_t xh_data_cksum; @@ -171,7 +171,7 @@ struct tmem_obj { unsigned int objnode_tree_height; unsigned long objnode_count; long pampd_count; -#ifdef CONFIG_RAMSTER +#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE) /* * for current design of ramster, all pages belonging to * an object reside on the same remotenode and extra is @@ -215,7 +215,7 @@ struct tmem_pamops { uint32_t); void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t, bool); -#ifdef CONFIG_RAMSTER +#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE) void (*new_obj)(struct tmem_obj *); void (*free_obj)(struct tmem_pool *, struct tmem_obj *, bool); void *(*repatriate_preload)(void *, struct tmem_pool *, @@ -247,7 +247,7 @@ extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); extern int tmem_destroy_pool(struct tmem_pool *); extern void tmem_new_pool(struct tmem_pool *, uint32_t); -#ifdef CONFIG_RAMSTER +#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE) extern int tmem_replace(struct 
tmem_pool *, struct tmem_oid *, uint32_t index, void *); extern void *tmem_localify_get_pampd(struct tmem_pool *, struct tmem_oid *, diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c index 7a6dd966931b..4e52a949d12a 100644 --- a/drivers/staging/zcache/zcache-main.c +++ b/drivers/staging/zcache/zcache-main.c @@ -37,8 +37,10 @@ #include "debug.h" #ifdef CONFIG_RAMSTER static bool ramster_enabled __read_mostly; +static int disable_frontswap_selfshrink; #else #define ramster_enabled false +#define disable_frontswap_selfshrink 0 #endif #ifndef __PG_WAS_ACTIVE @@ -75,8 +77,12 @@ static char *namestr __read_mostly = "zcache"; (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) /* crypto API for zcache */ +#ifdef CONFIG_ZCACHE_MODULE +static char *zcache_comp_name = "lzo"; +#else #define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME static char zcache_comp_name[ZCACHE_COMP_NAME_SZ] __read_mostly; +#endif static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms __read_mostly; enum comp_op { @@ -1483,9 +1489,9 @@ static struct cleancache_ops zcache_cleancache_ops = { .init_fs = zcache_cleancache_init_fs }; -struct cleancache_ops zcache_cleancache_register_ops(void) +struct cleancache_ops *zcache_cleancache_register_ops(void) { - struct cleancache_ops old_ops = + struct cleancache_ops *old_ops = cleancache_register_ops(&zcache_cleancache_ops); return old_ops; @@ -1614,9 +1620,9 @@ static struct frontswap_ops zcache_frontswap_ops = { .init = zcache_frontswap_init }; -struct frontswap_ops zcache_frontswap_register_ops(void) +struct frontswap_ops *zcache_frontswap_register_ops(void) { - struct frontswap_ops old_ops = + struct frontswap_ops *old_ops = frontswap_register_ops(&zcache_frontswap_ops); return old_ops; @@ -1628,6 +1634,7 @@ struct frontswap_ops zcache_frontswap_register_ops(void) * OR NOTHING HAPPENS! 
*/ +#ifndef CONFIG_ZCACHE_MODULE static int __init enable_zcache(char *s) { zcache_enabled = true; @@ -1694,18 +1701,27 @@ static int __init enable_zcache_compressor(char *s) return 1; } __setup("zcache=", enable_zcache_compressor); +#endif -static int __init zcache_comp_init(void) +static int zcache_comp_init(void) { int ret = 0; /* check crypto algorithm */ +#ifdef CONFIG_ZCACHE_MODULE + ret = crypto_has_comp(zcache_comp_name, 0, 0); + if (!ret) { + ret = -1; + goto out; + } +#else if (*zcache_comp_name != '\0') { ret = crypto_has_comp(zcache_comp_name, 0, 0); if (!ret) pr_info("zcache: %s not supported\n", zcache_comp_name); + goto out; } if (!ret) strcpy(zcache_comp_name, "lzo"); @@ -1714,6 +1730,7 @@ static int __init zcache_comp_init(void) ret = 1; goto out; } +#endif pr_info("zcache: using %s compressor\n", zcache_comp_name); /* alloc percpu transforms */ @@ -1725,10 +1742,13 @@ out: return ret; } -static int __init zcache_init(void) +static int zcache_init(void) { int ret = 0; +#ifdef CONFIG_ZCACHE_MODULE + zcache_enabled = 1; +#endif if (ramster_enabled) { namestr = "ramster"; ramster_register_pamops(&zcache_pamops); @@ -1769,7 +1789,7 @@ static int __init zcache_init(void) } zbud_init(); if (zcache_enabled && !disable_cleancache) { - struct cleancache_ops old_ops; + struct cleancache_ops *old_ops; register_shrinker(&zcache_shrinker); old_ops = zcache_cleancache_register_ops(); @@ -1779,11 +1799,11 @@ static int __init zcache_init(void) pr_info("%s: cleancache: ignorenonactive = %d\n", namestr, !disable_cleancache_ignore_nonactive); #endif - if (old_ops.init_fs != NULL) + if (old_ops != NULL) pr_warn("%s: cleancache_ops overridden\n", namestr); } if (zcache_enabled && !disable_frontswap) { - struct frontswap_ops old_ops; + struct frontswap_ops *old_ops; old_ops = zcache_frontswap_register_ops(); if (frontswap_has_exclusive_gets) @@ -1795,14 +1815,36 @@ static int __init zcache_init(void) namestr, frontswap_has_exclusive_gets, 
!disable_frontswap_ignore_nonactive); #endif - if (old_ops.init != NULL) + if (IS_ERR(old_ops) || old_ops) { + if (IS_ERR(old_ops)) + return PTR_RET(old_ops); pr_warn("%s: frontswap_ops overridden\n", namestr); + } } if (ramster_enabled) ramster_init(!disable_cleancache, !disable_frontswap, - frontswap_has_exclusive_gets); + frontswap_has_exclusive_gets, + !disable_frontswap_selfshrink); out: return ret; } +#ifdef CONFIG_ZCACHE_MODULE +#ifdef CONFIG_RAMSTER +module_param(ramster_enabled, int, S_IRUGO); +module_param(disable_frontswap_selfshrink, int, S_IRUGO); +#endif +module_param(disable_cleancache, int, S_IRUGO); +module_param(disable_frontswap, int, S_IRUGO); +#ifdef FRONTSWAP_HAS_EXCLUSIVE_GETS +module_param(frontswap_has_exclusive_gets, bool, S_IRUGO); +#endif +module_param(disable_frontswap_ignore_nonactive, int, S_IRUGO); +module_param(zcache_comp_name, charp, S_IRUGO); +module_init(zcache_init); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); +MODULE_DESCRIPTION("In-kernel compression of cleancache/frontswap pages"); +#else late_initcall(zcache_init); +#endif diff --git a/drivers/staging/zcache/zcache.h b/drivers/staging/zcache/zcache.h index 81722b33b087..849120095e79 100644 --- a/drivers/staging/zcache/zcache.h +++ b/drivers/staging/zcache/zcache.h @@ -39,7 +39,7 @@ extern int zcache_flush_page(int, int, struct tmem_oid *, uint32_t); extern int zcache_flush_object(int, int, struct tmem_oid *); extern void zcache_decompress_to_page(char *, unsigned int, struct page *); -#ifdef CONFIG_RAMSTER +#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE) extern void *zcache_pampd_create(char *, unsigned int, bool, int, struct tmem_handle *); int zcache_autocreate_pool(unsigned int cli_id, unsigned int pool_id, bool eph); diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index f52dcfe8f545..f5c32e975c4d 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ 
-3099,7 +3099,7 @@ static int init_dma_pools(struct udc *dev) } /* DMA setup */ - dev->data_requests = dma_pool_create("data_requests", NULL, + dev->data_requests = dma_pool_create("data_requests", &dev->pdev->dev, sizeof(struct udc_data_dma), 0, 0); if (!dev->data_requests) { DBG(dev, "can't get request data pool\n"); @@ -3111,7 +3111,7 @@ static int init_dma_pools(struct udc *dev) dev->ep[UDC_EP0IN_IX].dma = &dev->regs->ctl; /* dma desc for setup data */ - dev->stp_requests = dma_pool_create("setup requests", NULL, + dev->stp_requests = dma_pool_create("setup requests", &dev->pdev->dev, sizeof(struct udc_stp_dma), 0, 0); if (!dev->stp_requests) { DBG(dev, "can't get stp request pool\n"); @@ -3232,6 +3232,10 @@ static int udc_pci_probe( pci_set_master(pdev); pci_try_set_mwi(pdev); + dev->phys_addr = resource; + dev->irq = pdev->irq; + dev->pdev = pdev; + /* init dma pools */ if (use_dma) { retval = init_dma_pools(dev); @@ -3239,10 +3243,6 @@ static int udc_pci_probe( goto finished; } - dev->phys_addr = resource; - dev->irq = pdev->irq; - dev->pdev = pdev; - /* general probing */ if (udc_probe(dev) == 0) return 0; diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index e2b2e9cf254a..5cc4e7eed4a9 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -24,6 +24,8 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/poll.h> +#include <linux/mmu_context.h> +#include <linux/aio.h> #include <linux/device.h> #include <linux/moduleparam.h> @@ -513,6 +515,9 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) struct kiocb_priv { struct usb_request *req; struct ep_data *epdata; + struct kiocb *iocb; + struct mm_struct *mm; + struct work_struct work; void *buf; const struct iovec *iv; unsigned long nr_segs; @@ -528,7 +533,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) local_irq_disable(); epdata = priv->epdata; // spin_lock(&epdata->dev->lock); - kiocbSetCancelled(iocb); if 
(likely(epdata && epdata->ep && priv->req)) value = usb_ep_dequeue (epdata->ep, priv->req); else @@ -540,15 +544,12 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) return value; } -static ssize_t ep_aio_read_retry(struct kiocb *iocb) +static ssize_t ep_copy_to_user(struct kiocb_priv *priv) { - struct kiocb_priv *priv = iocb->private; ssize_t len, total; void *to_copy; int i; - /* we "retry" to get the right mm context for this: */ - /* copy stuff into user buffers */ total = priv->actual; len = 0; @@ -568,9 +569,26 @@ static ssize_t ep_aio_read_retry(struct kiocb *iocb) if (total == 0) break; } + + return len; +} + +static void ep_user_copy_worker(struct work_struct *work) +{ + struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work); + struct mm_struct *mm = priv->mm; + struct kiocb *iocb = priv->iocb; + size_t ret; + + use_mm(mm); + ret = ep_copy_to_user(priv); + unuse_mm(mm); + + /* completing the iocb can drop the ctx and mm, don't touch mm after */ + aio_complete(iocb, ret, ret); + kfree(priv->buf); kfree(priv); - return len; } static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) @@ -596,14 +614,14 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) aio_complete(iocb, req->actual ? 
req->actual : req->status, req->status); } else { - /* retry() won't report both; so we hide some faults */ + /* ep_copy_to_user() won't report both; we hide some faults */ if (unlikely(0 != req->status)) DBG(epdata->dev, "%s fault %d len %d\n", ep->name, req->status, req->actual); priv->buf = req->buf; priv->actual = req->actual; - kick_iocb(iocb); + schedule_work(&priv->work); } spin_unlock(&epdata->dev->lock); @@ -633,8 +651,10 @@ fail: return value; } iocb->private = priv; + priv->iocb = iocb; priv->iv = iv; priv->nr_segs = nr_segs; + INIT_WORK(&priv->work, ep_user_copy_worker); value = get_ready_ep(iocb->ki_filp->f_flags, epdata); if (unlikely(value < 0)) { @@ -642,10 +662,11 @@ fail: goto fail; } - iocb->ki_cancel = ep_aio_cancel; + kiocb_set_cancel_fn(iocb, ep_aio_cancel); get_ep(epdata); priv->epdata = epdata; priv->actual = 0; + priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */ /* each kiocb is coupled to one usb_request, but we can't * allocate or submit those if the host disconnected. @@ -674,7 +695,7 @@ fail: kfree(priv); put_ep(epdata); } else - value = (iv ? 
-EIOCBRETRY : -EIOCBQUEUED); + value = -EIOCBQUEUED; return value; } @@ -692,7 +713,6 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov, if (unlikely(!buf)) return -ENOMEM; - iocb->ki_retry = ep_aio_read_retry; return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs); } diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c index 0b0d8bce842e..f4ae05f78c42 100644 --- a/drivers/uwb/rsv.c +++ b/drivers/uwb/rsv.c @@ -231,7 +231,7 @@ void uwb_rsv_backoff_win_increment(struct uwb_rc *rc) return; bow->window <<= 1; - bow->n = random32() & (bow->window - 1); + bow->n = prandom_u32() & (bow->window - 1); dev_dbg(dev, "new_window=%d, n=%d\n: ", bow->window, bow->n); /* reset the timer associated variables */ @@ -557,7 +557,7 @@ int uwb_rsv_establish(struct uwb_rsv *rsv) if (ret) goto out; - rsv->tiebreaker = random32() & 1; + rsv->tiebreaker = prandom_u32() & 1; /* get available mas bitmap */ uwb_drp_available(rc, &available); diff --git a/drivers/video/backlight/ams369fg06.c b/drivers/video/backlight/ams369fg06.c index c02aa2c2575a..319fef6cb422 100644 --- a/drivers/video/backlight/ams369fg06.c +++ b/drivers/video/backlight/ams369fg06.c @@ -533,12 +533,12 @@ static int ams369fg06_remove(struct spi_device *spi) return 0; } -#if defined(CONFIG_PM) -static int ams369fg06_suspend(struct spi_device *spi, pm_message_t mesg) +#ifdef CONFIG_PM_SLEEP +static int ams369fg06_suspend(struct device *dev) { - struct ams369fg06 *lcd = spi_get_drvdata(spi); + struct ams369fg06 *lcd = dev_get_drvdata(dev); - dev_dbg(&spi->dev, "lcd->power = %d\n", lcd->power); + dev_dbg(dev, "lcd->power = %d\n", lcd->power); /* * when lcd panel is suspend, lcd panel becomes off @@ -547,19 +547,19 @@ static int ams369fg06_suspend(struct spi_device *spi, pm_message_t mesg) return ams369fg06_power(lcd, FB_BLANK_POWERDOWN); } -static int ams369fg06_resume(struct spi_device *spi) +static int ams369fg06_resume(struct device *dev) { - struct ams369fg06 *lcd = spi_get_drvdata(spi); + struct 
ams369fg06 *lcd = dev_get_drvdata(dev); lcd->power = FB_BLANK_POWERDOWN; return ams369fg06_power(lcd, FB_BLANK_UNBLANK); } -#else -#define ams369fg06_suspend NULL -#define ams369fg06_resume NULL #endif +static SIMPLE_DEV_PM_OPS(ams369fg06_pm_ops, ams369fg06_suspend, + ams369fg06_resume); + static void ams369fg06_shutdown(struct spi_device *spi) { struct ams369fg06 *lcd = spi_get_drvdata(spi); @@ -571,12 +571,11 @@ static struct spi_driver ams369fg06_driver = { .driver = { .name = "ams369fg06", .owner = THIS_MODULE, + .pm = &ams369fg06_pm_ops, }, .probe = ams369fg06_probe, .remove = ams369fg06_remove, .shutdown = ams369fg06_shutdown, - .suspend = ams369fg06_suspend, - .resume = ams369fg06_resume, }; module_spi_driver(ams369fg06_driver); diff --git a/drivers/video/backlight/atmel-pwm-bl.c b/drivers/video/backlight/atmel-pwm-bl.c index de5e5e74e2a7..a60d6afca97c 100644 --- a/drivers/video/backlight/atmel-pwm-bl.c +++ b/drivers/video/backlight/atmel-pwm-bl.c @@ -118,7 +118,7 @@ static const struct backlight_ops atmel_pwm_bl_ops = { .update_status = atmel_pwm_bl_set_intensity, }; -static int atmel_pwm_bl_probe(struct platform_device *pdev) +static int __init atmel_pwm_bl_probe(struct platform_device *pdev) { struct backlight_properties props; const struct atmel_pwm_bl_platform_data *pdata; @@ -225,17 +225,7 @@ static struct platform_driver atmel_pwm_bl_driver = { .remove = __exit_p(atmel_pwm_bl_remove), }; -static int __init atmel_pwm_bl_init(void) -{ - return platform_driver_probe(&atmel_pwm_bl_driver, atmel_pwm_bl_probe); -} -module_init(atmel_pwm_bl_init); - -static void __exit atmel_pwm_bl_exit(void) -{ - platform_driver_unregister(&atmel_pwm_bl_driver); -} -module_exit(atmel_pwm_bl_exit); +module_platform_driver_probe(atmel_pwm_bl_driver, atmel_pwm_bl_probe); MODULE_AUTHOR("Hans-Christian egtvedt <hans-christian.egtvedt@atmel.com>"); MODULE_DESCRIPTION("Atmel PWM backlight driver"); diff --git a/drivers/video/backlight/ep93xx_bl.c 
b/drivers/video/backlight/ep93xx_bl.c index ef3e21e8f825..fd073b277e48 100644 --- a/drivers/video/backlight/ep93xx_bl.c +++ b/drivers/video/backlight/ep93xx_bl.c @@ -60,7 +60,7 @@ static const struct backlight_ops ep93xxbl_ops = { .get_brightness = ep93xxbl_get_brightness, }; -static int __init ep93xxbl_probe(struct platform_device *dev) +static int ep93xxbl_probe(struct platform_device *dev) { struct ep93xxbl *ep93xxbl; struct backlight_device *bl; diff --git a/drivers/video/backlight/platform_lcd.c b/drivers/video/backlight/platform_lcd.c index 17a6b83f97af..54d94de652b0 100644 --- a/drivers/video/backlight/platform_lcd.c +++ b/drivers/video/backlight/platform_lcd.c @@ -121,7 +121,7 @@ static int platform_lcd_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static int platform_lcd_suspend(struct device *dev) { struct platform_lcd *plcd = dev_get_drvdata(dev); @@ -141,10 +141,10 @@ static int platform_lcd_resume(struct device *dev) return 0; } +#endif static SIMPLE_DEV_PM_OPS(platform_lcd_pm_ops, platform_lcd_suspend, platform_lcd_resume); -#endif #ifdef CONFIG_OF static const struct of_device_id platform_lcd_of_match[] = { @@ -158,9 +158,7 @@ static struct platform_driver platform_lcd_driver = { .driver = { .name = "platform-lcd", .owner = THIS_MODULE, -#ifdef CONFIG_PM .pm = &platform_lcd_pm_ops, -#endif .of_match_table = of_match_ptr(platform_lcd_of_match), }, .probe = platform_lcd_probe, diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c index 57886787ead0..e78d9f2233b8 100644 --- a/drivers/video/cyber2000fb.c +++ b/drivers/video/cyber2000fb.c @@ -518,6 +518,9 @@ static void cyber2000fb_set_timing(struct cfb_info *cfb, struct par_info *hw) cyber2000_grphw(0xb9, 0x00, cfb); spin_unlock(&cfb->reg_b0_lock); + /* wait (for the PLL?) 
to avoid palette corruption at higher clocks */ + msleep(1000); + cfb->ramdac_ctrl = hw->ramdac; cyber2000fb_write_ramdac_ctrl(cfb); diff --git a/drivers/video/exynos/exynos_mipi_dsi.c b/drivers/video/exynos/exynos_mipi_dsi.c index fac7df6d1aba..87cd13b5dee6 100644 --- a/drivers/video/exynos/exynos_mipi_dsi.c +++ b/drivers/video/exynos/exynos_mipi_dsi.c @@ -32,6 +32,7 @@ #include <linux/notifier.h> #include <linux/regulator/consumer.h> #include <linux/pm_runtime.h> +#include <linux/err.h> #include <video/exynos_mipi_dsim.h> @@ -384,10 +385,9 @@ static int exynos_mipi_dsi_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - dsim->reg_base = devm_request_and_ioremap(&pdev->dev, res); - if (!dsim->reg_base) { - dev_err(&pdev->dev, "failed to remap io region\n"); - ret = -ENOMEM; + dsim->reg_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(dsim->reg_base)) { + ret = PTR_ERR(dsim->reg_base); goto error; } diff --git a/drivers/video/matrox/matroxfb_maven.c b/drivers/video/matrox/matroxfb_maven.c index 217678e0b983..fd2897455696 100644 --- a/drivers/video/matrox/matroxfb_maven.c +++ b/drivers/video/matrox/matroxfb_maven.c @@ -137,8 +137,20 @@ static int* get_ctrl_ptr(struct maven_data* md, int idx) { static int maven_get_reg(struct i2c_client* c, char reg) { char dst; - struct i2c_msg msgs[] = {{ c->addr, I2C_M_REV_DIR_ADDR, sizeof(reg), &reg }, - { c->addr, I2C_M_RD | I2C_M_NOSTART, sizeof(dst), &dst }}; + struct i2c_msg msgs[] = { + { + .addr = c->addr, + .flags = I2C_M_REV_DIR_ADDR, + .len = sizeof(reg), + .buf = &reg + }, + { + .addr = c->addr, + .flags = I2C_M_RD | I2C_M_NOSTART, + .len = sizeof(dst), + .buf = &dst + } + }; s32 err; err = i2c_transfer(c->adapter, msgs, 2); diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c index b75db0186488..0d0a43c5de4f 100644 --- a/drivers/video/uvesafb.c +++ b/drivers/video/uvesafb.c @@ -166,7 +166,7 @@ static int uvesafb_exec(struct uvesafb_ktask *task) memcpy(&m->id, 
&uvesafb_cn_id, sizeof(m->id)); m->seq = seq; m->len = len; - m->ack = random32(); + m->ack = prandom_u32(); /* uvesafb_task structure */ memcpy(m + 1, &task->t, sizeof(task->t)); diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 5a32232cf7c1..7b16994aa6e1 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -145,9 +145,9 @@ config SWIOTLB_XEN select SWIOTLB config XEN_TMEM - bool + tristate depends on !ARM - default y if (CLEANCACHE || FRONTSWAP) + default m if (CLEANCACHE || FRONTSWAP) help Shim to interface in-kernel Transcendent Memory hooks (e.g. cleancache and frontswap) to Xen tmem hypercalls. diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 3ee836d42581..4f3ff99640ab 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -5,6 +5,7 @@ * Author: Dan Magenheimer */ +#include <linux/module.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> @@ -128,6 +129,7 @@ static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); } +#ifndef CONFIG_XEN_TMEM_MODULE bool __read_mostly tmem_enabled = false; static int __init enable_tmem(char *s) @@ -136,6 +138,7 @@ static int __init enable_tmem(char *s) return 1; } __setup("tmem", enable_tmem); +#endif #ifdef CONFIG_CLEANCACHE static int xen_tmem_destroy_pool(u32 pool_id) @@ -227,16 +230,21 @@ static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); } -static bool __initdata use_cleancache = true; - +static bool disable_cleancache __read_mostly; +static bool disable_selfballooning __read_mostly; +#ifdef CONFIG_XEN_TMEM_MODULE +module_param(disable_cleancache, bool, S_IRUGO); +module_param(disable_selfballooning, bool, S_IRUGO); +#else static int __init no_cleancache(char *s) { - use_cleancache = false; + disable_cleancache = true; return 1; } __setup("nocleancache", no_cleancache); +#endif -static struct 
cleancache_ops __initdata tmem_cleancache_ops = { +static struct cleancache_ops tmem_cleancache_ops = { .put_page = tmem_cleancache_put_page, .get_page = tmem_cleancache_get_page, .invalidate_page = tmem_cleancache_flush_page, @@ -353,16 +361,21 @@ static void tmem_frontswap_init(unsigned ignored) xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); } -static bool __initdata use_frontswap = true; - +static bool disable_frontswap __read_mostly; +static bool disable_frontswap_selfshrinking __read_mostly; +#ifdef CONFIG_XEN_TMEM_MODULE +module_param(disable_frontswap, bool, S_IRUGO); +module_param(disable_frontswap_selfshrinking, bool, S_IRUGO); +#else static int __init no_frontswap(char *s) { - use_frontswap = false; + disable_frontswap = true; return 1; } __setup("nofrontswap", no_frontswap); +#endif -static struct frontswap_ops __initdata tmem_frontswap_ops = { +static struct frontswap_ops tmem_frontswap_ops = { .store = tmem_frontswap_store, .load = tmem_frontswap_load, .invalidate_page = tmem_frontswap_flush_page, @@ -371,36 +384,46 @@ static struct frontswap_ops __initdata tmem_frontswap_ops = { }; #endif -static int __init xen_tmem_init(void) +static int xen_tmem_init(void) { if (!xen_domain()) return 0; #ifdef CONFIG_FRONTSWAP - if (tmem_enabled && use_frontswap) { + if (tmem_enabled && !disable_frontswap) { char *s = ""; - struct frontswap_ops old_ops = + struct frontswap_ops *old_ops = frontswap_register_ops(&tmem_frontswap_ops); tmem_frontswap_poolid = -1; - if (old_ops.init != NULL) + if (IS_ERR(old_ops) || old_ops) { + if (IS_ERR(old_ops)) + return PTR_ERR(old_ops); s = " (WARNING: frontswap_ops overridden)"; + } printk(KERN_INFO "frontswap enabled, RAM provided by " "Xen Transcendent Memory%s\n", s); } #endif #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && use_cleancache) { + if (tmem_enabled && !disable_cleancache) { char *s = ""; - struct cleancache_ops old_ops = + struct 
cleancache_ops *old_ops = cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops.init_fs != NULL) + if (old_ops) s = " (WARNING: cleancache_ops overridden)"; printk(KERN_INFO "cleancache enabled, RAM provided by " "Xen Transcendent Memory%s\n", s); } #endif +#ifdef CONFIG_XEN_SELFBALLOONING + xen_selfballoon_init(!disable_selfballooning, + !disable_frontswap_selfshrinking); +#endif return 0; } module_init(xen_tmem_init) +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); +MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 2552d3e0a70f..f2ef569c7cc1 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -121,7 +121,7 @@ static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); static bool frontswap_selfshrinking __read_mostly; /* Enable/disable with kernel boot option. */ -static bool use_frontswap_selfshrink __initdata = true; +static bool use_frontswap_selfshrink = true; /* * The default values for the following parameters were deemed reasonable @@ -185,7 +185,7 @@ static int __init xen_nofrontswap_selfshrink_setup(char *s) __setup("noselfshrink", xen_nofrontswap_selfshrink_setup); /* Disable with kernel boot option. */ -static bool use_selfballooning __initdata = true; +static bool use_selfballooning = true; static int __init xen_noselfballooning_setup(char *s) { @@ -196,7 +196,7 @@ static int __init xen_noselfballooning_setup(char *s) __setup("noselfballooning", xen_noselfballooning_setup); #else /* !CONFIG_FRONTSWAP */ /* Enable with kernel boot option. 
*/ -static bool use_selfballooning __initdata = false; +static bool use_selfballooning; static int __init xen_selfballooning_setup(char *s) { @@ -537,7 +537,7 @@ int register_xen_selfballooning(struct device *dev) } EXPORT_SYMBOL(register_xen_selfballooning); -static int __init xen_selfballoon_init(void) +int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) { bool enable = false; @@ -571,7 +571,4 @@ static int __init xen_selfballoon_init(void) return 0; } - -subsys_initcall(xen_selfballoon_init); - -MODULE_LICENSE("GPL"); +EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 0ad61c6a65a5..055562c580b4 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,6 +33,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/aio.h> #include <net/9p/9p.h> #include <net/9p/client.h> diff --git a/fs/afs/write.c b/fs/afs/write.c index 7e03eadb40c0..a890db4b9898 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/aio.h> #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, @@ -8,6 +8,8 @@ * * See ../COPYING for licensing terms. 
*/ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> @@ -18,14 +20,14 @@ #include <linux/backing-dev.h> #include <linux/uio.h> -#define DEBUG 0 - #include <linux/sched.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/bio.h> #include <linux/mmu_context.h> +#include <linux/percpu.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> @@ -35,15 +37,94 @@ #include <linux/eventfd.h> #include <linux/blkdev.h> #include <linux/compat.h> +#include <linux/percpu-refcount.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> -#if DEBUG > 1 -#define dprintk printk -#else -#define dprintk(x...) do { ; } while (0) -#endif +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define AIO_RING_PAGES 8 + +struct kioctx_cpu { + unsigned reqs_available; +}; + +struct kioctx { + struct percpu_ref users; + + /* This needs improving */ + unsigned long user_id; + struct hlist_node list; + + struct __percpu kioctx_cpu *cpu; + + unsigned req_batch; + + unsigned nr; + + /* sys_io_setup currently limits this to an unsigned int */ + unsigned max_reqs; + + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + long nr_pages; + + struct rcu_head rcu_head; + struct work_struct rcu_work; + + struct { + atomic_t reqs_available; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t ctx_lock; + struct list_head active_reqs; /* used for cancellation */ + } ____cacheline_aligned_in_smp; + + struct { + struct mutex ring_lock; + 
wait_queue_head_t wait; + + /* + * Copy of the real tail, that aio_complete uses - to reduce + * cacheline bouncing. The real tail will tend to be much more + * contended - since typically events are delivered one at a + * time, and then aio_read_events() slurps them up a bunch at a + * time - so it's helpful if aio_read_events() isn't also + * contending for the tail. So, aio_complete() updates + * shadow_tail whenever it updates tail. + * + * Also needed because tail is used as a hacky lock and isn't + * always the real tail. + */ + unsigned shadow_tail; + } ____cacheline_aligned_in_smp; + + struct { + unsigned tail; + } ____cacheline_aligned_in_smp; + + struct page *internal_pages[AIO_RING_PAGES]; +}; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); @@ -54,11 +135,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; -static struct workqueue_struct *aio_wq; - -static void aio_kick_handler(struct work_struct *); -static void aio_queue_work(struct kioctx *); - /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. 
@@ -68,10 +144,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ - BUG_ON(!aio_wq); - - pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); return 0; } @@ -79,31 +152,29 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - struct aio_ring_info *info = &ctx->ring_info; long i; - for (i=0; i<info->nr_pages; i++) - put_page(info->ring_pages[i]); + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); - if (info->mmap_size) { - BUG_ON(ctx->mm != current->mm); - vm_munmap(info->mmap_base, info->mmap_size); - } + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (info->ring_pages && info->ring_pages != info->internal_pages) - kfree(info->ring_pages); - info->ring_pages = NULL; - info->nr = 0; + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + kfree(ctx->ring_pages); } static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; - struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; + struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; + nr_events = max(nr_events, num_possible_cpus() * 4); + nr_events *= 2; + /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ @@ -116,46 +187,44 @@ static int aio_setup_ring(struct kioctx *ctx) nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; - info->ring_pages = info->internal_pages; + ctx->nr = 0; + ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!info->ring_pages) + ctx->ring_pages = kcalloc(nr_pages, 
sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) return -ENOMEM; } - info->mmap_size = nr_pages * PAGE_SIZE; - dprintk("attempting mmap of %lu bytes\n", info->mmap_size); - down_write(&ctx->mm->mmap_sem); - info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, - &populate); - if (IS_ERR((void *)info->mmap_base)) { - up_write(&ctx->mm->mmap_sem); - info->mmap_size = 0; + ctx->mmap_size = nr_pages * PAGE_SIZE; + pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); + ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + if (IS_ERR((void *)ctx->mmap_base)) { + up_write(&mm->mmap_sem); + ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } - dprintk("mmap address: 0x%08lx\n", info->mmap_base); - info->nr_pages = get_user_pages(current, ctx->mm, - info->mmap_base, nr_pages, - 1, 0, info->ring_pages, NULL); - up_write(&ctx->mm->mmap_sem); + pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, + 1, 0, ctx->ring_pages, NULL); + up_write(&mm->mmap_sem); - if (unlikely(info->nr_pages != nr_pages)) { + if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } if (populate) - mm_populate(info->mmap_base, populate); - - ctx->user_id = info->mmap_base; + mm_populate(ctx->mmap_base, populate); - info->nr = nr_events; /* trusted copy */ + ctx->user_id = ctx->mmap_base; + ctx->nr = nr_events; /* trusted copy */ - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -164,72 +233,145 @@ static int aio_setup_ring(struct kioctx *ctx) ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); + 
flush_dcache_page(ctx->ring_pages[0]); return 0; } - -/* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(). Release the pointer with put_aio_ring_event(); - */ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ - __event += pos % AIO_EVENTS_PER_PAGE; \ - __event; \ -}) - -#define put_aio_ring_event(event) do { \ - struct io_event *__event = (event); \ - (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ -} while(0) - -static void ctx_rcu_free(struct rcu_head *head) +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +{ + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (!req->ki_list.next) + list_add(&req->ki_list, &ctx->active_reqs); + + req->ki_cancel = cancel; + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} +EXPORT_SYMBOL(kiocb_set_cancel_fn); + +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, + struct io_event *res) +{ + kiocb_cancel_fn *old, *cancel; + int ret = -EINVAL; + + /* + * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it + * actually has a cancel function, hence the cmpxchg() + */ + + cancel = ACCESS_ONCE(kiocb->ki_cancel); + do { + if (!cancel || cancel == KIOCB_CANCELLED) + return ret; + + old = cancel; + cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); + } while (cancel != old); + + atomic_inc(&kiocb->ki_users); + spin_unlock_irq(&ctx->ctx_lock); + + memset(res, 0, sizeof(*res)); + res->obj = (u64)(unsigned long)kiocb->ki_obj.user; + res->data = kiocb->ki_user_data; + ret = cancel(kiocb, 
res); + + spin_lock_irq(&ctx->ctx_lock); + + return ret; +} + +static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + + free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } -/* __put_ioctx - * Called when the last user of an aio context has gone away, - * and the struct needs to be freed. +/* + * When this function runs, the kioctx has been removed from the "hash table" + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - + * now it's safe to cancel any that need to be. */ -static void __put_ioctx(struct kioctx *ctx) +static void free_ioctx(struct kioctx *ctx) { - unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); + struct aio_ring *ring; + struct io_event res; + struct kiocb *req; + unsigned cpu, head, avail; - cancel_delayed_work_sync(&ctx->wq); - aio_free_ring(ctx); - mmdrop(ctx->mm); - ctx->mm = NULL; - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); + spin_lock_irq(&ctx->ctx_lock); + + while (!list_empty(&ctx->active_reqs)) { + req = list_first_entry(&ctx->active_reqs, + struct kiocb, ki_list); + + list_del_init(&req->ki_list); + kiocb_cancel(ctx, req, &res); } - pr_debug("__put_ioctx: freeing %p\n", ctx); - call_rcu(&ctx->rcu_head, ctx_rcu_free); -} -static inline int try_get_ioctx(struct kioctx *kioctx) -{ - return atomic_inc_not_zero(&kioctx->users); + spin_unlock_irq(&ctx->ctx_lock); + + for_each_possible_cpu(cpu) { + struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); + + atomic_add(kcpu->reqs_available, &ctx->reqs_available); + kcpu->reqs_available = 0; + } + + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + while (atomic_read(&ctx->reqs_available) < ctx->nr) { + wait_event(ctx->wait, + (head != ctx->shadow_tail) || + (atomic_read(&ctx->reqs_available) >= ctx->nr)); + + avail = (head <= ctx->shadow_tail ? 
+ ctx->shadow_tail : ctx->nr) - head; + + atomic_add(avail, &ctx->reqs_available); + head += avail; + head %= ctx->nr; + } + + WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr); + + aio_free_ring(ctx); + + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + pr_debug("freeing %p\n", ctx); + + /* + * Here the call_rcu() is between the wait_event() for reqs_active to + * hit 0, and freeing the ioctx. + * + * aio_complete() decrements reqs_active, but it has to touch the ioctx + * after to issue a wakeup so we use rcu. + */ + call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static inline void put_ioctx(struct kioctx *kioctx) +static void put_ioctx(struct kioctx *ctx) { - BUG_ON(atomic_read(&kioctx->users) <= 0); - if (unlikely(atomic_dec_and_test(&kioctx->users))) - __put_ioctx(kioctx); + if (percpu_ref_put(&ctx->users)) + free_ioctx(ctx); } /* ioctx_alloc @@ -237,7 +379,7 @@ static inline void put_ioctx(struct kioctx *kioctx) */ static struct kioctx *ioctx_alloc(unsigned nr_events) { - struct mm_struct *mm; + struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; @@ -256,21 +398,29 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-ENOMEM); ctx->max_reqs = nr_events; - mm = ctx->mm = current->mm; - atomic_inc(&mm->mm_count); - atomic_set(&ctx->users, 2); + percpu_ref_init(&ctx->users); + rcu_read_lock(); + percpu_ref_get(&ctx->users); + rcu_read_unlock(); + spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); + mutex_init(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); - INIT_LIST_HEAD(&ctx->run_list); - INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); - if (aio_setup_ring(ctx) < 0) + ctx->cpu = alloc_percpu(struct kioctx_cpu); + if (!ctx->cpu) goto out_freectx; + if (aio_setup_ring(ctx) < 0) + goto out_freepcpu; + + atomic_set(&ctx->reqs_available, ctx->nr); + ctx->req_batch = ctx->nr / 
(num_possible_cpus() * 4); + BUG_ON(!ctx->req_batch); + /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > aio_max_nr || @@ -286,64 +436,58 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); spin_unlock(&mm->ioctx_lock); - dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", - ctx, ctx->user_id, current->mm, ctx->ring_info.nr); + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, mm, ctx->nr); return ctx; out_cleanup: err = -EAGAIN; aio_free_ring(ctx); +out_freepcpu: + free_percpu(ctx->cpu); out_freectx: - mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - dprintk("aio: error allocating ioctx %d\n", err); + pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } -/* kill_ctx - * Cancels all outstanding aio requests on an aio context. Used - * when the processes owning a context have all exited to encourage - * the rapid destruction of the kioctx. - */ -static void kill_ctx(struct kioctx *ctx) +static void kill_ioctx_work(struct work_struct *work) { - int (*cancel)(struct kiocb *, struct io_event *); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - struct io_event res; + struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - spin_lock_irq(&ctx->ctx_lock); - ctx->dead = 1; - while (!list_empty(&ctx->active_reqs)) { - struct list_head *pos = ctx->active_reqs.next; - struct kiocb *iocb = list_kiocb(pos); - list_del_init(&iocb->ki_list); - cancel = iocb->ki_cancel; - kiocbSetCancelled(iocb); - if (cancel) { - iocb->ki_users++; - spin_unlock_irq(&ctx->ctx_lock); - cancel(iocb, &res); - spin_lock_irq(&ctx->ctx_lock); - } - } + wake_up_all(&ctx->wait); + put_ioctx(ctx); +} - if (!ctx->reqs_active) - goto out; +static void kill_ioctx_rcu(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - add_wait_queue(&ctx->wait, &wait); - set_task_state(tsk, 
TASK_UNINTERRUPTIBLE); - while (ctx->reqs_active) { - spin_unlock_irq(&ctx->ctx_lock); - io_schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&ctx->ctx_lock); - } - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); + INIT_WORK(&ctx->rcu_work, kill_ioctx_work); + schedule_work(&ctx->rcu_work); +} -out: - spin_unlock_irq(&ctx->ctx_lock); +/* kill_ioctx + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static void kill_ioctx(struct kioctx *ctx) +{ + if (percpu_ref_kill(&ctx->users)) { + hlist_del_rcu(&ctx->list); + /* Between hlist_del_rcu() and dropping the initial ref */ + synchronize_rcu(); + + /* + * We can't punt to workqueue here because put_ioctx() -> + * free_ioctx() will unmap the ringbuffer, and that has to be + * done in the original process's context. kill_ioctx_rcu/work() + * exist for exit_aio(), as in that path free_ioctx() won't do + * the unmap. + */ + kill_ioctx_work(&ctx->rcu_work); + } } /* wait_on_sync_kiocb: @@ -351,9 +495,9 @@ out: */ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { - while (iocb->ki_users) { + while (atomic_read(&iocb->ki_users)) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!iocb->ki_users) + if (!atomic_read(&iocb->ki_users)) break; io_schedule(); } @@ -362,28 +506,20 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) } EXPORT_SYMBOL(wait_on_sync_kiocb); -/* exit_aio: called when the last user of mm goes away. At this point, - * there is no way for any new requests to be submited or any of the - * io_* syscalls to be called on the context. However, there may be - * outstanding requests which hold references to the context; as they - * go away, they will call put_ioctx and release any pinned memory - * associated with the request (held via struct page * references). +/* + * exit_aio: called when the last user of mm goes away. 
At this point, there is + * no way for any new requests to be submited or any of the io_* syscalls to be + * called on the context. + * + * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on + * them. */ void exit_aio(struct mm_struct *mm) { struct kioctx *ctx; + struct hlist_node *n; - while (!hlist_empty(&mm->ioctx_list)) { - ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); - hlist_del_rcu(&ctx->list); - - kill_ctx(ctx); - - if (1 != atomic_read(&ctx->users)) - printk(KERN_DEBUG - "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), ctx->dead, - ctx->reqs_active); + hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -391,150 +527,95 @@ void exit_aio(struct mm_struct *mm) * as indicator that it needs to unmap the area, * just set it to 0; aio_free_ring() is the only * place that uses ->mmap_size, so it's safe. - * That way we get all munmap done to current->mm - - * all other callers have ctx->mm == current->mm. */ - ctx->ring_info.mmap_size = 0; - put_ioctx(ctx); + ctx->mmap_size = 0; + + if (percpu_ref_kill(&ctx->users)) { + hlist_del_rcu(&ctx->list); + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + } } } -/* aio_get_req - * Allocate a slot for an aio request. Increments the users count - * of the kioctx so that the kioctx stays around until all requests are - * complete. Returns NULL if no requests are free. - * - * Returns with kiocb->users set to 2. The io submit code path holds - * an extra reference while submitting the i/o. - * This prevents races between the aio code path referencing the - * req (after submitting it) and aio_complete() freeing the req. 
- */ -static struct kiocb *__aio_get_req(struct kioctx *ctx) +static void put_reqs_available(struct kioctx *ctx, unsigned nr) { - struct kiocb *req = NULL; - - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); - if (unlikely(!req)) - return NULL; - - req->ki_flags = 0; - req->ki_users = 2; - req->ki_key = 0; - req->ki_ctx = ctx; - req->ki_cancel = NULL; - req->ki_retry = NULL; - req->ki_dtor = NULL; - req->private = NULL; - req->ki_iovec = NULL; - INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = NULL; + struct kioctx_cpu *kcpu; - return req; -} + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); -/* - * struct kiocb's are allocated in batches to reduce the number of - * times the ctx lock is acquired and released. - */ -#define KIOCB_BATCH_SIZE 32L -struct kiocb_batch { - struct list_head head; - long count; /* number of requests left to allocate */ -}; + kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { + kcpu->reqs_available -= ctx->req_batch; + atomic_add(ctx->req_batch, &ctx->reqs_available); + } -static void kiocb_batch_init(struct kiocb_batch *batch, long total) -{ - INIT_LIST_HEAD(&batch->head); - batch->count = total; + preempt_enable(); } -static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) +static bool get_reqs_available(struct kioctx *ctx) { - struct kiocb *req, *n; + struct kioctx_cpu *kcpu; + bool ret = false; - if (list_empty(&batch->head)) - return; + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); - spin_lock_irq(&ctx->ctx_lock); - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - list_del(&req->ki_list); - kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - } - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); - spin_unlock_irq(&ctx->ctx_lock); -} + if (!kcpu->reqs_available) { + int old, avail = atomic_read(&ctx->reqs_available); -/* - * Allocate a batch of kiocbs. 
This avoids taking and dropping the - * context lock a lot during setup. - */ -static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) -{ - unsigned short allocated, to_alloc; - long avail; - struct kiocb *req, *n; - struct aio_ring *ring; - - to_alloc = min(batch->count, KIOCB_BATCH_SIZE); - for (allocated = 0; allocated < to_alloc; allocated++) { - req = __aio_get_req(ctx); - if (!req) - /* allocation failed, go with what we've got */ - break; - list_add(&req->ki_batch, &batch->head); - } + do { + if (avail < ctx->req_batch) + goto out; - if (allocated == 0) - goto out; + old = avail; + avail = atomic_cmpxchg(&ctx->reqs_available, + avail, avail - ctx->req_batch); + } while (avail != old); - spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0]); - - avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; - BUG_ON(avail < 0); - if (avail < allocated) { - /* Trim back the number of requests. */ - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - kmem_cache_free(kiocb_cachep, req); - if (--allocated <= avail) - break; - } + kcpu->reqs_available += ctx->req_batch; } - batch->count -= allocated; - list_for_each_entry(req, &batch->head, ki_batch) { - list_add(&req->ki_list, &ctx->active_reqs); - ctx->reqs_active++; - } - - kunmap_atomic(ring); - spin_unlock_irq(&ctx->ctx_lock); - + ret = true; + kcpu->reqs_available--; out: - return allocated; + preempt_enable(); + return ret; } -static inline struct kiocb *aio_get_req(struct kioctx *ctx, - struct kiocb_batch *batch) +/* aio_get_req + * Allocate a slot for an aio request. Increments the ki_users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns NULL if no requests are free. + * + * Returns with kiocb->ki_users set to 2. The io submit code path holds + * an extra reference while submitting the i/o. 
+ * This prevents races between the aio code path referencing the + * req (after submitting it) and aio_complete() freeing the req. + */ +static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (list_empty(&batch->head)) - if (kiocb_batch_refill(ctx, batch) == 0) - return NULL; - req = list_first_entry(&batch->head, struct kiocb, ki_batch); - list_del(&req->ki_batch); + if (!get_reqs_available(ctx)) + return NULL; + + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + if (unlikely(!req)) + goto out_put; + + atomic_set(&req->ki_users, 2); + req->ki_ctx = ctx; return req; +out_put: + put_reqs_available(ctx, 1); + return NULL; } -static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +static void kiocb_free(struct kiocb *req) { - assert_spin_locked(&ctx->ctx_lock); - + if (req->ki_filp) + fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) @@ -542,48 +623,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) if (req->ki_iovec != &req->ki_inline_vec) kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); -} - -/* __aio_put_req - * Returns true if this put was the last user of the request. - */ -static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) -{ - dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", - req, atomic_long_read(&req->ki_filp->f_count)); - - assert_spin_locked(&ctx->ctx_lock); - - req->ki_users--; - BUG_ON(req->ki_users < 0); - if (likely(req->ki_users)) - return 0; - list_del(&req->ki_list); /* remove from active_reqs */ - req->ki_cancel = NULL; - req->ki_retry = NULL; - - fput(req->ki_filp); - req->ki_filp = NULL; - really_put_req(ctx, req); - return 1; } -/* aio_put_req - * Returns true if this put was the last user of the kiocb, - * false if the request is still in use. 
- */ -int aio_put_req(struct kiocb *req) +void aio_put_req(struct kiocb *req) { - struct kioctx *ctx = req->ki_ctx; - int ret; - spin_lock_irq(&ctx->ctx_lock); - ret = __aio_put_req(ctx, req); - spin_unlock_irq(&ctx->ctx_lock); - return ret; + if (atomic_dec_and_test(&req->ki_users)) + kiocb_free(req); } EXPORT_SYMBOL(aio_put_req); @@ -595,13 +640,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - /* - * RCU protects us against accessing freed memory but - * we have to be careful not to get a reference when the - * reference count already dropped to 0 (ctx->dead test - * is unreliable because of races). - */ - if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ + if (ctx->user_id == ctx_id){ + percpu_ref_get(&ctx->users); ret = ctx; break; } @@ -611,610 +651,330 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) return ret; } -/* - * Queue up a kiocb to be retried. Assumes that the kiocb - * has already been marked as kicked, and places it on - * the retry run list for the corresponding ioctx, if it - * isn't already queued. Returns 1 if it actually queued - * the kiocb (to tell the caller to activate the work - * queue to process it), or 0, if it found that it was - * already queued. - */ -static inline int __queue_kicked_iocb(struct kiocb *iocb) +static inline unsigned kioctx_ring_put(struct kioctx *ctx, struct kiocb *req, + unsigned tail) { - struct kioctx *ctx = iocb->ki_ctx; - - assert_spin_locked(&ctx->ctx_lock); + struct io_event *ev_page, *event; + unsigned pos = tail + AIO_EVENTS_OFFSET; - if (list_empty(&iocb->ki_run_list)) { - list_add_tail(&iocb->ki_run_list, - &ctx->run_list); - return 1; - } - return 0; -} + if (++tail >= ctx->nr) + tail = 0; -/* aio_run_iocb - * This is the core aio execution routine. It is - * invoked both for initial i/o submission and - * subsequent retries via the aio_kick_handler. 
- * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reacquired - * as needed during processing. - * - * Calls the iocb retry method (already setup for the - * iocb on initial submission) for operation specific - * handling, but takes care of most of common retry - * execution details for a given iocb. The retry method - * needs to be non-blocking as far as possible, to avoid - * holding up other iocbs waiting to be serviced by the - * retry kernel thread. - * - * The trickier parts in this code have to do with - * ensuring that only one retry instance is in progress - * for a given iocb at any time. Providing that guarantee - * simplifies the coding of individual aio operations as - * it avoids various potential races. - */ -static ssize_t aio_run_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - ssize_t (*retry)(struct kiocb *); - ssize_t ret; + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; - if (!(retry = iocb->ki_retry)) { - printk("aio_run_iocb: iocb->ki_retry = NULL\n"); - return 0; - } + event->obj = (u64)(unsigned long)req->ki_obj.user; + event->data = req->ki_user_data; + event->res = req->ki_res; + event->res2 = req->ki_res2; - /* - * We don't want the next retry iteration for this - * operation to start until this one has returned and - * updated the iocb state. However, wait_queue functions - * can trigger a kick_iocb from interrupt context in the - * meantime, indicating that data is available for the next - * iteration. We want to remember that and enable the - * next retry iteration _after_ we are through with - * this one. - * - * So, in order to be able to register a "kick", but - * prevent it from being queued now, we clear the kick - * flag, but make the kick code *think* that the iocb is - * still on the run list until we are actually done. 
- * When we are done with this iteration, we check if - * the iocb was kicked in the meantime and if so, queue - * it up afresh. - */ + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); - kiocbClearKicked(iocb); + pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", + ctx, tail, req, req->ki_obj.user, req->ki_user_data, + req->ki_res, req->ki_res2); - /* - * This is so that aio_complete knows it doesn't need to - * pull the iocb off the run list (We can't just call - * INIT_LIST_HEAD because we don't want a kick_iocb to - * queue this on the run list yet) - */ - iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; - spin_unlock_irq(&ctx->ctx_lock); + return tail; +} - /* Quit retrying if the i/o has been cancelled */ - if (kiocbIsCancelled(iocb)) { - ret = -EINTR; - aio_complete(iocb, ret, 0); - /* must not access the iocb after this */ - goto out; - } +static inline unsigned kioctx_ring_lock(struct kioctx *ctx) +{ + unsigned tail; /* - * Now we are all set to call the retry method in async - * context. + * ctx->tail is both our lock and the canonical version of the tail + * pointer. */ - ret = retry(iocb); - - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { - /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. 
- */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(iocb, ret, 0); - } -out: - spin_lock_irq(&ctx->ctx_lock); + while ((tail = xchg(&ctx->tail, UINT_MAX)) == UINT_MAX) + cpu_relax(); - if (-EIOCBRETRY == ret) { - /* - * OK, now that we are done with this iteration - * and know that there is more left to go, - * this is where we let go so that a subsequent - * "kick" can start the next iteration - */ - - /* will make __queue_kicked_iocb succeed from here on */ - INIT_LIST_HEAD(&iocb->ki_run_list); - /* we must queue the next iteration ourselves, if it - * has already been kicked */ - if (kiocbIsKicked(iocb)) { - __queue_kicked_iocb(iocb); - - /* - * __queue_kicked_iocb will always return 1 here, because - * iocb->ki_run_list is empty at this point so it should - * be safe to unconditionally queue the context into the - * work queue. - */ - aio_queue_work(ctx); - } - } - return ret; + return tail; } -/* - * __aio_run_iocbs: - * Process all pending retries queued on the ioctx - * run list. - * Assumes it is operating within the aio issuer's mm - * context. - */ -static int __aio_run_iocbs(struct kioctx *ctx) +static inline void kioctx_ring_unlock(struct kioctx *ctx, unsigned tail) { - struct kiocb *iocb; - struct list_head run_list; + struct aio_ring *ring; - assert_spin_locked(&ctx->ctx_lock); + if (!ctx) + return; - list_replace_init(&ctx->run_list, &run_list); - while (!list_empty(&run_list)) { - iocb = list_entry(run_list.next, struct kiocb, - ki_run_list); - list_del(&iocb->ki_run_list); - /* - * Hold an extra reference while retrying i/o. 
- */ - iocb->ki_users++; /* grab extra reference */ - aio_run_iocb(iocb); - __aio_put_req(ctx, iocb); - } - if (!list_empty(&ctx->run_list)) - return 1; - return 0; -} + smp_wmb(); + /* make event visible before updating tail */ -static void aio_queue_work(struct kioctx * ctx) -{ - unsigned long timeout; - /* - * if someone is waiting, get the work started right - * away, otherwise, use a longer delay - */ - smp_mb(); - if (waitqueue_active(&ctx->wait)) - timeout = 1; - else - timeout = HZ/10; - queue_delayed_work(aio_wq, &ctx->wq, timeout); -} + ctx->shadow_tail = tail; -/* - * aio_run_all_iocbs: - * Process all pending retries queued on the ioctx - * run list, and keep running them until the list - * stays empty. - * Assumes it is operating within the aio issuer's mm context. - */ -static inline void aio_run_all_iocbs(struct kioctx *ctx) -{ - spin_lock_irq(&ctx->ctx_lock); - while (__aio_run_iocbs(ctx)) - ; - spin_unlock_irq(&ctx->ctx_lock); -} + ring = kmap_atomic(ctx->ring_pages[0]); + ring->tail = tail; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); -/* - * aio_kick_handler: - * Work queue handler triggered to process pending - * retries on an ioctx. Takes on the aio issuer's - * mm context before running the iocbs, so that - * copy_xxx_user operates on the issuer's address - * space. - * Run on aiod's context. 
- */ -static void aio_kick_handler(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, wq.work); - mm_segment_t oldfs = get_fs(); - struct mm_struct *mm; - int requeue; + /* unlock, make new tail visible before checking waitlist */ + smp_mb(); - set_fs(USER_DS); - use_mm(ctx->mm); - spin_lock_irq(&ctx->ctx_lock); - requeue =__aio_run_iocbs(ctx); - mm = ctx->mm; - spin_unlock_irq(&ctx->ctx_lock); - unuse_mm(mm); - set_fs(oldfs); - /* - * we're in a worker thread already; no point using non-zero delay - */ - if (requeue) - queue_delayed_work(aio_wq, &ctx->wq, 0); -} + ctx->tail = tail; + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} -/* - * Called by kick_iocb to queue the kiocb for retry - * and if required activate the aio work queue to process - * it - */ -static void try_queue_kicked_iocb(struct kiocb *iocb) +void batch_complete_aio(struct batch_complete *batch) { - struct kioctx *ctx = iocb->ki_ctx; + struct kioctx *ctx = NULL; + struct eventfd_ctx *eventfd = NULL; + struct rb_node *n; unsigned long flags; - int run = 0; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - /* set this inside the lock so that we can't race with aio_run_iocb() - * testing it and putting the iocb on the run list under the lock */ - if (!kiocbTryKick(iocb)) - run = __queue_kicked_iocb(iocb); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - if (run) - aio_queue_work(ctx); -} + unsigned tail = 0; -/* - * kick_iocb: - * Called typically from a wait queue callback context - * to trigger a retry of the iocb. - * The retry is usually executed by aio workqueue - * threads (See aio_kick_handler). - */ -void kick_iocb(struct kiocb *iocb) -{ - /* sync iocbs are easy: they can only ever be executing from a - * single context. 
*/ - if (is_sync_kiocb(iocb)) { - kiocbSetKicked(iocb); - wake_up_process(iocb->ki_obj.tsk); + if (RB_EMPTY_ROOT(&batch->kiocb)) return; - } - - try_queue_kicked_iocb(iocb); -} -EXPORT_SYMBOL(kick_iocb); - -/* aio_complete - * Called when the io request on the given iocb is complete. - * Returns true if this is the last user of the request. The - * only other user of the request can be the cancellation code. - */ -int aio_complete(struct kiocb *iocb, long res, long res2) -{ - struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring_info *info; - struct aio_ring *ring; - struct io_event *event; - unsigned long flags; - unsigned long tail; - int ret; /* - * Special case handling for sync iocbs: - * - events go directly into the iocb for fast handling - * - the sync task with the iocb in its stack holds the single iocb - * ref, no other paths have a way to get another ref - * - the sync task helpfully left a reference to itself in the iocb - */ - if (is_sync_kiocb(iocb)) { - BUG_ON(iocb->ki_users != 1); - iocb->ki_user_data = res; - iocb->ki_users = 0; - wake_up_process(iocb->ki_obj.tsk); - return 1; - } - - info = &ctx->ring_info; - - /* add a completion event to the ring buffer. - * must be done holding ctx->ctx_lock to prevent - * other code from messing with the tail - * pointer since we might be called from irq - * context. + * Take rcu_read_lock() in case the kioctx is being destroyed, as we + * need to issue a wakeup after incrementing reqs_available. */ - spin_lock_irqsave(&ctx->ctx_lock, flags); + rcu_read_lock(); + local_irq_save(flags); - if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) - list_del_init(&iocb->ki_run_list); + n = rb_first(&batch->kiocb); + while (n) { + struct kiocb *req = container_of(n, struct kiocb, ki_node); - /* - * cancelled requests don't get events, userland was given one - * when the event got cancelled. 
- */ - if (kiocbIsCancelled(iocb)) - goto put_rq; + if (n->rb_right) { + n->rb_right->__rb_parent_color = n->__rb_parent_color; + n = n->rb_right; - ring = kmap_atomic(info->ring_pages[0]); + while (n->rb_left) + n = n->rb_left; + } else { + n = rb_parent(n); + } - tail = info->tail; - event = aio_ring_event(info, tail); - if (++tail >= info->nr) - tail = 0; + if (unlikely(req->ki_eventfd != eventfd)) { + if (eventfd) { + /* Make event visible */ + kioctx_ring_unlock(ctx, tail); + ctx = NULL; - event->obj = (u64)(unsigned long)iocb->ki_obj.user; - event->data = iocb->ki_user_data; - event->res = res; - event->res2 = res2; + eventfd_signal(eventfd, 1); + eventfd_ctx_put(eventfd); + } - dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, - res, res2); + eventfd = req->ki_eventfd; + req->ki_eventfd = NULL; + } - /* after flagging the request as done, we - * must never even look at it again - */ - smp_wmb(); /* make event visible before updating tail */ + if (unlikely(req->ki_ctx != ctx)) { + kioctx_ring_unlock(ctx, tail); - info->tail = tail; - ring->tail = tail; + ctx = req->ki_ctx; + tail = kioctx_ring_lock(ctx); + } - put_aio_ring_event(event); - kunmap_atomic(ring); + tail = kioctx_ring_put(ctx, req, tail); + aio_put_req(req); + } - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + kioctx_ring_unlock(ctx, tail); + local_irq_restore(flags); + rcu_read_unlock(); /* * Check if the user asked us to deliver the result through an * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (iocb->ki_eventfd != NULL) - eventfd_signal(iocb->ki_eventfd, 1); + if (eventfd) { + eventfd_signal(eventfd, 1); + eventfd_ctx_put(eventfd); + } +} +EXPORT_SYMBOL(batch_complete_aio); -put_rq: - /* everything turned out well, dispose of the aiocb. */ - ret = __aio_put_req(ctx, iocb); +/* aio_complete_batch + * Called when the io request on the given iocb is complete; @batch may be + * NULL. 
+ */ +void aio_complete_batch(struct kiocb *req, long res, long res2, + struct batch_complete *batch) +{ + req->ki_res = res; + req->ki_res2 = res2; + + if (req->ki_list.next) { + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&req->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + } /* - * We have to order our ring_info tail store above and test - * of the wait list below outside the wait lock. This is - * like in wake_up_bit() where clearing a bit has to be - * ordered with the unlocked test. + * Special case handling for sync iocbs: + * - events go directly into the iocb for fast handling + * - the sync task with the iocb in its stack holds the single iocb + * ref, no other paths have a way to get another ref + * - the sync task helpfully left a reference to itself in the iocb */ - smp_mb(); + if (is_sync_kiocb(req)) { + BUG_ON(atomic_read(&req->ki_users) != 1); + req->ki_user_data = req->ki_res; + atomic_set(&req->ki_users, 0); + wake_up_process(req->ki_obj.tsk); + } else if (batch) { + int res; + struct kiocb *t; + struct rb_node **n = &batch->kiocb.rb_node, *parent = NULL; + + while (*n) { + parent = *n; + t = container_of(*n, struct kiocb, ki_node); + + res = req->ki_ctx != t->ki_ctx + ? req->ki_ctx < t->ki_ctx + : req->ki_eventfd != t->ki_eventfd + ? req->ki_eventfd < t->ki_eventfd + : req < t; + + n = res ? &(*n)->rb_left : &(*n)->rb_right; + } - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + rb_link_node(&req->ki_node, parent, n); + rb_insert_color(&req->ki_node, &batch->kiocb); + } else { + struct batch_complete batch_stack; - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - return ret; + memset(&req->ki_node, 0, sizeof(req->ki_node)); + batch_stack.kiocb.rb_node = &req->ki_node; + + batch_complete_aio(&batch_stack); + } } -EXPORT_SYMBOL(aio_complete); +EXPORT_SYMBOL(aio_complete_batch); -/* aio_read_evt - * Pull an event off of the ioctx's event ring. 
Returns the number of - * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. - * TODO: make the ringbuffer user mmap()able (requires FIXME). +/* aio_read_events + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched */ -static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +static long aio_read_events_ring(struct kioctx *ctx, + struct io_event __user *event, long nr) { - struct aio_ring_info *info = &ioctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; - - ring = kmap_atomic(info->ring_pages[0]); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; + unsigned head, pos; + long ret = 0; + int copy_ret; - spin_lock(&info->ring_lock); - - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head); - *ent = *evp; - head = (head + 1) % info->nr; - smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; - ret = 1; - put_aio_ring_event(evp); - } - spin_unlock(&info->ring_lock); + mutex_lock(&ctx->ring_lock); -out: + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; kunmap_atomic(ring); - dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, - (unsigned long)ring->head, (unsigned long)ring->tail); - return ret; -} -struct aio_timeout { - struct timer_list timer; - int timed_out; - struct task_struct *p; -}; + pr_debug("h%u t%u m%u\n", head, ctx->shadow_tail, ctx->nr); -static void timeout_func(unsigned long data) -{ - struct aio_timeout *to = (struct aio_timeout *)data; + if (head == ctx->shadow_tail) + goto out; - to->timed_out = 1; - wake_up_process(to->p); -} + while (ret < nr) { + long avail = (head <= ctx->shadow_tail + ? 
ctx->shadow_tail : ctx->nr) - head; + struct io_event *ev; + struct page *page; -static inline void init_timeout(struct aio_timeout *to) -{ - setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); - to->timed_out = 0; - to->p = current; -} + if (head == ctx->shadow_tail) + break; -static inline void set_timeout(long start_jiffies, struct aio_timeout *to, - const struct timespec *ts) -{ - to->timer.expires = start_jiffies + timespec_to_jiffies(ts); - if (time_after(to->timer.expires, jiffies)) - add_timer(&to->timer); - else - to->timed_out = 1; -} + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - + ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); -static inline void clear_timeout(struct aio_timeout *to) -{ - del_singleshot_timer_sync(&to->timer); -} + pos = head + AIO_EVENTS_OFFSET; + page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; + pos %= AIO_EVENTS_PER_PAGE; -static int read_events(struct kioctx *ctx, - long min_nr, long nr, - struct io_event __user *event, - struct timespec __user *timeout) -{ - long start_jiffies = jiffies; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - int ret; - int i = 0; - struct io_event ent; - struct aio_timeout to; - int retry = 0; - - /* needed to zero any padding within an entry (there shouldn't be - * any, but C is fun! - */ - memset(&ent, 0, sizeof(ent)); -retry: - ret = 0; - while (likely(i < nr)) { - ret = aio_read_evt(ctx, &ent); - if (unlikely(ret <= 0)) - break; + ev = kmap(page); + copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); + kunmap(page); - dprintk("read event: %Lx %Lx %Lx %Lx\n", - ent.data, ent.obj, ent.res, ent.res2); - - /* Could we split the check in two? */ - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; + if (unlikely(copy_ret)) { + ret = -EFAULT; + goto out; } - ret = 0; - /* Good, event copied to userland, update counts. 
*/ - event ++; - i ++; + ret += avail; + head += avail; + head %= ctx->nr; } - if (min_nr <= i) - return i; - if (ret) - return ret; - - /* End fast path */ + ring = kmap_atomic(ctx->ring_pages[0]); + ring->head = head; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); - /* racey check, but it gets redone */ - if (!retry && unlikely(!list_empty(&ctx->run_list))) { - retry = 1; - aio_run_all_iocbs(ctx); - goto retry; - } + pr_debug("%li h%u t%u\n", ret, head, ctx->shadow_tail); - init_timeout(&to); - if (timeout) { - struct timespec ts; - ret = -EFAULT; - if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - goto out; + put_reqs_available(ctx, ret); +out: + mutex_unlock(&ctx->ring_lock); - set_timeout(start_jiffies, &to, &ts); - } + return ret; +} - while (likely(i < nr)) { - add_wait_queue_exclusive(&ctx->wait, &wait); - do { - set_task_state(tsk, TASK_INTERRUPTIBLE); - ret = aio_read_evt(ctx, &ent); - if (ret) - break; - if (min_nr <= i) - break; - if (unlikely(ctx->dead)) { - ret = -EINVAL; - break; - } - if (to.timed_out) /* Only check after read evt */ - break; - /* Try to only show up in io wait if there are ops - * in flight */ - if (ctx->reqs_active) - io_schedule(); - else - schedule(); - if (signal_pending(tsk)) { - ret = -EINTR; - break; - } - /*ret = aio_read_evt(ctx, &ent);*/ - } while (1) ; +static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, long *i) +{ + long ret = aio_read_events_ring(ctx, event + *i, nr - *i); - set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); + if (ret > 0) + *i += ret; - if (unlikely(ret <= 0)) - break; - - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; - } + if (unlikely(percpu_ref_dead(&ctx->users))) + ret = -EINVAL; - /* Good, event copied to userland, update counts. 
*/ - event ++; - i ++; - } + if (!*i) + *i = ret; - if (timeout) - clear_timeout(&to); -out: - destroy_timer_on_stack(&to.timer); - return i ? i : ret; + return ret < 0 || *i >= min_nr; } -/* Take an ioctx and remove it from the list of ioctx's. Protects - * against races with itself via ->dead. - */ -static void io_destroy(struct kioctx *ioctx) +static long read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, + struct timespec __user *timeout) { - struct mm_struct *mm = current->mm; - int was_dead; + ktime_t until = { .tv64 = KTIME_MAX }; + long ret = 0; - /* delete the entry from the list is someone else hasn't already */ - spin_lock(&mm->ioctx_lock); - was_dead = ioctx->dead; - ioctx->dead = 1; - hlist_del_rcu(&ioctx->list); - spin_unlock(&mm->ioctx_lock); + if (timeout) { + struct timespec ts; - dprintk("aio_release(%p)\n", ioctx); - if (likely(!was_dead)) - put_ioctx(ioctx); /* twice for the list */ + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + return -EFAULT; - kill_ctx(ioctx); + until = timespec_to_ktime(ts); + } /* - * Wake up any waiters. The setting of ctx->dead must be seen - * by other CPUs at this point. Right now, we rely on the - * locking done by the above calls to ensure this consistency. + * Note that aio_read_events() is being called as the conditional - i.e. + * we're calling it after prepare_to_wait() has set task state to + * TASK_INTERRUPTIBLE. + * + * But aio_read_events() can block, and if it blocks it's going to flip + * the task state back to TASK_RUNNING. + * + * This should be ok, provided it doesn't flip the state back to + * TASK_RUNNING and return 0 too much - that causes us to spin. That + * will only happen if the mutex_lock() call blocks, and we then find + * the ringbuffer empty. So in practice we should be ok, but it's + * something to be aware of when touching this code. 
*/ - wake_up_all(&ioctx->wait); + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), until); + + if (!ret && signal_pending(current)) + ret = -EINTR; + + return ret; } /* sys_io_setup: @@ -1252,7 +1012,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); } @@ -1270,7 +1030,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); return 0; } @@ -1301,24 +1061,15 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) BUG_ON(ret > 0 && iocb->ki_left == 0); } -static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - ssize_t (*rw_op)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); ssize_t ret = 0; - unsigned short opcode; - - if ((iocb->ki_opcode == IOCB_CMD_PREADV) || - (iocb->ki_opcode == IOCB_CMD_PREAD)) { - rw_op = file->f_op->aio_read; - opcode = IOCB_CMD_PREADV; - } else { - rw_op = file->f_op->aio_write; - opcode = IOCB_CMD_PWRITEV; - } /* This matches the pread()/pwrite() logic */ if (iocb->ki_pos < 0) @@ -1334,7 +1085,7 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* retry all partial writes. retry partial reads as long as its a * regular file. 
*/ } while (ret > 0 && iocb->ki_left > 0 && - (opcode == IOCB_CMD_PWRITEV || + (rw == WRITE || (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); /* This means we must have transferred all that we could */ @@ -1344,81 +1095,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* If we managed to write some out we return that, rather than * the eventual error. */ - if (opcode == IOCB_CMD_PWRITEV - && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY + if (rw == WRITE + && ret < 0 && ret != -EIOCBQUEUED && iocb->ki_nbytes - iocb->ki_left) ret = iocb->ki_nbytes - iocb->ki_left; return ret; } -static ssize_t aio_fdsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 1); - return ret; -} - -static ssize_t aio_fsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 0); - return ret; -} - -static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) { ssize_t ret; + kiocb->ki_nr_segs = kiocb->ki_nbytes; + #ifdef CONFIG_COMPAT if (compat) - ret = compat_rw_copy_check_uvector(type, + ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); else #endif - ret = rw_copy_check_uvector(type, + ret = rw_copy_check_uvector(rw, (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); if (ret < 0) - goto out; - - ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); - if (ret < 0) - goto out; + return ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; - kiocb->ki_cur_seg = 0; - /* ki_nbytes/left now reflect bytes instead of segs */ + 
/* ki_nbytes now reflect bytes instead of segs */ kiocb->ki_nbytes = ret; - kiocb->ki_left = ret; - - ret = 0; -out: - return ret; + return 0; } -static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) { - int bytes; - - bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); - if (bytes < 0) - return bytes; + if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + return -EFAULT; kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = bytes; + kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; kiocb->ki_nr_segs = 1; - kiocb->ki_cur_seg = 0; return 0; } @@ -1427,96 +1146,94 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. */ -static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, bool compat) { - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; + struct file *file = req->ki_filp; + ssize_t ret; + int rw; + fmode_t mode; + aio_rw_op *rw_op; - switch (kiocb->ki_opcode) { + switch (req->ki_opcode) { case IOCB_CMD_PREAD: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(READ, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; - case IOCB_CMD_PWRITE: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(WRITE, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; 
- break; case IOCB_CMD_PREADV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = aio_setup_vectored_rw(READ, kiocb, compat); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; + mode = FMODE_READ; + rw = READ; + rw_op = file->f_op->aio_read; + goto rw_common; + + case IOCB_CMD_PWRITE: case IOCB_CMD_PWRITEV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = aio_setup_vectored_rw(WRITE, kiocb, compat); + mode = FMODE_WRITE; + rw = WRITE; + rw_op = file->f_op->aio_write; + goto rw_common; +rw_common: + if (unlikely(!(file->f_mode & mode))) + return -EBADF; + + if (!rw_op) + return -EINVAL; + + ret = (req->ki_opcode == IOCB_CMD_PREADV || + req->ki_opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(rw, req, compat) + : aio_setup_single_vector(rw, req); if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; + return ret; + + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (ret < 0) + return ret; + + req->ki_nbytes = ret; + req->ki_left = ret; + + ret = aio_rw_vect_retry(req, rw, rw_op); break; + case IOCB_CMD_FDSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fdsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 1); break; + case IOCB_CMD_FSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 0); break; + default: - dprintk("EINVAL: io_submit: no operation provided\n"); - ret = -EINVAL; + pr_debug("EINVAL: no operation provided\n"); + return -EINVAL; } - if (!kiocb->ki_retry) - return ret; + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. 
+ */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(req, ret, 0); + } return 0; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, struct kiocb_batch *batch, - bool compat) + struct iocb *iocb, bool compat) { struct kiocb *req; - struct file *file; ssize_t ret; /* enforce forwards compatibility on users */ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { - pr_debug("EINVAL: io_submit: reserve field set\n"); + pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1530,16 +1247,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } - file = fget(iocb->aio_fildes); - if (unlikely(!file)) - return -EBADF; - - req = aio_get_req(ctx, batch); /* returns with 2 references to req */ - if (unlikely(!req)) { - fput(file); + req = aio_get_req(ctx); + if (unlikely(!req)) return -EAGAIN; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) { + ret = -EBADF; + goto out_put_req; } - req->ki_filp = file; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an @@ -1555,9 +1272,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } } - ret = put_user(req->ki_key, &user_iocb->aio_key); + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { - dprintk("EFAULT: aio_key\n"); + pr_debug("EFAULT: aio_key\n"); goto out_put_req; } @@ -1569,41 +1286,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_setup_iocb(req, compat); - + ret = aio_run_iocb(req, compat); if (ret) goto out_put_req; - spin_lock_irq(&ctx->ctx_lock); - /* - * We could have raced with io_destroy() and are currently holding a - * reference to ctx which should be 
destroyed. We cannot submit IO - * since ctx gets freed as soon as io_submit() puts its reference. The - * check here is reliable: io_destroy() sets ctx->dead before waiting - * for outstanding IO and the barrier between these two is realized by - * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we - * increment ctx->reqs_active before checking for ctx->dead and the - * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we - * don't see ctx->dead set here, io_destroy() waits for our IO to - * finish. - */ - if (ctx->dead) { - spin_unlock_irq(&ctx->ctx_lock); - ret = -EINVAL; - goto out_put_req; - } - aio_run_iocb(req); - if (!list_empty(&ctx->run_list)) { - /* drain the run list */ - while (__aio_run_iocbs(ctx)) - ; - } - spin_unlock_irq(&ctx->ctx_lock); - aio_put_req(req); /* drop extra ref to req */ return 0; - out_put_req: + put_reqs_available(ctx, 1); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; @@ -1616,7 +1306,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, long ret = 0; int i = 0; struct blk_plug plug; - struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1629,12 +1318,10 @@ long do_io_submit(aio_context_t ctx_id, long nr, ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { - pr_debug("EINVAL: io_submit: invalid context id\n"); + pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } - kiocb_batch_init(&batch, nr); - blk_start_plug(&plug); /* @@ -1655,13 +1342,12 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, compat); if (ret) break; } blk_finish_plug(&plug); - kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? 
i : ret; } @@ -1694,10 +1380,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, assert_spin_locked(&ctx->ctx_lock); + if (key != KIOCB_KEY) + return NULL; + /* TODO: use a hash or array, this sucks. */ list_for_each(pos, &ctx->active_reqs) { struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) + if (kiocb->ki_obj.user == iocb) return kiocb; } return NULL; @@ -1716,7 +1405,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1731,32 +1420,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb && kiocb->ki_cancel) { - cancel = kiocb->ki_cancel; - kiocb->ki_users ++; - kiocbSetCancelled(kiocb); - } else - cancel = NULL; + if (kiocb) + ret = kiocb_cancel(ctx, kiocb, &res); + else + ret = -EINVAL; + spin_unlock_irq(&ctx->ctx_lock); - if (NULL != cancel) { - struct io_event tmp; - pr_debug("calling cancel\n"); - memset(&tmp, 0, sizeof(tmp)); - tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; - tmp.data = kiocb->ki_user_data; - ret = cancel(kiocb, &tmp); - if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. - */ - if (copy_to_user(result, &tmp, sizeof(tmp))) - ret = -EFAULT; - } - } else - ret = -EINVAL; + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. 
+ */ + if (copy_to_user(result, &res, sizeof(res))) + ret = -EFAULT; + } put_ioctx(ctx); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index bbc8f8827eac..14b7ea3c8f5e 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -62,7 +62,6 @@ static int aout_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; - current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); dump.signal = cprm->siginfo->si_signo; @@ -256,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3939829f6c5c..ced3dcfdac8c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -140,6 +140,25 @@ static int padzero(unsigned long elf_bss) #define ELF_BASE_PLATFORM NULL #endif +/* + * Use get_random_int() to implement AT_RANDOM while avoiding depletion + * of the entropy pool. + */ +static void get_atrandom_bytes(unsigned char *buf, size_t nbytes) +{ + unsigned char *p = buf; + + while (nbytes) { + unsigned int random_variable; + size_t chunk = min(nbytes, sizeof(random_variable)); + + random_variable = get_random_int(); + memcpy(p, &random_variable, chunk); + p += chunk; + nbytes -= chunk; + } +} + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -201,7 +220,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, /* * Generate 16 random bytes for userspace PRNG seeding. 
*/ - get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes)); u_rand_bytes = (elf_addr_t __user *) STACK_ALLOC(p, sizeof(k_rand_bytes)); if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) @@ -735,8 +754,6 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Do this so that we can load the interpreter, if need be. We will change some of these later */ - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) { @@ -2090,8 +2107,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto cleanup; has_dumped = 1; - current->flags |= PF_DUMPCORE; - + fs = get_fs(); set_fs(KERNEL_DS); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9c13e023e2b7..c1cc06aed601 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1687,8 +1687,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fill_elf_fdpic_header(elf, e_phnum); has_dumped = 1; - current->flags |= PF_DUMPCORE; - /* * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. @@ -27,6 +27,7 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> +#include <linux/aio.h> #include <scsi/sg.h> /* for struct sg_iovec */ #include <trace/events/block.h> @@ -1407,33 +1408,44 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif -/** - * bio_endio - end I/O on a bio - * @bio: bio - * @error: error, if any - * - * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. 
No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. - **/ -void bio_endio(struct bio *bio, int error) +static inline void __bio_endio(struct bio *bio, struct batch_complete *batch) { - if (error) + if (bio->bi_error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + bio->bi_error = -EIO; + + if (bio_flagged(bio, BIO_BATCH_ENDIO)) + bio->bi_batch_end_io(bio, bio->bi_error, batch); + else if (bio->bi_end_io) + bio->bi_end_io(bio, bio->bi_error); +} + +void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch) +{ + if (error) + bio->bi_error = error; trace_block_bio_complete(bio, error); - if (bio->bi_end_io) - bio->bi_end_io(bio, error); + if (batch) + bio_list_add(&batch->bio, bio); + else + __bio_endio(bio, batch); + +} +EXPORT_SYMBOL(bio_endio_batch); + +void batch_complete(struct batch_complete *batch) +{ + struct bio *bio; + + while ((bio = bio_list_pop(&batch->bio))) + __bio_endio(bio, batch); + + batch_complete_aio(batch); } -EXPORT_SYMBOL(bio_endio); +EXPORT_SYMBOL(batch_complete); void bio_pair_release(struct bio_pair *bp) { diff --git a/fs/block_dev.c b/fs/block_dev.c index aea605c98ba6..4d48cf5814f8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include "internal.h" @@ -616,11 +617,9 @@ void bd_forget(struct inode *inode) struct block_device *bdev = NULL; spin_lock(&bdev_lock); - if (inode->i_bdev) { - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - __bd_forget(inode); - } + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); spin_unlock(&bdev_lock); if (bdev) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5b4ea5f55b8f..6add1b16be80 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include 
<linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> @@ -1514,7 +1515,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, size_t count, ocount; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); - sb_start_write(inode->i_sb); + if (!sb_start_file_write(file)) + return -EAGAIN; mutex_lock(&inode->i_mutex); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca1b767d51f7..ca26188ff629 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> +#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bf338d9b67e3..eb09f41ee52d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,6 +7,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "super.h" #include "mds_client.h" diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7a0dd99e4507..50b9868af4de 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2520,7 +2520,8 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); + if (!sb_start_file_write(file)) + return -EAGAIN; /* * We need to hold the sem to be sure nobody modifies lock list diff --git a/fs/compat.c b/fs/compat.c index 5f83ffa42115..4899d3b5cf4b 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -47,6 +47,7 @@ #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> diff --git a/fs/coredump.c b/fs/coredump.c index c6479658d487..7638895df974 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -32,6 +32,7 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> +#include 
<linux/freezer.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -263,7 +264,6 @@ static int zap_process(struct task_struct *start, int exit_code) struct task_struct *t; int nr = 0; - start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -280,8 +280,8 @@ static int zap_process(struct task_struct *start, int exit_code) return nr; } -static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, - struct core_state *core_state, int exit_code) +static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + struct core_state *core_state, int exit_code) { struct task_struct *g, *p; unsigned long flags; @@ -291,6 +291,11 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; nr = zap_process(tsk, exit_code); + tsk->flags = PF_DUMPCORE; + tsk->signal->group_exit_task = tsk; + /* ignore all signals except SIGKILL, see prepare_signal() */ + tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); } spin_unlock_irq(&tsk->sighand->siglock); if (unlikely(nr < 0)) @@ -340,6 +345,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); nr += zap_process(p, exit_code); + p->signal->flags = SIGNAL_GROUP_EXIT; unlock_task_sighand(p, &flags); } break; @@ -386,11 +392,18 @@ static int coredump_wait(int exit_code, struct core_state *core_state) return core_waiters; } -static void coredump_finish(struct mm_struct *mm) +static void coredump_finish(struct mm_struct *mm, bool core_dumped) { struct core_thread *curr, *next; struct task_struct *task; + spin_lock_irq(&current->sighand->siglock); + if (core_dumped && !__fatal_signal_pending(current)) + current->signal->group_exit_code |= 0x80; + current->signal->group_exit_task = NULL; + current->signal->flags = SIGNAL_GROUP_EXIT; + 
spin_unlock_irq(&current->sighand->siglock); + next = mm->core_state->dumper.next; while ((curr = next) != NULL) { next = curr->next; @@ -416,17 +429,16 @@ static void wait_for_dump_helpers(struct file *file) pipe_lock(pipe); pipe->readers++; pipe->writers--; + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_unlock(pipe); - while ((pipe->readers > 1) && (!signal_pending(current))) { - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - pipe_wait(pipe); - } + wait_event_freezable(pipe->wait, pipe->readers == 1); + pipe_lock(pipe); pipe->readers--; pipe->writers++; pipe_unlock(pipe); - } /* @@ -471,6 +483,7 @@ void do_coredump(siginfo_t *siginfo) int ispipe; struct files_struct *displaced; bool need_nonrelative = false; + bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, @@ -514,12 +527,6 @@ void do_coredump(siginfo_t *siginfo) old_cred = override_creds(cred); - /* - * Clear any false indication of pending signals that might - * be seen by the filesystem code called to write the core file. 
- */ - clear_thread_flag(TIF_SIGPENDING); - ispipe = format_corename(&cn, &cprm); if (ispipe) { @@ -629,9 +636,8 @@ void do_coredump(siginfo_t *siginfo) goto close_fail; if (displaced) put_files_struct(displaced); - retval = binfmt->core_dump(&cprm); - if (retval) - current->signal->group_exit_code |= 0x80; + + core_dumped = binfmt->core_dump(&cprm); if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -644,7 +650,7 @@ fail_dropcount: fail_unlock: kfree(cn.corename); fail_corename: - coredump_finish(mm); + coredump_finish(mm, core_dumped); revert_creds(old_cred); fail_creds: put_cred(cred); diff --git a/fs/direct-io.c b/fs/direct-io.c index f853263cf74f..55683f36a2f1 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,6 +37,7 @@ #include <linux/uio.h> #include <linux/atomic.h> #include <linux/prefetch.h> +#include <linux/aio.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -229,7 +230,8 @@ static inline struct page *dio_get_page(struct dio *dio, * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async, + struct batch_complete *batch) { ssize_t transferred = 0; @@ -263,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is } else { inode_dio_done(dio->inode); if (is_async) - aio_complete(dio->iocb, ret, 0); + aio_complete_batch(dio->iocb, ret, 0, batch); } return ret; @@ -273,7 +275,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. 
*/ -static void dio_bio_end_aio(struct bio *bio, int error) +static void dio_bio_end_aio(struct bio *bio, int error, struct batch_complete *batch) { struct dio *dio = bio->bi_private; unsigned long remaining; @@ -289,7 +291,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - dio_complete(dio, dio->iocb->ki_pos, 0, true); + dio_complete(dio, dio->iocb->ki_pos, 0, true, batch); kmem_cache_free(dio_cache, dio); } } @@ -328,7 +330,7 @@ void dio_end_io(struct bio *bio, int error) struct dio *dio = bio->bi_private; if (dio->is_async) - dio_bio_end_aio(bio, error); + dio_bio_end_aio(bio, error, NULL); else dio_bio_end_io(bio, error); } @@ -349,9 +351,10 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_bdev = bdev; bio->bi_sector = first_sector; - if (dio->is_async) - bio->bi_end_io = dio_bio_end_aio; - else + if (dio->is_async) { + bio->bi_batch_end_io = dio_bio_end_aio; + bio->bi_flags |= 1 << BIO_BATCH_ENDIO; + } else bio->bi_end_io = dio_bio_end_io; sdio->bio = bio; @@ -1272,7 +1275,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, offset, retval, false); + retval = dio_complete(dio, offset, retval, false, NULL); kmem_cache_free(dio_cache, dio); } else BUG_ON(retval != -EIOCBQUEUED); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index c00e055b6282..f23d2a7ed438 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -58,6 +58,8 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, if (ret) return ret; if (write) { + printk(KERN_NOTICE "%s (%d): dropped kernel caches: %d\n", + current->comm, task_pid_nr(current), sysctl_drop_caches); if (sysctl_drop_caches & 1) iterate_supers(drop_pagecache_sb, NULL); if (sysctl_drop_caches & 2) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 63b1f54b6a1f..201f0a0d6b0a 100644 --- a/fs/ecryptfs/file.c +++ 
b/fs/ecryptfs/file.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> +#include <linux/aio.h> #include "ecryptfs_kernel.h" /** diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 495d15558f42..7de3c5b5fab2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -105,7 +105,7 @@ struct epoll_filefd { struct file *file; int fd; -}; +} __packed; /* * Structure used to track possible nested calls, for too deep recursions @@ -349,7 +349,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op != EPOLL_CTL_DEL; + return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; } /* Initialize the poll safe wake up structure */ @@ -679,6 +679,36 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } +/* + * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item + * had no event flags set, indicating that another thread may be currently + * handling that item's events (in the case that EPOLLONESHOT was being + * used). Otherwise a zero result indicates that the item has been disabled + * from receiving events. A disabled item may be re-enabled via + * EPOLL_CTL_MOD. Must be called with "mtx" held. 
+ */ +static int ep_disable(struct eventpoll *ep, struct epitem *epi) +{ + int result = 0; + unsigned long flags; + + spin_lock_irqsave(&ep->lock, flags); + if (epi->event.events & EPOLLONESHOT) { + if (epi->event.events & ~EP_PRIVATE_BITS) { + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + /* Ensure ep_poll_callback will not add epi back onto + ready list: */ + epi->event.events &= EP_PRIVATE_BITS; + } else + result = -EBUSY; + } else + result = -EINVAL; + spin_unlock_irqrestore(&ep->lock, flags); + + return result; +} + static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1049,8 +1079,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } - - #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1836,6 +1864,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; + case EPOLL_CTL_DISABLE: + if (epi) + error = ep_disable(ep, epi); + else + error = -ENOENT; + break; } mutex_unlock(&ep->mtx); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fe60cc1117d8..0a87bb10998d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,6 +31,7 @@ #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext2.h" #include "acl.h" #include "xip.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d512c4bc4ad7..eac4f041f5fc 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,6 +27,7 @@ #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 64848b595b24..4959e29573b6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/aio.h> #include <linux/quotaops.h> #include <linux/pagevec.h> 
#include "ext4.h" diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b505a145a593..21de12366b47 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,6 +20,7 @@ * (sct@redhat.com), 1993, 1998 */ +#include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cfbebe9c0250..96eb9410ed03 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/aio.h> #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 809b31003ecc..d9903af92e51 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,6 +18,7 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 277966a8547a..02ad450d226b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> diff --git a/fs/fat/inode.c b/fs/fat/inode.c index acf6e479b443..d1d502a026a5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/mount.h> +#include <linux/aio.h> #include <linux/vfs.h> #include <linux/parser.h> #include <linux/uio.h> diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 6f96a8def147..06b5e086ab3a 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,6 +38,7 @@ #include <linux/device.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/aio.h> #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/list.h> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 
11dfa0c3fb46..06c569e492ed 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,6 +19,7 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> +#include <linux/aio.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 34b80ba95bad..f2ae8fd6242c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/compat.h> #include <linux/swap.h> +#include <linux/aio.h> static const struct file_operations fuse_direct_io_file_operations; @@ -971,7 +972,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - sb_start_write(inode->i_sb); + if (!sb_start_file_write(file)) + return -EAGAIN; mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 24f414f0ce61..371bd144d802 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,6 +20,7 @@ #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d79c2dadc536..acd16764b133 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,6 +25,7 @@ #include <asm/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3031dfdd2358..a9d60d46ba99 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfs_fs.h" #include "btree.h" diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index d73c98d1ee99..bbfdc1707725 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -56,7 +56,8 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, int *end, int *cur_rec) { - __be32 cur_cnid, search_cnid; + 
__be32 cur_cnid; + __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; @@ -67,8 +68,11 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; - } else + } else { + cur_cnid = 0; /* used-uninitialized warning */ + search_cnid = 0; BUG(); + } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 160ccc9cdb4b..cdd181d8ba09 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b7dc47ba675e..1781f06aa1c1 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6b49f14eac8c..1e92930d59c3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -25,7 +25,7 @@ #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/writeback.h> -#include <linux/uio.h> +#include <linux/aio.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5b2d4f0853ac..b870ae00517a 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -27,6 +27,7 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/writeback.h> +#include <linux/aio.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -2129,7 +2130,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); + if (!sb_start_file_write(file)) + return -EAGAIN; mutex_lock(&inode->i_mutex); ret = 
ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d3e118cc6ffa..2778b0255dc6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,6 +28,7 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/log2.h> +#include <linux/aio.h> #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index ffb2da370a99..f671e49beb34 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,6 +22,8 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H +#include <linux/aio.h> + handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 12ae194ac943..3a44a648dae7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags, subclass, _RET_IP_); if (status < 0) { - if (status != -EAGAIN && status != -EIOCBRETRY) + if (status != -EAGAIN) mlog_errno(status); goto bail; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6474cb44004d..859cef7e8b4a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2248,7 +2248,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - sb_start_write(inode->i_sb); + if (!sb_start_file_write(file)) + return -EAGAIN; appending = file->f_flags & O_APPEND ? 1 : 0; direct_io = file->f_flags & O_DIRECT ? 
1 : 0; @@ -2468,6 +2469,9 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name, len); + if (!sb_start_file_write(out)) + return -EAGAIN; + if (pipe->inode) mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); @@ -2506,6 +2510,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, balance_dirty_pages_ratelimited(mapping); } + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 88924a3133fa..c765bdf6d60e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -28,6 +28,8 @@ #include "extent_map.h" +struct iocb; + /* OCFS2 Inode Private Data */ struct ocfs2_inode_info { diff --git a/fs/pipe.c b/fs/pipe.c index 2234f3f61f8d..34a643dd22dc 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,7 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/ioctls.h> diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4b3b3ffb52f1..c5450183ca78 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -181,14 +181,16 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes, { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; + const struct file_operations *fops; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); return rv; } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + rcu_read_unlock(); rv = __proc_file_read(file, buf, nbytes, ppos); @@ -204,13 +206,16 @@ proc_file_write(struct file *file, const char __user *buffer, ssize_t rv = -EIO; if (pde->write_proc) { - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + const struct file_operations *fops; + + rcu_read_lock(); + fops = 
rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); return rv; } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + rcu_read_unlock(); /* FIXME: does this routine need ppos? probably... */ rv = pde->write_proc(file, buffer, count, pde->data); @@ -542,7 +547,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp if (S_ISDIR(dp->mode)) { if (dp->proc_iops == NULL) { - dp->proc_fops = &proc_dir_operations; + RCU_INIT_POINTER(dp->proc_fops, &proc_dir_operations); dp->proc_iops = &proc_dir_inode_operations; } dir->nlink++; @@ -551,7 +556,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp dp->proc_iops = &proc_link_inode_operations; } else if (S_ISREG(dp->mode)) { if (dp->proc_fops == NULL) - dp->proc_fops = &proc_file_operations; + RCU_INIT_POINTER(dp->proc_fops, &proc_file_operations); if (dp->proc_iops == NULL) dp->proc_iops = &proc_file_inode_operations; } @@ -604,7 +609,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->mode = mode; ent->nlink = nlink; atomic_set(&ent->count, 1); - spin_lock_init(&ent->pde_unload_lock); + atomic_set(&ent->pde_users, 1); + spin_lock_init(&ent->pde_openers_lock); INIT_LIST_HEAD(&ent->pde_openers); out: return ent; @@ -728,7 +734,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, pde = __proc_create(&parent, name, mode, nlink); if (!pde) goto out; - pde->proc_fops = proc_fops; + rcu_assign_pointer(pde->proc_fops, proc_fops); pde->data = data; if (proc_register(parent, pde) < 0) goto out_free; @@ -764,6 +770,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) struct proc_dir_entry *de = NULL; const char *fn = name; unsigned int len; + DECLARE_COMPLETION_ONSTACK(c); spin_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { @@ -786,37 +793,30 @@ void remove_proc_entry(const char *name, struct proc_dir_entry 
*parent) return; } - spin_lock(&de->pde_unload_lock); /* * Stop accepting new callers into module. If you're * dynamically allocating ->proc_fops, save a pointer somewhere. */ - de->proc_fops = NULL; - /* Wait until all existing callers into module are done. */ - if (de->pde_users > 0) { - DECLARE_COMPLETION_ONSTACK(c); - - if (!de->pde_unload_completion) - de->pde_unload_completion = &c; - - spin_unlock(&de->pde_unload_lock); - + rcu_assign_pointer(de->proc_fops, NULL); + synchronize_rcu(); + /* + * Wait until all existing callers into module are done. + * Once pde_users hits zero we are free to clean out pde_openers. + */ + de->pde_unload_completion = &c; + if (!atomic_dec_and_test(&de->pde_users)) wait_for_completion(de->pde_unload_completion); - spin_lock(&de->pde_unload_lock); - } - + spin_lock(&de->pde_openers_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); list_del(&pdeo->lh); - spin_unlock(&de->pde_unload_lock); pdeo->release(pdeo->inode, pdeo->file); kfree(pdeo); - spin_lock(&de->pde_unload_lock); } - spin_unlock(&de->pde_unload_lock); + spin_unlock(&de->pde_openers_lock); if (S_ISDIR(de->mode)) parent->nlink--; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a86aebc9ba7c..f53660a471c2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -129,46 +129,41 @@ static const struct super_operations proc_sops = { .show_options = proc_show_options, }; -static void __pde_users_dec(struct proc_dir_entry *pde) -{ - pde->pde_users--; - if (pde->pde_unload_completion && pde->pde_users == 0) - complete(pde->pde_unload_completion); -} - void pde_users_dec(struct proc_dir_entry *pde) { - spin_lock(&pde->pde_unload_lock); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); + if (atomic_dec_and_test(&pde->pde_users) && pde->pde_unload_completion) + complete(pde->pde_unload_completion); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { + 
const struct file_operations *fops; struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; loff_t (*llseek)(struct file *, loff_t, int); - spin_lock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); /* * remove_proc_entry() is going to delete PDE (as part of module * cleanup sequence). No new callers into module allowed. */ - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + if (!fops) { + rcu_read_unlock(); return rv; } /* * Bump refcount so that remove_proc_entry will wail for ->llseek to * complete. */ - pde->pde_users++; + atomic_inc(&pde->pde_users); + /* - * Save function pointer under lock, to protect against ->proc_fops - * NULL'ifying right after ->pde_unload_lock is dropped. + * Save function pointer under rcu lock, to protect against + * ->proc_fops NULL'ifying by remove_proc_entry. */ - llseek = pde->proc_fops->llseek; - spin_unlock(&pde->pde_unload_lock); + llseek = fops->llseek; + rcu_read_unlock(); if (!llseek) llseek = default_llseek; @@ -183,15 +178,17 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + const struct file_operations *fops; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); return rv; } - pde->pde_users++; - read = pde->proc_fops->read; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + read = fops->read; + rcu_read_unlock(); if (read) rv = read(file, buf, count, ppos); @@ -205,15 +202,17 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + const struct file_operations 
*fops; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); return rv; } - pde->pde_users++; - write = pde->proc_fops->write; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + write = fops->write; + rcu_read_unlock(); if (write) rv = write(file, buf, count, ppos); @@ -227,15 +226,17 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned int rv = DEFAULT_POLLMASK; unsigned int (*poll)(struct file *, struct poll_table_struct *); + const struct file_operations *fops; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); return rv; } - pde->pde_users++; - poll = pde->proc_fops->poll; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + poll = fops->poll; + rcu_read_unlock(); if (poll) rv = poll(file, pts); @@ -249,15 +250,17 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*ioctl)(struct file *, unsigned int, unsigned long); + const struct file_operations *fops; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); return rv; } - pde->pde_users++; - ioctl = pde->proc_fops->unlocked_ioctl; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + ioctl = fops->unlocked_ioctl; + rcu_read_unlock(); if (ioctl) rv = ioctl(file, cmd, arg); @@ -272,15 +275,17 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = 
-ENOTTY; long (*compat_ioctl)(struct file *, unsigned int, unsigned long); + const struct file_operations *fops; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); return rv; } - pde->pde_users++; - compat_ioctl = pde->proc_fops->compat_ioctl; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + compat_ioctl = fops->compat_ioctl; + rcu_read_unlock(); if (compat_ioctl) rv = compat_ioctl(file, cmd, arg); @@ -295,15 +300,17 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; int (*mmap)(struct file *, struct vm_area_struct *); + const struct file_operations *fops; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); return rv; } - pde->pde_users++; - mmap = pde->proc_fops->mmap; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + mmap = fops->mmap; + rcu_read_unlock(); if (mmap) rv = mmap(file, vma); @@ -319,6 +326,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) int (*open)(struct inode *, struct file *); int (*release)(struct inode *, struct file *); struct pde_opener *pdeo; + const struct file_operations *fops; /* * What for, you ask? 
Well, we can have open, rmmod, remove_proc_entry @@ -334,32 +342,33 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!pdeo) return -ENOMEM; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); kfree(pdeo); return -ENOENT; } - pde->pde_users++; - open = pde->proc_fops->open; - release = pde->proc_fops->release; - spin_unlock(&pde->pde_unload_lock); + atomic_inc(&pde->pde_users); + open = fops->open; + release = fops->release; + rcu_read_unlock(); if (open) rv = open(inode, file); - spin_lock(&pde->pde_unload_lock); if (rv == 0 && release) { /* To know what to release. */ pdeo->inode = inode; pdeo->file = file; /* Strictly for "too late" ->release in proc_reg_release(). */ pdeo->release = release; + spin_lock(&pde->pde_openers_lock); list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_openers_lock); } else kfree(pdeo); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); + pde_users_dec(pde); return rv; } @@ -368,10 +377,14 @@ static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, { struct pde_opener *pdeo; + spin_lock(&pde->pde_openers_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { - if (pdeo->inode == inode && pdeo->file == file) + if (pdeo->inode == inode && pdeo->file == file) { + spin_unlock(&pde->pde_openers_lock); return pdeo; + } } + spin_unlock(&pde->pde_openers_lock); return NULL; } @@ -381,10 +394,13 @@ static int proc_reg_release(struct inode *inode, struct file *file) int rv = 0; int (*release)(struct inode *, struct file *); struct pde_opener *pdeo; + const struct file_operations *fops; - spin_lock(&pde->pde_unload_lock); pdeo = find_pde_opener(pde, inode, file); - if (!pde->proc_fops) { + rcu_read_lock(); + fops = rcu_dereference(pde->proc_fops); + if (!fops) { + rcu_read_unlock(); /* * Can't simply exit, __fput() will think that everything is 
OK, * and move on to freeing struct file. remove_proc_entry() will @@ -394,21 +410,23 @@ static int proc_reg_release(struct inode *inode, struct file *file) * But if opener is removed from list, who will ->release it? */ if (pdeo) { + spin_lock(&pde->pde_openers_lock); list_del(&pdeo->lh); - spin_unlock(&pde->pde_unload_lock); + spin_unlock(&pde->pde_openers_lock); rv = pdeo->release(inode, file); kfree(pdeo); - } else - spin_unlock(&pde->pde_unload_lock); + } return rv; } - pde->pde_users++; - release = pde->proc_fops->release; + atomic_inc(&pde->pde_users); + release = fops->release; + rcu_read_unlock(); if (pdeo) { + spin_lock(&pde->pde_openers_lock); list_del(&pdeo->lh); - kfree(pdeo); + spin_unlock(&pde->pde_openers_lock); } - spin_unlock(&pde->pde_unload_lock); + kfree(pdeo); if (release) rv = release(inode, file); @@ -447,6 +465,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = iget_locked(sb, de->low_ino); + const struct file_operations *fops; if (inode && (inode->i_state & I_NEW)) { inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -463,19 +482,22 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) set_nlink(inode, de->nlink); if (de->proc_iops) inode->i_op = de->proc_iops; - if (de->proc_fops) { + rcu_read_lock(); + fops = rcu_dereference(de->proc_fops); + if (fops) { if (S_ISREG(inode->i_mode)) { #ifdef CONFIG_COMPAT - if (!de->proc_fops->compat_ioctl) + if (!fops->compat_ioctl) inode->i_fop = &proc_reg_file_ops_no_compat; else #endif inode->i_fop = &proc_reg_file_ops; } else { - inode->i_fop = de->proc_fops; + inode->i_fop = fops; } } + rcu_read_unlock(); unlock_new_inode(inode); } else pde_put(de); diff --git a/fs/read_write.c b/fs/read_write.c index f738e4dccfab..b81aebe2a5ce 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,6 +9,7 @@ #include <linux/fcntl.h> #include 
<linux/file.h> #include <linux/uio.h> +#include <linux/aio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> @@ -325,16 +326,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; } -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - if (!kiocbIsKicked(iocb)) - schedule(); - else - kiocbClearKicked(iocb); - __set_current_state(TASK_RUNNING); -} - ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; @@ -346,13 +337,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -402,13 +387,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -559,13 +538,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; diff --git a/fs/reiserfs/inode.c 
b/fs/reiserfs/inode.c index ea5061fd4f3e..77d6d47abc83 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,6 +18,7 @@ #include <linux/writeback.h> #include <linux/quotaops.h> #include <linux/swap.h> +#include <linux/aio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/splice.c b/fs/splice.c index 23ade0e5c559..186ec03700d1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1000,7 +1000,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - sb_start_write(inode->i_sb); + if (!sb_start_file_write(out)) + return -EAGAIN; pipe_lock(pipe); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f12189d2db1d..14374530784c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,6 +50,7 @@ */ #include "ubifs.h" +#include <linux/aio.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/slab.h> diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7a12e48ad819..b6d15d349810 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> +#include <linux/aio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5f707e537171..c24ce0e9c67c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f03bf1a456fb..a81aa74a7263 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include "xfs_ioctl.h" #include "xfs_trace.h" +#include <linux/aio.h> #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> @@ -775,7 +776,8 @@ xfs_file_aio_write( if (ocount == 0) return 0; - sb_start_write(inode->i_sb); + if (!sb_start_file_write(file)) + return -EAGAIN; if 
(XFS_FORCED_SHUTDOWN(ip->i_mount)) { ret = -EIO; diff --git a/include/linux/aio.h b/include/linux/aio.h index 31ff6dba4872..a7e4c595825e 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -6,94 +6,38 @@ #include <linux/aio_abi.h> #include <linux/uio.h> #include <linux/rcupdate.h> - #include <linux/atomic.h> - -#define AIO_MAXSEGS 4 -#define AIO_KIOGRP_NR_ATOMIC 8 +#include <linux/batch_complete.h> struct kioctx; +struct kiocb; +struct batch_complete; -/* Notes on cancelling a kiocb: - * If a kiocb is cancelled, aio_complete may return 0 to indicate - * that cancel has not yet disposed of the kiocb. All cancel - * operations *must* call aio_put_req to dispose of the kiocb - * to guard against races with the completion code. - */ -#define KIOCB_C_CANCELLED 0x01 -#define KIOCB_C_COMPLETE 0x02 - -#define KIOCB_SYNC_KEY (~0U) +#define KIOCB_KEY 0 -/* ki_flags bits */ /* - * This may be used for cancel/retry serialization in the future, but - * for now it's unused and we probably don't want modules to even - * think they can use it. + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either + * cancelled or completed (this makes a certain amount of sense because + * successful cancellation - io_cancel() - does deliver the completion to + * userspace). + * + * And since most things don't implement kiocb cancellation and we'd really like + * kiocb completion to be lockless when possible, we use ki_cancel to + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). 
*/ -/* #define KIF_LOCKED 0 */ -#define KIF_KICKED 1 -#define KIF_CANCELLED 2 - -#define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags) -#define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags) +#define KIOCB_CANCELLED ((void *) (~0ULL)) -#define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags) -#define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags) -#define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags) +typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *); -#define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags) -#define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags) -#define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags) - -#define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags) -#define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags) -#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags) - -/* is there a better place to document function pointer methods? */ -/** - * ki_retry - iocb forward progress callback - * @kiocb: The kiocb struct to advance by performing an operation. - * - * This callback is called when the AIO core wants a given AIO operation - * to make forward progress. The kiocb argument describes the operation - * that is to be performed. As the operation proceeds, perhaps partially, - * ki_retry is expected to update the kiocb with progress made. Typically - * ki_retry is set in the AIO core and it itself calls file_operations - * helpers. - * - * ki_retry's return value determines when the AIO operation is completed - * and an event is generated in the AIO event ring. Except the special - * return values described below, the value that is returned from ki_retry - * is transferred directly into the completion ring as the operation's - * resulting status. Once this has happened ki_retry *MUST NOT* reference - * the kiocb pointer again. 
- * - * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete() - * will be called on the kiocb pointer in the future. The AIO core will - * not ask the method again -- ki_retry must ensure forward progress. - * aio_complete() must be called once and only once in the future, multiple - * calls may result in undefined behaviour. - * - * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb() - * will be called on the kiocb pointer in the future. This may happen - * through generic helpers that associate kiocb->ki_wait with a wait - * queue head that ki_retry uses via current->io_wait. It can also happen - * with custom tracking and manual calls to kick_iocb(), though that is - * discouraged. In either case, kick_iocb() must be called once and only - * once. ki_retry must ensure forward progress, the AIO core will wait - * indefinitely for kick_iocb() to be called. - */ struct kiocb { - struct list_head ki_run_list; - unsigned long ki_flags; - int ki_users; - unsigned ki_key; /* id of this request */ + struct rb_node ki_node; + + atomic_t ki_users; struct file *ki_filp; - struct kioctx *ki_ctx; /* may be NULL for sync ops */ - int (*ki_cancel)(struct kiocb *, struct io_event *); - ssize_t (*ki_retry)(struct kiocb *); + struct kioctx *ki_ctx; /* NULL for sync ops */ + kiocb_cancel_fn *ki_cancel; void (*ki_dtor)(struct kiocb *); union { @@ -102,6 +46,9 @@ struct kiocb { } ki_obj; __u64 ki_user_data; /* user's data for completion */ + long ki_res; + long ki_res2; + loff_t ki_pos; void *private; @@ -117,7 +64,6 @@ struct kiocb { struct list_head ki_list; /* the aio core uses this * for cancellation */ - struct list_head ki_batch; /* batch allocation */ /* * If the aio_resfd field of the userspace iocb is not zero, @@ -128,108 +74,55 @@ struct kiocb { static inline bool is_sync_kiocb(struct kiocb *kiocb) { - return kiocb->ki_key == KIOCB_SYNC_KEY; + return kiocb->ki_ctx == NULL; } static inline void init_sync_kiocb(struct kiocb *kiocb, 
struct file *filp) { *kiocb = (struct kiocb) { - .ki_users = 1, - .ki_key = KIOCB_SYNC_KEY, + .ki_users = ATOMIC_INIT(1), + .ki_ctx = NULL, .ki_filp = filp, .ki_obj.tsk = current, }; } -#define AIO_RING_MAGIC 0xa10a10a1 -#define AIO_RING_COMPAT_FEATURES 1 -#define AIO_RING_INCOMPAT_FEATURES 0 -struct aio_ring { - unsigned id; /* kernel internal index number */ - unsigned nr; /* number of io_events */ - unsigned head; - unsigned tail; - - unsigned magic; - unsigned compat_features; - unsigned incompat_features; - unsigned header_length; /* size of aio_ring */ - - - struct io_event io_events[0]; -}; /* 128 bytes + ring size */ - -#define AIO_RING_PAGES 8 -struct aio_ring_info { - unsigned long mmap_base; - unsigned long mmap_size; - - struct page **ring_pages; - spinlock_t ring_lock; - long nr_pages; - - unsigned nr, tail; - - struct page *internal_pages[AIO_RING_PAGES]; -}; - -static inline unsigned aio_ring_avail(struct aio_ring_info *info, - struct aio_ring *ring) -{ - return (ring->head + info->nr - 1 - ring->tail) % info->nr; -} - -struct kioctx { - atomic_t users; - int dead; - struct mm_struct *mm; - - /* This needs improving */ - unsigned long user_id; - struct hlist_node list; - - wait_queue_head_t wait; - - spinlock_t ctx_lock; - - int reqs_active; - struct list_head active_reqs; /* used for cancellation */ - struct list_head run_list; /* used for kicked reqs */ - - /* sys_io_setup currently limits this to an unsigned int */ - unsigned max_reqs; - - struct aio_ring_info ring_info; - - struct delayed_work wq; - - struct rcu_head rcu_head; -}; - /* prototypes */ -extern unsigned aio_max_size; - #ifdef CONFIG_AIO extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); -extern int aio_put_req(struct kiocb *iocb); -extern void kick_iocb(struct kiocb *iocb); -extern int aio_complete(struct kiocb *iocb, long res, long res2); +extern void aio_put_req(struct kiocb *iocb); +extern void batch_complete_aio(struct batch_complete *batch); +extern void 
aio_complete_batch(struct kiocb *iocb, long res, long res2, + struct batch_complete *batch); struct mm_struct; extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user *__user *iocbpp, bool compat); +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } -static inline int aio_put_req(struct kiocb *iocb) { return 0; } -static inline void kick_iocb(struct kiocb *iocb) { } -static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; } +static inline void aio_put_req(struct kiocb *iocb) { } + +static inline void batch_complete_aio(struct batch_complete *batch) { } +static inline void aio_complete_batch(struct kiocb *iocb, long res, long res2, + struct batch_complete *batch) +{ + return; +} struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } static inline long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user * __user *iocbpp, bool compat) { return 0; } +static inline void kiocb_set_cancel_fn(struct kiocb *req, + kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ +static inline void aio_complete(struct kiocb *iocb, long res, long res2) +{ + aio_complete_batch(iocb, res, res2, NULL); +} + static inline struct kiocb *list_kiocb(struct list_head *h) { return list_entry(h, struct kiocb, ki_list); diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f7f1d7169b11..6fd5cc80f62f 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -213,8 +213,15 @@ static inline bool balloon_compaction_check(void) return true; } +static inline void balloon_event_count(enum vm_event_item item) +{ + count_vm_event(item); +} #else /* !CONFIG_BALLOON_COMPACTION */ +/* A macro, to avoid generating references to the undefined COMPACTBALLOON* */ +#define balloon_event_count(item) do { } while (0) + static inline 
void *balloon_mapping_alloc(void *balloon_device, const struct address_space_operations *a_ops) { diff --git a/include/linux/batch_complete.h b/include/linux/batch_complete.h new file mode 100644 index 000000000000..8167a9d306fb --- /dev/null +++ b/include/linux/batch_complete.h @@ -0,0 +1,23 @@ +#ifndef _LINUX_BATCH_COMPLETE_H +#define _LINUX_BATCH_COMPLETE_H + +#include <linux/rbtree.h> + +/* + * Common stuff to the aio and block code for batch completion. Everything + * important is elsewhere: + */ + +struct bio; + +struct bio_list { + struct bio *head; + struct bio *tail; +}; + +struct batch_complete { + struct bio_list bio; + struct rb_root kiocb; +}; + +#endif diff --git a/include/linux/bio.h b/include/linux/bio.h index 820e7aaad4fd..5f549176745c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -24,6 +24,7 @@ #include <linux/mempool.h> #include <linux/ioprio.h> #include <linux/bug.h> +#include <linux/batch_complete.h> #ifdef CONFIG_BLOCK @@ -68,6 +69,8 @@ #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) +void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch); + static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio->bi_vcnt) @@ -241,7 +244,25 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } -extern void bio_endio(struct bio *, int); +/** + * bio_endio - end I/O on a bio + * @bio: bio + * @error: error, if any + * + * Description: + * bio_endio() will end I/O on the whole bio. bio_endio() is the + * preferred way to end I/O on a bio, it takes care of clearing + * BIO_UPTODATE on error. @error is 0 on success, and one of the + * established -Exxxx (-EIO, for instance) error values in case + * something went wrong. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io + * function.
+ **/ +static inline void bio_endio(struct bio *bio, int error) +{ + bio_endio_batch(bio, error, NULL); +} + struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -420,10 +441,6 @@ static inline bool bio_mergeable(struct bio *bio) * member of the bio. The bio_list also caches the last list member to allow * fast access to the tail. */ -struct bio_list { - struct bio *head; - struct bio *tail; -}; static inline int bio_list_empty(const struct bio_list *bl) { @@ -527,6 +544,15 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } +static inline void batch_complete_init(struct batch_complete *batch) +{ + bio_list_init(&batch->bio); + batch->kiocb = RB_ROOT; +} + +void batch_complete(struct batch_complete *batch); + + #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cdf11191e645..d4e7bab9a17e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -16,7 +16,9 @@ struct page; struct block_device; struct io_context; struct cgroup_subsys_state; +struct batch_complete; typedef void (bio_end_io_t) (struct bio *, int); +typedef void (bio_batch_end_io_t) (struct bio *, int, struct batch_complete *); typedef void (bio_destructor_t) (struct bio *); /* @@ -42,6 +44,7 @@ struct bio { * top bits priority */ + short bi_error; unsigned short bi_vcnt; /* how many bio_vec's */ unsigned short bi_idx; /* current index into bvl_vec */ @@ -59,7 +62,10 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - bio_end_io_t *bi_end_io; + union { + bio_end_io_t *bi_end_io; + bio_batch_end_io_t *bi_batch_end_io; + }; void *bi_private; #ifdef CONFIG_BLK_CGROUP @@ -111,12 +117,13 @@ struct bio { #define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */ #define BIO_QUIET 10 /* Make BIO Quiet */ #define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped 
*/ +#define BIO_BATCH_ENDIO 12 /* * Flags starting here get preserved by bio_reset() - this includes * BIO_POOL_IDX() */ -#define BIO_RESET_BITS 12 +#define BIO_RESET_BITS 13 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 78feda9bbae2..2f91edb6e2d3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -877,7 +877,8 @@ extern struct request *blk_fetch_request(struct request_queue *q); * This prevents code duplication in drivers. */ extern bool blk_update_request(struct request *rq, int error, - unsigned int nr_bytes); + unsigned int nr_bytes, + struct batch_complete *batch); extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); @@ -885,10 +886,17 @@ extern bool blk_end_request_cur(struct request *rq, int error); extern bool blk_end_request_err(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); -extern void __blk_end_request_all(struct request *rq, int error); extern bool __blk_end_request_cur(struct request *rq, int error); extern bool __blk_end_request_err(struct request *rq, int error); +extern void blk_end_request_all_batch(struct request *rq, int error, + struct batch_complete *batch); + +static inline void __blk_end_request_all(struct request *rq, int error) +{ + blk_end_request_all_batch(rq, error, NULL); +} + extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7e818a3ef60a..caa790af83b7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,7 @@ struct cgroup_subsys; struct inode; struct cgroup; struct css_id; +struct eventfd_ctx; extern int cgroup_init_early(void); extern int cgroup_init(void); diff --git 
a/include/linux/cleancache.h b/include/linux/cleancache.h index 42e55deee757..4ce9056b31a8 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -33,7 +33,7 @@ struct cleancache_ops { void (*invalidate_fs)(int); }; -extern struct cleancache_ops +extern struct cleancache_ops * cleancache_register_ops(struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); extern void __cleancache_init_shared_fs(char *, struct super_block *); @@ -42,9 +42,9 @@ extern void __cleancache_put_page(struct page *); extern void __cleancache_invalidate_page(struct address_space *, struct page *); extern void __cleancache_invalidate_inode(struct address_space *); extern void __cleancache_invalidate_fs(struct super_block *); -extern int cleancache_enabled; #ifdef CONFIG_CLEANCACHE +#define cleancache_enabled (1) static inline bool cleancache_fs_enabled(struct page *page) { return page->mapping->host->i_sb->cleancache_poolid >= 0; diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index a975de1ff59f..822c1354f3a6 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -27,7 +27,7 @@ extern int debug_locks_off(void); \ if (!oops_in_progress && unlikely(c)) { \ if (debug_locks_off() && !debug_locks_silent) \ - WARN_ON(1); \ + WARN(1, "DEBUG_LOCKS_WARN_ON(%s)", #c); \ __ret = 1; \ } \ __ret; \ diff --git a/include/linux/decompress/unlz4.h b/include/linux/decompress/unlz4.h new file mode 100644 index 000000000000..d5b68bf3ec92 --- /dev/null +++ b/include/linux/decompress/unlz4.h @@ -0,0 +1,10 @@ +#ifndef DECOMPRESS_UNLZ4_H +#define DECOMPRESS_UNLZ4_H + +int unlz4(unsigned char *inbuf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *pos, + void(*error)(char *x)); +#endif diff --git a/include/linux/errno.h b/include/linux/errno.h index f6bf082d4d4f..89627b9187f9 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h @@ -28,6 +28,5 @@ 
#define EBADTYPE 527 /* Type not supported by server */ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ -#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */ #endif diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 30442547b9e6..8293262401de 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -14,7 +14,7 @@ struct frontswap_ops { }; extern bool frontswap_enabled; -extern struct frontswap_ops +extern struct frontswap_ops * frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); @@ -22,33 +22,19 @@ extern void frontswap_writethrough(bool); #define FRONTSWAP_HAS_EXCLUSIVE_GETS extern void frontswap_tmem_exclusive_gets(bool); -extern void __frontswap_init(unsigned type); +extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); +extern void __frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); extern int __frontswap_load(struct page *page); extern void __frontswap_invalidate_page(unsigned, pgoff_t); extern void __frontswap_invalidate_area(unsigned); #ifdef CONFIG_FRONTSWAP +#define frontswap_enabled (1) static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { - bool ret = false; - - if (frontswap_enabled && sis->frontswap_map) - ret = test_bit(offset, sis->frontswap_map); - return ret; -} - -static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) -{ - if (frontswap_enabled && sis->frontswap_map) - set_bit(offset, sis->frontswap_map); -} - -static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) -{ - if (frontswap_enabled && sis->frontswap_map) - clear_bit(offset, sis->frontswap_map); + return __frontswap_test(sis, offset); } static inline void frontswap_map_set(struct swap_info_struct *p, @@ -71,14 
+57,6 @@ static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) return false; } -static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) -{ -} - -static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) -{ -} - static inline void frontswap_map_set(struct swap_info_struct *p, unsigned long *map) { @@ -120,10 +98,10 @@ static inline void frontswap_invalidate_area(unsigned type) __frontswap_invalidate_area(type); } -static inline void frontswap_init(unsigned type) +static inline void frontswap_init(unsigned type, unsigned long *map) { if (frontswap_enabled) - __frontswap_init(type); + __frontswap_init(type, map); } #endif /* _LINUX_FRONTSWAP_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c28271ab9d4..4c4f0e488313 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -675,9 +675,11 @@ static inline loff_t i_size_read(const struct inode *inode) static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) + preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); + preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) preempt_disable(); inode->i_size = i_size; @@ -1400,6 +1402,16 @@ static inline int sb_start_write_trylock(struct super_block *sb) return __sb_start_write(sb, SB_FREEZE_WRITE, false); } +/* + * sb_start_write() for writing into a file. When file has O_NONBLOCK set, + * we use trylock semantics, otherwise we block on frozen filesystem. 
+ */ +static inline int sb_start_file_write(struct file *file) +{ + return __sb_start_write(file->f_mapping->host->i_sb, SB_FREEZE_WRITE, + !(file->f_flags & O_NONBLOCK)); +} + /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index c1d6555d2567..f3cec6856a4b 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -94,6 +94,11 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) +/* + * Are we in nmi,irq context, or softirq context? + */ +#define in_serving_irq() (in_nmi() || in_irq() || in_serving_softirq()) + #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_CHECK_OFFSET 1 #else diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f1e877b79ed8..cfc2f119779a 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -365,7 +365,7 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) -#else /* !LOCKDEP */ +#else /* !CONFIG_LOCKDEP */ static inline void lockdep_off(void) { @@ -479,82 +479,36 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# else -# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# endif -# define spin_release(l, n, i) lock_release(l, n, i) +#ifdef CONFIG_PROVE_LOCKING + #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) + #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 2, n, i) + #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 2, n, i) #else -# define spin_acquire(l, s, t, i) do { } while (0) -# 
define spin_release(l, n, i) do { } while (0) + #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) + #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) + #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, NULL, i) -# else -# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, NULL, i) -# endif -# define rwlock_release(l, n, i) lock_release(l, n, i) -#else -# define rwlock_acquire(l, s, t, i) do { } while (0) -# define rwlock_acquire_read(l, s, t, i) do { } while (0) -# define rwlock_release(l, n, i) do { } while (0) -#endif +#define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define spin_release(l, n, i) lock_release(l, n, i) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# else -# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) -# endif -# define mutex_release(l, n, i) lock_release(l, n, i) -#else -# define mutex_acquire(l, s, t, i) do { } while (0) -# define mutex_acquire_nest(l, s, t, n, i) do { } while (0) -# define mutex_release(l, n, i) do { } while (0) -#endif +#define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) +#define rwlock_release(l, n, i) lock_release(l, n, i) -#ifdef CONFIG_DEBUG_LOCK_ALLOC 
-# ifdef CONFIG_PROVE_LOCKING -# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i) -# else -# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) -# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i) -# endif +#define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define mutex_release(l, n, i) lock_release(l, n, i) + +#define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) # define rwsem_release(l, n, i) lock_release(l, n, i) -#else -# define rwsem_acquire(l, s, t, i) do { } while (0) -# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0) -# define rwsem_acquire_read(l, s, t, i) do { } while (0) -# define rwsem_release(l, n, i) do { } while (0) -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_) -# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_) -# else -# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_) -# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_) -# endif +#define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) +#define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) # define lock_map_release(l) lock_release(l, 1, _THIS_IP_) -#else -# define lock_map_acquire(l) do { } while (0) -# define lock_map_acquire_read(l) do { } while (0) -# define lock_map_release(l) do 
{ } while (0) -#endif #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ diff --git a/include/linux/lz4.h b/include/linux/lz4.h new file mode 100644 index 000000000000..7f6c75a093f8 --- /dev/null +++ b/include/linux/lz4.h @@ -0,0 +1,51 @@ +#ifndef __LZ4_H__ +#define __LZ4_H__ +/* + * LZ4 Kernel Interface + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * lz4_compressbound() + * Provides the maximum size that LZ4 may output in a "worst case" scenario + * (input data not compressible) + */ +static inline size_t lz4_compressbound(size_t isize) +{ + return isize + (isize / 255) + 16; +} + +/* + * lz4_decompress() + * src : source address of the compressed data + * src_len : is the input size, which is returned after decompress done + * dest : output buffer address of the decompressed data + * actual_dest_len: is the size of uncompressed data, supposing it's known + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + * slightly faster than lz4_decompress_unknownoutputsize() + */ +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len); + +/* + * lz4_decompress_unknownoutputsize() + * src : source address of the compressed data + * src_len : is the input size, therefore the compressed size + * dest : output buffer address of the decompressed data + * dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after + * decompress done + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated.
+ */ +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len); +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index f4c8aa990442..5b7fd4ec2223 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -900,7 +900,8 @@ extern void pagefault_out_of_memory(void); * Flags passed to show_mem() and show_free_areas() to suppress output in * various contexts. */ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ +#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ace9a5f01c64..fb425aa16c01 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -330,12 +330,9 @@ struct mm_struct { unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); - void (*unmap_area) (struct mm_struct *mm, unsigned long addr); #endif unsigned long mmap_base; /* base of mmap area */ unsigned long task_size; /* size of task vm space */ - unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ - unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ede274957e05..ab20a60d2671 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -815,7 +815,10 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. 
*/ -#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +static inline enum zone_type zone_idx(struct zone *zone) +{ + return zone - zone->zone_pgdat->node_zones; +} static inline int populated_zone(struct zone *zone) { @@ -856,25 +859,18 @@ static inline int is_normal_idx(enum zone_type idx) */ static inline int is_highmem(struct zone *zone) { -#ifdef CONFIG_HIGHMEM - int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; - return zone_off == ZONE_HIGHMEM * sizeof(*zone) || - (zone_off == ZONE_MOVABLE * sizeof(*zone) && - zone_movable_is_highmem()); -#else - return 0; -#endif + return is_highmem_idx(zone_idx(zone)); } static inline int is_normal(struct zone *zone) { - return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; + return zone_idx(zone) == ZONE_NORMAL; } static inline int is_dma32(struct zone *zone) { #ifdef CONFIG_ZONE_DMA32 - return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; + return zone_idx(zone) == ZONE_DMA32; #else return 0; #endif @@ -883,7 +879,7 @@ static inline int is_dma32(struct zone *zone) static inline int is_dma(struct zone *zone) { #ifdef CONFIG_ZONE_DMA - return zone == zone->zone_pgdat->node_zones + ZONE_DMA; + return zone_idx(zone) == ZONE_DMA; #else return 0; #endif diff --git a/include/linux/net.h b/include/linux/net.h index aa1673160a45..99c9f0c103c2 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -240,8 +240,8 @@ do { \ #define net_dbg_ratelimited(fmt, ...) 
\ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) -#define net_random() random32() -#define net_srandom(seed) srandom32((__force u32)seed) +#define net_random() prandom_u32() +#define net_srandom(seed) prandom_seed((__force u32)(seed)) extern int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0e38e13eb249..e3dea75a078b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -149,7 +149,7 @@ static inline int page_cache_get_speculative(struct page *page) { VM_BUG_ON(in_interrupt()); -#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h new file mode 100644 index 000000000000..d0cf8872dc43 --- /dev/null +++ b/include/linux/percpu-refcount.h @@ -0,0 +1,114 @@ +/* + * Dynamic percpu refcounts: + * (C) 2012 Google, Inc. + * Author: Kent Overstreet <koverstreet@google.com> + * + * This implements a refcount with similar semantics to atomic_t - atomic_inc(), + * atomic_dec_and_test() - but potentially percpu. + * + * There's one important difference between percpu refs and normal atomic_t + * refcounts; you have to keep track of your initial refcount, and then when you + * start shutting down you call percpu_ref_kill() _before_ dropping the initial + * refcount. + * + * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the + * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() + * puts the ref back in single atomic_t mode, collecting the per cpu refs and + * issuing the appropriate barriers, and then marks the ref as shutting down so + * that percpu_ref_put() will check for the ref hitting 0. After it returns, + * it's safe to drop the initial ref. 
+ * + * BACKGROUND: + * + * Percpu refcounts are quite useful for performance, but if we blindly + * converted all refcounts to percpu counters we'd waste quite a bit of memory. + * + * Think about all the refcounts embedded in kobjects, files, etc. most of which + * aren't used much. These start out as simple atomic counters - a little bigger + * than a bare atomic_t, 16 bytes instead of 4 - but if we exceed some arbitrary + * number of gets in one second, we then switch to percpu counters. + * + * This heuristic isn't perfect because it'll fire if the refcount was only + * being used on one cpu; ideally we'd be able to count the number of cache + * misses on percpu_ref_get() or something similar, but that'd make the non + * percpu path significantly heavier/more complex. We can count the number of + * gets() without any extra atomic instructions on arches that support + * atomic64_t - simply by changing the atomic_inc() to atomic_add_return(). + * + * USAGE: + * + * See fs/aio.c for some example usage; it's used there for struct kioctx, which + * is created when userspace calls io_setup(), and destroyed when userspace + * calls io_destroy() or the process exits. + * + * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it + * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove + * the kioctx from the process's list of kioctxs - after that, there can't be + * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop + * the initial ref with percpu_ref_put(). + * + * Code that does a two stage shutdown like this often needs some kind of + * explicit synchronization to ensure the initial refcount can only be dropped + * once - percpu_ref_kill() does this for you, it returns true once and false if + * someone else already called it. The aio code uses it this way, but it's not + * necessary if the code has some other mechanism to synchronize teardown.
+ * + * As mentioned previously, we decide when to convert a ref to percpu counters + * in percpu_ref_get(). However, since percpu_ref_get() will often be called + * with rcu_read_lock() held, it's not done there - percpu_ref_get() returns + * true if the ref should be converted to percpu counters. + * + * The caller should then call percpu_ref_alloc() after dropping + * rcu_read_lock(); if there is an uncommonly used codepath where it's + * inconvenient to call percpu_ref_alloc() after get(), it may be safely skipped + * and percpu_ref_get() will return true again the next time the counter wraps + * around. + */ + +#ifndef _LINUX_PERCPU_REFCOUNT_H +#define _LINUX_PERCPU_REFCOUNT_H + +#include <linux/atomic.h> +#include <linux/percpu.h> + +struct percpu_ref { + atomic64_t count; + unsigned long pcpu_count; +}; + +void percpu_ref_init(struct percpu_ref *ref); +void __percpu_ref_get(struct percpu_ref *ref, bool alloc); +int percpu_ref_put(struct percpu_ref *ref); + +int percpu_ref_kill(struct percpu_ref *ref); +int percpu_ref_dead(struct percpu_ref *ref); + +/** + * percpu_ref_get - increment a dynamic percpu refcount + * + * Increments @ref and possibly converts it to percpu counters. Must be called + * with rcu_read_lock() held, and may potentially drop/reacquire rcu_read_lock() + * to allocate percpu counters - if sleeping/allocation isn't safe for some + * other reason (e.g. a spinlock), see percpu_ref_get_noalloc(). + * + * Analogous to atomic_inc(). + */ +static inline void percpu_ref_get(struct percpu_ref *ref) +{ + __percpu_ref_get(ref, true); +} + +/** + * percpu_ref_get_noalloc - increment a dynamic percpu refcount + * + * Increments @ref, to be used when it's not safe to allocate percpu counters. + * Must be called with rcu_read_lock() held. + * + * Analogous to atomic_inc().
+ */ +static inline void percpu_ref_get_noalloc(struct percpu_ref *ref) +{ + __percpu_ref_get(ref, false); +} + +#endif diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 215e5e3dda10..d56406aaffe1 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/bug.h> #include <linux/mm.h> +#include <linux/workqueue.h> #include <linux/threads.h> #include <linux/nsproxy.h> #include <linux/kref.h> @@ -13,7 +14,9 @@ struct pidmap { void *page; }; -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) +#define BITS_PER_PAGE (PAGE_SIZE * 8) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) +#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE) struct bsd_acct_struct; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 8307f2f94d86..bd828e095359 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -68,16 +68,16 @@ struct proc_dir_entry { * If you're allocating ->proc_fops dynamically, save a pointer * somewhere. 
*/ - const struct file_operations *proc_fops; + const struct file_operations __rcu *proc_fops; struct proc_dir_entry *next, *parent, *subdir; void *data; read_proc_t *read_proc; write_proc_t *write_proc; atomic_t count; /* use count */ - int pde_users; /* number of callers into module in progress */ + atomic_t pde_users; /* number of callers into module in progress */ struct completion *pde_unload_completion; + spinlock_t pde_openers_lock; struct list_head pde_openers; /* who did ->open, but not ->release */ - spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ u8 namelen; char name[]; }; diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 5bf5500db83d..69e37c2d1ea5 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -6,7 +6,13 @@ struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, extern struct dentry *ramfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); -#ifndef CONFIG_MMU +#ifdef CONFIG_MMU +static inline int +ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +{ + return 0; +} +#else extern int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize); extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, diff --git a/include/linux/random.h b/include/linux/random.h index 347ce553a306..3b9377d6b7a5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -29,13 +29,6 @@ u32 prandom_u32(void); void prandom_bytes(void *buf, int nbytes); void prandom_seed(u32 seed); -/* - * These macros are preserved for backward compatibility and should be - * removed as soon as a transition is finished. 
- */ -#define random32() prandom_u32() -#define srandom32(seed) prandom_seed(seed) - u32 prandom_u32_state(struct rnd_state *); void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); diff --git a/include/linux/rtc-pxa.h b/include/linux/rtc-pxa.h new file mode 100644 index 000000000000..71bc45f060fc --- /dev/null +++ b/include/linux/rtc-pxa.h @@ -0,0 +1,18 @@ +/* + * include/linux/rtc-pxa.h + * + * RTC PXA Header file + * + * Copyright (C) 2010 Marvell International Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_RTC_PXA_H +#define __LINUX_RTC_PXA_H + +extern int pxa_rtc_sync_time(unsigned int ticks); + +#endif /* __LINUX_RTC_PXA_H */ diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 580b24c8b8ca..c2c28975293c 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -133,7 +133,13 @@ extern struct rtc_device *rtc_device_register(const char *name, struct device *dev, const struct rtc_class_ops *ops, struct module *owner); +extern struct rtc_device *devm_rtc_device_register(struct device *dev, + const char *name, + const struct rtc_class_ops *ops, + struct module *owner); extern void rtc_device_unregister(struct rtc_device *rtc); +extern void devm_rtc_device_unregister(struct device *dev, + struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); diff --git a/include/linux/sched.h b/include/linux/sched.h index 9004f6e19eac..e20580dfc542 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -313,8 +313,6 @@ extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; -#include <linux/aio.h> - #ifdef CONFIG_MMU extern void arch_pick_mmap_layout(struct mm_struct *mm); extern unsigned long 
@@ -324,8 +322,6 @@ extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -extern void arch_unmap_area(struct mm_struct *, unsigned long); -extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif @@ -625,6 +621,7 @@ struct signal_struct { #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ +#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ /* * Pending notifications to parent. */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index bd6cf61142be..d4b7a184f08c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -50,7 +50,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, -#endif +#ifdef CONFIG_BALLOON_COMPACTION + COMPACTBALLOONISOLATED, /* isolated from balloon pagelist */ + COMPACTBALLOONMIGRATED, /* balloon page sucessfully migrated */ + COMPACTBALLOONRETURNED, /* putback to pagelist, not-migrated */ +#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_COMPACTION */ #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif diff --git a/include/linux/wait.h b/include/linux/wait.h index 7cb64d4b499d..ac38be2692d8 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -330,6 +330,92 @@ do { \ __ret; \ }) +#define __wait_event_hrtimeout(wq, condition, timeout, state) \ +({ \ + int __ret = 0; \ + DEFINE_WAIT(__wait); \ + struct hrtimer_sleeper __t; \ + \ + hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ + HRTIMER_MODE_REL); \ + hrtimer_init_sleeper(&__t, current); \ + if ((timeout).tv64 != 
KTIME_MAX) \ + hrtimer_start_range_ns(&__t.timer, timeout, \ + current->timer_slack_ns, \ + HRTIMER_MODE_REL); \ + \ + for (;;) { \ + prepare_to_wait(&wq, &__wait, state); \ + if (condition) \ + break; \ + if (state == TASK_INTERRUPTIBLE && \ + signal_pending(current)) { \ + __ret = -ERESTARTSYS; \ + break; \ + } \ + if (!__t.task) { \ + __ret = -ETIME; \ + break; \ + } \ + schedule(); \ + } \ + \ + hrtimer_cancel(&__t.timer); \ + destroy_hrtimer_on_stack(&__t.timer); \ + finish_wait(&wq, &__wait); \ + __ret; \ +}) + +/** + * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, as a ktime_t + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true or the timeout elapses. + * The @condition is checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * The function returns 0 if @condition became true, or -ETIME if the timeout + * elapsed. + */ +#define wait_event_hrtimeout(wq, condition, timeout) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __ret = __wait_event_hrtimeout(wq, condition, timeout, \ + TASK_UNINTERRUPTIBLE); \ + __ret; \ +}) + +/** + * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, as a ktime_t + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true or a signal is received. + * The @condition is checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. 
+ * + * The function returns 0 if @condition became true, -ERESTARTSYS if it was + * interrupted by a signal, or -ETIME if the timeout elapsed. + */ +#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ +({ \ + long __ret = 0; \ + if (!(condition)) \ + __ret = __wait_event_hrtimeout(wq, condition, timeout, \ + TASK_INTERRUPTIBLE); \ + __ret; \ +}) + #define __wait_event_interruptible_exclusive(wq, condition, ret) \ do { \ DEFINE_WAIT(__wait); \ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9a9367c0c076..579a5007c696 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -5,6 +5,7 @@ #define WRITEBACK_H #include <linux/sched.h> +#include <linux/workqueue.h> #include <linux/fs.h> DECLARE_PER_CPU(int, dirty_throttle_leaks); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 68c69d54d392..108ebe8312f2 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -874,7 +874,7 @@ struct ip_vs_app { struct ipvs_master_sync_state { struct list_head sync_queue; struct ip_vs_sync_buff *sync_buff; - int sync_queue_len; + unsigned long sync_queue_len; unsigned int sync_queue_delay; struct task_struct *master_thread; struct delayed_work master_wakeup_work; @@ -966,7 +966,7 @@ struct netns_ipvs { int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; - int sysctl_sync_qlen_max; + unsigned long sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; @@ -1052,7 +1052,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return ACCESS_ONCE(ipvs->sysctl_sync_ports); } -static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) +static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_qlen_max; } @@ -1099,7 +1099,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return 1; } -static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) +static inline unsigned long 
sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return IPVS_SYNC_QLEN_MAX; } diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h new file mode 100644 index 000000000000..0421f49a20f7 --- /dev/null +++ b/include/trace/events/filemap.h @@ -0,0 +1,58 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM filemap + +#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FILEMAP_H + +#include <linux/types.h> +#include <linux/tracepoint.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/device.h> +#include <linux/kdev_t.h> + +DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, + + TP_PROTO(struct page *page), + + TP_ARGS(page), + + TP_STRUCT__entry( + __field(struct page *, page) + __field(unsigned long, i_ino) + __field(unsigned long, index) + __field(dev_t, s_dev) + ), + + TP_fast_assign( + __entry->page = page; + __entry->i_ino = page->mapping->host->i_ino; + __entry->index = page->index; + if (page->mapping->host->i_sb) + __entry->s_dev = page->mapping->host->i_sb->s_dev; + else + __entry->s_dev = page->mapping->host->i_rdev; + ), + + TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), + __entry->i_ino, + __entry->page, + page_to_pfn(__entry->page), + __entry->index << PAGE_SHIFT) +); + +DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache, + TP_PROTO(struct page *page), + TP_ARGS(page) + ); + +DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache, + TP_PROTO(struct page *page), + TP_ARGS(page) + ); + +#endif /* _TRACE_FILEMAP_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/printk.h b/include/trace/events/printk.h index 94ec79cc011a..c008bc99f9fa 100644 --- a/include/trace/events/printk.h +++ b/include/trace/events/printk.h @@ -6,31 +6,18 @@ #include <linux/tracepoint.h> -TRACE_EVENT_CONDITION(console, - TP_PROTO(const char *log_buf, unsigned start, 
unsigned end, - unsigned log_buf_len), +TRACE_EVENT(console, + TP_PROTO(const char *text, size_t len), - TP_ARGS(log_buf, start, end, log_buf_len), - - TP_CONDITION(start != end), + TP_ARGS(text, len), TP_STRUCT__entry( - __dynamic_array(char, msg, end - start + 1) + __dynamic_array(char, msg, len + 1) ), TP_fast_assign( - if ((start & (log_buf_len - 1)) > (end & (log_buf_len - 1))) { - memcpy(__get_dynamic_array(msg), - log_buf + (start & (log_buf_len - 1)), - log_buf_len - (start & (log_buf_len - 1))); - memcpy((char *)__get_dynamic_array(msg) + - log_buf_len - (start & (log_buf_len - 1)), - log_buf, end & (log_buf_len - 1)); - } else - memcpy(__get_dynamic_array(msg), - log_buf + (start & (log_buf_len - 1)), - end - start); - ((char *)__get_dynamic_array(msg))[end - start] = 0; + memcpy(__get_dynamic_array(msg), text, len); + ((char *)__get_dynamic_array(msg))[len] = 0; ), TP_printk("%s", __get_str(msg)) diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index 2c267bcbb85c..8c99ce7202c5 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -25,6 +25,7 @@ #define EPOLL_CTL_ADD 1 #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +#define EPOLL_CTL_DISABLE 4 /* * Request the handling of system wakeup events so as to prevent system suspends diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index 022ab186a812..52ebcc89f306 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -5,6 +5,7 @@ /* has the defines to get at the registers. 
*/ +#include <linux/types.h> #define PTRACE_TRACEME 0 #define PTRACE_PEEKTEXT 1 @@ -52,6 +53,17 @@ #define PTRACE_INTERRUPT 0x4207 #define PTRACE_LISTEN 0x4208 +#define PTRACE_PEEKSIGINFO 0x4209 + +struct ptrace_peeksiginfo_args { + __u64 off; /* from which siginfo to start */ + __u32 flags; + __s32 nr; /* how many siginfos to take */ +}; + +/* Read signals from a shared (process wide) queue */ +#define PTRACE_PEEKSIGINFO_SHARED (1 << 0) + /* Wait extended result codes for the above trace options. */ #define PTRACE_EVENT_FORK 1 #define PTRACE_EVENT_VFORK 2 diff --git a/include/xen/tmem.h b/include/xen/tmem.h index 591550a22ac7..3930a90045ff 100644 --- a/include/xen/tmem.h +++ b/include/xen/tmem.h @@ -3,7 +3,15 @@ #include <linux/types.h> +#ifdef CONFIG_XEN_TMEM_MODULE +#define tmem_enabled true +#else /* defined in drivers/xen/tmem.c */ extern bool tmem_enabled; +#endif + +#ifdef CONFIG_XEN_SELFBALLOONING +extern int xen_selfballoon_init(bool, bool); +#endif #endif /* _XEN_TMEM_H */ diff --git a/init/Kconfig b/init/Kconfig index 601992c579a8..47963f5a82f4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -98,10 +98,13 @@ config HAVE_KERNEL_XZ config HAVE_KERNEL_LZO bool +config HAVE_KERNEL_LZ4 + bool + choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -168,6 +171,18 @@ config KERNEL_LZO size is about 10% bigger than gzip; however its speed (both compression and decompression) is the fastest. +config KERNEL_LZ4 + bool "LZ4" + depends on HAVE_KERNEL_LZ4 + help + LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding. 
+ A preliminary version of LZ4 de/compression tool is available at + <https://code.google.com/p/lz4/>. + + Its compression ratio is worse than LZO. The size of the kernel + is about 8% bigger than LZO. But the decompression speed is + faster than LZO. + endchoice config DEFAULT_HOSTNAME @@ -927,6 +942,23 @@ config MEMCG_KMEM the kmem extension can use it to guarantee that no group of processes will ever exhaust kernel resources alone. +config MEMCG_DEBUG_ASYNC_DESTROY + bool "Memory Resource Controller Debug asynchronous object destruction" + depends on MEMCG_KMEM || MEMCG_SWAP + default n + help + When a memcg is destroyed, the memory consumed by it may not be + immediately freed. This is because when some extensions are used, such + as swap or kernel memory, objects can outlive the group and hold a + reference to it. + + If this is the case, the dangling_memcgs file will show information + about what are the memcgs still alive, and which references are still + preventing it to be freed. There is nothing wrong with that, but it is + very useful when debugging, to know where this memory is being held. + This is a developer-oriented debugging facility only, and no + guarantees of interface stability will be given. 
+ config CGROUP_HUGETLB bool "HugeTLB Resource Controller for Control Groups" depends on RESOURCE_COUNTERS && HUGETLB_PAGE diff --git a/ipc/msg.c b/ipc/msg.c index 31cd1bf6af27..daeca1304885 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -66,6 +66,7 @@ struct msg_sender { #define SEARCH_EQUAL 2 #define SEARCH_NOTEQUAL 3 #define SEARCH_LESSEQUAL 4 +#define SEARCH_NUMBER 5 #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) @@ -583,6 +584,7 @@ static int testmsg(struct msg_msg *msg, long type, int mode) switch(mode) { case SEARCH_ANY: + case SEARCH_NUMBER: return 1; case SEARCH_LESSEQUAL: if (msg->m_type <=type) @@ -738,6 +740,8 @@ SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, static inline int convert_mode(long *msgtyp, int msgflg) { + if (msgflg & MSG_COPY) + return SEARCH_NUMBER; /* * find message of correct type. * msgtyp = 0 => get first. @@ -774,14 +778,10 @@ static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) * This function creates new kernel message structure, large enough to store * bufsz message bytes. */ -static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz, - int msgflg, long *msgtyp, - unsigned long *copy_number) +static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { struct msg_msg *copy; - *copy_number = *msgtyp; - *msgtyp = 0; /* * Create dummy message to copy real message to. 
*/ @@ -797,9 +797,7 @@ static inline void free_copy(struct msg_msg *copy) free_msg(copy); } #else -static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz, - int msgflg, long *msgtyp, - unsigned long *copy_number) +static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { return ERR_PTR(-ENOSYS); } @@ -809,6 +807,30 @@ static inline void free_copy(struct msg_msg *copy) } #endif +struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) +{ + struct msg_msg *msg; + long count = 0; + + list_for_each_entry(msg, &msq->q_messages, m_list) { + if (testmsg(msg, *msgtyp, mode) && + !security_msg_queue_msgrcv(msq, msg, current, + *msgtyp, mode)) { + if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) { + *msgtyp = msg->m_type - 1; + } else if (mode == SEARCH_NUMBER) { + if (*msgtyp == count) + return msg; + } else + return msg; + count++; + } + } + + return ERR_PTR(-EAGAIN); +} + + long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) @@ -818,18 +840,17 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int mode; struct ipc_namespace *ns; struct msg_msg *copy = NULL; - unsigned long copy_number = 0; ns = current->nsproxy->ipc_ns; if (msqid < 0 || (long) bufsz < 0) return -EINVAL; if (msgflg & MSG_COPY) { - copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax), - msgflg, &msgtyp, &copy_number); + copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); if (IS_ERR(copy)) return PTR_ERR(copy); } + mode = convert_mode(&msgtyp, msgflg); msq = msg_lock_check(ns, msqid); @@ -840,44 +861,13 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, for (;;) { struct msg_receiver msr_d; - struct list_head *tmp; - long msg_counter = 0; msg = ERR_PTR(-EACCES); if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; - msg = ERR_PTR(-EAGAIN); - tmp = msq->q_messages.next; - while (tmp != 
&msq->q_messages) { - struct msg_msg *walk_msg; - - walk_msg = list_entry(tmp, struct msg_msg, m_list); - if (testmsg(walk_msg, msgtyp, mode) && - !security_msg_queue_msgrcv(msq, walk_msg, current, - msgtyp, mode)) { - - msg = walk_msg; - if (mode == SEARCH_LESSEQUAL && - walk_msg->m_type != 1) { - msgtyp = walk_msg->m_type - 1; - } else if (msgflg & MSG_COPY) { - if (copy_number == msg_counter) { - /* - * Found requested message. - * Copy it. - */ - msg = copy_msg(msg, copy); - if (IS_ERR(msg)) - goto out_unlock; - break; - } - } else - break; - msg_counter++; - } - tmp = tmp->next; - } + msg = find_msg(msq, &msgtyp, mode); + if (!IS_ERR(msg)) { /* * Found a suitable message. @@ -891,8 +881,10 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, * If we are copying, then do not unlink message and do * not update queue parameters. */ - if (msgflg & MSG_COPY) + if (msgflg & MSG_COPY) { + msg = copy_msg(msg, copy); goto out_unlock; + } list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = get_seconds(); diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 5df8e4bf1db0..d43439e6eb47 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -17,7 +17,7 @@ #include <linux/ipc_namespace.h> #include <linux/utsname.h> #include <linux/proc_fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "util.h" @@ -37,59 +37,70 @@ struct ipc_namespace init_ipc_ns = { atomic_t nr_ipc_ns = ATOMIC_INIT(1); struct msg_msgseg { - struct msg_msgseg* next; + struct msg_msgseg *next; /* the next part of the message follows immediately */ }; -#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) -#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) +#define DATALEN_MSG (int)(PAGE_SIZE-sizeof(struct msg_msg)) +#define DATALEN_SEG (int)(PAGE_SIZE-sizeof(struct msg_msgseg)) -struct msg_msg *load_msg(const void __user *src, int len) + +static struct msg_msg *alloc_msg(int len) { struct msg_msg *msg; struct msg_msgseg **pseg; - int err; int alen; - alen = len; - if 
(alen > DATALEN_MSG) - alen = DATALEN_MSG; - + alen = min(len, DATALEN_MSG); msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL); if (msg == NULL) - return ERR_PTR(-ENOMEM); + return NULL; msg->next = NULL; msg->security = NULL; - if (copy_from_user(msg + 1, src, alen)) { - err = -EFAULT; - goto out_err; - } - len -= alen; - src = ((char __user *)src) + alen; pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; - alen = len; - if (alen > DATALEN_SEG) - alen = DATALEN_SEG; - seg = kmalloc(sizeof(*seg) + alen, - GFP_KERNEL); - if (seg == NULL) { - err = -ENOMEM; + alen = min(len, DATALEN_SEG); + seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL); + if (seg == NULL) goto out_err; - } *pseg = seg; seg->next = NULL; - if (copy_from_user(seg + 1, src, alen)) { - err = -EFAULT; - goto out_err; - } pseg = &seg->next; len -= alen; - src = ((char __user *)src) + alen; + } + + return msg; + +out_err: + free_msg(msg); + return NULL; +} + +struct msg_msg *load_msg(const void __user *src, int len) +{ + struct msg_msg *msg; + struct msg_msgseg *seg; + int err = -EFAULT; + int alen; + + msg = alloc_msg(len); + if (msg == NULL) + return ERR_PTR(-ENOMEM); + + alen = min(len, DATALEN_MSG); + if (copy_from_user(msg + 1, src, alen)) + goto out_err; + + for (seg = msg->next; seg != NULL; seg = seg->next) { + len -= alen; + src = (char __user *)src + alen; + alen = min(len, DATALEN_SEG); + if (copy_from_user(seg + 1, src, alen)) + goto out_err; } err = security_msg_msg_alloc(msg); @@ -113,23 +124,16 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) if (src->m_ts > dst->m_ts) return ERR_PTR(-EINVAL); - alen = len; - if (alen > DATALEN_MSG) - alen = DATALEN_MSG; - + alen = min(len, DATALEN_MSG); memcpy(dst + 1, src + 1, alen); - len -= alen; - dst_pseg = dst->next; - src_pseg = src->next; - while (len > 0) { - alen = len; - if (alen > DATALEN_SEG) - alen = DATALEN_SEG; - memcpy(dst_pseg + 1, src_pseg + 1, alen); - dst_pseg = dst_pseg->next; + for (dst_pseg = dst->next, 
src_pseg = src->next; + src_pseg != NULL; + dst_pseg = dst_pseg->next, src_pseg = src_pseg->next) { + len -= alen; - src_pseg = src_pseg->next; + alen = min(len, DATALEN_SEG); + memcpy(dst_pseg + 1, src_pseg + 1, alen); } dst->m_type = src->m_type; @@ -148,24 +152,16 @@ int store_msg(void __user *dest, struct msg_msg *msg, int len) int alen; struct msg_msgseg *seg; - alen = len; - if (alen > DATALEN_MSG) - alen = DATALEN_MSG; + alen = min(len, DATALEN_MSG); if (copy_to_user(dest, msg + 1, alen)) return -1; - len -= alen; - dest = ((char __user *)dest) + alen; - seg = msg->next; - while (len > 0) { - alen = len; - if (alen > DATALEN_SEG) - alen = DATALEN_SEG; + for (seg = msg->next; seg != NULL; seg = seg->next) { + len -= alen; + dest = (char __user *)dest + alen; + alen = min(len, DATALEN_SEG); if (copy_to_user(dest, seg + 1, alen)) return -1; - len -= alen; - dest = ((char __user *)dest) + alen; - seg = seg->next; } return 0; } diff --git a/ipc/sem.c b/ipc/sem.c index 5b167d00efa6..a9234c7b4cac 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -61,8 +61,8 @@ * - A woken up task may not even touch the semaphore array anymore, it may * have been destroyed already by a semctl(RMID). * - The synchronizations between wake-ups due to a timeout/signal and a - * wake-up due to a completed semaphore operation is achieved by using an - * intermediate state (IN_WAKEUP). + * wake-up due to a completed semaphore operation is achieved by using a + * special wakeup scheme (queuewakeup_wait and support functions) * - UNDO values are stored in an array (one per process and per * semaphore array, lazily allocated). 
For backwards compatibility, multiple * modes for the UNDO variables are supported (per process, per thread) @@ -90,6 +90,135 @@ #include <asm/uaccess.h> #include "util.h" + +#ifdef CONFIG_PREEMPT_RT_BASE + #define SYSVSEM_COMPLETION 1 +#else + #define SYSVSEM_CUSTOM 1 +#endif + +#ifdef SYSVSEM_COMPLETION + /* Using a completion causes some overhead, but avoids a busy loop + * that increases the worst case latency. + */ + struct queue_done { + struct completion done; + }; + + static void queuewakeup_prepare(void) + { + /* no preparation necessary */ + } + + static void queuewakeup_completed(void) + { + /* empty */ + } + + static void queuewakeup_block(struct queue_done *qd) + { + /* empty */ + } + + static void queuewakeup_handsoff(struct queue_done *qd) + { + complete_all(&qd->done); + } + + static void queuewakeup_init(struct queue_done *qd) + { + init_completion(&qd->done); + } + + static void queuewakeup_wait(struct queue_done *qd) + { + wait_for_completion(&qd->done); + } + +#elif defined(SYSVSEM_SPINLOCK) + /* Note: Spinlocks do not work because: + * - lockdep complains [could be fixed] + * - only 255 concurrent spin_lock() calls are permitted, then the + * preempt-counter overflows + */ +#error SYSVSEM_SPINLOCK is a prove of concept, does not work. 
+ struct queue_done { + spinlock_t done; + }; + + static void queuewakeup_prepare(void) + { + /* empty */ + } + + static void queuewakeup_completed(void) + { + /* empty */ + } + + static void queuewakeup_block(struct queue_done *qd) + { + BUG_ON(spin_is_locked(&qd->done)); + spin_lock(&qd->done); + } + + static void queuewakeup_handsoff(struct queue_done *qd) + { + spin_unlock(&qd->done); + } + + static void queuewakeup_init(struct queue_done *qd) + { + spin_lock_init(&qd->done); + } + + static void queuewakeup_wait(struct queue_done *qd) + { + spin_unlock_wait(&qd->done); + } +#else + struct queue_done { + atomic_t done; + }; + + static void queuewakeup_prepare(void) + { + preempt_disable(); + } + + static void queuewakeup_completed(void) + { + preempt_enable(); + } + + static void queuewakeup_block(struct queue_done *qd) + { + BUG_ON(atomic_read(&qd->done) != 1); + atomic_set(&qd->done, 2); + } + + static void queuewakeup_handsoff(struct queue_done *qd) + { + BUG_ON(atomic_read(&qd->done) != 2); + smp_mb(); + atomic_set(&qd->done, 1); + } + + static void queuewakeup_init(struct queue_done *qd) + { + atomic_set(&qd->done, 1); + } + + static void queuewakeup_wait(struct queue_done *qd) + { + while (atomic_read(&qd->done) != 1) + cpu_relax(); + + smp_mb(); + } +#endif + + /* One semaphore structure for each semaphore in the system. */ struct sem { int semval; /* current value */ @@ -108,6 +237,7 @@ struct sem_queue { struct sembuf *sops; /* array of pending operations */ int nsops; /* number of operations */ int alter; /* does *sops alter the array? */ + struct queue_done done; /* completion synchronization */ }; /* Each task has a list of undo requests. They are executed automatically @@ -245,23 +375,27 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * - queue.status is initialized to -EINTR before blocking. 
* - wakeup is performed by * * unlinking the queue entry from sma->sem_pending - * * setting queue.status to IN_WAKEUP - * This is the notification for the blocked thread that a - * result value is imminent. + * * setting queue.status to the actual result code + * This is the notification for the blocked thread that someone + * (usually: update_queue()) completed the semtimedop() operation. * * call wake_up_process - * * set queue.status to the final value. + * * queuewakeup_handsoff(&q->done); * - the previously blocked thread checks queue.status: - * * if it's IN_WAKEUP, then it must wait until the value changes - * * if it's not -EINTR, then the operation was completed by - * update_queue. semtimedop can return queue.status without - * performing any operation on the sem array. - * * otherwise it must acquire the spinlock and check what's up. + * * if it's not -EINTR, then someone completed the operation. + * First, queuewakeup_wait() must be called. Afterwards, + * semtimedop must return queue.status without performing any + * operation on the sem array. + * - otherwise it must acquire the spinlock and repeat the test + * - If it is still -EINTR, then no update_queue() completed the + * operation, thus semtimedop() can proceed normally. * - * The two-stage algorithm is necessary to protect against the following + * queuewakeup_wait() is necessary to protect against the following * races: * - if queue.status is set after wake_up_process, then the woken up idle * thread could race forward and try (and fail) to acquire sma->lock - * before update_queue had a chance to set queue.status + * before update_queue had a chance to set queue.status. + * More importantly, it would mean that wake_up_process must be done + * while holding sma->lock, i.e. this would reduce the scalability. 
* - if queue.status is written before wake_up_process and if the * blocked process is woken up by a signal between writing * queue.status and the wake_up_process, then the woken up @@ -271,7 +405,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * (yes, this happened on s390 with sysv msg). * */ -#define IN_WAKEUP 1 /** * newary - Create a new semaphore set @@ -461,15 +594,11 @@ undo: static void wake_up_sem_queue_prepare(struct list_head *pt, struct sem_queue *q, int error) { - if (list_empty(pt)) { - /* - * Hold preempt off so that we don't get preempted and have the - * wakee busy-wait until we're scheduled back on. - */ - preempt_disable(); - } - q->status = IN_WAKEUP; - q->pid = error; + if (list_empty(pt)) + queuewakeup_prepare(); + + queuewakeup_block(&q->done); + q->status = error; list_add_tail(&q->simple_list, pt); } @@ -480,8 +609,8 @@ static void wake_up_sem_queue_prepare(struct list_head *pt, * * Do the actual wake-up. * The function is called without any locks held, thus the semaphore array - * could be destroyed already and the tasks can disappear as soon as the - * status is set to the actual return code. + * could be destroyed already and the tasks can disappear as soon as + * queuewakeup_handsoff() is called. */ static void wake_up_sem_queue_do(struct list_head *pt) { @@ -491,12 +620,11 @@ static void wake_up_sem_queue_do(struct list_head *pt) did_something = !list_empty(pt); list_for_each_entry_safe(q, t, pt, simple_list) { wake_up_process(q->sleeper); - /* q can disappear immediately after writing q->status. 
*/ - smp_wmb(); - q->status = q->pid; + /* q can disappear immediately after completing q->done */ + queuewakeup_handsoff(&q->done); } if (did_something) - preempt_enable(); + queuewakeup_completed(); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) @@ -1331,33 +1459,6 @@ out: return un; } - -/** - * get_queue_result - Retrieve the result code from sem_queue - * @q: Pointer to queue structure - * - * Retrieve the return code from the pending queue. If IN_WAKEUP is found in - * q->status, then we must loop until the value is replaced with the final - * value: This may happen if a task is woken up by an unrelated event (e.g. - * signal) and in parallel the task is woken up by another task because it got - * the requested semaphores. - * - * The function can be called with or without holding the semaphore spinlock. - */ -static int get_queue_result(struct sem_queue *q) -{ - int error; - - error = q->status; - while (unlikely(error == IN_WAKEUP)) { - cpu_relax(); - error = q->status; - } - - return error; -} - - SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned, nsops, const struct timespec __user *, timeout) { @@ -1503,6 +1604,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, queue.status = -EINTR; queue.sleeper = current; + queuewakeup_init(&queue.done); sleep_again: current->state = TASK_INTERRUPTIBLE; @@ -1513,17 +1615,14 @@ sleep_again: else schedule(); - error = get_queue_result(&queue); + error = queue.status; if (error != -EINTR) { /* fast path: update_queue already obtained all requested - * resources. - * Perform a smp_mb(): User space could assume that semop() - * is a memory barrier: Without the mb(), the cpu could - * speculatively read in user space stale data that was - * overwritten by the previous owner of the semaphore. + * resources. Just ensure that update_queue completed + * its access to &queue. 
*/ - smp_mb(); + queuewakeup_wait(&queue.done); goto out_free; } @@ -1533,23 +1632,16 @@ sleep_again: /* * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. */ - error = get_queue_result(&queue); - - /* - * Array removed? If yes, leave without sem_unlock(). - */ - if (IS_ERR(sma)) { - goto out_free; - } - - - /* - * If queue.status != -EINTR we are woken up by another process. - * Leave without unlink_queue(), but with sem_unlock(). - */ - + error = queue.status; if (error != -EINTR) { - goto out_unlock_free; + /* If there is a return code, then we can leave immediately. */ + if (!IS_ERR(sma)) { + /* sem_lock() succeeded - then unlock */ + sem_unlock(sma); + } + /* Except that we must wait for the hands-off */ + queuewakeup_wait(&queue.done); + goto out_free; } /* diff --git a/kernel/acct.c b/kernel/acct.c index b9bd7f098ee5..a370ccb43840 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -201,7 +201,8 @@ static int acct_on(struct filename *pathname) struct bsd_acct_struct *acct = NULL; /* Difference from BSD - they don't do O_APPEND */ - file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + file = file_open_name(pathname, + O_WRONLY|O_APPEND|O_LARGEFILE|O_NONBLOCK, 0); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/kernel/fork.c b/kernel/fork.c index 7d40687b1434..48c89a053959 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -70,6 +70,7 @@ #include <linux/khugepaged.h> #include <linux/signalfd.h> #include <linux/uprobes.h> +#include <linux/aio.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -364,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -539,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->nr_ptes = 0; memset(&mm->rss_stat, 0, 
sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/kernel/freezer.c b/kernel/freezer.c index c38893b0efba..595afabcb9a9 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -85,14 +85,21 @@ bool __refrigerator(bool check_kthr_stop) } EXPORT_SYMBOL(__refrigerator); -static void fake_signal_wake_up(struct task_struct *p) +static bool fake_signal_wake_up(struct task_struct *p) { unsigned long flags; + bool ret = false; + + if (p->flags & (PF_KTHREAD | PF_DUMPCORE)) + return ret; if (lock_task_sighand(p, &flags)) { - signal_wake_up(p, 0); + ret = !(p->flags & PF_DUMPCORE); + if (ret) + signal_wake_up(p, 0); unlock_task_sighand(p, &flags); } + return ret; } /** @@ -100,8 +107,8 @@ static void fake_signal_wake_up(struct task_struct *p) * @p: task to send the request to * * If @p is freezing, the freeze request is sent either by sending a fake - * signal (if it's not a kernel thread) or waking it up (if it's a kernel - * thread). + * signal (if it's not a kernel thread or a coredumping thread) or waking + * it up otherwise. 
* * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise @@ -116,9 +123,7 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_KTHREAD)) - fake_signal_wake_up(p); - else + if (!fake_signal_wake_up(p)) wake_up_state(p, TASK_INTERRUPTIBLE); spin_unlock_irqrestore(&freezer_lock, flags); diff --git a/kernel/kexec.c b/kernel/kexec.c index bddd3d7a74b6..9a1064586483 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image, struct kexec_segment *segment) { unsigned long maddr; - unsigned long ubytes, mbytes; + size_t ubytes, mbytes; int result; unsigned char __user *buf; @@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image, /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) - uchunk = ubytes; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); result = copy_from_user(ptr, buf, uchunk); kunmap(page); @@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image, * We do things a page at a time for the sake of kmap. */ unsigned long maddr; - unsigned long ubytes, mbytes; + size_t ubytes, mbytes; int result; unsigned char __user *buf; @@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ptr = kmap(page); ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) { - uchunk = ubytes; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } @@ -1452,14 +1445,13 @@ void vmcoreinfo_append_str(const char *fmt, ...) 
{ va_list args; char buf[0x50]; - int r; + size_t r; va_start(args, fmt); r = vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - if (r + vmcoreinfo_size > vmcoreinfo_max_size) - r = vmcoreinfo_max_size - vmcoreinfo_size; + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_lock(lock); } @@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_unlock(lock); preempt_enable(); @@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_lock(lock); } @@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_unlock(lock); preempt_enable(); @@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) int i; preempt_disable(); - rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); @@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) { int i; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, 
_RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); diff --git a/kernel/pid.c b/kernel/pid.c index 047dc6264638..27b518613d96 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -51,9 +51,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define BITS_PER_PAGE (PAGE_SIZE*8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) - static inline int mk_pid(struct pid_namespace *pid_ns, struct pidmap *map, int off) { diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c6023..f158e271fe44 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -19,8 +19,6 @@ #include <linux/reboot.h> #include <linux/export.h> -#define BITS_PER_PAGE (PAGE_SIZE*8) - struct pid_cache { int nr_ids; char name[16]; diff --git a/kernel/printk.c b/kernel/printk.c index 4d36fe337066..e32a96df1204 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/memblock.h> +#include <linux/aio.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -621,6 +622,9 @@ static int devkmsg_open(struct inode *inode, struct file *file) struct devkmsg_user *user; int err; + if (dmesg_restrict && !capable(CAP_SYSLOG)) + return -EACCES; + /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; @@ -1266,7 +1270,7 @@ static void call_console_drivers(int level, const char *text, size_t len) { struct console *con; - trace_console(text, 0, len, len); + trace_console(text, len); if (!console_drivers) return; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index acbd28424d81..aed981a3f69c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -17,6 +17,7 @@ #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> +#include <linux/uio.h> #include <linux/audit.h> #include <linux/pid_namespace.h> #include <linux/syscalls.h> @@ -24,6 
+25,7 @@ #include <linux/regset.h> #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> +#include <linux/compat.h> static int ptrace_trapping_sleep_fn(void *flags) @@ -618,6 +620,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) return error; } +static int ptrace_peek_siginfo(struct task_struct *child, + unsigned long addr, + unsigned long data) +{ + struct ptrace_peeksiginfo_args arg; + struct sigpending *pending; + struct sigqueue *q; + int ret, i; + + ret = copy_from_user(&arg, (void __user *) addr, + sizeof(struct ptrace_peeksiginfo_args)); + if (ret) + return -EFAULT; + + if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED) + return -EINVAL; /* unknown flags */ + + if (arg.nr < 0) + return -EINVAL; + + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) + pending = &child->signal->shared_pending; + else + pending = &child->pending; + + for (i = 0; i < arg.nr; ) { + siginfo_t info; + s32 off = arg.off + i; + + spin_lock_irq(&child->sighand->siglock); + list_for_each_entry(q, &pending->list, list) { + if (!off--) { + copy_siginfo(&info, &q->info); + break; + } + } + spin_unlock_irq(&child->sighand->siglock); + + if (off >= 0) /* beyond the end of the list */ + break; + +#ifdef CONFIG_COMPAT + if (unlikely(is_compat_task())) { + compat_siginfo_t __user *uinfo = compat_ptr(data); + + ret = copy_siginfo_to_user32(uinfo, &info); + ret |= __put_user(info.si_code, &uinfo->si_code); + } else +#endif + { + siginfo_t __user *uinfo = (siginfo_t __user *) data; + + ret = copy_siginfo_to_user(uinfo, &info); + ret |= __put_user(info.si_code, &uinfo->si_code); + } + + if (ret) { + ret = -EFAULT; + break; + } + + data += sizeof(siginfo_t); + i++; + + if (signal_pending(current)) + break; + + cond_resched(); + } + + if (i > 0) + return i; + + return ret; +} #ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) @@ -748,6 +825,10 @@ int ptrace_request(struct task_struct *child, long request, ret = 
put_user(child->ptrace_message, datalp); break; + case PTRACE_PEEKSIGINFO: + ret = ptrace_peek_siginfo(child, addr, data); + break; + case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2d5f94c1c7fb..d8534308fd05 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1441,7 +1441,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((random32() % (rcu_num_nodes * 8)) == 0 && + if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && system_state == SYSTEM_RUNNING) schedule_timeout_uninterruptible(2); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 4567fc020fe3..6815171a4fff 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -193,7 +193,7 @@ EXPORT_SYMBOL(up); struct semaphore_waiter { struct list_head list; struct task_struct *task; - int up; + bool up; }; /* @@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state, list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; - waiter.up = 0; + waiter.up = false; for (;;) { if (signal_pending_state(state, task)) goto interrupted; - if (timeout <= 0) + if (unlikely(timeout <= 0)) goto timed_out; __set_task_state(task, state); raw_spin_unlock_irq(&sem->lock); @@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem) struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); list_del(&waiter->list); - waiter->up = 1; + waiter->up = true; wake_up_process(waiter->task); } diff --git a/kernel/signal.c b/kernel/signal.c index dd72567767d9..115fa3db2122 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -854,12 +854,14 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal should be actually delivered, otherwise * it should be dropped. 
*/ -static int prepare_signal(int sig, struct task_struct *p, bool force) +static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; - if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { + if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { + if (signal->flags & SIGNAL_GROUP_COREDUMP) + return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. */ diff --git a/kernel/smp.c b/kernel/smp.c index 8e451f3ff51b..31670c8d8f89 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -12,6 +12,7 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <linux/hardirq.h> #include "smpboot.h" @@ -100,16 +101,16 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void csd_lock_wait(struct call_single_data *data) +static void csd_lock_wait(struct call_single_data *csd) { - while (data->flags & CSD_FLAG_LOCK) + while (csd->flags & CSD_FLAG_LOCK) cpu_relax(); } -static void csd_lock(struct call_single_data *data) +static void csd_lock(struct call_single_data *csd) { - csd_lock_wait(data); - data->flags = CSD_FLAG_LOCK; + csd_lock_wait(csd); + csd->flags = CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment @@ -119,16 +120,16 @@ static void csd_lock(struct call_single_data *data) smp_mb(); } -static void csd_unlock(struct call_single_data *data) +static void csd_unlock(struct call_single_data *csd) { - WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_mb(); - data->flags &= ~CSD_FLAG_LOCK; + csd->flags &= ~CSD_FLAG_LOCK; } /* @@ -137,7 +138,7 @@ static void csd_unlock(struct call_single_data *data) * ->func, ->info, and ->flags set. 
*/ static -void generic_exec_single(int cpu, struct call_single_data *data, int wait) +void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; @@ -145,7 +146,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); - list_add_tail(&data->list, &dst->list); + list_add_tail(&csd->list, &dst->list); raw_spin_unlock_irqrestore(&dst->lock, flags); /* @@ -163,7 +164,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) arch_send_call_function_single_ipi(cpu); if (wait) - csd_lock_wait(data); + csd_lock_wait(csd); } /* @@ -173,7 +174,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) void generic_smp_call_function_single_interrupt(void) { struct call_single_queue *q = &__get_cpu_var(call_single_queue); - unsigned int data_flags; LIST_HEAD(list); /* @@ -186,25 +186,26 @@ void generic_smp_call_function_single_interrupt(void) raw_spin_unlock(&q->lock); while (!list_empty(&list)) { - struct call_single_data *data; + struct call_single_data *csd; + unsigned int csd_flags; - data = list_entry(list.next, struct call_single_data, list); - list_del(&data->list); + csd = list_entry(list.next, struct call_single_data, list); + list_del(&csd->list); /* - * 'data' can be invalid after this call if flags == 0 + * 'csd' can be invalid after this call if flags == 0 * (when called through generic_exec_single()), * so save them away before making the call: */ - data_flags = data->flags; + csd_flags = csd->flags; - data->func(data->info); + csd->func(csd->info); /* * Unlocked CSDs are valid through generic_exec_single(): */ - if (data_flags & CSD_FLAG_LOCK) - csd_unlock(data); + if (csd_flags & CSD_FLAG_LOCK) + csd_unlock(csd); } } @@ -240,8 +241,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, * send smp call 
function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + WARN_ON_ONCE(cpu_online(this_cpu) + && (irqs_disabled() || in_serving_irq()) + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -249,16 +251,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, local_irq_restore(flags); } else { if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data = &d; + struct call_single_data *csd = &d; if (!wait) - data = &__get_cpu_var(csd_data); + csd = &__get_cpu_var(csd_data); - csd_lock(data); + csd_lock(csd); - data->func = func; - data->info = info; - generic_exec_single(cpu, data, wait); + csd->func = func; + csd->info = info; + generic_exec_single(cpu, csd, wait); } else { err = -ENXIO; /* CPU not online */ } @@ -325,7 +327,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); * pre-allocated data structure. Useful for embedding @data inside * other structures, for instance. 
*/ -void __smp_call_function_single(int cpu, struct call_single_data *data, +void __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) { unsigned int this_cpu; @@ -343,11 +345,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, if (cpu == this_cpu) { local_irq_save(flags); - data->func(data->info); + csd->func(csd->info); local_irq_restore(flags); } else { - csd_lock(data); - generic_exec_single(cpu, data, wait); + csd_lock(csd); + generic_exec_single(cpu, csd, wait); } put_cpu(); } @@ -369,7 +371,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { - struct call_function_data *data; + struct call_function_data *cfd; int cpu, next_cpu, this_cpu = smp_processor_id(); /* @@ -378,8 +380,9 @@ void smp_call_function_many(const struct cpumask *mask, * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress && !early_boot_irqs_disabled); + WARN_ON_ONCE(cpu_online(this_cpu) + && (irqs_disabled() || in_serving_irq()) + && !oops_in_progress && !early_boot_irqs_disabled); /* Try to fastpath. So, what's a CPU they want? Ignoring this one. 
*/ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -401,24 +404,24 @@ void smp_call_function_many(const struct cpumask *mask, return; } - data = &__get_cpu_var(cfd_data); + cfd = &__get_cpu_var(cfd_data); - cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, data->cpumask); + cpumask_and(cfd->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, cfd->cpumask); /* Some callers race with other cpus changing the passed mask */ - if (unlikely(!cpumask_weight(data->cpumask))) + if (unlikely(!cpumask_weight(cfd->cpumask))) return; /* - * After we put an entry into the list, data->cpumask - * may be cleared again when another CPU sends another IPI for - * a SMP function call, so data->cpumask will be zero. + * After we put an entry into the list, cfd->cpumask may be cleared + * again when another CPU sends another IPI for a SMP function call, so + * cfd->cpumask will be zero. */ - cpumask_copy(data->cpumask_ipi, data->cpumask); + cpumask_copy(cfd->cpumask_ipi, cfd->cpumask); - for_each_cpu(cpu, data->cpumask) { - struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); + for_each_cpu(cpu, cfd->cpumask) { + struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; @@ -433,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask, } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(data->cpumask_ipi); + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); if (wait) { - for_each_cpu(cpu, data->cpumask) { - struct call_single_data *csd = - per_cpu_ptr(data->csd, cpu); + for_each_cpu(cpu, cfd->cpumask) { + struct call_single_data *csd; + + csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } diff --git a/kernel/sys.c b/kernel/sys.c index fd2b5259ad7a..c0bfd5ec922c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1804,7 +1804,6 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -#ifdef 
CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; @@ -1998,17 +1997,12 @@ out: return error; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) { return put_user(me->clear_child_tid, tid_addr); } - -#else /* CONFIG_CHECKPOINT_RESTORE */ -static int prctl_set_mm(int opt, unsigned long addr, - unsigned long arg4, unsigned long arg5) -{ - return -EINVAL; -} +#else static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) { return -EINVAL; diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index f8b11a283171..12d6ebbfdd83 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -365,7 +365,7 @@ int init_test_probes(void) target2 = kprobe_target2; do { - rand1 = random32(); + rand1 = prandom_u32(); } while (rand1 <= div_factor); printk(KERN_INFO "Kprobe smoke test started\n"); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index af5a7e9f164b..7ccf16f0bcbc 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -133,7 +133,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - SEQ_printf(m, "\n"); SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); @@ -187,6 +186,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #undef P #undef P_ns + SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -195,7 +195,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; - SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); @@ -230,12 +229,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); 
SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m) { - int cpu; - #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -246,12 +244,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) #endif SEQ_printf(m, "\n"); #endif - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); - SEQ_printf(m, "\n"); } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } #endif static int timer_list_show(struct seq_file *m, void *v) @@ -259,34 +252,113 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.7\n"); - SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); - SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); - - for_each_online_cpu(cpu) + if (v == (void *)1) { + SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", + HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); + } else if (v < (void *)(unsigned long)(nr_cpu_ids + 2)) { + cpu = (unsigned long)(v - 2); print_cpu(m, cpu, now); + } +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (v == (void *)(unsigned long)nr_cpu_ids + 2) { + timer_list_show_tickdevices_header(m); + } else { + cpu = (unsigned long)(v - 3 - nr_cpu_ids); + print_tickdevice(m, tick_get_device(cpu), cpu); + } +#endif + return 0; +} - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); +/* + * This iterator really needs some explanation since it is offset and has + * two passes, one of which is controlled by a config option. 
+ * In hotpluggable systems some cpus, including cpu 0 and the last cpu, may + * be missing so we have to use cpumask_* to iterate over the cpus. + * For the first pass: + * It returns 1 for the header position. + * For cpu 0 it returns 2 and the final possible cpu would be nr_cpu_ids + 1. + * On the second pass: + * It returns nr_cpu_ids + 1 for the second header position. + * For cpu 0 it returns nr_cpu_ids + 2 + * The final possible cpu would be nr_cpu_ids + nr_cpu_ids + 2. + * It is also important to remember that cpumask_next returns >= nr_cpu_ids if + * no further cpus set. + */ +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; - return 0; + if (n == 0) + return (void *) 1; + + if (n < nr_cpu_ids + 1) { + n = cpumask_next(n - 2, cpu_online_mask); + if (n >= nr_cpu_ids) + n = nr_cpu_ids; + *offset = n + 1; + return (void *)(unsigned long)(n + 2); + } + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (n == nr_cpu_ids + 1) + return (void *)(unsigned long)(nr_cpu_ids + 2); + + if (n < nr_cpu_ids * 2 + 2) { + n -= (nr_cpu_ids + 2); + n = cpumask_next(n - 1, cpu_online_mask); + if (n >= nr_cpu_ids) + return NULL; + *offset = n + 2 + nr_cpu_ids; + return (void *)(unsigned long)(n + 3 + nr_cpu_ids); + } +#endif + + return NULL; +} + +static void *timer_list_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return timer_list_start(file, offset); } +static void timer_list_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + void sysrq_timer_list_show(void) { timer_list_show(NULL, NULL); } +static int timer_list_release(struct inode *inode, struct file *filep) +{ + seq_release(inode, filep); + + return 0; +} + static int timer_list_open(struct inode *inode, struct file *filp) { - return single_open(filp, timer_list_show, NULL); + return 
seq_open(filp, &timer_list_sops); } static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = timer_list_release, }; static int __init init_timer_list_procfs(void) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4a944676358e..7672bef6c88b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -239,10 +239,12 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) + if (hardlockup_panic) { + trigger_all_cpu_backtrace(); panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + } __this_cpu_write(hard_watchdog_warn, true); return; @@ -323,8 +325,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); - if (softlockup_panic) + if (softlockup_panic) { + trigger_all_cpu_backtrace(); panic("softlockup: hung tasks"); + } __this_cpu_write(soft_watchdog_warn, true); } else __this_cpu_write(soft_watchdog_warn, false); diff --git a/lib/Kconfig b/lib/Kconfig index 3958dc4389f9..d0864159a9b8 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -189,6 +189,9 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +config LZ4_DECOMPRESS + tristate + source "lib/xz/Kconfig" # @@ -213,6 +216,10 @@ config DECOMPRESS_LZO select LZO_DECOMPRESS tristate +config DECOMPRESS_LZ4 + select LZ4_DECOMPRESS + tristate + # # Generic allocator support is selected if needed # diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 28be08c09bab..ae805189e8d6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1292,6 +1292,24 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. 
+config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + bool + +config DEBUG_STRICT_USER_COPY_CHECKS + bool "Strict user copy size checks" + depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING + help + Enabling this option turns a certain set of sanity checks for user + copy operations into compile time failures. + + The copy_from_user() etc checks are there to help test if there + are sufficient security checks on the length argument of + the copy operation, by having gcc prove that the argument is + within bounds. + + If unsure, say N. + source mm/Kconfig.debug source kernel/trace/Kconfig diff --git a/lib/Makefile b/lib/Makefile index d7946ff75b2e..0feeb191ccae 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -13,8 +13,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o + earlycpio.o percpu-refcount.o +lib-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o @@ -72,6 +73,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ @@ -80,6 +82,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o +lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o diff --git a/lib/decompress.c b/lib/decompress.c index 31a804277282..c70810ea8590 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -11,6 +11,7 @@ #include <linux/decompress/unxz.h> 
#include <linux/decompress/inflate.h> #include <linux/decompress/unlzo.h> +#include <linux/decompress/unlz4.h> #include <linux/types.h> #include <linux/string.h> @@ -31,6 +32,9 @@ #ifndef CONFIG_DECOMPRESS_LZO # define unlzo NULL #endif +#ifndef CONFIG_DECOMPRESS_LZ4 +# define unlz4 NULL +#endif struct compress_format { unsigned char magic[2]; @@ -45,6 +49,7 @@ static const struct compress_format compressed_formats[] __initdata = { { {0x5d, 0x00}, "lzma", unlzma }, { {0xfd, 0x37}, "xz", unxz }, { {0x89, 0x4c}, "lzo", unlzo }, + { {0x02, 0x21}, "lz4", unlz4 }, { {0, 0}, NULL, NULL } }; diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c new file mode 100644 index 000000000000..3e67cfad16ad --- /dev/null +++ b/lib/decompress_unlz4.c @@ -0,0 +1,187 @@ +/* + * Wrapper for decompressing LZ4-compressed kernel, initramfs, and initrd + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifdef STATIC +#define PREBOOT +#include "lz4/lz4_decompress.c" +#else +#include <linux/decompress/unlz4.h> +#endif +#include <linux/types.h> +#include <linux/lz4.h> +#include <linux/decompress/mm.h> +#include <linux/compiler.h> + +#include <asm/unaligned.h> + +/* + * Note: Uncompressed chunk size is used in the compressor side + * (userspace side for compression). + * It is hardcoded because there is not proper way to extract it + * from the binary stream which is generated by the preliminary + * version of LZ4 tool so far. 
+ */ +#define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20) +#define ARCHIVE_MAGICNUMBER 0x184C2102 + +STATIC inline int INIT unlz4(u8 *input, int in_len, + int (*fill) (void *, unsigned int), + int (*flush) (void *, unsigned int), + u8 *output, int *posp, + void (*error) (char *x)) +{ + int ret = -1; + size_t chunksize = 0; + size_t uncomp_chunksize = LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE; + u8 *inp; + u8 *inp_start; + u8 *outp; + int size = in_len; +#ifdef PREBOOT + size_t out_len = get_unaligned_le32(input + in_len); +#endif + size_t dest_len; + + + if (output) { + outp = output; + } else if (!flush) { + error("NULL output pointer and no flush function provided"); + goto exit_0; + } else { + outp = large_malloc(uncomp_chunksize); + if (!outp) { + error("Could not allocate output buffer"); + goto exit_0; + } + } + + if (input && fill) { + error("Both input pointer and fill function provided,"); + goto exit_1; + } else if (input) { + inp = input; + } else if (!fill) { + error("NULL input pointer and missing fill function"); + goto exit_1; + } else { + inp = large_malloc(lz4_compressbound(uncomp_chunksize)); + if (!inp) { + error("Could not allocate input buffer"); + goto exit_1; + } + } + inp_start = inp; + + if (posp) + *posp = 0; + + if (fill) + fill(inp, 4); + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + } else { + error("invalid header"); + goto exit_2; + } + + if (posp) + *posp += 4; + + for (;;) { + + if (fill) + fill(inp, 4); + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + if (posp) + *posp += 4; + continue; + } + inp += 4; + size -= 4; + + if (posp) + *posp += 4; + + if (fill) { + if (chunksize > lz4_compressbound(uncomp_chunksize)) { + error("chunk length is longer than allocated"); + goto exit_2; + } + fill(inp, chunksize); + } +#ifdef PREBOOT + if (out_len >= uncomp_chunksize) { + dest_len = uncomp_chunksize; + out_len -= dest_len; 
+ } else + dest_len = out_len; + ret = lz4_decompress(inp, &chunksize, outp, dest_len); +#else + dest_len = uncomp_chunksize; + ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp, + &dest_len); +#endif + if (ret < 0) { + error("Decoding failed"); + goto exit_2; + } + + if (flush && flush(outp, dest_len) != dest_len) + goto exit_2; + if (output) + outp += dest_len; + if (posp) + *posp += chunksize; + + size -= chunksize; + + if (size == 0) + break; + else if (size < 0) { + error("data corrupted"); + goto exit_2; + } + + inp += chunksize; + if (fill) + inp = inp_start; + } + + ret = 0; +exit_2: + if (!input) + large_free(inp_start); +exit_1: + if (!output) + large_free(outp); +exit_0: + return ret; +} + +#ifdef PREBOOT +STATIC int INIT decompress(unsigned char *buf, int in_len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *posp, + void(*error)(char *x) + ) +{ + return unlz4(buf, in_len - 4, fill, flush, output, posp, error); +} +#endif diff --git a/lib/fault-inject.c b/lib/fault-inject.c index f7210ad6cffd..c5c7a762b850 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -122,7 +122,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) return false; } - if (attr->probability <= random32() % 100) + if (attr->probability <= prandom_u32() % 100) return false; if (!fail_stacktrace(attr)) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index fc2eeb7cb2ea..1ef4cc344977 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -1,3 +1,9 @@ +/* + * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com> + * + * Based on the shift-and-subtract algorithm for computing integer + * square root from Guy L. Steele. 
+ */ #include <linux/kernel.h> #include <linux/export.h> @@ -10,23 +16,23 @@ */ unsigned long int_sqrt(unsigned long x) { - unsigned long op, res, one; + unsigned long b, m, y = 0; - op = x; - res = 0; + if (x <= 1) + return x; - one = 1UL << (BITS_PER_LONG - 2); - while (one > op) - one >>= 2; + m = 1UL << (BITS_PER_LONG - 2); + while (m != 0) { + b = y + m; + y >>= 1; - while (one != 0) { - if (op >= res + one) { - op = op - (res + one); - res = res + 2 * one; + if (x >= b) { + x -= b; + y += m; } - res /= 2; - one /= 4; + m >>= 2; } - return res; + + return y; } EXPORT_SYMBOL(int_sqrt); diff --git a/lib/list_sort.c b/lib/list_sort.c index d7325c6b103f..1183fa70a44d 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -229,7 +229,7 @@ static int __init list_sort_test(void) goto exit; } /* force some equivalencies */ - el->value = random32() % (TEST_LIST_LEN/3); + el->value = prandom_u32() % (TEST_LIST_LEN / 3); el->serial = i; el->poison1 = TEST_POISON1; el->poison2 = TEST_POISON2; diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile new file mode 100644 index 000000000000..7f548c6d1c5c --- /dev/null +++ b/lib/lz4/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c new file mode 100644 index 000000000000..d3414eae73a1 --- /dev/null +++ b/lib/lz4/lz4_decompress.c @@ -0,0 +1,326 @@ +/* + * LZ4 Decompressor for Linux kernel + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * Based on LZ4 implementation by Yann Collet. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. 
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#ifndef STATIC +#include <linux/module.h> +#include <linux/kernel.h> +#endif +#include <linux/lz4.h> + +#include <asm/unaligned.h> + +#include "lz4defs.h" + +static int lz4_uncompress(const char *source, char *dest, int osize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *ref; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + osize; + BYTE *cpy; + unsigned token; + size_t length; + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + while (1) { + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + size_t len; + + len = *ip++; + for (; len == 255; length += 255) + len = *ip++; + length += len; + } + + /* copy literals */ + cpy = op + length; + if (unlikely(cpy > oend - COPYLENGTH)) { + /* + * Error: not enough place for another match + * (min 4) + 5 literals + */ + if (cpy != oend) + goto _output_error; + + memcpy(op, ip, length); + ip += length; + break; /* EOF */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + + /* Error: offset create reference outside destination buffer */ + if (unlikely(ref < (BYTE *const) dest)) + goto _output_error; + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + for (; *ip == 255; length += 255) + ip++; + length += *ip++; + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } 
+ cpy = op + length - (STEPSIZE - 4); + if (cpy > (oend - COPYLENGTH)) { + + /* Error: request to write beyond destination buffer */ + if (cpy > oend) + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *)ip) - source); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *)ip) - source)); +} + +static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, + int isize, size_t maxoutputsize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxoutputsize; + BYTE *cpy; + + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + /* Main Loop */ + while (ip < iend) { + + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + length += s; + } + } + /* copy literals */ + cpy = op + length; + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + + if (cpy > oend) + goto _output_error;/* writes beyond buffer */ + + if (ip + length != iend) + goto _output_error;/* + * Error: LZ4 format requires + * to consume all input + * at this stage + */ + memcpy(op, ip, length); + op += length; + break;/* Necessarily EOF, due to parsing restrictions */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + goto _output_error; + /* + * Error : offset creates 
reference + * outside of destination buffer + */ + + /* get matchlength */ + length = (token & ML_MASK); + if (length == ML_MASK) { + while (ip < iend) { + int s = *ip++; + length += s; + if (s == 255) + continue; + break; + } + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op - ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE-4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + goto _output_error; /* write outside of buf */ + + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *) op) - dest); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *) ip) - source)); +} + +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len) +{ + int ret = -1; + int input_len = 0; + + input_len = lz4_uncompress(src, dest, actual_dest_len); + if (input_len < 0) + goto exit_0; + *src_len = input_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress); +#endif + +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len) +{ + int ret = -1; + int out_len = 0; + + out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, + *dest_len); + if (out_len < 0) + goto exit_0; + *dest_len = out_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress_unknownoutputsize); + 
+MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Decompressor"); +#endif diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h new file mode 100644 index 000000000000..43ac31d63f36 --- /dev/null +++ b/lib/lz4/lz4defs.h @@ -0,0 +1,94 @@ +/* + * lz4defs.h -- architecture specific defines + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Detects 64 bits mode + */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \ + || defined(__ppc64__) || defined(__LP64__)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Architecture-specific macros + */ +#define BYTE u8 +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ + && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; + +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) + +#define PUT4(s, d) (A32(d) = A32(s)) +#define PUT8(s, d) (A64(d) = A64(s)) +#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + +#define PUT4(s, d) \ + put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) +#define PUT8(s, d) \ + put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) +#endif + +#define COPYLENGTH 8 +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) + +#if LZ4_ARCH64/* 64-bit */ +#define STEPSIZE 8 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT8(s, d); \ + d += 8; \ + s += 8; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) + +#define LZ4_SECURECOPY(s, d, e) \ + do { \ + if (d < e) { \ + LZ4_WILDCOPY(s, d, e); \ + } \ + } while (0) + +#else /* 32-bit */ +#define STEPSIZE 4 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + 
PUT4(s, d); \ + d += 4; \ + s += 4; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) \ + do { \ + LZ4_COPYSTEP(s, d); \ + LZ4_COPYSTEP(s, d); \ + } while (0) + +#define LZ4_SECURECOPY LZ4_WILDCOPY +#endif + +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + (d = s - get_unaligned_le16(p)) + +#define LZ4_WILDCOPY(s, d, e) \ + do { \ + LZ4_COPYPACKET(s, d); \ + } while (d < e) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c new file mode 100644 index 000000000000..79c61580a211 --- /dev/null +++ b/lib/percpu-refcount.c @@ -0,0 +1,243 @@ +#define pr_fmt(fmt) "%s: " fmt "\n", __func__ + +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/percpu-refcount.h> +#include <linux/rcupdate.h> + +/* + * A percpu refcount can be in 4 different modes. The state is tracked in the + * low two bits of percpu_ref->pcpu_count: + * + * PCPU_REF_NONE - the initial state, no percpu counters allocated. + * + * PCPU_REF_PTR - using percpu counters for the refcount. + * + * PCPU_REF_DYING - we're shutting down so get()/put() should use the embedded + * atomic counter, but we're not finished updating the atomic counter from the + * percpu counters - this means that percpu_ref_put() can't check for the ref + * hitting 0 yet. + * + * PCPU_REF_DEAD - we've finished the teardown sequence, percpu_ref_put() should + * now check for the ref hitting 0. + * + * In PCPU_REF_NONE mode, we need to count the number of times percpu_ref_get() + * is called; this is done with the high bits of the raw atomic counter. We also + * track the time, in jiffies, when the get count last wrapped - this is done + * with the remaining bits of percpu_ref->percpu_count. + * + * So, when percpu_ref_get() is called it increments the get count and checks if + * it wrapped; if it did, it checks if the last time it wrapped was less than + * one second ago; if so, we want to allocate percpu counters. 
+ * + * PCPU_COUNT_BITS determines the threshold where we convert to percpu: of the + * raw 64 bit counter, we use PCPU_COUNT_BITS for the refcount, and the + * remaining (high) bits to count the number of times percpu_ref_get() has been + * called. It's currently (completely arbitrarily) 16384 times in one second. + * + * Percpu mode (PCPU_REF_PTR): + * + * In percpu mode all we do on get and put is increment or decrement the cpu + * local counter, which is a 32 bit unsigned int. + * + * Note that all the gets() could be happening on one cpu, and all the puts() on + * another - the individual cpu counters can wrap (potentially many times). + * + * But this is fine because we don't need to check for the ref hitting 0 in + * percpu mode; before we set the state to PCPU_REF_DEAD we simply sum up all + * the percpu counters and add them to the atomic counter. Since addition and + * subtraction in modular arithmetic is still associative, the result will be + * correct. + */ + +#define PCPU_COUNT_BITS 50 +#define PCPU_COUNT_MASK ((1LL << PCPU_COUNT_BITS) - 1) + +#define PCPU_STATUS_BITS 2 +#define PCPU_STATUS_MASK ((1 << PCPU_STATUS_BITS) - 1) + +#define PCPU_REF_PTR 0 +#define PCPU_REF_NONE 1 +#define PCPU_REF_DYING 2 +#define PCPU_REF_DEAD 3 + +#define REF_STATUS(count) (count & PCPU_STATUS_MASK) + +/** + * percpu_ref_init - initialize a dynamic percpu refcount + * + * Initializes the refcount in single atomic counter mode with a refcount of 1; + * analogous to atomic_set(ref, 1). 
+ */ +void percpu_ref_init(struct percpu_ref *ref) +{ + unsigned long now = jiffies; + + atomic64_set(&ref->count, 1); + + now <<= PCPU_STATUS_BITS; + now |= PCPU_REF_NONE; + + ref->pcpu_count = now; +} + +static void percpu_ref_alloc(struct percpu_ref *ref, unsigned long pcpu_count) +{ + unsigned long new, now = jiffies; + + now <<= PCPU_STATUS_BITS; + now |= PCPU_REF_NONE; + + if (now - pcpu_count <= HZ << PCPU_STATUS_BITS) { + rcu_read_unlock(); + new = (unsigned long) alloc_percpu(unsigned); + rcu_read_lock(); + + if (!new) + goto update_time; + + BUG_ON(new & PCPU_STATUS_MASK); + + if (cmpxchg(&ref->pcpu_count, pcpu_count, new) != pcpu_count) + free_percpu((void __percpu *) new); + else + pr_debug("created"); + } else { +update_time: + new = now; + cmpxchg(&ref->pcpu_count, pcpu_count, new); + } +} + +void __percpu_ref_get(struct percpu_ref *ref, bool alloc) +{ + unsigned long pcpu_count; + uint64_t v; + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + if (REF_STATUS(pcpu_count) == PCPU_REF_PTR) { + /* for rcu - we're not using rcu_dereference() */ + smp_read_barrier_depends(); + __this_cpu_inc(*((unsigned __percpu *) pcpu_count)); + } else { + v = atomic64_add_return(1 + (1ULL << PCPU_COUNT_BITS), + &ref->count); + + if (!(v >> PCPU_COUNT_BITS) && + REF_STATUS(pcpu_count) == PCPU_REF_NONE && alloc) + percpu_ref_alloc(ref, pcpu_count); + } +} + +/** + * percpu_ref_put - decrement a dynamic percpu refcount + * + * Returns true if the result is 0, otherwise false; only checks for the ref + * hitting 0 after percpu_ref_kill() has been called. Analagous to + * atomic_dec_and_test(). 
+ */ +int percpu_ref_put(struct percpu_ref *ref) +{ + unsigned long pcpu_count; + uint64_t v; + int ret = 0; + + rcu_read_lock(); + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + switch (REF_STATUS(pcpu_count)) { + case PCPU_REF_PTR: + /* for rcu - we're not using rcu_dereference() */ + smp_read_barrier_depends(); + __this_cpu_dec(*((unsigned __percpu *) pcpu_count)); + break; + case PCPU_REF_NONE: + case PCPU_REF_DYING: + atomic64_dec(&ref->count); + break; + case PCPU_REF_DEAD: + v = atomic64_dec_return(&ref->count); + v &= PCPU_COUNT_MASK; + + ret = v == 0; + break; + } + + rcu_read_unlock(); + + return ret; +} + +/** + * percpu_ref_kill - prepare a dynamic percpu refcount for teardown + * + * Must be called before dropping the initial ref, so that percpu_ref_put() + * knows to check for the refcount hitting 0. If the refcount was in percpu + * mode, converts it back to single atomic counter mode. + * + * Returns true the first time called on @ref and false if @ref is already + * shutting down, so it may be used by the caller for synchronizing other parts + * of a two stage shutdown. 
+ */ +int percpu_ref_kill(struct percpu_ref *ref) +{ + unsigned long old, new, status, pcpu_count; + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + do { + status = REF_STATUS(pcpu_count); + + switch (status) { + case PCPU_REF_PTR: + new = PCPU_REF_DYING; + break; + case PCPU_REF_NONE: + new = PCPU_REF_DEAD; + break; + case PCPU_REF_DYING: + case PCPU_REF_DEAD: + return 0; + } + + old = pcpu_count; + pcpu_count = cmpxchg(&ref->pcpu_count, old, new); + } while (pcpu_count != old); + + if (status == PCPU_REF_PTR) { + unsigned count = 0, cpu; + + synchronize_rcu(); + + for_each_possible_cpu(cpu) + count += *per_cpu_ptr((unsigned __percpu *) pcpu_count, cpu); + + pr_debug("global %lli pcpu %i", + atomic64_read(&ref->count) & PCPU_COUNT_MASK, + (int) count); + + atomic64_add((int) count, &ref->count); + smp_wmb(); + /* Between setting global count and setting PCPU_REF_DEAD */ + ref->pcpu_count = PCPU_REF_DEAD; + + free_percpu((unsigned __percpu *) pcpu_count); + } + + return 1; +} + +/** + * percpu_ref_dead - check if a dynamic percpu refcount is shutting down + * + * Returns true if percpu_ref_kill() has been called on @ref, false otherwise. 
+ */ +int percpu_ref_dead(struct percpu_ref *ref) +{ + unsigned status = REF_STATUS(ref->pcpu_count); + + return status == PCPU_REF_DYING || + status == PCPU_REF_DEAD; +} diff --git a/lib/show_mem.c b/lib/show_mem.c index 4407f8c9b1f7..b7c72311ad0c 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -18,6 +18,9 @@ void show_mem(unsigned int filter) printk("Mem-Info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_online_pgdat(pgdat) { unsigned long i, flags; diff --git a/arch/s390/lib/usercopy.c b/lib/usercopy.c index 14b363fec8a2..4f5b1ddbcd25 100644 --- a/arch/s390/lib/usercopy.c +++ b/lib/usercopy.c @@ -1,5 +1,6 @@ -#include <linux/module.h> +#include <linux/export.h> #include <linux/bug.h> +#include <linux/uaccess.h> void copy_from_user_overflow(void) { diff --git a/lib/uuid.c b/lib/uuid.c index 52a6fe6387de..398821e4dce1 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -25,13 +25,7 @@ static void __uuid_gen_common(__u8 b[16]) { - int i; - u32 r; - - for (i = 0; i < 4; i++) { - r = random32(); - memcpy(b + i * 4, &r, 4); - } + prandom_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 07dbc8ec46cf..2c8ce496804f 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -242,6 +242,7 @@ bool balloon_page_isolate(struct page *page) if (__is_movable_balloon_page(page) && page_count(page) == 2) { __isolate_balloon_page(page); + balloon_event_count(COMPACTBALLOONISOLATED); unlock_page(page); return true; } @@ -265,6 +266,7 @@ void balloon_page_putback(struct page *page) __putback_balloon_page(page); /* drop the extra ref count taken for page isolation */ put_page(page); + balloon_event_count(COMPACTBALLOONRETURNED); } else { WARN_ON(1); dump_page(page); diff --git a/mm/cleancache.c b/mm/cleancache.c index d76ba74be2d0..5875f48ce279 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,20 +19,10 @@ #include <linux/cleancache.h> 
/* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/invalidate even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. - */ -int cleancache_enabled __read_mostly; -EXPORT_SYMBOL(cleancache_enabled); - -/* * cleancache_ops is set by cleancache_ops_register to contain the pointers * to the cleancache "backend" implementation functions. */ -static struct cleancache_ops cleancache_ops __read_mostly; +static struct cleancache_ops *cleancache_ops __read_mostly; /* * Counters available via /sys/kernel/debug/frontswap (if debugfs is @@ -45,15 +35,101 @@ static u64 cleancache_puts; static u64 cleancache_invalidates; /* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting + * When no backend is registered all calls to init_fs and init_shared_fs + * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or + * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array + * [shared_|]fs_poolid_map) are given to the respective super block + * (sb->cleancache_poolid) and no tmem_pools are created. When a backend + * registers with cleancache the previous calls to init_fs and init_shared_fs + * are executed to create tmem_pools and set the respective poolids. While no + * backend is registered all "puts", "gets" and "flushes" are ignored or failed. 
+ */ +#define MAX_INITIALIZABLE_FS 32 +#define FAKE_FS_POOLID_OFFSET 1000 +#define FAKE_SHARED_FS_POOLID_OFFSET 2000 + +#define FS_NO_BACKEND (-1) +#define FS_UNKNOWN (-2) +static int fs_poolid_map[MAX_INITIALIZABLE_FS]; +static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; +static char *uuids[MAX_INITIALIZABLE_FS]; +/* + * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads + * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple + * threads calling mount (and ending up in __cleancache_init_[shared|]fs). + */ +static DEFINE_MUTEX(poolid_mutex); +/* + * When set to false (default) all calls to the cleancache functions, except + * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded + * by the if (!cleancache_ops) return. This means multiple threads (from + * different filesystems) will be checking cleancache_ops. The usage of a + * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are + * OK if the time between the backend's have been initialized (and + * cleancache_ops has been set to not NULL) and when the filesystems start + * actually calling the backends. The inverse (when unloading) is obviously + * not good - but this shim does not do that (yet). + */ + +/* + * The backends and filesystems work all asynchronously. This is b/c the + * backends can be built as modules. + * The usual sequence of events is: + * a) mount / -> __cleancache_init_fs is called. We set the + * [shared_|]fs_poolid_map and uuids for. + * + * b). user does I/Os -> we call the rest of __cleancache_* functions + * which return immediately as cleancache_ops is false. + * + * c). modprobe zcache -> cleancache_register_ops. We init the backend + * and set cleancache_ops to true, and for any fs_poolid_map + * (which is set by __cleancache_init_fs) we initialize the poolid. + * + * d). user does I/Os -> now that cleancache_ops is true all the + * __cleancache_* functions can call the backend. 
They all check + * that fs_poolid_map is valid and if so invoke the backend. + * + * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is + * reset (which is the second check in the __cleancache_* ops + * to call the backend). + * + * The sequence of events could also be c), followed by a), and d). and e). The + * c) would not happen anymore. There is also the chance of c), and one thread + * doing a) + d), and another doing e). For that case we depend on the + * filesystem calling __cleancache_invalidate_fs in the proper sequence (so + * that it handles all I/Os before it invalidates the fs (which is last part + * of unmounting process). + * + * Note: The astute reader will notice that there is no "rmmod zcache" case. + * This is b/c the functionality for that is not yet implemented and when + * done, will require some extra locking not yet devised. + */ + +/* + * Register operations for cleancache, returning previous thus allowing + * detection of multiple backends and possible nesting. */ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) { - struct cleancache_ops old = cleancache_ops; + struct cleancache_ops *old = cleancache_ops; + int i; - cleancache_ops = *ops; - cleancache_enabled = 1; + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (fs_poolid_map[i] == FS_NO_BACKEND) + fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); + if (shared_fs_poolid_map[i] == FS_NO_BACKEND) + shared_fs_poolid_map[i] = ops->init_shared_fs + (uuids[i], PAGE_SIZE); + } + /* + * We MUST set cleancache_ops _after_ we have called the backends + * init_fs or init_shared_fs functions. Otherwise the compiler might + * re-order where cleancache_ops is set in this function. 
+ */ + barrier(); + cleancache_ops = ops; + mutex_unlock(&poolid_mutex); return old; } EXPORT_SYMBOL(cleancache_register_ops); @@ -61,15 +137,42 @@ EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); + int i; + + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (fs_poolid_map[i] == FS_UNKNOWN) { + sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; + if (cleancache_ops) + fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); + else + fs_poolid_map[i] = FS_NO_BACKEND; + break; + } + } + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) { - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); + int i; + + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (shared_fs_poolid_map[i] == FS_UNKNOWN) { + sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; + uuids[i] = uuid; + if (cleancache_ops) + shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs + (uuid, PAGE_SIZE); + else + shared_fs_poolid_map[i] = FS_NO_BACKEND; + break; + } + } + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -99,27 +202,53 @@ static int cleancache_get_key(struct inode *inode, } /* + * Returns a pool_id that is associated with a given fake poolid. 
+ */ +static int get_poolid_from_fake(int fake_pool_id) +{ + if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) + return shared_fs_poolid_map[fake_pool_id - + FAKE_SHARED_FS_POOLID_OFFSET]; + else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) + return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; + return FS_NO_BACKEND; +} + +/* * "Get" data from cleancache associated with the poolid/inode/index * that were specified when the data was put to cleanache and, if * successful, use it to fill the specified page with data and return 0. * The pageframe is unchanged and returns -1 if the get fails. * Page must be locked by caller. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ int __cleancache_get_page(struct page *page) { int ret = -1; int pool_id; + int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) { + cleancache_failed_gets++; + goto out; + } + VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) + fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (fake_pool_id < 0) goto out; + pool_id = get_poolid_from_fake(fake_pool_id); if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + if (pool_id >= 0) + ret = cleancache_ops->get_page(pool_id, + key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -134,17 +263,32 @@ EXPORT_SYMBOL(__cleancache_get_page); * (previously-obtained per-filesystem) poolid and the page's, * inode and page index. Page must be locked. Note that a put_page * always "succeeds", though a subsequent get_page may succeed or fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. 
*/ void __cleancache_put_page(struct page *page) { int pool_id; + int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) { + cleancache_puts++; + return; + } + VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; + fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (fake_pool_id < 0) + return; + + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_get_key(page->mapping->host, &key) >= 0) { + cleancache_ops->put_page(pool_id, key, page->index, page); cleancache_puts++; } } @@ -153,19 +297,31 @@ EXPORT_SYMBOL(__cleancache_put_page); /* * Invalidate any data from cleancache associated with the poolid and the * page's inode and page index so that a subsequent "get" will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... 
page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id; + int fake_pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; - if (pool_id >= 0) { + if (!cleancache_ops) + return; + + if (fake_pool_id >= 0) { + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id < 0) + return; + VM_BUG_ON(!PageLocked(page)); if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.invalidate_page)(pool_id, - key, page->index); + cleancache_ops->invalidate_page(pool_id, + key, page->index); cleancache_invalidates++; } } @@ -176,34 +332,63 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); * Invalidate all data from cleancache associated with the poolid and the * mappings's inode so that all subsequent gets to this poolid/inode * will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_inode(struct address_space *mapping) { - int pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id; + int fake_pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) + return; + + if (fake_pool_id < 0) + return; + + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.invalidate_inode)(pool_id, key); + cleancache_ops->invalidate_inode(pool_id, key); } EXPORT_SYMBOL(__cleancache_invalidate_inode); /* * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs + * note that pool_id is surrendered and may be returned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs. 
*/ void __cleancache_invalidate_fs(struct super_block *sb) { - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.invalidate_fs)(old_poolid); + int index; + int fake_pool_id = sb->cleancache_poolid; + int old_poolid = fake_pool_id; + + mutex_lock(&poolid_mutex); + if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { + index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; + old_poolid = shared_fs_poolid_map[index]; + shared_fs_poolid_map[index] = FS_UNKNOWN; + uuids[index] = NULL; + } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { + index = fake_pool_id - FAKE_FS_POOLID_OFFSET; + old_poolid = fs_poolid_map[index]; + fs_poolid_map[index] = FS_UNKNOWN; } + sb->cleancache_poolid = -1; + if (cleancache_ops) + cleancache_ops->invalidate_fs(old_poolid); + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { + int i; + #ifdef CONFIG_DEBUG_FS struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) @@ -215,6 +400,10 @@ static int __init init_cleancache(void) debugfs_create_u64("invalidates", S_IRUGO, root, &cleancache_invalidates); #endif + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + fs_poolid_map[i] = FS_UNKNOWN; + shared_fs_poolid_map[i] = FS_UNKNOWN; + } return 0; } module_init(init_cleancache) diff --git a/mm/dmapool.c b/mm/dmapool.c index c69781e97cf9..668f26316e2e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -132,6 +132,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; + int node; if (align == 0) { align = 1; @@ -156,7 +157,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, return NULL; } - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + node = WARN_ON(!dev) ? 
-1 : dev_to_node(dev); + + retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; diff --git a/mm/filemap.c b/mm/filemap.c index e1979fdca805..4ebaf95eb583 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,6 +35,9 @@ #include <linux/cleancache.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/filemap.h> + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ @@ -113,6 +116,7 @@ void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + trace_mm_filemap_delete_from_page_cache(page); /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave @@ -464,6 +468,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); } else { page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ @@ -2528,7 +2533,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); + if (!sb_start_file_write(file)) + return -EAGAIN; mutex_lock(&inode->i_mutex); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); diff --git a/mm/frontswap.c b/mm/frontswap.c index 2890e67d6026..538367ef1372 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -24,15 +24,7 @@ * frontswap_ops is set by frontswap_register_ops to contain the pointers * to the frontswap "backend" implementation functions. */ -static struct frontswap_ops frontswap_ops __read_mostly; - -/* - * This global enablement flag reduces overhead on systems where frontswap_ops - * has not been registered, so is preferred to the slower alternative: a - * function call that checks a non-global. 
- */ -bool frontswap_enabled __read_mostly; -EXPORT_SYMBOL(frontswap_enabled); +static struct frontswap_ops *frontswap_ops __read_mostly; /* * If enabled, frontswap_store will return failure even on success. As @@ -80,16 +72,70 @@ static inline void inc_frontswap_succ_stores(void) { } static inline void inc_frontswap_failed_stores(void) { } static inline void inc_frontswap_invalidates(void) { } #endif + +/* + * Due to the asynchronous nature of the backends loading potentially + * _after_ the swap system has been activated, we have chokepoints + * on all frontswap functions to not call the backend until the backend + * has registered. + * + * Specifically when no backend is registered (nobody called + * frontswap_register_ops) all calls to frontswap_init (which is done via + * swapon -> enable_swap_info -> frontswap_init) are registered and remembered + * (via the setting of need_init bitmap) but fail to create tmem_pools. When a + * backend registers with frontswap at some later point the previous + * calls to frontswap_init are executed (by iterating over the need_init + * bitmap) to create tmem_pools and set the respective poolids. All of that is + * guarded by us using atomic bit operations on the 'need_init' bitmap. + * + * This would not guard us against the user deciding to call swapoff right as + * we are calling the backend to initialize (so swapon is in action). + * Fortunately for us, the swapon_mutex has been taken by the callee so we are + * OK. The other scenario where calls to frontswap_store (called via + * swap_writepage) is racing with frontswap_invalidate_area (called via + * swapoff) is again guarded by the swap subsystem. + * + * While no backend is registered all calls to frontswap_[store|load| + * invalidate_area|invalidate_page] are ignored or fail.
+ * + * The time between the backend being registered and the swap file system + * calling the backend (via the frontswap_* functions) is indeterminate as + * frontswap_ops is not atomic_t (or a value guarded by a spinlock). + * That is OK as we are comfortable missing some of these calls to the newly + * registered backend. + * + * Obviously the opposite (unloading the backend) must be done after all + * the frontswap_[store|load|invalidate_area|invalidate_page] start + * ignoring or failing the requests - at which point frontswap_ops + * would have to be made in some fashion atomic. + */ +static DECLARE_BITMAP(need_init, MAX_SWAPFILES); + /* * Register operations for frontswap, returning previous thus allowing * detection of multiple backends and possible nesting. */ -struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) +struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) { - struct frontswap_ops old = frontswap_ops; - - frontswap_ops = *ops; - frontswap_enabled = true; + struct frontswap_ops *old = frontswap_ops; + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) { + if (test_and_clear_bit(i, need_init)) { + struct swap_info_struct *sis = swap_info[i]; + /* __frontswap_init _should_ have set it! */ + if (!sis->frontswap_map) + return ERR_PTR(-EINVAL); + ops->init(i); + } + } + /* + * We MUST have frontswap_ops set _after_ the frontswap_init's + * have been called. Otherwise __frontswap_store might fail. Hence + * the barrier to make sure compiler does not re-order us. + */ + barrier(); + frontswap_ops = ops; return old; } EXPORT_SYMBOL(frontswap_register_ops); @@ -115,20 +161,48 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); /* * Called when a swap device is swapon'd.
*/ -void __frontswap_init(unsigned type) +void __frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; BUG_ON(sis == NULL); - if (sis->frontswap_map == NULL) + + /* + * p->frontswap is a bitmap that we MUST have to figure out which page + * has gone in frontswap. Without it there is no point of continuing. + */ + if (WARN_ON(!map)) return; - frontswap_ops.init(type); + /* + * Regardless of whether the frontswap backend has been loaded + * before this function or it will be later, we _MUST_ have the + * p->frontswap set to something valid to work properly. + */ + frontswap_map_set(sis, map); + if (frontswap_ops) + frontswap_ops->init(type); + else { + BUG_ON(type > MAX_SWAPFILES); + set_bit(type, need_init); + } } EXPORT_SYMBOL(__frontswap_init); -static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +bool __frontswap_test(struct swap_info_struct *sis, + pgoff_t offset) +{ + bool ret = false; + + if (frontswap_ops && sis->frontswap_map) + ret = test_bit(offset, sis->frontswap_map); + return ret; +} +EXPORT_SYMBOL(__frontswap_test); + +static inline void __frontswap_clear(struct swap_info_struct *sis, + pgoff_t offset) { - frontswap_clear(sis, offset); + clear_bit(offset, sis->frontswap_map); atomic_dec(&sis->frontswap_pages); } @@ -147,13 +221,20 @@ int __frontswap_store(struct page *page) struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + /* + * Return if no backend registered. + * Don't need to inc frontswap_failed_stores here.
+ */ + if (!frontswap_ops) + return ret; + BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - if (frontswap_test(sis, offset)) + if (__frontswap_test(sis, offset)) dup = 1; - ret = frontswap_ops.store(type, offset, page); + ret = frontswap_ops->store(type, offset, page); if (ret == 0) { - frontswap_set(sis, offset); + set_bit(offset, sis->frontswap_map); inc_frontswap_succ_stores(); if (!dup) atomic_inc(&sis->frontswap_pages); @@ -188,13 +269,16 @@ int __frontswap_load(struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - if (frontswap_test(sis, offset)) - ret = frontswap_ops.load(type, offset, page); + /* + * __frontswap_test() will check whether there is backend registered + */ + if (__frontswap_test(sis, offset)) + ret = frontswap_ops->load(type, offset, page); if (ret == 0) { inc_frontswap_loads(); if (frontswap_tmem_exclusive_gets_enabled) { SetPageDirty(page); - frontswap_clear(sis, offset); + __frontswap_clear(sis, offset); } } return ret; @@ -210,8 +294,11 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) struct swap_info_struct *sis = swap_info[type]; BUG_ON(sis == NULL); - if (frontswap_test(sis, offset)) { - frontswap_ops.invalidate_page(type, offset); + /* + * __frontswap_test() will check whether there is backend registered + */ + if (__frontswap_test(sis, offset)) { + frontswap_ops->invalidate_page(type, offset); __frontswap_clear(sis, offset); inc_frontswap_invalidates(); } @@ -226,12 +313,15 @@ void __frontswap_invalidate_area(unsigned type) { struct swap_info_struct *sis = swap_info[type]; - BUG_ON(sis == NULL); - if (sis->frontswap_map == NULL) - return; - frontswap_ops.invalidate_area(type); - atomic_set(&sis->frontswap_pages, 0); - memset(sis->frontswap_map, 0, sis->max / sizeof(long)); + if (frontswap_ops) { + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + frontswap_ops->invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + memset(sis->frontswap_map, 0, sis->max / sizeof(long)); + } 
+ clear_bit(type, need_init); } EXPORT_SYMBOL(__frontswap_invalidate_area); diff --git a/mm/memblock.c b/mm/memblock.c index b8d9147e5c08..2cce8b3e76ed 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -771,6 +771,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; + if (WARN_ON(!align)) + align = __alignof__(long long); + /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b552224f5cf..690fa8c57832 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -310,14 +310,31 @@ struct mem_cgroup { /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; - /* For oom notifier event fd */ - struct list_head oom_notify; + union { + /* For oom notifier event fd */ + struct list_head oom_notify; + /* + * we can only trigger an oom event if the memcg is alive. + * so we will reuse this field to hook the memcg in the list + * of dead memcgs. + */ + struct list_head dead; + }; - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; + union { + /* + * Should we move charges of a task when a task is moved into + * this mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + + /* + * We are no longer concerned about moving charges after memcg + * is dead. So we will fill this up with its name, to aid + * debugging. + */ + char *memcg_name; + }; /* * set > 0 if pages under this cgroup are moving to other cgroup. 
*/ @@ -369,6 +386,55 @@ static size_t memcg_size(void) nr_node_ids * sizeof(struct mem_cgroup_per_node); } +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY +static LIST_HEAD(dangling_memcgs); +static DEFINE_MUTEX(dangling_memcgs_mutex); + +static inline void memcg_dangling_free(struct mem_cgroup *memcg) +{ + mutex_lock(&dangling_memcgs_mutex); + list_del(&memcg->dead); + mutex_unlock(&dangling_memcgs_mutex); + free_pages((unsigned long)memcg->memcg_name, 0); +} + +static inline void memcg_dangling_add(struct mem_cgroup *memcg) +{ + /* + * cgroup.c will do page-sized allocations most of the time, + * so we'll just follow the pattern. Also, __get_free_pages + * is a better interface than kmalloc for us here, because + * we'd like this memory to be always billed to the root cgroup, + * not to the process removing the memcg. While kmalloc would + * require us to wrap it into memcg_stop/resume_kmem_account, + * with __get_free_pages we just don't pass the memcg flag. + */ + memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0); + + /* + * we will, in general, just ignore failures. No need to go crazy, + * being this just a debugging interface. It is nice to copy a memcg + * name over, but if we (unlikely) can't, just the address will do + */ + if (!memcg->memcg_name) + goto add_list; + + if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) { + free_pages((unsigned long)memcg->memcg_name, 0); + memcg->memcg_name = NULL; + } + +add_list: + INIT_LIST_HEAD(&memcg->dead); + mutex_lock(&dangling_memcgs_mutex); + list_add(&memcg->dead, &dangling_memcgs); + mutex_unlock(&dangling_memcgs_mutex); +} +#else +static inline void memcg_dangling_free(struct mem_cgroup *memcg) {} +static inline void memcg_dangling_add(struct mem_cgroup *memcg) {} +#endif + /* internal only representation about the status of kmem accounting. 
*/ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ @@ -4974,6 +5040,107 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY +static void +mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m) +{ +#ifdef CONFIG_MEMCG_SWAP + u64 kmem; + u64 memsw; + + /* + * kmem will also propagate here, so we are only interested in the + * difference. See comment in mem_cgroup_reparent_charges for details. + * + * We could save this value for later consumption by kmem reports, but + * there is not a lot of problem if the figures differ slightly. + */ + kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE); + memsw = res_counter_read_u64(&memcg->memsw, RES_USAGE) - kmem; + seq_printf(m, "\t%llu swap bytes\n", memsw); +#endif +} + + +static void +mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m) +{ +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) + struct tcp_memcontrol *tcp = &memcg->tcp_mem; + s64 tcp_socks; + u64 tcp_bytes; + + tcp_socks = percpu_counter_sum_positive(&tcp->tcp_sockets_allocated); + tcp_bytes = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + seq_printf(m, "\t%llu tcp bytes", tcp_bytes); + /* + * if tcp_bytes == 0, tcp_socks != 0 is a bug. One more reason to print + * it! 
+ */ + if (tcp_bytes || tcp_socks) + seq_printf(m, ", in %lld sockets", tcp_socks); + seq_printf(m, "\n"); + +#endif +} + +static void +mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m) +{ +#ifdef CONFIG_MEMCG_KMEM + u64 kmem; + struct memcg_cache_params *params; + + kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE); + seq_printf(m, "\t%llu kmem bytes", kmem); + + /* list below may not be initialized, so not even try */ + if (!kmem) + return; + + seq_printf(m, " in caches"); + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + struct kmem_cache *s = memcg_params_to_cache(params); + + seq_printf(m, " %s", s->name); + } + mutex_unlock(&memcg->slab_caches_mutex); + seq_printf(m, "\n"); +#endif +} + +/* + * After a memcg is destroyed, it may still be kept around in memory. + * Currently, the two main reasons for it are swap entries, and kernel memory. + * Because they will be freed asynchronously, they will pin the memcg structure + * and its resources until the last reference goes away.
+ * + * This root-only file will show information about which users + */ +static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg; + + mutex_lock(&dangling_memcgs_mutex); + + list_for_each_entry(memcg, &dangling_memcgs, dead) { + if (memcg->memcg_name) + seq_printf(m, "%s:\n", memcg->memcg_name); + else + seq_printf(m, "%p (name lost):\n", memcg); + + mem_cgroup_dangling_swap(memcg, m); + mem_cgroup_dangling_tcp(memcg, m); + mem_cgroup_dangling_kmem(memcg, m); + } + + mutex_unlock(&dangling_memcgs_mutex); + return 0; +} +#endif + static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) { int ret = -EINVAL; @@ -5875,6 +6042,14 @@ static struct cftype mem_cgroup_files[] = { }, #endif #endif + +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY + { + .name = "dangling_memcgs", + .read_seq_string = mem_cgroup_dangling_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, +#endif { }, /* terminate */ }; @@ -6024,6 +6199,8 @@ static void free_work(struct work_struct *work) struct mem_cgroup *memcg; memcg = container_of(work, struct mem_cgroup, work_freeing); + + memcg_dangling_free(memcg); __mem_cgroup_free(memcg); } @@ -6198,6 +6375,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) kmem_cgroup_destroy(memcg); + memcg_dangling_add(memcg); mem_cgroup_put(memcg); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index df0694c6adef..ceb0c7f1932f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -785,10 +785,10 @@ static struct page_state { { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, + { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, - { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, + { unevict|dirty, unevict, "clean unevictable LRU", 
me_pagecache_clean }, { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, diff --git a/mm/migrate.c b/mm/migrate.c index 3bbaf5d230b0..4e3dab51f8fa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -876,6 +876,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); balloon_page_free(page); + balloon_event_count(COMPACTBALLOONMIGRATED); return MIGRATEPAGE_SUCCESS; } out: diff --git a/mm/mmap.c b/mm/mmap.c index 2664a47cec93..49dc7d577b46 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1816,15 +1816,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } #endif -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the lowest possible address? - */ - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) - mm->free_area_cache = addr; -} - /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): @@ -1881,19 +1872,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, } #endif -void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the highest possible address? - */ - if (addr > mm->free_area_cache) - mm->free_area_cache = addr; - - /* dont allow allocations above current base */ - if (mm->free_area_cache > mm->mmap_base) - mm->free_area_cache = mm->mmap_base; -} - unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -2317,7 +2295,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; - unsigned long addr; insertion_point = (prev ? 
&prev->vm_next : &mm->mmap); vma->vm_prev = NULL; @@ -2334,11 +2311,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - if (mm->unmap_area == arch_unmap_area) - addr = prev ? prev->vm_end : mm->mmap_base; - else - addr = vma ? vma->vm_start : mm->mmap_base; - mm->unmap_area(mm, addr); mm->mmap_cache = NULL; /* Kill the cache. */ } diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4ed355..8a8cd0265e52 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -14,9 +14,6 @@ * use_mm * Makes the calling kernel thread take on the specified * mm context. - * Called by the retry thread execute retries within the - * iocb issuer's mm context, so that copy_from/to_user - * operations work seamlessly for aio. * (Note: this routine is intended to be called only * from a kernel thread context) */ diff --git a/mm/nommu.c b/mm/nommu.c index 66737e0584ae..c9c18c145bd4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1858,10 +1858,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8fcced7823fa..ca7b01ec52a8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2002,6 +2002,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) return; /* + * Walking all memory to count page types is very expensive and should + * be inhibited in non-blockable contexts. + */ + if (!(gfp_mask & __GFP_WAIT)) + filter |= SHOW_MEM_FILTER_PAGE_COUNT; + + /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set * of allowed nodes. 
@@ -3900,8 +3907,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { + pfn = ALIGN(pfn + MAX_ORDER_NR_PAGES, + MAX_ORDER_NR_PAGES) - 1; continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; } diff --git a/mm/page_io.c b/mm/page_io.c index 78eee32ee486..c535d395a440 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/frontswap.h> +#include <linux/aio.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, diff --git a/mm/rmap.c b/mm/rmap.c index 807c96bf0dc6..6280da86b5d6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1513,6 +1513,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; + if (PageHuge(page)) + pgoff = page->index << compound_order(page); + mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); diff --git a/mm/shmem.c b/mm/shmem.c index 1c44af71fcf5..5e6a8422658b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -25,11 +25,13 @@ #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> +#include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> +#include <linux/aio.h> static struct vfsmount *shm_mnt; @@ -2830,8 +2832,6 @@ out4: * effectively equivalent, but much lighter weight. 
*/ -#include <linux/ramfs.h> - static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, @@ -2931,11 +2931,9 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ -#ifndef CONFIG_MMU res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) goto put_dentry; -#endif res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); diff --git a/mm/swap.c b/mm/swap.c index 8a529a01e8fc..92a9be551846 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,6 +30,7 @@ #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> +#include <linux/uio.h> #include "internal.h" diff --git a/mm/swapfile.c b/mm/swapfile.c index a1f7772a01fc..6c340d908b27 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1509,8 +1509,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) } static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - unsigned long *frontswap_map) + unsigned char *swap_map) { int i, prev; @@ -1519,7 +1518,6 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; - frontswap_map_set(p, frontswap_map); p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -1542,10 +1540,10 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, unsigned long *frontswap_map) { + frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map, frontswap_map); - frontswap_init(p->type); + _enable_swap_info(p, prio, swap_map); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1554,7 +1552,7 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - 
_enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); + _enable_swap_info(p, p->prio, p->swap_map); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1563,6 +1561,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; @@ -1662,12 +1661,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; - frontswap_invalidate_area(type); + frontswap_map = frontswap_map_get(p); + frontswap_map_set(p, NULL); spin_unlock(&p->lock); spin_unlock(&swap_lock); + frontswap_invalidate_area(type); mutex_unlock(&swapon_mutex); vfree(swap_map); - vfree(frontswap_map_get(p)); + vfree(frontswap_map); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -2120,7 +2121,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + p->cluster_next = 1 + (prandom_u32() % p->highest_bit); } if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) p->flags |= SWP_DISCARDABLE; diff --git a/mm/util.c b/mm/util.c index ab1424dbe2e6..7441c41d00f6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index e1d8ed172c42..292b1cf785e0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -792,7 +792,14 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", -#endif + +#ifdef CONFIG_BALLOON_COMPACTION + "compact_balloon_isolated", + "compact_balloon_migrated", + "compact_balloon_returned", +#endif /* 
CONFIG_BALLOON_COMPACTION */ + +#endif /* CONFIG_COMPACTION */ #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6048fc1da1c2..5c217427a669 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2198,7 +2198,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = random32() % pkt_dev->cflows; + flow = prandom_u32() % pkt_dev->cflows; pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2246,7 +2246,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = random32() % + t = prandom_u32() % (pkt_dev->queue_map_max - pkt_dev->queue_map_min + 1) + pkt_dev->queue_map_min; @@ -2278,7 +2278,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = random32() % pkt_dev->src_mac_count; + mc = prandom_u32() % pkt_dev->src_mac_count; else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2304,7 +2304,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = random32() % pkt_dev->dst_mac_count; + mc = prandom_u32() % pkt_dev->dst_mac_count; else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2331,21 +2331,21 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)random32() & + ((__force __be32)prandom_u32() & htonl(0x000fffff)); } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = random32() & (4096-1); + pkt_dev->vlan_id = prandom_u32() & (4096 - 1); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = random32() & (4096 - 1); + pkt_dev->svlan_id = 
prandom_u32() & (4096 - 1); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if (pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = random32() % + pkt_dev->cur_udp_src = prandom_u32() % (pkt_dev->udp_src_max - pkt_dev->udp_src_min) + pkt_dev->udp_src_min; @@ -2358,7 +2358,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = random32() % + pkt_dev->cur_udp_dst = prandom_u32() % (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + pkt_dev->udp_dst_min; } else { @@ -2375,7 +2375,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = random32() % (imx - imn) + imn; + t = prandom_u32() % (imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2396,17 +2396,15 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __be32 s; if (pkt_dev->flags & F_IPDST_RND) { - t = random32() % (imx - imn) + imn; - s = htonl(t); - - while (ipv4_is_loopback(s) || - ipv4_is_multicast(s) || - ipv4_is_lbcast(s) || - ipv4_is_zeronet(s) || - ipv4_is_local_multicast(s)) { - t = random32() % (imx - imn) + imn; + do { + t = prandom_u32() % + (imx - imn) + imn; s = htonl(t); - } + } while (ipv4_is_loopback(s) || + ipv4_is_multicast(s) || + ipv4_is_lbcast(s) || + ipv4_is_zeronet(s) || + ipv4_is_local_multicast(s)); pkt_dev->cur_daddr = s; } else { t = ntohl(pkt_dev->cur_daddr); @@ -2437,7 +2435,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)random32() | + (((__force __be32)prandom_u32() | pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } @@ -2447,7 +2445,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = random32() % + t = prandom_u32() % 
(pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) + pkt_dev->min_pkt_size; } else { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c68198bf9128..03d01b6a1bd1 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1747,9 +1747,9 @@ static struct ctl_table vs_vars[] = { }, { .procname = "sync_qlen_max", - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "sync_sock_size", diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c8e001a9c45b..f84965af4a4e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -264,7 +264,7 @@ static void death_by_event(unsigned long ul_conntrack) if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { /* bad luck, let's retry again */ ecache->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); + (prandom_u32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); return; } @@ -283,7 +283,7 @@ void nf_ct_dying_timeout(struct nf_conn *ct) /* set a new timer to retry event delivery */ setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); ecache->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); + (prandom_u32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); } EXPORT_SYMBOL_GPL(nf_ct_dying_timeout); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index cc37dd52ecf9..ef53ab8d0aae 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -80,7 +80,7 @@ struct choke_sched_data { /* deliver a random number between 0 and N - 1 */ static u32 random_N(unsigned int N) { - return reciprocal_divide(random32(), N); + return reciprocal_divide(prandom_u32(), N); } /* number of elements in queue including holes */ diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c 
b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 88edec929d73..1da52d1406fc 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -130,8 +130,8 @@ gss_krb5_make_confounder(char *p, u32 conflen) /* initialize to random value */ if (i == 0) { - i = random32(); - i = (i << 32) | random32(); + i = prandom_u32(); + i = (i << 32) | prandom_u32(); } switch (conflen) { diff --git a/scripts/Makefile.headersinst b/scripts/Makefile.headersinst index 477d137c0557..182084d728c8 100644 --- a/scripts/Makefile.headersinst +++ b/scripts/Makefile.headersinst @@ -72,7 +72,7 @@ printdir = $(patsubst $(INSTALL_HDR_PATH)/%/,%,$(dir $@)) quiet_cmd_install = INSTALL $(printdir) ($(words $(all-files))\ file$(if $(word 2, $(all-files)),s)) cmd_install = \ - $(PERL) $< $(installdir) $(SRCARCH) $(input-files); \ + $(CONFIG_SHELL) $< $(installdir) $(input-files); \ for F in $(wrapper-files); do \ echo "\#include <asm-generic/$$F>" > $(installdir)/$$F; \ done; \ @@ -98,7 +98,7 @@ __headersinst: $(subdirs) $(install-file) @: targets += $(install-file) -$(install-file): scripts/headers_install.pl $(input-files) FORCE +$(install-file): scripts/headers_install.sh $(input-files) FORCE $(if $(unwanted),$(call cmd,remove),) $(if $(wildcard $(dir $@)),,$(shell mkdir -p $(dir $@))) $(call if_changed,install) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index a373a1f66023..09e9b4b8a015 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -314,6 +314,11 @@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) +quiet_cmd_lz4 = LZ4 $@ +cmd_lz4 = (cat $(filter-out FORCE,$^) | \ + lz4demo -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) + # U-Boot mkimage # --------------------------------------------------------------------------- diff --git a/scripts/decodecode b/scripts/decodecode index 4f8248d5a11f..d8824f37acce 100755 --- 
a/scripts/decodecode +++ b/scripts/decodecode @@ -89,10 +89,16 @@ echo $code >> $T.s disas $T cat $T.dis >> $T.aa +# (lines of whole $T.oo) - (lines of $T.aa, i.e. "Code starting") + 3, +# i.e. the title + the "===..=" line (sed is counting from 1, 0 address is +# special) +faultlinenum=$(( $(wc -l $T.oo | cut -d" " -f1) - \ + $(wc -l $T.aa | cut -d" " -f1) + 3)) + faultline=`cat $T.dis | head -1 | cut -d":" -f2-` faultline=`echo "$faultline" | sed -e 's/\[/\\\[/g; s/\]/\\\]/g'` -cat $T.oo | sed -e "s/\($faultline\)/\*\1 <-- trapping instruction/g" +cat $T.oo | sed -e "${faultlinenum}s/^\(.*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/" echo cat $T.aa cleanup diff --git a/scripts/headers_install.pl b/scripts/headers_install.pl deleted file mode 100644 index 581ca99c96f2..000000000000 --- a/scripts/headers_install.pl +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/perl -w -# -# headers_install prepare the listed header files for use in -# user space and copy the files to their destination. -# -# Usage: headers_install.pl readdir installdir arch [files...] -# installdir: dir to install the files to -# arch: current architecture -# arch is used to force a reinstallation when the arch -# changes because kbuild then detect a command line change. 
-# files: list of files to check -# -# Step in preparation for users space: -# 1) Drop all use of compiler.h definitions -# 2) Drop include of compiler.h -# 3) Drop all sections defined out by __KERNEL__ (using unifdef) - -use strict; - -my ($installdir, $arch, @files) = @ARGV; - -my $unifdef = "scripts/unifdef -U__KERNEL__ -D__EXPORTED_HEADERS__"; - -foreach my $filename (@files) { - my $file = $filename; - $file =~ s!^.*/!!; - - my $tmpfile = "$installdir/$file.tmp"; - - open(my $in, '<', $filename) - or die "$filename: $!\n"; - open(my $out, '>', $tmpfile) - or die "$tmpfile: $!\n"; - while (my $line = <$in>) { - $line =~ s/([\s(])__user\s/$1/g; - $line =~ s/([\s(])__force\s/$1/g; - $line =~ s/([\s(])__iomem\s/$1/g; - $line =~ s/\s__attribute_const__\s/ /g; - $line =~ s/\s__attribute_const__$//g; - $line =~ s/\b__packed\b/__attribute__((packed))/g; - $line =~ s/^#include <linux\/compiler.h>//; - $line =~ s/(^|\s)(inline)\b/$1__$2__/g; - $line =~ s/(^|\s)(asm)\b(\s|[(]|$)/$1__$2__$3/g; - $line =~ s/(^|\s|[(])(volatile)\b(\s|[(]|$)/$1__$2__$3/g; - $line =~ s/#ifndef\s+_UAPI/#ifndef /; - $line =~ s/#define\s+_UAPI/#define /; - $line =~ s!#endif\s+/[*]\s*_UAPI!#endif /* !; - printf {$out} "%s", $line; - } - close $out; - close $in; - - system $unifdef . " $tmpfile > $installdir/$file"; - # unifdef will exit 0 on success, and will exit 1 when the - # file was processed successfully but no changes were made, - # so abort only when it's higher than that. - my $e = $? >> 8; - if ($e > 1) { - die "$tmpfile: $!\n"; - } - unlink $tmpfile; -} -exit 0; diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh new file mode 100644 index 000000000000..643764f53ea7 --- /dev/null +++ b/scripts/headers_install.sh @@ -0,0 +1,43 @@ +#!/bin/sh + +if [ $# -lt 1 ] +then + echo "Usage: headers_install.sh OUTDIR [FILES...] 
+ echo + echo "Prepares kernel header files for use by user space, by removing" + echo "all compiler.h definitions and #includes, removing any" + echo "#ifdef __KERNEL__ sections, and putting __underscores__ around" + echo "asm/inline/volatile keywords." + echo + echo "OUTDIR: directory to write each userspace header FILE to." + echo "FILES: list of header files to operate on." + + exit 1 +fi + +# Grab arguments + +OUTDIR="$1" +shift + +# Iterate through files listed on command line + +FILE= +trap 'rm -f "$OUTDIR/$FILE" "$OUTDIR/$FILE.sed"' EXIT +for i in "$@" +do + FILE="$(basename "$i")" + sed -r \ + -e 's/([ \t(])(__user|__force|__iomem)[ \t]/\1/g' \ + -e 's/__attribute_const__([ \t]|$)/\1/g' \ + -e 's@^#include <linux/compiler.h>@@' \ + -e 's/(^|[^a-zA-Z0-9])__packed([^a-zA-Z0-9_]|$)/\1__attribute__((packed))\2/g' \ + -e 's/(^|[ \t(])(inline|asm|volatile)([ \t(]|$)/\1__\2__\3/g' \ + -e 's@#(ifndef|define|endif[ \t]*/[*])[ \t]*_UAPI@#\1 @' \ + "$i" > "$OUTDIR/$FILE.sed" || exit 1 + scripts/unifdef -U__KERNEL__ -D__EXPORTED_HEADERS__ "$OUTDIR/$FILE.sed" \ + > "$OUTDIR/$FILE" + [ $? -gt 1 ] && exit 1 + rm -f "$OUTDIR/$FILE.sed" +done +trap - EXIT diff --git a/security/keys/internal.h b/security/keys/internal.h index 8bbefc3b55d4..d4f1468b9b50 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -16,6 +16,8 @@ #include <linux/key-type.h> #include <linux/task_work.h> +struct iovec; + #ifdef __KDEBUG #define kenter(FMT, ...) 
\ printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 4b5c948eb414..33cfd27b4de2 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -22,6 +22,7 @@ #include <linux/err.h> #include <linux/vmalloc.h> #include <linux/security.h> +#include <linux/uio.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 5bce9152b64e..479e0a581797 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -25,7 +25,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/pm_qos.h> -#include <linux/uio.h> +#include <linux/aio.h> #include <linux/dma-mapping.h> #include <sound/core.h> #include <sound/control.h> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 3cc0ad7ae863..6a88805b6d6e 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,10 +1,12 @@ TARGETS = breakpoints +TARGETS += epoll TARGETS += kcmp TARGETS += mqueue TARGETS += vm TARGETS += cpu-hotplug TARGETS += memory-hotplug TARGETS += efivarfs +TARGETS += ptrace all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile new file mode 100644 index 000000000000..19806ed62f50 --- /dev/null +++ b/tools/testing/selftests/epoll/Makefile @@ -0,0 +1,11 @@ +# Makefile for epoll selftests + +all: test_epoll +%: %.c + gcc -pthread -g -o $@ $^ + +run_tests: all + ./test_epoll + +clean: + $(RM) test_epoll diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c new file mode 100644 index 000000000000..1034ed4cc5b4 --- /dev/null +++ b/tools/testing/selftests/epoll/test_epoll.c @@ -0,0 +1,364 @@ +/* + * tools/testing/selftests/epoll/test_epoll.c + * + * Copyright 2012 Adobe Systems Incorporated + * + * This program is free software; you can redistribute it and/or modify + * it 
under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Paton J. Lewis <palewis@adobe.com> + * + */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/epoll.h> +#include <sys/socket.h> + +/* + * A pointer to an epoll_item_private structure will be stored in the epoll + * item's event structure so that we can get access to the epoll_item_private + * data after calling epoll_wait: + */ +struct epoll_item_private { + int index; /* Position of this struct within the epoll_items array. */ + int fd; + uint32_t events; + pthread_mutex_t mutex; /* Guards the following variables... */ + int stop; + int status; /* Stores any error encountered while handling item. */ + /* The following variable allows us to test whether we have encountered + a problem while attempting to cancel and delete the associated + event. When the test program exits, 'deleted' should be exactly + one. If it is greater than one, then the failed test reflects a real + world situation where we would have tried to access the epoll item's + private data after deleting it: */ + int deleted; +}; + +struct epoll_item_private *epoll_items; + +/* + * Delete the specified item from the epoll set. In a real-world secneario this + * is where we would free the associated data structure, but in this testing + * environment we retain the structure so that we can test for double-deletion: + */ +void delete_item(int index) +{ + __sync_fetch_and_add(&epoll_items[index].deleted, 1); +} + +/* + * A pointer to a read_thread_data structure will be passed as the argument to + * each read thread: + */ +struct read_thread_data { + int stop; + int status; /* Indicates any error encountered by the read thread. 
*/ + int epoll_set; +}; + +/* + * The function executed by the read threads: + */ +void *read_thread_function(void *function_data) +{ + struct read_thread_data *thread_data = + (struct read_thread_data *)function_data; + struct epoll_event event_data; + struct epoll_item_private *item_data; + char socket_data; + + /* Handle events until we encounter an error or this thread's 'stop' + condition is set: */ + while (1) { + int result = epoll_wait(thread_data->epoll_set, + &event_data, + 1, /* Number of desired events */ + 1000); /* Timeout in ms */ + if (result < 0) { + /* Breakpoints signal all threads. Ignore that while + debugging: */ + if (errno == EINTR) + continue; + thread_data->status = errno; + return 0; + } else if (thread_data->stop) + return 0; + else if (result == 0) /* Timeout */ + continue; + + /* We need the mutex here because checking for the stop + condition and re-enabling the epoll item need to be done + together as one atomic operation when EPOLL_CTL_DISABLE is + available: */ + item_data = (struct epoll_item_private *)event_data.data.ptr; + pthread_mutex_lock(&item_data->mutex); + + /* Remove the item from the epoll set if we want to stop + handling that event: */ + if (item_data->stop) + delete_item(item_data->index); + else { + /* Clear the data that was written to the other end of + our non-blocking socket: */ + do { + if (read(item_data->fd, &socket_data, 1) < 1) { + if ((errno == EAGAIN) || + (errno == EWOULDBLOCK)) + break; + else + goto error_unlock; + } + } while (item_data->events & EPOLLET); + + /* The item was one-shot, so re-enable it: */ + event_data.events = item_data->events; + if (epoll_ctl(thread_data->epoll_set, + EPOLL_CTL_MOD, + item_data->fd, + &event_data) < 0) + goto error_unlock; + } + + pthread_mutex_unlock(&item_data->mutex); + } + +error_unlock: + thread_data->status = item_data->status = errno; + pthread_mutex_unlock(&item_data->mutex); + return 0; +} + +/* + * A pointer to a write_thread_data structure will be passed 
as the argument to + * the write thread: + */ +struct write_thread_data { + int stop; + int status; /* Indicates any error encountered by the write thread. */ + int n_fds; + int *fds; +}; + +/* + * The function executed by the write thread. It writes a single byte to each + * socket in turn until the stop condition for this thread is set. If writing to + * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for + * the moment and just move on to the next socket in the list. We don't care + * about the order in which we deliver events to the epoll set. In fact we don't + * care about the data we're writing to the pipes at all; we just want to + * trigger epoll events: + */ +void *write_thread_function(void *function_data) +{ + const char data = 'X'; + int index; + struct write_thread_data *thread_data = + (struct write_thread_data *)function_data; + while (!thread_data->stop) + for (index = 0; + !thread_data->stop && (index < thread_data->n_fds); + ++index) + if ((write(thread_data->fds[index], &data, 1) < 1) && + (errno != EAGAIN) && + (errno != EWOULDBLOCK)) { + thread_data->status = errno; + return; + } +} + +/* + * Arguments are currently ignored: + */ +int main(int argc, char **argv) +{ + const int n_read_threads = 100; + const int n_epoll_items = 500; + int index; + int epoll_set = epoll_create1(0); + struct write_thread_data write_thread_data = { + 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int)) + }; + struct read_thread_data *read_thread_data = + malloc(n_read_threads * sizeof(struct read_thread_data)); + pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t)); + pthread_t write_thread; + int socket_pair[2]; + struct epoll_event event_data; + + printf("-----------------\n"); + printf("Runing test_epoll\n"); + printf("-----------------\n"); + + epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private)); + + if (epoll_set < 0 || !epoll_items || write_thread_data.fds == NULL || + !read_thread_data || 
!read_threads) + goto error; + + if (sysconf(_SC_NPROCESSORS_ONLN) < 2) { + printf("Error: please run this test on a multi-core system.\n"); + goto error; + } + + /* Create the socket pairs and epoll items: */ + for (index = 0; index < n_epoll_items; ++index) { + if (socketpair(AF_UNIX, + SOCK_STREAM | SOCK_NONBLOCK, + 0, + socket_pair) < 0) + goto error; + write_thread_data.fds[index] = socket_pair[0]; + epoll_items[index].index = index; + epoll_items[index].fd = socket_pair[1]; + if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0) + goto error; + /* We always use EPOLLONESHOT because this test is currently + structured to demonstrate the need for EPOLL_CTL_DISABLE, + which only produces useful information in the EPOLLONESHOT + case (without EPOLLONESHOT, calling epoll_ctl with + EPOLL_CTL_DISABLE will never return EBUSY). If support for + testing events without EPOLLONESHOT is desired, it should + probably be implemented in a separate unit test. */ + epoll_items[index].events = EPOLLIN | EPOLLONESHOT; + if (index < n_epoll_items / 2) + epoll_items[index].events |= EPOLLET; + epoll_items[index].stop = 0; + epoll_items[index].status = 0; + epoll_items[index].deleted = 0; + event_data.events = epoll_items[index].events; + event_data.data.ptr = &epoll_items[index]; + if (epoll_ctl(epoll_set, + EPOLL_CTL_ADD, + epoll_items[index].fd, + &event_data) < 0) + goto error; + } + +#ifdef EPOLL_CTL_DISABLE + /* Test to make sure that using EPOLL_CTL_DISABLE without EPOLLONESHOT + returns a clear error: */ + if (socketpair(AF_UNIX, + SOCK_STREAM | SOCK_NONBLOCK, + 0, + socket_pair) < 0) + goto error; + event_data.events = EPOLLIN; + event_data.data.ptr = NULL; + if (epoll_ctl(epoll_set, EPOLL_CTL_ADD, + socket_pair[1], &event_data) < 0) + goto error; + if ((epoll_ctl(epoll_set, EPOLL_CTL_DISABLE, + socket_pair[1], NULL) == 0) || (errno != EINVAL)) + goto error; + if (epoll_ctl(epoll_set, EPOLL_CTL_DEL, socket_pair[1], NULL) != 0) + goto error; +#endif + + /* Create 
and start the read threads: */ + for (index = 0; index < n_read_threads; ++index) { + read_thread_data[index].stop = 0; + read_thread_data[index].status = 0; + read_thread_data[index].epoll_set = epoll_set; + if (pthread_create(&read_threads[index], + NULL, + read_thread_function, + &read_thread_data[index]) != 0) + goto error; + } + + if (pthread_create(&write_thread, + NULL, + write_thread_function, + &write_thread_data) != 0) + goto error; + + /* Cancel all event pollers: */ +#ifdef EPOLL_CTL_DISABLE + for (index = 0; index < n_epoll_items; ++index) { + pthread_mutex_lock(&epoll_items[index].mutex); + ++epoll_items[index].stop; + if (epoll_ctl(epoll_set, + EPOLL_CTL_DISABLE, + epoll_items[index].fd, + NULL) == 0) + delete_item(index); + else if (errno != EBUSY) { + pthread_mutex_unlock(&epoll_items[index].mutex); + goto error; + } + /* EBUSY means events were being handled; allow the other thread + to delete the item. */ + pthread_mutex_unlock(&epoll_items[index].mutex); + } +#else + for (index = 0; index < n_epoll_items; ++index) { + pthread_mutex_lock(&epoll_items[index].mutex); + ++epoll_items[index].stop; + pthread_mutex_unlock(&epoll_items[index].mutex); + /* Wait in case a thread running read_thread_function is + currently executing code between epoll_wait and + pthread_mutex_lock with this item. Note that a longer delay + would make double-deletion less likely (at the expense of + performance), but there is no guarantee that any delay would + ever be sufficient. Note also that we delete all event + pollers at once for testing purposes, but in a real-world + environment we are likely to want to be able to cancel event + pollers at arbitrary times. Therefore we can't improve this + situation by just splitting this loop into two loops + (i.e. signal 'stop' for all items, sleep, and then delete all + items). 
We also can't fix the problem via EPOLL_CTL_DEL + because that command can't prevent the case where some other + thread is executing read_thread_function within the region + mentioned above: */ + usleep(1); + pthread_mutex_lock(&epoll_items[index].mutex); + if (!epoll_items[index].deleted) + delete_item(index); + pthread_mutex_unlock(&epoll_items[index].mutex); + } +#endif + + /* Shut down the read threads: */ + for (index = 0; index < n_read_threads; ++index) + __sync_fetch_and_add(&read_thread_data[index].stop, 1); + for (index = 0; index < n_read_threads; ++index) { + if (pthread_join(read_threads[index], NULL) != 0) + goto error; + if (read_thread_data[index].status) + goto error; + } + + /* Shut down the write thread: */ + __sync_fetch_and_add(&write_thread_data.stop, 1); + if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status) + goto error; + + /* Check for final error conditions: */ + for (index = 0; index < n_epoll_items; ++index) { + if (epoll_items[index].status != 0) + goto error; + if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0) + goto error; + } + for (index = 0; index < n_epoll_items; ++index) + if (epoll_items[index].deleted != 1) { + printf("Error: item data deleted %1d times.\n", + epoll_items[index].deleted); + goto error; + } + + printf("[PASS]\n"); + return 0; + + error: + printf("[FAIL]\n"); + return errno; +} diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile new file mode 100644 index 000000000000..47ae2d385ce8 --- /dev/null +++ b/tools/testing/selftests/ptrace/Makefile @@ -0,0 +1,10 @@ +CFLAGS += -iquote../../../../include/uapi -Wall +peeksiginfo: peeksiginfo.c + +all: peeksiginfo + +clean: + rm -f peeksiginfo + +run_tests: all + @./peeksiginfo || echo "peeksiginfo selftests: [FAIL]" diff --git a/tools/testing/selftests/ptrace/peeksiginfo.c b/tools/testing/selftests/ptrace/peeksiginfo.c new file mode 100644 index 000000000000..d46558b1f58d --- /dev/null +++ 
b/tools/testing/selftests/ptrace/peeksiginfo.c @@ -0,0 +1,214 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <signal.h> +#include <unistd.h> +#include <errno.h> +#include <linux/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <sys/user.h> +#include <sys/mman.h> + +#include "linux/ptrace.h" + +static int sys_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) +{ + return syscall(SYS_rt_sigqueueinfo, tgid, sig, uinfo); +} + +static int sys_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, + int sig, siginfo_t *uinfo) +{ + return syscall(SYS_rt_tgsigqueueinfo, tgid, tid, sig, uinfo); +} + +static int sys_ptrace(int request, pid_t pid, void *addr, void *data) +{ + return syscall(SYS_ptrace, request, pid, addr, data); +} + +#define SIGNR 10 +#define TEST_SICODE_PRIV -1 +#define TEST_SICODE_SHARE -2 + +#define err(fmt, ...) \ + fprintf(stderr, \ + "Error (%s:%d): " fmt, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +static int check_error_paths(pid_t child) +{ + struct ptrace_peeksiginfo_args arg; + int ret, exit_code = -1; + void *addr_rw, *addr_ro; + + /* + * Allocate two contiguous pages. The first one is for read-write, + * another is for read-only. 
+ */ + addr_rw = mmap(NULL, 2 * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr_rw == MAP_FAILED) { + err("mmap() failed: %m\n"); + return 1; + } + + addr_ro = mmap(addr_rw + PAGE_SIZE, PAGE_SIZE, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (addr_ro == MAP_FAILED) { + err("mmap() failed: %m\n"); + goto out; + } + + arg.nr = SIGNR; + arg.off = 0; + + /* Unsupported flags */ + arg.flags = ~0; + ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, addr_rw); + if (ret != -1 || errno != EINVAL) { + err("sys_ptrace() returns %d (expected -1)," + " errno %d (expected %d): %m\n", + ret, errno, EINVAL); + goto out; + } + arg.flags = 0; + + /* A part of the buffer is read-only */ + ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, + addr_ro - sizeof(siginfo_t) * 2); + if (ret != 2) { + err("sys_ptrace() returns %d (expected 2): %m\n", ret); + goto out; + } + + /* Read-only buffer */ + ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, addr_ro); + if (ret != -1 && errno != EFAULT) { + err("sys_ptrace() returns %d (expected -1)," + " errno %d (expected %d): %m\n", + ret, errno, EFAULT); + goto out; + } + + exit_code = 0; +out: + munmap(addr_rw, 2 * PAGE_SIZE); + return exit_code; +} + +int check_direct_path(pid_t child, int shared, int nr) +{ + struct ptrace_peeksiginfo_args arg = {.flags = 0, .nr = nr, .off = 0}; + int i, j, ret, exit_code = -1; + siginfo_t siginfo[SIGNR]; + int si_code; + + if (shared == 1) { + arg.flags = PTRACE_PEEKSIGINFO_SHARED; + si_code = TEST_SICODE_SHARE; + } else { + arg.flags = 0; + si_code = TEST_SICODE_PRIV; + } + + for (i = 0; i < SIGNR; ) { + arg.off = i; + ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, siginfo); + if (ret == -1) { + err("ptrace() failed: %m\n"); + goto out; + } + + if (ret == 0) + break; + + for (j = 0; j < ret; j++, i++) { + if (siginfo[j].si_code == si_code && + siginfo[j].si_int == i) + continue; + + err("%d: Wrong siginfo i=%d si_code=%d si_int=%d\n", + shared, i, 
siginfo[j].si_code, siginfo[j].si_int); + goto out; + } + } + + if (i != SIGNR) { + err("Only %d signals were read\n", i); + goto out; + } + + exit_code = 0; +out: + return exit_code; +} + +int main(int argc, char *argv[]) +{ + siginfo_t siginfo[SIGNR]; + int i, exit_code = 1; + sigset_t blockmask; + pid_t child; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGRTMIN); + sigprocmask(SIG_BLOCK, &blockmask, NULL); + + child = fork(); + if (child == -1) { + err("fork() failed: %m"); + return 1; + } else if (child == 0) { + pid_t ppid = getppid(); + while (1) { + if (ppid != getppid()) + break; + sleep(1); + } + return 1; + } + + /* Send signals in process-wide and per-thread queues */ + for (i = 0; i < SIGNR; i++) { + siginfo->si_code = TEST_SICODE_SHARE; + siginfo->si_int = i; + sys_rt_sigqueueinfo(child, SIGRTMIN, siginfo); + + siginfo->si_code = TEST_SICODE_PRIV; + siginfo->si_int = i; + sys_rt_tgsigqueueinfo(child, child, SIGRTMIN, siginfo); + } + + if (sys_ptrace(PTRACE_ATTACH, child, NULL, NULL) == -1) + return 1; + + waitpid(child, NULL, 0); + + /* Dump signals one by one*/ + if (check_direct_path(child, 0, 1)) + goto out; + /* Dump all signals for one call */ + if (check_direct_path(child, 0, SIGNR)) + goto out; + + /* + * Dump signal from the process-wide queue. + * The number of signals is not multible to the buffer size + */ + if (check_direct_path(child, 1, 3)) + goto out; + + if (check_error_paths(child)) + goto out; + + printf("PASS\n"); + exit_code = 0; +out: + if (sys_ptrace(PTRACE_KILL, child, NULL, NULL) == -1) + return 1; + + waitpid(child, NULL, 0); + + return exit_code; +} diff --git a/usr/Kconfig b/usr/Kconfig index 085872bb2bb5..642f503d3e9f 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -90,6 +90,15 @@ config RD_LZO Support loading of a LZO encoded initial ramdisk or cpio buffer If unsure, say N. 
+config RD_LZ4 + bool "Support initial ramdisks compressed using LZ4" if EXPERT + default !EXPERT + depends on BLK_DEV_INITRD + select DECOMPRESS_LZ4 + help + Support loading of a LZ4 encoded initial ramdisk or cpio buffer + If unsure, say N. + choice prompt "Built-in initramfs compression mode" if INITRAMFS_SOURCE!="" help |