diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-05-29 14:50:41 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-05-29 14:50:43 +1000 |
commit | bf1ef9c39e723865c9cbe53250e195ca2730f3e1 (patch) | |
tree | 185675fd52490b1f298b10c5ff9add939795f3e5 | |
parent | ae6beaeeb6006459cb779637ea41310ff76d7dcb (diff) | |
parent | db337d977352895b8f234c5cbd52cc81a31894eb (diff) |
Merge branch 'akpm/master'
319 files changed, 6300 insertions, 2156 deletions
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle index e00b8f0dde52..7fe0546c504a 100644 --- a/Documentation/CodingStyle +++ b/Documentation/CodingStyle @@ -389,7 +389,8 @@ Albeit deprecated by some people, the equivalent of the goto statement is used frequently by compilers in form of the unconditional jump instruction. The goto statement comes in handy when a function exits from multiple -locations and some common work such as cleanup has to be done. +locations and some common work such as cleanup has to be done. If there is no +cleanup needed then just return directly. The rationale is: diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 3aaf7870a93e..5403b8cb1d09 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -72,6 +72,7 @@ Brief summary of control files. memory.move_charge_at_immigrate # set/show controls of moving charges memory.oom_control # set/show oom controls. memory.numa_stat # show the number of memory usage per numa node + memory.dangling_memcgs # show debugging information about dangling groups memory.kmem.limit_in_bytes # set/show hard limit for kernel memory memory.kmem.usage_in_bytes # show current kernel memory allocation @@ -581,6 +582,21 @@ unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... And we have total = file + anon + unevictable. +5.7 dangling_memcgs + +This file will only be ever present in the root cgroup, if the option +CONFIG_MEMCG_DEBUG_ASYNC_DESTROY is set. When a memcg is destroyed, the memory +consumed by it may not be immediately freed. This is because when some +extensions are used, such as swap or kernel memory, objects can outlive the +group and hold a reference to it. + +If this is the case, the dangling_memcgs file will show information about what +are the memcgs still alive, and which references are still preventing it to be +freed. There is nothing wrong with that, but it is very useful when debugging, +to know where this memory is being held. This is a developer-oriented debugging +facility only, and no guarantees of interface stability will be given. The file +is read-only, and has the sole purpose of displaying information. + 6. Hierarchy support The memory controller supports a deep hierarchy and hierarchical accounting. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fd8d0d594fc7..fcc22c982a25 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -473,7 +473,8 @@ This file is only present if the CONFIG_MMU kernel configuration option is enabled. The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG -bits on both physical and virtual pages associated with a process. +bits on both physical and virtual pages associated with a process, and the +soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details). To clear the bits for all the pages associated with the process > echo 1 > /proc/PID/clear_refs @@ -482,6 +483,10 @@ To clear the bits for the anonymous pages associated with the process To clear the bits for the file mapped pages associated with the process > echo 3 > /proc/PID/clear_refs + +To clear the soft-dirty bit + > echo 4 > /proc/PID/clear_refs + Any other value written to /proc/PID/clear_refs will have no effect. The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt index 32aa4002de4a..596b60c08b74 100644 --- a/Documentation/rtc.txt +++ b/Documentation/rtc.txt @@ -153,9 +153,10 @@ since_epoch: The number of seconds since the epoch according to the RTC time: RTC-provided time wakealarm: The time at which the clock will generate a system wakeup event. This is a one shot wakeup event, so must be reset - after wake if a daily wakeup is required. Format is either - seconds since the epoch or, if there's a leading +, seconds - in the future. + after wake if a daily wakeup is required. Format is seconds since + the epoch by default, or if there's a leading +, seconds in the + future, or if there is a leading +=, seconds ahead of the current + alarm. IOCTL INTERFACE --------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index dcc75a9ed919..a5717c38834a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -169,18 +169,39 @@ Setting this to zero disables periodic writeback altogether. drop_caches -Writing to this will cause the kernel to drop clean caches, dentries and -inodes from memory, causing that memory to become free. +Writing to this will cause the kernel to drop clean caches, as well as +reclaimable slab objects like dentries and inodes. Once dropped, their +memory becomes free. To free pagecache: echo 1 > /proc/sys/vm/drop_caches -To free dentries and inodes: +To free reclaimable slab objects (includes dentries and inodes): echo 2 > /proc/sys/vm/drop_caches -To free pagecache, dentries and inodes: +To free slab objects and pagecache: echo 3 > /proc/sys/vm/drop_caches -As this is a non-destructive operation and dirty objects are not freeable, the -user should run `sync' first. +This is a non-destructive operation and will not free any dirty objects. +To increase the number of objects freed by this operation, the user may run +`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the +number of dirty objects on the system and create more candidates to be +dropped. + +This file is not a means to control the growth of the various kernel caches +(inodes, dentries, pagecache, etc...) These objects are automatically +reclaimed by the kernel when memory is needed elsewhere on the system. + +Use of this file can cause performance problems. Since it discards cached +objects, it may cost a significant amount of I/O and CPU to recreate the +dropped objects, especially if they were under heavy use. Because of this, +use outside of a testing or debugging environment is not recommended. + +You may see informational messages in your kernel log when this file is +used: + + cat (1234): dropped kernel caches: 3 + +These are informational only. They do not mean that anything is wrong +with your system. ============================================================== diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 5ef3dd38bb64..5948e455c4d2 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -15,7 +15,8 @@ There are three components to pagemap: * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bits 55-60 page shift (page size = 1<<page shift) + * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bits 56-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt new file mode 100644 index 000000000000..9a12a5956bc0 --- /dev/null +++ b/Documentation/vm/soft-dirty.txt @@ -0,0 +1,36 @@ + SOFT-DIRTY PTEs + + The soft-dirty is a bit on a PTE which helps to track which pages a task +writes to. In order to do this tracking one should + + 1. Clear soft-dirty bits from the task's PTEs. + + This is done by writing "4" into the /proc/PID/clear_refs file of the + task in question. + + 2. Wait some time. + + 3. Read soft-dirty bits from the PTEs. + + This is done by reading from the /proc/PID/pagemap. The bit 55 of the + 64-bit qword is the soft-dirty one. If set, the respective PTE was + written to since step 1. + + + Internally, to do this tracking, the writable bit is cleared from PTEs +when the soft-dirty bit is cleared. So, after this, when the task tries to +modify a page at some virtual address the #PF occurs and the kernel sets +the soft-dirty bit on the respective PTE. + + Note, that although all the task's address space is marked as r/o after the +soft-dirty bits clear, the #PF-s that occur after that are processed fast. +This is so, since the pages are still mapped to physical memory, and thus all +the kernel does is finds this fact out and puts both writable and soft-dirty +bits on the PTE. + + + This feature is actively used by the checkpoint-restore project. You +can find more details about it on http://criu.org + + +-- Pavel Emelyanov, Apr 9, 2013 diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 3840b6f28afb..fc66d42422ee 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -657,9 +657,10 @@ Protocol: 2.08+ uncompressed data should be determined using the standard magic numbers. The currently supported compression formats are gzip (magic numbers 1F 8B or 1F 9E), bzip2 (magic number 42 5A), LZMA - (magic number 5D 00), and XZ (magic number FD 37). The uncompressed - payload is currently always ELF (magic number 7F 45 4C 46). - + (magic number 5D 00), XZ (magic number FD 37), and LZ4 (magic number + 02 21). The uncompressed payload is currently always ELF (magic + number 7F 45 4C 46). + Field name: payload_length Type: read Offset/size: 0x24c/4 diff --git a/arch/Kconfig b/arch/Kconfig index a4429bcd609e..8d2ae24b9f4a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -365,6 +365,9 @@ config HAVE_IRQ_TIME_ACCOUNTING config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool +config HAVE_ARCH_SOFT_DIRTY + bool + config HAVE_MOD_ARCH_SPECIFIC bool help diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 5645bd65f4a8..82770cb64916 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -40,6 +40,7 @@ config ARM select HAVE_IDE if PCI || ISA || PCMCIA select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_KERNEL_XZ diff --git a/arch/arm/boot/compressed/.gitignore b/arch/arm/boot/compressed/.gitignore index f79a08efe000..47279aa96a6a 100644 --- a/arch/arm/boot/compressed/.gitignore +++ b/arch/arm/boot/compressed/.gitignore @@ -6,6 +6,7 @@ piggy.gzip piggy.lzo piggy.lzma piggy.xzkern +piggy.lz4 vmlinux vmlinux.lds diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 3580d57ea218..198a4ad89578 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -91,6 +91,7 @@ suffix_$(CONFIG_KERNEL_GZIP) = gzip suffix_$(CONFIG_KERNEL_LZO) = lzo suffix_$(CONFIG_KERNEL_LZMA) = lzma suffix_$(CONFIG_KERNEL_XZ) = xzkern +suffix_$(CONFIG_KERNEL_LZ4) = lz4 # Borrowed libfdt files for the ATAG compatibility mode @@ -115,7 +116,7 @@ targets := vmlinux vmlinux.lds \ font.o font.c head.o misc.o $(OBJS) # Make sure files are removed during clean -extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern \ +extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern piggy.lz4 \ lib1funcs.S ashldi3.S $(libfdt) $(libfdt_hdrs) ifeq ($(CONFIG_FUNCTION_TRACER),y) diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c index 24b0475cb8bf..bd245d34952d 100644 --- a/arch/arm/boot/compressed/decompress.c +++ b/arch/arm/boot/compressed/decompress.c @@ -51,6 +51,10 @@ extern char * strstr(const char * s1, const char *s2); #include "../../../../lib/decompress_unxz.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x)) { return decompress(input, len, NULL, NULL, output, NULL, error); diff --git a/arch/arm/boot/compressed/piggy.lz4.S b/arch/arm/boot/compressed/piggy.lz4.S new file mode 100644 index 000000000000..3d9a575618a3 --- /dev/null +++ b/arch/arm/boot/compressed/piggy.lz4.S @@ -0,0 +1,6 @@ + .section .piggydata,#alloc + .globl input_data +input_data: + .incbin "arch/arm/boot/compressed/piggy.lz4" + .globl input_data_end +input_data_end: diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 03deeffd9f6d..41668e56685f 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -886,20 +886,12 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_HAVE_HW_BREAKPOINT case PTRACE_GETHBPREGS: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - ret = ptrace_gethbpregs(child, addr, (unsigned long __user *)data); - ptrace_put_breakpoints(child); break; case PTRACE_SETHBPREGS: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - ret = ptrace_sethbpregs(child, addr, (unsigned long __user *)data); - ptrace_put_breakpoints(child); break; #endif diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 10062ceadd1c..0c6356255fe3 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 7c7be7855638..8ed6cb1a900f 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -90,11 +90,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 7e5fe2790d8a..f1baadd56e82 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -158,11 +158,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 98c2fc198712..6645e573c44c 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -975,16 +975,12 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; hw_brk.len = 8; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(task) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if ((!data) || !(hw_brk.type & HW_BRK_TYPE_RDWR)) { if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } - ptrace_put_breakpoints(task); return 0; } if (bp) { @@ -997,11 +993,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ret = modify_user_hw_breakpoint(bp, &attr); if (ret) { - ptrace_put_breakpoints(task); return ret; } thread->ptrace_bps[0] = bp; - ptrace_put_breakpoints(task); thread->hw_brk = hw_brk; return 0; } @@ -1016,12 +1010,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ptrace_triggered, NULL, task); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(task); return PTR_ERR(bp); } - ptrace_put_breakpoints(task); - #endif /* CONFIG_HAVE_HW_BREAKPOINT */ task->thread.hw_brk = hw_brk; #else /* CONFIG_PPC_ADV_DEBUG_REGS */ @@ -1440,9 +1431,6 @@ static long ppc_set_hwdebug(struct task_struct *child, if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) brk.type |= HW_BRK_TYPE_WRITE; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - /* * Check if the request is for 'range' breakpoints. We can * support it if range < 8 bytes. @@ -1450,12 +1438,10 @@ static long ppc_set_hwdebug(struct task_struct *child, if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) { len = bp_info->addr2 - bp_info->addr; } else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { - ptrace_put_breakpoints(child); return -EINVAL; } bp = thread->ptrace_bps[0]; if (bp) { - ptrace_put_breakpoints(child); return -ENOSPC; } @@ -1469,11 +1455,9 @@ static long ppc_set_hwdebug(struct task_struct *child, ptrace_triggered, NULL, child); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(child); return PTR_ERR(bp); } - ptrace_put_breakpoints(child); return 1; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ @@ -1517,16 +1501,12 @@ static long ppc_del_hwdebug(struct task_struct *child, long data) return -EINVAL; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } else ret = -ENOENT; - ptrace_put_breakpoints(child); return ret; #else /* CONFIG_HAVE_HW_BREAKPOINT */ if (child->thread.hw_brk.address == 0) diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c index 67a42ed0d2fc..cb8bdbe4972f 100644 --- a/arch/powerpc/mm/mmap_64.c +++ b/arch/powerpc/mm/mmap_64.c @@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c192e74250bf..e53ce161df74 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1355,10 +1355,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, , pmd_t *pmdp, + pgtable_t pgtable); #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); static inline int pmd_trans_splitting(pmd_t pmd) { diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 06bafec00278..40023290ee5b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -91,11 +91,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } @@ -176,11 +174,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = s390_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = s390_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 44b145055f35..f97bfbd8b5a2 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1076,7 +1076,7 @@ int s390_enable_sie(void) task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || #ifdef CONFIG_AIO - !hlist_empty(&tsk->mm->ioctx_list) || + tsk->mm->ioctx_rtree.rnode || #endif tsk->mm != tsk->active_mm) { task_unlock(tsk); @@ -1103,7 +1103,7 @@ int s390_enable_sie(void) task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || #ifdef CONFIG_AIO - !hlist_empty(&tsk->mm->ioctx_list) || + tsk->mm->ioctx_rtree.rnode || #endif tsk->mm != tsk->active_mm) { mmput(mm); @@ -1165,7 +1165,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, } } -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { struct list_head *lh = (struct list_head *) pgtable; @@ -1179,7 +1180,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) mm->pmd_huge_pte = pgtable; } -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { struct list_head *lh; pgtable_t pgtable; diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 81f999a672f6..668c81631c08 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -117,11 +117,7 @@ void user_enable_single_step(struct task_struct *child) set_tsk_thread_flag(child, TIF_SINGLESTEP); - if (ptrace_get_breakpoints(child) < 0) - return; - set_single_step(child, pc); - ptrace_put_breakpoints(child); } void user_disable_single_step(struct task_struct *child) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 7619f2f792af..d22b92d67844 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -853,10 +853,11 @@ extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif /* Encode and de-code a swap entry */ diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 2daaaa6eda23..51561b8b15ba 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -290,7 +290,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) sysctl_legacy_va_layout) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { /* We know it's 32-bit */ unsigned long task_size = STACK_TOP32; @@ -302,7 +301,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 83d89bcb44af..f828dd33551c 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -188,7 +188,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } } -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { struct list_head *lh = (struct list_head *) pgtable; @@ -202,7 +203,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) mm->pmd_huge_pte = pgtable; } -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { struct list_head *lh; pgtable_t pgtable; diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c index f96f4cec602a..d67d91ebf63e 100644 --- a/arch/tile/mm/mmap.c +++ b/arch/tile/mm/mmap.c @@ -66,10 +66,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(mm); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 685692c94f05..723e42eac544 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -65,6 +65,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO + select HAVE_KERNEL_LZ4 select HAVE_HW_BREAKPOINT select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS @@ -102,6 +103,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT select GENERIC_CMOS_UPDATE + select HAVE_ARCH_SOFT_DIRTY select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA if X86_64 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5ef205c5f37b..dcd90df10ab4 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,8 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo +targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ + vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -63,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE $(call if_changed,xzkern) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzo) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lz4) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_LZ4) := lz4 quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7cb56c6ca351..0319c88290a5 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -145,6 +145,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzo.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + static void scroll(void) { int i; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 805078e08013..fbeba82c703b 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -308,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index cccd07fa5e3a..b8e9224f0b45 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -17,6 +17,8 @@ extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void e820_add_region(u64 start, u64 size, int type); +extern void e820_add_limit_region(u64 start, u64 size, int type); +extern void e820_adjust_region(u64 *start, u64 *size); extern void e820_print_map(char *who); extern int sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1e672234c4ff..ebf937362479 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -207,7 +207,7 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY); + return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -271,7 +271,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd) static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY); + return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mkhuge(pmd_t pmd) @@ -294,6 +294,26 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } +static inline int pte_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; +} + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e6423002c10b..c98ac63aae48 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -55,6 +55,18 @@ #define _PAGE_HIDDEN (_AT(pteval_t, 0)) #endif +/* + * The same hidden bit is used by kmemcheck, but since kmemcheck + * works on kernel pages while soft-dirty engine on user space, + * they do not conflict with each other. + */ + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a1df6e84691f..27811190cbd7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -89,7 +89,6 @@ struct thread_info { #define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ -#define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ @@ -113,7 +112,6 @@ struct thread_info { #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) -#define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) @@ -154,7 +152,7 @@ struct thread_info { (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define PREEMPT_ACTIVE 0x10000000 diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d32abeabbda5..0d5bb689649a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe; #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif +static u64 mem_limit = ~0ULL; /* * This function checks if any part of the range <start,end> is mapped @@ -108,7 +109,7 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) * Add a memory region to the kernel e820 map. */ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, - int type) + int type, bool limited) { int x = e820x->nr_map; @@ -119,6 +120,22 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, return; } + if (limited) { + if (start >= mem_limit) { + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long)start, + (unsigned long long)(start + size - 1)); + return; + } + + if (mem_limit - start < size) { + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long)mem_limit, + (unsigned long long)(start + size - 1)); + size = mem_limit - start; + } + } + e820x->map[x].addr = start; e820x->map[x].size = size; e820x->map[x].type = type; @@ -127,7 +144,37 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(&e820, start, size, type, false); +} + +/* + * do_add_efi_memmap() calls this function(). + * + * Note: BOOT_SERVICES_{CODE,DATA} regions on some efi machines are marked + * as E820_RAM, and they are needed to be mapped. Please use e820_add_region() + * to add BOOT_SERVICES_{CODE,DATA} regions. + */ +void __init e820_add_limit_region(u64 start, u64 size, int type) +{ + /* + * efi_init() is called after finish_e820_parsing(), so we should + * check whether [start, start + size) contains address above + * mem_limit if the type is E820_RAM. + */ + __e820_add_region(&e820, start, size, type, type == E820_RAM); +} + +void __init e820_adjust_region(u64 *start, u64 *size) +{ + if (*start >= mem_limit) { + *size = 0; + return; + } + + if (mem_limit - *start < *size) + *size = mem_limit - *start; + + return; } static void __init e820_print_type(u32 type) @@ -455,8 +502,9 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, /* new range is totally covered? */ if (ei->addr < start && ei_end > end) { - __e820_add_region(e820x, start, size, new_type); - __e820_add_region(e820x, end, ei_end - end, ei->type); + __e820_add_region(e820x, start, size, new_type, false); + __e820_add_region(e820x, end, ei_end - end, ei->type, + false); ei->size = start - ei->addr; real_updated_size += size; continue; @@ -469,7 +517,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, continue; __e820_add_region(e820x, final_start, final_end - final_start, - new_type); + new_type, false); real_updated_size += final_end - final_start; @@ -809,7 +857,7 @@ static int userdef __initdata; /* "mem=nopentium" disables the 4MB page tables. */ static int __init parse_memopt(char *p) { - u64 mem_size; + char *oldp; if (!p) return -EINVAL; @@ -825,11 +873,11 @@ static int __init parse_memopt(char *p) } userdef = 1; - mem_size = memparse(p, &p); + oldp = p; + mem_limit = memparse(p, &p); /* don't remove all of memory when handling "mem={invalid}" param */ - if (mem_size == 0) + if (mem_limit == 0 || p == oldp) return -EINVAL; - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; } @@ -895,6 +943,12 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { + if (mem_limit != ~0ULL) { + userdef = 1; + e820_remove_range(mem_limit, ULLONG_MAX - mem_limit, + E820_RAM, 1); + } + if (userdef) { u32 nr = e820.nr_map; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 02f07634d265..f66ff162dce8 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -393,6 +393,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) unregister_hw_breakpoint(t->ptrace_bps[i]); t->ptrace_bps[i] = NULL; } + + t->debugreg6 = 0; + t->ptrace_dr7 = 0; } void hw_breakpoint_restore(void) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 29a8120e6fe8..7461f50d5bb1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -601,30 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static int -ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk, int disabled) +static int ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) +{ + int err, bp_len, bp_type; + + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) { - int err; - int gen_len, gen_type; struct perf_event_attr attr; + int err; - /* - * We should have at least an inactive breakpoint at this - * slot. It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) - return -EINVAL; + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) - return err; + return ERR_PTR(err); + + return register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} - attr = bp->attr; - attr.bp_len = gen_len; - attr.bp_type = gen_type; - attr.disabled = disabled; +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return err; return modify_user_hw_breakpoint(bp, &attr); } @@ -634,67 +652,50 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; - int i, orig_ret = 0, rc = 0; - int enabled, second_pass = 0; - unsigned len, type; - struct perf_event *bp; - - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; + bool second_pass = false; + int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + restore: - /* - * Loop through all the hardware breakpoints, making the - * appropriate changes to each. - */ + rc = 0; for (i = 0; i < HBP_NUM; i++) { - enabled = decode_dr7(data, i, &len, &type); - bp = thread->ptrace_bps[i]; - - if (!enabled) { - if (bp) { - /* - * Don't unregister the breakpoints right-away, - * unless all register_user_hw_breakpoint() - * requests have succeeded. This prevents - * any window of opportunity for debug - * register grabbing by other users. - */ - if (!second_pass) - continue; - - rc = ptrace_modify_breakpoint(bp, len, type, - tsk, 1); - if (rc) - break; + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; + + if (!bp) { + if (disabled) + continue; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; } + + thread->ptrace_bps[i] = bp; continue; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } - /* - * Make a second pass to free the remaining unused breakpoints - * or to restore the original breakpoints if an error occurred. - */ - if (!second_pass) { - second_pass = 1; - if (rc < 0) { - orig_ret = rc; - data = old_dr7; - } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; goto restore; } - ptrace_put_breakpoints(tsk); - - return ((orig_ret < 0) ? orig_ret : rc); + return ret; } /* @@ -702,25 +703,17 @@ restore: */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp; + struct perf_event *bp = thread->ptrace_bps[n]; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - bp = thread->ptrace_bps[n]; - if (!bp) - val = 0; - else + if (bp) val = bp->hw.info.address; - - ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; - } else if (n == 7) { + } else if (n == 7) { val = thread->ptrace_dr7; } return val; @@ -729,29 +722,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { - struct perf_event *bp; struct thread_struct *t = &tsk->thread; - struct perf_event_attr attr; + struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - if (!t->ptrace_bps[nr]) { - ptrace_breakpoint_init(&attr); - /* - * Put stub len and type to register (reserve) an inactive but - * correct bp - */ - attr.bp_addr = addr; - attr.bp_len = HW_BREAKPOINT_LEN_1; - attr.bp_type = HW_BREAKPOINT_W; - attr.disabled = 1; - - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, - NULL, tsk); - + if (!bp) { /* + * Put stub len and type to create an inactive but correct bp. + * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. @@ -760,22 +738,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. */ - if (IS_ERR(bp)) { + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) err = PTR_ERR(bp); - goto put; - } - - t->ptrace_bps[nr] = bp; + else + t->ptrace_bps[nr] = bp; } else { - bp = t->ptrace_bps[nr]; + struct perf_event_attr attr = bp->attr; - attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } -put: - ptrace_put_breakpoints(tsk); return err; } @@ -785,30 +761,20 @@ put: static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { - struct thread_struct *thread = &(tsk->thread); - int rc = 0; - + struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ - if (n == 4 || n == 5) - return -EIO; + int rc = -EIO; - if (n == 6) { - thread->debugreg6 = val; - goto ret_path; - } if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); - if (rc) - return rc; - } - /* All that's left is DR7 */ - if (n == 7) { + } else if (n == 6) { + thread->debugreg6 = val; + rc = 0; + } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } - -ret_path: return rc; } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 845df6835f9f..62c29a5bfe26 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 55856b2310d3..83e5cc472752 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -419,10 +419,17 @@ static void __init do_add_efi_memmap(void) int e820_type; switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: + /* EFI_BOOT_SERVICES_{CODE,DATA} needs to be mapped */ + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + e820_add_region(start, size, e820_type); + continue; + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: case EFI_CONVENTIONAL_MEMORY: if (md->attribute & EFI_MEMORY_WB) e820_type = E820_RAM; @@ -447,7 +454,7 @@ static void __init do_add_efi_memmap(void) e820_type = E820_RESERVED; break; } - e820_add_region(start, size, e820_type); + e820_add_limit_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } @@ -555,6 +562,8 @@ void __init efi_free_boot_services(void) md->type != EFI_BOOT_SERVICES_DATA) continue; + e820_adjust_region(&start, &size); + /* Could not reserve boot area */ if (!size) continue; diff --git a/block/blk-core.c b/block/blk-core.c index 0852e5d43436..25063acb5bdd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -153,7 +153,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, int error) + unsigned int nbytes, int error, + struct batch_complete *batch) { if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); @@ -167,7 +168,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); + bio_endio_batch(bio, error, batch); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -2281,7 +2282,8 @@ EXPORT_SYMBOL(blk_fetch_request); * %false - this request doesn't have any more data * %true - this request has more data **/ -bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) +bool blk_update_request(struct request *req, int error, unsigned int nr_bytes, + struct batch_complete *batch) { int total_bytes; @@ -2337,7 +2339,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (bio_bytes == bio->bi_size) req->bio = bio->bi_next; - req_bio_endio(req, bio, bio_bytes, error); + req_bio_endio(req, bio, bio_bytes, error, batch); total_bytes += bio_bytes; nr_bytes -= bio_bytes; @@ -2390,14 +2392,15 @@ EXPORT_SYMBOL_GPL(blk_update_request); static bool blk_update_bidi_request(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes) + unsigned int bidi_bytes, + struct batch_complete *batch) { - if (blk_update_request(rq, error, nr_bytes)) + if (blk_update_request(rq, error, nr_bytes, batch)) return true; /* Bidi request must be completed as a whole */ if (unlikely(blk_bidi_rq(rq)) && - blk_update_request(rq->next_rq, error, bidi_bytes)) + blk_update_request(rq->next_rq, error, bidi_bytes, batch)) return true; if (blk_queue_add_random(rq->q)) @@ -2480,7 +2483,7 @@ static bool blk_end_bidi_request(struct request *rq, int error, struct request_queue *q = rq->q; unsigned long flags; - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, NULL)) return true; spin_lock_irqsave(q->queue_lock, flags); @@ -2506,9 +2509,10 @@ static bool blk_end_bidi_request(struct request *rq, int error, * %true - still buffers pending for this request **/ bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) + unsigned int nr_bytes, unsigned int bidi_bytes, + struct batch_complete *batch) { - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, batch)) return true; blk_finish_request(rq, error); @@ -2609,7 +2613,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_err); **/ bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { - return __blk_end_bidi_request(rq, error, nr_bytes, 0); + return __blk_end_bidi_request(rq, error, nr_bytes, 0, NULL); } EXPORT_SYMBOL(__blk_end_request); @@ -2621,7 +2625,7 @@ EXPORT_SYMBOL(__blk_end_request); * Description: * Completely finish @rq. Must be called with queue lock held. */ -void __blk_end_request_all(struct request *rq, int error) +void blk_end_request_all_batch(struct request *rq, int error, struct batch_complete *batch) { bool pending; unsigned int bidi_bytes = 0; @@ -2629,10 +2633,10 @@ void __blk_end_request_all(struct request *rq, int error) if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); - pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); + pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes, batch); BUG_ON(pending); } -EXPORT_SYMBOL(__blk_end_request_all); +EXPORT_SYMBOL(blk_end_request_all_batch); /** * __blk_end_request_cur - Helper function to finish the current request chunk. diff --git a/block/blk-flush.c b/block/blk-flush.c index cc2b827a853c..ab0ed2358947 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -316,7 +316,7 @@ void blk_insert_flush(struct request *rq) * complete the request. */ if (!policy) { - __blk_end_bidi_request(rq, 0, 0, 0); + __blk_end_bidi_request(rq, 0, 0, 0, NULL); return; } @@ -384,7 +384,8 @@ void blk_abort_flushes(struct request_queue *q) } } -static void bio_end_flush(struct bio *bio, int err) +static void bio_end_flush(struct bio *bio, int err, + struct batch_complete *batch) { if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); diff --git a/block/blk-lib.c b/block/blk-lib.c index d6f50d572565..279f9de415be 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -15,7 +15,8 @@ struct bio_batch { struct completion *wait; }; -static void bio_batch_end_io(struct bio *bio, int err) +static void bio_batch_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct bio_batch *bb = bio->bi_private; diff --git a/block/blk.h b/block/blk.h index e837b8f619b7..dc8fee6d41d6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -31,7 +31,8 @@ void blk_queue_bypass_end(struct request_queue *q); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes); + unsigned int nr_bytes, unsigned int bidi_bytes, + struct batch_complete *batch); void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7c668c8a6f95..7e5d474dc6ba 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -59,6 +59,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, if (!disk->fops->getgeo) return -ENOTTY; + memset(&geo, 0, sizeof(geo)); /* * We need to set the startsect first, the driver may * want to override it. diff --git a/block/genhd.c b/block/genhd.c index e9094b375c05..2e1cfe31f080 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 75a54e1adbb5..4cebb2f0d2f4 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -68,6 +68,17 @@ config ACORN_PARTITION_RISCIX of machines called RISCiX. If you say 'Y' here, Linux will be able to read disks partitioned under RISCiX. +config AIX_PARTITION + bool "AIX basic partition table support" if PARTITION_ADVANCED + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM or Motorola PowerPC machines + running AIX. AIX actually uses a Logical Volume Manager, where + "logical volumes" can be spread across one or multiple disks, + but this driver works only for the simple case of partitions which + are contiguous. + Otherwise, say N. + config OSF_PARTITION bool "Alpha OSF partition support" if PARTITION_ADVANCED default y if ALPHA diff --git a/block/partitions/Makefile b/block/partitions/Makefile index bf26d4a61e4c..d79da0be0b2d 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BLOCK) := check.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_AIX_PARTITION) += aix.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/block/partitions/aix.c b/block/partitions/aix.c new file mode 100644 index 000000000000..43be471d9b1d --- /dev/null +++ b/block/partitions/aix.c @@ -0,0 +1,293 @@ +/* + * fs/partitions/aix.c + * + * Copyright (C) 2012-2013 Philippe De Muyter <phdm@macqel.be> + */ + +#include "check.h" +#include "aix.h" + +struct lvm_rec { + char lvm_id[4]; /* "_LVM" */ + char reserved4[16]; + __be32 lvmarea_len; + __be32 vgda_len; + __be32 vgda_psn[2]; + char reserved36[10]; + __be16 pp_size; /* log2(pp_size) */ + char reserved46[12]; + __be16 version; + }; + +struct vgda { + __be32 secs; + __be32 usec; + char reserved8[16]; + __be16 numlvs; + __be16 maxlvs; + __be16 pp_size; + __be16 numpvs; + __be16 total_vgdas; + __be16 vgda_size; + }; + +struct lvd { + __be16 lv_ix; + __be16 res2; + __be16 res4; + __be16 maxsize; + __be16 lv_state; + __be16 mirror; + __be16 mirror_policy; + __be16 num_lps; + __be16 res10[8]; + }; + +struct lvname { + char name[64]; + }; + +struct ppe { + __be16 lv_ix; + unsigned short res2; + unsigned short res4; + __be16 lp_ix; + unsigned short res8[12]; + }; + +struct pvd { + char reserved0[16]; + __be16 pp_count; + char reserved18[2]; + __be32 psn_part1; + char reserved24[8]; + struct ppe ppe[1016]; + }; + +#define LVM_MAXLVS 256 + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return (bdev->bd_inode->i_size >> 9) - 1ULL; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @count + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, + size_t count) +{ + size_t totalreadcount = 0; + + if (!buffer || lba + count / 512 > last_lba(state->bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, lba++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount += copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_pvd(): reads physical volume descriptor + * @state + * @lba + * + * Description: Returns pvd on success, NULL on error. + * Allocates space for pvd and fill it with disk blocks at @lba + * Notes: remember to free pvd when you're done! + */ +static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct pvd); + struct pvd *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +/** + * alloc_lvn(): reads logical volume names + * @state + * @lba + * + * Description: Returns lvn on success, NULL on error. + * Allocates space for lvn and fill it with disk blocks at @lba + * Notes: remember to free lvn when you're done! + */ +static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct lvname) * LVM_MAXLVS; + struct lvname *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +int aix_partition(struct parsed_partitions *state) +{ + int ret = 0; + Sector sect; + unsigned char *d; + u32 pp_bytes_size; + u32 pp_blocks_size = 0; + u32 vgda_sector = 0; + u32 vgda_len = 0; + int numlvs = 0; + struct pvd *pvd; + struct lv_info { + unsigned short pps_per_lv; + unsigned short pps_found; + unsigned char lv_is_contiguous; + } *lvip; + struct lvname *n = NULL; + + d = read_part_sector(state, 7, §); + if (d) { + struct lvm_rec *p = (struct lvm_rec *)d; + u16 lvm_version = be16_to_cpu(p->version); + char tmp[64]; + + if (lvm_version == 1) { + int pp_size_log2 = be16_to_cpu(p->pp_size); + + pp_bytes_size = 1 << pp_size_log2; + pp_blocks_size = pp_bytes_size / 512; + snprintf(tmp, sizeof(tmp), + " AIX LVM header version %u found\n", + lvm_version); + vgda_len = be32_to_cpu(p->vgda_len); + vgda_sector = be32_to_cpu(p->vgda_psn[0]); + } else { + snprintf(tmp, sizeof(tmp), + " unsupported AIX LVM version %d found\n", + lvm_version); + } + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + } + if (vgda_sector && (d = read_part_sector(state, vgda_sector, §))) { + struct vgda *p = (struct vgda *)d; + + numlvs = be16_to_cpu(p->numlvs); + put_dev_sector(sect); + } + lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL); + if (!lvip) + return 0; + if (numlvs && (d = read_part_sector(state, vgda_sector + 1, §))) { + struct lvd *p = (struct lvd *)d; + int i; + + n = alloc_lvn(state, vgda_sector + vgda_len - 33); + if (n) { + int foundlvs = 0; + + for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) { + lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps); + if (lvip[i].pps_per_lv) + foundlvs += 1; + } + } + put_dev_sector(sect); + } + pvd = alloc_pvd(state, vgda_sector + 17); + if (pvd) { + int numpps = be16_to_cpu(pvd->pp_count); + int psn_part1 = be32_to_cpu(pvd->psn_part1); + int i; + int cur_lv_ix = -1; + int next_lp_ix = 1; + int lp_ix; + + for (i = 0; i < numpps; i += 1) { + struct ppe *p = pvd->ppe + i; + unsigned int lv_ix; + + lp_ix = be16_to_cpu(p->lp_ix); + if (!lp_ix) { + next_lp_ix = 1; + continue; + } + lv_ix = be16_to_cpu(p->lv_ix) - 1; + if (lv_ix > state->limit) { + cur_lv_ix = -1; + continue; + } + lvip[lv_ix].pps_found += 1; + if (lp_ix == 1) { + cur_lv_ix = lv_ix; + next_lp_ix = 1; + } else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) { + next_lp_ix = 1; + continue; + } + if (lp_ix == lvip[lv_ix].pps_per_lv) { + char tmp[70]; + + put_partition(state, lv_ix + 1, + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, + lvip[lv_ix].pps_per_lv * pp_blocks_size); + snprintf(tmp, sizeof(tmp), " <%s>\n", + n[lv_ix].name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + lvip[lv_ix].lv_is_contiguous = 1; + ret = 1; + next_lp_ix = 1; + } else + next_lp_ix += 1; + } + for (i = 0; i < state->limit; i += 1) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + pr_warn("partition %s (%u pp's found) is " + "not contiguous\n", + n[i].name, lvip[i].pps_found); + kfree(pvd); + } + kfree(n); + kfree(lvip); + return ret; +} diff --git a/block/partitions/aix.h b/block/partitions/aix.h new file mode 100644 index 000000000000..e0c66a987523 --- /dev/null +++ b/block/partitions/aix.h @@ -0,0 +1 @@ +extern int aix_partition(struct parsed_partitions *state); diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 7681cd295ab8..9123f250b425 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -23,6 +23,7 @@ #include "check.h" #include "msdos.h" #include "efi.h" +#include "aix.h" /* * Many architectures don't like unaligned accesses, while @@ -90,7 +91,7 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; put_dev_sector(sect); - }; + } return ret; } @@ -142,7 +143,7 @@ static void parse_extended(struct parsed_partitions *state, return; if (!msdos_magic_present(data + 510)) - goto done; + goto done; p = (struct partition *) (data + 0x1be); @@ -155,7 +156,7 @@ static void parse_extended(struct parsed_partitions *state, * and OS/2 seems to use all four entries. */ - /* + /* * First process the data partition(s) */ for (i=0; i<4; i++, p++) { @@ -263,7 +264,7 @@ static void parse_solaris_x86(struct parsed_partitions *state, } #if defined(CONFIG_BSD_DISKLABEL) -/* +/* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ @@ -294,7 +295,7 @@ static void parse_bsd(struct parsed_partitions *state, if (state->next == state->limit) break; - if (p->p_fstype == BSD_FS_UNUSED) + if (p->p_fstype == BSD_FS_UNUSED) continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); @@ -441,7 +442,7 @@ static struct { {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, {0, NULL}, }; - + int msdos_partition(struct parsed_partitions *state) { sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; @@ -462,8 +463,12 @@ int msdos_partition(struct parsed_partitions *state) */ if (aix_magic_present(state, data)) { put_dev_sector(sect); +#ifdef CONFIG_AIX_PARTITION + return aix_partition(state); +#else strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; +#endif } if (!msdos_magic_present(data + 510)) { diff --git a/crypto/Kconfig b/crypto/Kconfig index d1ca6312d798..25b52098230a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1378,6 +1378,22 @@ config CRYPTO_842 help This is the 842 algorithm. +config CRYPTO_LZ4 + tristate "LZ4 compression algorithm" + select CRYPTO_ALGAPI + select LZ4_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 algorithm. + +config CRYPTO_LZ4HC + tristate "LZ4HC compression algorithm" + select CRYPTO_ALGAPI + select LZ4HC_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 high compression mode algorithm. + comment "Random Number Generation" config CRYPTO_ANSI_CPRNG diff --git a/crypto/Makefile b/crypto/Makefile index 62af87df8729..2d5ed08a239f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -86,6 +86,8 @@ obj-$(CONFIG_CRYPTO_CRC32) += crc32.o obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o +obj-$(CONFIG_CRYPTO_LZ4) += lz4.o +obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o obj-$(CONFIG_CRYPTO_842) += 842.o obj-$(CONFIG_CRYPTO_RNG2) += rng.o obj-$(CONFIG_CRYPTO_RNG2) += krng.o diff --git a/crypto/lz4.c b/crypto/lz4.c new file mode 100644 index 000000000000..4586dd15b0d8 --- /dev/null +++ b/crypto/lz4.c @@ -0,0 +1,106 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2013 Chanho Min <chanho.min@lge.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/vmalloc.h> +#include <linux/lz4.h> + +struct lz4_ctx { + void *lz4_comp_mem; +}; + +static int lz4_init(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4_comp_mem = vmalloc(LZ4_MEM_COMPRESS); + if (!ctx->lz4_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4_exit(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + vfree(ctx->lz4_comp_mem); +} + +static int lz4_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4_compress(src, slen, dst, &tmp_len, ctx->lz4_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + size_t __slen = slen; + + err = lz4_decompress(src, &__slen, dst, tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4 = { + .cra_name = "lz4", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4.cra_list), + .cra_init = lz4_init, + .cra_exit = lz4_exit, + .cra_u = { .compress = { + .coa_compress = lz4_compress_crypto, + .coa_decompress = lz4_decompress_crypto } } +}; + +static int __init lz4_mod_init(void) +{ + return crypto_register_alg(&alg_lz4); +} + +static void __exit lz4_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4); +} + +module_init(lz4_mod_init); +module_exit(lz4_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Compression Algorithm"); diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c new file mode 100644 index 000000000000..151ba31d34e3 --- /dev/null +++ b/crypto/lz4hc.c @@ -0,0 +1,106 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2013 Chanho Min <chanho.min@lge.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/vmalloc.h> +#include <linux/lz4.h> + +struct lz4hc_ctx { + void *lz4hc_comp_mem; +}; + +static int lz4hc_init(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4hc_comp_mem = vmalloc(LZ4HC_MEM_COMPRESS); + if (!ctx->lz4hc_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4hc_exit(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + vfree(ctx->lz4hc_comp_mem); +} + +static int lz4hc_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4hc_compress(src, slen, dst, &tmp_len, ctx->lz4hc_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + size_t __slen = slen; + + err = lz4_decompress(src, &__slen, dst, tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4hc = { + .cra_name = "lz4hc", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4hc_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4hc.cra_list), + .cra_init = lz4hc_init, + .cra_exit = lz4hc_exit, + .cra_u = { .compress = { + .coa_compress = lz4hc_compress_crypto, + .coa_decompress = lz4hc_decompress_crypto } } +}; + +static int __init lz4hc_mod_init(void) +{ + return crypto_register_alg(&alg_lz4hc); +} + +static void __exit lz4hc_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4hc); +} + +module_init(lz4hc_mod_init); +module_exit(lz4hc_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC Compression Algorithm"); diff --git a/drivers/block/blockconsole.c b/drivers/block/blockconsole.c index 65f8ace6dc02..656c5a6c1f4e 100644 --- a/drivers/block/blockconsole.c +++ b/drivers/block/blockconsole.c @@ -164,7 +164,8 @@ static void bcon_advance_console_bytes(struct blockconsole *bc, int bytes) } while (cmpxchg64(&bc->console_bytes, old, new) != old); } -static void request_complete(struct bio *bio, int err) +static void request_complete(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } @@ -289,7 +290,7 @@ static void bcon_unregister(struct work_struct *work) } #define BCON_MAX_ERRORS 10 -static void bcon_end_io(struct bio *bio, int err) +static void bcon_end_io(struct bio *bio, int err, struct batch_complete *batch) { struct bcon_bio *bcon_bio = container_of(bio, struct bcon_bio, bio); struct blockconsole *bc = bio->bi_private; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 64fbb8385cdc..046aa1793514 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -948,7 +948,8 @@ static void bm_aio_ctx_destroy(struct kref *kref) } /* bv_page may be a copy, or may be the original */ -static void bm_async_io_complete(struct bio *bio, int error) +static void bm_async_io_complete(struct bio *bio, int error, + struct batch_complete *batch) { struct bm_aio_ctx *ctx = bio->bi_private; struct drbd_conf *mdev = ctx->mdev; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 891c0ecaa292..04a80af8fddb 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -64,7 +64,8 @@ rwlock_t global_state_lock; /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ -void drbd_md_io_complete(struct bio *bio, int error) +void drbd_md_io_complete(struct bio *bio, int error, + struct batch_complete *batch) { struct drbd_md_io *md_io; struct drbd_conf *mdev; @@ -167,7 +168,8 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver. */ -void drbd_peer_request_endio(struct bio *bio, int error) +void drbd_peer_request_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct drbd_peer_request *peer_req = bio->bi_private; struct drbd_conf *mdev = peer_req->w.mdev; @@ -203,7 +205,8 @@ void drbd_peer_request_endio(struct bio *bio, int error) /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ -void drbd_request_endio(struct bio *bio, int error) +void drbd_request_endio(struct bio *bio, int error, + struct batch_complete *batch) { unsigned long flags; struct drbd_request *req = bio->bi_private; diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h index 328f18e4b4ee..d443dc06b854 100644 --- a/drivers/block/drbd/drbd_wrappers.h +++ b/drivers/block/drbd/drbd_wrappers.h @@ -20,9 +20,12 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) /* bi_end_io handlers */ -extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_peer_request_endio(struct bio *bio, int error); -extern void drbd_request_endio(struct bio *bio, int error); +extern void drbd_md_io_complete(struct bio *bio, int error, + struct batch_complete *batch); +extern void drbd_peer_request_endio(struct bio *bio, int error, + struct batch_complete *batch); +extern void drbd_request_endio(struct bio *bio, int error, + struct batch_complete *batch); /* * used to submit our private bio diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 04ceb7e2fadd..d5287538b588 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3746,7 +3746,8 @@ static unsigned int floppy_check_events(struct gendisk *disk, * a disk in the drive, and whether that disk is writable. */ -static void floppy_rb0_complete(struct bio *bio, int err) +static void floppy_rb0_complete(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 20dd52a2f92f..0d4ddce5b777 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -151,6 +151,9 @@ static void mtip_command_cleanup(struct driver_data *dd) struct mtip_cmd *command; struct mtip_port *port = dd->port; static int in_progress; + struct batch_complete batch; + + batch_complete_init(&batch); if (in_progress) return; @@ -166,11 +169,9 @@ static void mtip_command_cleanup(struct driver_data *dd) command = &port->commands[commandindex]; if (atomic_read(&command->active) - && (command->async_callback)) { - command->async_callback(command->async_data, - -ENODEV); - command->async_callback = NULL; - command->async_data = NULL; + && (command->bio)) { + bio_endio_batch(command->bio, -ENODEV, &batch); + command->bio = NULL; } dma_unmap_sg(&port->dd->pdev->dev, @@ -178,9 +179,10 @@ static void mtip_command_cleanup(struct driver_data *dd) command->scatter_ents, command->direction); } + up(&port->cmd_slot); } - up(&port->cmd_slot); + batch_complete(&batch); set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); in_progress = 0; @@ -580,6 +582,9 @@ static void mtip_timeout_function(unsigned long int data) unsigned int bit, group; unsigned int num_command_slots; unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; + struct batch_complete batch; + + batch_complete_init(&batch); if (unlikely(!port)) return; @@ -622,11 +627,9 @@ static void mtip_timeout_function(unsigned long int data) writel(1 << bit, port->completed[group]); /* Call the async completion callback. */ - if (likely(command->async_callback)) - command->async_callback(command->async_data, - -EIO); - command->async_callback = NULL; - command->comp_func = NULL; + if (likely(command->bio)) + bio_endio_batch(command->bio, -EIO, &batch); + command->bio = NULL; /* Unmap the DMA scatter list entries */ dma_unmap_sg(&port->dd->pdev->dev, @@ -645,6 +648,8 @@ static void mtip_timeout_function(unsigned long int data) } } + batch_complete(&batch); + if (cmdto_cnt) { print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { @@ -695,7 +700,8 @@ static void mtip_timeout_function(unsigned long int data) static void mtip_async_complete(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { struct mtip_cmd *command; struct driver_data *dd = data; @@ -712,11 +718,10 @@ static void mtip_async_complete(struct mtip_port *port, } /* Upper layer callback */ - if (likely(command->async_callback)) - command->async_callback(command->async_data, cb_status); + if (likely(command->bio)) + bio_endio_batch(command->bio, cb_status, batch); - command->async_callback = NULL; - command->comp_func = NULL; + command->bio = NULL; /* Unmap the DMA scatter list entries */ dma_unmap_sg(&dd->pdev->dev, @@ -752,24 +757,22 @@ static void mtip_async_complete(struct mtip_port *port, static void mtip_completion(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { - struct mtip_cmd *command = &port->commands[tag]; struct completion *waiting = data; if (unlikely(status == PORT_IRQ_TF_ERR)) dev_warn(&port->dd->pdev->dev, "Internal command %d completed with TFE\n", tag); - command->async_callback = NULL; - command->comp_func = NULL; - complete(waiting); } static void mtip_null_completion(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { return; } @@ -798,6 +801,7 @@ static void mtip_handle_tfe(struct driver_data *dd) unsigned char *buf; char *fail_reason = NULL; int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0; + struct batch_complete batch; dev_warn(&dd->pdev->dev, "Taskfile error\n"); @@ -815,13 +819,14 @@ static void mtip_handle_tfe(struct driver_data *dd) atomic_inc(&cmd->active); /* active > 1 indicates error */ if (cmd->comp_data && cmd->comp_func) { cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, PORT_IRQ_TF_ERR); + cmd->comp_data, PORT_IRQ_TF_ERR, NULL); } goto handle_tfe_exit; } /* clear the tag accumulator */ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + batch_complete_init(&batch); /* Loop through all the groups */ for (group = 0; group < dd->slot_groups; group++) { @@ -848,7 +853,7 @@ static void mtip_handle_tfe(struct driver_data *dd) cmd->comp_func(port, tag, cmd->comp_data, - 0); + 0, &batch); } else { dev_err(&port->dd->pdev->dev, "Missing completion func for tag %d", @@ -861,6 +866,7 @@ static void mtip_handle_tfe(struct driver_data *dd) } } } + batch_complete(&batch); print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt); @@ -902,6 +908,7 @@ static void mtip_handle_tfe(struct driver_data *dd) /* clear the tag accumulator */ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + batch_complete_init(&batch); /* Loop through all the groups */ for (group = 0; group < dd->slot_groups; group++) { @@ -935,7 +942,7 @@ static void mtip_handle_tfe(struct driver_data *dd) if (cmd->comp_func) { cmd->comp_func(port, tag, cmd->comp_data, - -ENODATA); + -ENODATA, &batch); } continue; } @@ -965,13 +972,15 @@ static void mtip_handle_tfe(struct driver_data *dd) port, tag, cmd->comp_data, - PORT_IRQ_TF_ERR); + PORT_IRQ_TF_ERR, &batch); else dev_warn(&port->dd->pdev->dev, "Bad completion for tag %d\n", tag); } } + + batch_complete(&batch); print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); handle_tfe_exit: @@ -992,6 +1001,9 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, struct driver_data *dd = port->dd; int tag, bit; struct mtip_cmd *command; + struct batch_complete batch; + + batch_complete_init(&batch); if (!completed) { WARN_ON_ONCE(!completed); @@ -1016,7 +1028,8 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, port, tag, command->comp_data, - 0); + 0, + &batch); } else { dev_warn(&dd->pdev->dev, "Null completion " @@ -1026,13 +1039,16 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, if (mtip_check_surprise_removal( dd->pdev)) { mtip_command_cleanup(dd); - return; + goto out; } } } completed >>= 1; } +out: + batch_complete(&batch); + /* If last, re-enable interrupts */ if (atomic_dec_return(&dd->irq_workers_active) == 0) writel(0xffffffff, dd->mmio + HOST_IRQ_STAT); @@ -1053,7 +1069,7 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd->comp_data, - 0); + 0, NULL); return; } } @@ -2561,8 +2577,8 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * None */ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, - int nsect, int nents, int tag, void *callback, - void *data, int dir, int unaligned) + int nsect, int nents, int tag, + struct bio *bio, int dir, int unaligned) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; @@ -2621,12 +2637,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, command->comp_func = mtip_async_complete; command->direction = dma_dir; - /* - * Set the completion function and data for the command passed - * from the upper layer. - */ - command->async_data = data; - command->async_callback = callback; + command->bio = bio; /* * To prevent this command from being issued @@ -3936,7 +3947,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) bio_sectors(bio), nents, tag, - bio_endio, bio, bio_data_dir(bio), unaligned); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 3bb8a295fbe4..7a2ddfdde9f4 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -328,11 +328,9 @@ struct mtip_cmd { void (*comp_func)(struct mtip_port *port, int tag, void *data, - int status); - /* Additional callback function that may be called by comp_func() */ - void (*async_callback)(void *data, int status); - - void *async_data; /* Addl. data passed to async_callback() */ + int status, + struct batch_complete *batch); + struct bio *bio; int scatter_ents; /* Number of scatter list entries used */ diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8efdfaa44a59..5fc5bd9daa13 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -413,7 +413,8 @@ struct nvme_bio_pair { atomic_t cnt; }; -static void nvme_bio_pair_endio(struct bio *bio, int err) +static void nvme_bio_pair_endio(struct bio *bio, int err, + struct batch_complete *batch) { struct nvme_bio_pair *bp = bio->bi_private; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3c08983e600a..898fa745bb32 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -980,7 +980,8 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec) } } -static void pkt_end_io_read(struct bio *bio, int err) +static void pkt_end_io_read(struct bio *bio, int err, + struct batch_complete *batch) { struct packet_data *pkt = bio->bi_private; struct pktcdvd_device *pd = pkt->pd; @@ -998,7 +999,8 @@ static void pkt_end_io_read(struct bio *bio, int err) pkt_bio_finished(pd); } -static void pkt_end_io_packet_write(struct bio *bio, int err) +static void pkt_end_io_packet_write(struct bio *bio, int err, + struct batch_complete *batch) { struct packet_data *pkt = bio->bi_private; struct pktcdvd_device *pd = pkt->pd; @@ -2337,7 +2339,8 @@ static void pkt_close(struct gendisk *disk, fmode_t mode) } -static void pkt_end_io_read_cloned(struct bio *bio, int err) +static void pkt_end_io_read_cloned(struct bio *bio, int err, + struct batch_complete *batch) { struct packet_stacked_data *psd = bio->bi_private; struct pktcdvd_device *pd = psd->pd; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 20e061c3e023..9282e66b7547 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -775,7 +775,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) if (intr & ERROR_INTR) { n = fs->scount - 1 - resid / 512; if (n > 0) { - blk_update_request(req, 0, n << 9); + blk_update_request(req, 0, n << 9, NULL); fs->req_sector += n; } if (fs->retries < 5) { diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5cdf88b7ad9e..bc154c23bc52 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -217,7 +217,8 @@ static void virtblk_bio_send_flush_work(struct work_struct *work) virtblk_bio_send_flush(vbr); } -static inline void virtblk_request_done(struct virtblk_req *vbr) +static inline void virtblk_request_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; struct request *req = vbr->req; @@ -231,11 +232,12 @@ static inline void virtblk_request_done(struct virtblk_req *vbr) req->errors = (error != 0); } - __blk_end_request_all(req, error); + blk_end_request_all_batch(req, error, batch); mempool_free(vbr, vblk->pool); } -static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) +static inline void virtblk_bio_flush_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; @@ -244,12 +246,13 @@ static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) INIT_WORK(&vbr->work, virtblk_bio_send_data_work); queue_work(virtblk_wq, &vbr->work); } else { - bio_endio(vbr->bio, virtblk_result(vbr)); + bio_endio_batch(vbr->bio, virtblk_result(vbr), batch); mempool_free(vbr, vblk->pool); } } -static inline void virtblk_bio_data_done(struct virtblk_req *vbr) +static inline void virtblk_bio_data_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; @@ -259,17 +262,18 @@ static inline void virtblk_bio_data_done(struct virtblk_req *vbr) INIT_WORK(&vbr->work, virtblk_bio_send_flush_work); queue_work(virtblk_wq, &vbr->work); } else { - bio_endio(vbr->bio, virtblk_result(vbr)); + bio_endio_batch(vbr->bio, virtblk_result(vbr), batch); mempool_free(vbr, vblk->pool); } } -static inline void virtblk_bio_done(struct virtblk_req *vbr) +static inline void virtblk_bio_done(struct virtblk_req *vbr, + struct batch_complete *batch) { if (unlikely(vbr->flags & VBLK_IS_FLUSH)) - virtblk_bio_flush_done(vbr); + virtblk_bio_flush_done(vbr, batch); else - virtblk_bio_data_done(vbr); + virtblk_bio_data_done(vbr, batch); } static void virtblk_done(struct virtqueue *vq) @@ -279,16 +283,19 @@ static void virtblk_done(struct virtqueue *vq) struct virtblk_req *vbr; unsigned long flags; unsigned int len; + struct batch_complete batch; + + batch_complete_init(&batch); spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); do { virtqueue_disable_cb(vq); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { if (vbr->bio) { - virtblk_bio_done(vbr); + virtblk_bio_done(vbr, &batch); bio_done = true; } else { - virtblk_request_done(vbr); + virtblk_request_done(vbr, &batch); req_done = true; } } @@ -298,6 +305,8 @@ static void virtblk_done(struct virtqueue *vq) blk_start_queue(vblk->disk->queue); spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); + batch_complete(&batch); + if (bio_done) wake_up(&vblk->queue_wait); } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index dd5b2fed97e9..990c1d81849f 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -741,7 +741,8 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) /* * bio callback. */ -static void end_block_io_op(struct bio *bio, int error) +static void end_block_io_op(struct bio *bio, int error, + struct batch_complete *batch) { __end_block_io_op(bio->bi_private, error); bio_put(bio); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 4afcb65cc623..5980cb9af857 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -830,9 +830,9 @@ probe_fail_cdrom_register: del_gendisk(gd.disk); probe_fail_no_disk: kfree(gd.cd_info); +probe_fail_no_mem: unregister_blkdev(gdrom_major, GDROM_DEV_NAME); gdrom_major = 0; -probe_fail_no_mem: pr_warning("Probe failed - error is 0x%X\n", err); return err; } diff --git a/drivers/char/mwave/tp3780i.c b/drivers/char/mwave/tp3780i.c index c68969708068..04e6d6a27994 100644 --- a/drivers/char/mwave/tp3780i.c +++ b/drivers/char/mwave/tp3780i.c @@ -479,6 +479,7 @@ int tp3780I_QueryAbilities(THINKPAD_BD_DATA * pBDData, MW_ABILITIES * pAbilities PRINTK_2(TRACE_TP3780I, "tp3780i::tp3780I_QueryAbilities entry pBDData %p\n", pBDData); + memset(pAbilities, 0, sizeof(*pAbilities)); /* fill out standard constant fields */ pAbilities->instr_per_sec = pBDData->rDspSettings.uIps; pAbilities->data_size = pBDData->rDspSettings.uDStoreSize; diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index b78cbe74dadf..442a15443fa5 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -399,6 +399,14 @@ static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode) return; /* + * fbdev->blank can be called from irq context in case of a panic. + * Since we already have our own special panic handler which will + * restore the fbdev console mode completely, just bail out early. + */ + if (oops_in_progress) + return; + + /* * For each CRTC in this fb, turn the connectors on/off. */ drm_modeset_lock_all(dev); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 784b97cb05b0..f2ef7ef0f36f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -383,14 +383,11 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; int id; - static int next_id; idr_preload(GFP_KERNEL); spin_lock_irqsave(&cm.lock, flags); - id = idr_alloc(&cm.local_id_table, cm_id_priv, next_id, 0, GFP_NOWAIT); - if (id >= 0) - next_id = max(id + 1, 0); + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); spin_unlock_irqrestore(&cm.lock, flags); idr_preload_end(); diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 048f2947e08b..1f75edd7f329 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -156,7 +156,8 @@ static void discard_finish(struct work_struct *w) closure_put(&ca->set->cl); } -static void discard_endio(struct bio *bio, int error) +static void discard_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct discard *d = container_of(bio, struct discard, bio); schedule_work(&d->work); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e62..36688d6bb061 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -134,7 +134,8 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -static void btree_bio_endio(struct bio *bio, int error) +static void btree_bio_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; struct btree *b = container_of(cl, struct btree, io.cl); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 89fd5204924e..3a32b063040b 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -177,7 +177,8 @@ void bch_btree_verify(struct btree *b, struct bset *new) mutex_unlock(&b->c->verify_lock); } -static void data_verify_endio(struct bio *bio, int error) +static void data_verify_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; closure_put(cl); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4dea645..29f344b9e408 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -9,7 +9,8 @@ #include "bset.h" #include "debug.h" -static void bch_bi_idx_hack_endio(struct bio *bio, int error) +static void bch_bi_idx_hack_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct bio *p = bio->bi_private; @@ -206,7 +207,8 @@ static void bch_bio_submit_split_done(struct closure *cl) mempool_free(s, s->p->bio_split_hook); } -static void bch_bio_submit_split_endio(struct bio *bio, int error) +static void bch_bio_submit_split_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdcd9d4c..bff194bb3e08 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -22,7 +22,8 @@ * bit. */ -static void journal_read_endio(struct bio *bio, int error) +static void journal_read_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; closure_put(cl); @@ -390,7 +391,8 @@ found: #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) -static void journal_discard_endio(struct bio *bio, int error) +static void journal_discard_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct journal_device *ja = container_of(bio, struct journal_device, discard_bio); @@ -535,7 +537,8 @@ void bch_journal_next(struct journal *j) pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); } -static void journal_write_endio(struct bio *bio, int error) +static void journal_write_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct journal_write *w = bio->bi_private; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 8589512c972e..8bf7ae12d4b0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -61,7 +61,8 @@ static void write_moving_finish(struct closure *cl) closure_return_with_destructor(cl, moving_io_destructor); } -static void read_moving_endio(struct bio *bio, int error) +static void read_moving_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct moving_io *io = container_of(bio->bi_private, struct moving_io, s.cl); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..bc837edd52ea 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -456,7 +456,8 @@ static void bch_insert_data_error(struct closure *cl) bch_journal(cl); } -static void bch_insert_data_endio(struct bio *bio, int error) +static void bch_insert_data_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; struct btree_op *op = container_of(cl, struct btree_op, cl); @@ -621,7 +622,8 @@ void bch_btree_insert_async(struct closure *cl) /* Common code for the make_request functions */ -static void request_endio(struct bio *bio, int error) +static void request_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; @@ -636,7 +638,8 @@ static void request_endio(struct bio *bio, int error) closure_put(cl); } -void bch_cache_read_endio(struct bio *bio, int error) +void bch_cache_read_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct bbio *b = container_of(bio, struct bbio, bio); struct closure *cl = bio->bi_private; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 254d9ab5707c..3b794625c4c1 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -29,11 +29,10 @@ struct search { struct btree_op op; }; -void bch_cache_read_endio(struct bio *, int); +void bch_cache_read_endio(struct bio *, int, struct batch_complete *batch); int bch_get_congested(struct cache_set *); void bch_insert_data(struct closure *cl); void bch_btree_insert_async(struct closure *); -void bch_cache_read_endio(struct bio *, int); void bch_open_buckets_free(struct cache_set *); int bch_open_buckets_alloc(struct cache_set *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f88e2b653a3f..9c57c596cff7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -224,7 +224,8 @@ err: return err; } -static void write_bdev_super_endio(struct bio *bio, int error) +static void write_bdev_super_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct cached_dev *dc = bio->bi_private; /* XXX: error checking */ @@ -285,7 +286,8 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_return(cl); } -static void write_super_endio(struct bio *bio, int error) +static void write_super_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct cache *ca = bio->bi_private; @@ -326,7 +328,7 @@ void bcache_write_super(struct cache_set *c) /* UUID io */ -static void uuid_endio(struct bio *bio, int error) +static void uuid_endio(struct bio *bio, int error, struct batch_complete *batch) { struct closure *cl = bio->bi_private; struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); @@ -490,7 +492,8 @@ static struct uuid_entry *uuid_find_empty(struct cache_set *c) * disk. */ -static void prio_endio(struct bio *bio, int error) +static void prio_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct cache *ca = bio->bi_private; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3991d1..657a8687a7b8 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -253,7 +253,8 @@ static void write_dirty_finish(struct closure *cl) closure_return_with_destructor(cl, dirty_io_destructor); } -static void dirty_endio(struct bio *bio, int error) +static void dirty_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; @@ -281,7 +282,8 @@ static void write_dirty(struct closure *cl) continue_at(cl, write_dirty_finish, dirty_wq); } -static void read_dirty_endio(struct bio *bio, int error) +static void read_dirty_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; @@ -289,7 +291,7 @@ static void read_dirty_endio(struct bio *bio, int error) bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), error, "reading dirty data from cache"); - dirty_endio(bio, error); + dirty_endio(bio, error, NULL); } static void read_dirty_submit(struct closure *cl) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 0387e05cdb98..d489dfd306e6 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -494,7 +494,7 @@ static void dmio_complete(unsigned long error, void *context) { struct dm_buffer *b = context; - b->bio.bi_end_io(&b->bio, error ? -EIO : 0); + b->bio.bi_end_io(&b->bio, error ? -EIO : 0, NULL); } static void use_dmio(struct dm_buffer *b, int rw, sector_t block, @@ -525,7 +525,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, r = dm_io(&io_req, 1, ®ion, NULL); if (r) - end_io(&b->bio, r); + end_io(&b->bio, r, NULL); } static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, @@ -592,7 +592,8 @@ static void submit_io(struct dm_buffer *b, int rw, sector_t block, * Set the error, clear B_WRITING bit and wake anyone who was waiting on * it. */ -static void write_endio(struct bio *bio, int error) +static void write_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); @@ -965,7 +966,7 @@ found_buffer: * The endio routine for reading: set the error, clear the bit and wake up * anyone waiting on the buffer. */ -static void read_endio(struct bio *bio, int error) +static void read_endio(struct bio *bio, int error, struct batch_complete *batch) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index df44b60e66f2..53fb7b25d7eb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -653,7 +653,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) wake_worker(cache); } -static void writethrough_endio(struct bio *bio, int err) +static void writethrough_endio(struct bio *bio, int err, + struct batch_complete *batch) { struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); bio->bi_end_io = pb->saved_bi_end_io; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6d2d41ae9e32..ec0e3c0883b5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -929,7 +929,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * The work is done per CPU global for all dm-crypt instances. * They should not depend on each other and do not block. */ -static void crypt_endio(struct bio *clone, int error) +static void crypt_endio(struct bio *clone, int error, + struct batch_complete *batch) { struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->cc; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea49834377c8..a727b267f86d 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -136,7 +136,7 @@ static void dec_count(struct io *io, unsigned int region, int error) } } -static void endio(struct bio *bio, int error) +static void endio(struct bio *bio, int error, struct batch_complete *batch) { struct io *io; unsigned region; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c434e5aab2df..fb3ea3c63fd1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1486,7 +1486,8 @@ static void start_copy(struct dm_snap_pending_exception *pe) dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } -static void full_bio_end_io(struct bio *bio, int error) +static void full_bio_end_io(struct bio *bio, int error, + struct batch_complete *batch) { void *callback_data = bio->bi_private; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 88f2f802d528..3045f9c724b5 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -553,7 +553,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) spin_unlock_irqrestore(&pool->lock, flags); } -static void overwrite_endio(struct bio *bio, int err) +static void overwrite_endio(struct bio *bio, int err, + struct batch_complete *batch) { unsigned long flags; struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index b948fd864d45..b373bb7d1c2d 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -413,7 +413,8 @@ static void verity_work(struct work_struct *w) verity_finish_io(io, verity_verify_io(io)); } -static void verity_end_io(struct bio *bio, int error) +static void verity_end_io(struct bio *bio, int error, + struct batch_complete *batch) { struct dm_verity_io *io = bio->bi_private; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d5370a94b2c1..290106082244 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -615,7 +615,8 @@ static void dec_pending(struct dm_io *io, int error) } } -static void clone_endio(struct bio *bio, int error) +static void clone_endio(struct bio *bio, int error, + struct batch_complete *batch) { int r = 0; struct dm_target_io *tio = bio->bi_private; @@ -650,7 +651,8 @@ static void clone_endio(struct bio *bio, int error) /* * Partial completion handling for request-based dm */ -static void end_clone_bio(struct bio *clone, int error) +static void end_clone_bio(struct bio *clone, int error, + struct batch_complete *batch) { struct dm_rq_clone_bio_info *info = clone->bi_private; struct dm_rq_target_io *tio = info->tio; @@ -694,7 +696,7 @@ static void end_clone_bio(struct bio *clone, int error) * Do not use blk_end_request() here, because it may complete * the original request before the clone, and break the ordering. */ - blk_update_request(tio->orig, 0, nr_bytes); + blk_update_request(tio->orig, 0, nr_bytes, NULL); } /* diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 3193aefe982b..ac8af5253fb6 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -70,7 +70,8 @@ #include <linux/seq_file.h> -static void faulty_fail(struct bio *bio, int error) +static void faulty_fail(struct bio *bio, int error, + struct batch_complete *batch) { struct bio *b = bio->bi_private; diff --git a/drivers/md/md.c b/drivers/md/md.c index 9b82377a833b..0bbe71078fa5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -379,7 +379,8 @@ EXPORT_SYMBOL(mddev_congested); * Generic flush handling for md */ -static void md_end_flush(struct bio *bio, int err) +static void md_end_flush(struct bio *bio, int err, + struct batch_complete *batch) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; @@ -756,7 +757,8 @@ void md_rdev_clear(struct md_rdev *rdev) } EXPORT_SYMBOL_GPL(md_rdev_clear); -static void super_written(struct bio *bio, int error) +static void super_written(struct bio *bio, int error, + struct batch_complete *batch) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; @@ -807,7 +809,8 @@ void md_super_wait(struct mddev *mddev) finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) +static void bi_complete(struct bio *bio, int error, + struct batch_complete *batch) { complete((struct completion*)bio->bi_private); } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 1642eae75a33..fecad70f53f6 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -83,7 +83,8 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) mempool_free(mp_bh, conf->pool); } -static void multipath_end_request(struct bio *bio, int error) +static void multipath_end_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct multipath_bh *mp_bh = bio->bi_private; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0fbe6549f493..57adb153cb46 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -294,7 +294,8 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) return mirror; } -static void raid1_end_read_request(struct bio *bio, int error) +static void raid1_end_read_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r1bio *r1_bio = bio->bi_private; @@ -379,7 +380,8 @@ static void r1_bio_write_done(struct r1bio *r1_bio) } } -static void raid1_end_write_request(struct bio *bio, int error) +static void raid1_end_write_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r1bio *r1_bio = bio->bi_private; @@ -1613,7 +1615,8 @@ abort: } -static void end_sync_read(struct bio *bio, int error) +static void end_sync_read(struct bio *bio, int error, + struct batch_complete *batch) { struct r1bio *r1_bio = bio->bi_private; @@ -1631,7 +1634,8 @@ static void end_sync_read(struct bio *bio, int error) reschedule_retry(r1_bio); } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_write(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r1bio *r1_bio = bio->bi_private; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 481ba3744bf1..4a8818d363ab 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -101,7 +101,8 @@ static int enough(struct r10conf *conf, int ignore); static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped); static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); -static void end_reshape_write(struct bio *bio, int error); +static void end_reshape_write(struct bio *bio, int error, + struct batch_complete *batch); static void end_reshape(struct r10conf *conf); static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) @@ -358,7 +359,8 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, return r10_bio->devs[slot].devnum; } -static void raid10_end_read_request(struct bio *bio, int error) +static void raid10_end_read_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; @@ -441,7 +443,8 @@ static void one_write_done(struct r10bio *r10_bio) } } -static void raid10_end_write_request(struct bio *bio, int error) +static void raid10_end_write_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; @@ -1914,7 +1917,8 @@ abort: } -static void end_sync_read(struct bio *bio, int error) +static void end_sync_read(struct bio *bio, int error, + struct batch_complete *batch) { struct r10bio *r10_bio = bio->bi_private; struct r10conf *conf = r10_bio->mddev->private; @@ -1975,7 +1979,8 @@ static void end_sync_request(struct r10bio *r10_bio) } } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_write(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; @@ -4600,7 +4605,8 @@ static int handle_reshape_read_error(struct mddev *mddev, return 0; } -static void end_reshape_write(struct bio *bio, int error) +static void end_reshape_write(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9359828ffe26..d014b66957e1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -532,9 +532,11 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) } static void -raid5_end_read_request(struct bio *bi, int error); +raid5_end_read_request(struct bio *bi, int error, + struct batch_complete *batch); static void -raid5_end_write_request(struct bio *bi, int error); +raid5_end_write_request(struct bio *bi, int error, + struct batch_complete *batch); static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { @@ -1713,7 +1715,8 @@ static void shrink_stripes(struct r5conf *conf) conf->slab_cache = NULL; } -static void raid5_end_read_request(struct bio * bi, int error) +static void raid5_end_read_request(struct bio *bi, int error, + struct batch_complete *batch) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; @@ -1833,7 +1836,8 @@ static void raid5_end_read_request(struct bio * bi, int error) release_stripe(sh); } -static void raid5_end_write_request(struct bio *bi, int error) +static void raid5_end_write_request(struct bio *bi, int error, + struct batch_complete *batch) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; @@ -3904,7 +3908,8 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * first). * If the read failed.. */ -static void raid5_align_endio(struct bio *bi, int error) +static void raid5_align_endio(struct bio *bi, int error, + struct batch_complete *batch) { struct bio* raid_bi = bi->bi_private; struct mddev *mddev; diff --git a/drivers/parport/share.c b/drivers/parport/share.c index a848e02e6be3..6a83ee1e9178 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -282,14 +282,13 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma, int device; char *name; - tmp = kmalloc(sizeof(struct parport), GFP_KERNEL); + tmp = kzalloc(sizeof(struct parport), GFP_KERNEL); if (!tmp) { printk(KERN_WARNING "parport: memory squeeze\n"); return NULL; } /* Init our structure */ - memset(tmp, 0, sizeof(struct parport)); tmp->base = base; tmp->irq = irq; tmp->dma = dma; diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c index d3db26e46489..5bd3bf093b54 100644 --- a/drivers/pps/clients/pps-gpio.c +++ b/drivers/pps/clients/pps-gpio.c @@ -74,7 +74,7 @@ static int pps_gpio_setup(struct platform_device *pdev) int ret; const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data; - ret = gpio_request(pdata->gpio_pin, pdata->gpio_label); + ret = devm_gpio_request(&pdev->dev, pdata->gpio_pin, pdata->gpio_label); if (ret) { pr_warning("failed to request GPIO %u\n", pdata->gpio_pin); return -EINVAL; @@ -83,7 +83,6 @@ static int pps_gpio_setup(struct platform_device *pdev) ret = gpio_direction_input(pdata->gpio_pin); if (ret) { pr_warning("failed to set pin direction\n"); - gpio_free(pdata->gpio_pin); return -EINVAL; } @@ -109,7 +108,6 @@ static int pps_gpio_probe(struct platform_device *pdev) struct pps_gpio_device_data *data; int irq; int ret; - int err; int pps_default_params; const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data; @@ -123,17 +121,14 @@ static int pps_gpio_probe(struct platform_device *pdev) irq = gpio_to_irq(pdata->gpio_pin); if (irq < 0) { pr_err("failed to map GPIO to IRQ: %d\n", irq); - err = -EINVAL; - goto return_error; + return -EINVAL; } /* allocate space for device info */ data = devm_kzalloc(&pdev->dev, sizeof(struct pps_gpio_device_data), GFP_KERNEL); - if (data == NULL) { - err = -ENOMEM; - goto return_error; - } + if (data == NULL) + return -ENOMEM; /* initialize PPS specific parts of the bookkeeping data structure. */ data->info.mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | @@ -152,41 +147,32 @@ static int pps_gpio_probe(struct platform_device *pdev) data->pps = pps_register_source(&data->info, pps_default_params); if (data->pps == NULL) { pr_err("failed to register IRQ %d as PPS source\n", irq); - err = -EINVAL; - goto return_error; + return -EINVAL; } data->irq = irq; data->pdata = pdata; /* register IRQ interrupt handler */ - ret = request_irq(irq, pps_gpio_irq_handler, + ret = devm_request_irq(&pdev->dev, irq, pps_gpio_irq_handler, get_irqf_trigger_flags(pdata), data->info.name, data); if (ret) { pps_unregister_source(data->pps); pr_err("failed to acquire IRQ %d\n", irq); - err = -EINVAL; - goto return_error; + return -EINVAL; } platform_set_drvdata(pdev, data); dev_info(data->pps->dev, "Registered IRQ %d as PPS source\n", irq); return 0; - -return_error: - gpio_free(pdata->gpio_pin); - return err; } static int pps_gpio_remove(struct platform_device *pdev) { struct pps_gpio_device_data *data = platform_get_drvdata(pdev); - const struct pps_gpio_platform_data *pdata = data->pdata; platform_set_drvdata(pdev, NULL); - free_irq(data->irq, data); - gpio_free(pdata->gpio_pin); pps_unregister_source(data->pps); pr_info("removed IRQ %d as PPS source\n", data->irq); return 0; @@ -201,23 +187,7 @@ static struct platform_driver pps_gpio_driver = { }, }; -static int __init pps_gpio_init(void) -{ - int ret = platform_driver_register(&pps_gpio_driver); - if (ret < 0) - pr_err("failed to register platform driver\n"); - return ret; -} - -static void __exit pps_gpio_exit(void) -{ - platform_driver_unregister(&pps_gpio_driver); - pr_debug("unregistered platform driver\n"); -} - -module_init(pps_gpio_init); -module_exit(pps_gpio_exit); - +module_platform_driver(pps_gpio_driver); MODULE_AUTHOR("Ricardo Martins <rasm@fe.up.pt>"); MODULE_AUTHOR("James Nuss <jamesnuss@nanometrics.ca>"); MODULE_DESCRIPTION("Use GPIO pin as PPS source"); diff --git a/drivers/rapidio/switches/Kconfig b/drivers/rapidio/switches/Kconfig index f47fee5d4563..62d4a065230f 100644 --- a/drivers/rapidio/switches/Kconfig +++ b/drivers/rapidio/switches/Kconfig @@ -26,10 +26,3 @@ config RAPIDIO_CPS_GEN2 default n ---help--- Includes support for ITD CPS Gen.2 serial RapidIO switches. - -config RAPIDIO_TSI500 - bool "Tsi500 Parallel RapidIO switch support" - depends on RAPIDIO - default n - ---help--- - Includes support for IDT Tsi500 parallel RapidIO switch. diff --git a/drivers/rapidio/switches/Makefile b/drivers/rapidio/switches/Makefile index c4d3acc3c715..051cc6b38188 100644 --- a/drivers/rapidio/switches/Makefile +++ b/drivers/rapidio/switches/Makefile @@ -5,5 +5,4 @@ obj-$(CONFIG_RAPIDIO_TSI57X) += tsi57x.o obj-$(CONFIG_RAPIDIO_CPS_XX) += idtcps.o obj-$(CONFIG_RAPIDIO_TSI568) += tsi568.o -obj-$(CONFIG_RAPIDIO_TSI500) += tsi500.o obj-$(CONFIG_RAPIDIO_CPS_GEN2) += idt_gen2.o diff --git a/drivers/rapidio/switches/tsi500.c b/drivers/rapidio/switches/tsi500.c deleted file mode 100644 index 914eddd5aa42..000000000000 --- a/drivers/rapidio/switches/tsi500.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * RapidIO Tsi500 switch support - * - * Copyright 2009-2010 Integrated Device Technology, Inc. - * Alexandre Bounine <alexandre.bounine@idt.com> - * - Modified switch operations initialization. - * - * Copyright 2005 MontaVista Software, Inc. - * Matt Porter <mporter@kernel.crashing.org> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -#include <linux/rio.h> -#include <linux/rio_drv.h> -#include <linux/rio_ids.h> -#include "../rio.h" - -static int -tsi500_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount, u16 table, u16 route_destid, u8 route_port) -{ - int i; - u32 offset = 0x10000 + 0xa00 + ((route_destid / 2)&~0x3); - u32 result; - - if (table == 0xff) { - rio_mport_read_config_32(mport, destid, hopcount, offset, &result); - result &= ~(0xf << (4*(route_destid & 0x7))); - for (i=0;i<4;i++) - rio_mport_write_config_32(mport, destid, hopcount, offset + (0x20000*i), result | (route_port << (4*(route_destid & 0x7)))); - } - else { - rio_mport_read_config_32(mport, destid, hopcount, offset + (0x20000*table), &result); - result &= ~(0xf << (4*(route_destid & 0x7))); - rio_mport_write_config_32(mport, destid, hopcount, offset + (0x20000*table), result | (route_port << (4*(route_destid & 0x7)))); - } - - return 0; -} - -static int -tsi500_route_get_entry(struct rio_mport *mport, u16 destid, u8 hopcount, u16 table, u16 route_destid, u8 *route_port) -{ - int ret = 0; - u32 offset = 0x10000 + 0xa00 + ((route_destid / 2)&~0x3); - u32 result; - - if (table == 0xff) - rio_mport_read_config_32(mport, destid, hopcount, offset, &result); - else - rio_mport_read_config_32(mport, destid, hopcount, offset + (0x20000*table), &result); - - result &= 0xf << (4*(route_destid & 0x7)); - *route_port = result >> (4*(route_destid & 0x7)); - if (*route_port > 3) - ret = -1; - - return ret; -} - -static int tsi500_switch_init(struct rio_dev *rdev, int do_enum) -{ - pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev)); - rdev->rswitch->add_entry = tsi500_route_add_entry; - rdev->rswitch->get_entry = tsi500_route_get_entry; - rdev->rswitch->clr_table = NULL; - rdev->rswitch->set_domain = NULL; - rdev->rswitch->get_domain = NULL; - rdev->rswitch->em_init = NULL; - rdev->rswitch->em_handle = NULL; - - return 0; -} - -DECLARE_RIO_SWITCH_INIT(RIO_VID_TUNDRA, RIO_DID_TSI500, tsi500_switch_init); diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 42bd57da239d..14c1efdd0757 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -109,9 +109,9 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs) err = rtc->ops->set_time(rtc->dev.parent, &new); } - } - else + } else { err = -EINVAL; + } mutex_unlock(&rtc->ops_lock); /* A timer might have just expired */ @@ -367,14 +367,14 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; - if (rtc->aie_timer.enabled) { + if (rtc->aie_timer.enabled) rtc_timer_remove(rtc, &rtc->aie_timer); - } + rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); rtc->aie_timer.period = ktime_set(0, 0); - if (alarm->enabled) { + if (alarm->enabled) err = rtc_timer_enqueue(rtc, &rtc->aie_timer); - } + mutex_unlock(&rtc->ops_lock); return err; } @@ -891,7 +891,7 @@ again: * * Kernel interface to initializing an rtc_timer. */ -void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) +void rtc_timer_init(struct rtc_timer *timer, void (*f)(void *p), void *data) { timerqueue_init(&timer->node); timer->enabled = 0; @@ -907,7 +907,7 @@ void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data) * * Kernel interface to set an rtc_timer */ -int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, +int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, ktime_t expires, ktime_t period) { int ret = 0; @@ -930,7 +930,7 @@ int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, * * Kernel interface to cancel an rtc_timer */ -int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer) +int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer) { int ret = 0; mutex_lock(&rtc->ops_lock); diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c index f3742f364eb8..354c937a5866 100644 --- a/drivers/rtc/rtc-88pm80x.c +++ b/drivers/rtc/rtc-88pm80x.c @@ -345,7 +345,6 @@ out: static int pm80x_rtc_remove(struct platform_device *pdev) { struct pm80x_rtc_info *info = platform_get_drvdata(pdev); - platform_set_drvdata(pdev, NULL); pm80x_free_irq(info->chip, info->irq, info); return 0; } diff --git a/drivers/rtc/rtc-88pm860x.c b/drivers/rtc/rtc-88pm860x.c index 0f2b91bfee37..4e30c85728e5 100644 --- a/drivers/rtc/rtc-88pm860x.c +++ b/drivers/rtc/rtc-88pm860x.c @@ -418,7 +418,6 @@ static int pm860x_rtc_remove(struct platform_device *pdev) pm860x_set_bits(info->i2c, PM8607_MEAS_EN2, MEAS2_VRTC, 0); #endif /* VRTC_CALIBRATION */ - platform_set_drvdata(pdev, NULL); return 0; } diff --git a/drivers/rtc/rtc-ab3100.c b/drivers/rtc/rtc-ab3100.c index 47a4f2c4d30e..ff435343ba9f 100644 --- a/drivers/rtc/rtc-ab3100.c +++ b/drivers/rtc/rtc-ab3100.c @@ -240,18 +240,11 @@ static int __init ab3100_rtc_probe(struct platform_device *pdev) return 0; } -static int __exit ab3100_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - return 0; -} - static struct platform_driver ab3100_rtc_driver = { .driver = { .name = "ab3100-rtc", .owner = THIS_MODULE, }, - .remove = __exit_p(ab3100_rtc_remove), }; module_platform_driver_probe(ab3100_rtc_driver, ab3100_rtc_probe); diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 63cfa314a39f..c5b62d4389ee 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -451,8 +451,6 @@ static int ab8500_rtc_remove(struct platform_device *pdev) { ab8500_sysfs_rtc_unregister(&pdev->dev); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/rtc/rtc-at32ap700x.c b/drivers/rtc/rtc-at32ap700x.c index f47fbb5eee8b..3161ab5263ed 100644 --- a/drivers/rtc/rtc-at32ap700x.c +++ b/drivers/rtc/rtc-at32ap700x.c @@ -141,7 +141,7 @@ static int at32_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) spin_lock_irq(&rtc->lock); - if(enabled) { + if (enabled) { if (rtc_readl(rtc, VAL) > rtc->alarm_time) { ret = -EINVAL; goto out; @@ -212,23 +212,20 @@ static int __init at32_rtc_probe(struct platform_device *pdev) regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!regs) { dev_dbg(&pdev->dev, "no mmio resource defined\n"); - ret = -ENXIO; - goto out; + return -ENXIO; } irq = platform_get_irq(pdev, 0); if (irq <= 0) { dev_dbg(&pdev->dev, "could not get irq\n"); - ret = -ENXIO; - goto out; + return -ENXIO; } rtc->irq = irq; rtc->regs = devm_ioremap(&pdev->dev, regs->start, resource_size(regs)); if (!rtc->regs) { - ret = -ENOMEM; dev_dbg(&pdev->dev, "could not map I/O memory\n"); - goto out; + return -ENOMEM; } spin_lock_init(&rtc->lock); @@ -249,7 +246,7 @@ static int __init at32_rtc_probe(struct platform_device *pdev) "rtc", rtc); if (ret) { dev_dbg(&pdev->dev, "could not request irq %d\n", irq); - goto out; + return ret; } platform_set_drvdata(pdev, rtc); @@ -258,8 +255,7 @@ static int __init at32_rtc_probe(struct platform_device *pdev) &at32_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) { dev_dbg(&pdev->dev, "could not register rtc device\n"); - ret = PTR_ERR(rtc->rtc); - goto out; + return PTR_ERR(rtc->rtc); } device_init_wakeup(&pdev->dev, 1); @@ -268,18 +264,12 @@ static int __init at32_rtc_probe(struct platform_device *pdev) (unsigned long)rtc->regs, rtc->irq); return 0; - -out: - platform_set_drvdata(pdev, NULL); - return ret; } static int __exit at32_rtc_remove(struct platform_device *pdev) { device_init_wakeup(&pdev->dev, 0); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 0eab77b22340..38ef46a9094e 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -30,8 +30,7 @@ #include <linux/io.h> #include <linux/of.h> #include <linux/of_device.h> - -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "rtc-at91rm9200.h" @@ -342,7 +341,6 @@ static int __exit at91_rtc_remove(struct platform_device *pdev) rtc_device_unregister(rtc); iounmap(at91_rtc_regs); - platform_set_drvdata(pdev, NULL); return 0; } diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c index b60a34cb145a..309b8b342d9c 100644 --- a/drivers/rtc/rtc-at91sam9.c +++ b/drivers/rtc/rtc-at91sam9.c @@ -324,16 +324,14 @@ static int at91_rtc_probe(struct platform_device *pdev) rtc->rtt = devm_ioremap(&pdev->dev, r->start, resource_size(r)); if (!rtc->rtt) { dev_err(&pdev->dev, "failed to map registers, aborting.\n"); - ret = -ENOMEM; - goto fail; + return -ENOMEM; } rtc->gpbr = devm_ioremap(&pdev->dev, r_gpbr->start, resource_size(r_gpbr)); if (!rtc->gpbr) { dev_err(&pdev->dev, "failed to map gpbr registers, aborting.\n"); - ret = -ENOMEM; - goto fail; + return -ENOMEM; } mr = rtt_readl(rtc, MR); @@ -350,17 +348,15 @@ static int at91_rtc_probe(struct platform_device *pdev) rtc->rtcdev = devm_rtc_device_register(&pdev->dev, pdev->name, &at91_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc->rtcdev)) { - ret = PTR_ERR(rtc->rtcdev); - goto fail; - } + if (IS_ERR(rtc->rtcdev)) + return PTR_ERR(rtc->rtcdev); /* register irq handler after we know what name we'll use */ ret = devm_request_irq(&pdev->dev, rtc->irq, at91_rtc_interrupt, IRQF_SHARED, dev_name(&rtc->rtcdev->dev), rtc); if (ret) { dev_dbg(&pdev->dev, "can't share IRQ %d?\n", rtc->irq); - goto fail; + return ret; } /* NOTE: sam9260 rev A silicon has a ROM bug which resets the @@ -374,10 +370,6 @@ static int at91_rtc_probe(struct platform_device *pdev) dev_name(&rtc->rtcdev->dev)); return 0; - -fail: - platform_set_drvdata(pdev, NULL); - return ret; } /* @@ -391,7 +383,6 @@ static int at91_rtc_remove(struct platform_device *pdev) /* disable all interrupts */ rtt_writel(rtc, MR, mr & ~(AT91_RTT_ALMIEN | AT91_RTT_RTTINCIEN)); - platform_set_drvdata(pdev, NULL); return 0; } diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c index 7995abc391fc..ed526a192ce0 100644 --- a/drivers/rtc/rtc-au1xxx.c +++ b/drivers/rtc/rtc-au1xxx.c @@ -116,19 +116,11 @@ out_err: return ret; } -static int au1xtoy_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - - return 0; -} - static struct platform_driver au1xrtc_driver = { .driver = { .name = "rtc-au1xxx", .owner = THIS_MODULE, }, - .remove = au1xtoy_rtc_remove, }; module_platform_driver_probe(au1xrtc_driver, au1xtoy_rtc_probe); diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c index ad44ec5dc29a..0c53f452849d 100644 --- a/drivers/rtc/rtc-bfin.c +++ b/drivers/rtc/rtc-bfin.c @@ -391,7 +391,6 @@ static int bfin_rtc_remove(struct platform_device *pdev) struct device *dev = &pdev->dev; bfin_rtc_reset(dev, 0); - platform_set_drvdata(pdev, NULL); return 0; } diff --git a/drivers/rtc/rtc-bq4802.c b/drivers/rtc/rtc-bq4802.c index af2886784a7b..fc0ff87aa5df 100644 --- a/drivers/rtc/rtc-bq4802.c +++ b/drivers/rtc/rtc-bq4802.c @@ -186,13 +186,6 @@ out: } -static int bq4802_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - - return 0; -} - /* work with hotplug and coldplug */ MODULE_ALIAS("platform:rtc-bq4802"); @@ -202,7 +195,6 @@ static struct platform_driver bq4802_driver = { .owner = THIS_MODULE, }, .probe = bq4802_probe, - .remove = bq4802_remove, }; module_platform_driver(bq4802_driver); diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index cc5bea9c4b1c..60b6fd4bee5f 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -326,7 +326,7 @@ static void cmos_irq_disable(struct cmos_rtc *cmos, unsigned char mask) static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) { struct cmos_rtc *cmos = dev_get_drvdata(dev); - unsigned char mon, mday, hrs, min, sec, rtc_control; + unsigned char mon, mday, hrs, min, sec, rtc_control; if (!is_valid_irq(cmos->irq)) return -EIO; @@ -691,7 +691,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) /* FIXME: * <asm-generic/rtc.h> doesn't know 12-hour mode either. */ - if (is_valid_irq(rtc_irq) && !(rtc_control & RTC_24H)) { + if (is_valid_irq(rtc_irq) && !(rtc_control & RTC_24H)) { dev_warn(dev, "only 24-hr supported\n"); retval = -ENXIO; goto cleanup1; @@ -989,7 +989,7 @@ static int cmos_pnp_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) { cmos_wake_setup(&pnp->dev); - if (pnp_port_start(pnp,0) == 0x70 && !pnp_irq_valid(pnp,0)) + if (pnp_port_start(pnp, 0) == 0x70 && !pnp_irq_valid(pnp, 0)) /* Some machines contain a PNP entry for the RTC, but * don't define the IRQ. It should always be safe to * hardcode it in these cases diff --git a/drivers/rtc/rtc-coh901331.c b/drivers/rtc/rtc-coh901331.c index 93c06588ddca..9ad58914601d 100644 --- a/drivers/rtc/rtc-coh901331.c +++ b/drivers/rtc/rtc-coh901331.c @@ -154,10 +154,8 @@ static int __exit coh901331_remove(struct platform_device *pdev) { struct coh901331_port *rtap = dev_get_drvdata(&pdev->dev); - if (rtap) { + if (rtap) clk_unprepare(rtap->clk); - platform_set_drvdata(pdev, NULL); - } return 0; } @@ -220,7 +218,6 @@ static int __init coh901331_probe(struct platform_device *pdev) return 0; out_no_rtc: - platform_set_drvdata(pdev, NULL); clk_unprepare(rtap->clk); return ret; } diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index 7286b279cf2d..2f7fdd77a6f0 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -255,16 +255,8 @@ static int da9052_rtc_probe(struct platform_device *pdev) return 0; } -static int da9052_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - - return 0; -} - static struct platform_driver da9052_rtc_driver = { .probe = da9052_rtc_probe, - .remove = da9052_rtc_remove, .driver = { .name = "da9052-rtc", .owner = THIS_MODULE, diff --git a/drivers/rtc/rtc-da9055.c b/drivers/rtc/rtc-da9055.c index 73858ca9709a..e00642b61076 100644 --- a/drivers/rtc/rtc-da9055.c +++ b/drivers/rtc/rtc-da9055.c @@ -315,13 +315,6 @@ err_rtc: } -static int da9055_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - - return 0; -} - #ifdef CONFIG_PM /* Turn off the alarm if it should not be a wake source. */ static int da9055_rtc_suspend(struct device *dev) @@ -394,7 +387,6 @@ static const struct dev_pm_ops da9055_rtc_pm_ops = { static struct platform_driver da9055_rtc_driver = { .probe = da9055_rtc_probe, - .remove = da9055_rtc_remove, .driver = { .name = "da9055-rtc", .owner = THIS_MODULE, diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index a55048c3e26f..24677ef8c39a 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -117,7 +117,7 @@ static DEFINE_SPINLOCK(davinci_rtc_lock); struct davinci_rtc { - struct rtc_device *rtc; + struct rtc_device *rtc; void __iomem *base; resource_size_t pbase; size_t base_size; @@ -526,10 +526,9 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) davinci_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &davinci_rtc_ops, THIS_MODULE); if (IS_ERR(davinci_rtc->rtc)) { - ret = PTR_ERR(davinci_rtc->rtc); dev_err(dev, "unable to register RTC device, err %d\n", ret); - goto fail1; + return PTR_ERR(davinci_rtc->rtc); } rtcif_write(davinci_rtc, PRTCIF_INTFLG_RTCSS, PRTCIF_INTFLG); @@ -543,7 +542,7 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) 0, "davinci_rtc", davinci_rtc); if (ret < 0) { dev_err(dev, "unable to register davinci RTC interrupt\n"); - goto fail1; + return ret; } /* Enable interrupts */ @@ -556,10 +555,6 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 0); return 0; - -fail1: - platform_set_drvdata(pdev, NULL); - return ret; } static int __exit davinci_rtc_remove(struct platform_device *pdev) @@ -570,8 +565,6 @@ static int __exit davinci_rtc_remove(struct platform_device *pdev) rtcif_write(davinci_rtc, 0, PRTCIF_INTEN); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c index 1e1ca63d58a9..1aca08394c47 100644 --- a/drivers/rtc/rtc-dm355evm.c +++ b/drivers/rtc/rtc-dm355evm.c @@ -139,19 +139,12 @@ static int dm355evm_rtc_probe(struct platform_device *pdev) return 0; } -static int dm355evm_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - return 0; -} - /* * I2C is used to talk to the MSP430, but this platform device is * exposed by an MFD driver that manages I2C communications. */ static struct platform_driver rtc_dm355evm_driver = { .probe = dm355evm_rtc_probe, - .remove = dm355evm_rtc_remove, .driver = { .owner = THIS_MODULE, .name = "rtc-dm355evm", diff --git a/drivers/rtc/rtc-ds1302.c b/drivers/rtc/rtc-ds1302.c index d13954346286..7533b723082d 100644 --- a/drivers/rtc/rtc-ds1302.c +++ b/drivers/rtc/rtc-ds1302.c @@ -234,19 +234,11 @@ static int __init ds1302_rtc_probe(struct platform_device *pdev) return 0; } -static int __exit ds1302_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - - return 0; -} - static struct platform_driver ds1302_platform_driver = { .driver = { .name = DRV_NAME, .owner = THIS_MODULE, }, - .remove = __exit_p(ds1302_rtc_remove), }; module_platform_driver_probe(ds1302_platform_driver, ds1302_rtc_probe); diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index bb5f13f63630..dd6170acde95 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -158,7 +158,7 @@ static int ds1305_alarm_irq_enable(struct device *dev, unsigned int enabled) goto done; buf[1] &= ~DS1305_AEI0; } - err = spi_write_then_read(ds1305->spi, buf, sizeof buf, NULL, 0); + err = spi_write_then_read(ds1305->spi, buf, sizeof(buf), NULL, 0); if (err >= 0) ds1305->ctrl[0] = buf[1]; done: @@ -181,8 +181,8 @@ static int ds1305_get_time(struct device *dev, struct rtc_time *time) /* Use write-then-read to get all the date/time registers * since dma from stack is nonportable */ - status = spi_write_then_read(ds1305->spi, &addr, sizeof addr, - buf, sizeof buf); + status = spi_write_then_read(ds1305->spi, &addr, sizeof(addr), + buf, sizeof(buf)); if (status < 0) return status; @@ -237,7 +237,7 @@ static int ds1305_set_time(struct device *dev, struct rtc_time *time) buf[4], buf[5], buf[6], buf[7]); /* use write-then-read since dma from stack is nonportable */ - return spi_write_then_read(ds1305->spi, buf, sizeof buf, + return spi_write_then_read(ds1305->spi, buf, sizeof(buf), NULL, 0); } @@ -286,8 +286,8 @@ static int ds1305_get_alarm(struct device *dev, struct rtc_wkalrm *alm) * of EFI status is at best fragile anyway (given IRQ handlers). */ addr = DS1305_CONTROL; - status = spi_write_then_read(spi, &addr, sizeof addr, - ds1305->ctrl, sizeof ds1305->ctrl); + status = spi_write_then_read(spi, &addr, sizeof(addr), + ds1305->ctrl, sizeof(ds1305->ctrl)); if (status < 0) return status; @@ -296,8 +296,8 @@ static int ds1305_get_alarm(struct device *dev, struct rtc_wkalrm *alm) /* get and check ALM0 registers */ addr = DS1305_ALM0(DS1305_SEC); - status = spi_write_then_read(spi, &addr, sizeof addr, - buf, sizeof buf); + status = spi_write_then_read(spi, &addr, sizeof(addr), + buf, sizeof(buf)); if (status < 0) return status; @@ -381,7 +381,7 @@ static int ds1305_set_alarm(struct device *dev, struct rtc_wkalrm *alm) "alm0 write", buf[1 + DS1305_SEC], buf[1 + DS1305_MIN], buf[1 + DS1305_HOUR], buf[1 + DS1305_WDAY]); - status = spi_write_then_read(spi, buf, sizeof buf, NULL, 0); + status = spi_write_then_read(spi, buf, sizeof(buf), NULL, 0); if (status < 0) return status; @@ -474,7 +474,7 @@ static void ds1305_work(struct work_struct *work) buf[1] = ds1305->ctrl[0]; buf[2] = 0; - status = spi_write_then_read(spi, buf, sizeof buf, + status = spi_write_then_read(spi, buf, sizeof(buf), NULL, 0); if (status < 0) dev_dbg(&spi->dev, "clear irq --> %d\n", status); @@ -627,8 +627,8 @@ static int ds1305_probe(struct spi_device *spi) /* read and cache control registers */ addr = DS1305_CONTROL; - status = spi_write_then_read(spi, &addr, sizeof addr, - ds1305->ctrl, sizeof ds1305->ctrl); + status = spi_write_then_read(spi, &addr, sizeof(addr), + ds1305->ctrl, sizeof(ds1305->ctrl)); if (status < 0) { dev_dbg(&spi->dev, "can't %s, %d\n", "read", status); @@ -659,7 +659,7 @@ static int ds1305_probe(struct spi_device *spi) buf[0] = DS1305_WRITE | DS1305_CONTROL; buf[1] = ds1305->ctrl[0]; - status = spi_write_then_read(spi, buf, sizeof buf, NULL, 0); + status = spi_write_then_read(spi, buf, sizeof(buf), NULL, 0); dev_dbg(&spi->dev, "clear WP --> %d\n", status); if (status < 0) @@ -713,7 +713,7 @@ static int ds1305_probe(struct spi_device *spi) buf[1] = ds1305->ctrl[0]; buf[2] = ds1305->ctrl[1]; buf[3] = ds1305->ctrl[2]; - status = spi_write_then_read(spi, buf, sizeof buf, NULL, 0); + status = spi_write_then_read(spi, buf, sizeof(buf), NULL, 0); if (status < 0) { dev_dbg(&spi->dev, "can't %s, %d\n", "write", status); @@ -725,8 +725,8 @@ static int ds1305_probe(struct spi_device *spi) /* see if non-Linux software set up AM/PM mode */ addr = DS1305_HOUR; - status = spi_write_then_read(spi, &addr, sizeof addr, - &value, sizeof value); + status = spi_write_then_read(spi, &addr, sizeof(addr), + &value, sizeof(value)); if (status < 0) { dev_dbg(&spi->dev, "read HOUR --> %d\n", status); return status; diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c index 94366e12f40f..9e6e14fb53d7 100644 --- a/drivers/rtc/rtc-ds1374.c +++ b/drivers/rtc/rtc-ds1374.c @@ -65,7 +65,7 @@ struct ds1374 { static struct i2c_driver ds1374_driver; static int ds1374_read_rtc(struct i2c_client *client, u32 *time, - int reg, int nbytes) + int reg, int nbytes) { u8 buf[4]; int ret; @@ -90,7 +90,7 @@ static int ds1374_read_rtc(struct i2c_client *client, u32 *time, } static int ds1374_write_rtc(struct i2c_client *client, u32 time, - int reg, int nbytes) + int reg, int nbytes) { u8 buf[4]; int i; @@ -119,8 +119,7 @@ static int ds1374_check_rtc_status(struct i2c_client *client) if (stat & DS1374_REG_SR_OSF) dev_warn(&client->dev, - "oscillator discontinuity flagged, " - "time unreliable\n"); + "oscillator discontinuity flagged, time unreliable\n"); stat &= ~(DS1374_REG_SR_OSF | DS1374_REG_SR_AF); @@ -363,7 +362,7 @@ static int ds1374_probe(struct i2c_client *client, if (client->irq > 0) { ret = devm_request_irq(&client->dev, client->irq, ds1374_irq, 0, - "ds1374", client); + "ds1374", client); if (ret) { dev_err(&client->dev, "unable to request IRQ\n"); return ret; @@ -373,7 +372,7 @@ static int ds1374_probe(struct i2c_client *client, } ds1374->rtc = devm_rtc_device_register(&client->dev, client->name, - &ds1374_rtc_ops, THIS_MODULE); + &ds1374_rtc_ops, THIS_MODULE); if (IS_ERR(ds1374->rtc)) { dev_err(&client->dev, "unable to register the class device\n"); return PTR_ERR(ds1374->rtc); diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 6ce8a997cf51..308a8fefe76f 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -104,31 +104,31 @@ static DEFINE_SPINLOCK(ds1511_lock); static __iomem char *ds1511_base; static u32 reg_spacing = 1; - static noinline void +static noinline void rtc_write(uint8_t val, uint32_t reg) { writeb(val, ds1511_base + (reg * reg_spacing)); } - static inline void +static inline void rtc_write_alarm(uint8_t val, enum ds1511reg reg) { rtc_write((val | 0x80), reg); } - static noinline uint8_t +static noinline uint8_t rtc_read(enum ds1511reg reg) { return readb(ds1511_base + (reg * reg_spacing)); } - static inline void +static inline void rtc_disable_update(void) { rtc_write((rtc_read(RTC_CMD) & ~RTC_TE), RTC_CMD); } - static void +static void rtc_enable_update(void) { rtc_write((rtc_read(RTC_CMD) | RTC_TE), RTC_CMD); @@ -145,7 +145,7 @@ rtc_enable_update(void) * just enough code to set the watchdog timer so that it * will reboot the system */ - void +void ds1511_wdog_set(unsigned long deciseconds) { /* @@ -163,7 +163,7 @@ ds1511_wdog_set(unsigned long deciseconds) rtc_write(DS1511_WDE | DS1511_WDS, RTC_CMD); } - void +void ds1511_wdog_disable(void) { /* @@ -191,13 +191,12 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) /* * won't have to change this for a while */ - if (rtc_tm->tm_year < 1900) { + if (rtc_tm->tm_year < 1900) rtc_tm->tm_year += 1900; - } - if (rtc_tm->tm_year < 1970) { + if (rtc_tm->tm_year < 1970) return -EINVAL; - } + yrs = rtc_tm->tm_year % 100; cen = rtc_tm->tm_year / 100; mon = rtc_tm->tm_mon + 1; /* tm_mon starts at zero */ @@ -207,17 +206,14 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) min = rtc_tm->tm_min; sec = rtc_tm->tm_sec; - if ((mon > 12) || (day == 0)) { + if ((mon > 12) || (day == 0)) return -EINVAL; - } - if (day > rtc_month_days(rtc_tm->tm_mon, rtc_tm->tm_year)) { + if (day > rtc_month_days(rtc_tm->tm_mon, rtc_tm->tm_year)) return -EINVAL; - } - if ((hrs >= 24) || (min >= 60) || (sec >= 60)) { + if ((hrs >= 24) || (min >= 60) || (sec >= 60)) return -EINVAL; - } /* * each register is a different number of valid bits @@ -299,7 +295,7 @@ static int ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm) * date/hours/mins/secs matches. the ds1511 has many more * permutations, but the kernel doesn't. */ - static void +static void ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) { unsigned long flags; @@ -322,7 +318,7 @@ ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) spin_unlock_irqrestore(&pdata->lock, flags); } - static int +static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct platform_device *pdev = to_platform_device(dev); @@ -335,14 +331,14 @@ ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) pdata->alrm_hour = alrm->time.tm_hour; pdata->alrm_min = alrm->time.tm_min; pdata->alrm_sec = alrm->time.tm_sec; - if (alrm->enabled) { + if (alrm->enabled) pdata->irqen |= RTC_AF; - } + ds1511_rtc_update_alarm(pdata); return 0; } - static int +static int ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct platform_device *pdev = to_platform_device(dev); @@ -359,7 +355,7 @@ ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } - static irqreturn_t +static irqreturn_t ds1511_interrupt(int irq, void *dev_id) { struct platform_device *pdev = dev_id; @@ -406,7 +402,7 @@ static const struct rtc_class_ops ds1511_rtc_ops = { .alarm_irq_enable = ds1511_rtc_alarm_irq_enable, }; - static ssize_t +static ssize_t ds1511_nvram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *ba, char *buf, loff_t pos, size_t size) @@ -417,26 +413,26 @@ ds1511_nvram_read(struct file *filp, struct kobject *kobj, * if count is more than one, turn on "burst" mode * turn it off when you're done */ - if (size > 1) { + if (size > 1) rtc_write((rtc_read(RTC_CMD) | DS1511_BME), RTC_CMD); - } - if (pos > DS1511_RAM_MAX) { + + if (pos > DS1511_RAM_MAX) pos = DS1511_RAM_MAX; - } - if (size + pos > DS1511_RAM_MAX + 1) { + + if (size + pos > DS1511_RAM_MAX + 1) size = DS1511_RAM_MAX - pos + 1; - } + rtc_write(pos, DS1511_RAMADDR_LSB); - for (count = 0; size > 0; count++, size--) { + for (count = 0; size > 0; count++, size--) *buf++ = rtc_read(DS1511_RAMDATA); - } - if (count > 1) { + + if (count > 1) rtc_write((rtc_read(RTC_CMD) & ~DS1511_BME), RTC_CMD); - } + return count; } - static ssize_t +static ssize_t ds1511_nvram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) @@ -447,22 +443,22 @@ ds1511_nvram_write(struct file *filp, struct kobject *kobj, * if count is more than one, turn on "burst" mode * turn it off when you're done */ - if (size > 1) { + if (size > 1) rtc_write((rtc_read(RTC_CMD) | DS1511_BME), RTC_CMD); - } - if (pos > DS1511_RAM_MAX) { + + if (pos > DS1511_RAM_MAX) pos = DS1511_RAM_MAX; - } - if (size + pos > DS1511_RAM_MAX + 1) { + + if (size + pos > DS1511_RAM_MAX + 1) size = DS1511_RAM_MAX - pos + 1; - } + rtc_write(pos, DS1511_RAMADDR_LSB); - for (count = 0; size > 0; count++, size--) { + for (count = 0; size > 0; count++, size--) rtc_write(*buf++, DS1511_RAMDATA); - } - if (count > 1) { + + if (count > 1) rtc_write((rtc_read(RTC_CMD) & ~DS1511_BME), RTC_CMD); - } + return count; } @@ -484,9 +480,9 @@ static int ds1511_rtc_probe(struct platform_device *pdev) int ret = 0; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { + if (!res) return -ENODEV; - } + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; @@ -518,9 +514,8 @@ static int ds1511_rtc_probe(struct platform_device *pdev) /* * check for a dying bat-tree */ - if (rtc_read(RTC_CMD1) & DS1511_BLF1) { + if (rtc_read(RTC_CMD1) & DS1511_BLF1) dev_warn(&pdev->dev, "voltage-low detected.\n"); - } spin_lock_init(&pdata->lock); platform_set_drvdata(pdev, pdata); diff --git a/drivers/rtc/rtc-ds3234.c b/drivers/rtc/rtc-ds3234.c index ba98c0e9580d..67f1a80112e2 100644 --- a/drivers/rtc/rtc-ds3234.c +++ b/drivers/rtc/rtc-ds3234.c @@ -73,7 +73,7 @@ static int ds3234_read_time(struct device *dev, struct rtc_time *dt) dt->tm_wday = bcd2bin(buf[3]) - 1; /* 0 = Sun */ dt->tm_mday = bcd2bin(buf[4]); dt->tm_mon = bcd2bin(buf[5] & 0x1f) - 1; /* 0 = Jan */ - dt->tm_year = bcd2bin(buf[6] & 0xff) + 100; /* Assume 20YY */ + dt->tm_year = bcd2bin(buf[6] & 0xff) + 100; /* Assume 20YY */ return rtc_valid_tm(dt); } diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 5807b77c444a..549b3c3792d2 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -167,7 +167,6 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) return 0; exit: - platform_set_drvdata(pdev, NULL); pdev->dev.platform_data = NULL; return err; } @@ -175,7 +174,6 @@ exit: static int ep93xx_rtc_remove(struct platform_device *pdev) { sysfs_remove_group(&pdev->dev.kobj, &ep93xx_rtc_sysfs_files); - platform_set_drvdata(pdev, NULL); pdev->dev.platform_data = NULL; return 0; diff --git a/drivers/rtc/rtc-fm3130.c b/drivers/rtc/rtc-fm3130.c index 2835fb6c1965..8f053daf0f0c 100644 --- a/drivers/rtc/rtc-fm3130.c +++ b/drivers/rtc/rtc-fm3130.c @@ -47,7 +47,7 @@ struct fm3130 { u8 reg_addr_time; - u8 reg_addr_alarm; + u8 reg_addr_alarm; u8 regs[15]; struct i2c_msg msg[4]; struct i2c_client *client; diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index 63024505dddc..4aca12d1d564 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -27,6 +27,12 @@ /* Usage ID from spec for Time: 0x2000A0 */ #define DRIVER_NAME "HID-SENSOR-2000a0" /* must be lowercase */ +static bool hid_time_hctosys_enabled = 1; +module_param_named(hctosys, hid_time_hctosys_enabled, bool, 0644); +MODULE_PARM_DESC(hctosys, + "set the system time (once) if it is before 1970-01-02"); +static bool hid_time_time_set_once; + enum hid_time_channel { CHANNEL_SCAN_INDEX_YEAR, CHANNEL_SCAN_INDEX_MONTH, @@ -34,18 +40,27 @@ enum hid_time_channel { CHANNEL_SCAN_INDEX_HOUR, CHANNEL_SCAN_INDEX_MINUTE, CHANNEL_SCAN_INDEX_SECOND, + CHANNEL_SCAN_INDEX_MILLISECOND, /* optional */ TIME_RTC_CHANNEL_MAX, }; +struct hid_time_workts { + struct work_struct work; + struct hid_time_state *time_state; +}; + struct hid_time_state { struct hid_sensor_hub_callbacks callbacks; struct hid_sensor_common common_attributes; struct hid_sensor_hub_attribute_info info[TIME_RTC_CHANNEL_MAX]; struct rtc_time last_time; + unsigned last_ms; /* == UINT_MAX to indicate ms aren't supported */ spinlock_t lock_last_time; struct completion comp_last_time; struct rtc_time time_buf; + unsigned ms_buf; struct rtc_device *rtc; + struct hid_time_workts *workts; }; static const u32 hid_time_addresses[TIME_RTC_CHANNEL_MAX] = { @@ -55,13 +70,65 @@ static const u32 hid_time_addresses[TIME_RTC_CHANNEL_MAX] = { HID_USAGE_SENSOR_TIME_HOUR, HID_USAGE_SENSOR_TIME_MINUTE, HID_USAGE_SENSOR_TIME_SECOND, + HID_USAGE_SENSOR_TIME_MILLISECOND, /* optional */ }; /* Channel names for verbose error messages */ static const char * const hid_time_channel_names[TIME_RTC_CHANNEL_MAX] = { - "year", "month", "day", "hour", "minute", "second", + "year", "month", "day", "hour", "minute", "second", "millisecond", }; +static void hid_time_hctosys(struct hid_time_state *time_state) +{ + struct timespec ts; + char msbuf[5]; + int rc = rtc_valid_tm(&time_state->last_time); + + if (rc) { + dev_err(time_state->rtc->dev.parent, + "hctosys: invalid date/time\n"); + return; + } + + getnstimeofday(&ts); + if (ts.tv_sec >= 86400) /* 1970-01-02 00:00:00 UTC */ + /* + * Security: don't let a hot-pluggable device change + * a valid system time. + * In absence of a kernel-wide flag like 'systime_was_set', + * we check if the system time is earlier than 1970-01-02. + * Just using a flag inside the RTC subsystem (e.g. from + * hctosys) doesn't work, because the time could be set + * by userspace (NTP, date, user, ...) or something else + * in the kernel too. + * Using a whole day to decide if something else has set the + * time should be enough for even the longest boot to complete + * (including possible forced fscks) and load this module. + */ + return; + + if (time_state->last_ms != UINT_MAX) { + ts.tv_nsec = time_state->last_ms * NSEC_PER_MSEC; + snprintf(msbuf, sizeof(msbuf), ":%03d", time_state->last_ms); + } else { + ts.tv_nsec = NSEC_PER_SEC >> 1; + *msbuf = 0; + } + rtc_tm_to_time(&time_state->last_time, &ts.tv_sec); + rc = do_settimeofday(&ts); + dev_info(time_state->rtc->dev.parent, + "hctosys: setting system clock to " + "%d-%02d-%02d %02d:%02d:%02d%s UTC (%u)\n", + time_state->last_time.tm_year + 1900, + time_state->last_time.tm_mon + 1, + time_state->last_time.tm_mday, + time_state->last_time.tm_hour, + time_state->last_time.tm_min, + time_state->last_time.tm_sec, + msbuf, + (unsigned int) ts.tv_sec); +} + /* Callback handler to send event after all samples are received and captured */ static int hid_time_proc_event(struct hid_sensor_hub_device *hsdev, unsigned usage_id, void *priv) @@ -71,11 +138,31 @@ static int hid_time_proc_event(struct hid_sensor_hub_device *hsdev, spin_lock_irqsave(&time_state->lock_last_time, flags); time_state->last_time = time_state->time_buf; + if (time_state->last_ms != UINT_MAX) + time_state->last_ms = time_state->ms_buf; spin_unlock_irqrestore(&time_state->lock_last_time, flags); complete(&time_state->comp_last_time); + if (unlikely(!hid_time_time_set_once && hid_time_hctosys_enabled)) { + hid_time_time_set_once = 1; + hid_time_hctosys(time_state); + } return 0; } +static u32 hid_time_value(size_t raw_len, char *raw_data) +{ + switch (raw_len) { + case 1: + return *(u8 *)raw_data; + case 2: + return *(u16 *)raw_data; + case 4: + return *(u32 *)raw_data; + default: + return (u32)(~0U); /* 0xff... or -1 to denote an error */ + } +} + static int hid_time_capture_sample(struct hid_sensor_hub_device *hsdev, unsigned usage_id, size_t raw_len, char *raw_data, void *priv) @@ -85,26 +172,38 @@ static int hid_time_capture_sample(struct hid_sensor_hub_device *hsdev, switch (usage_id) { case HID_USAGE_SENSOR_TIME_YEAR: - time_buf->tm_year = *(u8 *)raw_data; - if (time_buf->tm_year < 70) - /* assume we are in 1970...2069 */ - time_buf->tm_year += 100; + /* + * The draft for HID-sensors (HUTRR39) currently doesn't define + * the range for the year attribute. Therefor we support + * 8 bit (0-99) and 16 or 32 bits (full) as size for the year. + */ + if (raw_len == 1) { + time_buf->tm_year = *(u8 *)raw_data; + if (time_buf->tm_year < 70) + /* assume we are in 1970...2069 */ + time_buf->tm_year += 100; + } else + time_buf->tm_year = + (int)hid_time_value(raw_len, raw_data)-1900; break; case HID_USAGE_SENSOR_TIME_MONTH: - /* sensor sending the month as 1-12, we need 0-11 */ - time_buf->tm_mon = *(u8 *)raw_data-1; + /* sensors are sending the month as 1-12, we need 0-11 */ + time_buf->tm_mon = (int)hid_time_value(raw_len, raw_data)-1; break; case HID_USAGE_SENSOR_TIME_DAY: - time_buf->tm_mday = *(u8 *)raw_data; + time_buf->tm_mday = (int)hid_time_value(raw_len, raw_data); break; case HID_USAGE_SENSOR_TIME_HOUR: - time_buf->tm_hour = *(u8 *)raw_data; + time_buf->tm_hour = (int)hid_time_value(raw_len, raw_data); break; case HID_USAGE_SENSOR_TIME_MINUTE: - time_buf->tm_min = *(u8 *)raw_data; + time_buf->tm_min = (int)hid_time_value(raw_len, raw_data); break; case HID_USAGE_SENSOR_TIME_SECOND: - time_buf->tm_sec = *(u8 *)raw_data; + time_buf->tm_sec = (int)hid_time_value(raw_len, raw_data); + break; + case HID_USAGE_SENSOR_TIME_MILLISECOND: + time_state->ms_buf = hid_time_value(raw_len, raw_data); break; default: return -EINVAL; @@ -132,12 +231,18 @@ static int hid_time_parse_report(struct platform_device *pdev, { int report_id, i; - for (i = 0; i < TIME_RTC_CHANNEL_MAX; ++i) + /* Check if all required attributes are available */ + for (i = 0; i < CHANNEL_SCAN_INDEX_MILLISECOND; ++i) if (sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT, usage_id, hid_time_addresses[i], &time_state->info[i]) < 0) return -EINVAL; + if (!sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT, + usage_id, hid_time_addresses[i], &time_state->info[i])) { + dev_info(&pdev->dev, "milliseconds supported\n"); + time_state->last_ms = 0; + } /* Check the (needed) attributes for sanity */ report_id = time_state->info[0].report_id; if (report_id < 0) { @@ -145,14 +250,19 @@ static int hid_time_parse_report(struct platform_device *pdev, return -EINVAL; } for (i = 0; i < TIME_RTC_CHANNEL_MAX; ++i) { + if (time_state->info[i].attrib_id == + HID_USAGE_SENSOR_TIME_MILLISECOND && + time_state->last_ms == UINT_MAX) + continue; if (time_state->info[i].report_id != report_id) { dev_err(&pdev->dev, "not all needed attributes inside the same report!\n"); return -EINVAL; } - if (time_state->info[i].size != 1) { + if (time_state->info[i].size == 3 || + time_state->info[i].size > 4) { dev_err(&pdev->dev, - "attribute '%s' not 8 bits wide!\n", + "attribute '%s' not 8, 16 or 32 bits wide!\n", hid_time_attrib_name( time_state->info[i].attrib_id)); return -EINVAL; @@ -213,6 +323,19 @@ static const struct rtc_class_ops hid_time_rtc_ops = { .read_time = hid_rtc_read_time, }; +static void hid_time_request_report_work(struct work_struct *work) +{ + struct hid_time_state *time_state = + container_of(work, struct hid_time_workts, work) + ->time_state; + /* get a report with all values through requesting one value */ + sensor_hub_input_attr_get_raw_value( + time_state->common_attributes.hsdev, HID_USAGE_SENSOR_TIME, + hid_time_addresses[0], time_state->info[0].report_id); + time_state->workts = NULL; + kfree(work); +} + static int hid_time_probe(struct platform_device *pdev) { int ret = 0; @@ -223,6 +346,8 @@ static int hid_time_probe(struct platform_device *pdev) if (time_state == NULL) return -ENOMEM; + time_state->last_ms = UINT_MAX; + platform_set_drvdata(pdev, time_state); spin_lock_init(&time_state->lock_last_time); @@ -264,13 +389,35 @@ static int hid_time_probe(struct platform_device *pdev) return PTR_ERR(time_state->rtc); } + if (!hid_time_time_set_once && hid_time_hctosys_enabled) { + /* + * Request a HID report to set the time. + * Calling sensor_hub_..._get_raw_value() here directly + * doesn't work, therefor we have to use a work. + */ + time_state->workts = kmalloc(sizeof(struct hid_time_workts), + GFP_KERNEL); + if (time_state->workts == NULL) + return -ENOMEM; + time_state->workts->time_state = time_state; + INIT_WORK(&time_state->workts->work, + hid_time_request_report_work); + schedule_work(&time_state->workts->work); + } + return ret; } static int hid_time_remove(struct platform_device *pdev) { struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data; + struct hid_time_state *time_state = platform_get_drvdata(pdev); + if (time_state->workts) { + cancel_work_sync(&time_state->workts->work); + kfree(time_state->workts); + time_state->workts = NULL; + } sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TIME); return 0; diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 1e48686ca6d2..c6573585d028 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -287,7 +287,6 @@ err_free_irq: err_unregister_rtc: rtc_device_unregister(rtc->rtc); err_iounmap: - platform_set_drvdata(pdev, NULL); iounmap(rtc->base); err_release_mem_region: release_mem_region(rtc->mem->start, resource_size(rtc->mem)); @@ -310,8 +309,6 @@ static int jz4740_rtc_remove(struct platform_device *pdev) kfree(rtc); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/rtc/rtc-lp8788.c b/drivers/rtc/rtc-lp8788.c index 9853ac15b296..4ff6c73253b3 100644 --- a/drivers/rtc/rtc-lp8788.c +++ b/drivers/rtc/rtc-lp8788.c @@ -312,16 +312,8 @@ static int lp8788_rtc_probe(struct platform_device *pdev) return 0; } -static int lp8788_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - - return 0; -} - static struct platform_driver lp8788_rtc_driver = { .probe = lp8788_rtc_probe, - .remove = lp8788_rtc_remove, .driver = { .name = LP8788_DEV_RTC, .owner = THIS_MODULE, diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index 787550d756e9..8276ae94a2a9 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -277,7 +277,6 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) &lpc32xx_rtc_ops, THIS_MODULE); if (IS_ERR(rtc->rtc)) { dev_err(&pdev->dev, "Can't get RTC\n"); - platform_set_drvdata(pdev, NULL); return PTR_ERR(rtc->rtc); } @@ -306,8 +305,6 @@ static int lpc32xx_rtc_remove(struct platform_device *pdev) if (rtc->irq >= 0) device_init_wakeup(&pdev->dev, 0); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/rtc/rtc-ls1x.c b/drivers/rtc/rtc-ls1x.c index db82f91f4562..682ecb094839 100644 --- a/drivers/rtc/rtc-ls1x.c +++ b/drivers/rtc/rtc-ls1x.c @@ -185,19 +185,11 @@ err: return ret; } -static int ls1x_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - - return 0; -} - static struct platform_driver ls1x_rtc_driver = { .driver = { .name = "ls1x-rtc", .owner = THIS_MODULE, }, - .remove = ls1x_rtc_remove, .probe = ls1x_rtc_probe, }; diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index 89674b5e6efd..a5248aa1abf1 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -168,7 +168,7 @@ static int m41t80_set_datetime(struct i2c_client *client, struct rtc_time *tm) buf[M41T80_REG_MIN] = bin2bcd(tm->tm_min) | (buf[M41T80_REG_MIN] & ~0x7f); buf[M41T80_REG_HOUR] = - bin2bcd(tm->tm_hour) | (buf[M41T80_REG_HOUR] & ~0x3f) ; + bin2bcd(tm->tm_hour) | (buf[M41T80_REG_HOUR] & ~0x3f); buf[M41T80_REG_WDAY] = (tm->tm_wday & 0x07) | (buf[M41T80_REG_WDAY] & ~0x07); buf[M41T80_REG_DAY] = diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 130f29af3869..d4d31fa19244 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -513,7 +513,6 @@ static int m48t59_rtc_remove(struct platform_device *pdev) iounmap(m48t59->ioaddr); if (m48t59->irq != NO_IRQ) free_irq(m48t59->irq, &pdev->dev); - platform_set_drvdata(pdev, NULL); kfree(m48t59); return 0; } diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 33a91c484533..2d30314fa07f 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -166,20 +166,12 @@ static int m48t86_rtc_probe(struct platform_device *dev) return 0; } -static int m48t86_rtc_remove(struct platform_device *dev) -{ - platform_set_drvdata(dev, NULL); - - return 0; -} - static struct platform_driver m48t86_rtc_platform_driver = { .driver = { .name = "rtc-m48t86", .owner = THIS_MODULE, }, .probe = m48t86_rtc_probe, - .remove = m48t86_rtc_remove, }; module_platform_driver(m48t86_rtc_platform_driver); diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c index e3aea00c3145..e9dd56c8ffe7 100644 --- a/drivers/rtc/rtc-max6902.c +++ b/drivers/rtc/rtc-max6902.c @@ -159,7 +159,7 @@ static struct spi_driver max6902_driver = { module_spi_driver(max6902_driver); -MODULE_DESCRIPTION ("max6902 spi RTC driver"); -MODULE_AUTHOR ("Raphael Assenat"); -MODULE_LICENSE ("GPL"); +MODULE_DESCRIPTION("max6902 spi RTC driver"); +MODULE_AUTHOR("Raphael Assenat"); +MODULE_LICENSE("GPL"); MODULE_ALIAS("spi:rtc-max6902"); diff --git a/drivers/rtc/rtc-max77686.c b/drivers/rtc/rtc-max77686.c index 771812d62e6b..441c5c2e0bfc 100644 --- a/drivers/rtc/rtc-max77686.c +++ b/drivers/rtc/rtc-max77686.c @@ -119,7 +119,7 @@ static int max77686_rtc_tm_to_data(struct rtc_time *tm, u8 *data) data[RTC_WEEKDAY] = 1 << tm->tm_wday; data[RTC_DATE] = tm->tm_mday; data[RTC_MONTH] = tm->tm_mon + 1; - data[RTC_YEAR] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0 ; + data[RTC_YEAR] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0; if (tm->tm_year < 100) { pr_warn("%s: MAX77686 RTC cannot handle the year %d." diff --git a/drivers/rtc/rtc-max8925.c b/drivers/rtc/rtc-max8925.c index 7c90f4e45e27..951d1a78e190 100644 --- a/drivers/rtc/rtc-max8925.c +++ b/drivers/rtc/rtc-max8925.c @@ -268,7 +268,7 @@ static int max8925_rtc_probe(struct platform_device *pdev) if (ret < 0) { dev_err(chip->dev, "Failed to request IRQ: #%d: %d\n", info->irq, ret); - goto err; + return ret; } dev_set_drvdata(&pdev->dev, info); @@ -282,18 +282,10 @@ static int max8925_rtc_probe(struct platform_device *pdev) ret = PTR_ERR(info->rtc_dev); if (IS_ERR(info->rtc_dev)) { dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret); - goto err; + return ret; } return 0; -err: - platform_set_drvdata(pdev, NULL); - return ret; -} - -static int max8925_rtc_remove(struct platform_device *pdev) -{ - return 0; } #ifdef CONFIG_PM_SLEEP @@ -326,7 +318,6 @@ static struct platform_driver max8925_rtc_driver = { .pm = &max8925_rtc_pm_ops, }, .probe = max8925_rtc_probe, - .remove = max8925_rtc_remove, }; module_platform_driver(max8925_rtc_driver); diff --git a/drivers/rtc/rtc-max8997.c b/drivers/rtc/rtc-max8997.c index dacf48db7925..168390428f86 100644 --- a/drivers/rtc/rtc-max8997.c +++ b/drivers/rtc/rtc-max8997.c @@ -104,7 +104,7 @@ static int max8997_rtc_tm_to_data(struct rtc_time *tm, u8 *data) data[RTC_WEEKDAY] = 1 << tm->tm_wday; data[RTC_DATE] = tm->tm_mday; data[RTC_MONTH] = tm->tm_mon + 1; - data[RTC_YEAR] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0 ; + data[RTC_YEAR] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0; if (tm->tm_year < 100) { pr_warn("%s: MAX8997 RTC cannot handle the year %d." diff --git a/drivers/rtc/rtc-max8998.c b/drivers/rtc/rtc-max8998.c index d5af7baa48b5..5388336a2c4c 100644 --- a/drivers/rtc/rtc-max8998.c +++ b/drivers/rtc/rtc-max8998.c @@ -274,7 +274,7 @@ static int max8998_rtc_probe(struct platform_device *pdev) if (IS_ERR(info->rtc_dev)) { ret = PTR_ERR(info->rtc_dev); dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret); - goto out_rtc; + return ret; } ret = devm_request_threaded_irq(&pdev->dev, info->irq, NULL, @@ -292,15 +292,6 @@ static int max8998_rtc_probe(struct platform_device *pdev) } return 0; - -out_rtc: - platform_set_drvdata(pdev, NULL); - return ret; -} - -static int max8998_rtc_remove(struct platform_device *pdev) -{ - return 0; } static const struct platform_device_id max8998_rtc_id[] = { @@ -315,7 +306,6 @@ static struct platform_driver max8998_rtc_driver = { .owner = THIS_MODULE, }, .probe = max8998_rtc_probe, - .remove = max8998_rtc_remove, .id_table = max8998_rtc_id, }; diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 7a8ed27a5f2e..77ea9896b5ba 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -370,8 +370,6 @@ err_reset_irq_status: err_reset_irq_request: mc13xxx_unlock(mc13xxx); - - platform_set_drvdata(pdev, NULL); } return ret; @@ -389,8 +387,6 @@ static int __exit mc13xxx_rtc_remove(struct platform_device *pdev) mc13xxx_unlock(priv->mc13xxx); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c index bdcc60830aec..213006bfc69a 100644 --- a/drivers/rtc/rtc-mpc5121.c +++ b/drivers/rtc/rtc-mpc5121.c @@ -68,7 +68,7 @@ struct mpc5121_rtc_regs { u32 target_time; /* RTC + 0x20 */ /* * actual_time: - * readonly time since VBAT_RTC was last connected + * readonly time since VBAT_RTC was last connected */ u32 actual_time; /* RTC + 0x24 */ u32 keep_alive; /* RTC + 0x28 */ diff --git a/drivers/rtc/rtc-msm6242.c b/drivers/rtc/rtc-msm6242.c index 771f86a05d14..426cb5189daa 100644 --- a/drivers/rtc/rtc-msm6242.c +++ b/drivers/rtc/rtc-msm6242.c @@ -111,8 +111,8 @@ static void msm6242_lock(struct msm6242_priv *priv) } if (!cnt) - pr_warning("msm6242: timed out waiting for RTC (0x%x)\n", - msm6242_read(priv, MSM6242_CD)); + pr_warn("msm6242: timed out waiting for RTC (0x%x)\n", + msm6242_read(priv, MSM6242_CD)); } static void msm6242_unlock(struct msm6242_priv *priv) @@ -199,7 +199,6 @@ static int __init msm6242_rtc_probe(struct platform_device *pdev) struct resource *res; struct msm6242_priv *priv; struct rtc_device *rtc; - int error; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) @@ -216,22 +215,11 @@ static int __init msm6242_rtc_probe(struct platform_device *pdev) rtc = devm_rtc_device_register(&pdev->dev, "rtc-msm6242", &msm6242_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) { - error = PTR_ERR(rtc); - goto out_unmap; - } + if (IS_ERR(rtc)) + return PTR_ERR(rtc); priv->rtc = rtc; return 0; - -out_unmap: - platform_set_drvdata(pdev, NULL); - return error; -} - -static int __exit msm6242_rtc_remove(struct platform_device *pdev) -{ - return 0; } static struct platform_driver msm6242_rtc_driver = { @@ -239,7 +227,6 @@ static struct platform_driver msm6242_rtc_driver = { .name = "rtc-msm6242", .owner = THIS_MODULE, }, - .remove = __exit_p(msm6242_rtc_remove), }; module_platform_driver_probe(msm6242_rtc_driver, msm6242_rtc_probe); diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 9a3895bc4f4d..ab87bacb8f88 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -436,22 +436,20 @@ static int mxc_rtc_probe(struct platform_device *pdev) pdata->irq = -1; } - if (pdata->irq >=0) + if (pdata->irq >= 0) device_init_wakeup(&pdev->dev, 1); rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &mxc_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { ret = PTR_ERR(rtc); - goto exit_clr_drvdata; + goto exit_put_clk; } pdata->rtc = rtc; return 0; -exit_clr_drvdata: - platform_set_drvdata(pdev, NULL); exit_put_clk: clk_disable_unprepare(pdata->clk); @@ -465,7 +463,6 @@ static int mxc_rtc_remove(struct platform_device *pdev) struct rtc_plat_data *pdata = platform_get_drvdata(pdev); clk_disable_unprepare(pdata->clk); - platform_set_drvdata(pdev, NULL); return 0; } diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index d592e2fe43f7..22861c5e0c59 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -260,15 +260,7 @@ static int __init nuc900_rtc_probe(struct platform_device *pdev) return 0; } -static int __exit nuc900_rtc_remove(struct platform_device *pdev) -{ - platform_set_drvdata(pdev, NULL); - - return 0; -} - static struct platform_driver nuc900_rtc_driver = { - .remove = __exit_p(nuc900_rtc_remove), .driver = { .name = "nuc900-rtc", .owner = THIS_MODULE, diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index b0ba3fc991ea..6f6ac033d5d9 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -23,9 +23,7 @@ #include <linux/of.h> #include <linux/of_device.h> #include <linux/pm_runtime.h> - -#include <asm/io.h> - +#include <linux/io.h> /* The OMAP1 RTC is a year/month/day/hours/minutes/seconds BCD clock * with century-range alarm matching, driven by the 32kHz clock. diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c index 539a90b98bc5..40b5c630bc7d 100644 --- a/drivers/rtc/rtc-pcap.c +++ b/drivers/rtc/rtc-pcap.c @@ -156,10 +156,8 @@ static int __init pcap_rtc_probe(struct platform_device *pdev) pcap_rtc->rtc = devm_rtc_device_register(&pdev->dev, "pcap", &pcap_rtc_ops, THIS_MODULE); - if (IS_ERR(pcap_rtc->rtc)) { - err = PTR_ERR(pcap_rtc->rtc); - goto fail; - } + if (IS_ERR(pcap_rtc->rtc)) + return PTR_ERR(pcap_rtc->rtc); timer_irq = pcap_to_irq(pcap_rtc->pcap, PCAP_IRQ_1HZ); alarm_irq = pcap_to_irq(pcap_rtc->pcap, PCAP_IRQ_TODA); @@ -167,17 +165,14 @@ static int __init pcap_rtc_probe(struct platform_device *pdev) err = devm_request_irq(&pdev->dev, timer_irq, pcap_rtc_irq, 0, "RTC Timer", pcap_rtc); if (err) - goto fail; + return err; err = devm_request_irq(&pdev->dev, alarm_irq, pcap_rtc_irq, 0, "RTC Alarm", pcap_rtc); if (err) - goto fail; + return err; return 0; -fail: - platform_set_drvdata(pdev, NULL); - return err; } static int __exit pcap_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c index 796a6c5067dd..b2a78a02cf16 100644 --- a/drivers/rtc/rtc-pcf2123.c +++ b/drivers/rtc/rtc-pcf2123.c @@ -18,11 +18,11 @@ * should look something like: * * static struct spi_board_info ek_spi_devices[] = { - * ... - * { - * .modalias = "rtc-pcf2123", - * .chip_select = 1, - * .controller_data = (void *)AT91_PIN_PA10, + * ... + * { + * .modalias = "rtc-pcf2123", + * .chip_select = 1, + * .controller_data = (void *)AT91_PIN_PA10, * .max_speed_hz = 1000 * 1000, * .mode = SPI_CS_HIGH, * .bus_num = 0, diff --git a/drivers/rtc/rtc-pcf8583.c b/drivers/rtc/rtc-pcf8583.c index 95886dcf4a39..9971f7f7cac8 100644 --- a/drivers/rtc/rtc-pcf8583.c +++ b/drivers/rtc/rtc-pcf8583.c @@ -188,7 +188,8 @@ static int pcf8583_rtc_read_time(struct device *dev, struct rtc_time *tm) dev_warn(dev, "resetting control %02x -> %02x\n", ctrl, new_ctrl); - if ((err = pcf8583_set_ctrl(client, &new_ctrl)) < 0) + err = pcf8583_set_ctrl(client, &new_ctrl); + if (err < 0) return err; } diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index f1a6557261f3..14ee860d5a3f 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -480,7 +480,6 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) fail_req_irq: rtc_device_unregister(rtc_dd->rtc); fail_rtc_enable: - platform_set_drvdata(pdev, NULL); kfree(rtc_dd); return rc; } @@ -492,7 +491,6 @@ static int pm8xxx_rtc_remove(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 0); free_irq(rtc_dd->rtc_alarm_irq, rtc_dd); rtc_device_unregister(rtc_dd->rtc); - platform_set_drvdata(pdev, NULL); kfree(rtc_dd); return 0; diff --git a/drivers/rtc/rtc-puv3.c b/drivers/rtc/rtc-puv3.c index 72f437170d2e..402732cfb32a 100644 --- a/drivers/rtc/rtc-puv3.c +++ b/drivers/rtc/rtc-puv3.c @@ -224,7 +224,6 @@ static int puv3_rtc_remove(struct platform_device *dev) { struct rtc_device *rtc = platform_get_drvdata(dev); - platform_set_drvdata(dev, NULL); rtc_device_unregister(rtc); puv3_rtc_setpie(&dev->dev, 0); diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index 873c689f01c3..89d073679267 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -251,21 +251,15 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) rtc = devm_rtc_device_register(&dev->dev, "rtc-rp5c01", &rp5c01_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) { - error = PTR_ERR(rtc); - goto out; - } + if (IS_ERR(rtc)) + return PTR_ERR(rtc); priv->rtc = rtc; error = sysfs_create_bin_file(&dev->dev.kobj, &priv->nvram_attr); if (error) - goto out; + return error; return 0; - -out: - platform_set_drvdata(dev, NULL); - return error; } static int __exit rp5c01_rtc_remove(struct platform_device *dev) diff --git a/drivers/rtc/rtc-rs5c313.c b/drivers/rtc/rtc-rs5c313.c index 8089fc63e403..6af0f1f95f63 100644 --- a/drivers/rtc/rtc-rs5c313.c +++ b/drivers/rtc/rtc-rs5c313.c @@ -47,10 +47,10 @@ #include <linux/platform_device.h> #include <linux/bcd.h> #include <linux/delay.h> -#include <asm/io.h> +#include <linux/io.h> #define DRV_NAME "rs5c313" -#define DRV_VERSION "1.13" +#define DRV_VERSION "1.13" #ifdef CONFIG_SH_LANDISK /*****************************************************/ @@ -301,7 +301,7 @@ static int rs5c313_rtc_set_time(struct device *dev, struct rtc_time *tm) rs5c313_write_reg(RS5C313_ADDR_SEC10, (data >> 4)); data = bin2bcd(tm->tm_min); - rs5c313_write_reg(RS5C313_ADDR_MIN, data ); + rs5c313_write_reg(RS5C313_ADDR_MIN, data); rs5c313_write_reg(RS5C313_ADDR_MIN10, (data >> 4)); data = bin2bcd(tm->tm_hour); @@ -310,7 +310,7 @@ static int rs5c313_rtc_set_time(struct device *dev, struct rtc_time *tm) data = bin2bcd(tm->tm_mday); rs5c313_write_reg(RS5C313_ADDR_DAY, data); - rs5c313_write_reg(RS5C313_ADDR_DAY10, (data>> 4)); + rs5c313_write_reg(RS5C313_ADDR_DAY10, (data >> 4)); data = bin2bcd(tm->tm_mon + 1); rs5c313_write_reg(RS5C313_ADDR_MON, data); @@ -349,9 +349,9 @@ static void rs5c313_check_xstp_bit(void) } memset(&tm, 0, sizeof(struct rtc_time)); - tm.tm_mday = 1; - tm.tm_mon = 1 - 1; - tm.tm_year = 2000 - 1900; + tm.tm_mday = 1; + tm.tm_mon = 1 - 1; + tm.tm_year = 2000 - 1900; rs5c313_rtc_set_time(NULL, &tm); pr_err("invalid value, resetting to 1 Jan 2000\n"); @@ -388,7 +388,7 @@ static struct platform_driver rs5c313_rtc_platform_driver = { .name = DRV_NAME, .owner = THIS_MODULE, }, - .probe = rs5c313_rtc_probe, + .probe = rs5c313_rtc_probe, .remove = rs5c313_rtc_remove, }; @@ -408,7 +408,7 @@ static int __init rs5c313_rtc_init(void) static void __exit rs5c313_rtc_exit(void) { - platform_driver_unregister( &rs5c313_rtc_platform_driver ); + platform_driver_unregister(&rs5c313_rtc_platform_driver); } module_init(rs5c313_rtc_init); diff --git a/drivers/rtc/rtc-rv3029c2.c b/drivers/rtc/rtc-rv3029c2.c index 5032c24ec159..9100a3401de1 100644 --- a/drivers/rtc/rtc-rv3029c2.c +++ b/drivers/rtc/rtc-rv3029c2.c @@ -310,7 +310,7 @@ static int rv3029c2_rtc_i2c_set_alarm(struct i2c_client *client, dev_dbg(&client->dev, "alarm IRQ armed\n"); } else { /* disable AIE irq */ - ret = rv3029c2_rtc_i2c_alarm_set_irq(client, 1); + ret = rv3029c2_rtc_i2c_alarm_set_irq(client, 0); if (ret) return ret; diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 0b495e8b8e66..7afd373b9595 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -421,8 +421,6 @@ static void s3c_rtc_enable(struct platform_device *pdev, int en) static int s3c_rtc_remove(struct platform_device *dev) { - platform_set_drvdata(dev, NULL); - s3c_rtc_setaie(&dev->dev, 0); clk_unprepare(rtc_clk); @@ -549,23 +547,20 @@ static int s3c_rtc_probe(struct platform_device *pdev) 0, "s3c2410-rtc alarm", rtc); if (ret) { dev_err(&pdev->dev, "IRQ%d error %d\n", s3c_rtc_alarmno, ret); - goto err_alarm_irq; + goto err_nortc; } ret = devm_request_irq(&pdev->dev, s3c_rtc_tickno, s3c_rtc_tickirq, 0, "s3c2410-rtc tick", rtc); if (ret) { dev_err(&pdev->dev, "IRQ%d error %d\n", s3c_rtc_tickno, ret); - goto err_alarm_irq; + goto err_nortc; } clk_disable(rtc_clk); return 0; - err_alarm_irq: - platform_set_drvdata(pdev, NULL); - err_nortc: s3c_rtc_enable(pdev, 0); clk_disable_unprepare(rtc_clk); diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index 00605601dbf7..0f7adeb1944a 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -249,7 +249,7 @@ static int sa1100_rtc_probe(struct platform_device *pdev) ret = clk_prepare_enable(info->clk); if (ret) - goto err_enable_clk; + return ret; /* * According to the manual we should be able to let RTTR be zero * and then a default diviser for a 32.768KHz clock is used. @@ -303,8 +303,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev) return 0; err_dev: clk_disable_unprepare(info->clk); -err_enable_clk: - platform_set_drvdata(pdev, NULL); return ret; } @@ -312,10 +310,8 @@ static int sa1100_rtc_remove(struct platform_device *pdev) { struct sa1100_rtc *info = platform_get_drvdata(pdev); - if (info) { + if (info) clk_disable_unprepare(info->clk); - platform_set_drvdata(pdev, NULL); - } return 0; } diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index 8d5bd2e36776..cb2f8399d692 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -770,8 +770,6 @@ static int __exit sh_rtc_remove(struct platform_device *pdev) clk_disable(rtc->clk); clk_put(rtc->clk); - platform_set_drvdata(pdev, NULL); - kfree(rtc); return 0; diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c index 574359c48f65..c492cf0ab8cd 100644 --- a/drivers/rtc/rtc-spear.c +++ b/drivers/rtc/rtc-spear.c @@ -417,7 +417,6 @@ static int spear_rtc_probe(struct platform_device *pdev) return 0; err_disable_clock: - platform_set_drvdata(pdev, NULL); clk_disable_unprepare(config->clk); return status; diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 483ce086990b..90a3e864b8fe 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -225,7 +225,6 @@ static int stmp3xxx_rtc_remove(struct platform_device *pdev) writel(STMP3XXX_RTC_CTRL_ALARM_IRQ_EN, rtc_data->io + STMP3XXX_RTC_CTRL_CLR); - platform_set_drvdata(pdev, NULL); return 0; } @@ -274,25 +273,19 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev) rtc_data->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &stmp3xxx_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc_data->rtc)) { - err = PTR_ERR(rtc_data->rtc); - goto out; - } + if (IS_ERR(rtc_data->rtc)) + return PTR_ERR(rtc_data->rtc); err = devm_request_irq(&pdev->dev, rtc_data->irq_alarm, stmp3xxx_rtc_interrupt, 0, "RTC alarm", &pdev->dev); if (err) { dev_err(&pdev->dev, "Cannot claim IRQ%d\n", rtc_data->irq_alarm); - goto out; + return err; } stmp3xxx_wdt_register(pdev); return 0; - -out: - platform_set_drvdata(pdev, NULL); - return err; } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-sysfs.c b/drivers/rtc/rtc-sysfs.c index b70e2bb63645..4b26f8672b2d 100644 --- a/drivers/rtc/rtc-sysfs.c +++ b/drivers/rtc/rtc-sysfs.c @@ -164,6 +164,7 @@ rtc_sysfs_set_wakealarm(struct device *dev, struct device_attribute *attr, { ssize_t retval; unsigned long now, alarm; + unsigned long push = 0; struct rtc_wkalrm alm; struct rtc_device *rtc = to_rtc_device(dev); char *buf_ptr; @@ -180,13 +181,17 @@ rtc_sysfs_set_wakealarm(struct device *dev, struct device_attribute *attr, buf_ptr = (char *)buf; if (*buf_ptr == '+') { buf_ptr++; - adjust = 1; + if (*buf_ptr == '=') { + buf_ptr++; + push = 1; + } else + adjust = 1; } alarm = simple_strtoul(buf_ptr, NULL, 0); if (adjust) { alarm += now; } - if (alarm > now) { + if (alarm > now || push) { /* Avoid accidentally clobbering active alarms; we can't * entirely prevent that here, without even the minimal * locking from the /dev/rtcN api. @@ -194,9 +199,14 @@ rtc_sysfs_set_wakealarm(struct device *dev, struct device_attribute *attr, retval = rtc_read_alarm(rtc, &alm); if (retval < 0) return retval; - if (alm.enabled) - return -EBUSY; - + if (alm.enabled) { + if (push) { + rtc_tm_to_time(&alm.time, &push); + alarm += push; + } else + return -EBUSY; + } else if (push) + return -EINVAL; alm.enabled = 1; } else { alm.enabled = 0; diff --git a/drivers/rtc/rtc-tile.c b/drivers/rtc/rtc-tile.c index fc3dee95f166..ff9632eb79f2 100644 --- a/drivers/rtc/rtc-tile.c +++ b/drivers/rtc/rtc-tile.c @@ -91,23 +91,12 @@ static int tile_rtc_probe(struct platform_device *dev) return 0; } -/* - * Device cleanup routine. - */ -static int tile_rtc_remove(struct platform_device *dev) -{ - platform_set_drvdata(dev, NULL); - - return 0; -} - static struct platform_driver tile_rtc_platform_driver = { .driver = { .name = "rtc-tile", .owner = THIS_MODULE, }, .probe = tile_rtc_probe, - .remove = tile_rtc_remove, }; /* diff --git a/drivers/rtc/rtc-tps6586x.c b/drivers/rtc/rtc-tps6586x.c index 459c2ffc95a6..426901cef14f 100644 --- a/drivers/rtc/rtc-tps6586x.c +++ b/drivers/rtc/rtc-tps6586x.c @@ -273,6 +273,8 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) return ret; } + device_init_wakeup(&pdev->dev, 1); + platform_set_drvdata(pdev, rtc); rtc->rtc = devm_rtc_device_register(&pdev->dev, dev_name(&pdev->dev), &tps6586x_rtc_ops, THIS_MODULE); @@ -292,7 +294,6 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) goto fail_rtc_register; } disable_irq(rtc->irq); - device_set_wakeup_capable(&pdev->dev, 1); return 0; fail_rtc_register: diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index 8751a5240c99..103fea21e73c 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -555,7 +555,6 @@ static int twl_rtc_remove(struct platform_device *pdev) free_irq(irq, rtc); rtc_device_unregister(rtc); - platform_set_drvdata(pdev, NULL); return 0; } diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 6e0cba8f47d5..d07d89823020 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -16,7 +16,7 @@ * - Use the generic rtc class * * ??-???-2004: Someone at Compulab - * - Initial driver creation. + * - Initial driver creation. * */ #include <linux/platform_device.h> @@ -278,13 +278,13 @@ static int v3020_set_time(struct device *dev, struct rtc_time *dt) dev_dbg(dev, "tm_year: %i\n", dt->tm_year); /* Write all the values to ram... */ - v3020_set_reg(chip, V3020_SECONDS, bin2bcd(dt->tm_sec)); - v3020_set_reg(chip, V3020_MINUTES, bin2bcd(dt->tm_min)); - v3020_set_reg(chip, V3020_HOURS, bin2bcd(dt->tm_hour)); + v3020_set_reg(chip, V3020_SECONDS, bin2bcd(dt->tm_sec)); + v3020_set_reg(chip, V3020_MINUTES, bin2bcd(dt->tm_min)); + v3020_set_reg(chip, V3020_HOURS, bin2bcd(dt->tm_hour)); v3020_set_reg(chip, V3020_MONTH_DAY, bin2bcd(dt->tm_mday)); - v3020_set_reg(chip, V3020_MONTH, bin2bcd(dt->tm_mon + 1)); - v3020_set_reg(chip, V3020_WEEK_DAY, bin2bcd(dt->tm_wday)); - v3020_set_reg(chip, V3020_YEAR, bin2bcd(dt->tm_year % 100)); + v3020_set_reg(chip, V3020_MONTH, bin2bcd(dt->tm_mon + 1)); + v3020_set_reg(chip, V3020_WEEK_DAY, bin2bcd(dt->tm_wday)); + v3020_set_reg(chip, V3020_YEAR, bin2bcd(dt->tm_year % 100)); /* ...and set the clock. */ v3020_set_reg(chip, V3020_CMD_RAM2CLOCK, 0); @@ -320,7 +320,7 @@ static int rtc_probe(struct platform_device *pdev) retval = chip->ops->map_io(chip, pdev, pdata); if (retval) - goto err_chip; + return retval; /* Make sure the v3020 expects a communication cycle * by reading 8 times */ @@ -364,7 +364,7 @@ static int rtc_probe(struct platform_device *pdev) err_io: chip->ops->unmap_io(chip); -err_chip: + return retval; } diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index f91be04b9050..3b5b4fa9a6e9 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -103,7 +103,7 @@ static inline unsigned long read_elapsed_second(void) second_mid = rtc1_read(ETIMEMREG); second_high = rtc1_read(ETIMEHREG); } while (first_low != second_low || first_mid != second_mid || - first_high != second_high); + first_high != second_high); return (first_high << 17) | (first_mid << 1) | (first_low >> 15); } @@ -154,7 +154,7 @@ static int vr41xx_rtc_set_time(struct device *dev, struct rtc_time *time) epoch_sec = mktime(epoch, 1, 1, 0, 0, 0); current_sec = mktime(time->tm_year + 1900, time->tm_mon + 1, time->tm_mday, - time->tm_hour, time->tm_min, time->tm_sec); + time->tm_hour, time->tm_min, time->tm_sec); write_elapsed_second(current_sec - epoch_sec); @@ -186,7 +186,7 @@ static int vr41xx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) struct rtc_time *time = &wkalrm->time; alarm_sec = mktime(time->tm_year + 1900, time->tm_mon + 1, time->tm_mday, - time->tm_hour, time->tm_min, time->tm_sec); + time->tm_hour, time->tm_min, time->tm_sec); spin_lock_irq(&rtc_lock); @@ -334,7 +334,7 @@ static int rtc_probe(struct platform_device *pdev) } retval = request_irq(aie_irq, elapsedtime_interrupt, 0, - "elapsed_time", pdev); + "elapsed_time", pdev); if (retval < 0) goto err_device_unregister; @@ -343,7 +343,7 @@ static int rtc_probe(struct platform_device *pdev) goto err_free_irq; retval = request_irq(pie_irq, rtclong1_interrupt, 0, - "rtclong1", pdev); + "rtclong1", pdev); if (retval < 0) goto err_free_irq; @@ -381,8 +381,6 @@ static int rtc_remove(struct platform_device *pdev) if (rtc) rtc_device_unregister(rtc); - platform_set_drvdata(pdev, NULL); - free_irq(aie_irq, pdev); free_irq(pie_irq, pdev); if (rtc1_base) diff --git a/drivers/rtc/rtc-vt8500.c b/drivers/rtc/rtc-vt8500.c index d89efee6d29e..c2d6331fc712 100644 --- a/drivers/rtc/rtc-vt8500.c +++ b/drivers/rtc/rtc-vt8500.c @@ -282,8 +282,6 @@ static int vt8500_rtc_remove(struct platform_device *pdev) /* Disable alarm matching */ writel(0, vt8500_rtc->regbase + VT8500_RTC_IS); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/rtc/rtc-x1205.c b/drivers/rtc/rtc-x1205.c index fa9b0679fb60..365dc6505148 100644 --- a/drivers/rtc/rtc-x1205.c +++ b/drivers/rtc/rtc-x1205.c @@ -4,7 +4,7 @@ * Copyright 2005 Alessandro Zummo * * please send all reports to: - * Karen Spearel <kas111 at gmail dot com> + * Karen Spearel <kas111 at gmail dot com> * Alessandro Zummo <a.zummo@towertech.it> * * based on a lot of other RTC drivers. @@ -215,12 +215,14 @@ static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm, buf[i] |= 0x80; /* this sequence is required to unlock the chip */ - if ((xfer = i2c_master_send(client, wel, 3)) != 3) { + xfer = i2c_master_send(client, wel, 3); + if (xfer != 3) { dev_err(&client->dev, "%s: wel - %d\n", __func__, xfer); return -EIO; } - if ((xfer = i2c_master_send(client, rwel, 3)) != 3) { + xfer = i2c_master_send(client, rwel, 3); + if (xfer != 3) { dev_err(&client->dev, "%s: rwel - %d\n", __func__, xfer); return -EIO; } @@ -269,7 +271,8 @@ static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm, } /* disable further writes */ - if ((xfer = i2c_master_send(client, diswe, 3)) != 3) { + xfer = i2c_master_send(client, diswe, 3); + if (xfer != 3) { dev_err(&client->dev, "%s: diswe - %d\n", __func__, xfer); return -EIO; } @@ -375,8 +378,7 @@ static int x1205_get_atrim(struct i2c_client *client, int *trim) return 0; } -struct x1205_limit -{ +struct x1205_limit { unsigned char reg, mask, min, max; }; @@ -430,7 +432,8 @@ static int x1205_validate_client(struct i2c_client *client) }, }; - if ((xfer = i2c_transfer(client->adapter, msgs, 2)) != 2) { + xfer = i2c_transfer(client->adapter, msgs, 2); + if (xfer != 2) { dev_err(&client->dev, "%s: could not read register %x\n", __func__, probe_zero_pattern[i]); @@ -467,7 +470,8 @@ static int x1205_validate_client(struct i2c_client *client) }, }; - if ((xfer = i2c_transfer(client->adapter, msgs, 2)) != 2) { + xfer = i2c_transfer(client->adapter, msgs, 2); + if (xfer != 2) { dev_err(&client->dev, "%s: could not read register %x\n", __func__, probe_limits_pattern[i].reg); @@ -548,10 +552,12 @@ static int x1205_rtc_proc(struct device *dev, struct seq_file *seq) { int err, dtrim, atrim; - if ((err = x1205_get_dtrim(to_i2c_client(dev), &dtrim)) == 0) + err = x1205_get_dtrim(to_i2c_client(dev), &dtrim); + if (!err) seq_printf(seq, "digital_trim\t: %d ppm\n", dtrim); - if ((err = x1205_get_atrim(to_i2c_client(dev), &atrim)) == 0) + err = x1205_get_atrim(to_i2c_client(dev), &atrim); + if (!err) seq_printf(seq, "analog_trim\t: %d.%02d pF\n", atrim / 1000, atrim % 1000); return 0; @@ -639,7 +645,8 @@ static int x1205_probe(struct i2c_client *client, i2c_set_clientdata(client, rtc); /* Check for power failures and eventually enable the osc */ - if ((err = x1205_get_status(client, &sr)) == 0) { + err = x1205_get_status(client, &sr); + if (!err) { if (sr & X1205_SR_RTCF) { dev_err(&client->dev, "power failure detected, " @@ -647,9 +654,9 @@ static int x1205_probe(struct i2c_client *client, udelay(50); x1205_fix_osc(client); } - } - else + } else { dev_err(&client->dev, "couldn't read status\n"); + } err = x1205_sysfs_register(&client->dev); if (err) diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index aa1620abec6d..fc6012c213a5 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -271,7 +271,8 @@ static void iblock_complete_cmd(struct se_cmd *cmd) kfree(ibr); } -static void iblock_bio_done(struct bio *bio, int err) +static void iblock_bio_done(struct bio *bio, int err, + struct batch_complete *batch) { struct se_cmd *cmd = bio->bi_private; struct iblock_req *ibr = cmd->priv; @@ -335,7 +336,8 @@ static void iblock_submit_bios(struct bio_list *list, int rw) blk_finish_plug(&plug); } -static void iblock_end_io_flush(struct bio *bio, int err) +static void iblock_end_io_flush(struct bio *bio, int err, + struct batch_complete *batch) { struct se_cmd *cmd = bio->bi_private; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index e992b27aa090..1e9873179860 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -835,7 +835,8 @@ static ssize_t pscsi_show_configfs_dev_params(struct se_device *dev, char *b) return bl; } -static void pscsi_bi_endio(struct bio *bio, int error) +static void pscsi_bi_endio(struct bio *bio, int error, + struct batch_complete *batch) { bio_put(bio); } diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index b645c47501b4..3b96f18593b3 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -677,7 +677,7 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) if (mi < 0) return -EINVAL; - requested_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + requested_pages = vma_pages(vma); actual_pages = ((idev->info->mem[mi].addr & ~PAGE_MASK) + idev->info->mem[mi].size + PAGE_SIZE -1) >> PAGE_SHIFT; if (requested_pages > actual_pages) diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index f52dcfe8f545..24bd363ca351 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -3099,7 +3099,7 @@ static int init_dma_pools(struct udc *dev) } /* DMA setup */ - dev->data_requests = dma_pool_create("data_requests", NULL, + dev->data_requests = dma_pool_create("data_requests", &dev->pdev->dev, sizeof(struct udc_data_dma), 0, 0); if (!dev->data_requests) { DBG(dev, "can't get request data pool\n"); @@ -3111,7 +3111,7 @@ static int init_dma_pools(struct udc *dev) dev->ep[UDC_EP0IN_IX].dma = &dev->regs->ctl; /* dma desc for setup data */ - dev->stp_requests = dma_pool_create("setup requests", NULL, + dev->stp_requests = dma_pool_create("setup requests", &dev->pdev->dev, sizeof(struct udc_stp_dma), 0, 0); if (!dev->stp_requests) { DBG(dev, "can't get stp request pool\n"); diff --git a/drivers/video/backlight/atmel-pwm-bl.c b/drivers/video/backlight/atmel-pwm-bl.c index a60d6afca97c..0393d827dd44 100644 --- a/drivers/video/backlight/atmel-pwm-bl.c +++ b/drivers/video/backlight/atmel-pwm-bl.c @@ -195,7 +195,6 @@ static int __init atmel_pwm_bl_probe(struct platform_device *pdev) return 0; err_free_bl_dev: - platform_set_drvdata(pdev, NULL); backlight_device_unregister(bldev); err_free_pwm: pwm_channel_free(&pwmbl->pwmc); @@ -212,7 +211,6 @@ static int __exit atmel_pwm_bl_remove(struct platform_device *pdev) pwm_channel_disable(&pwmbl->pwmc); pwm_channel_free(&pwmbl->pwmc); backlight_device_unregister(pwmbl->bldev); - platform_set_drvdata(pdev, NULL); return 0; } diff --git a/drivers/video/backlight/ep93xx_bl.c b/drivers/video/backlight/ep93xx_bl.c index 33455821dd31..018368ba4124 100644 --- a/drivers/video/backlight/ep93xx_bl.c +++ b/drivers/video/backlight/ep93xx_bl.c @@ -111,7 +111,6 @@ static int ep93xxbl_remove(struct platform_device *dev) struct backlight_device *bl = platform_get_drvdata(dev); backlight_device_unregister(bl); - platform_set_drvdata(dev, NULL); return 0; } diff --git a/drivers/video/backlight/lp8788_bl.c b/drivers/video/backlight/lp8788_bl.c index 4bb8b4f140cf..980855ec9bb1 100644 --- a/drivers/video/backlight/lp8788_bl.c +++ b/drivers/video/backlight/lp8788_bl.c @@ -312,7 +312,6 @@ static int lp8788_backlight_remove(struct platform_device *pdev) backlight_update_status(bl_dev); sysfs_remove_group(&pdev->dev.kobj, &lp8788_attr_group); lp8788_backlight_unregister(bl); - platform_set_drvdata(pdev, NULL); return 0; } diff --git a/drivers/video/backlight/pcf50633-backlight.c b/drivers/video/backlight/pcf50633-backlight.c index e87c7a3394f3..6ed76be18f19 100644 --- a/drivers/video/backlight/pcf50633-backlight.c +++ b/drivers/video/backlight/pcf50633-backlight.c @@ -153,8 +153,6 @@ static int pcf50633_bl_remove(struct platform_device *pdev) backlight_device_unregister(pcf_bl->bl); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c index 57886787ead0..e78d9f2233b8 100644 --- a/drivers/video/cyber2000fb.c +++ b/drivers/video/cyber2000fb.c @@ -518,6 +518,9 @@ static void cyber2000fb_set_timing(struct cfb_info *cfb, struct par_info *hw) cyber2000_grphw(0xb9, 0x00, cfb); spin_unlock(&cfb->reg_b0_lock); + /* wait (for the PLL?) to avoid palette corruption at higher clocks */ + msleep(1000); + cfb->ramdac_ctrl = hw->ramdac; cyber2000fb_write_ramdac_ctrl(cfb); diff --git a/drivers/video/imxfb.c b/drivers/video/imxfb.c index 0abf2bf20836..0a16d82d8384 100644 --- a/drivers/video/imxfb.c +++ b/drivers/video/imxfb.c @@ -960,7 +960,7 @@ static int imxfb_remove(struct platform_device *pdev) return 0; } -void imxfb_shutdown(struct platform_device * dev) +static void imxfb_shutdown(struct platform_device *dev) { struct fb_info *info = platform_get_drvdata(dev); struct imxfb_info *fbi = info->par; @@ -999,7 +999,7 @@ static int imxfb_setup(void) return 0; } -int __init imxfb_init(void) +static int __init imxfb_init(void) { int ret = imxfb_setup(); diff --git a/drivers/video/smscufx.c b/drivers/video/smscufx.c index b2b33fc1ac3f..e188ada2ffd1 100644 --- a/drivers/video/smscufx.c +++ b/drivers/video/smscufx.c @@ -1622,7 +1622,7 @@ static int ufx_usb_probe(struct usb_interface *interface, { struct usb_device *usbdev; struct ufx_data *dev; - struct fb_info *info = 0; + struct fb_info *info = NULL; int retval = -ENOMEM; u32 id_rev, fpga_rev; diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index ec03e726c940..d2e5bc3cf969 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c @@ -434,10 +434,10 @@ static void dlfb_compress_hline( while ((pixel_end > pixel) && (cmd_buffer_end - MIN_RLX_CMD_BYTES > cmd)) { - uint8_t *raw_pixels_count_byte = 0; - uint8_t *cmd_pixels_count_byte = 0; - const uint16_t *raw_pixel_start = 0; - const uint16_t *cmd_pixel_start, *cmd_pixel_end = 0; + uint8_t *raw_pixels_count_byte = NULL; + uint8_t *cmd_pixels_count_byte = NULL; + const uint16_t *raw_pixel_start = NULL; + const uint16_t *cmd_pixel_start, *cmd_pixel_end = NULL; prefetchw((void *) cmd); /* pull in one cache line at least */ @@ -573,7 +573,7 @@ static int dlfb_render_hline(struct dlfb_data *dev, struct urb **urb_ptr, return 0; } -int dlfb_handle_damage(struct dlfb_data *dev, int x, int y, +static int dlfb_handle_damage(struct dlfb_data *dev, int x, int y, int width, int height, char *data) { int i, ret; @@ -1588,7 +1588,7 @@ static int dlfb_usb_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *usbdev; - struct dlfb_data *dev = 0; + struct dlfb_data *dev = NULL; int retval = -ENOMEM; /* usb initialization */ diff --git a/drivers/w1/slaves/w1_ds2408.c b/drivers/w1/slaves/w1_ds2408.c index e45eca1044bd..dbe7e45dc38f 100644 --- a/drivers/w1/slaves/w1_ds2408.c +++ b/drivers/w1/slaves/w1_ds2408.c @@ -301,7 +301,33 @@ error: return -EIO; } +/* + * This is a special sequence we must do to ensure the P0 output is not stuck + * in test mode. This is described in rev 2 of the ds2408's datasheet + * (http://datasheets.maximintegrated.com/en/ds/DS2408.pdf) under + * "APPLICATION INFORMATION/Power-up timing". + */ +static int w1_f29_disable_test_mode(struct w1_slave *sl) +{ + int res; + u8 magic[10] = {0x96, }; + u64 rn = le64_to_cpu(*((u64*)&sl->reg_num)); + + memcpy(&magic[1], &rn, 8); + magic[9] = 0x3C; + + mutex_lock(&sl->master->bus_mutex); + res = w1_reset_bus(sl->master); + if (res) + goto out; + w1_write_block(sl->master, magic, ARRAY_SIZE(magic)); + + res = w1_reset_bus(sl->master); +out: + mutex_unlock(&sl->master->bus_mutex); + return res; +} static struct bin_attribute w1_f29_sysfs_bin_files[] = { { @@ -362,6 +388,10 @@ static int w1_f29_add_slave(struct w1_slave *sl) int err = 0; int i; + err = w1_f29_disable_test_mode(sl); + if (err) + return err; + for (i = 0; i < ARRAY_SIZE(w1_f29_sysfs_bin_files) && !err; ++i) err = sysfs_create_bin_file( &sl->dev.kobj, @@ -25,7 +25,9 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/bio.h> #include <linux/mmu_context.h> +#include <linux/percpu.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> @@ -35,6 +37,8 @@ #include <linux/eventfd.h> #include <linux/blkdev.h> #include <linux/compat.h> +#include <linux/percpu-refcount.h> +#include <linux/radix-tree.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> @@ -59,14 +63,22 @@ struct aio_ring { #define AIO_RING_PAGES 8 +struct kioctx_cpu { + unsigned reqs_available; +}; + struct kioctx { - atomic_t users; - atomic_t dead; + struct percpu_ref users; - /* This needs improving */ unsigned long user_id; - struct hlist_node list; + struct __percpu kioctx_cpu *cpu; + + /* + * For percpu reqs_available, number of slots we move to/from global + * counter at a time: + */ + unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. @@ -89,7 +101,15 @@ struct kioctx { struct work_struct rcu_work; struct { - atomic_t reqs_active; + /* + * This counts the number of available slots in the ringbuffer, + * so we avoid overflowing it: it's decremented (if positive) + * when allocating a kiocb and incremented when the resulting + * io_event is pulled off the ringbuffer. + * + * We batch accesses to it with a percpu version. + */ + atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { @@ -100,11 +120,23 @@ struct kioctx { struct { struct mutex ring_lock; wait_queue_head_t wait; + + /* + * Copy of the real tail - to reduce cacheline bouncing. Updated + * by aio_complete() whenever it updates the real tail. + */ + unsigned shadow_tail; } ____cacheline_aligned_in_smp; struct { + /* + * This is the canonical copy of the tail pointer, updated by + * aio_complete(). But aio_complete() also uses it as a lock, so + * other code can't use it; aio_complete() keeps shadow_tail in + * sync with the real value of the tail pointer for other code + * to use. + */ unsigned tail; - spinlock_t completion_lock; } ____cacheline_aligned_in_smp; struct page *internal_pages[AIO_RING_PAGES]; @@ -275,6 +307,8 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + + free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } @@ -288,7 +322,7 @@ static void free_ioctx(struct kioctx *ctx) struct aio_ring *ring; struct io_event res; struct kiocb *req; - unsigned head, avail; + unsigned cpu, head, avail; spin_lock_irq(&ctx->ctx_lock); @@ -302,23 +336,31 @@ static void free_ioctx(struct kioctx *ctx) spin_unlock_irq(&ctx->ctx_lock); + for_each_possible_cpu(cpu) { + struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); + + atomic_add(kcpu->reqs_available, &ctx->reqs_available); + kcpu->reqs_available = 0; + } + ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; kunmap_atomic(ring); - while (atomic_read(&ctx->reqs_active) > 0) { + while (atomic_read(&ctx->reqs_available) < ctx->nr_events - 1) { wait_event(ctx->wait, - head != ctx->tail || - atomic_read(&ctx->reqs_active) <= 0); + (head != ctx->shadow_tail) || + (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)); - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + avail = (head <= ctx->shadow_tail + ? ctx->shadow_tail : ctx->nr_events) - head; - atomic_sub(avail, &ctx->reqs_active); + atomic_add(avail, &ctx->reqs_available); head += avail; head %= ctx->nr_events; } - WARN_ON(atomic_read(&ctx->reqs_active) < 0); + WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); aio_free_ring(ctx); @@ -341,7 +383,7 @@ static void free_ioctx(struct kioctx *ctx) static void put_ioctx(struct kioctx *ctx) { - if (unlikely(atomic_dec_and_test(&ctx->users))) + if (percpu_ref_put(&ctx->users)) free_ioctx(ctx); } @@ -354,6 +396,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) struct kioctx *ctx; int err = -ENOMEM; + /* + * We keep track of the number of available ringbuffer slots, to prevent + * overflow (reqs_available), and we also use percpu counters for this. + * + * So since up to half the slots might be on other cpu's percpu counters + * and unavailable, double nr_events so userspace sees what they + * expected: additionally, we move req_batch slots to/from percpu + * counters at a time, so make sure that isn't 0: + */ + nr_events = max(nr_events, num_possible_cpus() * 4); + nr_events *= 2; + /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || (nr_events > (0x10000000U / sizeof(struct kiocb)))) { @@ -370,18 +424,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - atomic_set(&ctx->users, 2); - atomic_set(&ctx->dead, 0); + percpu_ref_init(&ctx->users); + rcu_read_lock(); + percpu_ref_get(&ctx->users); + rcu_read_unlock(); + spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); - if (aio_setup_ring(ctx) < 0) + ctx->cpu = alloc_percpu(struct kioctx_cpu); + if (!ctx->cpu) goto out_freectx; + if (aio_setup_ring(ctx) < 0) + goto out_freepcpu; + + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); + BUG_ON(!ctx->req_batch); + /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > aio_max_nr || @@ -392,10 +456,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); - /* now link into global list. */ + /* now insert into the radix tree */ + err = radix_tree_preload(GFP_KERNEL); + if (err) + goto out_cleanup; spin_lock(&mm->ioctx_lock); - hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); + err = radix_tree_insert(&mm->ioctx_rtree, ctx->user_id, ctx); spin_unlock(&mm->ioctx_lock); + radix_tree_preload_end(); + if (err) { + WARN_ONCE(1, "aio: insert into ioctx tree failed: %d", err); + goto out_cleanup; + } pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); @@ -404,6 +476,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) out_cleanup: err = -EAGAIN; aio_free_ring(ctx); +out_freepcpu: + free_percpu(ctx->cpu); out_freectx: kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); @@ -433,9 +507,9 @@ static void kill_ioctx_rcu(struct rcu_head *head) */ static void kill_ioctx(struct kioctx *ctx) { - if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); - /* Between hlist_del_rcu() and dropping the initial ref */ + if (percpu_ref_kill(&ctx->users)) { + radix_tree_delete(¤t->mm->ioctx_rtree, ctx->user_id); + /* Between radix_tree_delete() and dropping the initial ref */ synchronize_rcu(); /* @@ -475,31 +549,84 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { - struct kioctx *ctx; - struct hlist_node *n; - - hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { - if (1 != atomic_read(&ctx->users)) - printk(KERN_DEBUG - "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), - atomic_read(&ctx->dead), - atomic_read(&ctx->reqs_active)); - /* - * We don't need to bother with munmap() here - - * exit_mmap(mm) is coming and it'll unmap everything. - * Since aio_free_ring() uses non-zero ->mmap_size - * as indicator that it needs to unmap the area, - * just set it to 0; aio_free_ring() is the only - * place that uses ->mmap_size, so it's safe. - */ - ctx->mmap_size = 0; + struct kioctx *ctx[16]; + unsigned long idx = 0; + int count; - if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + do { + int i; + + count = radix_tree_gang_lookup(&mm->ioctx_rtree, (void **)ctx, + idx, sizeof(ctx)/sizeof(void *)); + for (i = 0; i < count; i++) { + void *ret; + + BUG_ON(ctx[i]->user_id < idx); + idx = ctx[i]->user_id; + + /* + * We don't need to bother with munmap() here - + * exit_mmap(mm) is coming and it'll unmap everything. + * Since aio_free_ring() uses non-zero ->mmap_size + * as indicator that it needs to unmap the area, + * just set it to 0; aio_free_ring() is the only + * place that uses ->mmap_size, so it's safe. + */ + ctx[i]->mmap_size = 0; + + if (percpu_ref_kill(&ctx[i]->users)) { + ret = radix_tree_delete(&mm->ioctx_rtree, idx); + BUG_ON(!ret || ret != ctx[i]); + call_rcu(&ctx[i]->rcu_head, kill_ioctx_rcu); + } } + } while (count); +} + +static void put_reqs_available(struct kioctx *ctx, unsigned nr) +{ + struct kioctx_cpu *kcpu; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { + kcpu->reqs_available -= ctx->req_batch; + atomic_add(ctx->req_batch, &ctx->reqs_available); } + + preempt_enable(); +} + +static bool get_reqs_available(struct kioctx *ctx) +{ + struct kioctx_cpu *kcpu; + bool ret = false; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + if (!kcpu->reqs_available) { + int old, avail = atomic_read(&ctx->reqs_available); + + do { + if (avail < ctx->req_batch) + goto out; + + old = avail; + avail = atomic_cmpxchg(&ctx->reqs_available, + avail, avail - ctx->req_batch); + } while (avail != old); + + kcpu->reqs_available += ctx->req_batch; + } + + ret = true; + kcpu->reqs_available--; +out: + preempt_enable(); + return ret; } /* aio_get_req @@ -516,22 +643,18 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) + if (!get_reqs_available(ctx)) return NULL; - if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) - goto out_put; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) goto out_put; atomic_set(&req->ki_users, 2); req->ki_ctx = ctx; - return req; out_put: - atomic_dec(&ctx->reqs_active); + put_reqs_available(ctx, 1); return NULL; } @@ -562,78 +685,21 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); - hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id) { - atomic_inc(&ctx->users); - ret = ctx; - break; - } + ctx = radix_tree_lookup(&mm->ioctx_rtree, ctx_id); + if (ctx) { + percpu_ref_get(&ctx->users); + ret = ctx; } rcu_read_unlock(); return ret; } -/* aio_complete - * Called when the io request on the given iocb is complete. - */ -void aio_complete(struct kiocb *iocb, long res, long res2) +static inline unsigned kioctx_ring_put(struct kioctx *ctx, struct kiocb *req, + unsigned tail) { - struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring *ring; struct io_event *ev_page, *event; - unsigned long flags; - unsigned tail, pos; - - /* - * Special case handling for sync iocbs: - * - events go directly into the iocb for fast handling - * - the sync task with the iocb in its stack holds the single iocb - * ref, no other paths have a way to get another ref - * - the sync task helpfully left a reference to itself in the iocb - */ - if (is_sync_kiocb(iocb)) { - BUG_ON(atomic_read(&iocb->ki_users) != 1); - iocb->ki_user_data = res; - atomic_set(&iocb->ki_users, 0); - wake_up_process(iocb->ki_obj.tsk); - return; - } - - /* - * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after decrementing reqs_active. - */ - rcu_read_lock(); - - if (iocb->ki_list.next) { - unsigned long flags; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - list_del(&iocb->ki_list); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - } - - /* - * cancelled requests don't get events, userland was given one - * when the event got cancelled. - */ - if (unlikely(xchg(&iocb->ki_cancel, - KIOCB_CANCELLED) == KIOCB_CANCELLED)) { - atomic_dec(&ctx->reqs_active); - /* Still need the wake_up in case free_ioctx is waiting */ - goto put_rq; - } - - /* - * Add a completion event to the ring buffer. Must be done holding - * ctx->ctx_lock to prevent other code from messing with the tail - * pointer since we might be called from irq context. - */ - spin_lock_irqsave(&ctx->completion_lock, flags); - - tail = ctx->tail; - pos = tail + AIO_EVENTS_OFFSET; + unsigned pos = tail + AIO_EVENTS_OFFSET; if (++tail >= ctx->nr_events) tail = 0; @@ -641,60 +707,195 @@ void aio_complete(struct kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - event->obj = (u64)(unsigned long)iocb->ki_obj.user; - event->data = iocb->ki_user_data; - event->res = res; - event->res2 = res2; + event->obj = (u64)(unsigned long)req->ki_obj.user; + event->data = req->ki_user_data; + event->res = req->ki_res; + event->res2 = req->ki_res2; kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, - res, res2); + ctx, tail, req, req->ki_obj.user, req->ki_user_data, + req->ki_res, req->ki_res2); + + return tail; +} + +static inline unsigned kioctx_ring_lock(struct kioctx *ctx) +{ + unsigned tail; - /* after flagging the request as done, we - * must never even look at it again + /* + * ctx->tail is both our lock and the canonical version of the tail + * pointer. */ - smp_wmb(); /* make event visible before updating tail */ + while ((tail = xchg(&ctx->tail, UINT_MAX)) == UINT_MAX) + cpu_relax(); - ctx->tail = tail; + return tail; +} + +static inline void kioctx_ring_unlock(struct kioctx *ctx, unsigned tail) +{ + struct aio_ring *ring; + + if (!ctx) + return; + + smp_wmb(); + /* make event visible before updating tail */ + + ctx->shadow_tail = tail; ring = kmap_atomic(ctx->ring_pages[0]); ring->tail = tail; kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + /* unlock, make new tail visible before checking waitlist */ + smp_mb(); + + ctx->tail = tail; + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +void batch_complete_aio(struct batch_complete *batch) +{ + struct kioctx *ctx = NULL; + struct eventfd_ctx *eventfd = NULL; + struct rb_node *n; + unsigned long flags; + unsigned tail = 0; + + if (RB_EMPTY_ROOT(&batch->kiocb)) + return; + + /* + * Take rcu_read_lock() in case the kioctx is being destroyed, as we + * need to issue a wakeup after incrementing reqs_available. + */ + rcu_read_lock(); + local_irq_save(flags); + + n = rb_first(&batch->kiocb); + while (n) { + struct kiocb *req = container_of(n, struct kiocb, ki_node); + + if (n->rb_right) { + n->rb_right->__rb_parent_color = n->__rb_parent_color; + n = n->rb_right; + + while (n->rb_left) + n = n->rb_left; + } else { + n = rb_parent(n); + } + + if (unlikely(req->ki_eventfd != eventfd)) { + if (eventfd) { + /* Make event visible */ + kioctx_ring_unlock(ctx, tail); + ctx = NULL; + + eventfd_signal(eventfd, 1); + eventfd_ctx_put(eventfd); + } + + eventfd = req->ki_eventfd; + req->ki_eventfd = NULL; + } + + if (unlikely(req->ki_ctx != ctx)) { + kioctx_ring_unlock(ctx, tail); + + ctx = req->ki_ctx; + tail = kioctx_ring_lock(ctx); + } + + tail = kioctx_ring_put(ctx, req, tail); + aio_put_req(req); + } - pr_debug("added to ring %p at [%u]\n", iocb, tail); + kioctx_ring_unlock(ctx, tail); + local_irq_restore(flags); + rcu_read_unlock(); /* * Check if the user asked us to deliver the result through an * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (iocb->ki_eventfd != NULL) - eventfd_signal(iocb->ki_eventfd, 1); + if (eventfd) { + eventfd_signal(eventfd, 1); + eventfd_ctx_put(eventfd); + } +} +EXPORT_SYMBOL(batch_complete_aio); -put_rq: - /* everything turned out well, dispose of the aiocb. */ - aio_put_req(iocb); +/* aio_complete_batch + * Called when the io request on the given iocb is complete; @batch may be + * NULL. + */ +void aio_complete_batch(struct kiocb *req, long res, long res2, + struct batch_complete *batch) +{ + req->ki_res = res; + req->ki_res2 = res2; + + if (req->ki_list.next) { + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&req->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + } /* - * We have to order our ring_info tail store above and test - * of the wait list below outside the wait lock. This is - * like in wake_up_bit() where clearing a bit has to be - * ordered with the unlocked test. + * Special case handling for sync iocbs: + * - events go directly into the iocb for fast handling + * - the sync task with the iocb in its stack holds the single iocb + * ref, no other paths have a way to get another ref + * - the sync task helpfully left a reference to itself in the iocb */ - smp_mb(); + if (is_sync_kiocb(req)) { + BUG_ON(atomic_read(&req->ki_users) != 1); + req->ki_user_data = req->ki_res; + atomic_set(&req->ki_users, 0); + wake_up_process(req->ki_obj.tsk); + } else if (batch) { + int res; + struct kiocb *t; + struct rb_node **n = &batch->kiocb.rb_node, *parent = NULL; + + while (*n) { + parent = *n; + t = container_of(*n, struct kiocb, ki_node); + + res = req->ki_ctx != t->ki_ctx + ? req->ki_ctx < t->ki_ctx + : req->ki_eventfd != t->ki_eventfd + ? req->ki_eventfd < t->ki_eventfd + : req < t; + + n = res ? &(*n)->rb_left : &(*n)->rb_right; + } - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + rb_link_node(&req->ki_node, parent, n); + rb_insert_color(&req->ki_node, &batch->kiocb); + } else { + struct batch_complete batch_stack; - rcu_read_unlock(); + memset(&req->ki_node, 0, sizeof(req->ki_node)); + batch_stack.kiocb.rb_node = &req->ki_node; + + batch_complete_aio(&batch_stack); + } } -EXPORT_SYMBOL(aio_complete); +EXPORT_SYMBOL(aio_complete_batch); /* aio_read_events * Pull an event off of the ioctx's event ring. Returns the number of @@ -714,9 +915,9 @@ static long aio_read_events_ring(struct kioctx *ctx, head = ring->head; kunmap_atomic(ring); - pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); + pr_debug("h%u t%u m%u\n", head, ctx->shadow_tail, ctx->nr_events); - if (head == ctx->tail) + if (head == ctx->shadow_tail) goto out; while (ret < nr) { @@ -724,8 +925,9 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event *ev; struct page *page; - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; - if (head == ctx->tail) + avail = (head <= ctx->shadow_tail ? + ctx->shadow_tail : ctx->nr_events) - head; + if (head == ctx->shadow_tail) break; avail = min(avail, nr - ret); @@ -756,9 +958,9 @@ static long aio_read_events_ring(struct kioctx *ctx, kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); - pr_debug("%li h%u t%u\n", ret, head, ctx->tail); + pr_debug("%li h%u t%u\n", ret, head, ctx->shadow_tail); - atomic_sub(ret, &ctx->reqs_active); + put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); @@ -773,7 +975,7 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, if (ret > 0) *i += ret; - if (unlikely(atomic_read(&ctx->dead))) + if (unlikely(percpu_ref_dead(&ctx->users))) ret = -EINVAL; if (!*i) @@ -1142,7 +1344,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: - atomic_dec(&ctx->reqs_active); + put_reqs_available(ctx, 1); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 3f1128b37e46..16d3288c808d 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -104,7 +104,7 @@ struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; - pid_t oz_pgrp; + struct pid *oz_pgrp; int catatonic; int version; int sub_version; @@ -139,7 +139,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 743c7c2c949d..91838211b66d 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, { int pipefd; int err = 0; + struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; @@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { - struct file *pipe = fget(pipefd); + struct file *pipe; + + new_pid = get_task_pid(current, PIDTYPE_PGID); + + if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { + AUTOFS_WARN("Not allowed to change PID namespace"); + err = -EINVAL; + goto out; + } + + pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; @@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, fput(pipe); goto out; } - sbi->oz_pgrp = task_pgrp_nr(current); + swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; } out: + put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index b104726e2d0a..1b045ecfcea2 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -62,6 +62,8 @@ void autofs4_kill_sb(struct super_block *sb) /* Free wait queues, close pipe */ autofs4_catatonic_mode(sbi); + put_pid(sbi->oz_pgrp); + sb->s_fs_info = NULL; kfree(sbi); @@ -85,7 +87,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); - seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); @@ -129,7 +131,8 @@ static const match_table_t tokens = { }; static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, - pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) + int *pgrp, bool *pgrp_set, unsigned int *type, + int *minproto, int *maxproto) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -137,7 +140,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, *uid = current_uid(); *gid = current_gid(); - *pgrp = task_pgrp_nr(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -176,6 +178,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, if (match_int(args, &option)) return 1; *pgrp = option; + *pgrp_set = true; break; case Opt_minproto: if (match_int(args, &option)) @@ -211,6 +214,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + int pgrp; + bool pgrp_set = false; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -223,7 +228,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = task_pgrp_nr(current); + sbi->oz_pgrp = NULL; sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; @@ -260,12 +265,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Can this call block? */ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, - &sbi->max_proto)) { + &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } + if (pgrp_set) { + sbi->oz_pgrp = find_get_pid(pgrp); + if (!sbi->oz_pgrp) { + pr_warn("autofs: could not find process group %d\n", + pgrp); + goto fail_dput; + } + } else { + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + } + if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); @@ -289,9 +305,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); - + if (!pipe) { printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; @@ -321,6 +337,7 @@ fail_dput: fail_ino: kfree(ino); fail_free: + put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; fail_unlock: diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 3db70dae40d3..309ca6bcbb09 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -353,11 +353,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, struct qstr qstr; char *name; int status, ret, type; + pid_t pid; + pid_t tgid; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) return -ENOENT; + /* + * Try translating pids to the namespace of the daemon. + * + * Zero means failure: we are in an unrelated pid namespace. + */ + pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + if (pid == 0 || tgid == 0) + return -ENOENT; + if (!dentry->d_inode) { /* * A wait for a negative dentry is invalid for certain @@ -423,8 +435,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->ino = autofs4_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); - wq->pid = current->pid; - wq->tgid = current->tgid; + wq->pid = pid; + wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; mutex_unlock(&sbi->wq_mutex); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index bce87694f7b0..89dec7f789a4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8a0b0efda44..53b6d624c7a9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -140,6 +140,25 @@ static int padzero(unsigned long elf_bss) #define ELF_BASE_PLATFORM NULL #endif +/* + * Use get_random_int() to implement AT_RANDOM while avoiding depletion + * of the entropy pool. + */ +static void get_atrandom_bytes(unsigned char *buf, size_t nbytes) +{ + unsigned char *p = buf; + + while (nbytes) { + unsigned int random_variable; + size_t chunk = min(nbytes, sizeof(random_variable)); + + random_variable = get_random_int(); + memcpy(p, &random_variable, chunk); + p += chunk; + nbytes -= chunk; + } +} + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -201,7 +220,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, /* * Generate 16 random bytes for userspace PRNG seeding. */ - get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes)); u_rand_bytes = (elf_addr_t __user *) STACK_ALLOC(p, sizeof(k_rand_bytes)); if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) @@ -738,8 +757,6 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Do this so that we can load the interpreter, if need be. We will change some of these later */ - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) { diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 8fb42916d8a2..69f6f802b09e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -510,7 +510,8 @@ static void bio_integrity_verify_fn(struct work_struct *work) * in process context. This function postpones completion * accordingly. */ -void bio_integrity_endio(struct bio *bio, int error) +void bio_integrity_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct bio_integrity_payload *bip = bio->bi_integrity; @@ -28,6 +28,7 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> +#include <linux/aio.h> #include <scsi/sg.h> /* for struct sg_iovec */ #include <trace/events/block.h> @@ -760,7 +761,8 @@ struct submit_bio_ret { int error; }; -static void submit_bio_wait_endio(struct bio *bio, int error) +static void submit_bio_wait_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct submit_bio_ret *ret = bio->bi_private; @@ -1414,7 +1416,8 @@ void bio_unmap_user(struct bio *bio) } EXPORT_SYMBOL(bio_unmap_user); -static void bio_map_kern_endio(struct bio *bio, int err) +static void bio_map_kern_endio(struct bio *bio, int err, + struct batch_complete *batch) { bio_put(bio); } @@ -1486,7 +1489,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, } EXPORT_SYMBOL(bio_map_kern); -static void bio_copy_kern_endio(struct bio *bio, int err) +static void bio_copy_kern_endio(struct bio *bio, int err, + struct batch_complete *batch) { struct bio_vec *bvec; const int read = bio_data_dir(bio) == READ; @@ -1685,31 +1689,40 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif -/** - * bio_endio - end I/O on a bio - * @bio: bio - * @error: error, if any - * - * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. - **/ -void bio_endio(struct bio *bio, int error) +static inline void __bio_endio(struct bio *bio, struct batch_complete *batch) { - if (error) + if (bio->bi_error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + bio->bi_error = -EIO; if (bio->bi_end_io) - bio->bi_end_io(bio, error); + bio->bi_end_io(bio, bio->bi_error, batch); +} + +void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch) +{ + if (error) + bio->bi_error = error; + + if (batch) + bio_list_add(&batch->bio, bio); + else + __bio_endio(bio, batch); + +} +EXPORT_SYMBOL(bio_endio_batch); + +void batch_complete(struct batch_complete *batch) +{ + struct bio *bio; + + while ((bio = bio_list_pop(&batch->bio))) + __bio_endio(bio, batch); + + batch_complete_aio(batch); } -EXPORT_SYMBOL(bio_endio); +EXPORT_SYMBOL(batch_complete); void bio_pair_release(struct bio_pair *bp) { @@ -1722,7 +1735,8 @@ void bio_pair_release(struct bio_pair *bp) } EXPORT_SYMBOL(bio_pair_release); -static void bio_pair_end_1(struct bio *bi, int err) +static void bio_pair_end_1(struct bio *bi, int err, + struct batch_complete *batch) { struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); @@ -1732,7 +1746,8 @@ static void bio_pair_end_1(struct bio *bi, int err) bio_pair_release(bp); } -static void bio_pair_end_2(struct bio *bi, int err) +static void bio_pair_end_2(struct bio *bi, int err, + struct batch_complete *batch) { struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 1431a6965017..29b35350b01d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -323,7 +323,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); -static void btrfsic_complete_bio_end_io(struct bio *bio, int err); +static void btrfsic_complete_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch); static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, @@ -336,7 +337,8 @@ static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status, + struct batch_complete *batch); static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, const struct btrfsic_block *block, @@ -1751,7 +1753,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, return block_ctx->len; } -static void btrfsic_complete_bio_end_io(struct bio *bio, int err) +static void btrfsic_complete_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } @@ -2294,7 +2297,8 @@ continue_loop: goto again; } -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status, + struct batch_complete *batch) { struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; int iodone_w_error; @@ -2342,7 +2346,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) block = next_block; } while (NULL != block); - bp->bi_end_io(bp, bio_error_status); + bp->bi_end_io(bp, bio_error_status, batch); } static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b189bd1e7a3e..2298567b48e2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -156,7 +156,8 @@ fail: * The compressed pages are freed here, and it must be run * in process context */ -static void end_compressed_bio_read(struct bio *bio, int err) +static void end_compressed_bio_read(struct bio *bio, int err, + struct batch_complete *batch) { struct compressed_bio *cb = bio->bi_private; struct inode *inode; @@ -266,7 +267,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start, * This also calls the writeback end hooks for the file pages so that * metadata and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct bio *bio, int err) +static void end_compressed_bio_write(struct bio *bio, int err, + struct batch_complete *batch) { struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 40c7bc300075..364ce4fbc3cf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -685,7 +685,8 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) return -EIO; /* we fixed nothing */ } -static void end_workqueue_bio(struct bio *bio, int err) +static void end_workqueue_bio(struct bio *bio, int err, + struct batch_complete *batch) { struct end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; @@ -3075,7 +3076,8 @@ static int write_dev_supers(struct btrfs_device *device, * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done */ -static void btrfs_end_empty_barrier(struct bio *bio, int err) +static void btrfs_end_empty_barrier(struct bio *bio, int err, + struct batch_complete *batch) { if (err) { if (err == -EOPNOTSUPP) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6bca9472f313..94258a1407c1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2003,7 +2003,8 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, return err; } -static void repair_io_failure_callback(struct bio *bio, int err) +static void repair_io_failure_callback(struct bio *bio, int err, + struct batch_complete *batch) { complete(bio->bi_private); } @@ -2383,7 +2384,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio, int err) +static void end_bio_extent_writepage(struct bio *bio, int err, + struct batch_complete *batch) { struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct extent_io_tree *tree; @@ -2431,7 +2433,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct bio *bio, int err) +static void end_bio_extent_readpage(struct bio *bio, int err, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -3270,7 +3273,8 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } -static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +static void end_bio_extent_buffer_writepage(struct bio *bio, int err, + struct batch_complete *batch) { int uptodate = err == 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index db57e6384fbb..06acf922af38 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6939,7 +6939,8 @@ struct btrfs_dio_private { struct bio *dio_bio; }; -static void btrfs_endio_direct_read(struct bio *bio, int err) +static void btrfs_endio_direct_read(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_dio_private *dip = bio->bi_private; struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -6993,11 +6994,12 @@ failed: /* If we had a csum failure make sure to clear the uptodate flag */ if (err) clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, err, batch); bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio, int err) +static void btrfs_endio_direct_write(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; @@ -7040,7 +7042,7 @@ out_done: /* If we had an error make sure to clear the uptodate flag */ if (err) clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, err, batch); bio_put(bio); } @@ -7055,7 +7057,8 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } -static void btrfs_end_dio_bio(struct bio *bio, int err) +static void btrfs_end_dio_bio(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_dio_private *dip = bio->bi_private; @@ -7081,7 +7084,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) bio_io_error(dip->orig_bio); } else { set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); - bio_endio(dip->orig_bio, 0); + bio_endio_batch(dip->orig_bio, 0, batch); } out: bio_put(bio); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0525e1389f5b..17cef49e21f6 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -850,7 +850,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) * end io function used by finish_rmw. When we finally * get here, we've written a full stripe */ -static void raid_write_end_io(struct bio *bio, int err) +static void raid_write_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1384,7 +1385,8 @@ static void set_bio_pages_uptodate(struct bio *bio) * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid_rmw_end_io(struct bio *bio, int err) +static void raid_rmw_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1905,7 +1907,8 @@ cleanup_io: * This is called only for stripes we've read from disk to * reconstruct the parity. */ -static void raid_recover_end_io(struct bio *bio, int err) +static void raid_recover_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_raid_bio *rbio = bio->bi_private; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 79bd479317cb..86114520ba45 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -200,7 +200,8 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int is_metadata, int have_csum, const u8 *csum, u64 generation, u16 csum_size); -static void scrub_complete_bio_end_io(struct bio *bio, int err); +static void scrub_complete_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write); @@ -223,7 +224,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, int force, u64 physical_for_dev_replace); -static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, @@ -240,7 +242,8 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); -static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); @@ -1384,7 +1387,8 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, sblock->checksum_error = 1; } -static void scrub_complete_bio_end_io(struct bio *bio, int err) +static void scrub_complete_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } @@ -1584,7 +1588,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) btrfsic_submit_bio(WRITE, sbio->bio); } -static void scrub_wr_bio_end_io(struct bio *bio, int err) +static void scrub_wr_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; @@ -2053,7 +2058,8 @@ leave_nomem: return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8bffb9174afb..34cb538ecd2e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5018,7 +5018,8 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void btrfs_end_bio(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; @@ -5073,7 +5074,7 @@ static void btrfs_end_bio(struct bio *bio, int err) } kfree(bbio); - bio_endio(bio, err); + bio_endio_batch(bio, err, batch); } else if (!is_orig_bio) { bio_put(bio); } diff --git a/fs/buffer.c b/fs/buffer.c index f93392e2df12..d3b6c14a7b1c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2897,7 +2897,8 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, } EXPORT_SYMBOL(generic_block_bmap); -static void end_bio_bh_io_sync(struct bio *bio, int err) +static void end_bio_bh_io_sync(struct bio *bio, int err, + struct batch_complete *batch) { struct buffer_head *bh = bio->bi_private; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 2b6cb23dd14e..1d1c41f1014d 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, count); + len = flush_write_buffer(file->f_path.dentry, buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); diff --git a/fs/direct-io.c b/fs/direct-io.c index 7ab90f5081ee..b8707bf52b82 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -230,7 +230,8 @@ static inline struct page *dio_get_page(struct dio *dio, * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async, + struct batch_complete *batch) { ssize_t transferred = 0; @@ -264,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is } else { inode_dio_done(dio->inode); if (is_async) - aio_complete(dio->iocb, ret, 0); + aio_complete_batch(dio->iocb, ret, 0, batch); } return ret; @@ -274,7 +275,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. */ -static void dio_bio_end_aio(struct bio *bio, int error) +static void dio_bio_end_aio(struct bio *bio, int error, struct batch_complete *batch) { struct dio *dio = bio->bi_private; unsigned long remaining; @@ -290,7 +291,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - dio_complete(dio, dio->iocb->ki_pos, 0, true); + dio_complete(dio, dio->iocb->ki_pos, 0, true, batch); kmem_cache_free(dio_cache, dio); } } @@ -324,12 +325,12 @@ static void dio_bio_end_io(struct bio *bio, int error) * so that the DIO specific endio actions are dealt with after the filesystem * has done it's completion work. */ -void dio_end_io(struct bio *bio, int error) +void dio_end_io(struct bio *bio, int error, struct batch_complete *batch) { struct dio *dio = bio->bi_private; if (dio->is_async) - dio_bio_end_aio(bio, error); + dio_bio_end_aio(bio, error, batch); else dio_bio_end_io(bio, error); } @@ -350,10 +351,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_bdev = bdev; bio->bi_sector = first_sector; - if (dio->is_async) - bio->bi_end_io = dio_bio_end_aio; - else - bio->bi_end_io = dio_bio_end_io; + bio->bi_end_io = dio_end_io; sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; @@ -1268,7 +1266,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, offset, retval, false); + retval = dio_complete(dio, offset, retval, false, NULL); kmem_cache_free(dio_cache, dio); } else BUG_ON(retval != -EIOCBQUEUED); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index c00e055b6282..f23d2a7ed438 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -58,6 +58,8 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, if (ret) return ret; if (write) { + printk(KERN_NOTICE "%s (%d): dropped kernel caches: %d\n", + current->comm, task_pid_nr(current), sysctl_drop_caches); if (sysctl_drop_caches & 1) iterate_supers(drop_pagecache_sb, NULL); if (sysctl_drop_caches & 2) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4acf1f78881b..1678d9ed2354 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -219,7 +219,8 @@ static void buffer_io_error(struct buffer_head *bh) (unsigned long long)bh->b_blocknr); } -static void ext4_end_bio(struct bio *bio, int error) +static void ext4_end_bio(struct bio *bio, int error, + struct batch_complete *batch) { ext4_io_end_t *io_end = bio->bi_private; struct inode *inode; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f0f2e26e0e53..552bcbbbc8c4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -344,7 +344,7 @@ repeat: return page; } -static void read_end_io(struct bio *bio, int err) +static void read_end_io(struct bio *bio, int err, struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index be668ffb001c..e76c448bd6ca 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -634,7 +634,8 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -static void f2fs_end_io_write(struct bio *bio, int err) +static void f2fs_end_io_write(struct bio *bio, int err, + struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; diff --git a/fs/fat/file.c b/fs/fat/file.c index b0b632e50ddb..73264395f705 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -17,8 +17,11 @@ #include <linux/blkdev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -140,6 +143,22 @@ static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, static int fat_file_release(struct inode *inode, struct file *filp) { + + struct super_block *sb = inode->i_sb; + loff_t mmu_private_ideal; + + /* + * Release unwritten fallocated blocks on file release. + * Do this only when the last open file descriptor is closed. + */ + mutex_lock(&inode->i_mutex); + mmu_private_ideal = round_up(inode->i_size, sb->s_blocksize); + + if (mmu_private_ideal < MSDOS_I(inode)->mmu_private && + filp->f_dentry->d_count == 1) + fat_truncate_blocks(inode, inode->i_size); + mutex_unlock(&inode->i_mutex); + if ((filp->f_mode & FMODE_WRITE) && MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); @@ -174,6 +193,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -212,6 +232,88 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. The + * allocated clusters are freed in fat_file_release(). + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int cluster, fclus, dclus; + int nr_cluster; /* Number of clusters to be allocated */ + loff_t nr_bytes; /* Number of bytes to be allocated*/ + loff_t free_bytes; /* Unused bytes in the last cluster of file*/ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if ((offset + len) <= MSDOS_I(inode)->mmu_private) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): Blocks already allocated"); + err = -EINVAL; + goto error; + } + + if (mode & FALLOC_FL_KEEP_SIZE) { + /* First compute the number of clusters to be allocated */ + if (inode->i_size > 0) { + err = fat_get_cluster(inode, FAT_ENT_EOF, + &fclus, &dclus); + if (err < 0) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_get_cluster() error"); + goto error; + } + free_bytes = ((fclus + 1) << sbi->cluster_bits) - + inode->i_size; + nr_bytes = offset + len - inode->i_size - free_bytes; + MSDOS_I(inode)->mmu_private = (fclus + 1) << + sbi->cluster_bits; + } else + nr_bytes = offset + len - inode->i_size; + + nr_cluster = (nr_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_alloc_clusters() error"); + goto error; + } + err = fat_chain_add(inode, cluster, 1); + if (err) { + fat_free_clusters(inode, cluster); + goto error; + } + MSDOS_I(inode)->mmu_private += sbi->cluster_size; + } + } else { + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_cont_expand() error"); + } + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { @@ -378,6 +480,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; unsigned int ia_valid; int error; + loff_t mmu_private_ideal; + + mmu_private_ideal = round_up(inode->i_size, dentry->d_sb->s_blocksize); /* Check for setting the inode time. */ ia_valid = attr->ia_valid; @@ -403,7 +508,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { inode_dio_wait(inode); - if (attr->ia_size > inode->i_size) { + if (attr->ia_size > inode->i_size && + MSDOS_I(inode)->mmu_private <= mmu_private_ideal) { error = fat_cont_expand(inode, attr->ia_size); if (error || attr->ia_valid == ATTR_SIZE) goto out; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5d4513cb1b3c..f49de9379bcd 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -152,11 +152,65 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) } } +static int fat_zero_falloc_area(struct file *file, + struct address_space *mapping, loff_t pos) +{ + struct page *page; + struct inode *inode = mapping->host; + loff_t curpos = i_size_read(inode); + size_t count = pos - curpos; + int err; + + do { + unsigned offset; + size_t bytes; + void *fsdata; + + offset = (curpos & (PAGE_CACHE_SIZE - 1)); + bytes = PAGE_CACHE_SIZE - offset; + bytes = min(bytes, count); + + err = pagecache_write_begin(NULL, mapping, curpos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + break; + + zero_user(page, offset, bytes); + + err = pagecache_write_end(NULL, mapping, curpos, bytes, bytes, + page, fsdata); + if (err < 0) + break; + curpos += bytes; + count -= bytes; + err = 0; + } while (count); + + return err; +} + static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int err; + loff_t mmu_private_ideal, mmu_private_actual; + loff_t size; + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + + size = i_size_read(inode); + mmu_private_actual = MSDOS_I(inode)->mmu_private; + mmu_private_ideal = round_up(size, sb->s_blocksize); + if ((mmu_private_actual > mmu_private_ideal) && (pos > size)) { + err = fat_zero_falloc_area(file, mapping, pos); + if (err) { + fat_msg(sb, KERN_ERR, + "Error (%d) zeroing fallocated area", err); + return err; + } + } *pagep = NULL; err = cont_write_begin(file, mapping, pos, len, flags, diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 359d307b5507..628e22a5a543 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); + fat_msg(sb, KERN_ERR, "error, %pV", &vaf); va_end(args); } @@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { sb->s_flags |= MS_RDONLY; - printk(KERN_ERR "FAT-fs (%s): Filesystem has been " - "set read-only\n", sb->s_id); + fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } EXPORT_SYMBOL_GPL(__fat_fs_error); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 33f18b7282b2..d3ac40e24111 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -201,7 +201,8 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, * */ -static void gfs2_end_log_write(struct bio *bio, int error) +static void gfs2_end_log_write(struct bio *bio, int error, + struct batch_complete *batch) { struct gfs2_sbd *sdp = bio->bi_private; struct bio_vec *bvec; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 60ede2a0f43f..86eb657aeaca 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -155,7 +155,8 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } -static void end_bio_io_page(struct bio *bio, int error) +static void end_bio_io_page(struct bio *bio, int error, + struct batch_complete *batch) { struct page *page = bio->bi_private; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b51a6079108d..96375a5124b2 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -24,7 +24,8 @@ struct hfsplus_wd { u16 embed_count; }; -static void hfsplus_end_io_sync(struct bio *bio, int err) +static void hfsplus_end_io_sync(struct bio *bio, int err, + struct batch_complete *batch) { if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 360d27c48887..4c3289c00d0c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2012,7 +2012,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_size = 0; - lbmIODone(bio, 0); + lbmIODone(bio, 0, NULL); } else { submit_bio(READ_SYNC, bio); } @@ -2159,7 +2159,7 @@ static void lbmStartIO(struct lbuf * bp) /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_size = 0; - lbmIODone(bio, 0); + lbmIODone(bio, 0, NULL); } else { submit_bio(WRITE_SYNC, bio); INCREMENT(lmStat.submitted); @@ -2197,7 +2197,7 @@ static int lbmIOWait(struct lbuf * bp, int flag) * * executed at INTIODONE level */ -static void lbmIODone(struct bio *bio, int error) +static void lbmIODone(struct bio *bio, int error, struct batch_complete *batch) { struct lbuf *bp = bio->bi_private; struct lbuf *nextbp, *tail; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 9e3aaff11f89..de87794fedaf 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -283,7 +283,8 @@ static void last_read_complete(struct page *page) unlock_page(page); } -static void metapage_read_end_io(struct bio *bio, int err) +static void metapage_read_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct page *page = bio->bi_private; @@ -338,7 +339,8 @@ static void last_write_complete(struct page *page) end_page_writeback(page); } -static void metapage_write_end_io(struct bio *bio, int err) +static void metapage_write_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct page *page = bio->bi_private; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 550475ca6a0e..0ae2254f74bf 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -14,7 +14,8 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static void request_complete(struct bio *bio, int err) +static void request_complete(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } @@ -64,7 +65,8 @@ static int bdev_readpage(void *_sb, struct page *page) static DECLARE_WAIT_QUEUE_HEAD(wq); -static void writeseg_end_io(struct bio *bio, int err) +static void writeseg_end_io(struct bio *bio, int err, + struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -168,7 +170,7 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) } -static void erase_end_io(struct bio *bio, int err) +static void erase_end_io(struct bio *bio, int err, struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct super_block *sb = bio->bi_private; diff --git a/fs/mpage.c b/fs/mpage.c index 0face1c4d4c6..a4089bbfee0a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -41,7 +41,7 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio, int err, struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index ee24df5af1f9..3c5dd55d284c 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; /* we do not support files bigger than 4GB... We eventually supports just 4GB... */ - if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff + if (vma_pages(vma) + vma->vm_pgoff > (1U << (32 - PAGE_SHIFT))) return -EFBIG; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1e5fdd3506e2..dba178e2a1bd 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -143,7 +143,7 @@ bl_submit_bio(int rw, struct bio *bio) static struct bio *bl_alloc_init_bio(int npg, sector_t isect, struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), + bio_end_io_t *end_io, struct parallel_io *par) { struct bio *bio; @@ -167,7 +167,7 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), + bio_end_io_t *end_io, struct parallel_io *par, unsigned int offset, int len) { @@ -190,7 +190,7 @@ retry: static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), + bio_end_io_t *end_io, struct parallel_io *par) { return do_add_page_to_bio(bio, npg, rw, isect, page, be, @@ -198,7 +198,8 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, } /* This is basically copied from mpage_end_io_read */ -static void bl_end_io_read(struct bio *bio, int err) +static void bl_end_io_read(struct bio *bio, int err, + struct batch_complete *batch) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -380,7 +381,8 @@ static void mark_extents_written(struct pnfs_block_layout *bl, } } -static void bl_end_io_write_zero(struct bio *bio, int err) +static void bl_end_io_write_zero(struct bio *bio, int err, + struct batch_complete *batch) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -408,7 +410,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err) put_parallel(par); } -static void bl_end_io_write(struct bio *bio, int err) +static void bl_end_io_write(struct bio *bio, int err, + struct batch_complete *batch) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -487,7 +490,7 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) } static void -bl_read_single_end_io(struct bio *bio, int error) +bl_read_single_end_io(struct bio *bio, int error, struct batch_complete *batch) { struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc9a913784ab..680b65b8a74d 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,7 +338,8 @@ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) /* * BIO operations */ -static void nilfs_end_bio_write(struct bio *bio, int err) +static void nilfs_end_bio_write(struct bio *bio, int err, + struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 8c3318bf2252..0cc19d0417bf 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -372,8 +372,8 @@ static void o2hb_wait_on_io(struct o2hb_region *reg, wait_for_completion(&wc->wc_io_complete); } -static void o2hb_bio_end_io(struct bio *bio, - int error) +static void o2hb_bio_end_io(struct bio *bio, int error, + struct batch_complete *batch) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 04ee1b57c243..33c7b91c777b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -947,7 +947,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status) + if (status && (status != -ENOTEMPTY)) mlog_errno(status); return status; diff --git a/fs/pnode.c b/fs/pnode.c index 3d2a7141b87a..9af0df15256e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt) if (peer_mnt == mnt) peer_mnt = NULL; } - if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && + list_empty(&mnt->mnt_share)) mnt_release_group_id(mnt); list_del_init(&mnt->mnt_share); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bd4b5a740ff1..bdfabdaefdce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3e636d864d56..dbf61f6174f0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + * but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + * these flag is set to denote, that user is aware of the + * new API and those page-shift bits change their meaning. + * The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + * of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_LAST, +}; + +struct clear_refs_private { + struct vm_area_struct *vma; + enum clear_refs_types type; +}; + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(ptent)) continue; + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - int type; + enum clear_refs_types type; + int itype; int rv; memset(buffer, 0, sizeof(buffer)); @@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &type); + rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning! " + "See the linux/Documentation/vm/pagemap.txt for details.\n"); + } + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + .type = type, + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_start(mm, 0, -1); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* @@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -794,6 +861,7 @@ typedef struct { struct pagemapread { int pos, len; pagemap_entry_t *buffer; + bool v2; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -807,14 +875,17 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 val) @@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); for (addr = start; addr < end; addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); @@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame, flags; struct page *page = NULL; + int flags2 = 0; if (pte_present(pte)) { frame = pte_pfn(pte); @@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { /* * Currently pmd for thp is always present because thp can not be @@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { } #endif @@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + int pmd_flags2; + + pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT); + pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* check that 'vma' actually covers this address, @@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, vma, addr, *pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pte_t pte, int offset) { if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* This function walks within one hugetlb entry in the single call */ @@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; + pm.v2 = soft_dirty_cleared; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1110,9 +1188,18 @@ out: return ret; } +static int pagemap_open(struct inode *inode, struct file *file) +{ + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, + .open = pagemap_open, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 76c8faddf137..0048cc16a6a8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1811,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - if (insert_inode_locked4(inode, args.objectid, - reiserfs_find_actor, &args) < 0) { + + reiserfs_write_unlock(inode->i_sb); + err = insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args); + reiserfs_write_lock(inode->i_sb); + if (err) { err = -EINVAL; goto out_bad_inode; } + if (old_format_only(sb)) /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 596ec71da00e..8a58be3f2dcf 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -380,7 +380,8 @@ xfs_imap_valid( STATIC void xfs_end_bio( struct bio *bio, - int error) + int error, + struct batch_complete *batch) { xfs_ioend_t *ioend = bio->bi_private; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1b2472a46e46..e8610aa1a0fe 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1225,7 +1225,8 @@ _xfs_buf_ioend( STATIC void xfs_buf_bio_end_io( struct bio *bio, - int error) + int error, + struct batch_complete *batch) { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a59ff51b0166..185cf318080a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -173,11 +173,12 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE @@ -396,6 +397,28 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #define arch_start_context_switch(prev) do {} while (0) #endif +#ifndef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline int pte_soft_dirty(pte_t pte) +{ + return 0; +} + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return 0; +} + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pmd; +} +#endif + #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of diff --git a/include/linux/aio.h b/include/linux/aio.h index 1bdf965339f9..a7e4c595825e 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -6,11 +6,12 @@ #include <linux/aio_abi.h> #include <linux/uio.h> #include <linux/rcupdate.h> - #include <linux/atomic.h> +#include <linux/batch_complete.h> struct kioctx; struct kiocb; +struct batch_complete; #define KIOCB_KEY 0 @@ -30,6 +31,8 @@ struct kiocb; typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *); struct kiocb { + struct rb_node ki_node; + atomic_t ki_users; struct file *ki_filp; @@ -43,6 +46,9 @@ struct kiocb { } ki_obj; __u64 ki_user_data; /* user's data for completion */ + long ki_res; + long ki_res2; + loff_t ki_pos; void *private; @@ -85,7 +91,9 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) #ifdef CONFIG_AIO extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); extern void aio_put_req(struct kiocb *iocb); -extern void aio_complete(struct kiocb *iocb, long res, long res2); +extern void batch_complete_aio(struct batch_complete *batch); +extern void aio_complete_batch(struct kiocb *iocb, long res, long res2, + struct batch_complete *batch); struct mm_struct; extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, long nr, @@ -94,7 +102,13 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } static inline void aio_put_req(struct kiocb *iocb) { } -static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } + +static inline void batch_complete_aio(struct batch_complete *batch) { } +static inline void aio_complete_batch(struct kiocb *iocb, long res, long res2, + struct batch_complete *batch) +{ + return; +} struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } static inline long do_io_submit(aio_context_t ctx_id, long nr, @@ -104,6 +118,11 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ +static inline void aio_complete(struct kiocb *iocb, long res, long res2) +{ + aio_complete_batch(iocb, res, res2, NULL); +} + static inline struct kiocb *list_kiocb(struct list_head *h) { return list_entry(h, struct kiocb, ki_list); diff --git a/include/linux/audit.h b/include/linux/audit.h index b20b03852f21..729a4d165bcc 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -103,8 +103,11 @@ extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); extern void audit_putname(struct filename *name); + +#define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ +#define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ extern void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent); + unsigned int flags); extern void __audit_inode_child(const struct inode *parent, const struct dentry *dentry, const unsigned char type); @@ -148,10 +151,22 @@ static inline void audit_getname(struct filename *name) if (unlikely(!audit_dummy_context())) __audit_getname(name); } -static inline void audit_inode(struct filename *name, const struct dentry *dentry, +static inline void audit_inode(struct filename *name, + const struct dentry *dentry, unsigned int parent) { + if (unlikely(!audit_dummy_context())) { + unsigned int flags = 0; + if (parent) + flags |= AUDIT_INODE_PARENT; + __audit_inode(name, dentry, flags); + } +} +static inline void audit_inode_parent_hidden(struct filename *name, + const struct dentry *dentry) +{ if (unlikely(!audit_dummy_context())) - __audit_inode(name, dentry, parent); + __audit_inode(name, dentry, + AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(const struct inode *parent, const struct dentry *dentry, @@ -311,7 +326,7 @@ static inline void audit_putname(struct filename *name) { } static inline void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { } static inline void __audit_inode_child(const struct inode *parent, const struct dentry *dentry, @@ -321,6 +336,9 @@ static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int parent) { } +static inline void audit_inode_parent_hidden(struct filename *name, + const struct dentry *dentry) +{ } static inline void audit_inode_child(const struct inode *parent, const struct dentry *dentry, const unsigned char type) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f7f1d7169b11..6fd5cc80f62f 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -213,8 +213,15 @@ static inline bool balloon_compaction_check(void) return true; } +static inline void balloon_event_count(enum vm_event_item item) +{ + count_vm_event(item); +} #else /* !CONFIG_BALLOON_COMPACTION */ +/* A macro, to avoid generating references to the undefined COMPACTBALLOON* */ +#define balloon_event_count(item) do { } while (0) + static inline void *balloon_mapping_alloc(void *balloon_device, const struct address_space_operations *a_ops) { diff --git a/include/linux/batch_complete.h b/include/linux/batch_complete.h new file mode 100644 index 000000000000..8167a9d306fb --- /dev/null +++ b/include/linux/batch_complete.h @@ -0,0 +1,23 @@ +#ifndef _LINUX_BATCH_COMPLETE_H +#define _LINUX_BATCH_COMPLETE_H + +#include <linux/rbtree.h> + +/* + * Common stuff to the aio and block code for batch completion. Everything + * important is elsewhere: + */ + +struct bio; + +struct bio_list { + struct bio *head; + struct bio *tail; +}; + +struct batch_complete { + struct bio_list bio; + struct rb_root kiocb; +}; + +#endif diff --git a/include/linux/bio.h b/include/linux/bio.h index ef24466d8f82..5db8a51eebb1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -24,6 +24,7 @@ #include <linux/mempool.h> #include <linux/ioprio.h> #include <linux/bug.h> +#include <linux/batch_complete.h> #ifdef CONFIG_BLOCK @@ -69,6 +70,8 @@ #define bio_sectors(bio) ((bio)->bi_size >> 9) #define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors((bio))) +void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch); + static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio->bi_vcnt) @@ -252,7 +255,25 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } -extern void bio_endio(struct bio *, int); +/** + * bio_endio - end I/O on a bio + * @bio: bio + * @error: error, if any + * + * Description: + * bio_endio() will end I/O on the whole bio. bio_endio() is the + * preferred way to end I/O on a bio, it takes care of clearing + * BIO_UPTODATE on error. @error is 0 on success, and and one of the + * established -Exxxx (-EIO, for instance) error values in case + * something went wrong. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io + * function. + **/ +static inline void bio_endio(struct bio *bio, int error) +{ + bio_endio_batch(bio, error, NULL); +} + struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -404,10 +425,6 @@ static inline bool bio_mergeable(struct bio *bio) * member of the bio. The bio_list also caches the last list member to allow * fast access to the tail. */ -struct bio_list { - struct bio *head; - struct bio *tail; -}; static inline int bio_list_empty(const struct bio_list *bl) { @@ -554,6 +571,15 @@ struct biovec_slab { */ #define BIO_SPLIT_ENTRIES 2 +static inline void batch_complete_init(struct batch_complete *batch) +{ + bio_list_init(&batch->bio); + batch->kiocb = RB_ROOT; +} + +void batch_complete(struct batch_complete *batch); + + #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) @@ -580,7 +606,7 @@ extern int bio_integrity_enabled(struct bio *bio); extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); extern int bio_integrity_prep(struct bio *); -extern void bio_integrity_endio(struct bio *, int); +extern void bio_integrity_endio(struct bio *, int, struct batch_complete *); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); extern void bio_integrity_split(struct bio *, struct bio_pair *, int); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fa1abeb45b76..9d3cafa6bbcd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -16,7 +16,8 @@ struct page; struct block_device; struct io_context; struct cgroup_subsys_state; -typedef void (bio_end_io_t) (struct bio *, int); +struct batch_complete; +typedef void (bio_end_io_t) (struct bio *, int, struct batch_complete *); typedef void (bio_destructor_t) (struct bio *); /* @@ -42,6 +43,7 @@ struct bio { * top bits priority */ + short bi_error; unsigned short bi_vcnt; /* how many bio_vec's */ unsigned short bi_idx; /* current index into bvl_vec */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2fdb4a451b49..ddc2f8058c70 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -883,7 +883,8 @@ extern struct request *blk_fetch_request(struct request_queue *q); * This prevents code duplication in drivers. */ extern bool blk_update_request(struct request *rq, int error, - unsigned int nr_bytes); + unsigned int nr_bytes, + struct batch_complete *batch); extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); @@ -891,10 +892,17 @@ extern bool blk_end_request_cur(struct request *rq, int error); extern bool blk_end_request_err(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); -extern void __blk_end_request_all(struct request *rq, int error); extern bool __blk_end_request_cur(struct request *rq, int error); extern bool __blk_end_request_err(struct request *rq, int error); +extern void blk_end_request_all_batch(struct request *rq, int error, + struct batch_complete *batch); + +static inline void __blk_end_request_all(struct request *rq, int error) +{ + blk_end_request_all_batch(rq, error, NULL); +} + extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 0d868d37bb8e..944f283f01c4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -174,6 +174,8 @@ extern struct bus_type cpu_subsys; extern void get_online_cpus(void); extern void put_online_cpus(void); +extern void cpu_hotplug_disable(void); +extern void cpu_hotplug_enable(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) @@ -197,6 +199,8 @@ static inline void cpu_hotplug_driver_unlock(void) #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) +#define cpu_hotplug_disable() do { } while (0) +#define cpu_hotplug_enable() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. */ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/include/linux/decompress/unlz4.h b/include/linux/decompress/unlz4.h new file mode 100644 index 000000000000..d5b68bf3ec92 --- /dev/null +++ b/include/linux/decompress/unlz4.h @@ -0,0 +1,10 @@ +#ifndef DECOMPRESS_UNLZ4_H +#define DECOMPRESS_UNLZ4_H + +int unlz4(unsigned char *inbuf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *pos, + void(*error)(char *x)); +#endif diff --git a/include/linux/err.h b/include/linux/err.h index f2edce25a76b..221fcfb676c4 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -24,17 +24,17 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } -static inline long __must_check PTR_ERR(const void *ptr) +static inline long __must_check PTR_ERR(__force const void *ptr) { return (long) ptr; } -static inline long __must_check IS_ERR(const void *ptr) +static inline long __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } -static inline long __must_check IS_ERR_OR_NULL(const void *ptr) +static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); } @@ -46,13 +46,13 @@ static inline long __must_check IS_ERR_OR_NULL(const void *ptr) * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ -static inline void * __must_check ERR_CAST(const void *ptr) +static inline void * __must_check ERR_CAST(__force const void *ptr) { /* cast away the const */ return (void *) ptr; } -static inline int __must_check PTR_RET(const void *ptr) +static inline int __must_check PTR_RET(__force const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9f696014988d..f47e43c52562 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2453,7 +2453,7 @@ enum { DIO_SKIP_HOLES = 0x02, }; -void dio_end_io(struct bio *bio, int error); +void dio_end_io(struct bio *bio, int error, struct batch_complete *batch); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 05bcc0903766..ecc8e01f492a 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -94,6 +94,11 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) +/* + * Are we in nmi,irq context, or softirq context? + */ +#define in_serving_irq() (in_nmi() || in_irq() || in_serving_softirq()) + #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_CHECK_OFFSET 1 #else diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 6f24446e7669..b519c47c635b 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -74,6 +74,7 @@ #define HID_USAGE_SENSOR_TIME_HOUR 0x200525 #define HID_USAGE_SENSOR_TIME_MINUTE 0x200526 #define HID_USAGE_SENSOR_TIME_SECOND 0x200527 +#define HID_USAGE_SENSOR_TIME_MILLISECOND 0x200528 /* Units */ #define HID_USAGE_SENSOR_UNITS_NOT_SPECIFIED 0x00 diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 528454c2caa9..cc276d2c3a40 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -58,12 +58,11 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) +#define HPAGE_PMD_SHIFT PMD_SHIFT +#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) +#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define HPAGE_PMD_SHIFT HPAGE_SHIFT -#define HPAGE_PMD_MASK HPAGE_MASK -#define HPAGE_PMD_SIZE HPAGE_SIZE - extern bool is_vma_temporary_stack(struct vm_area_struct *vma); #define transparent_hugepage_enabled(__vma) \ @@ -181,9 +180,6 @@ extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vm unsigned long addr, pmd_t pmd, pmd_t *pmdp); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) -#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) -#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) #define hpage_nr_pages(x) 1 diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f1e877b79ed8..cfc2f119779a 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -365,7 +365,7 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) -#else /* !LOCKDEP */ +#else /* !CONFIG_LOCKDEP */ static inline void lockdep_off(void) { @@ -479,82 +479,36 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# else -# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# endif -# define spin_release(l, n, i) lock_release(l, n, i) +#ifdef CONFIG_PROVE_LOCKING + #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) + #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 2, n, i) + #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 2, n, i) #else -# define spin_acquire(l, s, t, i) do { } while (0) -# define spin_release(l, n, i) do { } while (0) + #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) + #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) + #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, NULL, i) -# else -# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, NULL, i) -# endif -# define rwlock_release(l, n, i) lock_release(l, n, i) -#else -# define rwlock_acquire(l, s, t, i) do { } while (0) -# define rwlock_acquire_read(l, s, t, i) do { } while (0) -# define rwlock_release(l, n, i) do { } while (0) -#endif +#define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define spin_release(l, n, i) lock_release(l, n, i) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# else -# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) -# endif -# define mutex_release(l, n, i) lock_release(l, n, i) -#else -# define mutex_acquire(l, s, t, i) do { } while (0) -# define mutex_acquire_nest(l, s, t, n, i) do { } while (0) -# define mutex_release(l, n, i) do { } while (0) -#endif +#define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) +#define rwlock_release(l, n, i) lock_release(l, n, i) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i) -# else -# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) -# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i) -# endif +#define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define mutex_release(l, n, i) lock_release(l, n, i) + +#define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) # define rwsem_release(l, n, i) lock_release(l, n, i) -#else -# define rwsem_acquire(l, s, t, i) do { } while (0) -# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0) -# define rwsem_acquire_read(l, s, t, i) do { } while (0) -# define rwsem_release(l, n, i) do { } while (0) -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_) -# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_) -# else -# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_) -# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_) -# endif +#define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) +#define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) # define lock_map_release(l) lock_release(l, 1, _THIS_IP_) -#else -# define lock_map_acquire(l) do { } while (0) -# define lock_map_acquire_read(l) do { } while (0) -# define lock_map_release(l) do { } while (0) -#endif #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ diff --git a/include/linux/lz4.h b/include/linux/lz4.h new file mode 100644 index 000000000000..d21c13f10a64 --- /dev/null +++ b/include/linux/lz4.h @@ -0,0 +1,87 @@ +#ifndef __LZ4_H__ +#define __LZ4_H__ +/* + * LZ4 Kernel Interface + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *)) +#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *)) + +/* + * lz4_compressbound() + * Provides the maximum size that LZ4 may output in a "worst case" scenario + * (input data not compressible) + */ +static inline size_t lz4_compressbound(size_t isize) +{ + return isize + (isize / 255) + 16; +} + +/* + * lz4_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + + /* + * lz4hc_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + +/* + * lz4_decompress() + * src : source address of the compressed data + * src_len : is the input size, whcih is returned after decompress done + * dest : output buffer address of the decompressed data + * actual_dest_len: is the size of uncompressed data, supposing it's known + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + * slightly faster than lz4_decompress_unknownoutputsize() + */ +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len); + +/* + * lz4_decompress_unknownoutputsize() + * src : source address of the compressed data + * src_len : is the input size, therefore the compressed size + * dest : output buffer address of the decompressed data + * dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after + * decompress done + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + */ +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len); +#endif diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d6183f06d8c1..7b4d9d79570b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page); bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, struct mem_cgroup *memcg); -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -273,10 +274,10 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return true; } -static inline int task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) +static inline bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - return 1; + return true; } static inline struct cgroup_subsys_state diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ace9a5f01c64..1976fa91125e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/threads.h> #include <linux/list.h> +#include <linux/radix-tree.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/rwsem.h> @@ -330,12 +331,9 @@ struct mm_struct { unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); - void (*unmap_area) (struct mm_struct *mm, unsigned long addr); #endif unsigned long mmap_base; /* base of mmap area */ unsigned long task_size; /* size of task vm space */ - unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ - unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ @@ -386,7 +384,7 @@ struct mm_struct { struct core_state *core_state; /* coredumping support */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; - struct hlist_head ioctx_list; + struct radix_tree_root ioctx_rtree; #endif #ifdef CONFIG_MM_OWNER /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c76737d836b..131989a8f574 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -495,6 +495,13 @@ typedef enum { ZONE_CONGESTED, /* zone has many dirty pages backed by * a congested BDI */ + ZONE_TAIL_LRU_DIRTY, /* reclaim scanning has recently found + * many dirty file pages at the tail + * of the LRU. + */ + ZONE_WRITEBACK, /* reclaim scanning has recently found + * many pages under writeback + */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -517,6 +524,16 @@ static inline int zone_is_reclaim_congested(const struct zone *zone) return test_bit(ZONE_CONGESTED, &zone->flags); } +static inline int zone_is_reclaim_dirty(const struct zone *zone) +{ + return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags); +} + +static inline int zone_is_reclaim_writeback(const struct zone *zone) +{ + return test_bit(ZONE_WRITEBACK, &zone->flags); +} + static inline int zone_is_reclaim_locked(const struct zone *zone) { return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); @@ -716,7 +733,10 @@ typedef struct pglist_data { * or node_spanned_pages stay constant. Holding this will also * guarantee that any pfn_valid() stays that way. * - * Nests above zone->lock and zone->size_seqlock. + * pgdat_resize_lock() and pgdat_resize_unlock() are provided to + * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG. + * + * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif @@ -815,7 +835,10 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ -#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +static inline enum zone_type zone_idx(struct zone *zone) +{ + return zone - zone->zone_pgdat->node_zones; +} static inline int populated_zone(struct zone *zone) { @@ -856,25 +879,18 @@ static inline int is_normal_idx(enum zone_type idx) */ static inline int is_highmem(struct zone *zone) { -#ifdef CONFIG_HIGHMEM - int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; - return zone_off == ZONE_HIGHMEM * sizeof(*zone) || - (zone_off == ZONE_MOVABLE * sizeof(*zone) && - zone_movable_is_highmem()); -#else - return 0; -#endif + return is_highmem_idx(zone_idx(zone)); } static inline int is_normal(struct zone *zone) { - return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; + return zone_idx(zone) == ZONE_NORMAL; } static inline int is_dma32(struct zone *zone) { #ifdef CONFIG_ZONE_DMA32 - return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; + return zone_idx(zone) == ZONE_DMA32; #else return 0; #endif @@ -883,7 +899,7 @@ static inline int is_dma32(struct zone *zone) static inline int is_dma(struct zone *zone) { #ifdef CONFIG_ZONE_DMA - return zone == zone->zone_pgdat->node_zones + ZONE_DMA; + return zone_idx(zone) == ZONE_DMA; #else return 0; #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6d53675c2b54..f1a5b5937be4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -228,9 +228,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) PAGEFLAG(MappedToDisk, mappedtodisk) -/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ +/* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ +PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) #ifdef CONFIG_HIGHMEM /* diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h new file mode 100644 index 000000000000..d0cf8872dc43 --- /dev/null +++ b/include/linux/percpu-refcount.h @@ -0,0 +1,114 @@ +/* + * Dynamic percpu refcounts: + * (C) 2012 Google, Inc. + * Author: Kent Overstreet <koverstreet@google.com> + * + * This implements a refcount with similar semantics to atomic_t - atomic_inc(), + * atomic_dec_and_test() - but potentially percpu. + * + * There's one important difference between percpu refs and normal atomic_t + * refcounts; you have to keep track of your initial refcount, and then when you + * start shutting down you call percpu_ref_kill() _before_ dropping the initial + * refcount. + * + * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the + * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() + * puts the ref back in single atomic_t mode, collecting the per cpu refs and + * issuing the appropriate barriers, and then marks the ref as shutting down so + * that percpu_ref_put() will check for the ref hitting 0. After it returns, + * it's safe to drop the initial ref. + * + * BACKGROUND: + * + * Percpu refcounts are quite useful for performance, but if we blindly + * converted all refcounts to percpu counters we'd waste quite a bit of memory. + * + * Think about all the refcounts embedded in kobjects, files, etc. most of which + * aren't used much. These start out as simple atomic counters - a little bigger + * than a bare atomic_t, 16 bytes instead of 4 - but if we exceed some arbitrary + * number of gets in one second, we then switch to percpu counters. + * + * This heuristic isn't perfect because it'll fire if the refcount was only + * being used on one cpu; ideally we'd be able to count the number of cache + * misses on percpu_ref_get() or something similar, but that'd make the non + * percpu path significantly heavier/more complex. We can count the number of + * gets() without any extra atomic instructions on arches that support + * atomic64_t - simply by changing the atomic_inc() to atomic_add_return(). + * + * USAGE: + * + * See fs/aio.c for some example usage; it's used there for struct kioctx, which + * is created when userspaces calls io_setup(), and destroyed when userspace + * calls io_destroy() or the process exits. + * + * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it + * calls percpu_ref_kill(), then hlist_del_rcu() and sychronize_rcu() to remove + * the kioctx from the proccess's list of kioctxs - after that, there can't be + * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop + * the initial ref with percpu_ref_put(). + * + * Code that does a two stage shutdown like this often needs some kind of + * explicit synchronization to ensure the initial refcount can only be dropped + * once - percpu_ref_kill() does this for you, it returns true once and false if + * someone else already called it. The aio code uses it this way, but it's not + * necessary if the code has some other mechanism to synchronize teardown. + * + * As mentioned previously, we decide when to convert a ref to percpu counters + * in percpu_ref_get(). However, since percpu_ref_get() will often be called + * with rcu_read_lock() held, it's not done there - percpu_ref_get() returns + * true if the ref should be converted to percpu counters. + * + * The caller should then call percpu_ref_alloc() after dropping + * rcu_read_lock(); if there is an uncommonly used codepath where it's + * inconvenient to call percpu_ref_alloc() after get(), it may be safely skipped + * and percpu_ref_get() will return true again the next time the counter wraps + * around. + */ + +#ifndef _LINUX_PERCPU_REFCOUNT_H +#define _LINUX_PERCPU_REFCOUNT_H + +#include <linux/atomic.h> +#include <linux/percpu.h> + +struct percpu_ref { + atomic64_t count; + unsigned long pcpu_count; +}; + +void percpu_ref_init(struct percpu_ref *ref); +void __percpu_ref_get(struct percpu_ref *ref, bool alloc); +int percpu_ref_put(struct percpu_ref *ref); + +int percpu_ref_kill(struct percpu_ref *ref); +int percpu_ref_dead(struct percpu_ref *ref); + +/** + * percpu_ref_get - increment a dynamic percpu refcount + * + * Increments @ref and possibly converts it to percpu counters. Must be called + * with rcu_read_lock() held, and may potentially drop/reacquire rcu_read_lock() + * to allocate percpu counters - if sleeping/allocation isn't safe for some + * other reason (e.g. a spinlock), see percpu_ref_get_noalloc(). + * + * Analagous to atomic_inc(). + */ +static inline void percpu_ref_get(struct percpu_ref *ref) +{ + __percpu_ref_get(ref, true); +} + +/** + * percpu_ref_get_noalloc - increment a dynamic percpu refcount + * + * Increments @ref, to be used when it's not safe to allocate percpu counters. + * Must be called with rcu_read_lock() held. + * + * Analagous to atomic_inc(). + */ +static inline void percpu_ref_get_noalloc(struct percpu_ref *ref) +{ + __percpu_ref_get(ref, false); +} + +#endif diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 7794d75ed155..907f3fd191ac 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -7,14 +7,20 @@ #include <linux/timex.h> #include <linux/alarmtimer.h> -union cpu_time_count { - cputime_t cpu; - unsigned long long sched; -}; + +static inline unsigned long long cputime_to_expires(cputime_t expires) +{ + return (__force unsigned long long)expires; +} + +static inline cputime_t expires_to_cputime(unsigned long long expires) +{ + return (__force cputime_t)expires; +} struct cpu_timer_list { struct list_head entry; - union cpu_time_count expires, incr; + unsigned long long expires, incr; struct task_struct *task; int firing; }; diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 89573a33ab3c..07d0df6bf768 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -142,9 +142,6 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_set(&child->ptrace_bp_refcnt, 1); -#endif child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; @@ -351,11 +348,4 @@ extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); -#ifdef CONFIG_HAVE_HW_BREAKPOINT -extern int ptrace_get_breakpoints(struct task_struct *tsk); -extern void ptrace_put_breakpoints(struct task_struct *tsk); -#else -static inline void ptrace_put_breakpoints(struct task_struct *tsk) { } -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ - #endif diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 0022c1bb1e26..d2fdc6d1bdd2 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -29,7 +29,6 @@ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H -#include <linux/kernel.h> #include <linux/stddef.h> struct rb_node { diff --git a/include/linux/sched.h b/include/linux/sched.h index 178a8d909f14..d7715b23e842 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -322,8 +322,6 @@ extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -extern void arch_unmap_area(struct mm_struct *, unsigned long); -extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif @@ -1406,9 +1404,6 @@ struct task_struct { } memcg_batch; unsigned int memcg_kmem_skip_account; #endif -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_t ptrace_bp_refcnt; -#endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 1701ce4be746..ca031f7abee4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -330,11 +330,14 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); -extern void end_swap_bio_write(struct bio *bio, int err); +extern void end_swap_bio_write(struct bio *bio, int err, + struct batch_complete *batch); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, - void (*end_write_func)(struct bio *, int)); + void (*end_write_func)(struct bio *bio, int err, + struct batch_complete *batch)); extern int swap_set_page_dirty(struct page *page); -extern void end_swap_bio_read(struct bio *bio, int err); +extern void end_swap_bio_read(struct bio *bio, int err, + struct batch_complete *batch); int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 38911391a139..98a3153c0f96 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -44,8 +44,8 @@ /* Return size of the log buffer */ #define SYSLOG_ACTION_SIZE_BUFFER 10 -#define SYSLOG_FROM_CALL 0 -#define SYSLOG_FROM_FILE 1 +#define SYSLOG_FROM_READER 0 +#define SYSLOG_FROM_PROC 1 int do_syslog(int type, char __user *buf, int count, bool from_file); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index bd6cf61142be..d4b7a184f08c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -50,7 +50,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, -#endif +#ifdef CONFIG_BALLOON_COMPACTION + COMPACTBALLOONISOLATED, /* isolated from balloon pagelist */ + COMPACTBALLOONMIGRATED, /* balloon page sucessfully migrated */ + COMPACTBALLOONRETURNED, /* putback to pagelist, not-migrated */ +#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_COMPACTION */ #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4c062ccff9aa..4405886980c7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -905,7 +905,7 @@ struct ip_vs_app { struct ipvs_master_sync_state { struct list_head sync_queue; struct ip_vs_sync_buff *sync_buff; - int sync_queue_len; + unsigned long sync_queue_len; unsigned int sync_queue_delay; struct task_struct *master_thread; struct delayed_work master_wakeup_work; @@ -998,7 +998,7 @@ struct netns_ipvs { int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; - int sysctl_sync_qlen_max; + unsigned long sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; @@ -1085,7 +1085,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return ACCESS_ONCE(ipvs->sysctl_sync_ports); } -static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) +static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_qlen_max; } @@ -1138,7 +1138,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return 1; } -static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) +static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return IPVS_SYNC_QLEN_MAX; } diff --git a/init/Kconfig b/init/Kconfig index a29c8cd60c42..7fb26a667979 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -98,10 +98,13 @@ config HAVE_KERNEL_XZ config HAVE_KERNEL_LZO bool +config HAVE_KERNEL_LZ4 + bool + choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -168,6 +171,18 @@ config KERNEL_LZO size is about 10% bigger than gzip; however its speed (both compression and decompression) is the fastest. +config KERNEL_LZ4 + bool "LZ4" + depends on HAVE_KERNEL_LZ4 + help + LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding. + A preliminary version of LZ4 de/compression tool is available at + <https://code.google.com/p/lz4/>. + + Its compression ratio is worse than LZO. The size of the kernel + is about 8% bigger than LZO. But the decompression speed is + faster than LZO. + endchoice config DEFAULT_HOSTNAME @@ -939,6 +954,23 @@ config MEMCG_KMEM the kmem extension can use it to guarantee that no group of processes will ever exhaust kernel resources alone. +config MEMCG_DEBUG_ASYNC_DESTROY + bool "Memory Resource Controller Debug asynchronous object destruction" + depends on MEMCG_KMEM || MEMCG_SWAP + default n + help + When a memcg is destroyed, the memory consumed by it may not be + immediately freed. This is because when some extensions are used, such + as swap or kernel memory, objects can outlive the group and hold a + reference to it. + + If this is the case, the dangling_memcgs file will show information + about what are the memcgs still alive, and which references are still + preventing it to be freed. There is nothing wrong with that, but it is + very useful when debugging, to know where this memory is being held. + This is a developer-oriented debugging facility only, and no + guarantees of interface stability will be given. + config CGROUP_HUGETLB bool "HugeTLB Resource Controller for Control Groups" depends on RESOURCE_COUNTERS && HUGETLB_PAGE diff --git a/init/do_mounts.c b/init/do_mounts.c index 33d060e91fcd..c84fedb12c24 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -536,7 +536,7 @@ void __init prepare_namespace(void) int is_floppy; if (root_delay) { - printk(KERN_INFO "Waiting %dsec before mounting root device...\n", + printk(KERN_INFO "Waiting %d sec before mounting root device...\n", root_delay); ssleep(root_delay); } diff --git a/init/main.c b/init/main.c index 9484f4ba88d0..1b9e7f1b8008 100644 --- a/init/main.c +++ b/init/main.c @@ -655,8 +655,6 @@ static void __init do_ctors(void) bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); -static char msgbuf[64]; - static int __init_or_module do_one_initcall_debug(initcall_t fn) { ktime_t calltime, delta, rettime; @@ -679,6 +677,7 @@ int __init_or_module do_one_initcall(initcall_t fn) { int count = preempt_count(); int ret; + char msgbuf[64]; if (initcall_debug) ret = do_one_initcall_debug(fn); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e4e47f647446..ae1996d3c539 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -823,6 +823,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, error = ro; goto out; } + audit_inode_parent_hidden(name, root); filp = do_create(ipc_ns, root->d_inode, &path, oflag, mode, u_attr ? &attr : NULL); @@ -868,6 +869,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) if (IS_ERR(name)) return PTR_ERR(name); + audit_inode_parent_hidden(name, mnt->mnt_root); err = mnt_want_write(mnt); if (err) goto out_name; diff --git a/ipc/shm.c b/ipc/shm.c index 7e199fa1960f..85dc001634b1 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -491,10 +491,10 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf (name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) - & SHM_HUGE_MASK); + struct hstate *hs; size_t hugesize; + hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) { error = -EINVAL; goto no_file; diff --git a/kernel/audit.h b/kernel/audit.h index 1c95131ef760..123c9b7c3979 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -85,6 +85,7 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ + bool hidden; /* don't log this record */ bool name_put; /* call __putname()? */ unsigned long ino; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3f..43c307dc9453 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6bd4a90d1991..ea8550fb7585 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry) err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); + /* + * normally audit_add_tree_rule() will free it + * on failure + */ + if (tree) + audit_put_tree(tree); goto error; } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3c8a601324a2..9845cb32b60a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } i = 0; - list_for_each_entry(n, &context->names_list, list) + list_for_each_entry(n, &context->names_list, list) { + if (n->hidden) + continue; audit_log_name(context, n, NULL, i++, &call_panic); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name) * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * @parent: does this dentry represent the parent? + * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; struct audit_names *n; + bool parent = flags & AUDIT_INODE_PARENT; if (!context->in_syscall) return; @@ -1831,6 +1835,8 @@ out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; + if (flags & AUDIT_INODE_HIDDEN) + n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427e..198a38883e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -541,36 +562,6 @@ static int __init alloc_frozen_cpus(void) core_initcall(alloc_frozen_cpus); /* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. - */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - -/* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen * or not, as reported by the notification, remains unchanged *throughout the @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: diff --git a/kernel/exit.c b/kernel/exit.c index e59756275000..a2c19f24735d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -819,7 +819,7 @@ void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - ptrace_put_breakpoints(tsk); + flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index 987b28a1f01b..2bac9012484b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -524,7 +522,7 @@ static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); + INIT_RADIX_TREE(&mm->ioctx_rtree, GFP_KERNEL); #endif } @@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->nr_ptes = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); @@ -1199,8 +1195,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->real_cred->user != INIT_USER) + if (p->real_cred->user != INIT_USER && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_lock(lock); } @@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_unlock(lock); preempt_enable(); @@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_lock(lock); } @@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_unlock(lock); preempt_enable(); @@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) int i; preempt_disable(); - rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); @@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) { int i; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..97712319f128 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -399,8 +399,9 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); + pr_warn("------------[ cut here ]------------\n"); + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", + raw_smp_processor_id(), current->pid, file, line, caller); if (args) vprintk(args->fmt, args->args); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e0..c7f31aa272f7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline union cpu_time_count +static inline unsigned long long timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) { - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { - ret.cpu = timespec_to_cputime(tp); + ret = cputime_to_expires(timespec_to_cputime(tp)); } return ret; } static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, + unsigned long long expires, struct timespec *tp) { if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); + *tp = ns_to_timespec(expires); else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return now.cpu < then.cpu; - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched += val.sched; - } else { - acc->cpu += val.cpu; - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu -= b.cpu; - } - return a; + cputime_to_timespec((__force cputime_t)expires, tp); } /* @@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, * given the current clock sample. */ static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) + unsigned long long now) { int i; + unsigned long long delta, incr; - if (timer->it.cpu.incr.sched == 0) + if (timer->it.cpu.incr == 0) return; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; + if (now < timer->it.cpu.expires) + return; - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; - if (now.cpu < timer->it.cpu.expires.cpu) - return; - incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.cpu += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; } } @@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline cputime_t prof_ticks(struct task_struct *p) +static inline unsigned long long prof_ticks(struct task_struct *p) { cputime_t utime, stime; task_cputime(p, &utime, &stime); - return utime + stime; + return cputime_to_expires(utime + stime); } -static inline cputime_t virt_ticks(struct task_struct *p) +static inline unsigned long long virt_ticks(struct task_struct *p) { cputime_t utime; task_cputime(p, &utime, NULL); - return utime; + return cputime_to_expires(utime); } static int @@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) * Sample a per-thread clock for the given task. */ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); + *sample = prof_ticks(p); break; case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); + *sample = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + *sample = task_sched_runtime(p); break; } return 0; @@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: thread_group_cputime(p, &cputime); - cpu->sched = cputime.sum_exec_runtime; + *sample = cputime.sum_exec_runtime; break; } return 0; @@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; - union cpu_time_count rtn; + unsigned long long rtn; if (pid == 0) { /* @@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } +static void cleanup_timers_list(struct list_head *head, + unsigned long long curr) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) + list_del_init(&timer->entry); +} + /* * Clean out CPU timers still ticking when a thread exited. The task * pointer is cleared, and the expiry time is replaced with the residual @@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, unsigned long long sum_exec_runtime) { - struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; - - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - timer->expires.cpu = 0; - } else { - timer->expires.cpu -= ptime; - } - } - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; - } else { - timer->expires.cpu -= utime; - } - } + cputime_t ptime = utime + stime; - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; - } else { - timer->expires.sched -= sum_exec_runtime; - } - } + cleanup_timers_list(head, cputime_to_expires(ptime)); + cleanup_timers_list(++head, cputime_to_expires(utime)); + cleanup_timers_list(++head, sum_exec_runtime); } /* @@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) { + struct cpu_timer_list *timer = &itimer->it.cpu; + /* * That's all for this thread or process. * We leave our residual in expires to be reported. */ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); + put_task_struct(timer->task); + timer->task = NULL; + if (timer->expires < now) { + timer->expires = 0; + } else { + timer->expires -= now; + } } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer) listpos = head; list_for_each_entry(next, head, entry) { - if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + if (nt->expires < next->expires) break; listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { - union cpu_time_count *exp = &nt->expires; + unsigned long long exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp->cpu)) - cputime_expires->prof_exp = exp->cpu; + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp->cpu)) - cputime_expires->virt_exp = exp->cpu; + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); break; case CPUCLOCK_SCHED: if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp->sched) - cputime_expires->sched_exp = exp->sched; + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; break; } } @@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * User don't want any signal. */ - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { /* * One-shot timer. Clear it as soon as it's fired. */ posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, old_incr, val; + unsigned long long old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, } if (old) { - if (old_expires.sched == 0) { + if (old_expires == 0) { old->it_value.tv_sec = 0; old->it_value.tv_nsec = 0; } else { @@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * new setting. */ bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; sample_to_timespec(timer->it_clock, old_expires, &old->it_value); @@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, goto out; } - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); + if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + new_expires += val; } /* @@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * arm the timer (we'll just fake it for timer_gettime). */ timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires.sched != 0 && - !cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && !(val < new_expires)) { /* * The designated time already passed, so we notify * immediately, even if the thread never runs to @@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { - union cpu_time_count now; + unsigned long long now; struct task_struct *p = timer->it.cpu.task; int clear_dead; @@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) sample_to_timespec(timer->it_clock, timer->it.cpu.incr, &itp->it_interval); - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; return; } @@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) */ put_task_struct(p); timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto dead; } else { @@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + if (now < timer->it.cpu.expires) { sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), + timer->it.cpu.expires - now, &itp->it_value); } else { /* @@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } } +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; unsigned long soft; - maxfire = 20; - tsk->cputime_expires.prof_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.prof_exp = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.virt_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.virt_exp = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.sched_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->cputime_expires.sched_exp = t->expires.sched; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. @@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig) static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - cputime_t *expires, cputime_t cur_time, int signo) + unsigned long long *expires, + unsigned long long cur_time, int signo) { if (!it->expires) return; @@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, ptime, virt_expires, prof_expires; + unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; @@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; - maxfire = 20; - prof_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || ptime < tl->expires.cpu) { - prof_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - ++timers; - maxfire = 20; - virt_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || utime < tl->expires.cpu) { - virt_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); /* * Check for the special case process timers. @@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk, } } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk, void posix_cpu_timer_schedule(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; + unsigned long long now; if (unlikely(p == NULL)) /* @@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ put_task_struct(p); timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* @@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * not yet reaped. Take this opportunity to * drop our task ref. */ + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead_task(timer, now); goto out_unlock; } @@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - union cpu_time_count now; + unsigned long long now; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (*oldval <= now.cpu) { + if (*oldval <= now) { /* Just about to fire. */ *oldval = cputime_one_jiffy; } else { - *oldval -= now.cpu; + *oldval -= now; } } if (!*newval) goto out; - *newval += now.cpu; + *newval += now; } /* @@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { + if (timer.it.cpu.expires == 0) { /* * Our timer fired and was reset, below * deletion can not fail. diff --git a/kernel/printk.c b/kernel/printk.c index 7b254c33f672..0a4095e2fcda 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -363,6 +363,53 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. + */ + if (capable(CAP_SYS_ADMIN)) { + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. - */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1249,7 +1258,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69c..0ea7f225ba95 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + flush_ptrace_hw_breakpoint(child); write_lock_irq(&tasklist_lock); /* @@ -1179,19 +1180,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ - if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) - return 0; - - return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ - if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) - flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ diff --git a/kernel/relay.c b/kernel/relay.c index b91488ba2e5a..e03440ba0f20 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -339,6 +339,10 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); + /* + * Stupid polling for now: + */ + mod_timer(&buf->timer, jiffies + 1); } /** @@ -356,6 +360,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -739,15 +744,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. - */ - mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/smp.c b/kernel/smp.c index 4dba0f7b72ad..97084cfc61ca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -12,6 +12,7 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <linux/hardirq.h> #include "smpboot.h" @@ -240,8 +241,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + WARN_ON_ONCE(cpu_online(this_cpu) + && (irqs_disabled() || in_serving_irq()) + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -378,8 +380,9 @@ void smp_call_function_many(const struct cpumask *mask, * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress && !early_boot_irqs_disabled); + WARN_ON_ONCE(cpu_online(this_cpu) + && (irqs_disabled() || in_serving_irq()) + && !oops_in_progress && !early_boot_irqs_disabled); /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0dad..a5f88362589b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_DISABLE_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); @@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt) WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == cnt) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); + trace_softirqs_on(_RET_IP_); sub_preempt_count(cnt); } @@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) void local_bh_enable(void) { - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + _local_bh_enable_ip(_RET_IP_); } EXPORT_SYMBOL(local_bh_enable); @@ -223,8 +222,7 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba21..2bbd9a73b54c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* Add backwards compatibility for stable trees. */ +#ifndef PF_NO_SETAFFINITY +#define PF_NO_SETAFFINITY PF_THREAD_BOUND +#endif + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); @@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -419,7 +442,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 05039e348f07..ea741c32d596 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -239,10 +239,12 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) + if (hardlockup_panic) { + trigger_all_cpu_backtrace(); panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + } __this_cpu_write(hard_watchdog_warn, true); return; @@ -323,8 +325,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); - if (softlockup_panic) + if (softlockup_panic) { + trigger_all_cpu_backtrace(); panic("softlockup: hung tasks"); + } __this_cpu_write(soft_watchdog_warn, true); } else __this_cpu_write(soft_watchdog_warn, false); diff --git a/lib/Kconfig b/lib/Kconfig index 339d5a4292f8..c2ce4fd8b12b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -191,6 +191,15 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +config LZ4_COMPRESS + tristate + +config LZ4HC_COMPRESS + tristate + +config LZ4_DECOMPRESS + tristate + source "lib/xz/Kconfig" # @@ -215,6 +224,10 @@ config DECOMPRESS_LZO select LZO_DECOMPRESS tristate +config DECOMPRESS_LZ4 + select LZ4_DECOMPRESS + tristate + # # Generic allocator support is selected if needed # diff --git a/lib/Makefile b/lib/Makefile index c55a037a354e..af911db40b88 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o + earlycpio.o percpu-refcount.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o @@ -23,7 +23,7 @@ lib-y += kobject.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o\ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o @@ -75,6 +75,9 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_LZ4_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ @@ -83,6 +86,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o +lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o diff --git a/lib/bitmap.c b/lib/bitmap.c index 06f7e4fe8d2d..ea190b7047c3 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1114,14 +1114,23 @@ done: */ int bitmap_find_free_region(unsigned long *bitmap, int bits, int order) { - int pos, end; /* scans bitmap by regions of size order */ + int nlongs = BITS_TO_LONGS(bits); + int i; - for (pos = 0 ; (end = pos + (1 << order)) <= bits; pos = end) { - if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) - continue; - __reg_op(bitmap, pos, order, REG_OP_ALLOC); - return pos; - } + for (i = 0; i < nlongs; i++) + if (bitmap[i] != ~0UL) { + int pos = i * BITS_PER_LONG; + int nbit = min(bits, pos + BITS_PER_LONG); + int end; + + for (; (end = pos + (1 << order)) <= nbit; pos = end) { + if (!__reg_op(bitmap, pos, order, + REG_OP_ISFREE)) + continue; + __reg_op(bitmap, pos, order, REG_OP_ALLOC); + return pos; + } + } return -ENOMEM; } EXPORT_SYMBOL(bitmap_find_free_region); diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c new file mode 100644 index 000000000000..a8f8379eb49f --- /dev/null +++ b/lib/clz_ctz.c @@ -0,0 +1,58 @@ +/* + * lib/clz_ctz.c + * + * Copyright (C) 2013 Chanho Min <chanho.min@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * __c[lt]z[sd]i2 can be overridden by linking arch-specific versions. + */ + +#include <linux/export.h> +#include <linux/kernel.h> + +int __weak __ctzsi2(int val) +{ + return __ffs(val); +} +EXPORT_SYMBOL(__ctzsi2); + +int __weak __clzsi2(int val) +{ + return 32 - fls(val); +} +EXPORT_SYMBOL(__clzsi2); + +#if BITS_PER_LONG == 32 + +int __weak __clzdi2(long val) +{ + return 32 - fls((int)val); +} +EXPORT_SYMBOL(__clzdi2); + +int __weak __ctzdi2(long val) +{ + return __ffs((u32)val); +} +EXPORT_SYMBOL(__ctzdi2); + +#elif BITS_PER_LONG == 64 + +int __weak __clzdi2(long val) +{ + return 64 - fls64((u64)val); +} +EXPORT_SYMBOL(__clzdi2); + +int __weak __ctzdi2(long val) +{ + return __ffs64((u64)val); +} +EXPORT_SYMBOL(__ctzdi2); + +#else +#error BITS_PER_LONG not 32 or 64 +#endif diff --git a/lib/decompress.c b/lib/decompress.c index f8fdedaf7b3d..4d1cd0397aab 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -11,6 +11,7 @@ #include <linux/decompress/unxz.h> #include <linux/decompress/inflate.h> #include <linux/decompress/unlzo.h> +#include <linux/decompress/unlz4.h> #include <linux/types.h> #include <linux/string.h> @@ -31,6 +32,9 @@ #ifndef CONFIG_DECOMPRESS_LZO # define unlzo NULL #endif +#ifndef CONFIG_DECOMPRESS_LZ4 +# define unlz4 NULL +#endif struct compress_format { unsigned char magic[2]; @@ -45,6 +49,7 @@ static const struct compress_format compressed_formats[] __initconst = { { {0x5d, 0x00}, "lzma", unlzma }, { {0xfd, 0x37}, "xz", unxz }, { {0x89, 0x4c}, "lzo", unlzo }, + { {0x02, 0x21}, "lz4", unlz4 }, { {0, 0}, NULL, NULL } }; diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c new file mode 100644 index 000000000000..3e67cfad16ad --- /dev/null +++ b/lib/decompress_unlz4.c @@ -0,0 +1,187 @@ +/* + * Wrapper for decompressing LZ4-compressed kernel, initramfs, and initrd + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifdef STATIC +#define PREBOOT +#include "lz4/lz4_decompress.c" +#else +#include <linux/decompress/unlz4.h> +#endif +#include <linux/types.h> +#include <linux/lz4.h> +#include <linux/decompress/mm.h> +#include <linux/compiler.h> + +#include <asm/unaligned.h> + +/* + * Note: Uncompressed chunk size is used in the compressor side + * (userspace side for compression). + * It is hardcoded because there is not proper way to extract it + * from the binary stream which is generated by the preliminary + * version of LZ4 tool so far. + */ +#define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20) +#define ARCHIVE_MAGICNUMBER 0x184C2102 + +STATIC inline int INIT unlz4(u8 *input, int in_len, + int (*fill) (void *, unsigned int), + int (*flush) (void *, unsigned int), + u8 *output, int *posp, + void (*error) (char *x)) +{ + int ret = -1; + size_t chunksize = 0; + size_t uncomp_chunksize = LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE; + u8 *inp; + u8 *inp_start; + u8 *outp; + int size = in_len; +#ifdef PREBOOT + size_t out_len = get_unaligned_le32(input + in_len); +#endif + size_t dest_len; + + + if (output) { + outp = output; + } else if (!flush) { + error("NULL output pointer and no flush function provided"); + goto exit_0; + } else { + outp = large_malloc(uncomp_chunksize); + if (!outp) { + error("Could not allocate output buffer"); + goto exit_0; + } + } + + if (input && fill) { + error("Both input pointer and fill function provided,"); + goto exit_1; + } else if (input) { + inp = input; + } else if (!fill) { + error("NULL input pointer and missing fill function"); + goto exit_1; + } else { + inp = large_malloc(lz4_compressbound(uncomp_chunksize)); + if (!inp) { + error("Could not allocate input buffer"); + goto exit_1; + } + } + inp_start = inp; + + if (posp) + *posp = 0; + + if (fill) + fill(inp, 4); + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + } else { + error("invalid header"); + goto exit_2; + } + + if (posp) + *posp += 4; + + for (;;) { + + if (fill) + fill(inp, 4); + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + if (posp) + *posp += 4; + continue; + } + inp += 4; + size -= 4; + + if (posp) + *posp += 4; + + if (fill) { + if (chunksize > lz4_compressbound(uncomp_chunksize)) { + error("chunk length is longer than allocated"); + goto exit_2; + } + fill(inp, chunksize); + } +#ifdef PREBOOT + if (out_len >= uncomp_chunksize) { + dest_len = uncomp_chunksize; + out_len -= dest_len; + } else + dest_len = out_len; + ret = lz4_decompress(inp, &chunksize, outp, dest_len); +#else + dest_len = uncomp_chunksize; + ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp, + &dest_len); +#endif + if (ret < 0) { + error("Decoding failed"); + goto exit_2; + } + + if (flush && flush(outp, dest_len) != dest_len) + goto exit_2; + if (output) + outp += dest_len; + if (posp) + *posp += chunksize; + + size -= chunksize; + + if (size == 0) + break; + else if (size < 0) { + error("data corrupted"); + goto exit_2; + } + + inp += chunksize; + if (fill) + inp = inp_start; + } + + ret = 0; +exit_2: + if (!input) + large_free(inp_start); +exit_1: + if (!output) + large_free(outp); +exit_0: + return ret; +} + +#ifdef PREBOOT +STATIC int INIT decompress(unsigned char *buf, int in_len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *posp, + void(*error)(char *x) + ) +{ + return unlz4(buf, in_len - 4, fill, flush, output, posp, error); +} +#endif diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 53bad099ebd6..c03154173cc7 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -6,15 +6,58 @@ #include <linux/kernel.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/smp.h> +#include <linux/atomic.h> + +static void __dump_stack(void) +{ + dump_stack_print_info(KERN_DEFAULT); + show_stack(NULL, NULL); +} /** * dump_stack - dump the current task information and its stack trace * * Architectures can override this implementation by implementing its own. */ +#ifdef CONFIG_SMP +static atomic_t dump_lock = ATOMIC_INIT(-1); + void dump_stack(void) { - dump_stack_print_info(KERN_DEFAULT); - show_stack(NULL, NULL); + int was_locked; + int old; + int cpu; + + /* + * Permit this cpu to perform nested stack dumps while serialising + * against other CPUs + */ + preempt_disable(); + +retry: + cpu = smp_processor_id(); + old = atomic_cmpxchg(&dump_lock, -1, cpu); + if (old == -1) { + was_locked = 0; + } else if (old == cpu) { + was_locked = 1; + } else { + cpu_relax(); + goto retry; + } + + __dump_stack(); + + if (!was_locked) + atomic_set(&dump_lock, -1); + + preempt_enable(); +} +#else +void dump_stack(void) +{ + __dump_stack(); } +#endif EXPORT_SYMBOL(dump_stack); diff --git a/lib/idr.c b/lib/idr.c index cca4b9302a71..bfe4db4e165f 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -524,9 +524,7 @@ EXPORT_SYMBOL(idr_alloc_cyclic); static void idr_remove_warning(int id) { - printk(KERN_WARNING - "idr_remove called for id=%d which is not allocated.\n", id); - dump_stack(); + WARN(1, "idr_remove called for id=%d which is not allocated.\n", id); } static void sub_remove(struct idr *idp, int shift, int id) @@ -1064,8 +1062,7 @@ void ida_remove(struct ida *ida, int id) return; err: - printk(KERN_WARNING - "ida_remove called for id=%d which is not allocated.\n", id); + WARN(1, "ida_remove called for id=%d which is not allocated.\n", id); } EXPORT_SYMBOL(ida_remove); diff --git a/lib/interval_tree.c b/lib/interval_tree.c index e6eb406f2d65..1efe7ab86dc1 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,3 +1,4 @@ +#include <linux/kernel.h> #include <linux/init.h> #include <linux/interval_tree.h> #include <linux/interval_tree_generic.h> diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile new file mode 100644 index 000000000000..8085d04e9309 --- /dev/null +++ b/lib/lz4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c new file mode 100644 index 000000000000..fd94058bd7f9 --- /dev/null +++ b/lib/lz4/lz4_compress.c @@ -0,0 +1,443 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min <chanho.min@lge.com> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/lz4.h> +#include <asm/unaligned.h> +#include "lz4defs.h" + +/* + * LZ4_compressCtx : + * ----------------- + * Compress 'isize' bytes from 'source' into an output buffer 'dest' of + * maximum size 'maxOutputSize'. * If it cannot achieve it, compression + * will stop, and result of the function will be zero. + * return : the number of bytes written in buffer 'dest', or 0 if the + * compression fails + */ +static inline int lz4_compressctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + HTYPE *hashtable = (HTYPE *)ctx; + const u8 *ip = (u8 *)source; +#if LZ4_ARCH64 + const BYTE * const base = ip; +#else + const int base = 0; +#endif + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardh = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (unlikely(forwardip > mflimit)) + goto _last_literals; + + forwardh = LZ4_HASH_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = ip - base; + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + + if (length >= (int)RUN_MASK) { + int len; + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + /* Encode MatchLength */ + length = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend)) + return 0; + if (length >= (int)ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length -= 510) { + *op++ = 255; + *op++ = 255; + } + if (length > 254) { + length -= 255; + *op++ = 255; + } + *op++ = (u8)length; + } else + *token += length; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + /* Test next position */ + ref = base + hashtable[LZ4_HASH_VALUE(ip)]; + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (((char *)op - dest) + lastrun + 1 + + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize) + return 0; + + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + + /* End */ + return (int)(((char *)op) - dest); +} + +static inline int lz4_compress64kctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + u16 *hashtable = (u16 *)ctx; + const u8 *ip = (u8 *) source; + const u8 *anchor = ip; + const u8 *const base = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int len, length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (forwardip > mflimit) + goto _last_literals; + + forwardh = LZ4_HASH64K_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = (u16)(ip - base); + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) + && (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + + while (ip < MATCHLIMIT - (STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + + /* Encode MatchLength */ + len = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return 0; + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (u8)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base); + + /* Test next position */ + ref = base + hashtable[LZ4_HASH64K_VALUE(ip)]; + hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base); + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend) + return 0; + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int)(((char *)op) - dest); +} + +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + if (src_len < LZ4_64KLIMIT) + out_len = lz4_compress64kctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + else + out_len = lz4_compressctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + + return 0; +exit: + return ret; +} +EXPORT_SYMBOL_GPL(lz4_compress); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c new file mode 100644 index 000000000000..d3414eae73a1 --- /dev/null +++ b/lib/lz4/lz4_decompress.c @@ -0,0 +1,326 @@ +/* + * LZ4 Decompressor for Linux kernel + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * Based on LZ4 implementation by Yann Collet. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#ifndef STATIC +#include <linux/module.h> +#include <linux/kernel.h> +#endif +#include <linux/lz4.h> + +#include <asm/unaligned.h> + +#include "lz4defs.h" + +static int lz4_uncompress(const char *source, char *dest, int osize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *ref; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + osize; + BYTE *cpy; + unsigned token; + size_t length; + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + while (1) { + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + size_t len; + + len = *ip++; + for (; len == 255; length += 255) + len = *ip++; + length += len; + } + + /* copy literals */ + cpy = op + length; + if (unlikely(cpy > oend - COPYLENGTH)) { + /* + * Error: not enough place for another match + * (min 4) + 5 literals + */ + if (cpy != oend) + goto _output_error; + + memcpy(op, ip, length); + ip += length; + break; /* EOF */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + + /* Error: offset create reference outside destination buffer */ + if (unlikely(ref < (BYTE *const) dest)) + goto _output_error; + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + for (; *ip == 255; length += 255) + ip++; + length += *ip++; + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > (oend - COPYLENGTH)) { + + /* Error: request to write beyond destination buffer */ + if (cpy > oend) + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *)ip) - source); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *)ip) - source)); +} + +static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, + int isize, size_t maxoutputsize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxoutputsize; + BYTE *cpy; + + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + /* Main Loop */ + while (ip < iend) { + + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + length += s; + } + } + /* copy literals */ + cpy = op + length; + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + + if (cpy > oend) + goto _output_error;/* writes beyond buffer */ + + if (ip + length != iend) + goto _output_error;/* + * Error: LZ4 format requires + * to consume all input + * at this stage + */ + memcpy(op, ip, length); + op += length; + break;/* Necessarily EOF, due to parsing restrictions */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + goto _output_error; + /* + * Error : offset creates reference + * outside of destination buffer + */ + + /* get matchlength */ + length = (token & ML_MASK); + if (length == ML_MASK) { + while (ip < iend) { + int s = *ip++; + length += s; + if (s == 255) + continue; + break; + } + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op - ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE-4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + goto _output_error; /* write outside of buf */ + + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *) op) - dest); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *) ip) - source)); +} + +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len) +{ + int ret = -1; + int input_len = 0; + + input_len = lz4_uncompress(src, dest, actual_dest_len); + if (input_len < 0) + goto exit_0; + *src_len = input_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress); +#endif + +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len) +{ + int ret = -1; + int out_len = 0; + + out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, + *dest_len); + if (out_len < 0) + goto exit_0; + *dest_len = out_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress_unknownoutputsize); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Decompressor"); +#endif diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h new file mode 100644 index 000000000000..abcecdc2d0f2 --- /dev/null +++ b/lib/lz4/lz4defs.h @@ -0,0 +1,156 @@ +/* + * lz4defs.h -- architecture specific defines + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Detects 64 bits mode + */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \ + || defined(__ppc64__) || defined(__LP64__)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Architecture-specific macros + */ +#define BYTE u8 +typedef struct _U16_S { u16 v; } U16_S; +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ + && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) + +#define PUT4(s, d) (A32(d) = A32(s)) +#define PUT8(s, d) (A64(d) = A64(s)) +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + A16(p) = v; \ + p += 2; \ + } while (0) +#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + +#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v)) +#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) +#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) + +#define PUT4(s, d) \ + put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) +#define PUT8(s, d) \ + put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) + +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + put_unaligned(v, (u16 *)(p)); \ + p += 2; \ + } while (0) +#endif + +#define COPYLENGTH 8 +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) +#define MEMORY_USAGE 14 +#define MINMATCH 4 +#define SKIPSTRENGTH 6 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) +#define MAXD_LOG 16 +#define MAXD (1 << MAXD_LOG) +#define MAXD_MASK (u32)(MAXD - 1) +#define MAX_DISTANCE (MAXD - 1) +#define HASH_LOG (MAXD_LOG - 1) +#define HASHTABLESIZE (1 << HASH_LOG) +#define MAX_NB_ATTEMPTS 256 +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) +#define HASHLOG64K ((MEMORY_USAGE - 2) + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - (MEMORY_USAGE-2))) +#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASHLOG64K)) +#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASH_LOG)) + +#if LZ4_ARCH64/* 64-bit */ +#define STEPSIZE 8 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT8(s, d); \ + d += 8; \ + s += 8; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) + +#define LZ4_SECURECOPY(s, d, e) \ + do { \ + if (d < e) { \ + LZ4_WILDCOPY(s, d, e); \ + } \ + } while (0) +#define HTYPE u32 + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) +#endif + +#else /* 32-bit */ +#define STEPSIZE 4 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT4(s, d); \ + d += 4; \ + s += 4; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) \ + do { \ + LZ4_COPYSTEP(s, d); \ + LZ4_COPYSTEP(s, d); \ + } while (0) + +#define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const u8* + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) +#endif + +#endif + +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + (d = s - get_unaligned_le16(p)) + +#define LZ4_WILDCOPY(s, d, e) \ + do { \ + LZ4_COPYPACKET(s, d); \ + } while (d < e) + +#define LZ4_BLINDCOPY(s, d, l) \ + do { \ + u8 *e = (d) + l; \ + LZ4_WILDCOPY(s, d, e); \ + d = e; \ + } while (0) diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c new file mode 100644 index 000000000000..eb1a74f5e368 --- /dev/null +++ b/lib/lz4/lz4hc_compress.c @@ -0,0 +1,539 @@ +/* + * LZ4 HC - High Compression Mode of LZ4 + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min <chanho.min@lge.com> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/lz4.h> +#include <asm/unaligned.h> +#include "lz4defs.h" + +struct lz4hc_data { + const u8 *base; + HTYPE hashtable[HASHTABLESIZE]; + u16 chaintable[MAXD]; + const u8 *nexttoupdate; +} __attribute__((__packed__)); + +static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) +{ + memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); + memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); + +#if LZ4_ARCH64 + hc4->nexttoupdate = base + 1; +#else + hc4->nexttoupdate = base; +#endif + hc4->base = base; + return 1; +} + +/* Update chains up to ip (excluded) */ +static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) +{ + u16 *chaintable = hc4->chaintable; + HTYPE *hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + + while (hc4->nexttoupdate < ip) { + const u8 *p = hc4->nexttoupdate; + size_t delta = p - (hashtable[HASH_VALUE(p)] + base); + if (delta > MAX_DISTANCE) + delta = MAX_DISTANCE; + chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; + hashtable[HASH_VALUE(p)] = (p) - base; + hc4->nexttoupdate++; + } +} + +static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2, + const u8 *const matchlimit) +{ + const u8 *p1t = p1; + + while (p1t < matchlimit - (STEPSIZE - 1)) { +#if LZ4_ARCH64 + u64 diff = A64(p2) ^ A64(p1t); +#else + u32 diff = A32(p2) ^ A32(p1t); +#endif + if (!diff) { + p1t += STEPSIZE; + p2 += STEPSIZE; + continue; + } + p1t += LZ4_NBCOMMONBYTES(diff); + return p1t - p1; + } +#if LZ4_ARCH64 + if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) { + p1t += 4; + p2 += 4; + } +#endif + + if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) { + p1t += 2; + p2 += 2; + } + if ((p1t < matchlimit) && (*p2 == *p1t)) + p1t++; + return p1t - p1; +} + +static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; + const u8 *ref; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + int nbattempts = MAX_NB_ATTEMPTS; + size_t repl = 0, ml = 0; + u16 delta; + + /* HC4 match finder */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + /* potential repetition */ + if (ref >= ip-4) { + /* confirmed */ + if (A32(ref) == A32(ip)) { + delta = (u16)(ip-ref); + repl = ml = lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + *matchpos = ref; + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + while ((ref >= ip - MAX_DISTANCE) && nbattempts) { + nbattempts--; + if (*(ref + ml) == *(ip + ml)) { + if (A32(ref) == A32(ip)) { + size_t mlt = + lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + if (mlt > ml) { + ml = mlt; + *matchpos = ref; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + /* Complete table */ + if (repl) { + const BYTE *ptr = ip; + const BYTE *end; + end = ip + repl - (MINMATCH-1); + /* Pre-Load */ + while (ptr < end - delta) { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + ptr++; + } + do { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + /* Head of chain */ + hashtable[HASH_VALUE(ptr)] = (ptr) - base; + ptr++; + } while (ptr < end); + hc4->nexttoupdate = end; + } + + return (int)ml; +} + +static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, + const u8 **matchpos, const u8 **startpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + const u8 *ref; + int nbattempts = MAX_NB_ATTEMPTS; + int delta = (int)(ip - startlimit); + + /* First Match */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) + && (nbattempts)) { + nbattempts--; + if (*(startlimit + longest) == *(ref - delta + longest)) { + if (A32(ref) == A32(ip)) { + const u8 *reft = ref + MINMATCH; + const u8 *ipt = ip + MINMATCH; + const u8 *startt = ip; + + while (ipt < matchlimit-(STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(reft) ^ A64(ipt); + #else + u32 diff = A32(reft) ^ A32(ipt); + #endif + + if (!diff) { + ipt += STEPSIZE; + reft += STEPSIZE; + continue; + } + ipt += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ipt < (matchlimit - 3)) + && (A32(reft) == A32(ipt))) { + ipt += 4; + reft += 4; + } + ipt += 2; + #endif + if ((ipt < (matchlimit - 1)) + && (A16(reft) == A16(ipt))) { + reft += 2; + } + if ((ipt < matchlimit) && (*reft == *ipt)) + ipt++; +_endcount: + reft = ref; + + while ((startt > startlimit) + && (reft > hc4->base) + && (startt[-1] == reft[-1])) { + startt--; + reft--; + } + + if ((ipt - startt) > longest) { + longest = (int)(ipt - startt); + *matchpos = reft; + *startpos = startt; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + return longest; +} + +static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, + int ml, const u8 *ref) +{ + int length, len; + u8 *token; + + /* Encode Literal length */ + length = (int)(*ip - *anchor); + token = (*op)++; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *(*op)++ = 255; + *(*op)++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(*anchor, *op, length); + + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref)); + + /* Encode MatchLength */ + len = (int)(ml - MINMATCH); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *(*op)++ = 255; + *(*op)++ = 255; + } + if (len > 254) { + len -= 255; + *(*op)++ = 255; + } + *(*op)++ = (u8)len; + } else + *token += len; + + /* Prepare next loop */ + *ip += ml; + *anchor = *ip; + + return 0; +} + +static int lz4_compresshcctx(struct lz4hc_data *ctx, + const char *source, + char *dest, + int isize) +{ + const u8 *ip = (const u8 *)source; + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + const u8 *const matchlimit = (iend - LASTLITERALS); + + u8 *op = (u8 *)dest; + + int ml, ml2, ml3, ml0; + const u8 *ref = NULL; + const u8 *start2 = NULL; + const u8 *ref2 = NULL; + const u8 *start3 = NULL; + const u8 *ref3 = NULL; + const u8 *start0; + const u8 *ref0; + int lastrun; + + ip++; + + /* Main Loop */ + while (ip < mflimit) { + ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); + if (!ml) { + ip++; + continue; + } + + /* saved, in case we would skip too much */ + start0 = ip; + ref0 = ref; + ml0 = ml; +_search2: + if (ip+ml < mflimit) + ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, + ip + 1, matchlimit, ml, &ref2, &start2); + else + ml2 = ml; + /* No better match */ + if (ml2 == ml) { + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + continue; + } + + if (start0 < ip) { + /* empirical */ + if (start2 < ip + ml0) { + ip = start0; + ref = ref0; + ml = ml0; + } + } + /* + * Here, start0==ip + * First Match too small : removed + */ + if ((start2 - ip) < 3) { + ml = ml2; + ip = start2; + ref = ref2; + goto _search2; + } + +_search3: + /* + * Currently we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) + */ + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) + new_ml = OPTIMAL_ML; + if (ip + new_ml > start2 + ml2 - MINMATCH) + new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* + * Now, we have start2 = ip+new_ml, + * with new_ml=min(ml, OPTIMAL_ML=18) + */ + if (start2 + ml2 < mflimit) + ml3 = lz4hc_insertandgetwidermatch(ctx, + start2 + ml2 - 3, start2, matchlimit, + ml2, &ref3, &start3); + else + ml3 = ml2; + + /* No better match : 2 sequences to encode */ + if (ml3 == ml2) { + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) + ml = (int)(start2 - ip); + + /* Now, encode 2 sequences */ + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start2; + lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); + continue; + } + + /* Not enough space for match 2 : remove it */ + if (start3 < ip + ml + 3) { + /* + * can write Seq1 immediately ==> Seq2 is removed, + * so Seq3 becomes Seq1 + */ + if (start3 >= (ip + ml)) { + if (start2 < ip + ml) { + int correction = + (int)(ip + ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _search3; + } + + /* + * OK, now we have 3 ascending matches; let's write at least + * the first one ip & ref are known; Now for ml + */ + if (start2 < ip + ml) { + if ((start2 - ip) < (int)ML_MASK) { + int correction; + if (ml > OPTIMAL_ML) + ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) + ml = (int)(start2 - ip) + ml2 + - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } else + ml = (int)(start2 - ip); + } + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + + ip = start2; + ref = ref2; + ml = ml2; + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + + goto _search3; + } + + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8) lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int) (((char *)op) - dest); +} + +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; + lz4hc_init(hc4, (const u8 *)src); + out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, + (char *)dst, (int)src_len); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + return 0; + +exit: + return ret; +} +EXPORT_SYMBOL_GPL(lz4hc_compress); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC compressor"); diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c new file mode 100644 index 000000000000..b2bfa1057f7b --- /dev/null +++ b/lib/percpu-refcount.c @@ -0,0 +1,251 @@ +#define pr_fmt(fmt) "%s: " fmt "\n", __func__ + +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/percpu-refcount.h> +#include <linux/rcupdate.h> + +/* + * A percpu refcount can be in 4 different modes. The state is tracked in the + * low two bits of percpu_ref->pcpu_count: + * + * PCPU_REF_NONE - the initial state, no percpu counters allocated. + * + * PCPU_REF_PTR - using percpu counters for the refcount. + * + * PCPU_REF_DYING - we're shutting down so get()/put() should use the embedded + * atomic counter, but we're not finished updating the atomic counter from the + * percpu counters - this means that percpu_ref_put() can't check for the ref + * hitting 0 yet. + * + * PCPU_REF_DEAD - we've finished the teardown sequence, percpu_ref_put() should + * now check for the ref hitting 0. + * + * In PCPU_REF_NONE mode, we need to count the number of times percpu_ref_get() + * is called; this is done with the high bits of the raw atomic counter. We also + * track the time, in jiffies, when the get count last wrapped - this is done + * with the remaining bits of percpu_ref->percpu_count. + * + * So, when percpu_ref_get() is called it increments the get count and checks if + * it wrapped; if it did, it checks if the last time it wrapped was less than + * one second ago; if so, we want to allocate percpu counters. + * + * PCPU_COUNT_BITS determines the threshold where we convert to percpu: of the + * raw 64 bit counter, we use PCPU_COUNT_BITS for the refcount, and the + * remaining (high) bits to count the number of times percpu_ref_get() has been + * called. It's currently (completely arbitrarily) 16384 times in one second. + * + * Percpu mode (PCPU_REF_PTR): + * + * In percpu mode all we do on get and put is increment or decrement the cpu + * local counter, which is a 32 bit unsigned int. + * + * Note that all the gets() could be happening on one cpu, and all the puts() on + * another - the individual cpu counters can wrap (potentially many times). + * + * But this is fine because we don't need to check for the ref hitting 0 in + * percpu mode; before we set the state to PCPU_REF_DEAD we simply sum up all + * the percpu counters and add them to the atomic counter. Since addition and + * subtraction in modular arithmatic is still associative, the result will be + * correct. + */ + +#define PCPU_COUNT_BITS 50 +#define PCPU_COUNT_MASK ((1LL << PCPU_COUNT_BITS) - 1) + +#define PCPU_STATUS_BITS 2 +#define PCPU_STATUS_MASK ((1 << PCPU_STATUS_BITS) - 1) + +#define PCPU_REF_PTR 0 +#define PCPU_REF_NONE 1 +#define PCPU_REF_DYING 2 +#define PCPU_REF_DEAD 3 + +#define REF_STATUS(count) (count & PCPU_STATUS_MASK) + +/** + * percpu_ref_init - initialize a dynamic percpu refcount + * + * Initializes the refcount in single atomic counter mode with a refcount of 1; + * analagous to atomic_set(ref, 1). + */ +void percpu_ref_init(struct percpu_ref *ref) +{ + unsigned long now = jiffies; + + atomic64_set(&ref->count, 1); + + now <<= PCPU_STATUS_BITS; + now |= PCPU_REF_NONE; + + ref->pcpu_count = now; +} + +static void percpu_ref_alloc(struct percpu_ref *ref, unsigned long pcpu_count) +{ + unsigned long new, now = jiffies; + + now <<= PCPU_STATUS_BITS; + now |= PCPU_REF_NONE; + + if (now - pcpu_count <= HZ << PCPU_STATUS_BITS) { + rcu_read_unlock(); + new = (unsigned long) alloc_percpu(unsigned); + rcu_read_lock(); + + if (!new) + goto update_time; + + BUG_ON(new & PCPU_STATUS_MASK); + + if (cmpxchg(&ref->pcpu_count, pcpu_count, new) != pcpu_count) + free_percpu((void __percpu *) new); + else + pr_debug("created"); + } else { +update_time: + new = now; + cmpxchg(&ref->pcpu_count, pcpu_count, new); + } +} + +void __percpu_ref_get(struct percpu_ref *ref, bool alloc) +{ + unsigned long pcpu_count; + uint64_t v; + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + if (REF_STATUS(pcpu_count) == PCPU_REF_PTR) { + /* for rcu - we're not using rcu_dereference() */ + smp_read_barrier_depends(); + __this_cpu_inc(*((unsigned __percpu *) pcpu_count)); + } else { + v = atomic64_add_return(1 + (1ULL << PCPU_COUNT_BITS), + &ref->count); + + /* + * The high bits of the counter count the number of gets() that + * have occured; we check for overflow to call + * percpu_ref_alloc() every (1 << (64 - PCPU_COUNT_BITS)) + * iterations. + */ + + if (!(v >> PCPU_COUNT_BITS) && + REF_STATUS(pcpu_count) == PCPU_REF_NONE && alloc) + percpu_ref_alloc(ref, pcpu_count); + } +} + +/** + * percpu_ref_put - decrement a dynamic percpu refcount + * + * Returns true if the result is 0, otherwise false; only checks for the ref + * hitting 0 after percpu_ref_kill() has been called. Analagous to + * atomic_dec_and_test(). + */ +int percpu_ref_put(struct percpu_ref *ref) +{ + unsigned long pcpu_count; + uint64_t v; + int ret = 0; + + rcu_read_lock(); + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + switch (REF_STATUS(pcpu_count)) { + case PCPU_REF_PTR: + /* for rcu - we're not using rcu_dereference() */ + smp_read_barrier_depends(); + __this_cpu_dec(*((unsigned __percpu *) pcpu_count)); + break; + case PCPU_REF_NONE: + case PCPU_REF_DYING: + atomic64_dec(&ref->count); + break; + case PCPU_REF_DEAD: + v = atomic64_dec_return(&ref->count); + v &= PCPU_COUNT_MASK; + + ret = v == 0; + break; + } + + rcu_read_unlock(); + + return ret; +} + +/** + * percpu_ref_kill - prepare a dynamic percpu refcount for teardown + * + * Must be called before dropping the initial ref, so that percpu_ref_put() + * knows to check for the refcount hitting 0. If the refcount was in percpu + * mode, converts it back to single atomic counter mode. + * + * Returns true the first time called on @ref and false if @ref is already + * shutting down, so it may be used by the caller for synchronizing other parts + * of a two stage shutdown. + */ +int percpu_ref_kill(struct percpu_ref *ref) +{ + unsigned long old, new, status, pcpu_count; + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + do { + status = REF_STATUS(pcpu_count); + + switch (status) { + case PCPU_REF_PTR: + new = PCPU_REF_DYING; + break; + case PCPU_REF_NONE: + new = PCPU_REF_DEAD; + break; + case PCPU_REF_DYING: + case PCPU_REF_DEAD: + return 0; + } + + old = pcpu_count; + pcpu_count = cmpxchg(&ref->pcpu_count, old, new); + } while (pcpu_count != old); + + if (status == PCPU_REF_PTR) { + unsigned count = 0, cpu; + + synchronize_rcu(); + + for_each_possible_cpu(cpu) + count += *per_cpu_ptr((unsigned __percpu *)pcpu_count, + cpu); + + pr_debug("global %lli pcpu %i", + atomic64_read(&ref->count) & PCPU_COUNT_MASK, + (int) count); + + atomic64_add((int) count, &ref->count); + smp_wmb(); + /* Between setting global count and setting PCPU_REF_DEAD */ + ref->pcpu_count = PCPU_REF_DEAD; + + free_percpu((unsigned __percpu *) pcpu_count); + } + + return 1; +} + +/** + * percpu_ref_dead - check if a dynamic percpu refcount is shutting down + * + * Returns true if percpu_ref_kill() has been called on @ref, false otherwise. + */ +int percpu_ref_dead(struct percpu_ref *ref) +{ + unsigned status = REF_STATUS(ref->pcpu_count); + + return status == PCPU_REF_DYING || + status == PCPU_REF_DEAD; +} diff --git a/mm/Kconfig b/mm/Kconfig index e742d06285b7..b0a7dad663f3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -477,3 +477,15 @@ config FRONTSWAP and swap data is stored as normal on the matching swap device. If unsure, say Y to enable frontswap. + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 07dbc8ec46cf..2c8ce496804f 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -242,6 +242,7 @@ bool balloon_page_isolate(struct page *page) if (__is_movable_balloon_page(page) && page_count(page) == 2) { __isolate_balloon_page(page); + balloon_event_count(COMPACTBALLOONISOLATED); unlock_page(page); return true; } @@ -265,6 +266,7 @@ void balloon_page_putback(struct page *page) __putback_balloon_page(page); /* drop the extra ref count taken for page isolation */ put_page(page); + balloon_event_count(COMPACTBALLOONRETURNED); } else { WARN_ON(1); dump_page(page); diff --git a/mm/bounce.c b/mm/bounce.c index c9f0a4339a7d..708c1e99f57b 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -147,12 +147,14 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) bio_put(bio); } -static void bounce_end_io_write(struct bio *bio, int err) +static void bounce_end_io_write(struct bio *bio, int err, + struct batch_complete *batch) { bounce_end_io(bio, page_pool, err); } -static void bounce_end_io_write_isa(struct bio *bio, int err) +static void bounce_end_io_write_isa(struct bio *bio, int err, + struct batch_complete *batch) { bounce_end_io(bio, isa_page_pool, err); @@ -168,12 +170,14 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) bounce_end_io(bio, pool, err); } -static void bounce_end_io_read(struct bio *bio, int err) +static void bounce_end_io_read(struct bio *bio, int err, + struct batch_complete *batch) { __bounce_end_io_read(bio, page_pool, err); } -static void bounce_end_io_read_isa(struct bio *bio, int err) +static void bounce_end_io_read_isa(struct bio *bio, int err, + struct batch_complete *batch) { __bounce_end_io_read(bio, isa_page_pool, err); } diff --git a/mm/dmapool.c b/mm/dmapool.c index c69781e97cf9..668f26316e2e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -132,6 +132,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; + int node; if (align == 0) { align = 1; @@ -156,7 +157,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, return NULL; } - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + node = WARN_ON(!dev) ? -1 : dev_to_node(dev); + + retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 362c329b83fe..243e710c6039 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pmd_t entry; entry = mk_huge_pmd(page, vma); page_add_new_anon_rmap(page, vma, haddr); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); mm->nr_ptes++; return true; } @@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - pgtable_trans_huge_deposit(dst_mm, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * young bit, instead of the current set_pmd_at. */ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { @@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; pgtable_t pgtable; pmd_t orig_pmd; - pgtable = pgtable_trans_huge_withdraw(tlb->mm); + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); @@ -1429,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (ret == 1) { pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(&mm->page_table_lock); } out: @@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); haddr = address; @@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; @@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 010d6c14129a..e34da3c07a85 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -321,14 +321,31 @@ struct mem_cgroup { /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; - /* For oom notifier event fd */ - struct list_head oom_notify; + union { + /* For oom notifier event fd */ + struct list_head oom_notify; + /* + * we can only trigger an oom event if the memcg is alive. + * so we will reuse this field to hook the memcg in the list + * of dead memcgs. + */ + struct list_head dead; + }; - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; + union { + /* + * Should we move charges of a task when a task is moved into + * this mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + + /* + * We are no longer concerned about moving charges after memcg + * is dead. So we will fill this up with its name, to aid + * debugging. + */ + char *memcg_name; + }; /* * set > 0 if pages under this cgroup are moving to other cgroup. */ @@ -382,6 +399,55 @@ static size_t memcg_size(void) nr_node_ids * sizeof(struct mem_cgroup_per_node); } +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY +static LIST_HEAD(dangling_memcgs); +static DEFINE_MUTEX(dangling_memcgs_mutex); + +static inline void memcg_dangling_free(struct mem_cgroup *memcg) +{ + mutex_lock(&dangling_memcgs_mutex); + list_del(&memcg->dead); + mutex_unlock(&dangling_memcgs_mutex); + free_pages((unsigned long)memcg->memcg_name, 0); +} + +static inline void memcg_dangling_add(struct mem_cgroup *memcg) +{ + /* + * cgroup.c will do page-sized allocations most of the time, + * so we'll just follow the pattern. Also, __get_free_pages + * is a better interface than kmalloc for us here, because + * we'd like this memory to be always billed to the root cgroup, + * not to the process removing the memcg. While kmalloc would + * require us to wrap it into memcg_stop/resume_kmem_account, + * with __get_free_pages we just don't pass the memcg flag. + */ + memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0); + + /* + * we will, in general, just ignore failures. No need to go crazy, + * being this just a debugging interface. It is nice to copy a memcg + * name over, but if we (unlikely) can't, just the address will do + */ + if (!memcg->memcg_name) + goto add_list; + + if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) { + free_pages((unsigned long)memcg->memcg_name, 0); + memcg->memcg_name = NULL; + } + +add_list: + INIT_LIST_HEAD(&memcg->dead); + mutex_lock(&dangling_memcgs_mutex); + list_add(&memcg->dead, &dangling_memcgs); + mutex_unlock(&dangling_memcgs_mutex); +} +#else +static inline void memcg_dangling_free(struct mem_cgroup *memcg) {} +static inline void memcg_dangling_add(struct mem_cgroup *memcg) {} +#endif + /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ @@ -1450,11 +1516,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return ret; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - int ret; struct mem_cgroup *curr = NULL; struct task_struct *p; + bool ret; p = find_lock_task_mm(task); if (p) { @@ -1466,14 +1533,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) * killer still needs to detect if they have already been oom * killed to prevent needlessly killing additional tasks. */ - task_lock(task); + rcu_read_lock(); curr = mem_cgroup_from_task(task); if (curr) css_get(&curr->css); - task_unlock(task); + rcu_read_unlock(); } if (!curr) - return 0; + return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -5102,6 +5169,107 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY +static void +mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m) +{ +#ifdef CONFIG_MEMCG_SWAP + u64 kmem; + u64 memsw; + + /* + * kmem will also propagate here, so we are only interested in the + * difference. See comment in mem_cgroup_reparent_charges for details. + * + * We could save this value for later consumption by kmem reports, but + * there is not a lot of problem if the figures differ slightly. + */ + kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE); + memsw = res_counter_read_u64(&memcg->memsw, RES_USAGE) - kmem; + seq_printf(m, "\t%llu swap bytes\n", memsw); +#endif +} + + +static void +mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m) +{ +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) + struct tcp_memcontrol *tcp = &memcg->tcp_mem; + s64 tcp_socks; + u64 tcp_bytes; + + tcp_socks = percpu_counter_sum_positive(&tcp->tcp_sockets_allocated); + tcp_bytes = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + seq_printf(m, "\t%llu tcp bytes", tcp_bytes); + /* + * if tcp_bytes == 0, tcp_socks != 0 is a bug. One more reason to print + * it! + */ + if (tcp_bytes || tcp_socks) + seq_printf(m, ", in %lld sockets", tcp_socks); + seq_printf(m, "\n"); + +#endif +} + +static void +mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m) +{ +#ifdef CONFIG_MEMCG_KMEM + u64 kmem; + struct memcg_cache_params *params; + + kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE); + seq_printf(m, "\t%llu kmem bytes", kmem); + + /* list below may not be initialized, so not even try */ + if (!kmem) + return; + + seq_printf(m, " in caches"); + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + struct kmem_cache *s = memcg_params_to_cache(params); + + seq_printf(m, " %s", s->name); + } + mutex_unlock(&memcg->slab_caches_mutex); + seq_printf(m, "\n"); +#endif +} + +/* + * After a memcg is destroyed, it may still be kept around in memory. + * Currently, the two main reasons for it are swap entries, and kernel memory. + * Because they will be freed assynchronously, they will pin the memcg structure + * and its resources until the last reference goes away. + * + * This root-only file will show information about which users + */ +static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg; + + mutex_lock(&dangling_memcgs_mutex); + + list_for_each_entry(memcg, &dangling_memcgs, dead) { + if (memcg->memcg_name) + seq_printf(m, "%s:\n", memcg->memcg_name); + else + seq_printf(m, "%p (name lost):\n", memcg); + + mem_cgroup_dangling_swap(memcg, m); + mem_cgroup_dangling_tcp(memcg, m); + mem_cgroup_dangling_kmem(memcg, m); + } + + mutex_unlock(&dangling_memcgs_mutex); + return 0; +} +#endif + static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) { int ret = -EINVAL; @@ -6003,6 +6171,14 @@ static struct cftype mem_cgroup_files[] = { }, #endif #endif + +#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY + { + .name = "dangling_memcgs", + .read_seq_string = mem_cgroup_dangling_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, +#endif { }, /* terminate */ }; @@ -6152,6 +6328,8 @@ static void free_work(struct work_struct *work) struct mem_cgroup *memcg; memcg = container_of(work, struct mem_cgroup, work_freeing); + + memcg_dangling_free(memcg); __mem_cgroup_free(memcg); } @@ -6346,6 +6524,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) kmem_cgroup_destroy(memcg); + memcg_dangling_add(memcg); mem_cgroup_put(memcg); } diff --git a/mm/memory.c b/mm/memory.c index 6dc1882fbd72..44bb67b810c9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2913,7 +2913,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, details->first_index, details->last_index) { vba = vma->vm_pgoff; - vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + vea = vba + vma_pages(vma) - 1; /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ zba = details->first_index; if (zba < vba) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b522e2e1040e..3d2ba9393907 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -918,6 +918,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { + unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; @@ -996,7 +997,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone->managed_pages += onlined_pages; zone->present_pages += onlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages += onlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + if (onlined_pages) { node_states_set_node(zone_to_nid(zone), &arg); if (need_zonelists_rebuild) @@ -1487,6 +1492,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long pfn, nr_pages, expire; long offlined_pages; int ret, drain, retry_max, node; + unsigned long flags; struct zone *zone; struct memory_notify arg; @@ -1580,7 +1586,11 @@ repeat: /* removal success */ zone->managed_pages -= offlined_pages; zone->present_pages -= offlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + totalram_pages -= offlined_pages; init_per_zone_wmark_min(); diff --git a/mm/migrate.c b/mm/migrate.c index b1f57501de9c..0f348946e288 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -876,6 +876,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); balloon_page_free(page); + balloon_event_count(COMPACTBALLOONMIGRATED); return MIGRATEPAGE_SUCCESS; } out: diff --git a/mm/mmap.c b/mm/mmap.c index f681e1842fad..fcedf517e61d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, if (is_mergeable_vma(vma, file, vm_flags) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; - vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + vm_pglen = vma_pages(vma); if (vma->vm_pgoff + vm_pglen == vm_pgoff) return 1; } @@ -1367,9 +1367,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; - struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & - SHM_HUGE_MASK); + struct hstate *hs; + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) return -EINVAL; @@ -1876,15 +1876,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } #endif -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the lowest possible address? - */ - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) - mm->free_area_cache = addr; -} - /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): @@ -1941,19 +1932,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, } #endif -void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the highest possible address? - */ - if (addr > mm->free_area_cache) - mm->free_area_cache = addr; - - /* dont allow allocations above current base */ - if (mm->free_area_cache > mm->mmap_base) - mm->free_area_cache = mm->mmap_base; -} - unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -2374,7 +2352,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; - unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; @@ -2391,11 +2368,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - if (mm->unmap_area == arch_unmap_area) - addr = prev ? prev->vm_end : mm->mmap_base; - else - addr = vma ? vma->vm_start : mm->mmap_base; - mm->unmap_area(mm, addr); mm->mmap_cache = NULL; /* Kill the cache. */ } diff --git a/mm/mremap.c b/mm/mremap.c index 463a25705ac6..3708655378e9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); - set_pte_at(mm, new_addr, new_pte, pte); + set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte)); } arch_leave_lazy_mmu_mode(); diff --git a/mm/nommu.c b/mm/nommu.c index 298884dcd6e7..886e07ce4d1d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1869,10 +1869,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 378a15bcd649..ed06c1dffc9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,9 @@ #include <asm/div64.h> #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -1179,10 +1182,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain; + unsigned long batch; local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; + batch = ACCESS_ONCE(pcp->batch); + if (pcp->count >= batch) + to_drain = batch; else to_drain = pcp->count; if (to_drain > 0) { @@ -1350,8 +1355,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); - pcp->count -= pcp->batch; + unsigned long batch = ACCESS_ONCE(pcp->batch); + free_pcppages_bulk(zone, batch, pcp); + pcp->count -= batch; } out: @@ -1628,6 +1634,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; + long free_cma = 0; free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) @@ -1637,9 +1644,10 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages <= min + lowmem_reserve) + + if (free_pages - free_cma <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -3703,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG if (zone) setup_zone_pageset(zone); #endif + /* we have to stop all cpus to guarantee there is no user + of zonelist */ stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } @@ -3926,8 +3934,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { + pfn = ALIGN(pfn + MAX_ORDER_NR_PAGES, + MAX_ORDER_NR_PAGES) - 1; continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; } @@ -4030,7 +4041,40 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher then ->high. + * The following function updates them in a safe manner without read side + * locking. + * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (acording the the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). + */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + /* start with a fail safe value for batch */ + pcp->batch = 1; + smp_wmb(); + + /* Update high, then batch, in order */ + pcp->high = high; + smp_wmb(); + + pcp->batch = batch; +} + +/* a companion to pageset_set_high() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); +} + +static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; @@ -4039,45 +4083,49 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_init(p); + pageset_set_batch(p, batch); +} + /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ - -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { - struct per_cpu_pages *pcp; + unsigned long batch = max(1UL, high / 4); + if ((high / 4) > (PAGE_SHIFT * 8)) + batch = PAGE_SHIFT * 8; - pcp = &p->pcp; - pcp->high = high; - pcp->batch = max(1UL, high/4); - if ((high/4) > (PAGE_SHIFT * 8)) - pcp->batch = PAGE_SHIFT * 8; + pageset_update(&p->pcp, high, batch); +} + +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + pageset_init(pcp); + if (percpu_pagelist_fraction) + pageset_set_high(pcp, + (zone->managed_pages / + percpu_pagelist_fraction)); + else + pageset_set_batch(pcp, zone_batchsize(zone)); } static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; - zone->pageset = alloc_percpu(struct per_cpu_pageset); - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - setup_pageset(pcp, zone_batchsize(zone)); - - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->managed_pages / - percpu_pagelist_fraction)); - } + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); } /* @@ -5538,7 +5586,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist * can have before it gets flushed back to buddy allocator. */ - int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5549,14 +5596,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret < 0)) return ret; + + mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - for_each_possible_cpu(cpu) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; - setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); - } + unsigned long high; + high = zone->managed_pages / percpu_pagelist_fraction; + for_each_possible_cpu(cpu) + pageset_set_high(per_cpu_ptr(zone->pageset, cpu), + high); } + mutex_unlock(&pcp_batch_high_lock); return 0; } @@ -6045,32 +6094,17 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) #endif #ifdef CONFIG_MEMORY_HOTPLUG -static int __meminit __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); - drain_zonestat(zone, pset); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. + */ void __meminit zone_pcp_update(struct zone *zone) { - stop_machine(__zone_pcp_update, zone, NULL); + unsigned cpu; + mutex_lock(&pcp_batch_high_lock); + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); + mutex_unlock(&pcp_batch_high_lock); } #endif diff --git a/mm/page_io.c b/mm/page_io.c index a8a3ef45fed7..b222e9a1d9dc 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -21,6 +21,7 @@ #include <linux/writeback.h> #include <linux/frontswap.h> #include <linux/aio.h> +#include <linux/blkdev.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -42,7 +43,8 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, return bio; } -void end_swap_bio_write(struct bio *bio, int err) +void end_swap_bio_write(struct bio *bio, int err, + struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -68,7 +70,7 @@ void end_swap_bio_write(struct bio *bio, int err) bio_put(bio); } -void end_swap_bio_read(struct bio *bio, int err) +void end_swap_bio_read(struct bio *bio, int err, struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -81,8 +83,42 @@ void end_swap_bio_read(struct bio *bio, int err) iminor(bio->bi_bdev->bd_inode), (unsigned long long)bio->bi_sector); } else { + struct swap_info_struct *sis; + SetPageUptodate(page); + sis = page_swap_info(page); + if (sis->flags & SWP_BLKDEV) { + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the as swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. + */ + struct gendisk *disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } + } } + unlock_page(page); bio_put(bio); } @@ -203,7 +239,8 @@ out: } int __swap_writepage(struct page *page, struct writeback_control *wbc, - void (*end_write_func)(struct bio *, int)) + void (*end_write_func)(struct bio *bio, int err, + struct batch_complete *batch)) { struct bio *bio; int ret = 0, rw = WRITE; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0c8323fe6c8f..e1a6e4fab016 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT #ifdef CONFIG_TRANSPARENT_HUGEPAGE -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { assert_spin_locked(&mm->page_table_lock); @@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* no "address" argument so destroys page coloring of some arch */ -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; diff --git a/mm/swap_state.c b/mm/swap_state.c index b3d40dcf3624..7a6d3b1a7d67 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) return ret; } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -373,6 +378,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -396,11 +445,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; struct blk_plug plug; + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -414,10 +468,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } diff --git a/mm/util.c b/mm/util.c index ab1424dbe2e6..7441c41d00f6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index fa6a85378ee4..b1b38adbcb5c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -676,13 +676,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, enum ttu_flags ttu_flags, - unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, unsigned long *ret_nr_writeback, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; @@ -723,25 +724,53 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * block for HZ/10 or until some IO completes then clear the + * ZONE_WRITEBACK flag to recheck if the condition exists. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - /* - * memcg doesn't have any dirty pages throttling so we - * could easily OOM just because too many pages are in - * writeback and there is nothing else to reclaim. - * - * Check __GFP_IO, certainly because a loop driver - * thread might enter reclaim, and deadlock if it waits - * on a page for which it is needed to do the write - * (loop masks off __GFP_IO|__GFP_FS for this reason); - * but more thought would probably show more reasons. - * - * Don't require __GFP_FS, since we're not going into - * the FS, just waiting on its writeback completion. - * Worryingly, ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so - * testing may_enter_fs here is liable to OOM on them. - */ - if (global_reclaim(sc) || + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + zone_is_reclaim_writeback(zone)) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + zone_clear_flag(zone, ZONE_WRITEBACK); + + /* Case 2 above */ + } else if (global_reclaim(sc) || !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { /* * This is slightly racy - end_page_writeback() @@ -756,9 +785,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; + goto keep_locked; + + /* Case 3 above */ + } else { + wait_on_page_writeback(page); } - wait_on_page_writeback(page); } if (!force_reclaim) @@ -808,14 +841,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { nr_dirty++; + if (!PageWriteback(page)) + nr_unqueued_dirty++; + /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but do not writeback - * unless under significant pressure. + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. */ if (page_is_file_cache(page) && (!current_is_kswapd() || - sc->priority >= DEF_PRIORITY - 2)) { + !zone_is_reclaim_dirty(zone))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -960,7 +996,7 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); mem_cgroup_uncharge_end(); - *ret_nr_dirty += nr_dirty; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; return nr_reclaimed; } @@ -1370,8 +1406,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * isolated page is PageWriteback */ if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) + (nr_taken >> (DEF_PRIORITY - sc->priority))) { wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + zone_set_flag(zone, ZONE_WRITEBACK); + } + + /* + * Similarly, if many dirty pages are encountered that are not + * currently being written then flag that kswapd should start + * writing back pages. + */ + if (global_reclaim(sc) && nr_dirty && + nr_dirty >= (nr_taken >> (DEF_PRIORITY - sc->priority))) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), @@ -1822,17 +1869,25 @@ out: static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; + bool scan_adjusted = false; get_scan_count(lruvec, sc, nr); + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); @@ -1842,17 +1897,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) lruvec, sc); } } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + /* - * On large memory systems, scan >> priority can become - * really large. This is fine for the starting priority; - * we want to put equal scanning pressure on each zone. - * However, if the VM has a harder time of freeing pages, - * with multiple processes reclaiming pages, the total - * freeing target can get unreasonably large. + * For global direct reclaim, reclaim only the number of pages + * requested. Less care is taken to scan proportionally as it + * is more important to minimise direct reclaim stall latency + * than it is to properly age the LRU lists. */ - if (nr_reclaimed >= nr_to_reclaim && - sc->priority < DEF_PRIORITY) + if (global_reclaim(sc) && !current_is_kswapd()) break; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs shrink + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; @@ -2601,6 +2699,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, } /* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. + */ +static bool kswapd_shrink_zone(struct zone *zone, + int classzone_idx, + struct scan_control *sc, + unsigned long lru_pages, + unsigned long *nr_attempted) +{ + unsigned long nr_slab; + int testorder = sc->order; + unsigned long balance_gap; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + bool lowmem_pressure; + + /* Reclaim above the high watermark. */ + sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + + /* + * Kswapd reclaims only single pages with compaction enabled. Trying + * too hard to reclaim until contiguous free pages have become + * available can hurt performance by evicting too much useful data + * from memory. Do not reclaim more than needed for compaction. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + compaction_suitable(zone, sc->order) != + COMPACT_SKIPPED) + testorder = 0; + + /* + * We put equal pressure on every zone, unless one zone has way too + * many pages free already. The "too many pages" is defined as the + * high wmark plus a "gap" where the gap is either the low + * watermark or 1% of the zone, whichever is smaller. + */ + balance_gap = min(low_wmark_pages(zone), + (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + + /* + * If there is no low memory pressure or the zone is balanced then no + * reclaim is necessary + */ + lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); + if (!lowmem_pressure && zone_balanced(zone, testorder, + balance_gap, classzone_idx)) + return true; + + shrink_zone(zone, sc); + + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + + /* Account for the number of pages attempted to reclaim */ + *nr_attempted += sc->nr_to_reclaim; + + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; + + zone_clear_flag(zone, ZONE_WRITEBACK); + + /* + * If a zone reaches its high watermark, consider it to be no longer + * congested. It's possible there are dirty pages backed by congested + * BDIs but as pressure is relieved, speculatively avoid congestion + * waits. + */ + if (!zone->all_unreclaimable && + zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } + + return sc->nr_scanned >= sc->nr_to_reclaim; +} + +/* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * @@ -2624,35 +2807,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - bool pgdat_is_balanced = false; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, .may_unmap = 1, .may_swap = 1, - /* - * kswapd doesn't want to be bailed out while reclaim. because - * we want to put equal scanning pressure on each zone. - */ - .nr_to_reclaim = ULONG_MAX, + .may_writepage = !laptop_mode, .order = order, .target_mem_cgroup = NULL, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; -loop_again: - sc.priority = DEF_PRIORITY; - sc.nr_reclaimed = 0; - sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); do { unsigned long lru_pages = 0; + unsigned long nr_attempted = 0; + bool raise_priority = true; + bool pgdat_needs_compaction = (order > 0); + + sc.nr_reclaimed = 0; /* * Scan in the highmem->dma direction for the highest @@ -2689,23 +2865,46 @@ loop_again: end_zone = i; break; } else { - /* If balanced, clear the congested flag */ + /* + * If balanced, clear the dirty and congested + * flags + */ zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); } } - if (i < 0) { - pgdat_is_balanced = true; + if (i < 0) goto out; - } for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + lru_pages += zone_reclaimable_pages(zone); + + /* + * If any zone is currently balanced then kswapd will + * not call compaction as it is expected that the + * necessary pages are already available. + */ + if (pgdat_needs_compaction && + zone_watermark_ok(zone, order, + low_wmark_pages(zone), + *classzone_idx, 0)) + pgdat_needs_compaction = false; } /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. * @@ -2716,8 +2915,6 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab, testorder; - unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2738,65 +2935,14 @@ loop_again: sc.nr_reclaimed += nr_soft_reclaimed; /* - * We put equal pressure on every zone, unless - * one zone has way too many pages free - * already. The "too many pages" is defined - * as the high wmark plus a "gap" where the - * gap is either the low watermark or 1% - * of the zone, whichever is smaller. + * There should be no need to raise the scanning + * priority if enough pages are already being scanned + * that that high watermark would be met at 100% + * efficiency. */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + - KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - /* - * Kswapd reclaims only single pages with compaction - * enabled. Trying too hard to reclaim until contiguous - * free pages have become available can hurt performance - * by evicting too much useful data from memory. - * Do not reclaim more than needed for compaction. - */ - testorder = order; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) != - COMPACT_SKIPPED) - testorder = 0; - - if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_balanced(zone, testorder, - balance_gap, end_zone)) { - shrink_zone(zone, &sc); - - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - } - - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - - if (zone->all_unreclaimable) { - if (end_zone && end_zone == i) - end_zone--; - continue; - } - - if (zone_balanced(zone, testorder, 0, end_zone)) - /* - * If a zone reaches its high watermark, - * consider it to be no longer congested. It's - * possible there are dirty pages backed by - * congested BDIs but as pressure is relieved, - * speculatively avoid congestion waits - */ - zone_clear_flag(zone, ZONE_CONGESTED); + if (kswapd_shrink_zone(zone, end_zone, &sc, + lru_pages, &nr_attempted)) + raise_priority = false; } /* @@ -2808,74 +2954,38 @@ loop_again: pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (pgdat_balanced(pgdat, order, *classzone_idx)) { - pgdat_is_balanced = true; - break; /* kswapd: all done */ - } - /* - * We do this so kswapd doesn't build up large priorities for - * example when it is freeing in parallel with allocators. It - * matches the direct reclaim path behaviour in terms of impact - * on zone->*_priority. + * Fragmentation may mean that the system cannot be rebalanced + * for high-order allocations in all zones. If twice the + * allocation size has been reclaimed and the zones are still + * not balanced then recheck the watermarks at order-0 to + * prevent kswapd reclaiming excessively. Assume that a + * process requested a high-order can direct reclaim/compact. */ - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) - break; - } while (--sc.priority >= 0); - -out: - if (!pgdat_is_balanced) { - cond_resched(); + if (order && sc.nr_reclaimed >= 2UL << order) + order = sc.order = 0; - try_to_freeze(); + /* Check if kswapd should be suspending */ + if (try_to_freeze() || kthread_should_stop()) + break; /* - * Fragmentation may mean that the system cannot be - * rebalanced for high-order allocations in all zones. - * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, - * it means the zones have been fully scanned and are still - * not balanced. For high-order allocations, there is - * little point trying all over again as kswapd may - * infinite loop. - * - * Instead, recheck all watermarks at order-0 as they - * are the most important. If watermarks are ok, kswapd will go - * back to sleep. High-order users can still perform direct - * reclaim if they wish. + * Compact if necessary and kswapd is reclaiming at least the + * high watermark number of pages as requsted */ - if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) - order = sc.order = 0; - - goto loop_again; - } - - /* - * If kswapd was reclaiming at a higher order, it has the option of - * sleeping without all zones being balanced. Before it does, it must - * ensure that the watermarks for order-0 on *all* zones are met and - * that the congestion flags are cleared. The congestion flag must - * be cleared as kswapd is the only mechanism that clears the flag - * and it is potentially going to sleep here. - */ - if (order) { - int zones_need_compaction = 1; - - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* Check if the memory needs to be defragmented. */ - if (zone_watermark_ok(zone, order, - low_wmark_pages(zone), *classzone_idx, 0)) - zones_need_compaction = 0; - } - - if (zones_need_compaction) + if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) compact_pgdat(pgdat, order); - } + /* + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages + */ + if (raise_priority || !sc.nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1 && + !pgdat_balanced(pgdat, order, *classzone_idx)); + +out: /* * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. However, diff --git a/mm/vmstat.c b/mm/vmstat.c index f42745e65780..7a35116462a7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -794,7 +794,14 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", -#endif + +#ifdef CONFIG_BALLOON_COMPACTION + "compact_balloon_isolated", + "compact_balloon_migrated", + "compact_balloon_returned", +#endif /* CONFIG_BALLOON_COMPACTION */ + +#endif /* CONFIG_COMPACTION */ #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7c3ed429789e..df05c1c276f0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1716,9 +1716,9 @@ static struct ctl_table vs_vars[] = { }, { .procname = "sync_qlen_max", - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "sync_sock_size", diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 8337663aa298..3e07b0d629c4 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -311,6 +311,11 @@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) +quiet_cmd_lz4 = LZ4 $@ +cmd_lz4 = (cat $(filter-out FORCE,$^) | \ + lz4c -l -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) + # U-Boot mkimage # --------------------------------------------------------------------------- diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b954de58304f..517da26d9a2d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -230,11 +230,15 @@ our $Inline = qr{inline|__always_inline|noinline}; our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; +our $Int_type = qr{(?i)llu|ull|ll|lu|ul|l|u}; +our $Binary = qr{(?i)0b[01]+$Int_type?}; +our $Hex = qr{(?i)0x[0-9a-f]+$Int_type?}; +our $Int = qr{[0-9]+$Int_type?}; our $Float_hex = qr{(?i)0x[0-9a-f]+p-?[0-9]+[fl]?}; our $Float_dec = qr{(?i)(?:[0-9]+\.[0-9]*|[0-9]*\.[0-9]+)(?:e-?[0-9]+)?[fl]?}; our $Float_int = qr{(?i)[0-9]+e-?[0-9]+[fl]?}; our $Float = qr{$Float_hex|$Float_dec|$Float_int}; -our $Constant = qr{$Float|(?i)(?:0x[0-9a-f]+|[0-9]+)[ul]*}; +our $Constant = qr{$Float|$Binary|$Hex|$Int}; our $Assignment = qr{\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ @@ -2934,16 +2938,24 @@ sub process { } } -#CamelCase +#Specific variable tests while ($line =~ m{($Constant|$Lval)}g) { my $var = $1; + +#gcc binary extension + if ($var =~ /^$Binary$/) { + WARN("GCC_BINARY_CONSTANT", + "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr); + } + +#CamelCase if ($var !~ /$Constant/ && - $var =~ /[A-Z]\w*[a-z]|[a-z]\w*[A-Z]/ && + $var =~ /[A-Z][a-z]|[a-z][A-Z]/ && $var !~ /"^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && !defined $camelcase{$var}) { $camelcase{$var} = 1; - WARN("CAMELCASE", - "Avoid CamelCase: <$var>\n" . $herecurr); + CHK("CAMELCASE", + "Avoid CamelCase: <$var>\n" . $herecurr); } } diff --git a/sound/soc/codecs/si476x.c b/sound/soc/codecs/si476x.c index 721587c9cd84..73e205c892a0 100644 --- a/sound/soc/codecs/si476x.c +++ b/sound/soc/codecs/si476x.c @@ -38,9 +38,9 @@ enum si476x_digital_io_output_format { SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT = 8, }; -#define SI476X_DIGITAL_IO_OUTPUT_WIDTH_MASK ((0b111 << SI476X_DIGITAL_IO_SLOT_SIZE_SHIFT) | \ - (0b111 << SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT)) -#define SI476X_DIGITAL_IO_OUTPUT_FORMAT_MASK (0b1111110) +#define SI476X_DIGITAL_IO_OUTPUT_WIDTH_MASK ((0x7 << SI476X_DIGITAL_IO_SLOT_SIZE_SHIFT) | \ + (0x7 << SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT)) +#define SI476X_DIGITAL_IO_OUTPUT_FORMAT_MASK (0x7e) enum si476x_daudio_formats { SI476X_DAUDIO_MODE_I2S = (0x0 << 1), diff --git a/tools/include/tools/be_byteshift.h b/tools/include/tools/be_byteshift.h index f4912e2668ba..d51fe267cce8 100644 --- a/tools/include/tools/be_byteshift.h +++ b/tools/include/tools/be_byteshift.h @@ -1,68 +1,68 @@ #ifndef _TOOLS_BE_BYTESHIFT_H #define _TOOLS_BE_BYTESHIFT_H -#include <linux/types.h> +#include <inttypes.h> -static inline __u16 __get_unaligned_be16(const __u8 *p) +static inline uint16_t __get_unaligned_be16(const uint8_t *p) { return p[0] << 8 | p[1]; } -static inline __u32 __get_unaligned_be32(const __u8 *p) +static inline uint32_t __get_unaligned_be32(const uint8_t *p) { return p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3]; } -static inline __u64 __get_unaligned_be64(const __u8 *p) +static inline uint64_t __get_unaligned_be64(const uint8_t *p) { - return (__u64)__get_unaligned_be32(p) << 32 | + return (uint64_t)__get_unaligned_be32(p) << 32 | __get_unaligned_be32(p + 4); } -static inline void __put_unaligned_be16(__u16 val, __u8 *p) +static inline void __put_unaligned_be16(uint16_t val, uint8_t *p) { *p++ = val >> 8; *p++ = val; } -static inline void __put_unaligned_be32(__u32 val, __u8 *p) +static inline void __put_unaligned_be32(uint32_t val, uint8_t *p) { __put_unaligned_be16(val >> 16, p); __put_unaligned_be16(val, p + 2); } -static inline void __put_unaligned_be64(__u64 val, __u8 *p) +static inline void __put_unaligned_be64(uint64_t val, uint8_t *p) { __put_unaligned_be32(val >> 32, p); __put_unaligned_be32(val, p + 4); } -static inline __u16 get_unaligned_be16(const void *p) +static inline uint16_t get_unaligned_be16(const void *p) { - return __get_unaligned_be16((const __u8 *)p); + return __get_unaligned_be16((const uint8_t *)p); } -static inline __u32 get_unaligned_be32(const void *p) +static inline uint32_t get_unaligned_be32(const void *p) { - return __get_unaligned_be32((const __u8 *)p); + return __get_unaligned_be32((const uint8_t *)p); } -static inline __u64 get_unaligned_be64(const void *p) +static inline uint64_t get_unaligned_be64(const void *p) { - return __get_unaligned_be64((const __u8 *)p); + return __get_unaligned_be64((const uint8_t *)p); } -static inline void put_unaligned_be16(__u16 val, void *p) +static inline void put_unaligned_be16(uint16_t val, void *p) { __put_unaligned_be16(val, p); } -static inline void put_unaligned_be32(__u32 val, void *p) +static inline void put_unaligned_be32(uint32_t val, void *p) { __put_unaligned_be32(val, p); } -static inline void put_unaligned_be64(__u64 val, void *p) +static inline void put_unaligned_be64(uint64_t val, void *p) { __put_unaligned_be64(val, p); } diff --git a/tools/include/tools/le_byteshift.h b/tools/include/tools/le_byteshift.h index c99d45a68bda..259a132046ff 100644 --- a/tools/include/tools/le_byteshift.h +++ b/tools/include/tools/le_byteshift.h @@ -1,68 +1,68 @@ #ifndef _TOOLS_LE_BYTESHIFT_H #define _TOOLS_LE_BYTESHIFT_H -#include <linux/types.h> +#include <inttypes.h> -static inline __u16 __get_unaligned_le16(const __u8 *p) +static inline uint16_t __get_unaligned_le16(const uint8_t *p) { return p[0] | p[1] << 8; } -static inline __u32 __get_unaligned_le32(const __u8 *p) +static inline uint32_t __get_unaligned_le32(const uint8_t *p) { return p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24; } -static inline __u64 __get_unaligned_le64(const __u8 *p) +static inline uint64_t __get_unaligned_le64(const uint8_t *p) { - return (__u64)__get_unaligned_le32(p + 4) << 32 | + return (uint64_t)__get_unaligned_le32(p + 4) << 32 | __get_unaligned_le32(p); } -static inline void __put_unaligned_le16(__u16 val, __u8 *p) +static inline void __put_unaligned_le16(uint16_t val, uint8_t *p) { *p++ = val; *p++ = val >> 8; } -static inline void __put_unaligned_le32(__u32 val, __u8 *p) +static inline void __put_unaligned_le32(uint32_t val, uint8_t *p) { __put_unaligned_le16(val >> 16, p + 2); __put_unaligned_le16(val, p); } -static inline void __put_unaligned_le64(__u64 val, __u8 *p) +static inline void __put_unaligned_le64(uint64_t val, uint8_t *p) { __put_unaligned_le32(val >> 32, p + 4); __put_unaligned_le32(val, p); } -static inline __u16 get_unaligned_le16(const void *p) +static inline uint16_t get_unaligned_le16(const void *p) { - return __get_unaligned_le16((const __u8 *)p); + return __get_unaligned_le16((const uint8_t *)p); } -static inline __u32 get_unaligned_le32(const void *p) +static inline uint32_t get_unaligned_le32(const void *p) { - return __get_unaligned_le32((const __u8 *)p); + return __get_unaligned_le32((const uint8_t *)p); } -static inline __u64 get_unaligned_le64(const void *p) +static inline uint64_t get_unaligned_le64(const void *p) { - return __get_unaligned_le64((const __u8 *)p); + return __get_unaligned_le64((const uint8_t *)p); } -static inline void put_unaligned_le16(__u16 val, void *p) +static inline void put_unaligned_le16(uint16_t val, void *p) { __put_unaligned_le16(val, p); } -static inline void put_unaligned_le32(__u32 val, void *p) +static inline void put_unaligned_le32(uint32_t val, void *p) { __put_unaligned_le32(val, p); } -static inline void put_unaligned_le64(__u64 val, void *p) +static inline void put_unaligned_le64(uint64_t val, void *p) { __put_unaligned_le64(val, p); } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 0a63658065f0..4cb14cae3791 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -6,6 +6,7 @@ TARGETS += memory-hotplug TARGETS += mqueue TARGETS += net TARGETS += ptrace +TARGETS += timers TARGETS += vm all: diff --git a/tools/testing/selftests/timers/Makefile b/tools/testing/selftests/timers/Makefile new file mode 100644 index 000000000000..eb2859f4ad21 --- /dev/null +++ b/tools/testing/selftests/timers/Makefile @@ -0,0 +1,8 @@ +all: + gcc posix_timers.c -o posix_timers -lrt + +run_tests: all + ./posix_timers + +clean: + rm -f ./posix_timers diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c new file mode 100644 index 000000000000..4fa655d68a81 --- /dev/null +++ b/tools/testing/selftests/timers/posix_timers.c @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2013 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> + * + * Licensed under the terms of the GNU GPL License version 2 + * + * Selftests for a few posix timers interface. + * + * Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com> + */ + +#include <sys/time.h> +#include <stdio.h> +#include <signal.h> +#include <unistd.h> +#include <time.h> +#include <pthread.h> + +#define DELAY 2 +#define USECS_PER_SEC 1000000 + +static volatile int done; + +/* Busy loop in userspace to elapse ITIMER_VIRTUAL */ +static void user_loop(void) +{ + while (!done); +} + +/* + * Try to spend as much time as possible in kernelspace + * to elapse ITIMER_PROF. + */ +static void kernel_loop(void) +{ + void *addr = sbrk(0); + + while (!done) { + brk(addr + 4096); + brk(addr); + } +} + +/* + * Sleep until ITIMER_REAL expiration. + */ +static void idle_loop(void) +{ + pause(); +} + +static void sig_handler(int nr) +{ + done = 1; +} + +/* + * Check the expected timer expiration matches the GTOD elapsed delta since + * we armed the timer. Keep a 0.5 sec error margin due to various jitter. + */ +static int check_diff(struct timeval start, struct timeval end) +{ + long long diff; + + diff = end.tv_usec - start.tv_usec; + diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC; + + if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { + printf("Diff too high: %lld..", diff); + return -1; + } + + return 0; +} + +static int check_itimer(int which) +{ + int err; + struct timeval start, end; + struct itimerval val = { + .it_value.tv_sec = DELAY, + }; + + printf("Check itimer "); + + if (which == ITIMER_VIRTUAL) + printf("virtual... "); + else if (which == ITIMER_PROF) + printf("prof... "); + else if (which == ITIMER_REAL) + printf("real... "); + + fflush(stdout); + + done = 0; + + if (which == ITIMER_VIRTUAL) + signal(SIGVTALRM, sig_handler); + else if (which == ITIMER_PROF) + signal(SIGPROF, sig_handler); + else if (which == ITIMER_REAL) + signal(SIGALRM, sig_handler); + + err = gettimeofday(&start, NULL); + if (err < 0) { + perror("Can't call gettimeofday()\n"); + return -1; + } + + err = setitimer(which, &val, NULL); + if (err < 0) { + perror("Can't set timer\n"); + return -1; + } + + if (which == ITIMER_VIRTUAL) + user_loop(); + else if (which == ITIMER_PROF) + kernel_loop(); + else if (which == ITIMER_REAL) + idle_loop(); + + gettimeofday(&end, NULL); + if (err < 0) { + perror("Can't call gettimeofday()\n"); + return -1; + } + + if (!check_diff(start, end)) + printf("[OK]\n"); + else + printf("[FAIL]\n"); + + return 0; +} + +static int check_timer_create(int which) +{ + int err; + timer_t id; + struct timeval start, end; + struct itimerspec val = { + .it_value.tv_sec = DELAY, + }; + + printf("Check timer_create() "); + if (which == CLOCK_THREAD_CPUTIME_ID) { + printf("per thread... "); + } else if (which == CLOCK_PROCESS_CPUTIME_ID) { + printf("per process... "); + } + fflush(stdout); + + done = 0; + timer_create(which, NULL, &id); + if (err < 0) { + perror("Can't create timer\n"); + return -1; + } + signal(SIGALRM, sig_handler); + + err = gettimeofday(&start, NULL); + if (err < 0) { + perror("Can't call gettimeofday()\n"); + return -1; + } + + err = timer_settime(id, 0, &val, NULL); + if (err < 0) { + perror("Can't set timer\n"); + return -1; + } + + user_loop(); + + gettimeofday(&end, NULL); + if (err < 0) { + perror("Can't call gettimeofday()\n"); + return -1; + } + + if (!check_diff(start, end)) + printf("[OK]\n"); + else + printf("[FAIL]\n"); + + return 0; +} + +int main(int argc, char **argv) +{ + int err; + + printf("Testing posix timers. False negative may happen on CPU execution \n"); + printf("based timers if other threads run on the CPU...\n"); + + if (check_itimer(ITIMER_VIRTUAL) < 0) + return -1; + + if (check_itimer(ITIMER_PROF) < 0) + return -1; + + if (check_itimer(ITIMER_REAL) < 0) + return -1; + + if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0) + return -1; + + /* + * It's unfortunately hard to reliably test a timer expiration + * on parallel multithread cputime. We could arm it to expire + * on DELAY * nr_threads, with nr_threads busy looping, then wait + * the normal DELAY since the time is elapsing nr_threads faster. + * But for that we need to ensure we have real physical free CPUs + * to ensure true parallelism. So test only one thread until we + * find a better solution. + */ + if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) + return -1; + + return 0; +} diff --git a/usr/Kconfig b/usr/Kconfig index 085872bb2bb5..642f503d3e9f 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -90,6 +90,15 @@ config RD_LZO Support loading of a LZO encoded initial ramdisk or cpio buffer If unsure, say N. +config RD_LZ4 + bool "Support initial ramdisks compressed using LZ4" if EXPERT + default !EXPERT + depends on BLK_DEV_INITRD + select DECOMPRESS_LZ4 + help + Support loading of a LZ4 encoded initial ramdisk or cpio buffer + If unsure, say N. + choice prompt "Built-in initramfs compression mode" if INITRAMFS_SOURCE!="" help |