diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2014-05-27 18:24:54 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2014-05-27 18:24:54 +1000 |
commit | 247264072bc5d7552e3fb517e542f0d28312d5f0 (patch) | |
tree | 4bd71b975c5d769170b20d13ec7f9662bdb31c2d | |
parent | e4b1ebcec3000118eafe25944feddba130c1441a (diff) | |
parent | ca1139488da3dcf267b2ce8490f778d69af0d0e4 (diff) |
Merge branch 'akpm/master'
88 files changed, 1659 insertions, 903 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index d773d5697cf5..3187a18af6da 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -53,3 +53,11 @@ stable_pages_required (read-only) If set, the backing device requires that all pages comprising a write request must not be changed until writeout is complete. + +strictlimit (read-write) + + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 2d70ceabadb7..5b3fdda3ad6b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -57,6 +57,7 @@ Brief summary of control files. memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap (See 5.5 for details) memory.limit_in_bytes # set/show limit of memory usage + memory.low_limit_in_bytes # set/show low limit for memory reclaim memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage memory.failcnt # show the number of memory usage hits limits memory.memsw.failcnt # show the number of memory+Swap hits limits @@ -236,23 +237,26 @@ it by cgroup. 2.5 Reclaim Each cgroup maintains a per cgroup LRU which has the same structure as -global VM. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per-cgroup LRU -list. - -NOTE: Reclaim does not work for the root cgroup, since we cannot set any -limits on the root cgroup. - -Note2: When panic_on_oom is set to "2", the whole system will panic. - -When oom event notifier is registered, event will be delivered. -(See oom_control section) +global VM. Cgroups can get reclaimed basically under two conditions + - under global memory pressure when all cgroups are reclaimed + proportionally wrt. their LRU size in a round robin fashion + - when a cgroup or its hierarchical parent (see 6. Hierarchical support) + hits hard limit. If the reclaim is unsuccessful, an OOM routine is invoked + to select and kill the bulkiest task in the hiearchy. (See 10. OOM Control + below.) + +Groups might be also protected from both global and limit reclaim by +low_limit_in_bytes knob. If the limit is non-zero the reclaim logic +doesn't include groups (and their subgroups - see 6. Hierarchy support) +which are bellow the low limit if there is other eligible cgroup in the +reclaimed hierarchy. If all groups which participate reclaim are under +their low limits then all of them are reclaimed and the low limit is +ignored. + +Note: When panic_on_oom is set to "2", the whole system will panic. + +When oom event notifier is registered, event will be delivered to the root +of the memory pressure which cannot be handled (See oom_control section) 2.6 Locking @@ -473,6 +477,9 @@ About use_hierarchy, see Section 6. write will still return success. In this case, it is expected that memory.kmem.usage_in_bytes == memory.usage_in_bytes. + Please note that this knob is considered deprecated and will be removed + in the future. + About use_hierarchy, see Section 6. 5.2 stat file @@ -536,14 +543,13 @@ Note: 5.3 swappiness -Similar to /proc/sys/vm/swappiness, but only affecting reclaim that is -triggered by this cgroup's hard limit. The tunable in the root cgroup -corresponds to the global swappiness setting. +Overrides /proc/sys/vm/swappiness for the particular group. The tunable +in the root cgroup corresponds to the global swappiness setting. -Please note that unlike the global swappiness, memcg knob set to 0 -really prevents from any swapping even if there is a swap storage -available. This might lead to memcg OOM killer if there are no file -pages to reclaim. +Please note that unlike during the global reclaim, limit reclaim +enforces that 0 swappiness really prevents from any swapping even if +there is a swap storage available. This might lead to memcg OOM killer +if there are no file pages to reclaim. 5.4 failcnt diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index ce1126aceed8..223c32171dcc 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block <bool>: 0,1,yes,no,true,false +LIMITATION +--------------------------------------------------------------------- +* The fallocated region of file is discarded at umount/evict time + when using fallocate with FALLOC_FL_KEEP_SIZE. + So, User should assume that fallocated region can be discarded at + last close if there is memory pressure resulting in eviction of + the inode from the memory. As a result, for any dependency on + the fallocated region, user should make sure to recheck fallocate + after reopening the file. + TODO ---------------------------------------------------------------------- * Need to get rid of the raw scanning stuff. Instead, always use diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ba1c2768a906..909bcd5647eb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3119,6 +3119,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL] Should the soft-lockup detector generate panics. Format: <integer> + softlockup_all_cpu_backtrace= + [KNL] Should the soft-lockup detector generate + backtraces on all cpus. + Format: <integer> + sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index a7563ec4ea7b..b772418bf064 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt @@ -142,6 +142,7 @@ kmemleak_alloc_percpu - notify of a percpu memory block allocation kmemleak_free - notify of a memory block freeing kmemleak_free_part - notify of a partial memory block freeing kmemleak_free_percpu - notify of a percpu memory block freeing +kmemleak_update_trace - update object allocation stack trace kmemleak_not_leak - mark an object as not a leak kmemleak_ignore - do not scan or report an object as leak kmemleak_scan_area - add scan areas inside a memory block diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 708bb7f1b7e0..c14374e71775 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -75,6 +75,7 @@ show up in /proc/sys/kernel: - shmall - shmmax [ sysv ipc ] - shmmni +- softlockup_all_cpu_backtrace - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict @@ -783,6 +784,22 @@ via the /proc/sys interface: ============================================================== +softlockup_all_cpu_backtrace: + +This value controls the soft lockup detector thread's behavior +when a soft lockup condition is detected as to whether or not +to gather further debug information. If enabled, each cpu will +be issued an NMI and instructed to capture stack trace. + +This feature is only applicable for architectures which support +NMI. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt new file mode 100644 index 000000000000..f609142f406a --- /dev/null +++ b/Documentation/vm/remap_file_pages.txt @@ -0,0 +1,27 @@ +The remap_file_pages() system call is used to create a nonlinear mapping, +that is, a mapping in which the pages of the file are mapped into a +nonsequential order in memory. The advantage of using remap_file_pages() +over using repeated calls to mmap(2) is that the former approach does not +require the kernel to create additional VMA (Virtual Memory Area) data +structures. + +Supporting of nonlinear mapping requires significant amount of non-trivial +code in kernel virtual memory subsystem including hot paths. Also to get +nonlinear mapping work kernel need a way to distinguish normal page table +entries from entries with file offset (pte_file). Kernel reserves flag in +PTE for this purpose. PTE flags are scarce resource especially on some CPU +architectures. It would be nice to free up the flag for other usage. + +Fortunately, there are not many users of remap_file_pages() in the wild. +It's only known that one enterprise RDBMS implementation uses the syscall +on 32-bit systems to map files bigger than can linearly fit into 32-bit +virtual address space. This use-case is not critical anymore since 64-bit +systems are widely available. + +The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. + +One side effect of emulation (apart from performance) is that user can hit +vm.max_map_count limit more easily due to additional VMAs. See comment for +DEFAULT_MAX_MAP_COUNT for more details on the limit. diff --git a/MAINTAINERS b/MAINTAINERS index b9fbd059bdcd..c0d1e360a9a1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -210,6 +210,13 @@ S: Supported F: Documentation/scsi/aacraid.txt F: drivers/scsi/aacraid/ +ABI/API +L: linux-api@vger.kernel.org +F: Documentation/ABI/ +F: include/linux/syscalls.h +F: include/uapi/ +F: kernel/sys_ni.c + ABIT UGURU 1,2 HARDWARE MONITOR DRIVER M: Hans de Goede <hdegoede@redhat.com> L: lm-sensors@lm-sensors.org @@ -647,7 +654,7 @@ F: sound/soc/codecs/ssm* F: sound/soc/codecs/sigmadsp.* ANALOG DEVICES INC ASOC DRIVERS -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) L: alsa-devel@alsa-project.org (moderated for non-subscribers) W: http://blackfin.uclinux.org/ S: Supported @@ -1758,54 +1765,54 @@ F: include/uapi/linux/bfs_fs.h BLACKFIN ARCHITECTURE M: Steven Miao <realmz6@gmail.com> -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) T: git git://git.code.sf.net/p/adi-linux/code W: http://blackfin.uclinux.org S: Supported F: arch/blackfin/ BLACKFIN EMAC DRIVER -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://blackfin.uclinux.org S: Supported F: drivers/net/ethernet/adi/ BLACKFIN RTC DRIVER -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://blackfin.uclinux.org S: Supported F: drivers/rtc/rtc-bfin.c BLACKFIN SDH DRIVER M: Sonic Zhang <sonic.zhang@analog.com> -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://blackfin.uclinux.org S: Supported F: drivers/mmc/host/bfin_sdh.c BLACKFIN SERIAL DRIVER M: Sonic Zhang <sonic.zhang@analog.com> -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://blackfin.uclinux.org S: Supported F: drivers/tty/serial/bfin_uart.c BLACKFIN WATCHDOG DRIVER -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://blackfin.uclinux.org S: Supported F: drivers/watchdog/bfin_wdt.c BLACKFIN I2C TWI DRIVER M: Sonic Zhang <sonic.zhang@analog.com> -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://blackfin.uclinux.org/ S: Supported F: drivers/i2c/busses/i2c-bfin-twi.c BLACKFIN MEDIA DRIVER M: Scott Jiang <scott.jiang.linux@gmail.com> -L: adi-buildroot-devel@lists.sourceforge.net +L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers) W: http://blackfin.uclinux.org/ S: Supported F: drivers/media/platform/blackfin/ diff --git a/arch/arm/kernel/isa.c b/arch/arm/kernel/isa.c index 346485910732..9d1cf7156895 100644 --- a/arch/arm/kernel/isa.c +++ b/arch/arm/kernel/isa.c @@ -20,7 +20,7 @@ static unsigned int isa_membase, isa_portbase, isa_portshift; -static ctl_table ctl_isa_vars[4] = { +static struct ctl_table ctl_isa_vars[4] = { { .procname = "membase", .data = &isa_membase, @@ -44,7 +44,7 @@ static ctl_table ctl_isa_vars[4] = { static struct ctl_table_header *isa_sysctl_header; -static ctl_table ctl_isa[2] = { +static struct ctl_table ctl_isa[2] = { { .procname = "isa", .mode = 0555, @@ -52,7 +52,7 @@ static ctl_table ctl_isa[2] = { }, {} }; -static ctl_table ctl_bus[2] = { +static struct ctl_table ctl_bus[2] = { { .procname = "bus", .mode = 0555, diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index e1f88e028cfe..8b8fe671b1a6 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -117,6 +117,7 @@ put_reg(struct task_struct *task, unsigned long regno, unsigned long data) int is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long len) { + bool valid; struct vm_area_struct *vma; struct sram_list_struct *sraml; @@ -124,9 +125,12 @@ is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long if (start + len < start) return -EIO; + down_read(&child->mm->mmap_sem); vma = find_vma(child->mm, start); - if (vma && start >= vma->vm_start && start + len <= vma->vm_end) - return 0; + valid = vma && start >= vma->vm_start && start + len <= vma->vm_end; + up_read(&child->mm->mmap_sem); + if (valid) + return 0; for (sraml = child->mm->context.sram_list; sraml; sraml = sraml->next) if (start >= (unsigned long)sraml->addr diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index b942f4032d7a..2955f359e2a7 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -237,7 +237,7 @@ kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data) } #ifdef CONFIG_SYSCTL -static ctl_table kdump_ctl_table[] = { +static struct ctl_table kdump_ctl_table[] = { { .procname = "kdump_on_init", .data = &kdump_on_init, @@ -255,7 +255,7 @@ static ctl_table kdump_ctl_table[] = { { } }; -static ctl_table sys_table[] = { +static struct ctl_table sys_table[] = { { .procname = "kernel", .mode = 0555, diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d841c4bd6864..5845ffea67c3 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -521,7 +521,7 @@ static pmu_config_t *pmu_conf; pfm_sysctl_t pfm_sysctl; EXPORT_SYMBOL(pfm_sysctl); -static ctl_table pfm_ctl_table[]={ +static struct ctl_table pfm_ctl_table[] = { { .procname = "debug", .data = &pfm_sysctl.debug, @@ -552,7 +552,7 @@ static ctl_table pfm_ctl_table[]={ }, {} }; -static ctl_table pfm_sysctl_dir[] = { +static struct ctl_table pfm_sysctl_dir[] = { { .procname = "perfmon", .mode = 0555, @@ -560,7 +560,7 @@ static ctl_table pfm_sysctl_dir[] = { }, {} }; -static ctl_table pfm_sysctl_root[] = { +static struct ctl_table pfm_sysctl_root[] = { { .procname = "kernel", .mode = 0555, diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 375cffcf7dbd..91d219381306 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -89,7 +89,7 @@ static inline unsigned long get_softint(void) return retval; } -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace extern void *hardirq_stack[NR_CPUS]; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index b2988f25e230..027e09986194 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp) } } -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { struct thread_info *tp = current_thread_info(); struct pt_regs *regs = get_irq_regs(); @@ -251,16 +251,22 @@ void arch_trigger_all_cpu_backtrace(void) spin_lock_irqsave(&global_cpu_snapshot_lock, flags); - memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); - this_cpu = raw_smp_processor_id(); - __global_reg_self(tp, regs, this_cpu); + memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); + + if (include_self) + __global_reg_self(tp, regs, this_cpu); smp_fetch_global_regs(); for_each_online_cpu(cpu) { - struct global_reg_snapshot *gp = &global_cpu_snapshot[cpu].reg; + struct global_reg_snapshot *gp; + + if (!include_self && cpu == this_cpu) + continue; + + gp = &global_cpu_snapshot[cpu].reg; __global_reg_poll(gp); @@ -292,7 +298,7 @@ void arch_trigger_all_cpu_backtrace(void) static void sysrq_handle_globreg(int key) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); } static struct sysrq_key_op sparc_globalreg_op = { diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c index 681100c59fda..6829a9508649 100644 --- a/arch/tile/kernel/proc.c +++ b/arch/tile/kernel/proc.c @@ -113,7 +113,7 @@ arch_initcall(proc_tile_init); * Support /proc/sys/tile directory */ -static ctl_table unaligned_subtable[] = { +static struct ctl_table unaligned_subtable[] = { { .procname = "enabled", .data = &unaligned_fixup, @@ -138,7 +138,7 @@ static ctl_table unaligned_subtable[] = { {} }; -static ctl_table unaligned_table[] = { +static struct ctl_table unaligned_table[] = { { .procname = "unaligned_fixup", .mode = 0555, diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index cb6cfcd034cf..a80cbb88ea91 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c3fcb5de5083..6a1e71bde323 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { int i; + int cpu = get_cpu(); - if (test_and_set_bit(0, &backtrace_flag)) + if (test_and_set_bit(0, &backtrace_flag)) { /* * If there is already a trigger_all_cpu_backtrace() in progress * (backtrace_flag == 1), don't output double cpu dump infos. */ + put_cpu(); return; + } cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("sending NMI to %s CPUs:\n", + (include_self ? "all" : "other")); + apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); + } /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); + touch_softlockup_watchdog(); } clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); + put_cpu(); } static int diff --git a/block/bio.c b/block/bio.c index 1ba33657160f..0443694ccbb4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -752,29 +752,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page return 0; /* - * we might lose a segment or two here, but rather that than - * make this too complex. + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + bio->bi_vcnt++; + bio->bi_phys_segments++; + + /* + * Perform a recount if the number of segments is greater + * than queue_max_segments(q). */ - while (bio->bi_phys_segments >= queue_max_segments(q)) { + while (bio->bi_phys_segments > queue_max_segments(q)) { if (retried_segments) - return 0; + goto failed; retried_segments = 1; blk_recount_segments(q, bio); } /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - - /* * if queue has other restrictions (eg varying max sector size * depending on offset), it can specify a merge_bvec_fn in the * queue to get further control @@ -791,23 +793,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) + goto failed; } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); - bio->bi_vcnt++; - bio->bi_phys_segments++; done: bio->bi_iter.bi_size += len; return len; + + failed: + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + bio->bi_vcnt--; + blk_recount_segments(q, bio); + return 0; } /** diff --git a/block/bounce.c b/block/bounce.c index 523918b8c6dc..ab21ba203d5c 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -3,6 +3,8 @@ * - Split from highmem.c */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> @@ -15,6 +17,7 @@ #include <linux/hash.h> #include <linux/highmem.h> #include <linux/bootmem.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <trace/events/block.h> @@ -34,7 +37,7 @@ static __init int init_emergency_pool(void) page_pool = mempool_create_page_pool(POOL_SIZE, 0); BUG_ON(!page_pool); - printk("bounce pool size: %d pages\n", POOL_SIZE); + pr_info("pool size: %d pages\n", POOL_SIZE); return 0; } @@ -86,7 +89,7 @@ int init_emergency_isa_pool(void) mempool_free_pages, (void *) 0); BUG_ON(!isa_page_pool); - printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); + pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); return 0; } diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 49ac5662585b..2a44767891f5 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3470,7 +3470,7 @@ static int cdrom_print_info(const char *header, int val, char *info, return 0; } -static int cdrom_sysctl_info(ctl_table *ctl, int write, +static int cdrom_sysctl_info(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int pos; @@ -3583,7 +3583,7 @@ static void cdrom_update_settings(void) mutex_unlock(&cdrom_mutex); } -static int cdrom_sysctl_handler(ctl_table *ctl, int write, +static int cdrom_sysctl_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3609,7 +3609,7 @@ static int cdrom_sysctl_handler(ctl_table *ctl, int write, } /* Place files in /proc/sys/dev/cdrom */ -static ctl_table cdrom_table[] = { +static struct ctl_table cdrom_table[] = { { .procname = "info", .data = &cdrom_sysctl_settings.info, @@ -3655,7 +3655,7 @@ static ctl_table cdrom_table[] = { { } }; -static ctl_table cdrom_cdrom_table[] = { +static struct ctl_table cdrom_cdrom_table[] = { { .procname = "cdrom", .maxlen = 0, @@ -3666,7 +3666,7 @@ static ctl_table cdrom_cdrom_table[] = { }; /* Make sure that /proc/sys/dev is there */ -static ctl_table cdrom_root_table[] = { +static struct ctl_table cdrom_root_table[] = { { .procname = "dev", .maxlen = 0, diff --git a/drivers/char/random.c b/drivers/char/random.c index 06cea7ff3a7c..4ad71ef2cd59 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1582,10 +1582,10 @@ static int proc_do_uuid(struct ctl_table *table, int write, /* * Return entropy available scaled to integral bits */ -static int proc_do_entropy(ctl_table *table, int write, +static int proc_do_entropy(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - ctl_table fake_table; + struct ctl_table fake_table; int entropy_count; entropy_count = *(int *)table->data >> ENTROPY_SHIFT; diff --git a/drivers/gpio/gpio-zevio.c b/drivers/gpio/gpio-zevio.c index 54e54e4cc6c4..cd71e5769a76 100644 --- a/drivers/gpio/gpio-zevio.c +++ b/drivers/gpio/gpio-zevio.c @@ -18,6 +18,10 @@ #include <linux/slab.h> #include <linux/gpio.h> +#ifndef IOMEM +#define IOMEM(x) ((void __force __iomem *)(x)) +#endif + /* * Memory layout: * This chip has four gpio sections, each controls 8 GPIOs. diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 92ed045a5f93..3b470801a04f 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -31,7 +31,7 @@ #define PARPORT_MIN_SPINTIME_VALUE 1 #define PARPORT_MAX_SPINTIME_VALUE 1000 -static int do_active_device(ctl_table *table, int write, +static int do_active_device(struct ctl_table *table, int write, void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -68,7 +68,7 @@ static int do_active_device(ctl_table *table, int write, } #ifdef CONFIG_PARPORT_1284 -static int do_autoprobe(ctl_table *table, int write, +static int do_autoprobe(struct ctl_table *table, int write, void __user *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; @@ -110,9 +110,9 @@ static int do_autoprobe(ctl_table *table, int write, } #endif /* IEEE1284.3 support. */ -static int do_hardware_base_addr (ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) +static int do_hardware_base_addr(struct ctl_table *table, int write, + void __user *result, + size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -138,9 +138,9 @@ static int do_hardware_base_addr (ctl_table *table, int write, return copy_to_user(result, buffer, len) ? -EFAULT : 0; } -static int do_hardware_irq (ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) +static int do_hardware_irq(struct ctl_table *table, int write, + void __user *result, + size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -166,9 +166,9 @@ static int do_hardware_irq (ctl_table *table, int write, return copy_to_user(result, buffer, len) ? -EFAULT : 0; } -static int do_hardware_dma (ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) +static int do_hardware_dma(struct ctl_table *table, int write, + void __user *result, + size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -194,9 +194,9 @@ static int do_hardware_dma (ctl_table *table, int write, return copy_to_user(result, buffer, len) ? -EFAULT : 0; } -static int do_hardware_modes (ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) +static int do_hardware_modes(struct ctl_table *table, int write, + void __user *result, + size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[40]; @@ -255,11 +255,11 @@ PARPORT_MAX_SPINTIME_VALUE; struct parport_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table vars[12]; - ctl_table device_dir[2]; - ctl_table port_dir[2]; - ctl_table parport_dir[2]; - ctl_table dev_dir[2]; + struct ctl_table vars[12]; + struct ctl_table device_dir[2]; + struct ctl_table port_dir[2]; + struct ctl_table parport_dir[2]; + struct ctl_table dev_dir[2]; }; static const struct parport_sysctl_table parport_sysctl_template = { @@ -369,12 +369,12 @@ static const struct parport_sysctl_table parport_sysctl_template = { struct parport_device_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table vars[2]; - ctl_table device_dir[2]; - ctl_table devices_root_dir[2]; - ctl_table port_dir[2]; - ctl_table parport_dir[2]; - ctl_table dev_dir[2]; + struct ctl_table vars[2]; + struct ctl_table device_dir[2]; + struct ctl_table devices_root_dir[2]; + struct ctl_table port_dir[2]; + struct ctl_table parport_dir[2]; + struct ctl_table dev_dir[2]; }; static const struct parport_device_sysctl_table @@ -422,10 +422,10 @@ parport_device_sysctl_template = { struct parport_default_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table vars[3]; - ctl_table default_dir[2]; - ctl_table parport_dir[2]; - ctl_table dev_dir[2]; + struct ctl_table vars[3]; + struct ctl_table default_dir[2]; + struct ctl_table parport_dir[2]; + struct ctl_table dev_dir[2]; }; static struct parport_default_sysctl_table diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 71988b69eca6..0754f5c7cb3b 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -530,11 +530,11 @@ config RTC_DRV_RV3029C2 will be called rtc-rv3029c2. config RTC_DRV_S5M - tristate "Samsung S5M series" + tristate "Samsung S2M/S5M series" depends on MFD_SEC_CORE help If you say yes here you will get support for the - RTC of Samsung S5M PMIC series. + RTC of Samsung S2MPS14 and S5M PMIC series. This driver can also be built as a module. If so, the module will be called rtc-s5m. diff --git a/drivers/rtc/rtc-s5m.c b/drivers/rtc/rtc-s5m.c index 8ec2d6a1dbe1..8f06250a0389 100644 --- a/drivers/rtc/rtc-s5m.c +++ b/drivers/rtc/rtc-s5m.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Samsung Electronics Co., Ltd + * Copyright (c) 2013-2014 Samsung Electronics Co., Ltd * http://www.samsung.com * * Copyright (C) 2013 Google, Inc @@ -17,27 +17,76 @@ #include <linux/module.h> #include <linux/i2c.h> -#include <linux/slab.h> #include <linux/bcd.h> -#include <linux/bitops.h> #include <linux/regmap.h> #include <linux/rtc.h> -#include <linux/delay.h> #include <linux/platform_device.h> #include <linux/mfd/samsung/core.h> #include <linux/mfd/samsung/irq.h> #include <linux/mfd/samsung/rtc.h> +#include <linux/mfd/samsung/s2mps14.h> /* * Maximum number of retries for checking changes in UDR field - * of SEC_RTC_UDR_CON register (to limit possible endless loop). + * of S5M_RTC_UDR_CON register (to limit possible endless loop). * * After writing to RTC registers (setting time or alarm) read the UDR field - * in SEC_RTC_UDR_CON register. UDR is auto-cleared when data have + * in S5M_RTC_UDR_CON register. UDR is auto-cleared when data have * been transferred. */ #define UDR_READ_RETRY_CNT 5 +/* Registers used by the driver which are different between chipsets. */ +struct s5m_rtc_reg_config { + /* Number of registers used for setting time/alarm0/alarm1 */ + unsigned int regs_count; + /* First register for time, seconds */ + unsigned int time; + /* RTC control register */ + unsigned int ctrl; + /* First register for alarm 0, seconds */ + unsigned int alarm0; + /* First register for alarm 1, seconds */ + unsigned int alarm1; + /* SMPL/WTSR register */ + unsigned int smpl_wtsr; + /* + * Register for update flag (UDR). Typically setting UDR field to 1 + * will enable update of time or alarm register. Then it will be + * auto-cleared after successful update. + */ + unsigned int rtc_udr_update; + /* Mask for UDR field in 'rtc_udr_update' register */ + unsigned int rtc_udr_mask; +}; + +/* Register map for S5M8763 and S5M8767 */ +static const struct s5m_rtc_reg_config s5m_rtc_regs = { + .regs_count = 8, + .time = S5M_RTC_SEC, + .ctrl = S5M_ALARM1_CONF, + .alarm0 = S5M_ALARM0_SEC, + .alarm1 = S5M_ALARM1_SEC, + .smpl_wtsr = S5M_WTSR_SMPL_CNTL, + .rtc_udr_update = S5M_RTC_UDR_CON, + .rtc_udr_mask = S5M_RTC_UDR_MASK, +}; + +/* + * Register map for S2MPS14. + * It may be also suitable for S2MPS11 but this was not tested. + */ +static const struct s5m_rtc_reg_config s2mps_rtc_regs = { + .regs_count = 7, + .time = S2MPS_RTC_SEC, + .ctrl = S2MPS_RTC_CTRL, + .alarm0 = S2MPS_ALARM0_SEC, + .alarm1 = S2MPS_ALARM1_SEC, + .smpl_wtsr = S2MPS_WTSR_SMPL_CNTL, + .rtc_udr_update = S2MPS_RTC_UDR_CON, + .rtc_udr_mask = S2MPS_RTC_WUDR_MASK, +}; + struct s5m_rtc_info { struct device *dev; struct i2c_client *i2c; @@ -48,13 +97,14 @@ struct s5m_rtc_info { int device_type; int rtc_24hr_mode; bool wtsr_smpl; + const struct s5m_rtc_reg_config *regs; }; static const struct regmap_config s5m_rtc_regmap_config = { .reg_bits = 8, .val_bits = 8, - .max_register = SEC_RTC_REG_MAX, + .max_register = S5M_RTC_REG_MAX, }; static const struct regmap_config s2mps14_rtc_regmap_config = { @@ -119,8 +169,9 @@ static inline int s5m8767_wait_for_udr_update(struct s5m_rtc_info *info) unsigned int data; do { - ret = regmap_read(info->regmap, SEC_RTC_UDR_CON, &data); - } while (--retry && (data & RTC_UDR_MASK) && !ret); + ret = regmap_read(info->regmap, info->regs->rtc_udr_update, + &data); + } while (--retry && (data & info->regs->rtc_udr_mask) && !ret); if (!retry) dev_err(info->dev, "waiting for UDR update, reached max number of retries\n"); @@ -128,21 +179,53 @@ static inline int s5m8767_wait_for_udr_update(struct s5m_rtc_info *info) return ret; } +static inline int s5m_check_peding_alarm_interrupt(struct s5m_rtc_info *info, + struct rtc_wkalrm *alarm) +{ + int ret; + unsigned int val; + + switch (info->device_type) { + case S5M8767X: + case S5M8763X: + ret = regmap_read(info->regmap, S5M_RTC_STATUS, &val); + val &= S5M_ALARM0_STATUS; + break; + case S2MPS14X: + ret = regmap_read(info->s5m87xx->regmap_pmic, S2MPS14_REG_ST2, + &val); + val &= S2MPS_ALARM0_STATUS; + break; + default: + return -EINVAL; + } + if (ret < 0) + return ret; + + if (val) + alarm->pending = 1; + else + alarm->pending = 0; + + return 0; +} + static inline int s5m8767_rtc_set_time_reg(struct s5m_rtc_info *info) { int ret; unsigned int data; - ret = regmap_read(info->regmap, SEC_RTC_UDR_CON, &data); + ret = regmap_read(info->regmap, info->regs->rtc_udr_update, &data); if (ret < 0) { dev_err(info->dev, "failed to read update reg(%d)\n", ret); return ret; } - data |= RTC_TIME_EN_MASK; - data |= RTC_UDR_MASK; + data |= info->regs->rtc_udr_mask; + if (info->device_type == S5M8763X || info->device_type == S5M8767X) + data |= S5M_RTC_TIME_EN_MASK; - ret = regmap_write(info->regmap, SEC_RTC_UDR_CON, data); + ret = regmap_write(info->regmap, info->regs->rtc_udr_update, data); if (ret < 0) { dev_err(info->dev, "failed to write update reg(%d)\n", ret); return ret; @@ -158,17 +241,27 @@ static inline int s5m8767_rtc_set_alarm_reg(struct s5m_rtc_info *info) int ret; unsigned int data; - ret = regmap_read(info->regmap, SEC_RTC_UDR_CON, &data); + ret = regmap_read(info->regmap, info->regs->rtc_udr_update, &data); if (ret < 0) { dev_err(info->dev, "%s: fail to read update reg(%d)\n", __func__, ret); return ret; } - data &= ~RTC_TIME_EN_MASK; - data |= RTC_UDR_MASK; + data |= info->regs->rtc_udr_mask; + switch (info->device_type) { + case S5M8763X: + case S5M8767X: + data &= ~S5M_RTC_TIME_EN_MASK; + break; + case S2MPS14X: + data |= S2MPS_RTC_RUDR_MASK; + break; + default: + return -EINVAL; + } - ret = regmap_write(info->regmap, SEC_RTC_UDR_CON, data); + ret = regmap_write(info->regmap, info->regs->rtc_udr_update, data); if (ret < 0) { dev_err(info->dev, "%s: fail to write update reg(%d)\n", __func__, ret); @@ -215,10 +308,22 @@ static void s5m8763_tm_to_data(struct rtc_time *tm, u8 *data) static int s5m_rtc_read_time(struct device *dev, struct rtc_time *tm) { struct s5m_rtc_info *info = dev_get_drvdata(dev); - u8 data[8]; + u8 data[info->regs->regs_count]; int ret; - ret = regmap_bulk_read(info->regmap, SEC_RTC_SEC, data, 8); + if (info->device_type == S2MPS14X) { + ret = regmap_update_bits(info->regmap, + info->regs->rtc_udr_update, + S2MPS_RTC_RUDR_MASK, S2MPS_RTC_RUDR_MASK); + if (ret) { + dev_err(dev, + "Failed to prepare registers for time reading: %d\n", + ret); + return ret; + } + } + ret = regmap_bulk_read(info->regmap, info->regs->time, data, + info->regs->regs_count); if (ret < 0) return ret; @@ -228,6 +333,7 @@ static int s5m_rtc_read_time(struct device *dev, struct rtc_time *tm) break; case S5M8767X: + case S2MPS14X: s5m8767_data_to_tm(data, tm, info->rtc_24hr_mode); break; @@ -245,7 +351,7 @@ static int s5m_rtc_read_time(struct device *dev, struct rtc_time *tm) static int s5m_rtc_set_time(struct device *dev, struct rtc_time *tm) { struct s5m_rtc_info *info = dev_get_drvdata(dev); - u8 data[8]; + u8 data[info->regs->regs_count]; int ret = 0; switch (info->device_type) { @@ -253,6 +359,7 @@ static int s5m_rtc_set_time(struct device *dev, struct rtc_time *tm) s5m8763_tm_to_data(tm, data); break; case S5M8767X: + case S2MPS14X: ret = s5m8767_tm_to_data(tm, data); break; default: @@ -266,7 +373,8 @@ static int s5m_rtc_set_time(struct device *dev, struct rtc_time *tm) 1900 + tm->tm_year, 1 + tm->tm_mon, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec, tm->tm_wday); - ret = regmap_raw_write(info->regmap, SEC_RTC_SEC, data, 8); + ret = regmap_raw_write(info->regmap, info->regs->time, data, + info->regs->regs_count); if (ret < 0) return ret; @@ -278,70 +386,60 @@ static int s5m_rtc_set_time(struct device *dev, struct rtc_time *tm) static int s5m_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct s5m_rtc_info *info = dev_get_drvdata(dev); - u8 data[8]; + u8 data[info->regs->regs_count]; unsigned int val; int ret, i; - ret = regmap_bulk_read(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_bulk_read(info->regmap, info->regs->alarm0, data, + info->regs->regs_count); if (ret < 0) return ret; switch (info->device_type) { case S5M8763X: s5m8763_data_to_tm(data, &alrm->time); - ret = regmap_read(info->regmap, SEC_ALARM0_CONF, &val); + ret = regmap_read(info->regmap, S5M_ALARM0_CONF, &val); if (ret < 0) return ret; alrm->enabled = !!val; - - ret = regmap_read(info->regmap, SEC_RTC_STATUS, &val); - if (ret < 0) - return ret; - break; case S5M8767X: + case S2MPS14X: s5m8767_data_to_tm(data, &alrm->time, info->rtc_24hr_mode); - dev_dbg(dev, "%s: %d/%d/%d %d:%d:%d(%d)\n", __func__, - 1900 + alrm->time.tm_year, 1 + alrm->time.tm_mon, - alrm->time.tm_mday, alrm->time.tm_hour, - alrm->time.tm_min, alrm->time.tm_sec, - alrm->time.tm_wday); - alrm->enabled = 0; - for (i = 0; i < 7; i++) { + for (i = 0; i < info->regs->regs_count; i++) { if (data[i] & ALARM_ENABLE_MASK) { alrm->enabled = 1; break; } } - - alrm->pending = 0; - ret = regmap_read(info->regmap, SEC_RTC_STATUS, &val); - if (ret < 0) - return ret; break; default: return -EINVAL; } - if (val & ALARM0_STATUS) - alrm->pending = 1; - else - alrm->pending = 0; + dev_dbg(dev, "%s: %d/%d/%d %d:%d:%d(%d)\n", __func__, + 1900 + alrm->time.tm_year, 1 + alrm->time.tm_mon, + alrm->time.tm_mday, alrm->time.tm_hour, + alrm->time.tm_min, alrm->time.tm_sec, + alrm->time.tm_wday); + + ret = s5m_check_peding_alarm_interrupt(info, alrm); return 0; } static int s5m_rtc_stop_alarm(struct s5m_rtc_info *info) { - u8 data[8]; + u8 data[info->regs->regs_count]; int ret, i; struct rtc_time tm; - ret = regmap_bulk_read(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_bulk_read(info->regmap, info->regs->alarm0, data, + info->regs->regs_count); if (ret < 0) return ret; @@ -352,14 +450,16 @@ static int s5m_rtc_stop_alarm(struct s5m_rtc_info *info) switch (info->device_type) { case S5M8763X: - ret = regmap_write(info->regmap, SEC_ALARM0_CONF, 0); + ret = regmap_write(info->regmap, S5M_ALARM0_CONF, 0); break; case S5M8767X: - for (i = 0; i < 7; i++) + case S2MPS14X: + for (i = 0; i < info->regs->regs_count; i++) data[i] &= ~ALARM_ENABLE_MASK; - ret = regmap_raw_write(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_raw_write(info->regmap, info->regs->alarm0, data, + info->regs->regs_count); if (ret < 0) return ret; @@ -377,11 +477,12 @@ static int s5m_rtc_stop_alarm(struct s5m_rtc_info *info) static int s5m_rtc_start_alarm(struct s5m_rtc_info *info) { int ret; - u8 data[8]; + u8 data[info->regs->regs_count]; u8 alarm0_conf; struct rtc_time tm; - ret = regmap_bulk_read(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_bulk_read(info->regmap, info->regs->alarm0, data, + info->regs->regs_count); if (ret < 0) return ret; @@ -393,10 +494,11 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info) switch (info->device_type) { case S5M8763X: alarm0_conf = 0x77; - ret = regmap_write(info->regmap, SEC_ALARM0_CONF, alarm0_conf); + ret = regmap_write(info->regmap, S5M_ALARM0_CONF, alarm0_conf); break; case S5M8767X: + case S2MPS14X: data[RTC_SEC] |= ALARM_ENABLE_MASK; data[RTC_MIN] |= ALARM_ENABLE_MASK; data[RTC_HOUR] |= ALARM_ENABLE_MASK; @@ -408,7 +510,8 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info) if (data[RTC_YEAR1] & 0x7f) data[RTC_YEAR1] |= ALARM_ENABLE_MASK; - ret = regmap_raw_write(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_raw_write(info->regmap, info->regs->alarm0, data, + info->regs->regs_count); if (ret < 0) return ret; ret = s5m8767_rtc_set_alarm_reg(info); @@ -425,7 +528,7 @@ static int s5m_rtc_start_alarm(struct s5m_rtc_info *info) static int s5m_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct s5m_rtc_info *info = dev_get_drvdata(dev); - u8 data[8]; + u8 data[info->regs->regs_count]; int ret; switch (info->device_type) { @@ -434,6 +537,7 @@ static int s5m_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) break; case S5M8767X: + case S2MPS14X: s5m8767_tm_to_data(&alrm->time, data); break; @@ -450,7 +554,8 @@ static int s5m_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) if (ret < 0) return ret; - ret = regmap_raw_write(info->regmap, SEC_ALARM0_SEC, data, 8); + ret = regmap_raw_write(info->regmap, info->regs->alarm0, data, + info->regs->regs_count); if (ret < 0) return ret; @@ -495,7 +600,7 @@ static const struct rtc_class_ops s5m_rtc_ops = { static void s5m_rtc_enable_wtsr(struct s5m_rtc_info *info, bool enable) { int ret; - ret = regmap_update_bits(info->regmap, SEC_WTSR_SMPL_CNTL, + ret = regmap_update_bits(info->regmap, info->regs->smpl_wtsr, WTSR_ENABLE_MASK, enable ? WTSR_ENABLE_MASK : 0); if (ret < 0) @@ -506,7 +611,7 @@ static void s5m_rtc_enable_wtsr(struct s5m_rtc_info *info, bool enable) static void s5m_rtc_enable_smpl(struct s5m_rtc_info *info, bool enable) { int ret; - ret = regmap_update_bits(info->regmap, SEC_WTSR_SMPL_CNTL, + ret = regmap_update_bits(info->regmap, info->regs->smpl_wtsr, SMPL_ENABLE_MASK, enable ? SMPL_ENABLE_MASK : 0); if (ret < 0) @@ -517,50 +622,41 @@ static void s5m_rtc_enable_smpl(struct s5m_rtc_info *info, bool enable) static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info) { u8 data[2]; - unsigned int tp_read; int ret; - struct rtc_time tm; - ret = regmap_read(info->regmap, SEC_RTC_UDR_CON, &tp_read); - if (ret < 0) { - dev_err(info->dev, "%s: fail to read control reg(%d)\n", - __func__, ret); - return ret; - } + switch (info->device_type) { + case S5M8763X: + case S5M8767X: + /* UDR update time. Default of 7.32 ms is too long. */ + ret = regmap_update_bits(info->regmap, S5M_RTC_UDR_CON, + S5M_RTC_UDR_T_MASK, S5M_RTC_UDR_T_450_US); + if (ret < 0) + dev_err(info->dev, "%s: fail to change UDR time: %d\n", + __func__, ret); - /* Set RTC control register : Binary mode, 24hour mode */ - data[0] = (1 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); - data[1] = (0 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); + /* Set RTC control register : Binary mode, 24hour mode */ + data[0] = (1 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); + data[1] = (0 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); + + ret = regmap_raw_write(info->regmap, S5M_ALARM0_CONF, data, 2); + break; + + case S2MPS14X: + data[0] = (0 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT); + ret = regmap_write(info->regmap, info->regs->ctrl, data[0]); + break; + + default: + return -EINVAL; + } info->rtc_24hr_mode = 1; - ret = regmap_raw_write(info->regmap, SEC_ALARM0_CONF, data, 2); if (ret < 0) { dev_err(info->dev, "%s: fail to write controlm reg(%d)\n", __func__, ret); return ret; } - /* In first boot time, Set rtc time to 1/1/2012 00:00:00(SUN) */ - if ((tp_read & RTC_TCON_MASK) == 0) { - dev_dbg(info->dev, "rtc init\n"); - tm.tm_sec = 0; - tm.tm_min = 0; - tm.tm_hour = 0; - tm.tm_wday = 0; - tm.tm_mday = 1; - tm.tm_mon = 0; - tm.tm_year = 112; - tm.tm_yday = 0; - tm.tm_isdst = 0; - ret = s5m_rtc_set_time(info->dev, &tm); - } - - ret = regmap_update_bits(info->regmap, SEC_RTC_UDR_CON, - RTC_TCON_MASK, tp_read | RTC_TCON_MASK); - if (ret < 0) - dev_err(info->dev, "%s: fail to update TCON reg(%d)\n", - __func__, ret); - return ret; } @@ -570,7 +666,7 @@ static int s5m_rtc_probe(struct platform_device *pdev) struct sec_platform_data *pdata = s5m87xx->pdata; struct s5m_rtc_info *info; const struct regmap_config *regmap_cfg; - int ret; + int ret, alarm_irq; if (!pdata) { dev_err(pdev->dev.parent, "Platform data not supplied\n"); @@ -584,12 +680,18 @@ static int s5m_rtc_probe(struct platform_device *pdev) switch (pdata->device_type) { case S2MPS14X: regmap_cfg = &s2mps14_rtc_regmap_config; + info->regs = &s2mps_rtc_regs; + alarm_irq = S2MPS14_IRQ_RTCA0; break; case S5M8763X: regmap_cfg = &s5m_rtc_regmap_config; + info->regs = &s5m_rtc_regs; + alarm_irq = S5M8763_IRQ_ALARM0; break; case S5M8767X: regmap_cfg = &s5m_rtc_regmap_config; + info->regs = &s5m_rtc_regs; + alarm_irq = S5M8767_IRQ_RTCA1; break; default: dev_err(&pdev->dev, "Device type is not supported by RTC driver\n"); @@ -615,20 +717,11 @@ static int s5m_rtc_probe(struct platform_device *pdev) info->device_type = s5m87xx->device_type; info->wtsr_smpl = s5m87xx->wtsr_smpl; - switch (pdata->device_type) { - case S5M8763X: - info->irq = regmap_irq_get_virq(s5m87xx->irq_data, - S5M8763_IRQ_ALARM0); - break; - - case S5M8767X: - info->irq = regmap_irq_get_virq(s5m87xx->irq_data, - S5M8767_IRQ_RTCA1); - break; - - default: + info->irq = regmap_irq_get_virq(s5m87xx->irq_data, alarm_irq); + if (info->irq <= 0) { ret = -EINVAL; - dev_err(&pdev->dev, "Unsupported device type: %d\n", ret); + dev_err(&pdev->dev, "Failed to get virtual IRQ %d\n", + alarm_irq); goto err; } @@ -676,7 +769,7 @@ static void s5m_rtc_shutdown(struct platform_device *pdev) if (info->wtsr_smpl) { for (i = 0; i < 3; i++) { s5m_rtc_enable_wtsr(info, false); - regmap_read(info->regmap, SEC_WTSR_SMPL_CNTL, &val); + regmap_read(info->regmap, info->regs->smpl_wtsr, &val); pr_debug("%s: WTSR_SMPL reg(0x%02x)\n", __func__, val); if (val & WTSR_ENABLE_MASK) pr_emerg("%s: fail to disable WTSR\n", @@ -730,7 +823,8 @@ static int s5m_rtc_suspend(struct device *dev) static SIMPLE_DEV_PM_OPS(s5m_rtc_pm_ops, s5m_rtc_suspend, s5m_rtc_resume); static const struct platform_device_id s5m_rtc_id[] = { - { "s5m-rtc", 0 }, + { "s5m-rtc", S5M8767X }, + { "s2mps14-rtc", S2MPS14X }, }; static struct platform_driver s5m_rtc_driver = { @@ -749,6 +843,6 @@ module_platform_driver(s5m_rtc_driver); /* Module information */ MODULE_AUTHOR("Sangbeom Kim <sbkim73@samsung.com>"); -MODULE_DESCRIPTION("Samsung S5M RTC driver"); +MODULE_DESCRIPTION("Samsung S5M/S2MPS14 RTC driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:s5m-rtc"); diff --git a/drivers/scsi/scsi_sysctl.c b/drivers/scsi/scsi_sysctl.c index 2b6b93f7d8ef..546f16299ef9 100644 --- a/drivers/scsi/scsi_sysctl.c +++ b/drivers/scsi/scsi_sysctl.c @@ -12,7 +12,7 @@ #include "scsi_priv.h" -static ctl_table scsi_table[] = { +static struct ctl_table scsi_table[] = { { .procname = "logging_level", .data = &scsi_logging_level, .maxlen = sizeof(scsi_logging_level), @@ -21,14 +21,14 @@ static ctl_table scsi_table[] = { { } }; -static ctl_table scsi_dir_table[] = { +static struct ctl_table scsi_dir_table[] = { { .procname = "scsi", .mode = 0555, .child = scsi_table }, { } }; -static ctl_table scsi_root_table[] = { +static struct ctl_table scsi_root_table[] = { { .procname = "dev", .mode = 0555, .child = scsi_dir_table }, diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index b767a64e49d9..454b65898e2c 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -46,6 +46,7 @@ #include <linux/jiffies.h> #include <linux/syscalls.h> #include <linux/of.h> +#include <linux/rcupdate.h> #include <asm/ptrace.h> #include <asm/irq_regs.h> @@ -510,9 +511,9 @@ void __handle_sysrq(int key, bool check_mask) struct sysrq_key_op *op_p; int orig_log_level; int i; - unsigned long flags; - spin_lock_irqsave(&sysrq_key_table_lock, flags); + rcu_sysrq_start(); + rcu_read_lock(); /* * Raise the apparent loglevel to maximum so that the sysrq header * is shown to provide the user with positive feedback. We do not @@ -554,7 +555,8 @@ void __handle_sysrq(int key, bool check_mask) printk("\n"); console_loglevel = orig_log_level; } - spin_unlock_irqrestore(&sysrq_key_table_lock, flags); + rcu_read_unlock(); + rcu_sysrq_end(); } void handle_sysrq(int key) @@ -1043,16 +1045,23 @@ static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p, struct sysrq_key_op *remove_op_p) { int retval; - unsigned long flags; - spin_lock_irqsave(&sysrq_key_table_lock, flags); + spin_lock(&sysrq_key_table_lock); if (__sysrq_get_key_op(key) == remove_op_p) { __sysrq_put_key_op(key, insert_op_p); retval = 0; } else { retval = -1; } - spin_unlock_irqrestore(&sysrq_key_table_lock, flags); + spin_unlock(&sysrq_key_table_lock); + + /* + * A concurrent __handle_sysrq either got the old op or the new op. + * Wait for it to go away before returning, so the code for an old + * op is not freed (eg. on module unload) while it is in use. + */ + synchronize_rcu(); + return retval; } diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index 9b084db739c7..3aaa15f6b7e9 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -92,9 +92,8 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl, err = device_register(&dev->dev); if (err) { printk(KERN_ERR "Failed to register master device. err=%d\n", err); - memset(dev, 0, sizeof(struct w1_master)); - kfree(dev); - dev = NULL; + put_device(&dev->dev); + return NULL; } return dev; diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index af56ad56a89a..34218a8a28cd 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -14,7 +14,7 @@ #ifdef CONFIG_SYSCTL static struct ctl_table_header *fs_table_header; -static ctl_table coda_table[] = { +static struct ctl_table coda_table[] = { { .procname = "timeout", .data = &coda_timeout, @@ -39,7 +39,7 @@ static ctl_table coda_table[] = { {} }; -static ctl_table fs_table[] = { +static struct ctl_table fs_table[] = { { .procname = "coda", .mode = 0555, diff --git a/fs/dcache.c b/fs/dcache.c index 17010fd1cecd..ac6db5cd6b70 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -150,7 +150,7 @@ static long get_nr_dentry_unused(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 76feb4b60fa6..d521bddf876d 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -157,11 +157,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, const char *buf, size_t len) { unsigned int x; + int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - - x = simple_strtoul(buf, NULL, 0); + rc = kstrtouint(buf, 0, &x); + if (rc) + return rc; if (check_zero && !x) return -EINVAL; @@ -730,7 +732,10 @@ static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, size_t len) { - cm->nodeid = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &cm->nodeid); + + if (rc) + return rc; return len; } @@ -742,7 +747,10 @@ static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, size_t len) { - cm->local= simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &cm->local); + + if (rc) + return rc; if (cm->local && !local_comm) local_comm = cm; return len; @@ -846,7 +854,10 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, size_t len) { uint32_t seq = 0; - nd->nodeid = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &nd->nodeid); + + if (rc) + return rc; dlm_comm_seq(nd->nodeid, &seq); nd->comm_seq = seq; return len; @@ -860,7 +871,10 @@ static ssize_t node_weight_read(struct dlm_node *nd, char *buf) static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, size_t len) { - nd->weight = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &nd->weight); + + if (rc) + return rc; return len; } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index b969deef9ebb..8d77ba7b1756 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -68,7 +68,7 @@ static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - return seq_printf(s, "\n"); + return seq_puts(s, "\n"); } static int print_format1(struct dlm_rsb *res, struct seq_file *s) @@ -92,31 +92,31 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) } if (res->res_nodeid > 0) - rv = seq_printf(s, "\" \nLocal Copy, Master is node %d\n", + rv = seq_printf(s, "\"\nLocal Copy, Master is node %d\n", res->res_nodeid); else if (res->res_nodeid == 0) - rv = seq_printf(s, "\" \nMaster Copy\n"); + rv = seq_puts(s, "\"\nMaster Copy\n"); else if (res->res_nodeid == -1) - rv = seq_printf(s, "\" \nLooking up master (lkid %x)\n", + rv = seq_printf(s, "\"\nLooking up master (lkid %x)\n", res->res_first_lkid); else - rv = seq_printf(s, "\" \nInvalid master %d\n", + rv = seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid); if (rv) goto out; /* Print the LVB: */ if (res->res_lvbptr) { - seq_printf(s, "LVB: "); + seq_puts(s, "LVB: "); for (i = 0; i < lvblen; i++) { if (i == lvblen / 2) - seq_printf(s, "\n "); + seq_puts(s, "\n "); seq_printf(s, "%02x ", (unsigned char) res->res_lvbptr[i]); } if (rsb_flag(res, RSB_VALNOTVALID)) - seq_printf(s, " (INVALID)"); - rv = seq_printf(s, "\n"); + seq_puts(s, " (INVALID)"); + rv = seq_puts(s, "\n"); if (rv) goto out; } @@ -133,21 +133,21 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) } /* Print the locks attached to this resource */ - seq_printf(s, "Granted Queue\n"); + seq_puts(s, "Granted Queue\n"); list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) goto out; } - seq_printf(s, "Conversion Queue\n"); + seq_puts(s, "Conversion Queue\n"); list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) goto out; } - seq_printf(s, "Waiting Queue\n"); + seq_puts(s, "Waiting Queue\n"); list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) @@ -157,13 +157,13 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) if (list_empty(&res->res_lookup)) goto out; - seq_printf(s, "Lookup Queue\n"); + seq_puts(s, "Lookup Queue\n"); list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { rv = seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); } out: unlock_rsb(res); @@ -300,7 +300,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); if (rv) goto out; @@ -311,7 +311,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) for (i = 0; i < lvblen; i++) seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); if (rv) goto out; @@ -377,7 +377,7 @@ static int print_format4(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); out: unlock_rsb(r); return rv; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 04d6398c1f1c..f3e72787e7f9 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -35,8 +35,11 @@ static struct task_struct * scand_task; static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) { ssize_t ret = len; - int n = simple_strtol(buf, NULL, 0); + int n; + int rc = kstrtoint(buf, 0, &n); + if (rc) + return rc; ls = dlm_find_lockspace_local(ls->ls_local_handle); if (!ls) return -EINVAL; @@ -57,7 +60,10 @@ static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) { - ls->ls_uevent_result = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &ls->ls_uevent_result); + + if (rc) + return rc; set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags); wake_up(&ls->ls_uevent_wait); return len; @@ -70,7 +76,10 @@ static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) { - ls->ls_global_id = simple_strtoul(buf, NULL, 0); + int rc = kstrtouint(buf, 0, &ls->ls_global_id); + + if (rc) + return rc; return len; } @@ -81,7 +90,11 @@ static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf) static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len) { - int val = simple_strtoul(buf, NULL, 0); + int val; + int rc = kstrtoint(buf, 0, &val); + + if (rc) + return rc; if (val == 1) set_bit(LSFL_NODIR, &ls->ls_flags); return len; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 9280202e488c..1de7294aad20 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -50,7 +50,7 @@ static void drop_slab(void) } while (nr_objects > 10); } -int drop_caches_sysctl_handler(ctl_table *table, int write, +int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index af903128891c..b73e0621ce9e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -293,7 +293,7 @@ static LIST_HEAD(tfile_check_list); static long zero; static long long_max = LONG_MAX; -ctl_table epoll_table[] = { +struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 91ad9e1c9441..e26bc9a22ac9 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -303,6 +303,31 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } +static int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int cluster, offset; + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + + if (cluster < 0) + return cluster; + + else if (cluster) { + *bmap = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + + return 0; +} + int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create) { @@ -311,7 +336,6 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; sector_t last_block; - int cluster, offset; *phys = 0; *mapped_blocks = 0; @@ -329,25 +353,39 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, return 0; /* - * ->mmu_private can access on only allocation path. - * (caller must hold ->i_mutex) + * Both ->mmu_private and ->i_disksize can access + * on only allocation path. (caller must hold ->i_mutex) */ - last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) >> blocksize_bits; if (sector >= last_block) return 0; } - cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); - offset = sector & (sbi->sec_per_clus - 1); - cluster = fat_bmap_cluster(inode, cluster); - if (cluster < 0) - return cluster; - else if (cluster) { - *phys = fat_clus_to_blknr(sbi, cluster) + offset; - *mapped_blocks = sbi->sec_per_clus - offset; - if (*mapped_blocks > last_block - sector) - *mapped_blocks = last_block - sector; - } - return 0; + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + phys); +} + +int fat_bmap2(struct inode *inode, sector_t sector, + unsigned long *mapped_blocks, struct buffer_head *bh_result, + int create, sector_t *bmap) +{ + struct super_block *sb = inode->i_sb; + sector_t last_block; + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + + BUG_ON(create != 0); + + *bmap = 0; + *mapped_blocks = 0; + + last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) + >> blocksize_bits; + + if (sector >= last_block) + return 0; + + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + bmap); } diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e0c4ba39a377..13b7202bd651 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -119,7 +119,8 @@ struct msdos_inode_info { unsigned int cache_valid_id; /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */ - loff_t mmu_private; /* physically allocated size */ + loff_t mmu_private; /* physically allocated size (initialized) */ + loff_t i_disksize; /* physically allocated size (uninitialized) */ int i_start; /* first cluster or 0 */ int i_logstart; /* logical first cluster */ @@ -290,6 +291,9 @@ extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create); +extern int fat_bmap2(struct inode *inode, sector_t sector, + unsigned long *mapped_blocks, + struct buffer_head *bh_result, int create, sector_t *bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; diff --git a/fs/fat/file.c b/fs/fat/file.c index 85f79a89e747..92e9e753b554 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -17,8 +17,12 @@ #include <linux/blkdev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); + static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -182,6 +186,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -220,6 +225,75 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int cluster; + int nr_cluster; /* Number of clusters to be allocated */ + loff_t mm_bytes; /* Number of bytes to be allocated for file */ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* No support for dir */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if ((offset + len) <= MSDOS_I(inode)->i_disksize) + goto error; + + err = inode_newsize_ok(inode, (len + offset)); + if (err) + goto error; + + if (mode & FALLOC_FL_KEEP_SIZE) { + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - round_up(MSDOS_I(inode)->i_disksize, + sbi->cluster_size); + nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_alloc_clusters() error"); + goto error; + } + err = fat_chain_add(inode, cluster, 1); + if (err) { + fat_free_clusters(inode, cluster); + goto error; + } + MSDOS_I(inode)->i_disksize += sbi->cluster_size; + } + } else { + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + if (err) + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_cont_expand() error"); + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { @@ -300,8 +374,10 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) * This protects against truncating a file bigger than it was then * trying to write into the hole. */ - if (MSDOS_I(inode)->mmu_private > offset) + if (MSDOS_I(inode)->i_disksize > offset) { MSDOS_I(inode)->mmu_private = offset; + MSDOS_I(inode)->i_disksize = offset; + } nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 17c099a8bf9c..473143203da8 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -116,6 +116,25 @@ static int fat_add_cluster(struct inode *inode) return err; } +static void check_fallocated_region(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, struct buffer_head *bh_result) +{ + struct super_block *sb = inode->i_sb; + sector_t last_block, disk_block; + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + + last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + disk_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) + >> blocksize_bits; + if (iblock >= last_block && iblock <= disk_block) { + MSDOS_I(inode)->mmu_private += *max_blocks << blocksize_bits; + set_buffer_new(bh_result); + } + +} + static inline int __fat_get_block(struct inode *inode, sector_t iblock, unsigned long *max_blocks, struct buffer_head *bh_result, int create) @@ -130,8 +149,11 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, if (err) return err; if (phys) { - map_bh(bh_result, sb, phys); *max_blocks = min(mapped_blocks, *max_blocks); + if (create) + check_fallocated_region(inode, iblock, max_blocks, + bh_result); + map_bh(bh_result, sb, phys); return 0; } if (!create) @@ -155,6 +177,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + MSDOS_I(inode)->i_disksize = MSDOS_I(inode)->mmu_private; err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); if (err) @@ -269,6 +292,13 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, loff_t size = offset + count; if (MSDOS_I(inode)->mmu_private < size) return 0; + + /* + * In case of writing in fallocated region, return 0 and + * fallback to buffered write. + */ + if (MSDOS_I(inode)->i_disksize > MSDOS_I(inode)->mmu_private) + return 0; } /* @@ -282,13 +312,36 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, return ret; } +static int fat_get_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + sector_t bmap; + unsigned long mapped_blocks; + + err = fat_bmap2(inode, iblock, &mapped_blocks, bh_result, create, + &bmap); + if (err) + return err; + + if (bmap) { + map_bh(bh_result, sb, bmap); + max_blocks = min(mapped_blocks, max_blocks); + } + + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); - blocknr = generic_block_bmap(mapping, block, fat_get_block); + blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; @@ -469,7 +522,6 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) error = fat_calc_dir_size(inode); if (error < 0) return error; - MSDOS_I(inode)->mmu_private = inode->i_size; set_nlink(inode, fat_subdirs(inode)); } else { /* not a directory */ @@ -484,8 +536,12 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_op = &fat_file_inode_operations; inode->i_fop = &fat_file_operations; inode->i_mapping->a_ops = &fat_aops; - MSDOS_I(inode)->mmu_private = inode->i_size; } + + MSDOS_I(inode)->mmu_private = inode->i_size; + MSDOS_I(inode)->i_disksize = round_up(inode->i_size, + inode->i_sb->s_blocksize); + if (de->attr & ATTR_SYS) { if (sbi->options.sys_immutable) inode->i_flags |= S_IMMUTABLE; @@ -550,12 +606,34 @@ out: EXPORT_SYMBOL_GPL(fat_build_inode); +static int __fat_write_inode(struct inode *inode, int wait); static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); + } else { + /* Release unwritten fallocated blocks on inode eviction. */ + if (MSDOS_I(inode)->i_disksize > + round_up(MSDOS_I(inode)->mmu_private, + inode->i_sb->s_blocksize)) { + int err; + fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); + /* Fallocate results in updating the i_start/iogstart + * for the zero byte file. So, make it return to + * original state during evict and commit it to avoid + * any corruption on the next access to the cluster + * chain for the file. + */ + err = __fat_write_inode(inode, inode_needs_sync(inode)); + if (err) { + fat_msg(inode->i_sb, KERN_WARNING, "Failed to " + "update on disk inode for unused fallocated " + "blocks, inode could be corrupted. Please run " + "fsck"); + } + } } invalidate_inode_buffers(inode); clear_inode(inode); @@ -1293,6 +1371,7 @@ static int fat_read_root(struct inode *inode) & ~((loff_t)sbi->cluster_size - 1)) >> 9; MSDOS_I(inode)->i_logstart = 0; MSDOS_I(inode)->mmu_private = inode->i_size; + MSDOS_I(inode)->i_disksize = inode->i_size; fat_save_attrs(inode, ATTR_DIR); inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; diff --git a/fs/file_table.c b/fs/file_table.c index f8cc881fbbfb..385bfd31512a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -76,14 +76,14 @@ EXPORT_SYMBOL_GPL(get_max_files); * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; diff --git a/fs/fscache/main.c b/fs/fscache/main.c index acd4bf1fc277..63f868e869b9 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, return ret; } -ctl_table fscache_sysctls[] = { +struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@ -87,7 +87,7 @@ ctl_table fscache_sysctls[] = { {} }; -ctl_table fscache_sysctls_root[] = { +struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, diff --git a/fs/inode.c b/fs/inode.c index f96d2a6f88cc..2feb9b69f1be 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -105,7 +105,7 @@ long get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(ctl_table *table, int write, +int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2b431f7266c3..8f27c93f8d2e 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -436,7 +436,7 @@ EXPORT_SYMBOL_GPL(lockd_down); * Sysctl parameters (same as module parameters, different interface). */ -static ctl_table nlm_sysctls[] = { +static struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, @@ -490,7 +490,7 @@ static ctl_table nlm_sysctls[] = { { } }; -static ctl_table nlm_sysctl_dir[] = { +static struct ctl_table nlm_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -499,7 +499,7 @@ static ctl_table nlm_sysctl_dir[] = { { } }; -static ctl_table nlm_sysctl_root[] = { +static struct ctl_table nlm_sysctl_root[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 2628d921b7e3..b6ebe7e445f6 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -16,7 +16,7 @@ static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; -static ctl_table nfs4_cb_sysctls[] = { +static struct ctl_table nfs4_cb_sysctls[] = { { .procname = "nfs_callback_tcpport", .data = &nfs_callback_set_tcpport, @@ -36,7 +36,7 @@ static ctl_table nfs4_cb_sysctls[] = { { } }; -static ctl_table nfs4_cb_sysctl_dir[] = { +static struct ctl_table nfs4_cb_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -45,7 +45,7 @@ static ctl_table nfs4_cb_sysctl_dir[] = { { } }; -static ctl_table nfs4_cb_sysctl_root[] = { +static struct ctl_table nfs4_cb_sysctl_root[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 6b3f2535a3ec..bb6ed810fa6f 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -13,7 +13,7 @@ static struct ctl_table_header *nfs_callback_sysctl_table; -static ctl_table nfs_cb_sysctls[] = { +static struct ctl_table nfs_cb_sysctls[] = { { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, @@ -31,7 +31,7 @@ static ctl_table nfs_cb_sysctls[] = { { } }; -static ctl_table nfs_cb_sysctl_dir[] = { +static struct ctl_table nfs_cb_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -40,7 +40,7 @@ static ctl_table nfs_cb_sysctl_dir[] = { { } }; -static ctl_table nfs_cb_sysctl_root[] = { +static struct ctl_table nfs_cb_sysctl_root[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 78a2ca3966c3..cc423a30a0c8 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -57,7 +57,7 @@ static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; static int zero; -ctl_table inotify_table[] = { +struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &inotify_max_user_instances, diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 1927170a35ce..a503156ec15f 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -34,7 +34,7 @@ #include "debug.h" /* Definition of the ntfs sysctl. */ -static ctl_table ntfs_sysctls[] = { +static struct ctl_table ntfs_sysctls[] = { { .procname = "ntfs-debug", .data = &debug_msgs, /* Data pointer and size. */ @@ -46,7 +46,7 @@ static ctl_table ntfs_sysctls[] = { }; /* Define the parent directory /proc/sys/fs. */ -static ctl_table sysctls_root[] = { +static struct ctl_table sysctls_root[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index dc9a6829f7c6..1bcffeab713c 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -142,7 +142,6 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, int org = *beg; BUG_ON(!th->t_trans_id); - RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); PROC_INFO_INC(s, scan_bitmap.bmap); @@ -321,7 +320,6 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th, unsigned int off_max = s->s_blocksize << 3; BUG_ON(!th->t_trans_id); - PROC_INFO_INC(s, scan_bitmap.call); if (SB_FREE_BLOCKS(s) <= 0) return 0; // No point in looking for more free blocks @@ -388,9 +386,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, unsigned int nr, offset; BUG_ON(!th->t_trans_id); - PROC_INFO_INC(s, free_block); - rs = SB_DISK_SUPER_BLOCK(s); sbh = SB_BUFFER_WITH_SB(s); apbi = SB_AP_BITMAP(s); @@ -435,8 +431,8 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th, int for_unformatted) { struct super_block *s = th->t_super; - BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_trans_id); RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); if (!is_reusable(s, block, 1)) return; @@ -471,6 +467,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, unsigned long save = ei->i_prealloc_block; int dirty = 0; struct inode *inode = &ei->vfs_inode; + BUG_ON(!th->t_trans_id); #ifdef CONFIG_REISERFS_CHECK if (ei->i_prealloc_count < 0) @@ -494,6 +491,7 @@ void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, struct inode *inode) { struct reiserfs_inode_info *ei = REISERFS_I(inode); + BUG_ON(!th->t_trans_id); if (ei->i_prealloc_count) __discard_prealloc(th, ei); @@ -504,7 +502,6 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; BUG_ON(!th->t_trans_id); - while (!list_empty(plist)) { struct reiserfs_inode_info *ei; ei = list_entry(plist->next, struct reiserfs_inode_info, @@ -562,7 +559,7 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) if (!strcmp(this_char, "displacing_new_packing_localities")) { SET_OPTION(displacing_new_packing_localities); continue; - }; + } if (!strcmp(this_char, "old_hashed_relocation")) { SET_OPTION(old_hashed_relocation); @@ -729,6 +726,7 @@ void show_alloc_options(struct seq_file *seq, struct super_block *s) static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) { char *hash_in; + if (hint->formatted_node) { hash_in = (char *)&hint->key.k_dir_id; } else { @@ -757,6 +755,7 @@ static void dirid_groups(reiserfs_blocknr_hint_t * hint) __u32 dirid = 0; int bm = 0; struct super_block *sb = hint->th->t_super; + if (hint->inode) dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); else if (hint->formatted_node) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index b14706a05d52..615cd9ab7940 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -228,10 +228,10 @@ const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; /* Maximal possible key. It is never in the tree. */ static const struct reiserfs_key MAX_KEY = { - __constant_cpu_to_le32(0xffffffff), - __constant_cpu_to_le32(0xffffffff), - {{__constant_cpu_to_le32(0xffffffff), - __constant_cpu_to_le32(0xffffffff)},} + cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff), + {{cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff)},} }; /* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 184323cac1a4..7bc20809c99e 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -52,7 +52,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); cgno = ufs_dtog(uspi, fragment); bit = ufs_dtogd(uspi, fragment); @@ -116,12 +116,12 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); return; failed: - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return; } @@ -151,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) goto failed; } - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); do_more: overflow = 0; @@ -211,12 +211,12 @@ do_more: } ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); return; failed_unlock: - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); failed: UFSD("EXIT (FAILED)\n"); return; @@ -357,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, usb1 = ubh_get_usb_first(uspi); *err = -ENOSPC; - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); tmp = ufs_data_ptr_to_cpu(sb, p); if (count + ufs_fragnum(fragment) > uspi->s_fpb) { @@ -378,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, "fragment %llu, tmp %llu\n", (unsigned long long)fragment, (unsigned long long)tmp); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return INVBLOCK; } if (fragment < UFS_I(inode)->i_lastfrag) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return 0; } } else { if (tmp) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return 0; } } @@ -399,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, * There is not enough space for user on the device */ if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return 0; } @@ -424,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); } - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -439,7 +439,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -477,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -485,7 +485,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return result; } - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return 0; } diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 98f7211599ff..a9cc75ffa925 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -69,11 +69,11 @@ void ufs_free_inode (struct inode * inode) ino = inode->i_ino; - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return; } @@ -81,7 +81,7 @@ void ufs_free_inode (struct inode * inode) bit = ufs_inotocgoff (ino); ucpi = ufs_load_cylinder (sb, cg); if (!ucpi) { - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return; } ucg = ubh_get_ucg(UCPI_UBH(ucpi)); @@ -115,7 +115,7 @@ void ufs_free_inode (struct inode * inode) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); } @@ -193,7 +193,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) sbi = UFS_SB(sb); uspi = sbi->s_uspi; - mutex_lock(&sbi->s_lock); + lock_ufs(sb); /* * Try to place the inode in its parent directory @@ -328,21 +328,20 @@ cg_found: sync_dirty_buffer(bh); brelse(bh); } - - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); clear_nlink(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); failed: - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); make_bad_inode(inode); iput (inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index c1183f9f69dc..b879f1ba3439 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -697,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) unsigned flags; lock_ufs(sb); - mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -715,7 +714,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) ufs_put_cstotal(sb); UFSD("EXIT\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; @@ -760,6 +758,7 @@ static void ufs_put_super(struct super_block *sb) ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); + mutex_destroy(&sbi->mutex); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); @@ -786,6 +785,14 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags = 0; UFSD("ENTER\n"); + +#ifndef CONFIG_UFS_FS_WRITE + if (!(sb->s_flags & MS_RDONLY)) { + printk("ufs was compiled with read-only support, " + "can't be mounted as read-write\n"); + return -EROFS; + } +#endif sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); if (!sbi) @@ -795,15 +802,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); -#ifndef CONFIG_UFS_FS_WRITE - if (!(sb->s_flags & MS_RDONLY)) { - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); - goto failed; - } -#endif mutex_init(&sbi->mutex); - mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* @@ -1257,6 +1256,7 @@ magic_found: return 0; failed: + mutex_destroy(&sbi->mutex); if (ubh) ubh_brelse_uspi (uspi); kfree (uspi); @@ -1280,7 +1280,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) sync_filesystem(sb); lock_ufs(sb); - mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1294,7 +1293,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } @@ -1302,14 +1300,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } @@ -1334,7 +1330,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; #else @@ -1344,13 +1339,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EPERM; } @@ -1358,7 +1351,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index ff2c15ab81aa..343e6fc571e5 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,7 +24,6 @@ struct ufs_sb_info { int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ - struct mutex s_lock; }; struct ufs_inode_info { diff --git a/include/linux/fs.h b/include/linux/fs.h index 338e6f758c6d..1cab2f8148c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2408,8 +2408,12 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, - unsigned long size, pgoff_t pgoff); +static inline int generic_file_remap_pages(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/key.h b/include/linux/key.h index cd0abb8c9c33..017b0826642f 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -343,7 +343,7 @@ do { \ } while (0) #ifdef CONFIG_SYSCTL -extern ctl_table key_sysctls[]; +extern struct ctl_table key_sysctls[]; #endif /* * the userspace interface diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 5bb424659c04..057e95971014 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -30,6 +30,7 @@ extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; +extern void kmemleak_update_trace(const void *ptr) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; @@ -83,6 +84,9 @@ static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) static inline void kmemleak_free_percpu(const void __percpu *ptr) { } +static inline void kmemleak_update_trace(const void *ptr) +{ +} static inline void kmemleak_not_leak(const void *ptr) { } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eb65d29516ca..a5cf853129ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -92,6 +92,10 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, bool task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); +extern bool mem_cgroup_within_guarantee(struct mem_cgroup *memcg, + struct mem_cgroup *root); +extern bool mem_cgroup_all_within_guarantee(struct mem_cgroup *root); + extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -288,6 +292,16 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, return &zone->lruvec; } +static inline bool mem_cgroup_within_guarantee(struct mem_cgroup *memcg, + struct mem_cgroup *root) +{ + return false; +} +static inline bool mem_cgroup_all_within_guarantee(struct mem_cgroup *root) +{ + return false; +} + static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { return NULL; diff --git a/include/linux/mfd/samsung/rtc.h b/include/linux/mfd/samsung/rtc.h index 3e02b768d537..b6401e7661c7 100644 --- a/include/linux/mfd/samsung/rtc.h +++ b/include/linux/mfd/samsung/rtc.h @@ -18,38 +18,38 @@ #ifndef __LINUX_MFD_SEC_RTC_H #define __LINUX_MFD_SEC_RTC_H -enum sec_rtc_reg { - SEC_RTC_SEC, - SEC_RTC_MIN, - SEC_RTC_HOUR, - SEC_RTC_WEEKDAY, - SEC_RTC_DATE, - SEC_RTC_MONTH, - SEC_RTC_YEAR1, - SEC_RTC_YEAR2, - SEC_ALARM0_SEC, - SEC_ALARM0_MIN, - SEC_ALARM0_HOUR, - SEC_ALARM0_WEEKDAY, - SEC_ALARM0_DATE, - SEC_ALARM0_MONTH, - SEC_ALARM0_YEAR1, - SEC_ALARM0_YEAR2, - SEC_ALARM1_SEC, - SEC_ALARM1_MIN, - SEC_ALARM1_HOUR, - SEC_ALARM1_WEEKDAY, - SEC_ALARM1_DATE, - SEC_ALARM1_MONTH, - SEC_ALARM1_YEAR1, - SEC_ALARM1_YEAR2, - SEC_ALARM0_CONF, - SEC_ALARM1_CONF, - SEC_RTC_STATUS, - SEC_WTSR_SMPL_CNTL, - SEC_RTC_UDR_CON, +enum s5m_rtc_reg { + S5M_RTC_SEC, + S5M_RTC_MIN, + S5M_RTC_HOUR, + S5M_RTC_WEEKDAY, + S5M_RTC_DATE, + S5M_RTC_MONTH, + S5M_RTC_YEAR1, + S5M_RTC_YEAR2, + S5M_ALARM0_SEC, + S5M_ALARM0_MIN, + S5M_ALARM0_HOUR, + S5M_ALARM0_WEEKDAY, + S5M_ALARM0_DATE, + S5M_ALARM0_MONTH, + S5M_ALARM0_YEAR1, + S5M_ALARM0_YEAR2, + S5M_ALARM1_SEC, + S5M_ALARM1_MIN, + S5M_ALARM1_HOUR, + S5M_ALARM1_WEEKDAY, + S5M_ALARM1_DATE, + S5M_ALARM1_MONTH, + S5M_ALARM1_YEAR1, + S5M_ALARM1_YEAR2, + S5M_ALARM0_CONF, + S5M_ALARM1_CONF, + S5M_RTC_STATUS, + S5M_WTSR_SMPL_CNTL, + S5M_RTC_UDR_CON, - SEC_RTC_REG_MAX, + S5M_RTC_REG_MAX, }; enum s2mps_rtc_reg { @@ -88,9 +88,9 @@ enum s2mps_rtc_reg { #define HOUR_12 (1 << 7) #define HOUR_AMPM (1 << 6) #define HOUR_PM (1 << 5) -#define ALARM0_STATUS (1 << 1) -#define ALARM1_STATUS (1 << 2) -#define UPDATE_AD (1 << 0) +#define S5M_ALARM0_STATUS (1 << 1) +#define S5M_ALARM1_STATUS (1 << 2) +#define S5M_UPDATE_AD (1 << 0) #define S2MPS_ALARM0_STATUS (1 << 2) #define S2MPS_ALARM1_STATUS (1 << 1) @@ -101,16 +101,26 @@ enum s2mps_rtc_reg { #define MODEL24_SHIFT 1 #define MODEL24_MASK (1 << MODEL24_SHIFT) /* RTC Update Register1 */ -#define RTC_UDR_SHIFT 0 -#define RTC_UDR_MASK (1 << RTC_UDR_SHIFT) +#define S5M_RTC_UDR_SHIFT 0 +#define S5M_RTC_UDR_MASK (1 << S5M_RTC_UDR_SHIFT) #define S2MPS_RTC_WUDR_SHIFT 4 #define S2MPS_RTC_WUDR_MASK (1 << S2MPS_RTC_WUDR_SHIFT) #define S2MPS_RTC_RUDR_SHIFT 0 #define S2MPS_RTC_RUDR_MASK (1 << S2MPS_RTC_RUDR_SHIFT) #define RTC_TCON_SHIFT 1 #define RTC_TCON_MASK (1 << RTC_TCON_SHIFT) -#define RTC_TIME_EN_SHIFT 3 -#define RTC_TIME_EN_MASK (1 << RTC_TIME_EN_SHIFT) +#define S5M_RTC_TIME_EN_SHIFT 3 +#define S5M_RTC_TIME_EN_MASK (1 << S5M_RTC_TIME_EN_SHIFT) +/* + * UDR_T field in S5M_RTC_UDR_CON register determines the time needed + * for updating alarm and time registers. Default is 7.32 ms. + */ +#define S5M_RTC_UDR_T_SHIFT 6 +#define S5M_RTC_UDR_T_MASK (0x3 << S5M_RTC_UDR_T_SHIFT) +#define S5M_RTC_UDR_T_7320_US (0x0 << S5M_RTC_UDR_T_SHIFT) +#define S5M_RTC_UDR_T_1830_US (0x1 << S5M_RTC_UDR_T_SHIFT) +#define S5M_RTC_UDR_T_3660_US (0x2 << S5M_RTC_UDR_T_SHIFT) +#define S5M_RTC_UDR_T_450_US (0x3 << S5M_RTC_UDR_T_SHIFT) /* RTC Hour register */ #define HOUR_PM_SHIFT 6 diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 6a45fb583ff1..447775ee2c4b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -32,15 +32,24 @@ static inline void touch_nmi_watchdog(void) #ifdef arch_trigger_all_cpu_backtrace static inline bool trigger_all_cpu_backtrace(void) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); return true; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + arch_trigger_all_cpu_backtrace(false); + return true; +} #else static inline bool trigger_all_cpu_backtrace(void) { return false; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + return false; +} #endif #ifdef CONFIG_LOCKUP_DETECTOR @@ -48,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_user_enabled; extern int watchdog_thresh; +extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 56b7bc32db4f..b810855024f9 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -40,6 +40,11 @@ struct res_counter { */ unsigned long long soft_limit; /* + * the limit under which the usage cannot be pushed + * due to external pressure. + */ + unsigned long long low_limit; + /* * the number of unsuccessful attempts to consume the resource */ unsigned long long failcnt; @@ -88,6 +93,7 @@ enum { RES_LIMIT, RES_FAILCNT, RES_SOFT_LIMIT, + RES_LOW_LIMIT, }; /* @@ -175,6 +181,28 @@ res_counter_soft_limit_excess(struct res_counter *cnt) return excess; } +/** + * Get the difference between the usage and the low limit + * @cnt: The counter + * + * Returns 0 if usage is less than or equal to low limit + * The difference between usage and low limit, otherwise. + */ +static inline unsigned long long +res_counter_low_limit_excess(struct res_counter *cnt) +{ + unsigned long long excess; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + if (cnt->usage <= cnt->low_limit) + excess = 0; + else + excess = cnt->usage - cnt->low_limit; + spin_unlock_irqrestore(&cnt->lock, flags); + return excess; +} + static inline void res_counter_reset_max(struct res_counter *cnt) { unsigned long flags; @@ -220,4 +248,16 @@ res_counter_set_soft_limit(struct res_counter *cnt, return 0; } +static inline int +res_counter_set_low_limit(struct res_counter *cnt, + unsigned long long low_limit) +{ + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + cnt->low_limit = low_limit; + spin_unlock_irqrestore(&cnt->lock, flags); + return 0; +} + #endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 03f3b05e8ec1..091d9935c247 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,6 +16,7 @@ #include <linux/atomic.h> +struct optimistic_spin_queue; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -23,9 +24,17 @@ struct rw_semaphore; #else /* All arch specific implementations share the same struct */ struct rw_semaphore { - long count; - raw_spinlock_t wait_lock; - struct list_head wait_list; + long count; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_SMP + /* + * Write owner. Used as a speculative check to see + * if the owner is running on the cpu. + */ + struct task_struct *owner; + struct optimistic_spin_queue *osq; /* spinner MCS lock */ +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) # define __RWSEM_DEP_MAP_INIT(lockname) #endif +#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, \ + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list), \ + NULL, /* owner */ \ + NULL /* mcs lock */ \ + __RWSEM_DEP_MAP_INIT(name) } +#else #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, \ __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ LIST_HEAD_INIT((name).wait_list) \ __RWSEM_DEP_MAP_INIT(name) } +#endif #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) diff --git a/init/main.c b/init/main.c index e8ae1fef0908..bb1aed928f21 100644 --- a/init/main.c +++ b/init/main.c @@ -6,7 +6,7 @@ * GK 2/5/95 - Changed to support mounting root fs via NFS * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 - * Simplified starting of init: Michael A. Griffith <grif@acm.org> + * Simplified starting of init: Michael A. Griffith <grif@acm.org> */ #define DEBUG /* Enable initcall_debug */ @@ -136,7 +136,7 @@ static char *ramdisk_execute_command; * Used to generate warnings if static_key manipulation functions are used * before jump_label_init is called. */ -bool static_key_initialized __read_mostly = false; +bool static_key_initialized __read_mostly; EXPORT_SYMBOL_GPL(static_key_initialized); /* @@ -159,8 +159,8 @@ static int __init set_reset_devices(char *str) __setup("reset_devices", set_reset_devices); -static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; -const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; static const char *panic_later, *panic_param; extern const struct obs_kernel_param __setup_start[], __setup_end[]; @@ -199,7 +199,6 @@ static int __init obsolete_checksetup(char *line) * still work even if initially too large, it will just take slightly longer */ unsigned long loops_per_jiffy = (1<<12); - EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) @@ -376,8 +375,8 @@ static void __init setup_command_line(char *command_line) initcall_command_line = memblock_virt_alloc(strlen(boot_command_line) + 1, 0); static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); - strcpy (saved_command_line, boot_command_line); - strcpy (static_command_line, command_line); + strcpy(saved_command_line, boot_command_line); + strcpy(static_command_line, command_line); } /* @@ -445,8 +444,8 @@ void __init parse_early_options(char *cmdline) /* Arch code calls this early on, or if not, just before other parsing. */ void __init parse_early_param(void) { - static __initdata int done = 0; - static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; + static int done __initdata; + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; if (done) return; @@ -500,7 +499,8 @@ static void __init mm_init(void) asmlinkage __visible void __init start_kernel(void) { - char * command_line, *after_dashes; + char *command_line; + char *after_dashes; extern const struct kernel_param __start___param[], __stop___param[]; /* @@ -572,7 +572,8 @@ asmlinkage __visible void __init start_kernel(void) * fragile until we cpu_idle() for the first time. */ preempt_disable(); - if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) + if (WARN(!irqs_disabled(), + "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); idr_init_cache(); rcu_init(); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 998d31b230f1..c3f0326e98db 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -18,7 +18,7 @@ #include <linux/msg.h> #include "util.h" -static void *get_ipc(ctl_table *table) +static void *get_ipc(struct ctl_table *table) { char *which = table->data; struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; @@ -27,7 +27,7 @@ static void *get_ipc(ctl_table *table) } #ifdef CONFIG_PROC_SYSCTL -static int proc_ipc_dointvec(ctl_table *table, int write, +static int proc_ipc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -38,7 +38,7 @@ static int proc_ipc_dointvec(ctl_table *table, int write, return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); } -static int proc_ipc_dointvec_minmax(ctl_table *table, int write, +static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -49,7 +49,7 @@ static int proc_ipc_dointvec_minmax(ctl_table *table, int write, return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); } -static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write, +static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = current->nsproxy->ipc_ns; @@ -62,7 +62,7 @@ static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write, return err; } -static int proc_ipc_callback_dointvec_minmax(ctl_table *table, int write, +static int proc_ipc_callback_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -85,7 +85,7 @@ static int proc_ipc_callback_dointvec_minmax(ctl_table *table, int write, return rc; } -static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, +static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -119,7 +119,7 @@ static void ipc_auto_callback(int val) } } -static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, +static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 5bb8bfe67149..68d4e953762c 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -14,7 +14,7 @@ #include <linux/sysctl.h> #ifdef CONFIG_PROC_SYSCTL -static void *get_mq(ctl_table *table) +static void *get_mq(struct ctl_table *table) { char *which = table->data; struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; @@ -22,7 +22,7 @@ static void *get_mq(ctl_table *table) return which; } -static int proc_mq_dointvec(ctl_table *table, int write, +static int proc_mq_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; @@ -32,7 +32,7 @@ static int proc_mq_dointvec(ctl_table *table, int write, return proc_dointvec(&mq_table, write, buffer, lenp, ppos); } -static int proc_mq_dointvec_minmax(ctl_table *table, int write, +static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; @@ -53,7 +53,7 @@ static int msg_max_limit_max = HARD_MSGMAX; static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; static int msg_maxsize_limit_max = HARD_MSGSIZEMAX; -static ctl_table mq_sysctls[] = { +static struct ctl_table mq_sysctls[] = { { .procname = "queues_max", .data = &init_ipc_ns.mq_queues_max, @@ -100,7 +100,7 @@ static ctl_table mq_sysctls[] = { {} }; -static ctl_table mq_sysctl_dir[] = { +static struct ctl_table mq_sysctl_dir[] = { { .procname = "mqueue", .mode = 0555, @@ -109,7 +109,7 @@ static ctl_table mq_sysctl_dir[] = { {} }; -static ctl_table mq_sysctl_root[] = { +static struct ctl_table mq_sysctl_root[] = { { .procname = "fs", .mode = 0555, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3214289df5a7..2ac9f133c4da 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -76,7 +76,7 @@ static bool kprobes_all_disarmed; /* This protects kprobe_table and optimizing_list */ static DEFINE_MUTEX(kprobe_mutex); -static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static DEFINE_PER_CPU(struct kprobe *, kprobe_instance); static struct { raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; @@ -297,9 +297,9 @@ static inline void reset_kprobe_instance(void) /* * This routine is called either: - * - under the kprobe_mutex - during kprobe_[un]register() - * OR - * - with preemption disabled - from arch/xxx/kernel/kprobes.c + * - under the kprobe_mutex - during kprobe_[un]register() + * OR + * - with preemption disabled - from arch/xxx/kernel/kprobes.c */ struct kprobe *get_kprobe(void *addr) { @@ -567,7 +567,8 @@ static void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); - while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { + while (!list_empty(&optimizing_list) || + !list_empty(&unoptimizing_list)) { mutex_unlock(&kprobe_mutex); /* this will also make optimizing_work execute immmediately */ @@ -676,8 +677,8 @@ static void reuse_unused_kprobe(struct kprobe *ap) */ op = container_of(ap, struct optimized_kprobe, kp); if (unlikely(list_empty(&op->list))) - printk(KERN_WARNING "Warning: found a stray unused " - "aggrprobe@%p\n", ap->addr); + pr_warn("Warning: found a stray unused aggrprobe@%p\n", + ap->addr); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ @@ -794,7 +795,7 @@ static void optimize_all_kprobes(void) if (!kprobe_disabled(p)) optimize_kprobe(p); } - printk(KERN_INFO "Kprobes globally optimized\n"); + pr_info("Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); } @@ -824,7 +825,7 @@ static void unoptimize_all_kprobes(void) /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); - printk(KERN_INFO "Kprobes globally unoptimized\n"); + pr_info("Kprobes globally unoptimized\n"); } static DEFINE_MUTEX(kprobe_sysctl_mutex); @@ -896,7 +897,7 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) /* There should be no unused kprobes can be reused without optimization */ static void reuse_unused_kprobe(struct kprobe *ap) { - printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); + pr_err("Error: There should be no unused kprobe here.\n"); BUG_ON(kprobe_unused(ap)); } @@ -955,7 +956,8 @@ static void disarm_kprobe_ftrace(struct kprobe *p) } ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); - WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", + p->addr, ret); } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) @@ -1389,7 +1391,7 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p) if (p != ap) { list_for_each_entry_rcu(list_p, &ap->list, list) if (list_p == p) - /* kprobe p is a valid probe */ + /* kprobe p is a valid probe */ goto valid; return NULL; } @@ -2018,8 +2020,8 @@ EXPORT_SYMBOL_GPL(enable_kprobe); void dump_kprobe(struct kprobe *kp) { - printk(KERN_WARNING "Dumping kprobe:\n"); - printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + pr_warn("Dumping kprobe:\n"); + pr_warn("Name: %s\nAddress: %p\nOffset: %x\n", kp->symbol_name, kp->addr, kp->offset); } NOKPROBE_SYMBOL(dump_kprobe); @@ -2128,7 +2130,7 @@ static int __init init_kprobes(void) kprobe_lookup_name(kretprobe_blacklist[i].name, kretprobe_blacklist[i].addr); if (!kretprobe_blacklist[i].addr) - printk("kretprobe: lookup failed: %s\n", + pr_warn("kretprobe: lookup failed: %s\n", kretprobe_blacklist[i].name); } } @@ -2310,7 +2312,7 @@ static void arm_all_kprobes(void) } kprobes_all_disarmed = false; - printk(KERN_INFO "Kprobes globally enabled\n"); + pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); @@ -2332,7 +2334,7 @@ static void disarm_all_kprobes(void) } kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); + pr_info("Kprobes globally disabled\n"); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b4219ff87b8c..1f99664b7f87 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -5,12 +5,18 @@ * * Writer lock-stealing by Alex Shi <alex.shi@intel.com> * and Michel Lespinasse <walken@google.com> + * + * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> + * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. */ #include <linux/rwsem.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/sched/rt.h> #include <linux/export.h> +#include "mcs_spinlock.h" + /* * Guide to the rw_semaphore's count field for common values. * (32-bit case illustrated, similar for 64-bit) @@ -60,9 +66,7 @@ * */ -/* - * Initialize an rwsem: - */ +/* initialize a rwsem */ void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key) { @@ -76,6 +80,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_SMP + sem->owner = NULL; + sem->osq = NULL; +#endif } EXPORT_SYMBOL(__init_rwsem); @@ -190,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) } /* - * wait for the read lock to be granted + * Wait for the read lock to be granted */ __visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) @@ -237,64 +245,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + if (!(count & RWSEM_ACTIVE_MASK)) { + /* try acquiring the write lock */ + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; + } + } + return false; +} + +#ifdef CONFIG_SMP /* - * wait until we successfully acquire the write lock + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long old, count = ACCESS_ONCE(sem->count); + + while (true) { + if (!(count == 0 || count == RWSEM_WAITING_BIAS)) + return false; + + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + if (old == count) + return true; + + count = old; + } +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool on_cpu = true; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = ACCESS_ONCE(sem->owner); + if (owner) + on_cpu = owner->on_cpu; + rcu_read_unlock(); + + /* + * If sem->owner is not set, the rwsem owner may have + * just acquired it and not set the owner yet or the rwsem + * has been released. + */ + return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, + struct task_struct *owner) +{ + if (sem->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * sem->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(sem, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() or when the + * owner changed, which is a sign for heavy contention. Return + * success only when sem->owner is NULL. + */ + return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + while (true) { + owner = ACCESS_ONCE(sem->owner); + if (owner && !rwsem_spin_on_owner(sem, owner)) + break; + + /* wait_lock will be acquired if write_lock is obtained */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock */ __visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count; + bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; - struct task_struct *tsk = current; - /* set up my own style of waitqueue */ - waiter.task = tsk; + /* undo write bias from down_write operation, stop active locking */ + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; + waiting = false; + list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + if (waiting) { + count = ACCESS_ONCE(sem->count); - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + /* + * If there were already threads queued before us and there are + * no active writers, the lock must be read owned; so we try to + * wake any read locks that were queued ahead of us. + */ + if (count > RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + } else + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - + if (rwsem_try_write_lock(count, sem)) + break; raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ do { schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } + __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..42f806de49d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,6 +12,27 @@ #include <linux/atomic.h> +#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + /* * lock for reading */ @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_owner(sem); + } + return ret; } @@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_clear_owner(sem); __up_write(sem); } @@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ + rwsem_clear_owner(sem); __downgrade_write(sem); } @@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e791130f85a7..85ea94305015 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -136,6 +136,8 @@ res_counter_member(struct res_counter *counter, int member) return &counter->failcnt; case RES_SOFT_LIMIT: return &counter->soft_limit; + case RES_LOW_LIMIT: + return &counter->low_limit; }; BUG(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 156ae0104cba..da51641b6841 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -198,7 +198,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -static int sysrq_sysctl_handler(ctl_table *table, int write, +static int sysrq_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -861,6 +861,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 6fbe811c7ad1..c8eac43267e9 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -17,7 +17,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(ctl_table *table, int write) +static void *get_uts(struct ctl_table *table, int write) { char *which = table->data; struct uts_namespace *uts_ns; @@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write) return which; } -static void put_uts(ctl_table *table, int write, void *which) +static void put_uts(struct ctl_table *table, int write, void *which) { if (!write) up_read(&uts_sem); @@ -44,7 +44,7 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, +static int proc_do_uts_string(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 516203e665fc..5d2c48f5083b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -244,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -271,6 +289,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -317,7 +336,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -327,6 +357,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -445,7 +486,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d64815651e90..3291a8e37490 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -27,6 +27,7 @@ #include <linux/radix-tree.h> #include <linux/percpu.h> #include <linux/slab.h> +#include <linux/kmemleak.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/string.h> @@ -200,6 +201,11 @@ radix_tree_node_alloc(struct radix_tree_root *root) rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; } + /* + * Update the allocation stack trace as this is more useful + * for debugging. + */ + kmemleak_update_trace(ret); } if (ret == NULL) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); diff --git a/mm/Makefile b/mm/Makefile index 4064f3ec145e..1eaa70bed7a4 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := gup.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f0..382f8ece1cb2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -234,11 +234,46 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strictlimit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + break; + case 1: + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + break; + default: + return -EINVAL; + } + + return count; +} +static ssize_t strictlimit_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strictlimit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strictlimit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 2c5646f11f41..000000000000 --- a/mm/fremap.c +++ /dev/null @@ -1,279 +0,0 @@ -/* - * linux/mm/fremap.c - * - * Explicit pagetable population and nonlinear (random) mappings support. - * - * started by Ingo Molnar, Copyright (C) 2002, 2003 - */ -#include <linux/export.h> -#include <linux/backing-dev.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/file.h> -#include <linux/mman.h> -#include <linux/pagemap.h> -#include <linux/swapops.h> -#include <linux/rmap.h> -#include <linux/syscalls.h> -#include <linux/mmu_notifier.h> - -#include <asm/mmu_context.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> - -#include "internal.h" - -static int mm_counter(struct page *page) -{ - return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; -} - -static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - struct page *page; - swp_entry_t entry; - - if (pte_present(pte)) { - flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush(vma, addr, ptep); - page = vm_normal_page(vma, addr, pte); - if (page) { - if (pte_dirty(pte)) - set_page_dirty(page); - update_hiwater_rss(mm); - dec_mm_counter(mm, mm_counter(page)); - page_remove_rmap(page); - page_cache_release(page); - } - } else { /* zap_pte() is not called when pte_none() */ - if (!pte_file(pte)) { - update_hiwater_rss(mm); - entry = pte_to_swp_entry(pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - dec_mm_counter(mm, mm_counter(page)); - } - } else { - free_swap_and_cache(entry); - dec_mm_counter(mm, MM_SWAPENTS); - } - } - pte_clear_not_present_full(mm, addr, ptep, 0); - } -} - -/* - * Install a file pte to a given virtual memory address, release any - * previously existing mapping. - */ -static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long pgoff, pgprot_t prot) -{ - int err = -ENOMEM; - pte_t *pte, ptfile; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - ptfile = pgoff_to_pte(pgoff); - - if (!pte_none(*pte)) - zap_pte(mm, vma, addr, pte); - - set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); - /* - * We don't need to run update_mmu_cache() here because the "file pte" - * being installed by install_file_pte() is not a real pte - it's a - * non-present entry (like a swap entry), noting what file offset should - * be mapped there when there's a fault (in a non-linear vma where - * that's not obvious). - */ - pte_unmap_unlock(pte, ptl); - err = 0; -out: - return err; -} - -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - struct mm_struct *mm = vma->vm_mm; - int err; - - do { - err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); - if (err) - return err; - - size -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } while (size); - - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - -/** - * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma - * @start: start of the remapped virtual memory range - * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range (see NOTE) - * @pgoff: to-be-mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. - * - * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma - * (shared backing store file). - * - * This syscall works purely via pagetables, so it's the most efficient - * way to map the same (large) file into a given virtual window. Unlike - * mmap()/mremap() it does not create any new vmas. The new mappings are - * also safe across swapout. - * - * NOTE: the @prot parameter right now is ignored (but must be zero), - * and the vma's default protection is used. Arbitrary protections - * might be implemented in the future. - */ -SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, - unsigned long, prot, unsigned long, pgoff, unsigned long, flags) -{ - struct mm_struct *mm = current->mm; - struct address_space *mapping; - struct vm_area_struct *vma; - int err = -EINVAL; - int has_write_lock = 0; - vm_flags_t vm_flags = 0; - - if (prot) - return err; - /* - * Sanitize the syscall parameters: - */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; - - /* Does the address range wrap, or is the span zero-sized? */ - if (start + size <= start) - return err; - - /* Does pgoff wrap? */ - if (pgoff + (size >> PAGE_SHIFT) < pgoff) - return err; - - /* Can we represent this offset inside this architecture's pte's? */ -#if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) - return err; -#endif - - /* We need down_write() to change vma->vm_flags. */ - down_read(&mm->mmap_sem); - retry: - vma = find_vma(mm, start); - - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma. - */ - if (!vma || !(vma->vm_flags & VM_SHARED)) - goto out; - - if (!vma->vm_ops || !vma->vm_ops->remap_pages) - goto out; - - if (start < vma->vm_start || start + size > vma->vm_end) - goto out; - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (!(vma->vm_flags & VM_NONLINEAR)) { - /* - * vm_private_data is used as a swapout cursor - * in a VM_NONLINEAR vma. - */ - if (vma->vm_private_data) - goto out; - - /* Don't need a nonlinear mapping, exit success */ - if (pgoff == linear_page_index(vma, start)) { - err = 0; - goto out; - } - - if (!has_write_lock) { -get_write_lock: - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - /* - * page_mkclean doesn't work on nonlinear vmas, so if - * dirty pages need to be accounted, emulate with linear - * vmas. - */ - if (mapping_cap_account_dirty(mapping)) { - unsigned long addr; - struct file *file = get_file(vma->vm_file); - /* mmap_region may free vma; grab the info now */ - vm_flags = vma->vm_flags; - - addr = mmap_region(file, start, size, vm_flags, pgoff); - fput(file); - if (IS_ERR_VALUE(addr)) { - err = addr; - } else { - BUG_ON(addr != start); - err = 0; - } - goto out_freed; - } - mutex_lock(&mapping->i_mmap_mutex); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_interval_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); - } - - if (vma->vm_flags & VM_LOCKED) { - /* - * drop PG_Mlocked flag for over-mapped range - */ - if (!has_write_lock) - goto get_write_lock; - vm_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = vm_flags; - } - - mmu_notifier_invalidate_range_start(mm, start, start + size); - err = vma->vm_ops->remap_pages(vma, start, size, pgoff); - mmu_notifier_invalidate_range_end(mm, start, start + size); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). - */ - -out: - if (vma) - vm_flags = vma->vm_flags; -out_freed: - if (likely(!has_write_lock)) - up_read(&mm->mmap_sem); - else - up_write(&mm->mmap_sem); - if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) - mm_populate(start, size); - - return err; -} diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index ff0d9779cec8..dcdcadb69533 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "kmemleak: " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> @@ -50,25 +52,25 @@ static int __init kmemleak_test_init(void) printk(KERN_INFO "Kmemleak testing\n"); /* make some orphan objects */ - pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); #ifndef CONFIG_MODULES - pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = %p\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); - pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = %p\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); #endif - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); /* * Add elements to a list. They should only appear as orphan @@ -76,7 +78,7 @@ static int __init kmemleak_test_init(void) */ for (i = 0; i < 10; i++) { elem = kzalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); + pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; INIT_LIST_HEAD(&elem->list); @@ -85,7 +87,7 @@ static int __init kmemleak_test_init(void) for_each_possible_cpu(i) { per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); - pr_info("kmemleak: kmalloc(129) = %p\n", + pr_info("kmalloc(129) = %p\n", per_cpu(kmemleak_test_pointer, i)); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 736ade31d1dc..3cda50c1e394 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -387,7 +387,7 @@ static void dump_object_info(struct kmemleak_object *object) pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%lx\n", object->flags); - pr_notice(" checksum = %d\n", object->checksum); + pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -990,6 +990,40 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) EXPORT_SYMBOL_GPL(kmemleak_free_percpu); /** + * kmemleak_update_trace - update object allocation stack trace + * @ptr: pointer to beginning of the object + * + * Override the object allocation stack trace for cases where the actual + * allocation place is not always useful. + */ +void __ref kmemleak_update_trace(const void *ptr) +{ + struct kmemleak_object *object; + unsigned long flags; + + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr)) + return; + + object = find_and_get_object((unsigned long)ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Updating stack trace for unknown object at %p\n", + ptr); +#endif + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->trace_len = __save_stack_trace(object->trace); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); +} +EXPORT_SYMBOL(kmemleak_update_trace); + +/** * kmemleak_not_leak - mark an allocated object as false positive * @ptr: pointer to beginning of the object * diff --git a/mm/memblock.c b/mm/memblock.c index 0aa0d2b07624..6d2f219a48b0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -691,6 +691,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) (unsigned long long)base + size - 1, (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); return memblock_remove_range(&memblock.reserved, base, size); } @@ -1043,9 +1044,14 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, align = SMP_CACHE_BYTES; found = memblock_find_in_range_node(size, align, start, end, nid); - if (found && !memblock_reserve(found, size)) + if (found && !memblock_reserve(found, size)) { + /* + * The min_count is set to 0 so that memblock allocations are + * never reported as leaks. + */ + kmemleak_alloc(__va(found), size, 0, 0); return found; - + } return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 39a8e90ac78d..15bda8133ff9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -673,9 +673,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg) static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { - VM_BUG_ON((unsigned)nid >= nr_node_ids); + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } @@ -685,12 +687,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) +mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(memcg, nid, zid); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } static struct mem_cgroup_tree_per_zone * @@ -708,11 +710,9 @@ soft_limit_tree_from_page(struct page *page) return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; } -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -742,10 +742,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, mz->on_tree = true; } -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { if (!mz->on_tree) return; @@ -753,13 +751,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, mz->on_tree = false; } -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); spin_unlock(&mctz->lock); } @@ -769,16 +765,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) unsigned long long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree_from_page(page); /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = mem_cgroup_page_zoneinfo(memcg, page); excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or @@ -788,12 +782,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) spin_lock(&mctz->lock); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); } } @@ -801,15 +795,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - int node, zone; - struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_zone *mz; + int nid, zid; - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(memcg, mz, mctz); + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + mctz = soft_limit_tree_node_zone(nid, zid); + mem_cgroup_remove_exceeded(mz, mctz); } } } @@ -832,7 +826,7 @@ retry: * we will to add it back at the end of reclaim to its correct * position in the tree. */ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); if (!res_counter_soft_limit_excess(&mz->memcg->res) || !css_tryget_online(&mz->memcg->css)) goto retry; @@ -943,8 +937,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_zone *mz; @@ -952,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, - unsigned int lru_mask) +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, + unsigned int lru_mask) { - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - unsigned long ret = 0; - - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - for_each_lru(lru) { - if (BIT(lru) & lru_mask) - ret += mz->lru_size[lru]; - } - return ret; -} - -static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - u64 total = 0; + unsigned long nr = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(memcg, - nid, zid, lru_mask); + VM_BUG_ON((unsigned)nid >= nr_node_ids); - return total; + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct mem_cgroup_per_zone *mz; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + nr += mz->lru_size[lru]; + } + } + return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { + unsigned long nr = 0; int nid; - u64 total = 0; for_each_node_state(nid, N_MEMORY) - total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); - return total; + nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); + return nr; } static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -1240,11 +1225,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, int uninitialized_var(seq); if (reclaim) { - int nid = zone_to_nid(reclaim->zone); - int zid = zone_idx(reclaim->zone); struct mem_cgroup_per_zone *mz; - mz = mem_cgroup_zoneinfo(root, nid, zid); + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); iter = &mz->reclaim_iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) { iter->last_visited = NULL; @@ -1351,7 +1334,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, goto out; } - mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + mz = mem_cgroup_zone_zoneinfo(memcg, zone); lruvec = &mz->lruvec; out: /* @@ -1410,7 +1393,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) pc->mem_cgroup = memcg = root_mem_cgroup; - mz = page_cgroup_zoneinfo(memcg, page); + mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -1683,8 +1666,9 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) rcu_read_unlock(); - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", + pr_info("memory: usage %llukB, low_limit %llukB limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->res, RES_LOW_LIMIT) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->res, RES_FAILCNT)); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", @@ -2796,6 +2780,43 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return mem_cgroup_from_id(id); } +/** + * mem_cgroup_within_guarantee - checks whether given memcg is within its + * memory guarantee + * @memcg: target memcg for the reclaim + * @root: root of the reclaim hierarchy (null for the global reclaim) + * + * The given group is within its reclaim gurantee if it is below its low limit + * or the same applies for any parent up the hierarchy until root (including). + * Such a group might be excluded from the reclaim. + */ +bool mem_cgroup_within_guarantee(struct mem_cgroup *memcg, + struct mem_cgroup *root) +{ + do { + if (!res_counter_low_limit_excess(&memcg->res)) + return true; + if (memcg == root) + break; + + } while ((memcg = parent_mem_cgroup(memcg))); + + return false; +} + +bool mem_cgroup_all_within_guarantee(struct mem_cgroup *root) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, root) + if (!mem_cgroup_within_guarantee(iter, root)) { + mem_cgroup_iter_break(root, iter); + return false; + } + + return true; +} + struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@ -4595,7 +4616,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; } while (1); } - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); excess = res_counter_soft_limit_excess(&mz->memcg->res); /* * One school of thought says that we should not add @@ -4606,7 +4627,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * term TODO. */ /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); css_put(&mz->memcg->css); loop++; @@ -4790,6 +4811,10 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, if (mem_cgroup_is_root(memcg)) return -EINVAL; + pr_info_once("%s (%d): memory.force_empty is deprecated and will be " + "removed. Let us know if it is needed in your usecase at " + "linux-mm@kvack.org\n", + current->comm, task_pid_nr(current)); return mem_cgroup_force_empty(memcg) ?: nbytes; } @@ -5073,6 +5098,24 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, else return -EINVAL; break; + case RES_LOW_LIMIT: + if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ + ret = -EINVAL; + break; + } + ret = res_counter_memparse_write_strategy(buf, &val); + if (ret) + break; + if (type == _MEM) { + ret = res_counter_set_low_limit(&memcg->res, val); + break; + } + /* + * memsw low limit doesn't make any sense and kmem is not + * implemented yet - if ever + */ + return -EINVAL; + case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buf, &val); if (ret) @@ -5310,7 +5353,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; rstat = &mz->lruvec.reclaim_stat; recent_rotated[0] += rstat->recent_rotated[0]; @@ -5999,6 +6042,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_read_u64, }, { + .name = "low_limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_LOW_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), .write = mem_cgroup_write, @@ -6016,6 +6065,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "force_empty", + .flags = CFTYPE_INSANE, .write = mem_cgroup_force_empty_write, }, { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dbde41699272..8bc119909e1c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -65,6 +65,8 @@ kernel is not always grateful with that. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/highmem.h> @@ -91,6 +93,7 @@ #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -2592,7 +2595,7 @@ void __init numa_policy_init(void) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) - printk("numa_policy_init: interleaving failed\n"); + pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } diff --git a/mm/mempool.c b/mm/mempool.c index 455d468c3a5d..e209c98c7203 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/slab.h> +#include <linux/kmemleak.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/blkdev.h> @@ -222,6 +223,11 @@ repeat_alloc: spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); + /* + * Update the allocation stack trace as this is more useful + * for debugging. + */ + kmemleak_update_trace(element); return element; } diff --git a/mm/mmap.c b/mm/mmap.c index ced5efcdd4b6..af82b67c9f4d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -6,6 +6,8 @@ * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> @@ -37,6 +39,7 @@ #include <linux/sched/sysctl.h> #include <linux/notifier.h> #include <linux/memory.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -361,20 +364,20 @@ static int browse_rb(struct rb_root *root) struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); if (vma->vm_start < prev) { - printk("vm_start %lx prev %lx\n", vma->vm_start, prev); + pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); bug = 1; } if (vma->vm_start < pend) { - printk("vm_start %lx pend %lx\n", vma->vm_start, pend); + pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); bug = 1; } if (vma->vm_start > vma->vm_end) { - printk("vm_end %lx < vm_start %lx\n", + pr_info("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); bug = 1; } if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - printk("free gap %lx, correct %lx\n", + pr_info("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; @@ -388,7 +391,7 @@ static int browse_rb(struct rb_root *root) for (nd = pn; nd; nd = rb_prev(nd)) j++; if (i != j) { - printk("backwards %d, forwards %d\n", j, i); + pr_info("backwards %d, forwards %d\n", j, i); bug = 1; } return bug ? -1 : i; @@ -423,17 +426,17 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - printk("map_count %d vm_next %d\n", mm->map_count, i); + pr_info("map_count %d vm_next %d\n", mm->map_count, i); bug = 1; } if (highest_address != mm->highest_vm_end) { - printk("mm->highest_vm_end %lx, found %lx\n", + pr_info("mm->highest_vm_end %lx, found %lx\n", mm->highest_vm_end, highest_address); bug = 1; } i = browse_rb(&mm->mm_rb); if (i != mm->map_count) { - printk("map_count %d rb %d\n", mm->map_count, i); + pr_info("map_count %d rb %d\n", mm->map_count, i); bug = 1; } BUG_ON(bug); @@ -2578,6 +2581,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } + +/* + * Emulation of deprecated remap_file_pages() syscall. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; + struct file *file; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start < vma->vm_start || start + size > vma->vm_end) + goto out; + + if (pgoff == linear_page_index(vma, start)) { + ret = 0; + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) { + flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ + munlock_vma_pages_range(vma, start, start + size); + } + + file = get_file(vma->vm_file); + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); + fput(file); +out: + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM @@ -3280,7 +3352,7 @@ static struct notifier_block reserve_mem_nb = { static int __meminit init_reserve_notifier(void) { if (register_hotmemory_notifier(&reserve_mem_nb)) - printk("Failed registering memory add/remove notifier for admin reserve"); + pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 04a9d94333a5..7ed58602e71b 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -197,7 +197,6 @@ unsigned long __init free_all_bootmem(void) void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - kmemleak_free_part(__va(physaddr), size); memblock_free(physaddr, size); } @@ -212,7 +211,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { - kmemleak_free_part(__va(addr), size); memblock_free(addr, size); } diff --git a/mm/nommu.c b/mm/nommu.c index 85f8d6698d48..e6ced9d836dd 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -13,6 +13,8 @@ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/export.h> #include <linux/mm.h> #include <linux/vmacache.h> @@ -32,6 +34,7 @@ #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/sched/sysctl.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -1246,7 +1249,7 @@ error_free: return ret; enomem: - printk("Allocation of length %lu from process %d (%s) failed\n", + pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); show_free_areas(0); return -ENOMEM; @@ -1996,14 +1999,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_map_pages); -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 537d91cd3d28..ba037e32b8f3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1682,7 +1682,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(ctl_table *table, int write, +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 964130a74cff..7f9776747bde 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3389,7 +3389,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(ctl_table *table, int write, +int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5810,7 +5810,7 @@ module_init(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5827,7 +5827,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, } #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5843,7 +5843,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return 0; } -int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5869,7 +5869,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -5882,7 +5882,7 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; diff --git a/mm/page_io.c b/mm/page_io.c index 243a9b76e5ce..1ee120adbb45 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -274,9 +274,11 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, .count = PAGE_SIZE, .iov_offset = 0, .nr_segs = 1, - .bvec = &bv }; + /* Do this by hand because old gcc messes up the initializer */ + from.bvec = &bv; + init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); kiocb.ki_nbytes = PAGE_SIZE; diff --git a/mm/vmscan.c b/mm/vmscan.c index 3c2caadbdb0f..7c4eebdc6d44 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,6 +11,8 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/module.h> #include <linux/gfp.h> @@ -43,6 +45,7 @@ #include <linux/sysctl.h> #include <linux/oom.h> #include <linux/prefetch.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -83,6 +86,9 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ int priority; + /* anon vs. file LRUs scanning "ratio" */ + int swappiness; + /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. @@ -477,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __func__); + pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } @@ -1845,13 +1851,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct scan_control *sc) -{ - if (global_reclaim(sc)) - return vm_swappiness; - return mem_cgroup_swappiness(sc->target_mem_cgroup); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -1912,7 +1911,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + if (!global_reclaim(sc) && !sc->swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1922,7 +1921,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && vmscan_swappiness(sc)) { + if (!sc->priority && sc->swappiness) { scan_balance = SCAN_EQUAL; goto out; } @@ -1965,7 +1964,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); + anon_prio = sc->swappiness; file_prio = 200 - anon_prio; /* @@ -2230,9 +2229,21 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +/** + * __shrink_zone - shrinks a given zone + * + * @zone: zone to shrink + * @sc: scan control with additional reclaim parameters + * @honor_memcg_guarantee: do not reclaim memcgs which are within their memory + * guarantee + * + * Returns the number of reclaimed memcgs. + */ +static unsigned __shrink_zone(struct zone *zone, struct scan_control *sc, + bool honor_memcg_guarantee) { unsigned long nr_reclaimed, nr_scanned; + unsigned nr_scanned_groups = 0; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2249,8 +2260,22 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) do { struct lruvec *lruvec; + /* Memcg might be protected from the reclaim */ + if (honor_memcg_guarantee && + mem_cgroup_within_guarantee(memcg, root)) { + /* + * It would be more optimal to skip the memcg + * subtree now but we do not have a memcg iter + * helper for that. Anyone? + */ + memcg = mem_cgroup_iter(root, memcg, &reclaim); + continue; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + nr_scanned_groups++; + sc->swappiness = mem_cgroup_swappiness(memcg); shrink_lruvec(lruvec, sc); /* @@ -2277,6 +2302,27 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return nr_scanned_groups; +} + +static void shrink_zone(struct zone *zone, struct scan_control *sc) +{ + bool honor_guarantee = true; + + while (!__shrink_zone(zone, sc, honor_guarantee)) { + /* + * The previous round of reclaim didn't find anything to scan + * because + * a) the whole reclaimed hierarchy is within guarantee so + * we fallback to ignore the guarantee because other option + * would be the OOM + * b) multiple reclaimers are racing and so the first round + * should be retried + */ + if (mem_cgroup_all_within_guarantee(sc->target_mem_cgroup)) + honor_guarantee = false; + } } /* Returns true if compaction should go ahead for a high-order request */ @@ -2717,6 +2763,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_swap = !noswap, .order = 0, .priority = 0, + .swappiness = mem_cgroup_swappiness(memcg), .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |