diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2015-05-15 17:31:14 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2015-05-15 17:31:14 +1000 |
commit | c82e20ec7e60193d308401a6bc8aa37f4b08e0a4 (patch) | |
tree | 21bb403569ce8b41bf5ba2eeefed0fbd1c312f0e | |
parent | d608cb95d20a55228c29d4d6012e29df857c81be (diff) | |
parent | ecc81b6b2f2ed94041f0768234a3e9a2b441d329 (diff) |
Merge branch 'akpm-current/current'
Conflicts:
arch/x86/kernel/machine_kexec_64.c
254 files changed, 5207 insertions, 2327 deletions
@@ -84,6 +84,7 @@ Mayuresh Janorkar <mayur@ti.com> Michael Buesch <m@bues.ch> Michel Dänzer <michel@tungstengraphics.com> Mitesh shah <mshah@teja.com> +Mohit Kumar <mohit.kumar@st.com> <mohit.kumar.dhaka@gmail.com> Morten Welinder <terra@gnome.org> Morten Welinder <welinder@anemone.rentec.com> Morten Welinder <welinder@darter.rentec.com> @@ -95,6 +96,7 @@ Patrick Mochel <mochel@digitalimplant.org> Peter A Jonsson <pj@ludd.ltu.se> Peter Oruba <peter@oruba.de> Peter Oruba <peter.oruba@amd.com> +Pratyush Anand <pratyush.anand@gmail.com> <pratyush.anand@st.com> Praveen BP <praveenbp@ti.com> Rajesh Shah <rajesh.shah@intel.com> Ralf Baechle <ralf@linux-mips.org> diff --git a/Documentation/ABI/testing/configfs-spear-pcie-gadget b/Documentation/ABI/testing/configfs-spear-pcie-gadget index 875988146a63..840c324ef34d 100644 --- a/Documentation/ABI/testing/configfs-spear-pcie-gadget +++ b/Documentation/ABI/testing/configfs-spear-pcie-gadget @@ -1,7 +1,7 @@ What: /config/pcie-gadget Date: Feb 2011 KernelVersion: 2.6.37 -Contact: Pratyush Anand <pratyush.anand@st.com> +Contact: Pratyush Anand <pratyush.anand@gmail.com> Description: Interface is used to configure selected dual mode PCIe controller diff --git a/Documentation/ABI/testing/sysfs-bus-usb-lvstest b/Documentation/ABI/testing/sysfs-bus-usb-lvstest index aae68fc2d842..5151290cf8e7 100644 --- a/Documentation/ABI/testing/sysfs-bus-usb-lvstest +++ b/Documentation/ABI/testing/sysfs-bus-usb-lvstest @@ -4,14 +4,14 @@ driver is bound with root hub device. What: /sys/bus/usb/devices/.../get_dev_desc Date: March 2014 -Contact: Pratyush Anand <pratyush.anand@st.com> +Contact: Pratyush Anand <pratyush.anand@gmail.com> Description: Write to this node to issue "Get Device Descriptor" for Link Layer Validation device. It is needed for TD.7.06. What: /sys/bus/usb/devices/.../u1_timeout Date: March 2014 -Contact: Pratyush Anand <pratyush.anand@st.com> +Contact: Pratyush Anand <pratyush.anand@gmail.com> Description: Set "U1 timeout" for the downstream port where Link Layer Validation device is connected. Timeout value must be between 0 @@ -19,7 +19,7 @@ Description: What: /sys/bus/usb/devices/.../u2_timeout Date: March 2014 -Contact: Pratyush Anand <pratyush.anand@st.com> +Contact: Pratyush Anand <pratyush.anand@gmail.com> Description: Set "U2 timeout" for the downstream port where Link Layer Validation device is connected. Timeout value must be between 0 @@ -27,21 +27,21 @@ Description: What: /sys/bus/usb/devices/.../hot_reset Date: March 2014 -Contact: Pratyush Anand <pratyush.anand@st.com> +Contact: Pratyush Anand <pratyush.anand@gmail.com> Description: Write to this node to issue "Reset" for Link Layer Validation device. It is needed for TD.7.29, TD.7.31, TD.7.34 and TD.7.35. What: /sys/bus/usb/devices/.../u3_entry Date: March 2014 -Contact: Pratyush Anand <pratyush.anand@st.com> +Contact: Pratyush Anand <pratyush.anand@gmail.com> Description: Write to this node to issue "U3 entry" for Link Layer Validation device. It is needed for TD.7.35 and TD.7.36. What: /sys/bus/usb/devices/.../u3_exit Date: March 2014 -Contact: Pratyush Anand <pratyush.anand@st.com> +Contact: Pratyush Anand <pratyush.anand@gmail.com> Description: Write to this node to issue "U3 exit" for Link Layer Validation device. It is needed for TD.7.36. 
diff --git a/Documentation/ABI/testing/sysfs-class-zram b/Documentation/ABI/testing/sysfs-class-zram new file mode 100644 index 000000000000..48ddacbe0e69 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-zram @@ -0,0 +1,24 @@ +What: /sys/class/zram-control/ +Date: August 2015 +KernelVersion: 4.2 +Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Description: + The zram-control/ class sub-directory belongs to zram + device class + +What: /sys/class/zram-control/hot_add +Date: August 2015 +KernelVersion: 4.2 +Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Description: + RO attribute. Read operation will cause zram to add a new + device and return its device id back to user (so one can + use /dev/zram<id>), or error code. + +What: /sys/class/zram-control/hot_remove +Date: August 2015 +KernelVersion: 4.2 +Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Description: + WO attribute. Remove a specific /dev/zramX device, where X + is a device_id provided by user. diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 48a183e29988..c4de576093af 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -19,7 +19,9 @@ Following shows a typical sequence of steps for using zram. 1) Load Module: modprobe zram num_devices=4 This creates 4 devices: /dev/zram{0,1,2,3} - (num_devices parameter is optional. Default: 1) + +num_devices parameter is optional and tells zram how many devices should be +pre-created. Default: 1. 2) Set max number of compression streams Compression backend may use up to max_comp_streams compression streams, @@ -97,7 +99,24 @@ size of the disk when not in use so a huge zram is wasteful. mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -7) Stats: +7) Add/remove zram devices + +zram provides a control interface, which enables dynamic (on-demand) device +addition and removal. + +In order to add a new /dev/zramX device, perform read operation on hot_add +attribute. This will return either new device's device id (meaning that you +can use /dev/zram<id>) or error code. + +Example: + cat /sys/class/zram-control/hot_add + 1 + +To remove the existing /dev/zramX device (where X is a device id) +execute + echo X > /sys/class/zram-control/hot_remove + +8) Stats: Per-device statistics are exported as various nodes under /sys/block/zram<id>/ A brief description of exported device attritbutes. 
For more details please @@ -126,7 +145,7 @@ mem_used_max RW the maximum amount memory zram have consumed to mem_limit RW the maximum amount of memory ZRAM can use to store the compressed data num_migrated RO the number of objects migrated migrated by compaction - +compact WO trigger memory compaction WARNING ======= @@ -172,11 +191,11 @@ line of text and contains the following stats separated by whitespace: zero_pages num_migrated -8) Deactivate: +9) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -9) Reset: +10) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index ce1126aceed8..223c32171dcc 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block <bool>: 0,1,yes,no,true,false +LIMITATION +--------------------------------------------------------------------- +* The fallocated region of file is discarded at umount/evict time + when using fallocate with FALLOC_FL_KEEP_SIZE. + So, User should assume that fallocated region can be discarded at + last close if there is memory pressure resulting in eviction of + the inode from the memory. As a result, for any dependency on + the fallocated region, user should make sure to recheck fallocate + after reopening the file. + TODO ---------------------------------------------------------------------- * Need to get rid of the raw scanning stuff. Instead, always use diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt index ab0baa692c13..22dd6af2e4bd 100644 --- a/Documentation/lockup-watchdogs.txt +++ b/Documentation/lockup-watchdogs.txt @@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows administrators to configure the period of the hrtimer and the perf event. The right value for a particular environment is a trade-off between fast response to lockups and detection overhead. + +By default, the watchdog runs on all online cores. However, on a +kernel configured with NO_HZ_FULL, by default the watchdog runs only +on the housekeeping cores, not the cores specified in the "nohz_full" +boot argument. If we allowed the watchdog to run by default on +the "nohz_full" cores, we would have to run timer ticks to activate +the scheduler, which would prevent the "nohz_full" functionality +from protecting the user code on those cores from the kernel. +Of course, disabling it by default on the nohz_full cores means that +when those cores do enter the kernel, by default we will not be +able to detect if they lock up. However, allowing the watchdog +to continue to run on the housekeeping (non-tickless) cores means +that we will continue to detect lockups properly on those cores. + +In either case, the set of cores excluded from running the watchdog +may be adjusted via the kernel.watchdog_cpumask sysctl. For +nohz_full cores, this may be useful for debugging a case where the +kernel seems to be hanging on the nohz_full cores. 
diff --git a/Documentation/misc-devices/spear-pcie-gadget.txt b/Documentation/misc-devices/spear-pcie-gadget.txt index 02c13ef5e908..89b88dee4143 100644 --- a/Documentation/misc-devices/spear-pcie-gadget.txt +++ b/Documentation/misc-devices/spear-pcie-gadget.txt @@ -2,7 +2,7 @@ Spear PCIe Gadget Driver: Author ============= -Pratyush Anand (pratyush.anand@st.com) +Pratyush Anand (pratyush.anand@gmail.com) Location ============ diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 2216eb187c21..2ec6d84f391c 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -244,6 +244,14 @@ dentry names: Passed by reference. +task_struct comm name: + + %pT + + For printing task_struct->comm. + + Passed by reference (NULL for "current"). + struct va_format: %pV diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index c831001c45f1..e5d528e0c46e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -923,6 +923,27 @@ and nmi_watchdog. ============================================================== +watchdog_cpumask: + +This value can be used to control on which cpus the watchdog may run. +The default cpumask is all possible cores, but if NO_HZ_FULL is +enabled in the kernel config, and cores are specified with the +nohz_full= boot argument, those cores are excluded by default. +Offline cores can be included in this mask, and if the core is later +brought online, the watchdog will be started based on the mask value. + +Typically this value would only be touched in the nohz_full case +to re-enable cores that by default were not running the watchdog, +if a kernel lockup was suspected on those cores. + +The argument value is the standard cpulist format for cpumasks, +so for example to enable the watchdog on cores 0, 2, 3, and 4 you +might say: + + echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask + +============================================================== + watchdog_thresh: This value can be used to control the frequency of hrtimer and NMI diff --git a/MAINTAINERS b/MAINTAINERS index 3d5ab5350e31..2dc517cd4835 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1953,7 +1953,7 @@ S: Maintained F: drivers/net/wireless/b43legacy/ BACKLIGHT CLASS/SUBSYSTEM -M: Jingoo Han <jg1.han@samsung.com> +M: Jingoo Han <jingoohan1@gmail.com> M: Lee Jones <lee.jones@linaro.org> S: Maintained F: drivers/video/backlight/ @@ -2162,14 +2162,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/rpi/linux-rpi.git S: Maintained N: bcm2835 -BROADCOM BCM33XX MIPS ARCHITECTURE -M: Kevin Cernekee <cernekee@gmail.com> -L: linux-mips@linux-mips.org -S: Maintained -F: arch/mips/bcm3384/* -F: arch/mips/include/asm/mach-bcm3384/* -F: arch/mips/kernel/*bmips* - BROADCOM BCM5301X ARM ARCHITECTURE M: Hauke Mehrtens <hauke@hauke-m.de> L: linux-arm-kernel@lists.infradead.org @@ -2212,7 +2204,7 @@ S: Maintained F: arch/mips/bmips/* F: arch/mips/include/asm/mach-bmips/* F: arch/mips/kernel/*bmips* -F: arch/mips/boot/dts/bcm*.dts* +F: arch/mips/boot/dts/brcm/bcm*.dts* F: drivers/irqchip/irq-bcm7* F: drivers/irqchip/irq-brcmstb* @@ -2264,7 +2256,7 @@ M: Ray Jui <rjui@broadcom.com> L: bcm-kernel-feedback-list@broadcom.com S: Supported F: drivers/gpio/gpio-bcm-kona.c -F: Documentation/devicetree/bindings/gpio/gpio-bcm-kona.txt +F: Documentation/devicetree/bindings/gpio/brcm,kona-gpio.txt BROADCOM SPECIFIC AMBA DRIVER (BCMA) M: Rafał Miłecki <zajec5@gmail.com> @@ -3945,7 +3937,7 @@ F: drivers/extcon/ F: Documentation/extcon/ 
EXYNOS DP DRIVER -M: Jingoo Han <jg1.han@samsung.com> +M: Jingoo Han <jingoohan1@gmail.com> L: dri-devel@lists.freedesktop.org S: Maintained F: drivers/gpu/drm/exynos/exynos_dp* @@ -5136,11 +5128,10 @@ INTEL ASoC BDW/HSW DRIVERS M: Jie Yang <yang.jie@linux.intel.com> L: alsa-devel@alsa-project.org S: Supported -F: sound/soc/intel/sst-haswell* -F: sound/soc/intel/sst-dsp* -F: sound/soc/intel/sst-firmware.c -F: sound/soc/intel/broadwell.c -F: sound/soc/intel/haswell.c +F: sound/soc/intel/common/sst-dsp* +F: sound/soc/intel/common/sst-firmware.c +F: sound/soc/intel/boards/broadwell.c +F: sound/soc/intel/haswell/ INTEL C600 SERIES SAS CONTROLLER DRIVER M: Intel SCU Linux support <intel-linux-scu@intel.com> @@ -6834,7 +6825,6 @@ L: nbd-general@lists.sourceforge.net T: git git://git.pengutronix.de/git/mpa/linux-nbd.git F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c -F: include/linux/nbd.h F: include/uapi/linux/nbd.h NETWORK DROP MONITOR @@ -7459,7 +7449,6 @@ F: arch/*/include/asm/paravirt.h PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES M: Tim Waugh <tim@cyberelk.net> L: linux-parport@lists.infradead.org (subscribers-only) -W: http://www.torque.net/linux-pp.html S: Maintained F: Documentation/blockdev/paride.txt F: drivers/block/paride/ @@ -7611,7 +7600,7 @@ S: Maintained F: drivers/pci/host/*rcar* PCI DRIVER FOR SAMSUNG EXYNOS -M: Jingoo Han <jg1.han@samsung.com> +M: Jingoo Han <jingoohan1@gmail.com> L: linux-pci@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org (moderated for non-subscribers) @@ -7619,7 +7608,7 @@ S: Maintained F: drivers/pci/host/pci-exynos.c PCI DRIVER FOR SYNOPSIS DESIGNWARE -M: Jingoo Han <jg1.han@samsung.com> +M: Jingoo Han <jingoohan1@gmail.com> L: linux-pci@vger.kernel.org S: Maintained F: drivers/pci/host/*designware* @@ -8574,7 +8563,7 @@ S: Supported F: sound/soc/samsung/ SAMSUNG FRAMEBUFFER DRIVER -M: Jingoo Han <jg1.han@samsung.com> +M: Jingoo Han <jingoohan1@gmail.com> L: linux-fbdev@vger.kernel.org S: Maintained F: drivers/video/fbdev/s3c-fb.c diff --git a/arch/Kconfig b/arch/Kconfig index a65eafb24997..bec6666a3cc4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -499,6 +499,13 @@ config ARCH_HAS_ELF_RANDOMIZE - arch_mmap_rnd() - arch_randomize_brk() +config HAVE_COPY_THREAD_TLS + bool + help + Architecture provides copy_thread_tls to accept tls argument via + normal C parameter passing, rather than extracting the syscall + argument from pt_regs. + # # ABI hall of shame # diff --git a/arch/alpha/include/asm/mm-arch-hooks.h b/arch/alpha/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b07fd862fec3 --- /dev/null +++ b/arch/alpha/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_ALPHA_MM_ARCH_HOOKS_H +#define _ASM_ALPHA_MM_ARCH_HOOKS_H + +#endif /* _ASM_ALPHA_MM_ARCH_HOOKS_H */ diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 0086b472bc2b..836fbd44f65b 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -44,6 +44,7 @@ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_SPACEAVAIL 5 /* ensure resources are available */ #define MADV_DONTNEED 6 /* don't need these pages */ +#define MADV_FREE 7 /* free pages only if memory pressure */ /* common/generic parameters */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h index 45b8e0cea176..f787894613ad 100644 --- a/arch/arc/include/asm/dma-mapping.h +++ b/arch/arc/include/asm/dma-mapping.h @@ -178,22 +178,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, } static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir) { int i; + struct scatterlist *sg; - for (i = 0; i < nelems; i++, sg++) + for_each_sg(sglist, sg, nelems, i) _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir); } static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction dir) +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction dir) { int i; + struct scatterlist *sg; - for (i = 0; i < nelems; i++, sg++) + for_each_sg(sglist, sg, nelems, i) _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir); } diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..c37541c5f8ba --- /dev/null +++ b/arch/arc/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_ARC_MM_ARCH_HOOKS_H +#define _ASM_ARC_MM_ARCH_HOOKS_H + +#endif /* _ASM_ARC_MM_ARCH_HOOKS_H */ diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 1f1b1cd112f3..31bb7dccb971 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -53,10 +53,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline int huge_pte_none(pte_t pte) { return pte_none(pte); diff --git a/arch/arm/include/asm/mm-arch-hooks.h b/arch/arm/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..7056660c7cc4 --- /dev/null +++ b/arch/arm/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_ARM_MM_ARCH_HOOKS_H +#define _ASM_ARM_MM_ARCH_HOOKS_H + +#endif /* _ASM_ARM_MM_ARCH_HOOKS_H */ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index a745a2a53853..6d6012a320b2 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF); PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING); PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY); PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY); +PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY); PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index c72412415093..fcafb521f14e 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -41,11 +41,6 @@ int pud_huge(pud_t pud) return 0; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 5b7ca8ace95f..734c17e89e94 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -86,10 +86,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline int huge_pte_none(pte_t pte) { return pte_none(pte); diff --git a/arch/arm64/include/asm/mm-arch-hooks.h b/arch/arm64/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..562b655f5ba9 --- /dev/null +++ b/arch/arm64/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_ARM64_MM_ARCH_HOOKS_H +#define _ASM_ARM64_MM_ARCH_HOOKS_H + +#endif /* _ASM_ARM64_MM_ARCH_HOOKS_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 56283f8a675c..bd5db28324ba 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -285,10 +285,12 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) +#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd))) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) +#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2de9d2e59d96..cccc4af87a03 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -31,13 +31,6 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> -#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} -#endif - int pmd_huge(pmd_t pmd) { return !(pmd_val(pmd) & PMD_TABLE_BIT); diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h index b3d18f9f3e8d..ae7ac9205d20 100644 --- a/arch/avr32/include/asm/dma-mapping.h +++ b/arch/avr32/include/asm/dma-mapping.h @@ -209,17 +209,18 @@ dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, * the same here. 
*/ static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; - for (i = 0; i < nents; i++) { + for_each_sg(sglist, sg, nents, i) { char *virt; - sg[i].dma_address = page_to_bus(sg_page(&sg[i])) + sg[i].offset; - virt = sg_virt(&sg[i]); - dma_cache_sync(dev, virt, sg[i].length, direction); + sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; + virt = sg_virt(sg); + dma_cache_sync(dev, virt, sg->length, direction); } return nents; @@ -321,14 +322,14 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, } static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; - for (i = 0; i < nents; i++) { - dma_cache_sync(dev, sg_virt(&sg[i]), sg[i].length, direction); - } + for_each_sg(sglist, sg, nents, i) + dma_cache_sync(dev, sg_virt(sg), sg->length, direction); } /* Now for the API extensions over the pci_ one */ diff --git a/arch/avr32/include/asm/mm-arch-hooks.h b/arch/avr32/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..145452ffbdad --- /dev/null +++ b/arch/avr32/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_AVR32_MM_ARCH_HOOKS_H +#define _ASM_AVR32_MM_ARCH_HOOKS_H + +#endif /* _ASM_AVR32_MM_ARCH_HOOKS_H */ diff --git a/arch/blackfin/include/asm/mm-arch-hooks.h b/arch/blackfin/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..1c5211ec338f --- /dev/null +++ b/arch/blackfin/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_BLACKFIN_MM_ARCH_HOOKS_H +#define _ASM_BLACKFIN_MM_ARCH_HOOKS_H + +#endif /* _ASM_BLACKFIN_MM_ARCH_HOOKS_H */ diff --git a/arch/c6x/include/asm/mm-arch-hooks.h b/arch/c6x/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..bb3c4a6ce8e9 --- /dev/null +++ b/arch/c6x/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_C6X_MM_ARCH_HOOKS_H +#define _ASM_C6X_MM_ARCH_HOOKS_H + +#endif /* _ASM_C6X_MM_ARCH_HOOKS_H */ diff --git a/arch/cris/include/asm/mm-arch-hooks.h b/arch/cris/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..314f774db2b0 --- /dev/null +++ b/arch/cris/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_CRIS_MM_ARCH_HOOKS_H +#define _ASM_CRIS_MM_ARCH_HOOKS_H + +#endif /* _ASM_CRIS_MM_ARCH_HOOKS_H */ diff --git a/arch/frv/include/asm/mm-arch-hooks.h b/arch/frv/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..51d13a870404 --- /dev/null +++ b/arch/frv/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_FRV_MM_ARCH_HOOKS_H +#define _ASM_FRV_MM_ARCH_HOOKS_H + +#endif /* _ASM_FRV_MM_ARCH_HOOKS_H */ diff --git a/arch/frv/include/asm/sections.h b/arch/frv/include/asm/sections.h index 17d0fb171bba..d03fb64e93e9 100644 --- a/arch/frv/include/asm/sections.h +++ b/arch/frv/include/asm/sections.h @@ -35,12 +35,6 @@ extern unsigned long __nongprelbss memory_start; extern unsigned long __nongprelbss memory_end; extern unsigned long __nongprelbss rom_length; -/* determine if we're running from ROM */ -static inline int is_in_rom(unsigned long addr) -{ - return 0; /* default case: not in ROM */ -} - #endif #endif #endif /* _ASM_SECTIONS_H */ diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c index b99c2a7cc7a4..8eeea0d77aad 100644 --- a/arch/frv/mb93090-mb00/pci-dma-nommu.c +++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c @@ -119,14 +119,16 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, EXPORT_SYMBOL(dma_map_single); -int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; - for (i=0; i<nents; i++) - frv_cache_wback_inv(sg_dma_address(&sg[i]), - sg_dma_address(&sg[i]) + sg_dma_len(&sg[i])); + for_each_sg(sglist, sg, nents, i) { + frv_cache_wback_inv(sg_dma_address(sg), + sg_dma_address(sg) + sg_dma_len(sg)); + } BUG_ON(direction == DMA_NONE); diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c index 82478979ac9a..4d1f01dc46e5 100644 --- a/arch/frv/mb93090-mb00/pci-dma.c +++ b/arch/frv/mb93090-mb00/pci-dma.c @@ -50,19 +50,20 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, EXPORT_SYMBOL(dma_map_single); -int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { unsigned long dampr2; void *vaddr; int i; + struct scatterlist *sg; BUG_ON(direction == DMA_NONE); dampr2 = __get_DAMPR(2); - for (i = 0; i < nents; i++) { - vaddr = kmap_atomic_primary(sg_page(&sg[i])); + for_each_sg(sglist, 
sg, nents, i) { + vaddr = kmap_atomic_primary(sg_page(sg)); frv_dcache_writeback((unsigned long) vaddr, (unsigned long) vaddr + PAGE_SIZE); diff --git a/arch/hexagon/include/asm/mm-arch-hooks.h b/arch/hexagon/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..05e8b939e416 --- /dev/null +++ b/arch/hexagon/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_HEXAGON_MM_ARCH_HOOKS_H +#define _ASM_HEXAGON_MM_ARCH_HOOKS_H + +#endif /* _ASM_HEXAGON_MM_ARCH_HOOKS_H */ diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index aa910054b8e7..ff1377bc02a6 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -20,10 +20,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { diff --git a/arch/ia64/include/asm/mm-arch-hooks.h b/arch/ia64/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..ab4b5c698322 --- /dev/null +++ b/arch/ia64/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_IA64_MM_ARCH_HOOKS_H +#define _ASM_IA64_MM_ARCH_HOOKS_H + +#endif /* _ASM_IA64_MM_ARCH_HOOKS_H */ diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 52b7604b5215..f50d4b3f501a 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -65,11 +65,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) return pte; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } /* diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index ea21d4cad540..aa19b7ac8222 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -58,27 +58,22 @@ paddr_to_nid(unsigned long paddr) * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where * the section resides. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. 
- */ - static int __meminitdata last_ssec, last_esec; - static int __meminitdata last_nid; - if (section >= last_ssec && section < last_esec) - return last_nid; + if (section >= state->last_start && section < state->last_end) + return state->last_nid; for (i = 0; i < num_node_memblks; i++) { ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT; esec = (node_memblk[i].start_paddr + node_memblk[i].size + ((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT; if (section >= ssec && section < esec) { - last_ssec = ssec; - last_esec = esec; - last_nid = node_memblk[i].nid; + state->last_start = ssec; + state->last_end = esec; + state->last_nid = node_memblk[i].nid; return node_memblk[i].nid; } } diff --git a/arch/m32r/include/asm/mm-arch-hooks.h b/arch/m32r/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..6d60b4750f41 --- /dev/null +++ b/arch/m32r/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_M32R_MM_ARCH_HOOKS_H +#define _ASM_M32R_MM_ARCH_HOOKS_H + +#endif /* _ASM_M32R_MM_ARCH_HOOKS_H */ diff --git a/arch/m68k/include/asm/mm-arch-hooks.h b/arch/m68k/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..7e8709bc90ae --- /dev/null +++ b/arch/m68k/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_M68K_MM_ARCH_HOOKS_H +#define _ASM_M68K_MM_ARCH_HOOKS_H + +#endif /* _ASM_M68K_MM_ARCH_HOOKS_H */ diff --git a/arch/metag/include/asm/hugetlb.h b/arch/metag/include/asm/hugetlb.h index 471f481e67f3..f730b396d79b 100644 --- a/arch/metag/include/asm/hugetlb.h +++ b/arch/metag/include/asm/hugetlb.h @@ -14,10 +14,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len); -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/metag/include/asm/mm-arch-hooks.h b/arch/metag/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b0072b2eb0de --- /dev/null +++ b/arch/metag/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_METAG_MM_ARCH_HOOKS_H +#define _ASM_METAG_MM_ARCH_HOOKS_H + +#endif /* _ASM_METAG_MM_ARCH_HOOKS_H */ diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 7ca80ac42ed5..53f0f6c47027 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -89,11 +89,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - int pmd_huge(pmd_t pmd) { return pmd_page_shift(pmd) > PAGE_SHIFT; diff --git a/arch/microblaze/include/asm/mm-arch-hooks.h b/arch/microblaze/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..5c4065911bda --- /dev/null +++ b/arch/microblaze/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_MICROBLAZE_MM_ARCH_HOOKS_H +#define _ASM_MICROBLAZE_MM_ARCH_HOOKS_H + +#endif /* _ASM_MICROBLAZE_MM_ARCH_HOOKS_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index fe0d15d32660..4a5bb5453408 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -38,10 +38,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, diff --git a/arch/mips/include/asm/mm-arch-hooks.h b/arch/mips/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b5609fe8e475 --- /dev/null +++ b/arch/mips/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_MIPS_MM_ARCH_HOOKS_H +#define _ASM_MIPS_MM_ARCH_HOOKS_H + +#endif /* _ASM_MIPS_MM_ARCH_HOOKS_H */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 819af9d057a8..9d8106758142 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -568,12 +568,12 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) } /* - * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a + * The generic version pmdp_huge_get_and_clear uses a version of pmd_clear() with a * different prototype. 
*/ -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) { pmd_t old = *pmdp; diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index cfcb876cae6b..106e741aa7ee 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -67,6 +67,7 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ +#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index 609d1241b0c4..eeaf0245c3b1 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -262,12 +262,13 @@ static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, plat_unmap_dma_mem(dev, dma_addr, size, direction); } -static int mips_dma_map_sg(struct device *dev, struct scatterlist *sg, +static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction, struct dma_attrs *attrs) { int i; + struct scatterlist *sg; - for (i = 0; i < nents; i++, sg++) { + for_each_sg(sglist, sg, nents, i) { if (!plat_device_is_coherent(dev)) __dma_sync(sg_page(sg), sg->offset, sg->length, direction); @@ -291,13 +292,14 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page, return plat_map_dma_mem_page(dev, page) + offset; } -static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sg, +static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nhwentries, enum dma_data_direction direction, struct dma_attrs *attrs) { int i; + struct scatterlist *sg; - for (i = 0; i < nhwentries; i++, sg++) { + for_each_sg(sglist, sg, nhwentries, i) { if (!plat_device_is_coherent(dev) && direction != DMA_TO_DEVICE) __dma_sync(sg_page(sg), sg->offset, sg->length, @@ -324,26 +326,34 @@ static void mips_dma_sync_single_for_device(struct device *dev, } static void mips_dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sg, int nelems, enum dma_data_direction direction) + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) { int i; + struct scatterlist *sg; - if (cpu_needs_post_dma_flush(dev)) - for (i = 0; i < nelems; i++, sg++) + if (cpu_needs_post_dma_flush(dev)) { + for_each_sg(sglist, sg, nelems, i) { __dma_sync(sg_page(sg), sg->offset, sg->length, direction); + } + } plat_post_dma_flush(dev); } static void mips_dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sg, int nelems, enum dma_data_direction direction) + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) { int i; + struct scatterlist *sg; - if (!plat_device_is_coherent(dev)) - for (i = 0; i < nelems; i++, sg++) + if (!plat_device_is_coherent(dev)) { + for_each_sg(sglist, sg, nelems, i) { __dma_sync(sg_page(sg), sg->offset, sg->length, direction); + } + } } int mips_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 06e0f421b41b..74aa6f62468f 100644 --- a/arch/mips/mm/hugetlbpage.c +++ 
b/arch/mips/mm/hugetlbpage.c @@ -51,11 +51,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - /* * This function checks for proper alignment of input addr and len parameters. */ diff --git a/arch/mn10300/include/asm/mm-arch-hooks.h b/arch/mn10300/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..e2029a652f4c --- /dev/null +++ b/arch/mn10300/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_MN10300_MM_ARCH_HOOKS_H +#define _ASM_MN10300_MM_ARCH_HOOKS_H + +#endif /* _ASM_MN10300_MM_ARCH_HOOKS_H */ diff --git a/arch/nios2/include/asm/mm-arch-hooks.h b/arch/nios2/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d7290dc68558 --- /dev/null +++ b/arch/nios2/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_NIOS2_MM_ARCH_HOOKS_H +#define _ASM_NIOS2_MM_ARCH_HOOKS_H + +#endif /* _ASM_NIOS2_MM_ARCH_HOOKS_H */ diff --git a/arch/openrisc/include/asm/mm-arch-hooks.h b/arch/openrisc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..6d33cb555fe1 --- /dev/null +++ b/arch/openrisc/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_OPENRISC_MM_ARCH_HOOKS_H +#define _ASM_OPENRISC_MM_ARCH_HOOKS_H + +#endif /* _ASM_OPENRISC_MM_ARCH_HOOKS_H */ diff --git a/arch/parisc/include/asm/mm-arch-hooks.h b/arch/parisc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..654ec63b0ee9 --- /dev/null +++ b/arch/parisc/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_PARISC_MM_ARCH_HOOKS_H +#define _ASM_PARISC_MM_ARCH_HOOKS_H + +#endif /* _ASM_PARISC_MM_ARCH_HOOKS_H */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 294d251ca7b2..6cb8db76fd4e 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -40,6 +40,7 @@ #define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ #define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ #define MADV_VPS_INHERIT 7 /* Inherit parents page size */ +#define MADV_FREE 8 /* free pages only if memory pressure */ /* common/generic parameters */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ff834fd67478..b9402c9b3454 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -478,14 +478,16 @@ static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, siz static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++, sglist++ ) { - unsigned long vaddr = (unsigned long)sg_virt(sglist); - sg_dma_address(sglist) = (dma_addr_t) virt_to_phys(vaddr); - sg_dma_len(sglist) = sglist->length; - flush_kernel_dcache_range(vaddr, sglist->length); + for_each_sg(sglist, sg, nents, i) { + unsigned long vaddr = (unsigned long)sg_virt(sg); + + sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr); + sg_dma_len(sg) = sg->length; + flush_kernel_dcache_range(vaddr, sg->length); } return nents; } @@ -493,6 +495,7 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; BUG_ON(direction == DMA_NONE); @@ -501,8 +504,8 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ - for (i = 0; i < nents; i++, sglist++ ) - flush_kernel_vmap_range(sg_virt(sglist), sglist->length); + for_each_sg(sglist, sg, nents, i) + flush_kernel_vmap_range(sg_virt(sg), sg->length); return; } @@ -523,21 +526,23 @@ static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_h static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ - for (i = 0; i < nents; i++, sglist++ ) - flush_kernel_vmap_range(sg_virt(sglist), sglist->length); + for_each_sg(sglist, sg, nents, i) + flush_kernel_vmap_range(sg_virt(sg), sg->length); } static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ - for (i = 0; i < nents; i++, sglist++ ) - flush_kernel_vmap_range(sg_virt(sglist), sglist->length); + for_each_sg(sglist, sg, nents, i) + flush_kernel_vmap_range(sg_virt(sg), sg->length); } struct hppa_dma_ops pcxl_dma_ops = { diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 1d53a65b4ec1..4bbd3c8c2888 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ 
b/arch/powerpc/include/asm/hugetlb.h @@ -112,11 +112,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - - static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..f2a2da895897 --- /dev/null +++ b/arch/powerpc/include/asm/mm-arch-hooks.h @@ -0,0 +1,28 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H +#define _ASM_POWERPC_MM_ARCH_HOOKS_H + +static inline void arch_remap(struct mm_struct *mm, + unsigned long old_start, unsigned long old_end, + unsigned long new_start, unsigned long new_end) +{ + /* + * mremap() doesn't allow moving multiple vmas so we can limit the + * check to old_start == vdso_base. + */ + if (old_start == mm->context.vdso_base) + mm->context.vdso_base = new_start; +} +#define arch_remap arch_remap + +#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 73382eba02dc..825cb232eab6 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -8,7 +8,6 @@ #include <linux/spinlock.h> #include <asm/mmu.h> #include <asm/cputable.h> -#include <asm-generic/mm_hooks.h> #include <asm/cputhreads.h> /* @@ -109,5 +108,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, #endif } +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + +static inline void arch_unmap(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (start <= mm->context.vdso_base && mm->context.vdso_base < end) + mm->context.vdso_base = 0; +} + +static inline void arch_bprm_mm_init(struct mm_struct *mm, + struct vm_area_struct *vma) +{ +} + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_MMU_CONTEXT_H */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index f951d9cf358a..07dbe968dc02 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -491,9 +491,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) +#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) +#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) @@ -553,13 +555,9 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -extern pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t 
*pmdp); - -#define __HAVE_ARCH_PMDP_CLEAR_FLUSH -extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp); +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, @@ -576,6 +574,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, extern void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +#define pmdp_collapse_flush pmdp_collapse_flush + #define __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 5bfdab9047be..b7a68442b767 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -557,11 +557,11 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, struct vio_dev *viodev = to_vio_dev(dev); struct iommu_table *tbl; struct scatterlist *sgl; - int ret, count = 0; + int ret, count; size_t alloc_size = 0; tbl = get_iommu_table_base(dev); - for (sgl = sglist; count < nelems; count++, sgl++) + for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); if (vio_cmo_alloc(viodev, alloc_size)) { @@ -577,7 +577,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, return ret; } - for (sgl = sglist, count = 0; count < ret; count++, sgl++) + for_each_sg(sglist, sgl, ret, count) alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); if (alloc_size) vio_cmo_dealloc(viodev, alloc_size); @@ -594,10 +594,10 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, struct iommu_table *tbl; struct scatterlist *sgl; size_t alloc_size = 0; - int count = 0; + int count; tbl = get_iommu_table_base(dev); - for (sgl = sglist; count < nelems; count++, sgl++) + for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 3385e3d0506e..38bd5d998c81 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -439,11 +439,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate) } #endif -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - #ifdef CONFIG_PPC_FSL_BOOK3E #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 6bfadf1aa5cb..876232d64126 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -554,47 +554,42 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, return old; } -pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - if (pmd_trans_huge(*pmdp)) { - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); - } else { - /* - * khugepaged calls this for normal pmd - */ - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. 
This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - kick_all_cpus_sync(); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - } + VM_BUG_ON(pmd_trans_huge(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. 
+ */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); return pmd; } @@ -817,8 +812,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, return; } -pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) +pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { pmd_t old_pmd; pgtable_t pgtable; diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 11eae5f55b70..dfb542ade6b1 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -35,7 +35,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -#define hugetlb_prefault_arch_hook(mm) do { } while (0) #define arch_clear_hugepage_flags(page) do { } while (0) int arch_prepare_hugepage(struct page *page); diff --git a/arch/s390/include/asm/mm-arch-hooks.h b/arch/s390/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..07680b2f3c59 --- /dev/null +++ b/arch/s390/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_S390_MM_ARCH_HOOKS_H +#define _ASM_S390_MM_ARCH_HOOKS_H + +#endif /* _ASM_S390_MM_ARCH_HOOKS_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index fc642399b489..4565b33ed94b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1498,9 +1498,9 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, return pmd_young(pmd); } -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; @@ -1509,10 +1509,10 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, return pmd; } -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL -static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm, - unsigned long address, - pmd_t *pmdp, int full) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL +static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp, int full) { pmd_t pmd = *pmdp; @@ -1522,11 +1522,11 @@ static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm, return pmd; } -#define __HAVE_ARCH_PMDP_CLEAR_FLUSH -static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH +static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { - return pmdp_get_and_clear(vma->vm_mm, address, pmdp); + return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #define __HAVE_ARCH_PMDP_INVALIDATE diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c index d7fa2f0f1425..f8498dde67b1 100644 --- a/arch/s390/kernel/compat_wrapper.c +++ b/arch/s390/kernel/compat_wrapper.c @@ -202,7 +202,7 @@ COMPAT_SYSCALL_WRAP1(epoll_create1, int, flags); COMPAT_SYSCALL_WRAP2(tkill, int, pid, int, sig); COMPAT_SYSCALL_WRAP3(tgkill, int, tgid, int, pid, int, sig); COMPAT_SYSCALL_WRAP5(perf_event_open, 
struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags); -COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val); +COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls); COMPAT_SYSCALL_WRAP2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags); COMPAT_SYSCALL_WRAP4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim); COMPAT_SYSCALL_WRAP5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, int __user *, mnt_id, int, flag); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e617e74b7be2..c3f8e3df92ff 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -193,11 +193,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmdp; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - int pmd_huge(pmd_t pmd) { if (!MACHINE_HAS_HPAGE) diff --git a/arch/score/include/asm/mm-arch-hooks.h b/arch/score/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..5e38689f189a --- /dev/null +++ b/arch/score/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_SCORE_MM_ARCH_HOOKS_H +#define _ASM_SCORE_MM_ARCH_HOOKS_H + +#endif /* _ASM_SCORE_MM_ARCH_HOOKS_H */ diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 699255d6d1c6..b788a9bc8918 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/sh/include/asm/mm-arch-hooks.h b/arch/sh/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..18087298b728 --- /dev/null +++ b/arch/sh/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_SH_MM_ARCH_HOOKS_H +#define _ASM_SH_MM_ARCH_HOOKS_H + +#endif /* _ASM_SH_MM_ARCH_HOOKS_H */ diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 534bc978af8a..6385f60209b6 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -62,11 +62,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - int pmd_huge(pmd_t pmd) { return 0; diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index e4cab465b81f..3130d7636312 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -11,10 +11,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { diff --git a/arch/sparc/include/asm/mm-arch-hooks.h b/arch/sparc/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..b89ba44c16f1 --- /dev/null +++ b/arch/sparc/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_SPARC_MM_ARCH_HOOKS_H +#define _ASM_SPARC_MM_ARCH_HOOKS_H + +#endif /* _ASM_SPARC_MM_ARCH_HOOKS_H */ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index dc165ebdf05a..9953d3c86e8b 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -697,6 +697,15 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return __pmd(pte_val(pte)); } +static inline pmd_t pmd_mkclean(pmd_t pmd) +{ + pte_t pte = __pte(pmd_val(pmd)); + + pte = pte_mkclean(pte); + + return __pmd(pte_val(pte)); +} + static inline pmd_t pmd_mkyoung(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); @@ -845,10 +854,10 @@ static inline unsigned long pud_pfn(pud_t pud) void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t orig, int fullmm); -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long addr, - pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmdp) { pmd_t pmd = *pmdp; set_pmd_at(mm, addr, pmdp, __pmd(0UL)); diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c index 7d3ca30fcd15..1ae5eb1bb045 100644 --- a/arch/sparc/kernel/ldc.c +++ b/arch/sparc/kernel/ldc.c @@ -2086,6 +2086,7 @@ int ldc_map_sg(struct ldc_channel *lp, struct cookie_state state; struct ldc_iommu *iommu; int err; + struct scatterlist *s; if (map_perm & ~LDC_MAP_ALL) return -EINVAL; @@ -2112,9 +2113,10 @@ int ldc_map_sg(struct ldc_channel *lp, state.pte_idx = (base - iommu->page_table); state.nc = 0; - for (i = 0; i < num_sg; i++) - fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT, - sg[i].offset, sg[i].length); + for_each_sg(sg, s, num_sg, i) { + fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT, + s->offset, s->length); + } return state.nc; } diff --git 
a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 4242eab12e10..131eaf4ad7f5 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -172,11 +172,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h index 3257733003f8..1abd00c55236 100644 --- a/arch/tile/include/asm/hugetlb.h +++ b/arch/tile/include/asm/hugetlb.h @@ -40,10 +40,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) -{ -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/tile/include/asm/mm-arch-hooks.h b/arch/tile/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d1709ea774f7 --- /dev/null +++ b/arch/tile/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_TILE_MM_ARCH_HOOKS_H +#define _ASM_TILE_MM_ARCH_HOOKS_H + +#endif /* _ASM_TILE_MM_ARCH_HOOKS_H */ diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 95a4f19d16c5..2b05ccbebed9 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -414,10 +414,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, } -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, - pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) { return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp))); } diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 8416240c322c..c034dc3fe2d4 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -160,11 +160,6 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, diff --git a/arch/um/include/asm/mm-arch-hooks.h b/arch/um/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..a7c8b0dfdd4e --- /dev/null +++ b/arch/um/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_UM_MM_ARCH_HOOKS_H +#define _ASM_UM_MM_ARCH_HOOKS_H + +#endif /* _ASM_UM_MM_ARCH_HOOKS_H */ diff --git a/arch/unicore32/include/asm/mm-arch-hooks.h b/arch/unicore32/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..4d79a850c509 --- /dev/null +++ b/arch/unicore32/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_UNICORE32_MM_ARCH_HOOKS_H +#define _ASM_UNICORE32_MM_ARCH_HOOKS_H + +#endif /* _ASM_UNICORE32_MM_ARCH_HOOKS_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fb3f245bb717..c92fdcc9a793 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,6 +18,7 @@ config X86_64 select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF select HAVE_LIVEPATCH + select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT ### Arch settings config X86 @@ -126,6 +127,7 @@ config X86 select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 + select HAVE_COPY_THREAD_TLS select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_QUEUED_RWLOCKS diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 63450a596800..83e4ed291b11 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -564,7 +564,7 @@ GLOBAL(\label) ALIGN GLOBAL(stub32_clone) leaq sys_clone(%rip),%rax - mov %r8, %rcx + xchg %r8, %rcx jmp ia32_ptregs_common ALIGN diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 68c05398bba9..dab7a3a750bf 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..4e881a342236 --- /dev/null +++ b/arch/x86/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_X86_MM_ARCH_HOOKS_H +#define _ASM_X86_MM_ARCH_HOOKS_H + +#endif /* _ASM_X86_MM_ARCH_HOOKS_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fe57e7a98839..4383012950b0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -267,6 +267,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_ACCESSED); } +static inline pmd_t pmd_mkclean(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_DIRTY); +} + static inline pmd_t pmd_wrprotect(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_RW); @@ -799,8 +804,8 @@ static inline int pmd_write(pmd_t pmd) return pmd_flags(pmd) & _PAGE_RW; } -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index ca05f86481aa..ca83f7ac388b 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -72,15 +72,16 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, unsigned long cmdline_len) { char *cmdline_ptr = ((char *)params) + cmdline_offset; - unsigned long cmdline_ptr_phys, len; + unsigned long cmdline_ptr_phys, len = 0; uint32_t cmdline_low_32, cmdline_ext_32; - memcpy(cmdline_ptr, cmdline, cmdline_len); if (image->type == KEXEC_TYPE_CRASH) { - len = sprintf(cmdline_ptr + cmdline_len - 1, - " elfcorehdr=0x%lx", image->arch.elf_load_addr); - cmdline_len += len; + len = sprintf(cmdline_ptr, + "elfcorehdr=0x%lx ", image->arch.elf_load_addr); } + memcpy(cmdline_ptr + len, cmdline, cmdline_len); + cmdline_len += len; + cmdline_ptr[cmdline_len - 1] = '\0'; pr_debug("Final command line is: %s\n", cmdline_ptr); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index e1029633f664..5e7047263b35 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -336,6 +336,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a99900cedc22..d4cb245225cc 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 82134506faa8..12f416bdcd5a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct 
task_struct *t, int tls) return get_desc_base(&t->thread.tls_array[tls]); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { int err; struct pt_regs *childregs; @@ -208,10 +208,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, #ifdef CONFIG_IA32_EMULATION if (is_ia32_task()) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + err = do_arch_prctl(p, ARCH_SET_FS, tls); if (err) goto out; } diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h index ba78ccf651e7..1f5f6dc09736 100644 --- a/arch/xtensa/include/asm/dma-mapping.h +++ b/arch/xtensa/include/asm/dma-mapping.h @@ -52,14 +52,15 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, } static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, +dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) { int i; + struct scatterlist *sg; BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++, sg++ ) { + for_each_sg(sglist, sg, nents, i) { BUG_ON(!sg_page(sg)); sg->dma_address = sg_phys(sg); @@ -124,20 +125,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, consistent_sync((void *)bus_to_virt(dma_handle)+offset,size,direction); } static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction dir) { int i; - for (i = 0; i < nelems; i++, sg++) + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) consistent_sync(sg_virt(sg), sg->length, dir); } static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction dir) +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction dir) { int i; - for (i = 0; i < nelems; i++, sg++) + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) consistent_sync(sg_virt(sg), sg->length, dir); } static inline int diff --git a/arch/xtensa/include/asm/mm-arch-hooks.h b/arch/xtensa/include/asm/mm-arch-hooks.h new file mode 100644 index 000000000000..d2e5cfd3dd02 --- /dev/null +++ b/arch/xtensa/include/asm/mm-arch-hooks.h @@ -0,0 +1,15 @@ +/* + * Architecture specific mm hooks + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
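The scatterlist hunks in this merge (powerpc vio.c, sparc ldc.c, and the xtensa helpers just above) all make the same substitution: an open-coded sg++ walk becomes for_each_sg(). Plain pointer arithmetic is wrong once a scatterlist is chained, because the last entry of each chunk is a link to the next chunk rather than a data entry. A minimal sketch of the idiom; sg_total_len() is a hypothetical helper:

	#include <linux/scatterlist.h>

	static size_t sg_total_len(struct scatterlist *sglist, int nents)
	{
		struct scatterlist *sg;
		size_t total = 0;
		int i;

		/* for_each_sg() advances via sg_next(), which follows chain links */
		for_each_sg(sglist, sg, nents, i)
			total += sg->length;

		return total;
	}
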
+ */ + +#ifndef _ASM_XTENSA_MM_ARCH_HOOKS_H +#define _ASM_XTENSA_MM_ARCH_HOOKS_H + +#endif /* _ASM_XTENSA_MM_ARCH_HOOKS_H */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 201aec0e0446..1b19f25bc567 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -80,6 +80,7 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ +#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/block/genhd.c b/block/genhd.c index 0a536dc05f3b..64600e911aaa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -850,7 +850,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/base/node.c b/drivers/base/node.c index a2aa65b4215d..31df474d72f4 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -359,12 +359,16 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #define page_initialized(page) (page->lru.next) -static int get_nid_for_pfn(unsigned long pfn) +static int __init_refok get_nid_for_pfn(unsigned long pfn) { struct page *page; if (!pfn_valid_within(pfn)) return -1; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + if (system_state == SYSTEM_BOOTING) + return early_pfn_to_nid(pfn); +#endif page = pfn_to_page(pfn); if (!page_initialized(page)) return -1; diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 6489c0fd0ea6..386ba3d1a6ee 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -23,12 +23,4 @@ config ZRAM_LZ4_COMPRESS default n help This option enables LZ4 compression algorithm support. Compression - algorithm can be changed using `comp_algorithm' device attribute. - -config ZRAM_DEBUG - bool "Compressed RAM block device debug support" - depends on ZRAM - default n - help - This option adds additional debugging code to the compressed - RAM block device driver. + algorithm can be changed using `comp_algorithm' device attribute.
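For the new MADV_FREE value above: unlike MADV_DONTNEED, the pages are not torn down immediately; the kernel reclaims them lazily and only under memory pressure, and a later write to a page cancels the hint for that page. A hedged userspace sketch, with release_cache() as a hypothetical caller:

	#include <stddef.h>
	#include <sys/mman.h>

	static int release_cache(void *buf, size_t len)
	{
		/* Kernels without MADV_FREE fail this call with EINVAL. */
		return madvise(buf, len, MADV_FREE);
	}
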
\ No newline at end of file diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8dcbced0eafd..f4018839462a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -15,10 +15,6 @@ #define KMSG_COMPONENT "zram" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#ifdef CONFIG_ZRAM_DEBUG -#define DEBUG -#endif - #include <linux/module.h> #include <linux/kernel.h> #include <linux/bio.h> @@ -32,12 +28,16 @@ #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/err.h> +#include <linux/idr.h> +#include <linux/sysfs.h> #include "zram_drv.h" -/* Globals */ +static DEFINE_IDR(zram_index_idr); +/* idr index must be protected */ +static DEFINE_MUTEX(zram_index_mutex); + static int zram_major; -static struct zram *zram_devices; static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ @@ -53,7 +53,7 @@ static inline void deprecated_attr_warn(const char *name) } #define ZRAM_ATTR_RO(name) \ -static ssize_t name##_show(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ @@ -74,33 +74,117 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static ssize_t compact_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +/* flag operations require table entry bit_spin_lock() being held */ +static int zram_test_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) { - unsigned long nr_migrated; - struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; + return meta->table[index].value & BIT(flag); +} - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); - return -EINVAL; - } +static void zram_set_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value |= BIT(flag); +} - meta = zram->meta; - nr_migrated = zs_compact(meta->mem_pool); - atomic64_add(nr_migrated, &zram->stats.num_migrated); - up_read(&zram->init_lock); +static void zram_clear_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value &= ~BIT(flag); +} - return len; +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) { - struct zram *zram = dev_to_zram(dev); + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; - return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; +} + +static inline int is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + +/* + * Check if request is within bounds and aligned on zram logical blocks. 
+ */ +static inline int valid_io_request(struct zram *zram, + sector_t start, unsigned int size) +{ + u64 end, bound; + + /* unaligned request */ + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + return 0; + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + return 0; + + end = start + (size >> SECTOR_SHIFT); + bound = zram->disksize >> SECTOR_SHIFT; + /* out of range range */ + if (unlikely(start >= bound || end > bound || start > end)) + return 0; + + /* I/O request is valid */ + return 1; +} + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = (*offset + bvec->bv_len) % PAGE_SIZE; +} + +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + unsigned long old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + &zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + +static int page_zero_filled(void *ptr) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + + for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos]) + return 0; + } + + return 1; +} + +static void handle_zero_page(struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + void *user_mem; + + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); + kunmap_atomic(user_mem); + + flush_dcache_page(page); } static ssize_t initstate_show(struct device *dev, @@ -116,6 +200,14 @@ static ssize_t initstate_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%u\n", val); } +static ssize_t disksize_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); +} + static ssize_t orig_data_size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -143,19 +235,6 @@ static ssize_t mem_used_total_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); } -static ssize_t max_comp_streams_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - int val; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - val = zram->max_comp_streams; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%d\n", val); -} - static ssize_t mem_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -225,6 +304,19 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%d\n", val); +} + static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -282,65 +374,95 @@ static ssize_t comp_algorithm_store(struct device *dev, return len; } -/* flag operations needs meta->tb_lock */ -static int zram_test_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { - return meta->table[index].value & BIT(flag); -} + unsigned long 
nr_migrated; + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta; -static void zram_set_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].value |= BIT(flag); -} + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } -static void zram_clear_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].value &= ~BIT(flag); + meta = zram->meta; + nr_migrated = zs_compact(meta->mem_pool); + atomic64_add(nr_migrated, &zram->stats.num_migrated); + up_read(&zram->init_lock); + + return len; } -static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +static ssize_t io_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.failed_reads), + (u64)atomic64_read(&zram->stats.failed_writes), + (u64)atomic64_read(&zram->stats.invalid_io), + (u64)atomic64_read(&zram->stats.notify_free)); + up_read(&zram->init_lock); + + return ret; } -static void zram_set_obj_size(struct zram_meta *meta, - u32 index, size_t size) +static ssize_t mm_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + struct zram *zram = dev_to_zram(dev); + u64 orig_size, mem_used = 0; + long max_used; + ssize_t ret; - meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; + down_read(&zram->init_lock); + if (init_done(zram)) + mem_used = zs_get_total_pages(zram->meta->mem_pool); + + orig_size = atomic64_read(&zram->stats.pages_stored); + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, + zram->limit_pages << PAGE_SHIFT, + max_used << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.zero_pages), + (u64)atomic64_read(&zram->stats.num_migrated)); + up_read(&zram->init_lock); + + return ret; } -static inline int is_partial_io(struct bio_vec *bvec) +static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + +static inline bool zram_meta_get(struct zram *zram) { - return bvec->bv_len != PAGE_SIZE; + if (atomic_inc_not_zero(&zram->refcount)) + return true; + return false; } -/* - * Check if request is within bounds and aligned on zram logical blocks. 
- */ -static inline int valid_io_request(struct zram *zram, - sector_t start, unsigned int size) +static inline void zram_meta_put(struct zram *zram) { - u64 end, bound; - - /* unaligned request */ - if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; - if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; - - end = start + (size >> SECTOR_SHIFT); - bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ - if (unlikely(start >= bound || end > bound || start > end)) - return 0; - - /* I/O request is valid */ - return 1; + atomic_dec(&zram->refcount); } static void zram_meta_free(struct zram_meta *meta, u64 disksize) @@ -394,56 +516,6 @@ out_error: return NULL; } -static inline bool zram_meta_get(struct zram *zram) -{ - if (atomic_inc_not_zero(&zram->refcount)) - return true; - return false; -} - -static inline void zram_meta_put(struct zram *zram) -{ - atomic_dec(&zram->refcount); -} - -static void update_position(u32 *index, int *offset, struct bio_vec *bvec) -{ - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; - *offset = (*offset + bvec->bv_len) % PAGE_SIZE; -} - -static int page_zero_filled(void *ptr) -{ - unsigned int pos; - unsigned long *page; - - page = (unsigned long *)ptr; - - for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { - if (page[pos]) - return 0; - } - - return 1; -} - -static void handle_zero_page(struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - - /* * To protect concurrent access to the same index entry, * caller should hold this table index entry's bit_spinlock to @@ -561,21 +633,6 @@ out_cleanup: return ret; } -static inline void update_used_max(struct zram *zram, - const unsigned long pages) -{ - unsigned long old_max, cur_max; - - old_max = atomic_long_read(&zram->stats.max_used_pages); - - do { - cur_max = old_max; - if (pages > cur_max) - old_max = atomic_long_cmpxchg( - &zram->stats.max_used_pages, cur_max, pages); - } while (old_max != cur_max); -} - static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { @@ -703,35 +760,6 @@ out: return ret; } -static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, int rw) -{ - unsigned long start_time = jiffies; - int ret; - - generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, - &zram->disk->part0); - - if (rw == READ) { - atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset); - } else { - atomic64_inc(&zram->stats.num_writes); - ret = zram_bvec_write(zram, bvec, index, offset); - } - - generic_end_io_acct(rw, &zram->disk->part0, start_time); - - if (unlikely(ret)) { - if (rw == READ) - atomic64_inc(&zram->stats.failed_reads); - else - atomic64_inc(&zram->stats.failed_writes); - } - - return ret; -} - /* * zram_bio_discard - handler on discard request * @index: physical block index in PAGE_SIZE units @@ -771,149 +799,32 @@ static void zram_bio_discard(struct zram *zram, u32 index, } } -static void zram_reset_device(struct zram *zram) -{ - struct zram_meta *meta; - struct zcomp *comp; - u64 disksize; - - down_write(&zram->init_lock); - - zram->limit_pages = 0; - - if (!init_done(zram)) { - up_write(&zram->init_lock); - return; - } - - meta = zram->meta; - comp = zram->comp; - disksize = zram->disksize; - /* - * Refcount will go down 
to 0 eventually and r/w handler - * cannot handle further I/O so it will bail out by - * check zram_meta_get. - */ - zram_meta_put(zram); - /* - * We want to free zram_meta in process context to avoid - * deadlock between reclaim path and any other locks. - */ - wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); - - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); - zram->disksize = 0; - zram->max_comp_streams = 1; - set_capacity(zram->disk, 0); - - up_write(&zram->init_lock); - /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(meta, disksize); - zcomp_destroy(comp); -} - -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - u64 disksize; - struct zcomp *comp; - struct zram_meta *meta; - struct zram *zram = dev_to_zram(dev); - int err; - - disksize = memparse(buf, NULL); - if (!disksize) - return -EINVAL; - - disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(zram->disk->first_minor, disksize); - if (!meta) - return -ENOMEM; - - comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (IS_ERR(comp)) { - pr_info("Cannot initialise %s compressing backend\n", - zram->compressor); - err = PTR_ERR(comp); - goto out_free_meta; - } - - down_write(&zram->init_lock); - if (init_done(zram)) { - pr_info("Cannot change disksize for initialized device\n"); - err = -EBUSY; - goto out_destroy_comp; - } - - init_waitqueue_head(&zram->io_done); - atomic_set(&zram->refcount, 1); - zram->meta = meta; - zram->comp = comp; - zram->disksize = disksize; - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); - - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. - */ - revalidate_disk(zram->disk); - - return len; - -out_destroy_comp: - up_write(&zram->init_lock); - zcomp_destroy(comp); -out_free_meta: - zram_meta_free(meta, disksize); - return err; -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset, int rw) { + unsigned long start_time = jiffies; int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - if (!bdev) - return -ENOMEM; + generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + &zram->disk->part0); - mutex_lock(&bdev->bd_mutex); - /* Do not reset an active device! 
*/ - if (bdev->bd_openers) { - ret = -EBUSY; - goto out; + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); + ret = zram_bvec_read(zram, bvec, index, offset); + } else { + atomic64_inc(&zram->stats.num_writes); + ret = zram_bvec_write(zram, bvec, index, offset); } - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - goto out; + generic_end_io_acct(rw, &zram->disk->part0, start_time); - if (!do_reset) { - ret = -EINVAL; - goto out; + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); } - /* Make sure all pending I/O is finished */ - fsync_bdev(bdev); - zram_reset_device(zram); - - mutex_unlock(&bdev->bd_mutex); - revalidate_disk(zram->disk); - bdput(bdev); - - return len; - -out: - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); return ret; } @@ -1053,80 +964,183 @@ out: return err; } -static const struct block_device_operations zram_devops = { - .swap_slot_free_notify = zram_slot_free_notify, - .rw_page = zram_rw_page, - .owner = THIS_MODULE -}; +static void zram_reset_device(struct zram *zram) +{ + struct zram_meta *meta; + struct zcomp *comp; + u64 disksize; -static DEVICE_ATTR_WO(compact); -static DEVICE_ATTR_RW(disksize); -static DEVICE_ATTR_RO(initstate); -static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR_RO(orig_data_size); -static DEVICE_ATTR_RO(mem_used_total); -static DEVICE_ATTR_RW(mem_limit); -static DEVICE_ATTR_RW(mem_used_max); -static DEVICE_ATTR_RW(max_comp_streams); -static DEVICE_ATTR_RW(comp_algorithm); + down_write(&zram->init_lock); -static ssize_t io_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) + zram->limit_pages = 0; + + if (!init_done(zram)) { + up_write(&zram->init_lock); + return; + } + + meta = zram->meta; + comp = zram->comp; + disksize = zram->disksize; + /* + * Refcount will go down to 0 eventually and r/w handler + * cannot handle further I/O so it will bail out by + * check zram_meta_get. + */ + zram_meta_put(zram); + /* + * We want to free zram_meta in process context to avoid + * deadlock between reclaim path and any other locks. 
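The reset path here quiesces I/O with nothing more than a refcount and a waitqueue: every I/O path takes a reference through zram_meta_get() and bails out once that fails, while reset drops the initial reference and sleeps until all in-flight holders are gone. The two sides of the idiom in miniature (a sketch; io_path() and reset_path() are hypothetical, the zram names come from this hunk):

	static int io_path(struct zram *zram)
	{
		if (!zram_meta_get(zram))	/* atomic_inc_not_zero(&zram->refcount) */
			return -ENODEV;		/* teardown already started */
		/* ... perform the read or write ... */
		zram_meta_put(zram);
		return 0;
	}

	static void reset_path(struct zram *zram)
	{
		zram_meta_put(zram);		/* drop the initial reference */
		wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
		/* no I/O can be in flight past this point */
	}
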
+ */ + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); + + /* Reset stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); + zram->disksize = 0; + zram->max_comp_streams = 1; + set_capacity(zram->disk, 0); + + up_write(&zram->init_lock); + /* All pending I/O on every CPU has finished, so it is safe to free */ + zram_meta_free(meta, disksize); + zcomp_destroy(comp); +} + +static ssize_t disksize_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { + u64 disksize; + struct zcomp *comp; + struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - ssize_t ret; + int err; - down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8llu\n", - (u64)atomic64_read(&zram->stats.failed_reads), - (u64)atomic64_read(&zram->stats.failed_writes), - (u64)atomic64_read(&zram->stats.invalid_io), - (u64)atomic64_read(&zram->stats.notify_free)); - up_read(&zram->init_lock); + disksize = memparse(buf, NULL); + if (!disksize) + return -EINVAL; - return ret; + disksize = PAGE_ALIGN(disksize); + meta = zram_meta_alloc(zram->disk->first_minor, disksize); + if (!meta) + return -ENOMEM; + + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (IS_ERR(comp)) { + pr_info("Cannot initialise %s compressing backend\n", + zram->compressor); + err = PTR_ERR(comp); + goto out_free_meta; + } + + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_destroy_comp; + } + + init_waitqueue_head(&zram->io_done); + atomic_set(&zram->refcount, 1); + zram->meta = meta; + zram->comp = comp; + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity.
+ */ + revalidate_disk(zram->disk); + + return len; + +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: + zram_meta_free(meta, disksize); + return err; } -static ssize_t mm_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { - struct zram *zram = dev_to_zram(dev); - u64 orig_size, mem_used = 0; - long max_used; - ssize_t ret; + int ret; + unsigned short do_reset; + struct zram *zram; + struct block_device *bdev; - down_read(&zram->init_lock); - if (init_done(zram)) - mem_used = zs_get_total_pages(zram->meta->mem_pool); + ret = kstrtou16(buf, 10, &do_reset); + if (ret) + return ret; - orig_size = atomic64_read(&zram->stats.pages_stored); - max_used = atomic_long_read(&zram->stats.max_used_pages); + if (!do_reset) + return -EINVAL; - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", - orig_size << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.compr_data_size), - mem_used << PAGE_SHIFT, - zram->limit_pages << PAGE_SHIFT, - max_used << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.zero_pages), - (u64)atomic64_read(&zram->stats.num_migrated)); - up_read(&zram->init_lock); + zram = dev_to_zram(dev); + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + /* Do not reset an active or claimed device */ + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + /* From now on, no one can open /dev/zram[0-9] */ + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* Make sure all pending I/O is finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + revalidate_disk(zram->disk); + bdput(bdev); + + mutex_lock(&bdev->bd_mutex); + zram->claim = false; + mutex_unlock(&bdev->bd_mutex); + + return len; +} + +static int zram_open(struct block_device *bdev, fmode_t mode) +{ + int ret = 0; + struct zram *zram; + + WARN_ON(!mutex_is_locked(&bdev->bd_mutex)); + + zram = bdev->bd_disk->private_data; + /* zram was claimed for reset, so open requests fail */ + if (zram->claim) + ret = -EBUSY; return ret; } -static DEVICE_ATTR_RO(io_stat); -static DEVICE_ATTR_RO(mm_stat); -ZRAM_ATTR_RO(num_reads); -ZRAM_ATTR_RO(num_writes); -ZRAM_ATTR_RO(failed_reads); -ZRAM_ATTR_RO(failed_writes); -ZRAM_ATTR_RO(invalid_io); -ZRAM_ATTR_RO(notify_free); -ZRAM_ATTR_RO(zero_pages); -ZRAM_ATTR_RO(compr_data_size); +static const struct block_device_operations zram_devops = { + .open = zram_open, + .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, + .owner = THIS_MODULE +}; + +static DEVICE_ATTR_WO(compact); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1156,10 +1170,24 @@ static struct attribute_group zram_disk_attr_group = { .attrs = zram_disk_attrs, }; -static int create_device(struct zram *zram, int device_id) +/* + * Allocate and initialize a new zram device. The function returns + * a '>= 0' device_id upon success, and a negative value otherwise.
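zram_add() below replaces the fixed, pre-sized device array with an IDR, so ids are handed out on demand and can be released in any order by hot_remove. The allocation pattern in isolation (a sketch; register_obj() and unregister_obj() are hypothetical names):

	#include <linux/idr.h>
	#include <linux/gfp.h>

	static DEFINE_IDR(obj_idr);

	static int register_obj(void *ptr)
	{
		/* start == 0, end == 0: any available non-negative id */
		return idr_alloc(&obj_idr, ptr, 0, 0, GFP_KERNEL);
	}

	static void unregister_obj(int id)
	{
		idr_remove(&obj_idr, id);
	}

hot_remove_store() below uses exactly this pairing, with an idr_find() lookup under zram_index_mutex before the remove.
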
+ */ +static int zram_add(void) { + struct zram *zram; struct request_queue *queue; - int ret = -ENOMEM; + int ret, device_id; + + zram = kzalloc(sizeof(struct zram), GFP_KERNEL); + if (!zram) + return -ENOMEM; + + ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_dev; + device_id = ret; init_rwsem(&zram->init_lock); @@ -1167,12 +1195,13 @@ static int create_device(struct zram *zram, int device_id) if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); - goto out; + ret = -ENOMEM; + goto out_free_idr; } blk_queue_make_request(queue, zram_make_request); - /* gendisk structure */ + /* gendisk structure */ zram->disk = alloc_disk(1); if (!zram->disk) { pr_warn("Error allocating disk structure for device %d\n", @@ -1230,90 +1259,177 @@ static int create_device(struct zram *zram, int device_id) strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; - return 0; + + pr_info("Added device: %s\n", zram->disk->disk_name); + return device_id; out_free_disk: del_gendisk(zram->disk); put_disk(zram->disk); out_free_queue: blk_cleanup_queue(queue); -out: +out_free_idr: + idr_remove(&zram_index_idr, device_id); +out_free_dev: + kfree(zram); return ret; } -static void destroy_devices(unsigned int nr) +static int zram_remove(struct zram *zram) +{ + struct block_device *bdev; + + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* + * Remove sysfs first, so no one will perform a disksize + * store while we destroy the devices. This also helps during + * hot_remove -- zram_reset_device() is the last holder of + * ->init_lock, no later/concurrent disksize_store() or any + * other sysfs handlers are possible. 
+ */ + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + + /* Make sure all pending I/O is finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + bdput(bdev); + + pr_info("Removed device: %s\n", zram->disk->disk_name); + + idr_remove(&zram_index_idr, zram->disk->first_minor); + blk_cleanup_queue(zram->disk->queue); + del_gendisk(zram->disk); + put_disk(zram->disk); + kfree(zram); + return 0; +} + +/* zram-control sysfs attributes */ +static ssize_t hot_add_show(struct class *class, + struct class_attribute *attr, + char *buf) +{ + int ret; + + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + + if (ret < 0) + return ret; + return scnprintf(buf, PAGE_SIZE, "%d\n", ret); +} + +static ssize_t hot_remove_store(struct class *class, + struct class_attribute *attr, + const char *buf, + size_t count) { struct zram *zram; - unsigned int i; + int ret, dev_id; - for (i = 0; i < nr; i++) { - zram = &zram_devices[i]; - /* - * Remove sysfs first, so no one will perform a disksize - * store while we destroy the devices - */ - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); + /* dev_id is gendisk->first_minor, which is `int' */ + ret = kstrtoint(buf, 10, &dev_id); + if (ret) + return ret; + if (dev_id < 0) + return -EINVAL; - zram_reset_device(zram); + mutex_lock(&zram_index_mutex); - blk_cleanup_queue(zram->disk->queue); - del_gendisk(zram->disk); - put_disk(zram->disk); - } + zram = idr_find(&zram_index_idr, dev_id); + if (zram) + ret = zram_remove(zram); + else + ret = -ENODEV; + + mutex_unlock(&zram_index_mutex); + return ret ? ret : count; +} + +static struct class_attribute zram_control_class_attrs[] = { + __ATTR_RO(hot_add), + __ATTR_WO(hot_remove), + __ATTR_NULL, +}; + +static struct class zram_control_class = { + .name = "zram-control", + .owner = THIS_MODULE, + .class_attrs = zram_control_class_attrs, +}; + +static int zram_remove_cb(int id, void *ptr, void *data) +{ + zram_remove(ptr); + return 0; +} - kfree(zram_devices); +static void destroy_devices(void) +{ + class_unregister(&zram_control_class); + idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); - pr_info("Destroyed %u device(s)\n", nr); } static int __init zram_init(void) { - int ret, dev_id; + int ret; - if (num_devices > max_num_devices) { - pr_warn("Invalid value for num_devices: %u\n", - num_devices); - return -EINVAL; + ret = class_register(&zram_control_class); + if (ret) { + pr_warn("Unable to register zram-control class\n"); + return ret; } zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_warn("Unable to get major number\n"); + class_unregister(&zram_control_class); return -EBUSY; } - /* Allocate the device array and initialize each one */ - zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); - if (!zram_devices) { - unregister_blkdev(zram_major, "zram"); - return -ENOMEM; - } - - for (dev_id = 0; dev_id < num_devices; dev_id++) { - ret = create_device(&zram_devices[dev_id], dev_id); - if (ret) + while (num_devices != 0) { + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + if (ret < 0) goto out_error; + num_devices--; } - pr_info("Created %u device(s)\n", num_devices); return 0; out_error: - destroy_devices(dev_id); + destroy_devices(); return ret; } static void __exit zram_exit(void) { - destroy_devices(); + destroy_devices(); } module_init(zram_init);
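With the zram-control class registered above, device management becomes a sysfs handshake: reading hot_add both creates a device and returns its id, and writing that id to hot_remove deletes it. A userspace sketch (zram_hot_add() is a hypothetical helper; the sysfs paths are the ones added by this patch):

	#include <stdio.h>

	static int zram_hot_add(void)
	{
		FILE *f = fopen("/sys/class/zram-control/hot_add", "r");
		int id = -1;

		if (f) {
			if (fscanf(f, "%d", &id) != 1)
				id = -1;	/* unexpected contents */
			fclose(f);
		}
		return id;	/* device is /dev/zram<id>, or -1 on failure */
	}

Removing the device again is a plain write of the same id to /sys/class/zram-control/hot_remove.
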
module_exit(zram_exit); module_param(num_devices, uint, 0); -MODULE_PARM_DESC(num_devices, "Number of zram devices"); +MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 570c598f4ce9..6dbe2df506bf 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -20,12 +20,6 @@ #include "zcomp.h" -/* - * Some arbitrary value. This is just to catch - * invalid value for num_devices module parameter. - */ -static const unsigned max_num_devices = 32; - /*-- Configurable parameters */ /* @@ -121,5 +115,9 @@ struct zram { */ u64 disksize; /* bytes */ char compressor[10]; + /* + * zram is claimed, so open requests will fail + */ + bool claim; /* Protected by bdev->bd_mutex */ }; #endif diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index aeabaa5aedf7..48db922075e2 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -419,10 +419,10 @@ static int jmb38x_ms_issue_cmd(struct memstick_host *msh) } if (host->cmd_flags & DMA_DATA) { - if (1 != pci_map_sg(host->chip->pdev, &host->req->sg, 1, + if (1 != dma_map_sg(&host->chip->pdev->dev, &host->req->sg, 1, host->req->data_dir == READ - ? PCI_DMA_FROMDEVICE - : PCI_DMA_TODEVICE)) { + ? DMA_FROM_DEVICE + : DMA_TO_DEVICE)) { host->req->error = -ENOMEM; return host->req->error; } @@ -487,9 +487,9 @@ static void jmb38x_ms_complete_cmd(struct memstick_host *msh, int last) writel(0, host->addr + DMA_CONTROL); if (host->cmd_flags & DMA_DATA) { - pci_unmap_sg(host->chip->pdev, &host->req->sg, 1, + dma_unmap_sg(&host->chip->pdev->dev, &host->req->sg, 1, host->req->data_dir == READ - ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); + ?
DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { t_val = readl(host->addr + INT_STATUS_ENABLE); if (host->req->data_dir == READ) @@ -925,7 +925,7 @@ static int jmb38x_ms_probe(struct pci_dev *pdev, int pci_dev_busy = 0; int rc, cnt; - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); if (rc) return rc; diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c index e2a4f5f415b2..ef09ba0289d7 100644 --- a/drivers/memstick/host/r592.c +++ b/drivers/memstick/host/r592.c @@ -754,7 +754,7 @@ static int r592_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto error2; pci_set_master(pdev); - error = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + error = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); if (error) goto error3; @@ -787,8 +787,8 @@ static int r592_probe(struct pci_dev *pdev, const struct pci_device_id *id) } /* This is just a precation, so don't fail */ - dev->dummy_dma_page = pci_alloc_consistent(pdev, PAGE_SIZE, - &dev->dummy_dma_page_physical_address); + dev->dummy_dma_page = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &dev->dummy_dma_page_physical_address, GFP_KERNEL); r592_stop_dma(dev , 0); if (request_irq(dev->irq, &r592_irq, IRQF_SHARED, @@ -805,7 +805,7 @@ error7: free_irq(dev->irq, dev); error6: if (dev->dummy_dma_page) - pci_free_consistent(pdev, PAGE_SIZE, dev->dummy_dma_page, + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->dummy_dma_page, dev->dummy_dma_page_physical_address); kthread_stop(dev->io_thread); @@ -845,7 +845,7 @@ static void r592_remove(struct pci_dev *pdev) memstick_free_host(dev->host); if (dev->dummy_dma_page) - pci_free_consistent(pdev, PAGE_SIZE, dev->dummy_dma_page, + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->dummy_dma_page, dev->dummy_dma_page_physical_address); } diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index 36f5d52775a9..9a60bd4d3c49 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -220,7 +220,7 @@ static unsigned long lookup_addr(char *arg) else if (!strcmp(arg, "sys_open")) addr = (unsigned long)do_sys_open; else if (!strcmp(arg, "do_fork")) - addr = (unsigned long)do_fork; + addr = (unsigned long)_do_fork; else if (!strcmp(arg, "hw_break_val")) addr = (unsigned long)&hw_break_val; addr = (unsigned long) dereference_function_descriptor((void *)addr); diff --git a/drivers/misc/spear13xx_pcie_gadget.c b/drivers/misc/spear13xx_pcie_gadget.c index fe3ad0ca9a3e..b8374cdaf9c9 100644 --- a/drivers/misc/spear13xx_pcie_gadget.c +++ b/drivers/misc/spear13xx_pcie_gadget.c @@ -2,7 +2,7 @@ * drivers/misc/spear13xx_pcie_gadget.c * * Copyright (C) 2010 ST Microelectronics - * Pratyush Anand<pratyush.anand@st.com> + * Pratyush Anand<pratyush.anand@gmail.com> * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any diff --git a/drivers/pci/host/pcie-spear13xx.c b/drivers/pci/host/pcie-spear13xx.c index 020d78890719..fb0b3e9b7e0a 100644 --- a/drivers/pci/host/pcie-spear13xx.c +++ b/drivers/pci/host/pcie-spear13xx.c @@ -4,8 +4,8 @@ * SPEAr13xx PCIe Glue Layer Source Code * * Copyright (C) 2010-2014 ST Microelectronics - * Pratyush Anand <pratyush.anand@st.com> - * Mohit Kumar <mohit.kumar@st.com> + * Pratyush Anand <pratyush.anand@gmail.com> + * Mohit Kumar <mohit.kumar.dhaka@gmail.com> * * This file is licensed under the terms of the GNU General Public * License version 2. 
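The memstick hunks above are mechanical conversions from the long-deprecated pci_* DMA wrappers to the generic DMA API: the device argument becomes &pdev->dev, the PCI_DMA_* direction constants become DMA_*, and coherent allocations gain an explicit gfp argument. The converted idiom in one place (a sketch; probe_dma_sketch() is hypothetical):

	#include <linux/pci.h>
	#include <linux/dma-mapping.h>

	static int probe_dma_sketch(struct pci_dev *pdev)
	{
		dma_addr_t handle;
		void *buf;

		if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)))
			return -EIO;	/* was pci_set_dma_mask(pdev, ...) */

		buf = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &handle,
					 GFP_KERNEL);	/* was pci_alloc_consistent() */
		if (!buf)
			return -ENOMEM;

		dma_free_coherent(&pdev->dev, PAGE_SIZE, buf, handle);
		return 0;
	}
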
This program is licensed "as is" without any @@ -387,5 +387,5 @@ static int __init spear13xx_pcie_init(void) module_init(spear13xx_pcie_init); MODULE_DESCRIPTION("ST Microelectronics SPEAr13xx PCIe host controller driver"); -MODULE_AUTHOR("Pratyush Anand <pratyush.anand@st.com>"); +MODULE_AUTHOR("Pratyush Anand <pratyush.anand@gmail.com>"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/phy/phy-spear1310-miphy.c b/drivers/phy/phy-spear1310-miphy.c index 65ae640cfbd1..45d0005b2203 100644 --- a/drivers/phy/phy-spear1310-miphy.c +++ b/drivers/phy/phy-spear1310-miphy.c @@ -2,8 +2,8 @@ * ST SPEAr1310-miphy driver * * Copyright (C) 2014 ST Microelectronics - * Pratyush Anand <pratyush.anand@st.com> - * Mohit Kumar <mohit.kumar@st.com> + * Pratyush Anand <pratyush.anand@gmail.com> + * Mohit Kumar <mohit.kumar.dhaka@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -257,5 +257,5 @@ static struct platform_driver spear1310_miphy_driver = { module_platform_driver(spear1310_miphy_driver); MODULE_DESCRIPTION("ST SPEAR1310-MIPHY driver"); -MODULE_AUTHOR("Pratyush Anand <pratyush.anand@st.com>"); +MODULE_AUTHOR("Pratyush Anand <pratyush.anand@gmail.com>"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/phy/phy-spear1340-miphy.c b/drivers/phy/phy-spear1340-miphy.c index 1a00c2817f34..494240da4a39 100644 --- a/drivers/phy/phy-spear1340-miphy.c +++ b/drivers/phy/phy-spear1340-miphy.c @@ -2,8 +2,8 @@ * ST spear1340-miphy driver * * Copyright (C) 2014 ST Microelectronics - * Pratyush Anand <pratyush.anand@st.com> - * Mohit Kumar <mohit.kumar@st.com> + * Pratyush Anand <pratyush.anand@gmail.com> + * Mohit Kumar <mohit.kumar.dhaka@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -290,5 +290,5 @@ static struct platform_driver spear1340_miphy_driver = { module_platform_driver(spear1340_miphy_driver); MODULE_DESCRIPTION("ST SPEAR1340-MIPHY driver"); -MODULE_AUTHOR("Pratyush Anand <pratyush.anand@st.com>"); +MODULE_AUTHOR("Pratyush Anand <pratyush.anand@gmail.com>"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index ea2a315df6b7..272fc934ce6b 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -61,6 +61,8 @@ static int rtc_suspend(struct device *dev) if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0) return 0; + rtc->valid_alarm = !rtc_read_alarm(rtc, &rtc->alarm); + /* snapshot the current RTC and system time at suspend*/ err = rtc_read_time(rtc, &tm); if (err < 0) { @@ -105,6 +107,27 @@ static int rtc_resume(struct device *dev) if (timekeeping_rtc_skipresume()) return 0; + /* + * Ensure that the platform hasn't overwritten a pending alarm while + * suspended + */ + if (rtc->valid_alarm) { + long now, scheduled; + + rtc_read_time(rtc, &tm); + rtc_tm_to_time(&rtc->alarm.time, &scheduled); + rtc_tm_to_time(&tm, &now); + + /* Clear the alarm registers if it went off during suspend */ + if (scheduled <= now) { + rtc_time_to_tm(0, &rtc->alarm.time); + rtc->alarm.enabled = 0; + } + + if (rtc->ops && rtc->ops->set_alarm) + rtc->ops->set_alarm(rtc->dev.parent, &rtc->alarm); + } + rtc_hctosys_ret = -ENODEV; if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0) return 0; @@ -141,6 +164,7 @@ static int rtc_resume(struct device *dev) if (sleep_time.tv_sec >= 0) timekeeping_inject_sleeptime64(&sleep_time); rtc_hctosys_ret = 0; + return 0; } diff 
--git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index a6b14c24d7e3..31f11ab214a0 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -494,7 +494,10 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) struct rtc_time tm; ktime_t now, onesec; - __rtc_read_time(rtc, &tm); + err = __rtc_read_time(rtc, &tm); + if (err < 0) + goto out; + onesec = ktime_set(1, 0); now = rtc_tm_to_ktime(tm); rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); @@ -872,13 +875,17 @@ void rtc_timer_do_work(struct work_struct *work) struct timerqueue_node *next; ktime_t now; struct rtc_time tm; + int err = 0; struct rtc_device *rtc = container_of(work, struct rtc_device, irqwork); mutex_lock(&rtc->ops_lock); again: - __rtc_read_time(rtc, &tm); + err = __rtc_read_time(rtc, &tm); + if (err < 0) + goto out; + now = rtc_tm_to_ktime(tm); while ((next = timerqueue_getnext(&rtc->timerqueue))) { if (next->expires.tv64 > now.tv64) @@ -903,7 +910,6 @@ again: /* Set next alarm */ if (next) { struct rtc_wkalrm alarm; - int err; int retry = 3; alarm.time = rtc_ktime_to_tm(next->expires); @@ -925,6 +931,7 @@ reprogram: } else rtc_alarm_disable(rtc); +out: pm_relax(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); } diff --git a/drivers/rtc/rtc-armada38x.c b/drivers/rtc/rtc-armada38x.c index cb70ced7e0db..4b62d1a875e4 100644 --- a/drivers/rtc/rtc-armada38x.c +++ b/drivers/rtc/rtc-armada38x.c @@ -64,7 +64,7 @@ static void rtc_delayed_write(u32 val, struct armada38x_rtc *rtc, int offset) static int armada38x_rtc_read_time(struct device *dev, struct rtc_time *tm) { struct armada38x_rtc *rtc = dev_get_drvdata(dev); - unsigned long time, time_check, flags; + unsigned long time, time_check; mutex_lock(&rtc->mutex_time); time = readl(rtc->regs + RTC_TIME); diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 4ffabb322a9a..6e76de1856fc 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -742,17 +742,17 @@ static int mcp794xx_set_alarm(struct device *dev, struct rtc_wkalrm *t) regs[6] &= ~MCP794XX_BIT_ALMX_IF; /* Set alarm match: second, minute, hour, day, date, month. */ regs[6] |= MCP794XX_MSK_ALMX_MATCH; - - if (t->enabled) - regs[0] |= MCP794XX_BIT_ALM0_EN; - else - regs[0] &= ~MCP794XX_BIT_ALM0_EN; + /* Disable interrupt. 
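
The interface.c hunks above stop trusting __rtc_read_time() implicitly: both rtc_update_irq_enable() and rtc_timer_do_work() now check its return value before converting the result with rtc_tm_to_ktime(), since on failure the struct rtc_time contents are unspecified and would arm timers with garbage. A small runnable userspace analogue of the pattern (read_hw_time is a made-up stand-in for a fallible clock read):

#include <stdio.h>
#include <string.h>
#include <time.h>

static int read_hw_time(struct tm *out)
{
	(void)out;
	return -1;	/* pretend the hardware read failed */
}

int main(void)
{
	struct tm tm;

	memset(&tm, 0, sizeof(tm));
	if (read_hw_time(&tm) < 0) {
		/* Bail out rather than computing an expiry from garbage,
		 * as the __rtc_read_time() checks above now do. */
		fprintf(stderr, "clock read failed, skipping timer setup\n");
		return 1;
	}
	printf("%s", asctime(&tm));
	return 0;
}
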
We will not enable until completely programmed */ + regs[0] &= ~MCP794XX_BIT_ALM0_EN; ret = ds1307->write_block_data(client, MCP794XX_REG_CONTROL, 10, regs); if (ret < 0) return ret; - return 0; + if (!t->enabled) + return 0; + regs[0] |= MCP794XX_BIT_ALM0_EN; + return i2c_smbus_write_byte_data(client, MCP794XX_REG_CONTROL, regs[0]); } static int mcp794xx_alarm_irq_enable(struct device *dev, unsigned int enabled) diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index 8b6355ffaff9..7fc666a0bb2e 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -107,6 +107,8 @@ /* OMAP_RTC_OSC_REG bit fields: */ #define OMAP_RTC_OSC_32KCLK_EN BIT(6) +#define OMAP_RTC_OSC_OSC32K_GZ BIT(4) +#define OMAP_RTC_OSC_EXT_32K BIT(3) /* OMAP_RTC_IRQWAKEEN bit fields: */ #define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN BIT(1) @@ -122,6 +124,7 @@ struct omap_rtc; struct omap_rtc_device_type { bool has_32kclk_en; + bool has_osc_ext_32k; bool has_irqwakeen; bool has_pmic_mode; bool has_power_up_reset; @@ -481,6 +484,7 @@ static const struct omap_rtc_device_type omap_rtc_default_type = { static const struct omap_rtc_device_type omap_rtc_am3352_type = { .has_32kclk_en = true, + .has_osc_ext_32k = true, .has_irqwakeen = true, .has_pmic_mode = true, .lock = am3352_rtc_lock, @@ -577,7 +581,15 @@ static int omap_rtc_probe(struct platform_device *pdev) if (rtc->type->has_32kclk_en) { reg = rtc_read(rtc, OMAP_RTC_OSC_REG); rtc_writel(rtc, OMAP_RTC_OSC_REG, - reg | OMAP_RTC_OSC_32KCLK_EN); + reg | OMAP_RTC_OSC_32KCLK_EN); + } + + /* Enable external clock as the source */ + if (rtc->type->has_osc_ext_32k) { + rtc_writel(rtc, OMAP_RTC_OSC_REG, + (OMAP_RTC_OSC_EXT_32K | + rtc_read(rtc, OMAP_RTC_OSC_REG)) & + (~OMAP_RTC_OSC_OSC32K_GZ)); } /* clear old status */ diff --git a/drivers/usb/misc/lvstest.c b/drivers/usb/misc/lvstest.c index 62cb8cd08403..86b4e4b2ab9a 100644 --- a/drivers/usb/misc/lvstest.c +++ b/drivers/usb/misc/lvstest.c @@ -4,7 +4,7 @@ * Test pattern generation for Link Layer Validation System Tests * * Copyright (C) 2014 ST Microelectronics - * Pratyush Anand <pratyush.anand@st.com> + * Pratyush Anand <pratyush.anand@gmail.com> * * This file is licensed under the terms of the GNU General Public * License version 2. 
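
The mcp794xx_set_alarm() hunk in progress here also changes the programming order: instead of setting or clearing MCP794XX_BIT_ALM0_EN as part of the 10-byte block write, it always clears the enable bit first, programs the match registers, and only then arms the alarm with a final single-byte write. Condensed from the hunk, the resulting sequence is:

	/* 1. Keep the alarm disabled while its registers are in flux. */
	regs[0] &= ~MCP794XX_BIT_ALM0_EN;
	ret = ds1307->write_block_data(client, MCP794XX_REG_CONTROL, 10, regs);
	if (ret < 0)
		return ret;

	/* 2. Arm it only after the match registers are fully programmed. */
	if (!t->enabled)
		return 0;
	regs[0] |= MCP794XX_BIT_ALM0_EN;
	return i2c_smbus_write_byte_data(client, MCP794XX_REG_CONTROL, regs[0]);

This way the chip can never fire a match against half-programmed alarm registers.
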
This program is licensed "as is" without any diff --git a/fs/adfs/super.c b/fs/adfs/super.c index a19c31d3f369..4d4a0df8344f 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index a8f463c028ce..5fa92bc790ef 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -140,7 +140,7 @@ affs_remove_link(struct dentry *dentry) { struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; - struct buffer_head *bh = NULL, *link_bh = NULL; + struct buffer_head *bh, *link_bh = NULL; u32 link_ino, ino; int retval; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a022f4accd76..17349500592d 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -346,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 { struct super_block *sb = dir->i_sb; struct buffer_head *inode_bh = NULL; - struct buffer_head *bh = NULL; + struct buffer_head *bh; u32 block = 0; int retval; diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 0826e91dacda..22c166280883 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -137,8 +137,8 @@ static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, befs_btree_super * sup) { - struct buffer_head *bh = NULL; - befs_disk_btree_super *od_sup = NULL; + struct buffer_head *bh; + befs_disk_btree_super *od_sup; befs_debug(sb, "---> %s", __func__); @@ -250,7 +250,7 @@ int befs_btree_find(struct super_block *sb, befs_data_stream * ds, const char *key, befs_off_t * value) { - struct befs_btree_node *this_node = NULL; + struct befs_btree_node *this_node; befs_btree_super bt_super; befs_off_t node_off; int res; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5cfa7129d876..fa173aa682bb 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3390,13 +3390,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, * should have access to this page, we're safe to simply set * PG_locked without checking it first. */ - __set_page_locked(page); + __SetPageLocked(page); rc = add_to_page_cache_locked(page, mapping, page->index, GFP_KERNEL); /* give up if we can't stick it in the cache */ if (rc) { - __clear_page_locked(page); + __ClearPageLocked(page); return rc; } @@ -3417,10 +3417,10 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, if (*bytes + PAGE_CACHE_SIZE > rsize) break; - __set_page_locked(page); + __SetPageLocked(page); if (add_to_page_cache_locked(page, mapping, page->index, GFP_KERNEL)) { - __clear_page_locked(page); + __ClearPageLocked(page); break; } list_move_tail(&page->lru, tmplist); diff --git a/fs/configfs/item.c b/fs/configfs/item.c index e65f9ffbb999..4d6a30e76168 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref); * config_item_init - initialize item. * @item: item in question. 
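
The adfs hunk above (and the efs and minix hunks below) drops the explicit cast on kmem_cache_alloc(). In C a void * return value converts implicitly to any object pointer type, so the cast is pure noise, and historically such casts could also silence the warning for a missing prototype. A runnable userspace illustration of the same point with malloc():

#include <stdlib.h>

struct inode_info { int data; };

int main(void)
{
	/* Redundant: the cast adds nothing in C. */
	struct inode_info *a = (struct inode_info *)malloc(sizeof(*a));

	/* Idiomatic: void * converts implicitly. */
	struct inode_info *b = malloc(sizeof(*b));

	free(a);
	free(b);
	return 0;
}
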
*/ -void config_item_init(struct config_item *item) +static void config_item_init(struct config_item *item) { kref_init(&item->ci_kref); INIT_LIST_HEAD(&item->ci_entry); } -EXPORT_SYMBOL(config_item_init); /** * config_item_set_name - Set the name of an item diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index add566303c68..c35ffdc12bba 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) return inode->i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt->mnt_sb; } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb->s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb->s_root; + fsi = DEVPTS_SB(sb); + opts = &fsi->mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -676,12 +689,16 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + struct vfsmount *mnt; + table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(&devpts_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; diff --git a/fs/efs/super.c b/fs/efs/super.c index 7fca462ea4e3..c8411a30f7da 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -67,7 +67,7 @@ static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; - ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 8850254136ae..7002467bfbac 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } if (!journal) { - ret = generic_file_fsync(file, start, end, datasync); + if (test_opt(inode->i_sb, BARRIER)) + ret = generic_file_fsync(file, start, end, datasync); + else + ret = __generic_file_fsync(file, start, end, datasync); if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); goto out; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 93fc62232ec2..5d384921524d 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } -int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create) +int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned 
long *mapped_blocks, sector_t *bmap) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); + int cluster, offset; + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + if (cluster < 0) + return cluster; + else if (cluster) { + *bmap = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + + return 0; +} + +static int is_exceed_eof(struct inode *inode, sector_t sector, + sector_t *last_block, int create) +{ + struct super_block *sb = inode->i_sb; const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; + + *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; + if (sector >= *last_block) { + if (!create) + return 1; + + /* + * ->mmu_private can access on only allocation path. + * (caller must hold ->i_mutex) + */ + *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + if (sector >= *last_block) + return 1; + } + + return 0; +} + +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create, bool from_bmap) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); sector_t last_block; - int cluster, offset; *phys = 0; *mapped_blocks = 0; @@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, return 0; } - last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; - if (sector >= last_block) { - if (!create) + if (!from_bmap) { + if (is_exceed_eof(inode, sector, &last_block, create)) return 0; - - /* - * ->mmu_private can access on only allocation path. 
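
The new fat_get_mapped_cluster() above isolates the sector-to-cluster translation that fat_bmap() used to do inline: with power-of-two geometry, the cluster index is the sector shifted right by (cluster_bits - blocksize_bits), and the intra-cluster offset is the sector masked by (sec_per_clus - 1). A runnable demonstration of that arithmetic with made-up geometry (4 KiB clusters, 512-byte sectors):

#include <stdio.h>

int main(void)
{
	const unsigned cluster_bits = 12;	/* 4 KiB clusters */
	const unsigned blocksize_bits = 9;	/* 512-byte sectors */
	const unsigned sec_per_clus = 1u << (cluster_bits - blocksize_bits);

	unsigned long sector = 21;
	unsigned long cluster = sector >> (cluster_bits - blocksize_bits);
	unsigned long offset = sector & (sec_per_clus - 1);

	/* Prints: sector 21 = cluster 2 + offset 5 */
	printf("sector %lu = cluster %lu + offset %lu\n",
	       sector, cluster, offset);
	return 0;
}
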
- * (caller must hold ->i_mutex) - */ - last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) - >> blocksize_bits; + } else { + last_block = inode->i_blocks >> + (inode->i_sb->s_blocksize_bits - 9); if (sector >= last_block) return 0; } - cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); - offset = sector & (sbi->sec_per_clus - 1); - cluster = fat_bmap_cluster(inode, cluster); - if (cluster < 0) - return cluster; - else if (cluster) { - *phys = fat_clus_to_blknr(sbi, cluster) + offset; - *mapped_blocks = sbi->sec_per_clus - offset; - if (*mapped_blocks > last_block - sector) - *mapped_blocks = last_block - sector; - } - return 0; + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + phys); } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4afc4d9d2e41..4c71c8c76426 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -91,7 +91,7 @@ next: *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; - err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0); + err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); if (err || !phys) return -1; /* beyond EOF or error */ diff --git a/fs/fat/fat.h b/fs/fat/fat.h index be5e15323bab..4307cd4f8da0 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); +extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create); + unsigned long *mapped_blocks, int create, bool from_bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; @@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart) { return hash_32(logstart, FAT_HASH_BITS); } +extern int fat_add_cluster(struct inode *inode); /* fat/misc.c */ extern __printf(3, 4) __cold diff --git a/fs/fat/file.c b/fs/fat/file.c index 442d50a0e33e..b90cc275bfaa 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -13,8 +13,12 @@ #include <linux/blkdev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); + static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -176,6 +180,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -214,6 +219,62 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. 
+ */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int nr_cluster; /* Number of clusters to be allocated */ + loff_t mm_bytes; /* Number of bytes to be allocated for file */ + loff_t ondisksize; /* block aligned on-disk size in bytes*/ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* No support for dir */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_KEEP_SIZE) { + ondisksize = inode->i_blocks << 9; + if ((offset + len) <= ondisksize) + goto error; + + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - ondisksize; + nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_add_cluster(inode); + if (err) + goto error; + } + } else { + if ((offset + len) <= i_size_read(inode)) + goto error; + + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c06774658345..e69c84acad75 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -92,7 +92,7 @@ static struct fat_floppy_defaults { }, }; -static int fat_add_cluster(struct inode *inode) +int fat_add_cluster(struct inode *inode) { int err, cluster; @@ -114,10 +114,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); unsigned long mapped_blocks; - sector_t phys; + sector_t phys, last_block; int err, offset; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (phys) { @@ -134,8 +134,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, return -EIO; } + last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9); offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); - if (!offset) { + /* + * allocate a cluster according to the following. + * 1) no more available blocks + * 2) not part of fallocate region + */ + if (!offset && !(iblock < last_block)) { /* TODO: multiple cluster allocation would be desirable. 
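
From user space, the two branches of fat_fallocate() above correspond to the two ways fallocate(2) can be called: with FALLOC_FL_KEEP_SIZE, clusters are attached past EOF without changing i_size; without it, the request degenerates into fat_cont_expand(), i.e. an expanding, zeroing truncate. A runnable example exercising the keep-size path (the mount point and file name are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/fat/testfile", O_CREAT | O_WRONLY, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 1 MiB of clusters without changing the file size. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}
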
*/ err = fat_add_cluster(inode); if (err) @@ -147,7 +153,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; @@ -272,13 +278,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } +static int fat_get_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + sector_t bmap; + unsigned long mapped_blocks; + + BUG_ON(create != 0); + + err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true); + if (err) + return err; + + if (bmap) { + map_bh(bh_result, sb, bmap); + max_blocks = min(mapped_blocks, max_blocks); + } + + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + + return 0; +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); - blocknr = generic_block_bmap(mapping, block, fat_get_block); + blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; @@ -552,13 +583,43 @@ out: EXPORT_SYMBOL_GPL(fat_build_inode); +static int __fat_write_inode(struct inode *inode, int wait); + +static void fat_free_eofblocks(struct inode *inode) +{ + /* Release unwritten fallocated blocks on inode eviction. */ + if ((inode->i_blocks << 9) > + round_up(MSDOS_I(inode)->mmu_private, + MSDOS_SB(inode->i_sb)->cluster_size)) { + int err; + + fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); + /* Fallocate results in updating the i_start/i_logstart + * for the zero byte file. So, make it return to + * original state during evict and commit it to avoid + * any corruption on the next access to the cluster + * chain for the file. + */ + err = __fat_write_inode(inode, inode_needs_sync(inode)); + if (err) { + fat_msg(inode->i_sb, KERN_WARNING, "Failed to " + "update on disk inode for unused " + "fallocated blocks, inode could be " + "corrupted. Please run fsck"); + } + + } +} + static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); - } + } else + fat_free_eofblocks(inode); + invalidate_inode_buffers(inode); clear_inode(inode); fat_cache_inval_inode(inode); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87724c1d7be6..0cf74df68617 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) goto out; ret = 0; - hugetlb_prefault_arch_hook(vma->vm_mm); if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b96bd8076b70..0bc333b4a594 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); -retry_alloc: - new_bh = alloc_buffer_head(GFP_NOFS); - if (!new_bh) { - /* - * Failure is not an option, but __GFP_NOFAIL is going - * away; so we retry ourselves here.
- */ - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry_alloc; - } + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ff2f2e6ad311..799242cecffb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { + /* + * If __GFP_FS is not present, then we may be being called from + * inside the fs writeback layer, so we MUST NOT fail. + */ + if ((gfp_mask & __GFP_FS) == 0) + gfp_mask |= __GFP_NOFAIL; new_transaction = kmem_cache_zalloc(transaction_cache, gfp_mask); - if (!new_transaction) { - /* - * If __GFP_FS is not present, then we may be - * being called from inside the fs writeback - * layer, so we MUST NOT fail. Since - * __GFP_NOFAIL is going away, we will arrange - * to retry the allocation ourselves. - */ - if ((gfp_mask & __GFP_FS) == 0) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; - } + if (!new_transaction) return -ENOMEM; - } } jbd_debug(3, "New handle %p going live.\n", handle); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index f131fc23ffc4..fffca9517321 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -518,7 +518,14 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + /* + * If the ino of the sysfs entry created for a kmem cache gets + * allocated from an ida layer, which is accounted to the memcg that + * owns the cache, the memcg will get pinned forever. So do not account + * ino ida allocations. + */ + ret = ida_simple_get(&root->ino_ida, 1, 0, + GFP_KERNEL | __GFP_NOACCOUNT); if (ret < 0) goto err_out2; kn->ino = ret; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 1182d1e26a9c..086cd0a61e80 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -62,7 +62,7 @@ static struct kmem_cache * minix_inode_cachep; static struct inode *minix_alloc_inode(struct super_block *sb) { struct minix_inode_info *ei; - ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/mpage.c b/fs/mpage.c index 3e79220babac..587c7ed4185d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -590,7 +591,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? 
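
The jbd2 hunks just above replace hand-rolled allocation retry loops with __GFP_NOFAIL, which makes the allocator itself retry until success; the deleted comments date from a period when __GFP_NOFAIL looked slated for removal, a plan that was later abandoned. Note that start_this_handle() adds the flag only when the caller lacks __GFP_FS, i.e. only for contexts that genuinely must not fail. The shape of the conversion, sketched against a hypothetical slab cache named example_cachep:

	/* Before: open-coded retry. */
retry:
	obj = kmem_cache_zalloc(example_cachep, GFP_NOFS);
	if (!obj) {
		congestion_wait(BLK_RW_ASYNC, HZ/50);
		goto retry;
	}

	/* After: the allocator guarantees success. */
	obj = kmem_cache_zalloc(example_cachep, GFP_NOFS | __GFP_NOFAIL);
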
*/ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); alloc_new: if (bio == NULL) { @@ -614,7 +615,7 @@ alloc_new: */ length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); goto alloc_new; } @@ -624,7 +625,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -636,7 +637,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -692,8 +693,11 @@ mpage_writepages(struct address_space *mapping, }; ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } } blk_finish_plug(&plug); return ret; @@ -710,8 +714,11 @@ int mpage_writepage(struct page *page, get_block_t get_block, .use_writepage = 0, }; int ret = __mpage_writepage(page, wbc, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } return ret; } EXPORT_SYMBOL(mpage_writepage); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2d7f76e52c37..0afb4cb7ce1b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ocfs2_error(sb, - "Extent block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - eb->h_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + goto bail; } if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extent block #%llu has an invalid h_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(eb->h_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto bail; } if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extent block #%llu has an invalid " - "h_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(eb->h_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + goto bail; } - - return 0; +bail: + return rc; } int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, @@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty " - "extent list (next_free_rec == 0)", + "Owner %llu has empty extent list (next_free_rec == 0)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; @@ -1456,9 +1453,7 @@ static int 
ocfs2_find_branch_target(struct ocfs2_extent_tree *et, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent " - "list where extent # %d has no physical " - "block start", + "Owner %llu has extent list where extent # %d has no physical block start\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; @@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has empty extent list at " - "depth %u\n", + "Owner %llu has empty extent list at depth %u\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; @@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad blkno in extent list " - "at depth %u (index %d)\n", + "Owner %llu has bad blkno in extent list at depth %u (index %d)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; @@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad count in extent list " - "at block %llu (next free=%u, count=%u)\n", + "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), @@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, if (left_el->l_next_free_rec != left_el->l_count) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Inode %llu has non-full interior leaf node %llu" - "(next free = %u)", + "Inode %llu has non-full interior leaf node %llu (next free = %u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); @@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2526,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_block *eb; u32 range; - /* - * In normal tree rotation process, we will never touch the - * tree branch above subtree_index and ocfs2_extend_rotate_transaction - * doesn't reserve the credits for them either. - * - * But we do have a special case here which will update the rightmost - * records for all the bh in the path. - * So we have to allocate extra credits and access them. - */ - ret = ocfs2_extend_trans(handle, subtree_index); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); @@ -2872,8 +2847,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. 
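
A pattern worth noting in the ocfs2 validation hunks (ocfs2_validate_extent_block() above, and the dir.c hunks further down): callers now write rc = ocfs2_error(...) and jump to a shared exit label instead of pairing every ocfs2_error() call with a hand-written return -EINVAL or -EROFS, which implies ocfs2_error() was changed in this series to return an errno. Condensed from the hunks above, the calling convention becomes:

	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		rc = ocfs2_error(sb,
				 "Extent block #%llu has bad signature %.*s\n",
				 (unsigned long long)bh->b_blocknr, 7,
				 eb->h_signature);
		goto bail;	/* rc already carries the error code */
	}
	...
bail:
	return rc;
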
*/ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2925,7 +2899,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *right_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); - BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))) + return 0; *empty_extent_path = NULL; @@ -2966,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, right_path->p_node[subtree_root].bh->b_blocknr, right_path->p_tree_depth); - ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + ret = ocfs2_extend_rotate_transaction(handle, 0, orig_credits, left_path); if (ret) { mlog_errno(ret); @@ -3039,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_et_sanity_check(et); if (ret) goto out; - /* - * There's two ways we handle this depending on - * whether path is the only existing one. - */ - ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); - if (ret) { - mlog_errno(ret); - goto out; - } ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { @@ -3130,6 +3093,30 @@ out: return ret; } +static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + handle_t *handle; + int ret; + int credits = path->p_tree_depth * 2 + 1; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + return ret; +} + /* * Left rotation of btree records. 
* @@ -3199,7 +3186,7 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at %llu", + "Owner %llu has empty extent block at %llu\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; @@ -3627,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, */ if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && le16_to_cpu(el->l_next_free_rec) == 1) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } ret = ocfs2_remove_rightmost_path(handle, et, right_path, @@ -3665,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, BUG_ON(ctxt->c_contig_type == CONTIG_NONE); if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The merge code will need to create an empty * extent to take the place of the newly @@ -3713,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, */ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* The merge left us with an empty extent, remove it. */ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { @@ -3734,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, goto out; } + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); /* * Error from this last rotate is not critical, so @@ -3769,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, } if (ctxt->c_split_covers_rec) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + ret = 0; + goto out; + } + /* * The merge may have left an empty extent in * our leaf. Try to rotate it away. 
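
Several hunks in this region add the same preamble before any path that can reach ocfs2_remove_rightmost_path(): extend the handle's journal credits first, presumably because removing the rightmost path can dirty buffers up the entire tree that the original transaction never reserved (the comment deleted from ocfs2_update_edge_lengths() hinted at exactly this). The recurring block is:

	/* extend credit for ocfs2_remove_rightmost_path */
	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
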
@@ -3929,7 +3960,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has a bad extent list", + "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; @@ -4311,13 +4342,13 @@ out: return ret; } -static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, +static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, - struct ocfs2_extent_rec *split_rec) + struct ocfs2_extent_rec *split_rec, + struct ocfs2_merge_ctxt *ctxt) { - int status; + int status = 0; enum ocfs2_contig_type ret = CONTIG_NONE; u32 left_cpos, right_cpos; struct ocfs2_extent_rec *rec = NULL; @@ -4336,8 +4367,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, if (left_cpos != 0) { left_path = ocfs2_new_path_from_path(path); - if (!left_path) + if (!left_path) { + status = -ENOMEM; + mlog_errno(status); goto exit; + } status = ocfs2_find_path(et->et_ci, left_path, left_cpos); @@ -4351,10 +4385,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of " - "%d. It should have " - "matched the l_count of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); @@ -4392,8 +4423,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, goto free_left_path; right_path = ocfs2_new_path_from_path(path); - if (!right_path) + if (!right_path) { + status = -ENOMEM; + mlog_errno(status); goto free_left_path; + } status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) @@ -4406,8 +4440,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; @@ -4433,7 +4466,10 @@ free_right_path: free_left_path: ocfs2_free_path(left_path); exit: - return ret; + if (status == 0) + ctxt->c_contig_type = ret; + + return status; } static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, @@ -4960,10 +4996,9 @@ leftright: split_index = ocfs2_search_extent_list(el, cpos); if (split_index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u " - "which can no longer be found.\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5039,9 +5074,14 @@ int ocfs2_split_extent(handle_t *handle, goto out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, - split_index, - split_rec); + ret = ocfs2_figure_merge_contig_type(et, path, el, + split_index, + split_rec, + &ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The core merge / split code wants to know how much room is @@ -5143,10 
+5183,9 @@ int ocfs2_change_extent_flag(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(sb, - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long) - ocfs2_metadata_cache_owner(et->et_ci), cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5213,9 +5252,7 @@ int ocfs2_mark_extent_written(struct inode *inode, cpos, len, phys); if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); ret = -EROFS; goto out; @@ -5322,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); @@ -5499,8 +5545,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", + "Owner %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5565,7 +5610,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: split at cpos %u lost record.", + "Owner %llu: split at cpos %u lost record\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5581,8 +5626,7 @@ int ocfs2_remove_extent(handle_t *handle, ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: error after split at cpos %u" - "trunc len %u, existing record is (%u,%u)", + "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); @@ -5910,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, tl_bh); - /* TODO: Perhaps we can calculate the bulk of the - * credits up front rather than extending like - * this. 
*/ - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } - rec = tl->tl_recs[i]; start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, le32_to_cpu(rec.t_start)); @@ -5940,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, goto bail; } } + + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } i--; } @@ -5998,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -7096,12 +7137,20 @@ start: ocfs2_error(inode->i_sb, "Inode %lu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); - status = -EROFS; - goto bail; + status = ocfs2_remove_rightmost_empty_extent(osb, + &et, path, &dealloc); + if (status) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + goto start; + } else { + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; } - trunc_cpos = le32_to_cpu(rec->e_cpos); - trunc_len = 0; - blkno = 0; } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { /* * Truncate entire record. @@ -7189,8 +7238,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || !ocfs2_supports_inline_data(osb)) { ocfs2_error(inode->i_sb, - "Inline data flags for inode %llu don't agree! " - "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, le16_to_cpu(di->i_dyn_features), OCFS2_I(inode)->ip_dyn_features, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f906a250da6a..a3e4cab1de2d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { - ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); return -EROFS; } @@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size > PAGE_CACHE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, - "Inode %llu has with inline data has bad size: %Lu", + "Inode %llu has with inline data has bad size: %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)size); return -EROFS; @@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + down_read(&OCFS2_I(inode)->ip_alloc_sem); + /* This figures out the size of the next contiguous block, and * our logical offset */ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); @@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, alloc_locked = 1; + 
down_write(&OCFS2_I(inode)->ip_alloc_sem); + /* fill hole, allocate blocks can't be larger than the size * of the hole */ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); @@ -567,6 +573,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extend_allocation(inode, cpos, clusters_to_alloc, 0); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog_errno(ret); goto bail; } @@ -574,11 +581,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); ret = -EIO; goto bail; } + up_write(&OCFS2_I(inode)->ip_alloc_sem); } /* @@ -833,12 +842,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, /* zeroing out the previously allocated cluster tail * that but not zeroed */ - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, zero_len_tail, cluster_align_tail); - else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_extend_no_holes(osb, inode, offset); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + } if (ret < 0) { mlog_errno(ret); ocfs2_inode_unlock(inode, 1); @@ -925,13 +939,23 @@ clean_orphan: int update_isize = written > 0 ? 1 : 0; loff_t end = update_isize ? offset + written : 0; - tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(ret); + goto out; + } + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, update_isize, end); if (tmp_ret < 0) { ret = tmp_ret; + mlog_errno(ret); goto out; } + ocfs2_inode_unlock(inode, 1); + tmp_ret = jbd2_journal_force_commit(journal); if (tmp_ret < 0) { ret = tmp_ret; @@ -2176,16 +2200,6 @@ try_again: if (ret) goto out_commit; } - /* - * We don't want this to fail in ocfs2_write_end(), so do it - * here. - */ - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out_quota; - } /* * Fill our page array first. 
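
The aops.c hunks above bracket the direct I/O block lookups with ip_alloc_sem: down_read() around pure ocfs2_extent_map_get_blocks() lookups, down_write() around the hole-filling path that may call ocfs2_extend_allocation(). That is the standard reader/writer discipline; a runnable userspace analogue with POSIX rwlocks (all names illustrative):

#include <pthread.h>

static pthread_rwlock_t alloc_sem = PTHREAD_RWLOCK_INITIALIZER;
static int extent_map[16];

/* Lookup only: many readers may run concurrently. */
static int lookup_extent(int i)
{
	int v;

	pthread_rwlock_rdlock(&alloc_sem);
	v = extent_map[i];
	pthread_rwlock_unlock(&alloc_sem);
	return v;
}

/* Allocation mutates the map: take the lock exclusively. */
static void allocate_extent(int i, int v)
{
	pthread_rwlock_wrlock(&alloc_sem);
	extent_map[i] = v;
	pthread_rwlock_unlock(&alloc_sem);
}

int main(void)
{
	allocate_extent(3, 42);
	return lookup_extent(3) == 42 ? 0 : 1;
}
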
That way we've grabbed enough so @@ -2336,7 +2350,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int i; + int i, ret; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2386,6 +2400,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } + out_write_size: pos += copied; if (pos > i_size_read(inode)) { @@ -2407,6 +2429,7 @@ out_write_size: */ ocfs2_unlock_pages(wc); +out: ocfs2_commit_trans(osb, handle); ocfs2_run_deallocs(osb, &wc->w_dealloc); diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1edcb141f639..fe50ded1b4ce 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { + if (status) { + /* Clear the rest of the buffers on error */ + put_bh(bh); + bhs[i] = NULL; + continue; + } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16eff45727ee..3a60c83218db 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -36,7 +36,7 @@ #include <linux/debugfs.h> #include <linux/slab.h> #include <linux/bitmap.h> - +#include <linux/ktime.h> #include "heartbeat.h" #include "tcp.h" #include "nodemanager.h" @@ -1061,37 +1061,6 @@ bail: return ret; } -/* Subtract b from a, storing the result in a. a *must* have a larger - * value than b. */ -static void o2hb_tv_subtract(struct timeval *a, - struct timeval *b) -{ - /* just return 0 when a is after b */ - if (a->tv_sec < b->tv_sec || - (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { - a->tv_sec = 0; - a->tv_usec = 0; - return; - } - - a->tv_sec -= b->tv_sec; - a->tv_usec -= b->tv_usec; - while ( a->tv_usec < 0 ) { - a->tv_sec--; - a->tv_usec += 1000000; - } -} - -static unsigned int o2hb_elapsed_msecs(struct timeval *start, - struct timeval *end) -{ - struct timeval res = *end; - - o2hb_tv_subtract(&res, start); - - return res.tv_sec * 1000 + res.tv_usec / 1000; -} - /* * we ride the region ref that the region dir holds. before the region * dir is removed and drops it ref it will wait to tear down this @@ -1102,7 +1071,7 @@ static int o2hb_thread(void *data) int i, ret; struct o2hb_region *reg = data; struct o2hb_bio_wait_ctxt write_wc; - struct timeval before_hb, after_hb; + ktime_t before_hb, after_hb; unsigned int elapsed_msec; mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); @@ -1119,18 +1088,18 @@ static int o2hb_thread(void *data) * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. 
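
The heartbeat conversion around this point deletes o2hb_tv_subtract() and o2hb_elapsed_msecs(), roughly thirty lines of manual struct timeval carry handling, in favor of two ktime_t samples and a single ktime_ms_delta() call (visible just below). The new measurement reduces to:

	ktime_t before_hb, after_hb;
	unsigned int elapsed_msec;

	before_hb = ktime_get_real();
	ret = o2hb_do_disk_heartbeat(reg);
	after_hb = ktime_get_real();

	elapsed_msec = (unsigned int)ktime_ms_delta(after_hb, before_hb);

One design note: ktime_get_real() is wall-clock time and can jump with settimeofday(); a monotonic ktime_get() would be immune to that, but the conversion keeps the clock the old do_gettimeofday() code was using.
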
*/ - do_gettimeofday(&before_hb); + before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); - do_gettimeofday(&after_hb); - elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + after_hb = ktime_get_real(); + + elapsed_msec = (unsigned int) + ktime_ms_delta(after_hb, before_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", - before_hb.tv_sec, (unsigned long) before_hb.tv_usec, - after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec, ret); + "start = %lld, end = %lld, msec = %u, ret = %d\n", + before_hb.tv64, after_hb.tv64, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index af7598bff1b5..dfe162f5fd4c 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) return count; } +void __mlog_printk(const u64 *mask, const char *func, int line, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + const char *level; + const char *prefix = ""; + + if (!__mlog_test_u64(*mask, mlog_and_bits) || + __mlog_test_u64(*mask, mlog_not_bits)) + return; + + if (*mask & ML_ERROR) { + level = KERN_ERR; + prefix = "ERROR: "; + } else if (*mask & ML_NOTICE) { + level = KERN_NOTICE; + } else { + level = KERN_INFO; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s(%s,%u,%u):%s:%d %s%pV", + level, current->comm, task_pid_nr(current), + raw_smp_processor_id(), func, line, prefix, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(__mlog_printk); + struct mlog_attribute { struct attribute attr; u64 mask; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 7fdc25a4d8c0..308ea0eb35fd 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #endif -/* - * smp_processor_id() "helpfully" screams when called outside preemptible - * regions in current kernels. sles doesn't have the variants that don't - * scream. just do this instead of trying to guess which we're building - * against.. *sigh*. - */ -#define __mlog_cpu_guess ({ \ - unsigned long _cpu = get_cpu(); \ - put_cpu(); \ - _cpu; \ -}) +__printf(4, 5) +void __mlog_printk(const u64 *m, const char *func, int line, + const char *fmt, ...); -/* In the following two macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. +/* + * Testing before the __mlog_printk call lets the compiler eliminate the + * call completely when (m & ML_ALLOWED_BITS) is 0. */ -#define __mlog_printk(level, fmt, args...) \ - printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ - task_pid_nr(current), __mlog_cpu_guess, \ - __PRETTY_FUNCTION__, __LINE__ , ##args) - -#define mlog(mask, fmt, args...) do { \ - u64 __m = MLOG_MASK_PREFIX | (mask); \ - if ((__m & ML_ALLOWED_BITS) && \ - __mlog_test_u64(__m, mlog_and_bits) && \ - !__mlog_test_u64(__m, mlog_not_bits)) { \ - if (__m & ML_ERROR) \ - __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ - else if (__m & ML_NOTICE) \ - __mlog_printk(KERN_NOTICE, fmt , ##args); \ - else __mlog_printk(KERN_INFO, fmt , ##args); \ - } \ +#define mlog(mask, fmt, ...) 
\ +do { \ + u64 _m = MLOG_MASK_PREFIX | (mask); \ + if (_m & ML_ALLOWED_BITS) \ + __mlog_printk(&_m, __func__, __LINE__, fmt, \ + ##__VA_ARGS__); \ } while (0) #define mlog_errno(st) ({ \ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ccd4dcfc3645..a9513ff0a47f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - trailer->db_signature); + rc = ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid db_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } out: @@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { - ocfs2_error(sb, - "Dir Index Root # %llu has bad signature %.*s", - (unsigned long long)le64_to_cpu(dx_root->dr_blkno), - 7, dx_root->dr_signature); - return -EINVAL; + ret = ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s\n", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); } - return 0; + return ret; } static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, @@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { - ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", - 7, dx_leaf->dl_signature); - return -EROFS; + ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", + 7, dx_leaf->dl_signature); } - return 0; + return ret; } static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, @@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "btree tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in btree tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } if (!found) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in btree", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, 
rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -1617,7 +1607,7 @@ int __ocfs2_add_entry(handle_t *handle, struct ocfs2_dir_entry *de, *de1; struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; struct super_block *sb = dir->i_sb; - int retval, status; + int retval; unsigned int size = sb->s_blocksize; struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; @@ -1695,25 +1685,25 @@ int __ocfs2_add_entry(handle_t *handle, } if (insert_bh == parent_fe_bh) - status = ocfs2_journal_access_di(handle, + retval = ocfs2_journal_access_di(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); else { - status = ocfs2_journal_access_db(handle, + retval = ocfs2_journal_access_db(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (ocfs2_dir_indexed(dir)) { - status = ocfs2_dx_dir_insert(dir, + if (!retval && ocfs2_dir_indexed(dir)) + retval = ocfs2_dx_dir_insert(dir, handle, lookup); - if (status) { - mlog_errno(status); - goto bail; - } - } + } + + if (retval) { + mlog_errno(retval); + goto bail; } /* By now the buffer is marked for journaling */ diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fae17c640df3..e88ccf8c83ff 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, /* will exit holding res->spinlock, but may drop in function */ void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); -void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); /* will exit holding res->spinlock, but may drop in function */ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index fdf4b41d0609..46b8b2bbc95a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); - spin_lock(&dlm->track_lock); - if (!list_empty(&res->tracking)) - list_del_init(&res->tracking); - else { - mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", - res->lockname.len, res->lockname.name); - dlm_print_one_lock_resource(res); - } - spin_unlock(&dlm->track_lock); - atomic_dec(&dlm->res_cur_count); if (!hlist_unhashed(&res->hash_node) || @@ -795,8 +785,18 @@ lookup: dlm_lockres_grab_inflight_ref(dlm, tmpres); spin_unlock(&tmpres->spinlock); - if (res) + if (res) { + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else + mlog(ML_ERROR, "Resource %.*s not " + "on the Tracking list\n", + res->lockname.len, + res->lockname.name); + spin_unlock(&dlm->track_lock); dlm_lockres_put(res); + } res = tmpres; goto leave; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 69aac6f088ad..2e5e6d5fffe8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, __dlm_unhash_lockres(dlm, res); + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + 
spin_unlock(&dlm->track_lock); + /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8b23aa2f52dd..23157e40dd74 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -4025,9 +4025,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) osb->dc_work_sequence = osb->dc_wake_sequence; processed = osb->blocked_lock_count; - while (processed) { - BUG_ON(list_empty(&osb->blocked_lock_list)); - + /* + * blocked lock processing in this loop might call iput which can + * remove items off osb->blocked_lock_list. Downconvert up to + * 'processed' number of locks, but stop short if we had some + * removed in ocfs2_mark_lockres_freeing when downconverting. + */ + while (processed && !list_empty(&osb->blocked_lock_list)) { lockres = list_entry(osb->blocked_lock_list.next, struct ocfs2_lock_res, l_blocked_list); list_del_init(&lockres->l_blocked_list); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 767370b656ca..e4719e0a3f99 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0)\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; @@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b254416dc8d9..5e7e39a3e85e 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1191,17 +1191,19 @@ void ocfs2_evict_inode(struct inode *inode) int ocfs2_drop_inode(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - int res; trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) - res = 1; - else - res = generic_drop_inode(inode); + 
assert_spin_locked(&inode->i_lock); + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; - return res; + return 1; } /* @@ -1350,32 +1352,32 @@ int ocfs2_validate_inode_block(struct super_block *sb, rc = -EINVAL; if (!OCFS2_IS_VALID_DINODE(di)) { - ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - di->i_signature); + rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { - ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - ocfs2_error(sb, - "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", - (unsigned long long)bh->b_blocknr); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Invalid dinode #%llu: fs_generation is %u\n", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(di->i_fs_generation)); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); goto bail; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff531928269e..ccc570392eca 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) mlog_errno(PTR_ERR(handle)); if (is_journal_aborted(journal)) { - ocfs2_abort(osb->sb, "Detected aborted journal"); + ocfs2_abort(osb->sb, "Detected aborted journal\n"); handle = ERR_PTR(-EROFS); } } else { @@ -775,7 +775,18 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); status = jbd2_journal_dirty_metadata(handle, bh); - BUG_ON(status); + if (status) { + mlog_errno(status); + if (!is_handle_aborted(handle)) { + journal_t *journal = handle->h_transaction->t_journal; + + mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " + "Aborting transaction and journal."); + handle->h_err = status; + jbd2_journal_abort_handle(handle); + jbd2_journal_abort(journal, status); + } + } } #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) @@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, struct inode *inode = NULL; struct inode *iter; struct ocfs2_inode_info *oi; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_recover_orphans(slot); @@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto next; + } /* * We need to take and drop the inode lock to * force read inode from disk. 
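 *
 * The locking order used here is the rw lock first, then the cluster
 * inode lock, released in reverse; condensed from the code that
 * follows:
 *
 *	ret = ocfs2_rw_lock(inode, 1);
 *	if (ret)
 *		goto next;
 *	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 *	if (ret)
 *		goto unlock_rw;
 *	... use di_bh ...
 *	ocfs2_inode_unlock(inode, 1);
 *	ocfs2_rw_unlock(inode, 1);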
*/ - ret = ocfs2_inode_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { mlog_errno(ret); - goto next; + goto unlock_rw; } - ocfs2_inode_unlock(inode, 0); + + di = (struct ocfs2_dinode *)di_bh->b_data; if (inode->i_nlink == 0) { spin_lock(&oi->ip_lock); @@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; spin_unlock(&oi->ip_lock); - } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { - struct buffer_head *di_bh = NULL; - - ret = ocfs2_rw_lock(inode, 1); - if (ret) { - mlog_errno(ret); - goto next; - } - - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - ocfs2_rw_unlock(inode, 1); - mlog_errno(ret); - goto next; - } - + } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && + (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ret = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); - ocfs2_inode_unlock(inode, 1); - ocfs2_rw_unlock(inode, 1); - brelse(di_bh); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto next; + goto unlock_inode; } - ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); if (ret) mlog_errno(ret); wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ - +unlock_inode: + ocfs2_inode_unlock(inode, 1); +unlock_rw: + ocfs2_rw_unlock(inode, 1); next: iput(inode); - + brelse(di_bh); + di_bh = NULL; inode = iter; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 857bbbcd39f3..0a4457fb0711 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u used bits, but a count shows %u", + ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 56a768d06aa6..124471d26a73 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 176fe6afd94e..a1507488d521 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1291,6 +1291,15 @@ static int ocfs2_rename(struct inode *old_dir, } parents_locked = 1; + if (!new_dir->i_nlink) { + mlog(ML_ERROR, "new dir %llu has been removed, inode %llu " + "can not be moved into it.", + (unsigned long long)new_dir->i_ino, + (unsigned long long)old_inode->i_ino); + status = -EACCES; + goto bail; + } + /* make sure both dirs have bhs * get an extra ref on old_dir_bh if old==new */ if (!new_dir_bh) { @@ -1551,12 +1560,25 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_find_entry(old_dentry->d_name.name, old_dentry->d_name.len, old_dir, &old_entry_lookup); - if (status) + if (status) { + if 
(!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; + } status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; } @@ -2670,30 +2692,22 @@ bail: } int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, - struct inode *inode, int update_isize, - loff_t end) + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle = NULL; int status = 0; - status = ocfs2_inode_lock(inode, &di_bh, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - di = (struct ocfs2_dinode *) di_bh->b_data; - orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, le16_to_cpu(di->i_dio_orphaned_slot)); if (!orphan_dir_inode) { status = -ENOENT; mlog_errno(status); - goto bail_unlock_inode; + goto bail; } mutex_lock(&orphan_dir_inode->i_mutex); @@ -2702,7 +2716,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); mlog_errno(status); - goto bail_unlock_inode; + goto bail; } handle = ocfs2_start_trans(osb, @@ -2749,10 +2763,6 @@ bail_unlock_orphan: brelse(orphan_dir_bh); iput(orphan_dir_inode); -bail_unlock_inode: - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - bail: return status; } diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 5ddecce172fa..e173329eb830 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode); int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, - struct inode *inode, int update_isize, - loff_t end); + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 460c6c37e683..2c923223ec11 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -286,6 +286,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ + OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ + OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 3d0b63d34225..964b727f4ee2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested " - "to read block %Lu but file has size only %Lu\n", + "Quota file %llu is probably corrupted! 
Requested to read block %Lu but file has size only %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_block, (unsigned long long)i_size_read(inode)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d8c6af101f3f..6f66268d03ca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { - ocfs2_error(sb, - "Refcount block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - rb->rf_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid rf_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(rb->rf_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid " - "rf_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(rb->rf_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + goto out; } - - return 0; +out: + return rc; } static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, @@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(sb, - "refcount tree %llu has non zero tree " - "depth in leaf btree tree block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(sb, + "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -2361,10 +2357,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2547,10 +2541,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2674,11 +2666,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + 
ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3108,11 +3099,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } @@ -3378,10 +3367,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - return -EROFS; + return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4479029630bb..0456ae399bf7 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -167,12 +167,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) } #define do_error(fmt, ...) \ - do{ \ - if (resize) \ - mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ - else \ - ocfs2_error(sb, fmt, ##__VA_ARGS__); \ - } while (0) +do { \ + if (resize) \ + mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ + else \ + return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ +} while (0) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, @@ -181,44 +181,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - do_error("Group descriptor #%llu has bad signature %.*s", + do_error("Group descriptor #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); - return -EINVAL; } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { - do_error("Group descriptor #%llu has an invalid bg_blkno " - "of %llu", + do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); - return -EINVAL; } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { - do_error("Group descriptor #%llu has an invalid " - "fs_generation of #%u", + do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(gd->bg_generation)); - return -EINVAL; } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - do_error("Group descriptor #%llu has bit count %u but " - "claims that %u are free", + do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); - return -EINVAL; } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - do_error("Group descriptor #%llu has bit count %u but " - "max bitmap bits of %u", + do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 * le16_to_cpu(gd->bg_size)); - return -EINVAL; } return 0; @@ -233,20 +224,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_group_desc *gd 
= (struct ocfs2_group_desc *)bh->b_data; if (di->i_blkno != gd->bg_parent_dinode) { - do_error("Group descriptor #%llu has bad parent " - "pointer (%llu, expected %llu)", + do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - do_error("Group descriptor #%llu has bit count of %u", + do_error("Group descriptor #%llu has bit count of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); - return -EINVAL; } /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ @@ -254,10 +242,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || ((le16_to_cpu(gd->bg_chain) == le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { - do_error("Group descriptor #%llu has bad chain %u", + do_error("Group descriptor #%llu has bad chain %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); - return -EINVAL; } return 0; @@ -384,11 +371,10 @@ static int ocfs2_block_group_fill(handle_t *handle, struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { - ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " - "b_blocknr (%llu)", - (unsigned long long)group_blkno, - (unsigned long long) bg_bh->b_blocknr); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "group block (%llu) != b_blocknr (%llu)\n", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); goto bail; } @@ -834,9 +820,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { - ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", - (unsigned long long)le64_to_cpu(fe->i_blkno)); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "Invalid chain allocator %llu\n", + (unsigned long long)le64_to_cpu(fe->i_blkno)); goto bail; } @@ -1370,12 +1356,11 @@ int ocfs2_block_group_set_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. 
num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -1905,13 +1890,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(ac->ac_inode->i_sb, - "Chain allocator dinode %llu has %u used " - "bits but only %u total.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->id1.bitmap1.i_used), - le32_to_cpu(fe->id1.bitmap1.i_total)); - status = -EIO; + status = ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used bits but only %u total\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); goto bail; } @@ -2429,12 +2412,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } if (undo_fn) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 403c5660b306..2fc02f7c2949 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -192,6 +192,7 @@ enum { Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, + Opt_err_cont, Opt_err, }; @@ -224,6 +225,7 @@ static const match_table_t tokens = { {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_err_cont, "errors=continue"}, {Opt_err, NULL} }; @@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NOINTR; break; case Opt_err_panic: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; break; case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; + break; + case Opt_err_cont: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; break; case Opt_data_ordered: mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; @@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_ERRORS_PANIC) seq_printf(s, ",errors=panic"); + else if (opts & OCFS2_MOUNT_ERRORS_CONT) + seq_printf(s, ",errors=continue"); else seq_printf(s, ",errors=remount-ro"); @@ -2541,31 +2554,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) memset(osb, 0, sizeof(struct ocfs2_super)); } -/* Put OCFS2 into a readonly state, or (if the user specifies it), - * panic(). We do not support continue-on-error operation. 
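 *
 * Condensed, the dispatch introduced below is (names taken from the
 * surrounding code):
 *
 *	if (opt & OCFS2_MOUNT_ERRORS_PANIC)
 *		panic(...);
 *	else if (opt & OCFS2_MOUNT_ERRORS_CONT)
 *		rv = -EIO;	// fail only the calling process
 *	else
 *		rv = -EROFS;	// default: remount read-only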
*/ -static void ocfs2_handle_error(struct super_block *sb) +/* Depending on the mount option passed, perform one of the following: + * Put OCFS2 into a readonly state (default) + * Return EIO so that only the process errs + * Fix the error as if fsck.ocfs2 -y + * panic + */ +static int ocfs2_handle_error(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) - panic("OCFS2: (device %s): panic forced after error\n", - sb->s_id); + int rv = 0; ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + pr_crit("On-disk corruption discovered. " + "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); - if (sb->s_flags & MS_RDONLY && - (ocfs2_is_soft_readonly(osb) || - ocfs2_is_hard_readonly(osb))) - return; - - printk(KERN_CRIT "File system is now read-only due to the potential " - "of on-disk corruption. Please run fsck.ocfs2 once the file " - "system is unmounted.\n"); - sb->s_flags |= MS_RDONLY; - ocfs2_set_ro_flag(osb, 0); + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { + pr_crit("OCFS2: Returning error to the calling process.\n"); + rv = -EIO; + } else { /* default option */ + rv = -EROFS; + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return rv; + + pr_crit("OCFS2: File system is now read-only.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); + } + + return rv; } -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; @@ -2577,12 +2602,12 @@ void __ocfs2_error(struct super_block *sb, const char *function, /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); - ocfs2_handle_error(sb); + return ocfs2_handle_error(sb); } /* Handle critical errors. This is intentionally more drastic than @@ -2599,7 +2624,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 74ff74cf78fe..b477d0b1c7b6 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); __printf(3, 4) -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_error(sb, fmt, ...) \ + __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) __printf(3, 4) void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_abort(sb, fmt, ...) 
\ + __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) /* * Void signal blockers, because in-kernel sigprocmask() only fails diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d03bfbf3d27d..5613883c05c0 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ocfs2_error(sb, - "Extended attribute block #%llu has bad " - "signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - xb->xb_signature); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); } if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extended attribute block #%llu has an " - "invalid xb_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(xb->xb_blkno)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); } if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extended attribute block #%llu has an invalid " - "xb_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(xb->xb_fs_generation)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); } return 0; @@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in xattr tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; + if (!capable(CAP_SYS_ADMIN)) + return 0; + if (list && total_len <= list_size) { memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); memcpy(list + prefix_len, name, name_len); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84bb65b83570..4fb17ded7d47 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - 
goto apply_umask; - - *acl = posix_acl_clone(p, GFP_NOFS); - if (!*acl) + clone = posix_acl_clone(p, GFP_NOFS); + if (!clone) goto no_mem; - ret = posix_acl_create_masq(*acl, mode); + ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; no_mem_clone: - posix_acl_release(*acl); + posix_acl_release(clone); no_mem: posix_acl_release(p); return -ENOMEM; diff --git a/fs/proc/array.c b/fs/proc/array.c index fd02a9ebfc30..3f57dac31ba6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; + /* + * Parked tasks do not run; they sit in __kthread_parkme(). + * Without this check, we would report them as running, which is + * clearly wrong, so we report them as sleeping instead. + */ + if (tsk->state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); return task_state_array[fls(state)]; diff --git a/fs/proc/base.c b/fs/proc/base.c index 286a422f440e..a7045f253009 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -196,18 +196,204 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task) +static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, + size_t _count, loff_t *pos) { + struct task_struct *tsk; + struct mm_struct *mm; + char *page; + unsigned long count = _count; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long len1, len2, len; + unsigned long p; + char c; + ssize_t rv; + + BUG_ON(*pos < 0); + + tsk = get_proc_task(file_inode(file)); + if (!tsk) + return -ESRCH; + mm = get_task_mm(tsk); + put_task_struct(tsk); + if (!mm) + return 0; + if (!mm->arg_end) { + rv = 0; + goto out_mmput; + } + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) { + rv = -ENOMEM; + goto out_mmput; + } + + down_read(&mm->mmap_sem); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + + BUG_ON(arg_start > arg_end); + BUG_ON(env_start > env_end); + + len1 = arg_end - arg_start; + len2 = env_end - env_start; + /* - * Rely on struct seq_operations::show() being called once - * per internal buffer allocation. See single_open(), traverse(). + * Inherently racy -- command line shares address space + * with code and data. */ - BUG_ON(m->size < PAGE_SIZE); - m->count += get_cmdline(task, m->buf, PAGE_SIZE); - return 0; + rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + if (rv <= 0) + goto out_free_page; + + rv = 0; + + if (c == '\0') { + /* Command line (set of strings) occupies whole ARGV. 
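 *
 * The two layouts handled here, schematically (NUL shown as \0):
 *
 *	ARGV: "cmd\0arg1\0arg2\0"   -> last byte is NUL, read ARGV only
 *	ARGV: "cmd arg1 arg2" ENVP: "...\0"
 *	                            -> one string spills into ENVP,
 *	                               read on until the first NUL there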
*/ + if (len1 <= *pos) + goto out_free_page; + + p = arg_start + *pos; + len = len1 - *pos; + while (count > 0 && len > 0) { + unsigned int _count; + int nr_read; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + } + } else { + /* + * Command line (1 string) occupies ARGV and maybe + * extends into ENVP. + */ + if (len1 + len2 <= *pos) + goto skip_argv_envp; + if (len1 <= *pos) + goto skip_argv; + + p = arg_start + *pos; + len = len1 - *pos; + while (count > 0 && len > 0) { + unsigned int _count, l; + int nr_read; + bool final; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + /* + * Command line can be shorter than whole ARGV + * even if last "marker" byte says it is not. + */ + final = false; + l = strnlen(page, nr_read); + if (l < nr_read) { + nr_read = l; + final = true; + } + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + + if (final) + goto out_free_page; + } +skip_argv: + /* + * Command line (1 string) occupies ARGV and + * extends into ENVP. + */ + if (len1 <= *pos) { + p = env_start + *pos - len1; + len = len1 + len2 - *pos; + } else { + p = env_start; + len = len2; + } + while (count > 0 && len > 0) { + unsigned int _count, l; + int nr_read; + bool final; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + /* Find EOS. 
*/ + final = false; + l = strnlen(page, nr_read); + if (l < nr_read) { + nr_read = l; + final = true; + } + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + + if (final) + goto out_free_page; + } +skip_argv_envp: + ; + } + +out_free_page: + free_page((unsigned long)page); +out_mmput: + mmput(mm); + if (rv > 0) + *pos += rv; + return rv; } +static const struct file_operations proc_pid_cmdline_ops = { + .read = proc_pid_cmdline_read, + .llseek = generic_file_llseek, +}; + static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -2572,7 +2758,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - ONE("cmdline", S_IRUGO, proc_pid_cmdline), + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), @@ -2918,7 +3104,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - ONE("cmdline", S_IRUGO, proc_pid_cmdline), + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0111ad0466ed..d766bfac06cb 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -588,8 +588,7 @@ static struct kmem_cache *reiserfs_inode_cachep; static struct inode *reiserfs_alloc_inode(struct super_block *sb) { struct reiserfs_inode_info *ei; - ei = (struct reiserfs_inode_info *) - kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; atomic_set(&ei->openers, 0); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 39f1d6a2b04d..e06cbeae14b2 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -96,11 +96,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #endif -#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR +#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, - pmd_t *pmdp) +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); @@ -109,13 +109,13 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif -#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL +#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm, +static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int full) { - return pmdp_get_and_clear(mm, address, pmdp); + return pmdp_huge_get_and_clear(mm, address, pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -152,8 +152,8 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma, pte_t *ptep); #endif -#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH -extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, +#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH +extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long 
address, pmd_t *pmdp); #endif @@ -189,6 +189,31 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif +#ifndef pmdp_collapse_flush +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + /* + * pmd and hugepage pte format are the same. So we could + * use the same function. + */ + return pmdp_huge_clear_flush(vma, address, pmdp); +} +#define pmdp_collapse_flush pmdp_collapse_flush +#else +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + BUILD_BUG(); + return *pmdp; +} +#define pmdp_collapse_flush pmdp_collapse_flush +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0995c2de8162..f589222bfa87 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename, /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. */ -#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT) -#define HASHDIST_DEFAULT 1 +#ifdef CONFIG_NUMA +#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) +extern int hashdist; /* Distribute hashes across NUMA nodes? */ #else -#define HASHDIST_DEFAULT 0 +#define hashdist (0) #endif -extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif /* _LINUX_BOOTMEM_H */ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 371e560d13cf..dfaa7b3e9ae9 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -5,9 +5,9 @@ /* * Common definitions for all gcc versions go here. */ -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) /* Optimization barrier */ @@ -46,55 +46,63 @@ * the inline assembly constraint from =g to =r, in this particular * case either is valid. */ -#define RELOC_HIDE(ptr, off) \ - ({ unsigned long __ptr; \ - __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ - (typeof(ptr)) (__ptr + (off)); }) +#define RELOC_HIDE(ptr, off) \ +({ \ + unsigned long __ptr; \ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ + (typeof(ptr)) (__ptr + (off)); \ +}) /* Make the optimizer believe the variable can be manipulated arbitrarily. 
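 *
 * Usage sketch (illustrative only): hide a value from constant
 * propagation so the compiler cannot specialize on it:
 *
 *	int key = 0;
 *	OPTIMIZER_HIDE_VAR(key);	// key is now opaque to the optimizer
 *	if (key)			// branch is kept, not folded away
 *		...;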
*/ -#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var)) +#define OPTIMIZER_HIDE_VAR(var) \ + __asm__ ("" : "=r" (var) : "0" (var)) #ifdef __CHECKER__ -#define __must_be_array(arr) 0 +#define __must_be_array(a) 0 #else /* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) #endif /* * Force always-inline if the user requests it so via the .config, * or if gcc is too old: */ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ +#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -# define inline inline __attribute__((always_inline)) notrace -# define __inline__ __inline__ __attribute__((always_inline)) notrace -# define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline)) notrace +#define __inline__ __inline__ __attribute__((always_inline)) notrace +#define __inline __inline __attribute__((always_inline)) notrace #else /* A lot of inline functions can cause havoc with function tracing */ -# define inline inline notrace -# define __inline__ __inline__ notrace -# define __inline __inline notrace +#define inline inline notrace +#define __inline__ __inline__ notrace +#define __inline __inline notrace #endif -#define __deprecated __attribute__((deprecated)) -#define __packed __attribute__((packed)) -#define __weak __attribute__((weak)) -#define __alias(symbol) __attribute__((alias(#symbol))) +#define __always_inline inline __attribute__((always_inline)) +#define noinline __attribute__((noinline)) + +#define __deprecated __attribute__((deprecated)) +#define __packed __attribute__((packed)) +#define __weak __attribute__((weak)) +#define __alias(symbol) __attribute__((alias(#symbol))) /* - * it doesn't make sense on ARM (currently the only user of __naked) to trace - * naked functions because then mcount is called without stack and frame pointer - * being set up and there is no chance to restore the lr register to the value - * before mcount was called. + * it doesn't make sense on ARM (currently the only user of __naked) + * to trace naked functions because then mcount is called without + * stack and frame pointer being set up and there is no chance to + * restore the lr register to the value before mcount was called. + * + * The asm() bodies of naked functions often depend on standard calling + * conventions, therefore they must be noinline and noclone. * - * The asm() bodies of naked functions often depend on standard calling conventions, - * therefore they must be noinline and noclone. GCC 4.[56] currently fail to enforce - * this, so we must do so ourselves. See GCC PR44290. + * GCC 4.[56] currently fail to enforce this, so we must do so ourselves. + * See GCC PR44290. */ -#define __naked __attribute__((naked)) noinline __noclone notrace +#define __naked __attribute__((naked)) noinline __noclone notrace -#define __noreturn __attribute__((noreturn)) +#define __noreturn __attribute__((noreturn)) /* * From the GCC manual: @@ -106,19 +114,130 @@ * would be. * [...] 
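 *
 * An illustrative candidate: the return value depends only on the
 * arguments and the function has no side effects, so calls may be
 * subject to common subexpression elimination:
 *
 *	static __pure int imax(int a, int b)
 *	{
 *		return a > b ? a : b;
 *	}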
*/ -#define __pure __attribute__((pure)) -#define __aligned(x) __attribute__((aligned(x))) -#define __printf(a, b) __attribute__((format(printf, a, b))) -#define __scanf(a, b) __attribute__((format(scanf, a, b))) -#define noinline __attribute__((noinline)) -#define __attribute_const__ __attribute__((__const__)) -#define __maybe_unused __attribute__((unused)) -#define __always_unused __attribute__((unused)) - -#define __gcc_header(x) #x -#define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h) -#define gcc_header(x) _gcc_header(x) -#include gcc_header(__GNUC__) +#define __pure __attribute__((pure)) +#define __aligned(x) __attribute__((aligned(x))) +#define __printf(a, b) __attribute__((format(printf, a, b))) +#define __scanf(a, b) __attribute__((format(scanf, a, b))) +#define __attribute_const__ __attribute__((__const__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +/* gcc version specific checks */ + +#if GCC_VERSION < 30200 +# error Sorry, your compiler is too old - please upgrade it. +#endif + +#if GCC_VERSION < 30300 +# define __used __attribute__((__unused__)) +#else +# define __used __attribute__((__used__)) +#endif + +#ifdef CONFIG_GCOV_KERNEL +# if GCC_VERSION < 30400 +# error "GCOV profiling support for gcc versions below 3.4 not included" +# endif /* __GNUC_MINOR__ */ +#endif /* CONFIG_GCOV_KERNEL */ + +#if GCC_VERSION >= 30400 +#define __must_check __attribute__((warn_unused_result)) +#endif + +#if GCC_VERSION >= 40000 + +/* GCC 4.1.[01] miscompiles __weak */ +#ifdef __KERNEL__ +# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101 +# error Your version of gcc miscompiles the __weak directive +# endif +#endif + +#define __used __attribute__((__used__)) +#define __compiler_offsetof(a, b) \ + __builtin_offsetof(a, b) + +#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 +# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) +#endif + +#if GCC_VERSION >= 40300 +/* Mark functions as cold. gcc will assume any path leading to a call + * to them will be unlikely. This means a lot of manual unlikely()s + * are unnecessary now for any paths leading to the usual suspects + * like BUG(), printk(), panic() etc. [but let's keep them for now for + * older compilers] + * + * Early snapshots of gcc 4.3 don't support this and we can't detect this + * in the preprocessor, but we can live with this because they're unreleased. + * Maketime probing would be overkill here. + * + * gcc also has a __attribute__((__hot__)) to move hot functions into + * a special section, but I don't see any sense in this right now in + * the kernel context + */ +#define __cold __attribute__((__cold__)) + +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +#ifndef __CHECKER__ +# define __compiletime_warning(message) __attribute__((warning(message))) +# define __compiletime_error(message) __attribute__((error(message))) +#endif /* __CHECKER__ */ +#endif /* GCC_VERSION >= 40300 */ + +#if GCC_VERSION >= 40500 +/* + * Mark a position in code as unreachable. This can be used to + * suppress control flow warnings after asm blocks that transfer + * control elsewhere. + * + * Early snapshots of gcc 4.5 don't support this and we can't detect + * this in the preprocessor, but we can live with this because they're + * unreleased. Really, we need to have autoconf for the kernel. + */ +#define unreachable() __builtin_unreachable() + +/* Mark a function definition as prohibited from being cloned. 
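 *
 * Illustrative use (any function whose exact, single instance must
 * survive optimization, e.g. one referenced from inline asm; the name
 * is hypothetical):
 *
 *	static noinline __noclone int asm_target(int v)
 *	{
 *		return v + 1;
 *	}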
*/ +#define __noclone __attribute__((__noclone__)) + +#endif /* GCC_VERSION >= 40500 */ + +#if GCC_VERSION >= 40600 +/* + * Tell the optimizer that something else uses this function or variable. + */ +#define __visible __attribute__((externally_visible)) +#endif + +/* + * GCC 'asm goto' miscompiles certain code sequences: + * + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 + * + * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. + * + * (asm goto is automatically volatile - the naming reflects this.) + */ +#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) + +#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP +#if GCC_VERSION >= 40400 +#define __HAVE_BUILTIN_BSWAP32__ +#define __HAVE_BUILTIN_BSWAP64__ +#endif +#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600) +#define __HAVE_BUILTIN_BSWAP16__ +#endif +#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ + +#if GCC_VERSION >= 50000 +#define KASAN_ABI_VERSION 4 +#elif GCC_VERSION >= 40902 +#define KASAN_ABI_VERSION 3 +#endif + +#endif /* gcc version >= 40000 specific checks */ #if !defined(__noclone) #define __noclone /* not needed */ @@ -129,5 +248,3 @@ * code */ #define uninitialized_var(x) x = x - -#define __always_inline inline __attribute__((always_inline)) diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h deleted file mode 100644 index 7d89febe4d79..000000000000 --- a/include/linux/compiler-gcc3.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include <linux/compiler-gcc3.h> directly, include <linux/compiler.h> instead." -#endif - -#if GCC_VERSION < 30200 -# error Sorry, your compiler is too old - please upgrade it. -#endif - -#if GCC_VERSION >= 30300 -# define __used __attribute__((__used__)) -#else -# define __used __attribute__((__unused__)) -#endif - -#if GCC_VERSION >= 30400 -#define __must_check __attribute__((warn_unused_result)) -#endif - -#ifdef CONFIG_GCOV_KERNEL -# if GCC_VERSION < 30400 -# error "GCOV profiling support for gcc versions below 3.4 not included" -# endif /* __GNUC_MINOR__ */ -#endif /* CONFIG_GCOV_KERNEL */ diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h deleted file mode 100644 index 769e19864632..000000000000 --- a/include/linux/compiler-gcc4.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include <linux/compiler-gcc4.h> directly, include <linux/compiler.h> instead." -#endif - -/* GCC 4.1.[01] miscompiles __weak */ -#ifdef __KERNEL__ -# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101 -# error Your version of gcc miscompiles the __weak directive -# endif -#endif - -#define __used __attribute__((__used__)) -#define __must_check __attribute__((warn_unused_result)) -#define __compiler_offsetof(a,b) __builtin_offsetof(a,b) - -#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 -# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) -#endif - -#if GCC_VERSION >= 40300 -/* Mark functions as cold. gcc will assume any path leading to a call - to them will be unlikely. This means a lot of manual unlikely()s - are unnecessary now for any paths leading to the usual suspects - like BUG(), printk(), panic() etc. [but let's keep them for now for - older compilers] - - Early snapshots of gcc 4.3 don't support this and we can't detect this - in the preprocessor, but we can live with this because they're unreleased. - Maketime probing would be overkill here. 
- - gcc also has a __attribute__((__hot__)) to move hot functions into - a special section, but I don't see any sense in this right now in - the kernel context */ -#define __cold __attribute__((__cold__)) - -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#ifndef __CHECKER__ -# define __compiletime_warning(message) __attribute__((warning(message))) -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* __CHECKER__ */ -#endif /* GCC_VERSION >= 40300 */ - -#if GCC_VERSION >= 40500 -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. - */ -#define unreachable() __builtin_unreachable() - -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) - -#endif /* GCC_VERSION >= 40500 */ - -#if GCC_VERSION >= 40600 -/* - * Tell the optimizer that something else uses this function or variable. - */ -#define __visible __attribute__((externally_visible)) -#endif - -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) - -#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP -#if GCC_VERSION >= 40400 -#define __HAVE_BUILTIN_BSWAP32__ -#define __HAVE_BUILTIN_BSWAP64__ -#endif -#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600) -#define __HAVE_BUILTIN_BSWAP16__ -#endif -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ - -#if GCC_VERSION >= 40902 -#define KASAN_ABI_VERSION 3 -#endif diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h deleted file mode 100644 index efee493714eb..000000000000 --- a/include/linux/compiler-gcc5.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#error "Please don't include <linux/compiler-gcc5.h> directly, include <linux/compiler.h> instead." -#endif - -#define __used __attribute__((__used__)) -#define __must_check __attribute__((warn_unused_result)) -#define __compiler_offsetof(a, b) __builtin_offsetof(a, b) - -/* Mark functions as cold. gcc will assume any path leading to a call - to them will be unlikely. This means a lot of manual unlikely()s - are unnecessary now for any paths leading to the usual suspects - like BUG(), printk(), panic() etc. [but let's keep them for now for - older compilers] - - Early snapshots of gcc 4.3 don't support this and we can't detect this - in the preprocessor, but we can live with this because they're unreleased. - Maketime probing would be overkill here. - - gcc also has a __attribute__((__hot__)) to move hot functions into - a special section, but I don't see any sense in this right now in - the kernel context */ -#define __cold __attribute__((__cold__)) - -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -#ifndef __CHECKER__ -# define __compiletime_warning(message) __attribute__((warning(message))) -# define __compiletime_error(message) __attribute__((error(message))) -#endif /* __CHECKER__ */ - -/* - * Mark a position in code as unreachable. 
This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. - */ -#define unreachable() __builtin_unreachable() - -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__)) - -/* - * Tell the optimizer that something else uses this function or variable. - */ -#define __visible __attribute__((externally_visible)) - -/* - * GCC 'asm goto' miscompiles certain code sequences: - * - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 - * - * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * - * (asm goto is automatically volatile - the naming reflects this.) - */ -#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) - -#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP -#define __HAVE_BUILTIN_BSWAP32__ -#define __HAVE_BUILTIN_BSWAP64__ -#define __HAVE_BUILTIN_BSWAP16__ -#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ - -#define KASAN_ABI_VERSION 4 diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 34025df61829..c9e5c57e4edf 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item) return item->ci_name; } -extern void config_item_init(struct config_item *); extern void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type); diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h new file mode 100644 index 000000000000..bba7a4d692b3 --- /dev/null +++ b/include/linux/crc64_ecma.h @@ -0,0 +1,56 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CRC64_ECMA_H_ +#define __CRC64_ECMA_H_ + +#include <linux/types.h> + + +#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * @pdata: pointer to the data to compute checksum for. + * @nbytes: number of bytes in data buffer. + * @seed: CRC seed. + */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed); + +#endif /* __CRC64_ECMA_H_ */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 70a7fee1efb3..ad35f300b9a4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,6 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_NOACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -87,6 +88,7 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ +#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) /* Don't account to kmemcg */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) @@ -382,6 +384,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +void page_alloc_init_late(void); +#else +static inline void page_alloc_init_late(void) +{ +} +#endif + /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. 
Once interrupts are diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f10b20f05159..44a840a53974 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +extern int madvise_free_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); @@ -56,6 +59,7 @@ extern pmd_t *page_check_address_pmd(struct page *page, unsigned long address, enum page_check_address_pmd_flag flag, spinlock_t **ptl); +extern int pmd_freeable(pmd_t pmd); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index e804306ef5e8..e5fe4c1416a2 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -269,6 +269,8 @@ unsigned long paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9497ec7c77ea..bcfa4b63311b 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -93,6 +93,9 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); +void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, + phys_addr_t *out_end); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. @@ -132,6 +135,21 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a, __next_mem_range_rev(&i, nid, type_a, type_b, \ p_start, p_end, p_nid)) +/** + * for_each_reserved_mem_region - iterate over all reserved memblock areas + * @i: u64 used as loop variable + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over reserved areas of memblock. Available as soon as memblock + * is initialized. + */ +#define for_each_reserved_mem_region(i, p_start, p_end) \ + for (i = 0UL, \ + __next_reserved_mem_region(&i, p_start, p_end); \ + i != (u64)ULLONG_MAX; \ + __next_reserved_mem_region(&i, p_start, p_end)) + #ifdef CONFIG_MOVABLE_NODE static inline bool memblock_is_hotpluggable(struct memblock_region *m) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 72dff5fb0d0c..6c8918114804 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -463,6 +463,8 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) if (!memcg_kmem_enabled()) return true; + if (gfp & __GFP_NOACCOUNT) + return true; /* * __GFP_NOFAIL allocations will move on even if charging is not * possible. 
Therefore we don't even try, and have this allocation @@ -522,6 +524,8 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { if (!memcg_kmem_enabled()) return cachep; + if (gfp & __GFP_NOACCOUNT) + return cachep; if (gfp & __GFP_NOFAIL) return cachep; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h new file mode 100644 index 000000000000..4efc3f56e6df --- /dev/null +++ b/include/linux/mm-arch-hooks.h @@ -0,0 +1,25 @@ +/* + * Generic mm no-op hooks. + * + * Copyright (C) 2015, IBM Corporation + * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _LINUX_MM_ARCH_HOOKS_H +#define _LINUX_MM_ARCH_HOOKS_H + +#include <asm/mm-arch-hooks.h> + +#ifndef arch_remap +static inline void arch_remap(struct mm_struct *mm, + unsigned long old_start, unsigned long old_end, + unsigned long new_start, unsigned long new_end) +{ +} +#define arch_remap arch_remap +#endif + +#endif /* _LINUX_MM_ARCH_HOOKS_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 0755b9fd03a7..2923a51979e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -436,46 +436,6 @@ static inline void compound_unlock_irqrestore(struct page *page, #endif } -static inline struct page *compound_head_by_tail(struct page *tail) -{ - struct page *head = tail->first_page; - - /* - * page->first_page may be a dangling pointer to an old - * compound page, so recheck that it is still a tail - * page before returning. - */ - smp_rmb(); - if (likely(PageTail(tail))) - return head; - return tail; -} - -/* - * Since either compound page could be dismantled asynchronously in THP - * or we access asynchronously arbitrary positioned struct page, there - * would be tail flag race. To handle this race, we should call - * smp_rmb() before checking tail flag. compound_head_by_tail() did it. - */ -static inline struct page *compound_head(struct page *page) -{ - if (unlikely(PageTail(page))) - return compound_head_by_tail(page); - return page; -} - -/* - * If we access compound page synchronously such as access to - * allocated page, there is no need to handle tail flag race, so we can - * check tail flag directly without any synchronization primitive. - */ -static inline struct page *compound_head_fast(struct page *page) -{ - if (unlikely(PageTail(page))) - return page->first_page; - return page; -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -499,7 +459,7 @@ static inline int page_count(struct page *page) static inline bool __compound_tail_refcounted(struct page *page) { - return !PageSlab(page) && !PageHeadHuge(page); + return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); } /* @@ -1631,6 +1591,8 @@ extern void free_highmem_page(struct page *page); extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); +extern void reserve_bootmem_region(unsigned long start, unsigned long end); + /* Free the reserved page into the buddy system, so it gets managed. 
*/ static inline void __free_reserved_page(struct page *page) { @@ -1720,7 +1682,8 @@ extern void sparse_memory_present_with_active_regions(int nid); #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn) +static inline int __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { return 0; } @@ -1728,7 +1691,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn) /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); /* there is a per-arch backend function. */ -extern int __meminit __early_pfn_to_nid(unsigned long pfn); +extern int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); @@ -2146,12 +2110,47 @@ enum mf_flags { extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); +extern int get_hwpoison_page(struct page *page); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages; extern int soft_offline_page(struct page *page, int flags); + +/* + * Error handlers for various types of pages. + */ +enum mf_result { + MF_IGNORED, /* Error: cannot be handled */ + MF_FAILED, /* Error: handling failed */ + MF_DELAYED, /* Will be handled later */ + MF_RECOVERED, /* Successfully recovered */ +}; + +enum mf_action_page_type { + MF_MSG_KERNEL, + MF_MSG_KERNEL_HIGH_ORDER, + MF_MSG_SLAB, + MF_MSG_DIFFERENT_COMPOUND, + MF_MSG_POISONED_HUGE, + MF_MSG_HUGE, + MF_MSG_FREE_HUGE, + MF_MSG_UNMAP_FAILED, + MF_MSG_DIRTY_SWAPCACHE, + MF_MSG_CLEAN_SWAPCACHE, + MF_MSG_DIRTY_MLOCKED_LRU, + MF_MSG_CLEAN_MLOCKED_LRU, + MF_MSG_DIRTY_UNEVICTABLE_LRU, + MF_MSG_CLEAN_UNEVICTABLE_LRU, + MF_MSG_DIRTY_LRU, + MF_MSG_CLEAN_LRU, + MF_MSG_TRUNCATED_LRU, + MF_MSG_BUDDY, + MF_MSG_BUDDY_2ND, + MF_MSG_UNKNOWN, +}; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr, diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 95243d28a0ee..61cd67f4d788 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pte; \ }) -#define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \ +#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pmd_t ___pmd; \ \ - ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \ + ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ mmu_notifier_invalidate_range(___mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ ___pmd; \ }) -#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \ +#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ pmd_t ___pmd; \ \ - ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \ + ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ mmu_notifier_invalidate_range(__mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ @@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_young_notify ptep_clear_flush_young #define 
pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush -#define pmdp_clear_flush_notify pmdp_clear_flush -#define pmdp_get_and_clear_notify pmdp_get_and_clear +#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush +#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 54d74f6eb233..754c25966a0a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -762,6 +762,14 @@ typedef struct pglist_data { /* Number of pages migrated during the rate limiting time interval */ unsigned long numabalancing_migrate_nr_pages; #endif + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * If memory initialisation on large machines is deferred then this + * is the first PFN that needs to be initialised. + */ + unsigned long first_deferred_pfn; +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -1216,11 +1224,16 @@ void sparse_init(void); #define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool early_pfn_in_nid(unsigned long pfn, int nid); -#else -#define early_pfn_in_nid(pfn, nid) (1) -#endif +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * may treat start/end as pfns or sections. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3d46fb4708e0..f94da0e65dea 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long *watchdog_cpumask_bits; extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , @@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); +extern int proc_watchdog_cpumask(struct ctl_table *, int, + void __user *, size_t *, loff_t *); #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f34e040b34e9..91b7f9b2b774 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -134,49 +134,68 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +/* Page flags policies wrt compound pages */ +#define PF_ANY(page, enforce) page +#define PF_HEAD(page, enforce) compound_head(page) +#define PF_NO_TAIL(page, enforce) ({ \ + if (enforce) \ + VM_BUG_ON_PAGE(PageTail(page), page); \ + else \ + page = compound_head(page); \ + page;}) +#define PF_NO_COMPOUND(page, enforce) ({ \ + if (enforce) \ + VM_BUG_ON_PAGE(PageCompound(page), page); \ + page;}) + /* * Macros to create function definitions for page flags */ -#define TESTPAGEFLAG(uname, lname) \ -static inline int Page##uname(const struct page *page) \ - { return test_bit(PG_##lname, &page->flags); } +#define TESTPAGEFLAG(uname, lname, policy) \ +static inline int Page##uname(struct page *page) \ + { return test_bit(PG_##lname, &policy(page, 0)->flags); } -#define SETPAGEFLAG(uname, 
lname) \ +#define SETPAGEFLAG(uname, lname, policy) \ static inline void SetPage##uname(struct page *page) \ - { set_bit(PG_##lname, &page->flags); } + { set_bit(PG_##lname, &policy(page, 1)->flags); } -#define CLEARPAGEFLAG(uname, lname) \ +#define CLEARPAGEFLAG(uname, lname, policy) \ static inline void ClearPage##uname(struct page *page) \ - { clear_bit(PG_##lname, &page->flags); } + { clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __SETPAGEFLAG(uname, lname) \ +#define __SETPAGEFLAG(uname, lname, policy) \ static inline void __SetPage##uname(struct page *page) \ - { __set_bit(PG_##lname, &page->flags); } + { __set_bit(PG_##lname, &policy(page, 1)->flags); } -#define __CLEARPAGEFLAG(uname, lname) \ +#define __CLEARPAGEFLAG(uname, lname, policy) \ static inline void __ClearPage##uname(struct page *page) \ - { __clear_bit(PG_##lname, &page->flags); } + { __clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define TESTSETFLAG(uname, lname) \ +#define TESTSETFLAG(uname, lname, policy) \ static inline int TestSetPage##uname(struct page *page) \ - { return test_and_set_bit(PG_##lname, &page->flags); } + { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } -#define TESTCLEARFLAG(uname, lname) \ +#define TESTCLEARFLAG(uname, lname, policy) \ static inline int TestClearPage##uname(struct page *page) \ - { return test_and_clear_bit(PG_##lname, &page->flags); } + { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __TESTCLEARFLAG(uname, lname) \ +#define __TESTCLEARFLAG(uname, lname, policy) \ static inline int __TestClearPage##uname(struct page *page) \ - { return __test_and_clear_bit(PG_##lname, &page->flags); } + { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ - SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname) +#define PAGEFLAG(uname, lname, policy) \ + TESTPAGEFLAG(uname, lname, policy) \ + SETPAGEFLAG(uname, lname, policy) \ + CLEARPAGEFLAG(uname, lname, policy) -#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ - __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) +#define __PAGEFLAG(uname, lname, policy) \ + TESTPAGEFLAG(uname, lname, policy) \ + __SETPAGEFLAG(uname, lname, policy) \ + __CLEARPAGEFLAG(uname, lname, policy) -#define TESTSCFLAG(uname, lname) \ - TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) +#define TESTSCFLAG(uname, lname, policy) \ + TESTSETFLAG(uname, lname, policy) \ + TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname) \ static inline int Page##uname(const struct page *page) { return 0; } @@ -205,47 +224,100 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) -struct page; /* forward declaration */ - -TESTPAGEFLAG(Locked, locked) -PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) -PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) - __SETPAGEFLAG(Referenced, referenced) -PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) -PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) -PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) - TESTCLEARFLAG(Active, active) -__PAGEFLAG(Slab, slab) -PAGEFLAG(Checked, checked) /* Used by some filesystems */ -PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ -PAGEFLAG(SavePinned, savepinned); /* Xen */ -PAGEFLAG(Foreign, foreign); /* Xen */ -PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, 
reserved) -PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) - __SETPAGEFLAG(SwapBacked, swapbacked) - -__PAGEFLAG(SlobFree, slob_free) +/* Forward declarations */ +struct page; +static inline int PageCompound(struct page *page); +static inline int PageTail(struct page *page); + +static inline struct page *compound_head_by_tail(struct page *tail) +{ + struct page *head = tail->first_page; + + /* + * page->first_page may be a dangling pointer to an old + * compound page, so recheck that it is still a tail + * page before returning. + */ + smp_rmb(); + if (likely(PageTail(tail))) + return head; + return tail; +} + +/* + * Since either compound page could be dismantled asynchronously in THP + * or we access asynchronously arbitrary positioned struct page, there + * would be tail flag race. To handle this race, we should call + * smp_rmb() before checking tail flag. compound_head_by_tail() did it. + */ +static inline struct page *compound_head(struct page *page) +{ + if (unlikely(PageTail(page))) + return compound_head_by_tail(page); + return page; +} + +/* + * If we access compound page synchronously such as access to + * allocated page, there is no need to handle tail flag race, so we can + * check tail flag directly without any synchronization primitive. + */ +static inline struct page *compound_head_fast(struct page *page) +{ + if (unlikely(PageTail(page))) + return page->first_page; + return page; +} + +__PAGEFLAG(Locked, locked, PF_NO_TAIL) +PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND) +PAGEFLAG(Referenced, referenced, PF_HEAD) + TESTCLEARFLAG(Referenced, referenced, PF_HEAD) + __SETPAGEFLAG(Referenced, referenced, PF_HEAD) +PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) + __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) +PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) +PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) + TESTCLEARFLAG(Active, active, PF_HEAD) +__PAGEFLAG(Slab, slab, PF_NO_TAIL) +__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) +PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ + +/* Xen */ +PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) +PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND) +PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND) + +PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) + __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) +PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) + __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) + __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause releasepage() and co to be invoked */ -PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) - __CLEARPAGEFLAG(Private, private) -PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) -PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) +PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY) + __CLEARPAGEFLAG(Private, private, PF_ANY) +PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) +PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) + TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. 
*/ -TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -PAGEFLAG(MappedToDisk, mappedtodisk) +TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) + TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) +PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ -PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) +PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) + TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) + TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* @@ -258,31 +330,33 @@ PAGEFLAG_FALSE(HighMem) #endif #ifdef CONFIG_SWAP -PAGEFLAG(SwapCache, swapcache) +PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(SwapCache) #endif -PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) - TESTCLEARFLAG(Unevictable, unevictable) +PAGEFLAG(Unevictable, unevictable, PF_HEAD) + __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) + TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) - TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) +PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) + TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) + __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached) +PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(Uncached) #endif #ifdef CONFIG_MEMORY_FAILURE -PAGEFLAG(HWPoison, hwpoison) -TESTSCFLAG(HWPoison, hwpoison) +PAGEFLAG(HWPoison, hwpoison, PF_ANY) +TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison) @@ -311,6 +385,7 @@ PAGEFLAG_FALSE(HWPoison) static inline int PageAnon(struct page *page) { + page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; } @@ -323,6 +398,7 @@ static inline int PageAnon(struct page *page) */ static inline int PageKsm(struct page *page) { + page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); } @@ -334,8 +410,9 @@ u64 stable_page_flags(struct page *page); static inline int PageUptodate(struct page *page) { - int ret = test_bit(PG_uptodate, &(page)->flags); - + int ret; + page = compound_head(page); + ret = test_bit(PG_uptodate, &(page)->flags); /* * Must ensure that the data we read out of the page is loaded * _after_ we've loaded page->flags to check for PageUptodate. @@ -352,22 +429,24 @@ static inline int PageUptodate(struct page *page) static inline void __SetPageUptodate(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); - __set_bit(PG_uptodate, &(page)->flags); + __set_bit(PG_uptodate, &page->flags); } static inline void SetPageUptodate(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the page * uptodate are actually visible before PageUptodate becomes true. 
*/ smp_wmb(); - set_bit(PG_uptodate, &(page)->flags); + set_bit(PG_uptodate, &page->flags); } -CLEARPAGEFLAG(Uptodate, uptodate) +CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); @@ -396,8 +475,8 @@ static inline void set_page_writeback_keepwrite(struct page *page) * and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages * and avoid handling those in real mode. */ -__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) -__PAGEFLAG(Tail, tail) +__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +__PAGEFLAG(Tail, tail, PF_ANY) static inline int PageCompound(struct page *page) { @@ -421,8 +500,8 @@ static inline void ClearPageCompound(struct page *page) * because PageCompound is always set for compound pages and not for * pages on the LRU and/or pagecache. */ -TESTPAGEFLAG(Compound, compound) -__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound) +TESTPAGEFLAG(Compound, compound, PF_ANY) +__SETPAGEFLAG(Head, compound, PF_ANY) __CLEARPAGEFLAG(Head, compound, PF_ANY) /* * PG_reclaim is used in combination with PG_compound to mark the @@ -518,21 +597,9 @@ static inline int PageTransTail(struct page *page) } #else - -static inline int PageTransHuge(struct page *page) -{ - return 0; -} - -static inline int PageTransCompound(struct page *page) -{ - return 0; -} - -static inline int PageTransTail(struct page *page) -{ - return 0; -} +TESTPAGEFLAG_FALSE(TransHuge) +TESTPAGEFLAG_FALSE(TransCompound) +TESTPAGEFLAG_FALSE(TransTail) #endif /* @@ -655,6 +722,10 @@ static inline int page_has_private(struct page *page) return !!(page->flags & PAGE_FLAGS_PRIVATE); } +#undef PF_ANY +#undef PF_HEAD +#undef PF_NO_TAIL +#undef PF_NO_COMPOUND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b3736f7065c..7c3790764795 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -426,18 +426,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); -static inline void __set_page_locked(struct page *page) -{ - __set_bit(PG_locked, &page->flags); -} - -static inline void __clear_page_locked(struct page *page) -{ - __clear_bit(PG_locked, &page->flags); -} - static inline int trylock_page(struct page *page) { + page = compound_head(page); return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } @@ -490,9 +481,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page, static inline int wait_on_page_locked_killable(struct page *page) { - if (PageLocked(page)) - return wait_on_page_bit_killable(page, PG_locked); - return 0; + if (!PageLocked(page)) + return 0; + return wait_on_page_bit_killable(compound_head(page), PG_locked); } extern wait_queue_head_t *page_waitqueue(struct page *page); @@ -511,7 +502,7 @@ static inline void wake_up_page(struct page *page, int bit) static inline void wait_on_page_locked(struct page *page) { if (PageLocked(page)) - wait_on_page_bit(page, PG_locked); + wait_on_page_bit(compound_head(page), PG_locked); } /* @@ -656,17 +647,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: - * the page is new, so we can just run __set_page_locked() against it. + * the page is new, so we can just run __SetPageLocked() against it. 
*/ static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; - __set_page_locked(page); + __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) - __clear_page_locked(page); + __ClearPageLocked(page); return error; } diff --git a/include/linux/poison.h b/include/linux/poison.h index 2110a81c5e2a..7b2a7fcde6a3 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -32,6 +32,10 @@ /********** mm/debug-pagealloc.c **********/ #define PAGE_POISON 0xaa +/********** mm/page_alloc.c ************/ + +#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA) + /********** mm/slab.c **********/ /* * Magic nums for obj red zoning. diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c89c53a113a8..bf36b6e644c4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -85,6 +85,7 @@ enum ttu_flags { TTU_UNMAP = 1, /* unmap mode */ TTU_MIGRATION = 2, /* migration mode */ TTU_MUNLOCK = 4, /* munlock mode */ + TTU_FREE = 8, /* free mode */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ @@ -183,7 +184,8 @@ static inline void page_dup_rmap(struct page *page) * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, - struct mem_cgroup *memcg, unsigned long *vm_flags); + struct mem_cgroup *memcg, unsigned long *vm_flags, + int *is_pte_dirty); #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) @@ -260,9 +262,12 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc); static inline int page_referenced(struct page *page, int is_locked, struct mem_cgroup *memcg, - unsigned long *vm_flags) + unsigned long *vm_flags, + int *is_pte_dirty) { *vm_flags = 0; + if (is_pte_dirty) + *is_pte_dirty = 0; return 0; } diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 587017e7939c..ad6e32f3eab2 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -133,6 +133,10 @@ struct rtc_device { /* Some hardware can't support UIE mode */ int uie_unsupported; +#ifdef CONFIG_PM_SLEEP + struct rtc_wkalrm alarm; + bool valid_alarm; +#endif #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL struct work_struct uie_task; struct timer_list uie_timer; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index eca1ec93775c..e839025a81cb 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -265,12 +265,12 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, gfp_t gfp_mask); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen); + const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen, off_t skip); + const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); diff --git a/include/linux/sched.h b/include/linux/sched.h index d6540c5929c7..e860d8658640 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2556,8 +2556,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned 
long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif extern void flush_thread(void); extern void exit_thread(void); @@ -2576,6 +2590,7 @@ extern int do_execveat(int, struct filename *, const char __user * const __user *, const char __user * const __user *, int); +extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/include/linux/slab.h b/include/linux/slab.h index ffd24c830151..ca761238b2a5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -153,8 +153,30 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) +/* + * The KMALLOC_LOOP_LOW is the definition for the for loop index start number + * to create the kmalloc_caches object in create_kmalloc_caches(). The first + * and the second are 96 and 192. You can see that in the kmalloc_index(), if + * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64, + * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't + * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW. + */ +#if KMALLOC_MIN_SIZE <= 32 +#define KMALLOC_LOOP_LOW 1 +#elif KMALLOC_MIN_SIZE <= 64 +#define KMALLOC_LOOP_LOW 2 +#else +#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW +#endif + #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +/* + * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used. + * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be + * initialized. + */ +#define KMALLOC_LOOP_LOW 1 #endif /* @@ -240,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; * belongs to. * 0 = zero alloc * 1 = 65 .. 96 bytes - * 2 = 120 .. 192 bytes - * n = 2^(n-1) .. 2^n -1 + * 2 = 129 .. 192 bytes + * n = 2^(n-1)+1 .. 2^n */ static __always_inline int kmalloc_index(size_t size) { @@ -290,6 +312,16 @@ void *__kmalloc(size_t size, gfp_t flags); void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); void kmem_cache_free(struct kmem_cache *, void *); +/* + * Bulk allocation and freeing operations. These are accellerated in an + * allocator specific way to avoid taking locks repeatedly or building + * metadata structures unnecessarily. + * + * Note that interrupts must be enabled when calling these functions. 
+ */ +void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index d600afb21926..da3c593f9845 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -27,6 +27,8 @@ struct smpboot_thread_data; * @pre_unpark: Optional unpark function, called before the thread is * unparked (cpu online). This is not guaranteed to be * called on the target cpu of the thread. Careful! + * @cpumask: Internal state. To update which threads are unparked, + * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. * @thread_comm: The base name of the thread */ @@ -41,11 +43,14 @@ struct smp_hotplug_thread { void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); void (*pre_unpark)(unsigned int cpu); + cpumask_var_t cpumask; bool selfparking; const char *thread_comm; }; int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *); #endif diff --git a/include/linux/string.h b/include/linux/string.h index e40099e585c9..12a5a60f1f3b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -117,6 +117,7 @@ extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp); extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); +extern char *kstrimdup(const char *s, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); diff --git a/include/linux/swap.h b/include/linux/swap.h index cee108cbe2d5..0428e4c84e1d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -308,6 +308,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); +extern void deactivate_page(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 76d1e38aabe1..bb51becf23f8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd); asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS -asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, +asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long, int __user *); #else #ifdef CONFIG_CLONE_BACKWARDS3 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, - int __user *, int); + int __user *, unsigned long); #else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, - int __user *, int); + int __user *, unsigned long); #endif #endif diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index 0ee05da38899..03835522dfcb 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -109,12 +109,12 @@ static inline bool gid_lte(kgid_t left, kgid_t right) static inline bool uid_valid(kuid_t uid) { - return 
!uid_eq(uid, INVALID_UID); + return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { - return !gid_eq(gid, INVALID_GID); + return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9246d32dc973..2b1cef88b827 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, + PGLAZYFREED, FOR_ALL_ZONES(PGREFILL), FOR_ALL_ZONES(PGSTEAL_KSWAPD), FOR_ALL_ZONES(PGSTEAL_DIRECT), diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 79abb9c71772..1443d79e4fe6 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -11,6 +11,7 @@ #include <linux/pci.h> #include <linux/aer.h> #include <linux/cper.h> +#include <linux/mm.h> /* * MCE Extended Error Log trace event @@ -232,6 +233,90 @@ TRACE_EVENT(aer_event, __print_flags(__entry->status, "|", aer_uncorrectable_errors)) ); +/* + * memory-failure recovery action result event + * + * unsigned long pfn - Page Frame Number of the corrupted page + * int type - Page types of the corrupted page + * int result - Result of recovery action + */ + +#ifdef CONFIG_MEMORY_FAILURE +#define MF_ACTION_RESULT \ + EM ( MF_IGNORED, "Ignored" ) \ + EM ( MF_FAILED, "Failed" ) \ + EM ( MF_DELAYED, "Delayed" ) \ + EMe ( MF_RECOVERED, "Recovered" ) + +#define MF_PAGE_TYPE \ + EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ + EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ + EM ( MF_MSG_SLAB, "kernel slab page" ) \ + EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ + EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ + EM ( MF_MSG_HUGE, "huge page" ) \ + EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ + EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ + EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ + EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ + EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \ + EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \ + EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \ + EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \ + EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \ + EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ + EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ + EM ( MF_MSG_BUDDY, "free buddy page" ) \ + EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ + EMe ( MF_MSG_UNKNOWN, "unknown page" ) + +/* + * First define the enums in MM_ACTION_RESULT to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +MF_ACTION_RESULT +MF_PAGE_TYPE + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. 
+ */ +#undef EM +#undef EMe +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +TRACE_EVENT(memory_failure_event, + TP_PROTO(unsigned long pfn, + int type, + int result), + + TP_ARGS(pfn, type, result), + + TP_STRUCT__entry( + __field(unsigned long, pfn) + __field(int, type) + __field(int, result) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->type = type; + __entry->result = result; + ), + + TP_printk("pfn %#lx: recovery action for %s: %s", + __entry->pfn, + __print_symbolic(__entry->type, MF_PAGE_TYPE), + __print_symbolic(__entry->result, MF_ACTION_RESULT) + ) +); +#endif /* CONFIG_MEMORY_FAILURE */ #endif /* _TRACE_HW_EVENT_MC_H */ /* This part must be outside protection */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 81ea59812117..6cd975f5b225 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -140,19 +140,23 @@ DEFINE_EVENT(kmem_free, kfree, TP_ARGS(call_site, ptr) ); -DEFINE_EVENT(kmem_free, kmem_cache_free, +DEFINE_EVENT_CONDITION(kmem_free, kmem_cache_free, TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr) + TP_ARGS(call_site, ptr), + + TP_CONDITION(cpu_online(smp_processor_id())) ); -TRACE_EVENT(mm_page_free, +TRACE_EVENT_CONDITION(mm_page_free, TP_PROTO(struct page *page, unsigned int order), TP_ARGS(page, order), + TP_CONDITION(cpu_online(smp_processor_id())), + TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) @@ -253,12 +257,26 @@ DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked, TP_ARGS(page, order, migratetype) ); -DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain, +TRACE_EVENT_CONDITION(mm_page_pcpu_drain, TP_PROTO(struct page *page, unsigned int order, int migratetype), TP_ARGS(page, order, migratetype), + TP_CONDITION(cpu_online(smp_processor_id())), + + TP_STRUCT__entry( + __field( unsigned long, pfn ) + __field( unsigned int, order ) + __field( int, migratetype ) + ), + + TP_fast_assign( + __entry->pfn = page ? 
page_to_pfn(page) : -1UL; + __entry->order = order; + __entry->migratetype = migratetype; + ), + TP_printk("page=%p pfn=%lu order=%d migratetype=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order, __entry->migratetype) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index ddc3b36f1046..7a94102b7a02 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -34,6 +34,7 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ +#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_REMOVE 9 /* remove these pages & resources */ diff --git a/init/main.c b/init/main.c index 2115055faeac..d3a512a087e5 100644 --- a/init/main.c +++ b/init/main.c @@ -998,6 +998,8 @@ static noinline void __init kernel_init_freeable(void) smp_init(); sched_init_smp(); + page_alloc_init_late(); + do_basic_setup(); /* Open the /dev/console on the rootfs, this should never fail */ diff --git a/ipc/msg.c b/ipc/msg.c index 2b6fdbb9e0e9..bacfe6137580 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -37,6 +37,7 @@ #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> +#include <linux/freezer.h> #include <asm/current.h> #include <linux/uaccess.h> @@ -673,7 +674,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); - schedule(); + freezable_schedule(); rcu_read_lock(); ipc_lock_object(&msq->q_perm); @@ -915,7 +916,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); - schedule(); + freezable_schedule(); /* Lockless receive, part 1: * Disable preemption. 
We don't hold a reference to the queue diff --git a/kernel/exit.c b/kernel/exit.c index 22fcc05dec40..8a87bb43dbd0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -711,10 +711,10 @@ void do_exit(long code) current->comm, task_pid_nr(current), preempt_count()); - acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index 2e670864174f..0e0ae9a7427e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1238,7 +1238,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, - int trace) + int trace, + unsigned long tls) { int retval; struct task_struct *p; @@ -1444,7 +1445,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p); + retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); if (retval) goto bad_fork_cleanup_io; @@ -1656,7 +1657,7 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1671,11 +1672,12 @@ struct task_struct *fork_idle(int cpu) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long do_fork(unsigned long clone_flags, +long _do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr) + int __user *child_tidptr, + unsigned long tls) { struct task_struct *p; int trace = 0; @@ -1700,7 +1702,7 @@ long do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace); + child_tidptr, NULL, trace, tls); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1741,20 +1743,34 @@ long do_fork(unsigned long clone_flags, return nr; } +#ifndef CONFIG_HAVE_COPY_THREAD_TLS +/* For compatibility with architectures that call do_fork directly rather than + * using the syscall entry points below. */ +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + return _do_fork(clone_flags, stack_start, stack_size, + parent_tidptr, child_tidptr, 0); +} +#endif + /* * Create a kernel thread. 
*/ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, - (unsigned long)arg, NULL, NULL); + return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, + (unsigned long)arg, NULL, NULL, 0); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return do_fork(SIGCHLD, 0, 0, NULL, NULL); + return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); #else /* can not support in nommu mode */ return -EINVAL; @@ -1765,8 +1781,8 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, - 0, NULL, NULL); + return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + 0, NULL, NULL, 0); } #endif @@ -1774,27 +1790,27 @@ SYSCALL_DEFINE0(vfork) #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, - int, tls_val, + unsigned long, tls, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #endif { - return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); + return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); } #endif diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c697f73d82d6..7c434c39f02a 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu) mutex_lock(&smpboot_threads_lock); list_for_each_entry(cur, &hotplug_threads, list) - smpboot_unpark_thread(cur, cpu); + if (cpumask_test_cpu(cpu, cur->cpumask)) + smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); } @@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) { unsigned int cpu; + /* Unpark any threads that were voluntarily parked. 
*/ + for_each_cpu_not(cpu, ht->cpumask) { + if (cpu_online(cpu)) { + struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + if (tsk) + kthread_unpark(tsk); + } + } + /* We need to destroy also the parked threads of offline cpus */ for_each_possible_cpu(cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); @@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; + if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) + return -ENOMEM; + cpumask_copy(plug_thread->cpumask, cpu_possible_mask); + get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { @@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); + free_cpumask_var(plug_thread->cpumask); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); +/** + * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked + * @plug_thread: Hotplug thread descriptor + * @new: Revised mask to use + * + * The cpumask field in the smp_hotplug_thread must not be updated directly + * by the client, but only by calling this function. + * This function can only be called on a registered smp_hotplug_thread. + */ +int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) +{ + struct cpumask *old = plug_thread->cpumask; + cpumask_var_t tmp; + unsigned int cpu; + + if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) + return -ENOMEM; + + get_online_cpus(); + mutex_lock(&smpboot_threads_lock); + + /* Park threads that were exclusively enabled on the old mask. */ + cpumask_andnot(tmp, old, new); + for_each_cpu_and(cpu, tmp, cpu_online_mask) + smpboot_park_thread(plug_thread, cpu); + + /* Unpark threads that are exclusively enabled on the new mask. */ + cpumask_andnot(tmp, new, old); + for_each_cpu_and(cpu, tmp, cpu_online_mask) + smpboot_unpark_thread(plug_thread, cpu); + + cpumask_copy(old, new); + + mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); + + free_cpumask_var(tmp); + + return 0; +} +EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); + static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2082b1a88fb9..699571a74e3b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -881,6 +881,13 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, + { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 2316f50b07a4..93ef2bad457c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -19,6 +19,7 @@ #include <linux/sysctl.h> #include <linux/smpboot.h> #include <linux/sched/rt.h> +#include <linux/tick.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -56,6 +57,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace; #else #define sysctl_softlockup_all_cpu_backtrace 0 #endif +static struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); + +/* Helper for online, unparked cpus. 
*/ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -205,7 +212,7 @@ void touch_all_softlockup_watchdogs(void) * do we care if a 0 races with a timestamp? * all it means is the softlock check starts one cycle later */ - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; } @@ -608,11 +615,11 @@ void watchdog_nmi_enable_all(void) { int cpu; - if (!watchdog_user_enabled) + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) return; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) watchdog_nmi_enable(cpu); put_online_cpus(); } @@ -625,7 +632,7 @@ void watchdog_nmi_disable_all(void) return; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) watchdog_nmi_disable(cpu); put_online_cpus(); } @@ -684,7 +691,7 @@ static void update_watchdog_all_cpus(void) int cpu; get_online_cpus(); - for_each_online_cpu(cpu) + for_each_watchdog_cpu(cpu) update_watchdog(cpu); put_online_cpus(); } @@ -697,8 +704,12 @@ static int watchdog_enable_all_cpus(void) err = smpboot_register_percpu_thread(&watchdog_threads); if (err) pr_err("Failed to create watchdog threads, disabled\n"); - else + else { + if (smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask)) + pr_err("Failed to set cpumask for watchdog threads\n"); watchdog_running = 1; + } } else { /* * Enable/disable the lockup detectors or @@ -869,12 +880,58 @@ out: mutex_unlock(&watchdog_proc_mutex); return err; } + +/* + * The cpumask is the mask of possible cpus that the watchdog can run + * on, not the mask of cpus it is actually running on. This allows the + * user to specify a mask that will include cpus that have not yet + * been brought online, if desired. + */ +int proc_watchdog_cpumask(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + mutex_lock(&watchdog_proc_mutex); + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); + if (!err && write) { + /* Remove impossible cpus to keep sysctl output cleaner. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, + cpu_possible_mask); + + if (watchdog_running) { + /* + * Failure would be due to being unable to allocate + * a temporary cpumask, so we are likely not in a + * position to do much else to make things better. + */ + if (smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask) != 0) + pr_err("cpumask update failed\n"); + } + } + mutex_unlock(&watchdog_proc_mutex); + return err; +} + #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { set_sample_period(); +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) { + if (!cpumask_empty(tick_nohz_full_mask)) + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, + tick_nohz_full_mask); + } else + cpumask_copy(&watchdog_cpumask, cpu_possible_mask); +#else + cpumask_copy(&watchdog_cpumask, cpu_possible_mask); +#endif + if (watchdog_enabled) watchdog_enable_all_cpus(); } diff --git a/lib/Kconfig b/lib/Kconfig index 34e332b8d326..e9ac40c1763a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -188,6 +188,13 @@ config CRC8 when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8. +config CRC64_ECMA + tristate "CRC64 ECMA function" + help + This option provides CRC64 ECMA function. 
Drivers may select this + when they need to do cyclic redundancy check according to the CRC64 + ECMA algorithm. + config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH diff --git a/lib/Makefile b/lib/Makefile index ff37c8c2f7b2..81ea9822b01b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_CRC32) += crc32.o obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o obj-$(CONFIG_842_COMPRESS) += 842/ diff --git a/lib/bitmap.c b/lib/bitmap.c index 64c0926f5dd8..a578a0189199 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -462,19 +462,20 @@ EXPORT_SYMBOL(bitmap_parse_user); * Output format is a comma-separated list of decimal numbers and * ranges if list is specified or hex digits grouped into comma-separated * sets of 8 digits/set. Returns the number of characters written to buf. + * + * It is assumed that @buf is a pointer into a PAGE_SIZE area and that + * sufficient storage remains at @buf to accommodate the + * bitmap_print_to_pagebuf() output. */ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits) { - ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf - 2; + ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; int n = 0; - if (len > 1) { - n = list ? scnprintf(buf, len, "%*pbl", nmaskbits, maskp) : - scnprintf(buf, len, "%*pb", nmaskbits, maskp); - buf[n++] = '\n'; - buf[n] = '\0'; - } + if (len > 1) + n = list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : + scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); return n; } EXPORT_SYMBOL(bitmap_print_to_pagebuf); @@ -506,12 +507,12 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, unsigned a, b; int c, old_c, totaldigits; const char __user __force *ubuf = (const char __user __force *)buf; - int exp_digit, in_range; + int at_start, in_range; totaldigits = c = 0; bitmap_zero(maskp, nmaskbits); do { - exp_digit = 1; + at_start = 1; in_range = 0; a = b = 0; @@ -540,11 +541,10 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, break; if (c == '-') { - if (exp_digit || in_range) + if (at_start || in_range) return -EINVAL; b = 0; in_range = 1; - exp_digit = 1; continue; } @@ -554,16 +554,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, b = b * 10 + (c - '0'); if (!in_range) a = b; - exp_digit = 0; + at_start = 0; totaldigits++; } if (!(a <= b)) return -EINVAL; if (b >= nmaskbits) return -ERANGE; - while (a <= b) { - set_bit(a, maskp); - a++; + if (!at_start) { + while (a <= b) { + set_bit(a, maskp); + a++; + } } } while (buflen && c == ','); return 0; diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c new file mode 100644 index 000000000000..41629ea5a60c --- /dev/null +++ b/lib/crc64_ecma.c @@ -0,0 +1,341 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <linux/crc64_ecma.h> + + +#define CRC64_BYTE_MASK 0xFF +#define CRC64_TABLE_SIZE 256 + + +struct crc64_table { + u64 seed; + u64 table[CRC64_TABLE_SIZE]; +}; + + +static struct crc64_table CRC64_ECMA_182 = { + CRC64_DEFAULT_INITVAL, + { + 0x0000000000000000ULL, + 0xb32e4cbe03a75f6fULL, + 0xf4843657a840a05bULL, + 0x47aa7ae9abe7ff34ULL, + 0x7bd0c384ff8f5e33ULL, + 0xc8fe8f3afc28015cULL, + 0x8f54f5d357cffe68ULL, + 0x3c7ab96d5468a107ULL, + 0xf7a18709ff1ebc66ULL, + 0x448fcbb7fcb9e309ULL, + 0x0325b15e575e1c3dULL, + 0xb00bfde054f94352ULL, + 0x8c71448d0091e255ULL, + 0x3f5f08330336bd3aULL, + 0x78f572daa8d1420eULL, + 0xcbdb3e64ab761d61ULL, + 0x7d9ba13851336649ULL, + 0xceb5ed8652943926ULL, + 0x891f976ff973c612ULL, + 0x3a31dbd1fad4997dULL, + 0x064b62bcaebc387aULL, + 0xb5652e02ad1b6715ULL, + 0xf2cf54eb06fc9821ULL, + 0x41e11855055bc74eULL, + 0x8a3a2631ae2dda2fULL, + 0x39146a8fad8a8540ULL, + 0x7ebe1066066d7a74ULL, + 0xcd905cd805ca251bULL, + 0xf1eae5b551a2841cULL, + 0x42c4a90b5205db73ULL, + 0x056ed3e2f9e22447ULL, + 0xb6409f5cfa457b28ULL, + 0xfb374270a266cc92ULL, + 0x48190ecea1c193fdULL, + 0x0fb374270a266cc9ULL, + 0xbc9d3899098133a6ULL, + 0x80e781f45de992a1ULL, + 0x33c9cd4a5e4ecdceULL, + 0x7463b7a3f5a932faULL, + 0xc74dfb1df60e6d95ULL, + 0x0c96c5795d7870f4ULL, + 0xbfb889c75edf2f9bULL, + 0xf812f32ef538d0afULL, + 0x4b3cbf90f69f8fc0ULL, + 0x774606fda2f72ec7ULL, + 0xc4684a43a15071a8ULL, + 0x83c230aa0ab78e9cULL, + 0x30ec7c140910d1f3ULL, + 0x86ace348f355aadbULL, + 0x3582aff6f0f2f5b4ULL, + 0x7228d51f5b150a80ULL, + 0xc10699a158b255efULL, + 0xfd7c20cc0cdaf4e8ULL, + 0x4e526c720f7dab87ULL, + 0x09f8169ba49a54b3ULL, + 0xbad65a25a73d0bdcULL, + 0x710d64410c4b16bdULL, + 0xc22328ff0fec49d2ULL, + 0x85895216a40bb6e6ULL, + 0x36a71ea8a7ace989ULL, + 0x0adda7c5f3c4488eULL, + 0xb9f3eb7bf06317e1ULL, + 0xfe5991925b84e8d5ULL, + 0x4d77dd2c5823b7baULL, + 0x64b62bcaebc387a1ULL, + 0xd7986774e864d8ceULL, + 0x90321d9d438327faULL, + 0x231c512340247895ULL, + 0x1f66e84e144cd992ULL, + 0xac48a4f017eb86fdULL, + 0xebe2de19bc0c79c9ULL, + 0x58cc92a7bfab26a6ULL, + 0x9317acc314dd3bc7ULL, + 0x2039e07d177a64a8ULL, + 0x67939a94bc9d9b9cULL, + 0xd4bdd62abf3ac4f3ULL, + 0xe8c76f47eb5265f4ULL, + 0x5be923f9e8f53a9bULL, + 0x1c4359104312c5afULL, + 0xaf6d15ae40b59ac0ULL, + 0x192d8af2baf0e1e8ULL, + 
0xaa03c64cb957be87ULL, + 0xeda9bca512b041b3ULL, + 0x5e87f01b11171edcULL, + 0x62fd4976457fbfdbULL, + 0xd1d305c846d8e0b4ULL, + 0x96797f21ed3f1f80ULL, + 0x2557339fee9840efULL, + 0xee8c0dfb45ee5d8eULL, + 0x5da24145464902e1ULL, + 0x1a083bacedaefdd5ULL, + 0xa9267712ee09a2baULL, + 0x955cce7fba6103bdULL, + 0x267282c1b9c65cd2ULL, + 0x61d8f8281221a3e6ULL, + 0xd2f6b4961186fc89ULL, + 0x9f8169ba49a54b33ULL, + 0x2caf25044a02145cULL, + 0x6b055fede1e5eb68ULL, + 0xd82b1353e242b407ULL, + 0xe451aa3eb62a1500ULL, + 0x577fe680b58d4a6fULL, + 0x10d59c691e6ab55bULL, + 0xa3fbd0d71dcdea34ULL, + 0x6820eeb3b6bbf755ULL, + 0xdb0ea20db51ca83aULL, + 0x9ca4d8e41efb570eULL, + 0x2f8a945a1d5c0861ULL, + 0x13f02d374934a966ULL, + 0xa0de61894a93f609ULL, + 0xe7741b60e174093dULL, + 0x545a57dee2d35652ULL, + 0xe21ac88218962d7aULL, + 0x5134843c1b317215ULL, + 0x169efed5b0d68d21ULL, + 0xa5b0b26bb371d24eULL, + 0x99ca0b06e7197349ULL, + 0x2ae447b8e4be2c26ULL, + 0x6d4e3d514f59d312ULL, + 0xde6071ef4cfe8c7dULL, + 0x15bb4f8be788911cULL, + 0xa6950335e42fce73ULL, + 0xe13f79dc4fc83147ULL, + 0x521135624c6f6e28ULL, + 0x6e6b8c0f1807cf2fULL, + 0xdd45c0b11ba09040ULL, + 0x9aefba58b0476f74ULL, + 0x29c1f6e6b3e0301bULL, + 0xc96c5795d7870f42ULL, + 0x7a421b2bd420502dULL, + 0x3de861c27fc7af19ULL, + 0x8ec62d7c7c60f076ULL, + 0xb2bc941128085171ULL, + 0x0192d8af2baf0e1eULL, + 0x4638a2468048f12aULL, + 0xf516eef883efae45ULL, + 0x3ecdd09c2899b324ULL, + 0x8de39c222b3eec4bULL, + 0xca49e6cb80d9137fULL, + 0x7967aa75837e4c10ULL, + 0x451d1318d716ed17ULL, + 0xf6335fa6d4b1b278ULL, + 0xb199254f7f564d4cULL, + 0x02b769f17cf11223ULL, + 0xb4f7f6ad86b4690bULL, + 0x07d9ba1385133664ULL, + 0x4073c0fa2ef4c950ULL, + 0xf35d8c442d53963fULL, + 0xcf273529793b3738ULL, + 0x7c0979977a9c6857ULL, + 0x3ba3037ed17b9763ULL, + 0x888d4fc0d2dcc80cULL, + 0x435671a479aad56dULL, + 0xf0783d1a7a0d8a02ULL, + 0xb7d247f3d1ea7536ULL, + 0x04fc0b4dd24d2a59ULL, + 0x3886b22086258b5eULL, + 0x8ba8fe9e8582d431ULL, + 0xcc0284772e652b05ULL, + 0x7f2cc8c92dc2746aULL, + 0x325b15e575e1c3d0ULL, + 0x8175595b76469cbfULL, + 0xc6df23b2dda1638bULL, + 0x75f16f0cde063ce4ULL, + 0x498bd6618a6e9de3ULL, + 0xfaa59adf89c9c28cULL, + 0xbd0fe036222e3db8ULL, + 0x0e21ac88218962d7ULL, + 0xc5fa92ec8aff7fb6ULL, + 0x76d4de52895820d9ULL, + 0x317ea4bb22bfdfedULL, + 0x8250e80521188082ULL, + 0xbe2a516875702185ULL, + 0x0d041dd676d77eeaULL, + 0x4aae673fdd3081deULL, + 0xf9802b81de97deb1ULL, + 0x4fc0b4dd24d2a599ULL, + 0xfceef8632775faf6ULL, + 0xbb44828a8c9205c2ULL, + 0x086ace348f355aadULL, + 0x34107759db5dfbaaULL, + 0x873e3be7d8faa4c5ULL, + 0xc094410e731d5bf1ULL, + 0x73ba0db070ba049eULL, + 0xb86133d4dbcc19ffULL, + 0x0b4f7f6ad86b4690ULL, + 0x4ce50583738cb9a4ULL, + 0xffcb493d702be6cbULL, + 0xc3b1f050244347ccULL, + 0x709fbcee27e418a3ULL, + 0x3735c6078c03e797ULL, + 0x841b8ab98fa4b8f8ULL, + 0xadda7c5f3c4488e3ULL, + 0x1ef430e13fe3d78cULL, + 0x595e4a08940428b8ULL, + 0xea7006b697a377d7ULL, + 0xd60abfdbc3cbd6d0ULL, + 0x6524f365c06c89bfULL, + 0x228e898c6b8b768bULL, + 0x91a0c532682c29e4ULL, + 0x5a7bfb56c35a3485ULL, + 0xe955b7e8c0fd6beaULL, + 0xaeffcd016b1a94deULL, + 0x1dd181bf68bdcbb1ULL, + 0x21ab38d23cd56ab6ULL, + 0x9285746c3f7235d9ULL, + 0xd52f0e859495caedULL, + 0x6601423b97329582ULL, + 0xd041dd676d77eeaaULL, + 0x636f91d96ed0b1c5ULL, + 0x24c5eb30c5374ef1ULL, + 0x97eba78ec690119eULL, + 0xab911ee392f8b099ULL, + 0x18bf525d915feff6ULL, + 0x5f1528b43ab810c2ULL, + 0xec3b640a391f4fadULL, + 0x27e05a6e926952ccULL, + 0x94ce16d091ce0da3ULL, + 0xd3646c393a29f297ULL, + 0x604a2087398eadf8ULL, + 0x5c3099ea6de60cffULL, + 0xef1ed5546e415390ULL, + 0xa8b4afbdc5a6aca4ULL, + 
0x1b9ae303c601f3cbULL, + 0x56ed3e2f9e224471ULL, + 0xe5c372919d851b1eULL, + 0xa26908783662e42aULL, + 0x114744c635c5bb45ULL, + 0x2d3dfdab61ad1a42ULL, + 0x9e13b115620a452dULL, + 0xd9b9cbfcc9edba19ULL, + 0x6a978742ca4ae576ULL, + 0xa14cb926613cf817ULL, + 0x1262f598629ba778ULL, + 0x55c88f71c97c584cULL, + 0xe6e6c3cfcadb0723ULL, + 0xda9c7aa29eb3a624ULL, + 0x69b2361c9d14f94bULL, + 0x2e184cf536f3067fULL, + 0x9d36004b35545910ULL, + 0x2b769f17cf112238ULL, + 0x9858d3a9ccb67d57ULL, + 0xdff2a94067518263ULL, + 0x6cdce5fe64f6dd0cULL, + 0x50a65c93309e7c0bULL, + 0xe388102d33392364ULL, + 0xa4226ac498dedc50ULL, + 0x170c267a9b79833fULL, + 0xdcd7181e300f9e5eULL, + 0x6ff954a033a8c131ULL, + 0x28532e49984f3e05ULL, + 0x9b7d62f79be8616aULL, + 0xa707db9acf80c06dULL, + 0x14299724cc279f02ULL, + 0x5383edcd67c06036ULL, + 0xe0ada17364673f59ULL + } +}; + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void) +{ + return CRC64_ECMA_182.seed; +} +EXPORT_SYMBOL(crc64_ecma_seed); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * pdata: pointer to the data to compute checksum for. + * nbytes: number of bytes in data buffer. + * seed: CRC seed. + */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed) +{ + unsigned int i; + u64 crc = seed; + + for (i = 0; i < nbytes; i++) + crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^ + (crc >> 8); + + return crc; +} +EXPORT_SYMBOL(crc64_ecma); + +MODULE_DESCRIPTION("CRC64 ECMA function"); +MODULE_AUTHOR("Freescale Semiconductor Inc."); +MODULE_LICENSE("GPL"); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3d2aa27b845b..2cbd6ef17a40 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -65,7 +65,8 @@ static struct kmem_cache *radix_tree_node_cachep; */ struct radix_tree_preload { int nr; - struct radix_tree_node *nodes[RADIX_TREE_PRELOAD_SIZE]; + /* nodes->private_data points to next preallocated node */ + struct radix_tree_node *nodes; }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; @@ -197,8 +198,9 @@ radix_tree_node_alloc(struct radix_tree_root *root) */ rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { - ret = rtp->nodes[rtp->nr - 1]; - rtp->nodes[rtp->nr - 1] = NULL; + ret = rtp->nodes; + rtp->nodes = ret->private_data; + ret->private_data = NULL; rtp->nr--; } /* @@ -257,17 +259,20 @@ static int __radix_tree_preload(gfp_t gfp_mask) preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { + while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { preempt_enable(); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - if (rtp->nr < ARRAY_SIZE(rtp->nodes)) - rtp->nodes[rtp->nr++] = node; - else + if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + node->private_data = rtp->nodes; + rtp->nodes = node; + rtp->nr++; + } else { kmem_cache_free(radix_tree_node_cachep, node); + } } ret = 0; out: @@ -1463,15 +1468,16 @@ static int radix_tree_callback(struct notifier_block *nfb, { int cpu = (long)hcpu; struct radix_tree_preload *rtp; + struct radix_tree_node *node; /* Free per-cpu pool of perloaded nodes */ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { - kmem_cache_free(radix_tree_node_cachep, - rtp->nodes[rtp->nr-1]); - rtp->nodes[rtp->nr-1] = NULL; - rtp->nr--; + node = rtp->nodes; + rtp->nodes = node->private_data; + kmem_cache_free(radix_tree_node_cachep, node); + rtp->nr--; } } return 
NOTIFY_OK; diff --git a/lib/scatterlist.c b/lib/scatterlist.c index c9f2e8c6ccc9..e2374ec4e6a3 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -669,9 +669,9 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, * **/ size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen) + const void *buf, size_t buflen) { - return sg_copy_buffer(sgl, nents, buf, buflen, 0, false); + return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false); } EXPORT_SYMBOL(sg_copy_from_buffer); @@ -697,16 +697,16 @@ EXPORT_SYMBOL(sg_copy_to_buffer); * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from - * @skip: Number of bytes to skip before copying * @buflen: The number of bytes to copy + * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * **/ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen, off_t skip) + const void *buf, size_t buflen, off_t skip) { - return sg_copy_buffer(sgl, nents, buf, buflen, skip, false); + return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false); } EXPORT_SYMBOL(sg_pcopy_from_buffer); @@ -715,8 +715,8 @@ EXPORT_SYMBOL(sg_pcopy_from_buffer); * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy to - * @skip: Number of bytes to skip before copying * @buflen: The number of bytes to copy + * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * diff --git a/lib/sort.c b/lib/sort.c index 43c9fe73ae2e..9c6b229049ce 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -15,6 +15,13 @@ static void u32_swap(void *a, void *b, int size) *(u32 *)b = t; } +static void u64_swap(void *a, void *b, int size) +{ + u64 t = *(u64 *)a; + *(u64 *)a = *(u64 *)b; + *(u64 *)b = t; +} + static void generic_swap(void *a, void *b, int size) { char t; @@ -50,8 +57,32 @@ void sort(void *base, size_t num, size_t size, /* pre-scale counters for performance */ int i = (num/2 - 1) * size, n = num * size, c, r; - if (!swap_func) - swap_func = (size == 4 ? 
u32_swap : generic_swap); + if (!swap_func) { +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + switch (size) { + case 4: + swap_func = u32_swap; + break; + case 8: + swap_func = u64_swap; + break; + } +#else + switch (size) { + case 4: + if (((unsigned long)base & 3) == 0) + swap_func = u32_swap; + break; + case 8: + if (((unsigned long)base & 7) == 0) + swap_func = u64_swap; + break; + } +#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + + if (!swap_func) + swap_func = generic_swap; + } /* heapify */ for ( ; i >= 0; i -= size) { diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c index c227cc43ec0a..5241df36eedf 100644 --- a/lib/test-hexdump.c +++ b/lib/test-hexdump.c @@ -25,19 +25,19 @@ static const char * const test_data_1_le[] __initconst = { "4c", "d1", "19", "99", "43", "b1", "af", "0c", }; -static const char *test_data_2_le[] __initdata = { +static const char * const test_data_2_le[] __initconst = { "32be", "7bdb", "180a", "b293", "ba70", "24c4", "837d", "9b34", "9ca6", "ad31", "0f9c", "e9ac", "d14c", "9919", "b143", "0caf", }; -static const char *test_data_4_le[] __initdata = { +static const char * const test_data_4_le[] __initconst = { "7bdb32be", "b293180a", "24c4ba70", "9b34837d", "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", }; -static const char *test_data_8_le[] __initdata = { +static const char * const test_data_8_le[] __initconst = { "b293180a7bdb32be", "9b34837d24c4ba70", "e9ac0f9cad319ca6", "0cafb1439919d14c", }; diff --git a/lib/vsprintf.c b/lib/vsprintf.c index da39c608a28c..8243e2fb1e6b 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1360,6 +1360,21 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, } } +static noinline_for_stack +char *comm_name(char *buf, char *end, struct task_struct *tsk, + struct printf_spec spec, const char *fmt) +{ + char name[TASK_COMM_LEN]; + + /* Caller can pass NULL instead of current. */ + if (!tsk) + tsk = current; + /* Not using get_task_comm() in case I'm in IRQ context. */ + memcpy(name, tsk->comm, TASK_COMM_LEN); + name[sizeof(name) - 1] = '\0'; + return string(buf, end, name, spec); +} + int kptr_restrict __read_mostly; /* @@ -1447,6 +1462,7 @@ int kptr_restrict __read_mostly; * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'Cr' For a clock, it prints the current rate of the clock + * - 'T' task_struct->comm * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a @@ -1458,7 +1474,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, { int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0); - if (!ptr && *fmt != 'K') { + if (!ptr && *fmt != 'K' && *fmt != 'T') { /* * Print (null) with the same width as a pointer so it makes * tabular output look nice. @@ -1597,6 +1613,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return dentry_name(buf, end, ((const struct file *)ptr)->f_path.dentry, spec, fmt); + case 'T': + return comm_name(buf, end, ptr, spec, fmt); } spec.flags |= SMALL; if (spec.field_width == -1) { diff --git a/mm/Kconfig b/mm/Kconfig index 390214da4546..52ffb863383c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -635,3 +635,21 @@ config MAX_STACK_SIZE_MB changed to a smaller value in which case that is used. A sane initial value is 80 MB. 
+ +# For architectures that support deferred memory initialisation +config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + bool + +config DEFERRED_STRUCT_PAGE_INIT + bool "Defer initialisation of struct pages to kswapd" + default n + depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + depends on MEMORY_HOTPLUG + help + Ordinarily all struct pages are initialised during early boot in a + single thread. On very large machines this can take a considerable + amount of time. If this option is set, large machines will bring up + a subset of memmap at boot and then initialise the rest in parallel + when kswapd starts. This has a potential performance impact on + processes running early in the lifetime of the systemm until kswapd + finishes the initialisation. diff --git a/mm/bootmem.c b/mm/bootmem.c index 477be696511d..a23dd1934654 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -164,7 +164,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long *map, start, end, pages, count = 0; + unsigned long *map, start, end, pages, cur, count = 0; if (!bdata->node_bootmem_map) return 0; @@ -210,17 +210,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long cur = start; + cur = start; start = ALIGN(start + 1, BITS_PER_LONG); while (vec && cur != start) { if (vec & 1) { page = pfn_to_page(cur); - __free_pages_bootmem(page, 0); + __free_pages_bootmem(page, cur, 0); count++; } vec >>= 1; @@ -229,12 +229,13 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) } } + cur = bdata->node_min_pfn; page = virt_to_page(bdata->node_bootmem_map); pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; while (pages--) - __free_pages_bootmem(page++, 0); + __free_pages_bootmem(page++, cur++, 0); bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/compaction.c b/mm/compaction.c index 018f08da99a2..6ef2fdf1d6b6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -732,18 +732,18 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * splitting and collapsing (collapsing has already happened * if PageLRU is set) but the lock is not necessarily taken * here and it is wasteful to take it just to check transhuge. - * Check TransHuge without lock and skip the whole pageblock if - * it's either a transhuge or hugetlbfs page, as calling + * Check PageCompound without lock and skip the whole pageblock + * if it's either a transhuge or hugetlbfs page, as calling * compound_order() without preventing THP from splitting the * page underneath us may return surprising results. 
*/ - if (PageTransHuge(page)) { - if (!locked) - low_pfn = ALIGN(low_pfn + 1, - pageblock_nr_pages) - 1; + if (PageCompound(page)) { + int nr; + if (locked) + nr = 1 << compound_order(page); else - low_pfn += (1 << compound_order(page)) - 1; - + nr = pageblock_nr_pages; + low_pfn = ALIGN(low_pfn + 1, nr) - 1; continue; } @@ -763,11 +763,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!locked) break; - /* Recheck PageLRU and PageTransHuge under lock */ + /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) continue; - if (PageTransHuge(page)) { - low_pfn += (1 << compound_order(page)) - 1; + if (PageCompound(page)) { + int nr = 1 << compound_order(page); + low_pfn = ALIGN(low_pfn + 1, nr) - 1; continue; } } @@ -778,7 +779,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (__isolate_lru_page(page, isolate_mode) != 0) continue; - VM_BUG_ON_PAGE(PageTransCompound(page), page); + VM_BUG_ON_PAGE(PageCompound(page), page); /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); diff --git a/mm/filemap.c b/mm/filemap.c index 6bf5e42d560a..6ad0a8053b96 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -615,11 +615,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, void *shadow = NULL; int ret; - __set_page_locked(page); + __SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, &shadow); if (unlikely(ret)) - __clear_page_locked(page); + __ClearPageLocked(page); else { /* * The page might have been evicted from cache only @@ -742,6 +742,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); */ void unlock_page(struct page *page) { + page = compound_head(page); VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); smp_mb__after_atomic(); @@ -806,18 +807,20 @@ EXPORT_SYMBOL_GPL(page_endio); */ void __lock_page(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, + __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); int __lock_page_killable(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - return __wait_on_bit_lock(page_waitqueue(page), &wait, + return __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 078832cf3636..cb8904cf5fc6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1031,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -1174,7 +1174,7 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); @@ -1384,6 +1384,36 @@ out: 
return 0; } +int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr) + +{ + spinlock_t *ptl; + struct mm_struct *mm = tlb->mm; + int ret = 1; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + struct page *page; + pmd_t orig_pmd; + + orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd); + + /* No hugepage in swapcache */ + page = pmd_page(orig_pmd); + VM_BUG_ON_PAGE(PageSwapCache(page), page); + + orig_pmd = pmd_mkold(orig_pmd); + orig_pmd = pmd_mkclean(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + spin_unlock(ptl); + ret = 0; + } + + return ret; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1396,12 +1426,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; /* * For architectures like ppc64 we look at deposited pgtable - * when calling pmdp_get_and_clear. So do the + * when calling pmdp_huge_get_and_clear. So do the * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ - orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { @@ -1459,7 +1489,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); - pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); + pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl)) { @@ -1505,7 +1535,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_get_and_clear_notify(mm, addr, pmd); + entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mkwrite(entry); @@ -1599,6 +1629,11 @@ unlock: return NULL; } +int pmd_freeable(pmd_t pmd) +{ + return !pmd_dirty(pmd); +} + static int __split_huge_page_splitting(struct page *page, struct vm_area_struct *vma, unsigned long address) @@ -1710,7 +1745,7 @@ static void __split_huge_page_refcount(struct page *page, */ page_tail->_mapcount = page->_mapcount; - BUG_ON(page_tail->mapping); + BUG_ON(page_tail->mapping != TAIL_MAPPING); page_tail->mapping = page->mapping; page_tail->index = page->index + i; @@ -2499,7 +2534,7 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. 
*/ - _pmd = pmdp_clear_flush(vma, address, pmd); + _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -2799,7 +2834,7 @@ static void khugepaged_do_scan(void) cond_resched(); - if (unlikely(kthread_should_stop() || freezing(current))) + if (unlikely(kthread_should_stop() || try_to_freeze())) break; spin_lock(&khugepaged_mm_lock); @@ -2820,8 +2855,6 @@ static void khugepaged_do_scan(void) static void khugepaged_wait_work(void) { - try_to_freeze(); - if (khugepaged_has_work()) { if (!khugepaged_scan_sleep_millisecs) return; @@ -2865,7 +2898,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 271e4432734c..54f129dc37f6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1188,7 +1188,7 @@ static void dissolve_free_huge_page(struct page *page) */ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned int order = 8 * sizeof(void *); + unsigned int order = UINT_MAX; unsigned long pfn; struct hstate *h; @@ -1200,6 +1200,7 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) if (order > huge_page_order(h)) order = huge_page_order(h); VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); + VM_BUG_ON(order == UINT_MAX); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) dissolve_free_huge_page(pfn_to_page(pfn)); } @@ -3789,6 +3790,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { return NULL; } + +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + return 0; +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 4ca5fe0042e1..bf73ac17dad4 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val) /* * This implies unable to support free buddy pages. 
*/ - if (!get_page_unless_zero(hpage)) + if (!get_hwpoison_page(p)) return 0; if (!hwpoison_filter_enable) @@ -58,7 +58,7 @@ inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); put_out: - put_page(hpage); + put_page(p); return 0; } diff --git a/mm/internal.h b/mm/internal.h index a25e359a4039..36b23f1e2ca6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -155,7 +155,8 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); @@ -361,10 +362,7 @@ do { \ } while (0) extern void mminit_verify_pageflags_layout(void); -extern void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn); extern void mminit_verify_zonelist(void); - #else static inline void mminit_dprintk(enum mminit_level level, @@ -376,11 +374,6 @@ static inline void mminit_verify_pageflags_layout(void) { } -static inline void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn) -{ -} - static inline void mminit_verify_zonelist(void) { } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4986b0acab21..c242adf6bc85 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,7 +7,6 @@ #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) #define KASAN_FREE_PAGE 0xFF /* page was freed */ -#define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5405aff5a590..f0fe4f2c1fa7 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -115,7 +115,8 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ + __GFP_NOACCOUNT)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) @@ -1884,7 +1884,7 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); - __set_page_locked(new_page); + __SetPageLocked(new_page); } return new_page; diff --git a/mm/madvise.c b/mm/madvise.c index d551475517bf..22e8f0ca7040 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -19,6 +19,14 @@ #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> + +#include <asm/tlb.h> + +struct madvise_free_private { + struct vm_area_struct *vma; + struct mmu_gather *tlb; +}; /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -31,6 +39,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: return 0; default: /* be safe, default to 1. 
list exceptions explicitly */ @@ -254,6 +263,164 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct madvise_free_private *fp = walk->private; + struct mmu_gather *tlb = fp->tlb; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = fp->vma; + spinlock_t *ptl; + pte_t *pte, ptent; + struct page *page; + swp_entry_t entry; + unsigned long next; + int nr_swap = 0; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma, addr, pmd); + else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr)) + goto next; + /* fall through */ + } + + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + /* + * If the pte has swp_entry, just clear page table to + * prevent swap-in which is more expensive rather than + * (page allocation + zeroing). + */ + if (!pte_present(ptent)) { + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry)) + continue; + nr_swap--; + free_swap_and_cache(entry); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + continue; + } + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + if (PageSwapCache(page)) { + if (!trylock_page(page)) + continue; + + if (!try_to_free_swap(page)) { + unlock_page(page); + continue; + } + + ClearPageDirty(page); + unlock_page(page); + } + + /* + * Some of architecture(ex, PPC) don't update TLB + * with set_pte_at and tlb_remove_tlb_entry so for + * the portability, remap the pte with old|clean + * after pte clearing. 
+ */ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + ptent = pte_mkclean(ptent); + set_pte_at(mm, addr, pte, ptent); + if (PageActive(page)) + deactivate_page(page); + tlb_remove_tlb_entry(tlb, pte, addr); + } + + if (nr_swap) { + if (current->mm == mm) + sync_mm_rss(mm); + + add_mm_counter(mm, MM_SWAPENTS, nr_swap); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +next: + cond_resched(); + return 0; +} + +static void madvise_free_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct madvise_free_private fp = { + .vma = vma, + .tlb = tlb, + }; + + struct mm_walk free_walk = { + .pmd_entry = madvise_free_pte_range, + .mm = vma->vm_mm, + .private = &fp, + }; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + walk_page_range(addr, end, &free_walk); + tlb_end_vma(tlb, vma); +} + +static int madvise_free_single_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end; + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + /* MADV_FREE works for only anon vma at the moment */ + if (vma->vm_file) + return -EINVAL; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return -EINVAL; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start, end); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(mm, start, end); + madvise_free_page_range(&tlb, vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + + return 0; +} + +static long madvise_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + return madvise_free_single_vma(vma, start, end); +} + /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -377,6 +544,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_FREE: + /* + * XXX: In this implementation, MADV_FREE works like + * MADV_DONTNEED on swapless system or full swap. + */ + if (get_nr_swap_pages() > 0) + return madvise_free(vma, prev, start, end); + /* passthrough */ case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -396,6 +571,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/memblock.c b/mm/memblock.c index 9318b567ed79..0a988ed82886 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -779,6 +779,38 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) } /** + * __next_reserved_mem_region - next function for for_each_reserved_region() + * @idx: pointer to u64 loop variable + * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL + * + * Iterate over all reserved memory regions. 
+ */ +void __init_memblock __next_reserved_mem_region(u64 *idx, + phys_addr_t *out_start, + phys_addr_t *out_end) +{ + struct memblock_type *rsv = &memblock.reserved; + + if (*idx >= 0 && *idx < rsv->cnt) { + struct memblock_region *r = &rsv->regions[*idx]; + phys_addr_t base = r->base; + phys_addr_t size = r->size; + + if (out_start) + *out_start = base; + if (out_end) + *out_end = base + size - 1; + + *idx += 1; + return; + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; +} + +/** * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes @@ -1316,7 +1348,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 501820c815b3..8cbe23ac1056 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -20,6 +20,14 @@ * this code has to be extremely careful. Generally it tries to use * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. + * + * It can be very tempting to add handling for obscure cases here. + * In general any code for handling new cases should only be added iff: + * - You know how to test it. + * - You have a test that can be added to mce-test + * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ + * - The case actually shows up as a frequent (top 10) page state in + * tools/vm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back @@ -28,13 +36,6 @@ * are rare we hope to get away with this. This avoids impacting the core * VM. */ - -/* - * Notebook: - * - hugetlb needs more code - * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages - * - pass bad pages to kdump next kernel - */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -56,6 +57,7 @@ #include <linux/mm_inline.h> #include <linux/kfifo.h> #include "internal.h" +#include "ras/ras_event.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -503,68 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill, kfree(tk); } -/* - * Error handlers for various types of pages. 
- */ - -enum outcome { - IGNORED, /* Error: cannot be handled */ - FAILED, /* Error: handling failed */ - DELAYED, /* Will be handled later */ - RECOVERED, /* Successfully recovered */ -}; - static const char *action_name[] = { - [IGNORED] = "Ignored", - [FAILED] = "Failed", - [DELAYED] = "Delayed", - [RECOVERED] = "Recovered", -}; - -enum action_page_type { - MSG_KERNEL, - MSG_KERNEL_HIGH_ORDER, - MSG_SLAB, - MSG_DIFFERENT_COMPOUND, - MSG_POISONED_HUGE, - MSG_HUGE, - MSG_FREE_HUGE, - MSG_UNMAP_FAILED, - MSG_DIRTY_SWAPCACHE, - MSG_CLEAN_SWAPCACHE, - MSG_DIRTY_MLOCKED_LRU, - MSG_CLEAN_MLOCKED_LRU, - MSG_DIRTY_UNEVICTABLE_LRU, - MSG_CLEAN_UNEVICTABLE_LRU, - MSG_DIRTY_LRU, - MSG_CLEAN_LRU, - MSG_TRUNCATED_LRU, - MSG_BUDDY, - MSG_BUDDY_2ND, - MSG_UNKNOWN, + [MF_IGNORED] = "Ignored", + [MF_FAILED] = "Failed", + [MF_DELAYED] = "Delayed", + [MF_RECOVERED] = "Recovered", }; static const char * const action_page_types[] = { - [MSG_KERNEL] = "reserved kernel page", - [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", - [MSG_SLAB] = "kernel slab page", - [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MSG_POISONED_HUGE] = "huge page already hardware poisoned", - [MSG_HUGE] = "huge page", - [MSG_FREE_HUGE] = "free huge page", - [MSG_UNMAP_FAILED] = "unmapping failed page", - [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", - [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", - [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", - [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", - [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", - [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", - [MSG_DIRTY_LRU] = "dirty LRU page", - [MSG_CLEAN_LRU] = "clean LRU page", - [MSG_TRUNCATED_LRU] = "already truncated LRU page", - [MSG_BUDDY] = "free buddy page", - [MSG_BUDDY_2ND] = "free buddy page (2nd try)", - [MSG_UNKNOWN] = "unknown page", + [MF_MSG_KERNEL] = "reserved kernel page", + [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", + [MF_MSG_SLAB] = "kernel slab page", + [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", + [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", + [MF_MSG_HUGE] = "huge page", + [MF_MSG_FREE_HUGE] = "free huge page", + [MF_MSG_UNMAP_FAILED] = "unmapping failed page", + [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", + [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", + [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", + [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", + [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", + [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", + [MF_MSG_DIRTY_LRU] = "dirty LRU page", + [MF_MSG_CLEAN_LRU] = "clean LRU page", + [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", + [MF_MSG_BUDDY] = "free buddy page", + [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", + [MF_MSG_UNKNOWN] = "unknown page", }; /* @@ -598,7 +566,7 @@ static int delete_from_lru_cache(struct page *p) */ static int me_kernel(struct page *p, unsigned long pfn) { - return IGNORED; + return MF_IGNORED; } /* @@ -607,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); - return FAILED; + return MF_FAILED; } /* @@ -616,7 +584,7 @@ static int me_unknown(struct page *p, unsigned long pfn) static int me_pagecache_clean(struct page *p, unsigned long pfn) { int err; - int ret = FAILED; + int ret = MF_FAILED; struct address_space *mapping; 
delete_from_lru_cache(p); @@ -626,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * should be the one m_f() holds. */ if (PageAnon(p)) - return RECOVERED; + return MF_RECOVERED; /* * Now truncate the page in the page cache. This is really @@ -640,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return FAILED; + return MF_FAILED; } /* @@ -657,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) !try_to_release_page(p, GFP_NOIO)) { pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { - ret = RECOVERED; + ret = MF_RECOVERED; } } else { /* @@ -665,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * This fails on dirty or anything with private pages */ if (invalidate_inode_page(p)) - ret = RECOVERED; + ret = MF_RECOVERED; else printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", pfn); @@ -751,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) ClearPageUptodate(p); if (!delete_from_lru_cache(p)) - return DELAYED; + return MF_DELAYED; else - return FAILED; + return MF_FAILED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) @@ -761,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) delete_from_swap_cache(p); if (!delete_from_lru_cache(p)) - return RECOVERED; + return MF_RECOVERED; else - return FAILED; + return MF_FAILED; } /* @@ -776,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) { int res = 0; struct page *hpage = compound_head(p); + + if (!PageHuge(hpage)) + return MF_DELAYED; + /* * We can safely recover from error on free or reserved (i.e. * not in-use) hugepage by dequeuing it from freelist. @@ -789,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) if (!(page_mapping(hpage) || PageAnon(hpage))) { res = dequeue_hwpoisoned_huge_page(hpage); if (!res) - return RECOVERED; + return MF_RECOVERED; } - return DELAYED; + return MF_DELAYED; } /* @@ -823,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) static struct page_state { unsigned long mask; unsigned long res; - enum action_page_type type; + enum mf_action_page_type type; int (*action)(struct page *p, unsigned long pfn); } error_states[] = { - { reserved, reserved, MSG_KERNEL, me_kernel }, + { reserved, reserved, MF_MSG_KERNEL, me_kernel }, /* * free pages are specially detected outside this table: * PG_buddy pages only make a small fraction of all free pages. @@ -837,31 +809,31 @@ static struct page_state { * currently unused objects without touching them. But just * treat it as standard kernel for now. 
*/ - { slab, slab, MSG_SLAB, me_kernel }, + { slab, slab, MF_MSG_SLAB, me_kernel }, #ifdef CONFIG_PAGEFLAGS_EXTENDED - { head, head, MSG_HUGE, me_huge_page }, - { tail, tail, MSG_HUGE, me_huge_page }, + { head, head, MF_MSG_HUGE, me_huge_page }, + { tail, tail, MF_MSG_HUGE, me_huge_page }, #else - { compound, compound, MSG_HUGE, me_huge_page }, + { compound, compound, MF_MSG_HUGE, me_huge_page }, #endif - { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, - { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, + { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, + { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, - { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, - { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, + { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, + { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, - { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, - { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, + { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, + { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, - { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, - { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, + { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty }, + { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean }, /* * Catchall entry: must be at end. */ - { 0, 0, MSG_UNKNOWN, me_unknown }, + { 0, 0, MF_MSG_UNKNOWN, me_unknown }, }; #undef dirty @@ -881,8 +853,11 @@ static struct page_state { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). */ -static void action_result(unsigned long pfn, enum action_page_type type, int result) +static void action_result(unsigned long pfn, enum mf_action_page_type type, + enum mf_result result) { + trace_memory_failure_event(pfn, type, result); + pr_err("MCE %#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); } @@ -896,13 +871,13 @@ static int page_action(struct page_state *ps, struct page *p, result = ps->action(p, pfn); count = page_count(p) - 1; - if (ps->action == me_swapcache_dirty && result == DELAYED) + if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { printk(KERN_ERR "MCE %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); - result = FAILED; + result = MF_FAILED; } action_result(pfn, ps->type, result); @@ -911,10 +886,33 @@ static int page_action(struct page_state *ps, struct page *p, * Could adjust zone counters here to correct for the missing page. */ - return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; + return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } /* + * Get refcount for memory error handling: + * - @page: raw page + */ +int get_hwpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + + if (PageHuge(head)) + return get_page_unless_zero(head); + else if (PageTransHuge(head)) + if (get_page_unless_zero(head)) { + if (PageTail(page)) + get_page(page); + return 1; + } else { + return 0; + } + else + return get_page_unless_zero(page); +} +EXPORT_SYMBOL_GPL(get_hwpoison_page); + +/* * Do all that is necessary to remove user space mappings. 
Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ @@ -927,7 +925,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1, forcekill; struct page *hpage = *hpagep; - struct page *ppage; /* * Here we are interested only in user-mapped pages, so skip any @@ -977,59 +974,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* - * ppage: poisoned page - * if p is regular page(4k page) - * ppage == real poisoned page; - * else p is hugetlb or THP, ppage == head page. - */ - ppage = hpage; - - if (PageTransHuge(hpage)) { - /* - * Verify that this isn't a hugetlbfs head page, the check for - * PageAnon is just for avoid tripping a split_huge_page - * internal debug check, as split_huge_page refuses to deal with - * anything that isn't an anon page. PageAnon can't go away fro - * under us because we hold a refcount on the hpage, without a - * refcount on the hpage. split_huge_page can't be safely called - * in the first place, having a refcount on the tail isn't - * enough * to be safe. - */ - if (!PageHuge(hpage) && PageAnon(hpage)) { - if (unlikely(split_huge_page(hpage))) { - /* - * FIXME: if splitting THP is failed, it is - * better to stop the following operation rather - * than causing panic by unmapping. System might - * survive if the page is freed later. - */ - printk(KERN_INFO - "MCE %#lx: failed to split THP\n", pfn); - - BUG_ON(!PageHWPoison(p)); - return SWAP_FAIL; - } - /* - * We pinned the head page for hwpoison handling, - * now we split the thp and we are interested in - * the hwpoisoned raw page, so move the refcount - * to it. Similarly, page lock is shifted. - */ - if (hpage != p) { - if (!(flags & MF_COUNT_INCREASED)) { - put_page(hpage); - get_page(p); - } - lock_page(p); - unlock_page(hpage); - *hpagep = p; - } - /* THP is split, so ppage should be the real poisoned page. */ - ppage = p; - } - } - - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -1038,12 +982,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(ppage, ttu); + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(ppage)); + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1055,7 +999,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. 
*/ - forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); @@ -1101,6 +1045,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page_state *ps; struct page *p; struct page *hpage; + struct page *orig_head; int res; unsigned int nr_pages; unsigned long page_flags; @@ -1116,7 +1061,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); - hpage = compound_head(p); + orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; @@ -1149,10 +1094,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ - if (!(flags & MF_COUNT_INCREASED) && - !get_page_unless_zero(hpage)) { + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { - action_result(pfn, MSG_BUDDY, DELAYED); + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); return 0; } else if (PageHuge(hpage)) { /* @@ -1169,37 +1113,52 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); - action_result(pfn, MSG_FREE_HUGE, - res ? IGNORED : DELAYED); + action_result(pfn, MF_MSG_FREE_HUGE, + res ? MF_IGNORED : MF_DELAYED); unlock_page(hpage); return res; } else { - action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); + action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); + return -EBUSY; + } + } + + if (!PageHuge(p) && PageTransHuge(hpage)) { + if (!PageAnon(hpage)) { + pr_info("MCE: %#lx: non anonymous thp", pfn); + put_page(p); + return -EBUSY; + } + if (unlikely(split_huge_page(hpage))) { + pr_info("MCE: %#lx: thp split failed", pfn); + put_page(p); return -EBUSY; } + VM_BUG_ON_PAGE(!page_count(p), p); + hpage = compound_head(p); } /* * We ignore non-LRU pages for good reasons. * - PG_locked is only well defined for LRU pages and a few others - * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageLocked() * - to avoid races with __SetPageSlab*() (and more non-atomic ops) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ if (!PageHuge(p)) { - if (!PageLRU(hpage)) - shake_page(hpage, 0); - if (!PageLRU(hpage)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { /* * shake_page could have turned it free. */ if (is_free_buddy_page(p)) { if (flags & MF_COUNT_INCREASED) - action_result(pfn, MSG_BUDDY, DELAYED); + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); else - action_result(pfn, MSG_BUDDY_2ND, - DELAYED); + action_result(pfn, MF_MSG_BUDDY_2ND, + MF_DELAYED); return 0; } } @@ -1211,8 +1170,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * The page could have changed compound pages during the locking. * If this happens just bail out. 
*/ - if (compound_head(p) != hpage) { - action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); + if (PageCompound(p) && compound_head(p) != orig_head) { + action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; goto out; } @@ -1252,7 +1211,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * on the head page to show that the hugepage is hwpoisoned */ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { - action_result(pfn, MSG_POISONED_HUGE, IGNORED); + action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); unlock_page(hpage); put_page(hpage); return 0; @@ -1281,7 +1240,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) != SWAP_SUCCESS) { - action_result(pfn, MSG_UNMAP_FAILED, IGNORED); + action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; } @@ -1290,7 +1249,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * Torn down by someone else? */ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); + action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); res = -EBUSY; goto out; } @@ -1443,19 +1402,12 @@ int unpoison_memory(unsigned long pfn) return 0; } - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - pr_info("MCE: Memory failure is now running on %#lx\n", pfn); - return 0; - } - - nr_pages = 1 << compound_order(page); + if (PageHuge(page)) + nr_pages = 1 << compound_order(page); + else + nr_pages = 1; - if (!get_page_unless_zero(page)) { + if (!get_hwpoison_page(p)) { /* * Since HWPoisoned hugepage should have non-zero refcount, * race between memory failure and unpoison seems to happen. @@ -1479,7 +1431,7 @@ int unpoison_memory(unsigned long pfn) * the PG_hwpoison page will be caught and isolated on the entrance to * the free buddy page pool. */ - if (TestClearPageHWPoison(page)) { + if (TestClearPageHWPoison(p)) { pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); atomic_long_sub(nr_pages, &num_poisoned_pages); freeit = 1; @@ -1488,9 +1440,9 @@ int unpoison_memory(unsigned long pfn) } unlock_page(page); - put_page(page); + put_page(p); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) - put_page(page); + put_page(p); return 0; } @@ -1523,7 +1475,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) * When the target page is a free hugepage, just remove it * from free hugepage list. */ - if (!get_page_unless_zero(compound_head(p))) { + if (!get_hwpoison_page(p)) { if (PageHuge(p)) { pr_info("%s: %#lx free huge page\n", __func__, pfn); ret = 0; @@ -1694,20 +1646,7 @@ static int __soft_offline_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - /* - * After page migration succeeds, the source page can - * be trapped in pagevec and actual freeing is delayed. - * Freeing code works differently based on PG_hwpoison, - * so there's a race. We need to make sure that the - * source page should be freed back to buddy before - * setting PG_hwpoison. 
- */ - if (!is_free_buddy_page(page)) - drain_all_pages(page_zone(page)); SetPageHWPoison(page); - if (!is_free_buddy_page(page)) - pr_info("soft offline: %#lx: page leaked\n", - pfn); atomic_long_inc(&num_poisoned_pages); } } else { @@ -1759,14 +1698,6 @@ int soft_offline_page(struct page *page, int flags) get_online_mems(); - /* - * Isolate the page, so that it doesn't get reallocated if it - * was free. This flag should be kept set until the source page - * is freed and PG_hwpoison on it is set. - */ - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) - set_migratetype_isolate(page, true); - ret = get_any_page(page, pfn, flags); put_online_mems(); if (ret > 0) { /* for in-use pages */ @@ -1785,6 +1716,5 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } - unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 22e037e3364e..d1fa0c1fad0b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3067,7 +3067,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * pinned by vma->vm_file's reference. We rely on unlock_page()'s * release semantics to prevent the compiler from undoing this copying. */ - mapping = fault_page->mapping; + mapping = page_rmapping(fault_page); unlock_page(fault_page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 457bde530cbe..c6a8d95c5dc7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1332,7 +1332,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) } /* - * Confirm all pages in a range [start, end) is belongs to the same zone. + * Confirm all pages in a range [start, end) belong to the same zone. */ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { @@ -1343,10 +1343,11 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { - i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) - i++; + /* Find the first valid pfn in this pageblock */ + for (i = 0; i < MAX_ORDER_NR_PAGES; i++) { + if (pfn_valid(pfn + i)) + break; + } if (i == MAX_ORDER_NR_PAGES) continue; page = pfn_to_page(pfn + i); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ede26291d4aa..747743237d9f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2518,7 +2518,7 @@ static void __init check_numabalancing_enable(void) if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); - if (nr_node_ids > 1 && !numabalancing_override) { + if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. 
" "Configure with numa_balancing= or the " "kernel.numa_balancing sysctl", diff --git a/mm/migrate.c b/mm/migrate.c index f53838fe3dfe..236ee25e79d9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -918,7 +918,8 @@ out: static ICE_noinline int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, - int force, enum migrate_mode mode) + int force, enum migrate_mode mode, + enum migrate_reason reason) { int rc = 0; int *result = NULL; @@ -949,7 +950,8 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - putback_lru_page(page); + if (reason != MR_MEMORY_FAILURE) + putback_lru_page(page); } /* @@ -1122,7 +1124,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, pass > 2, mode); else rc = unmap_and_move(get_new_page, put_new_page, - private, page, pass > 2, mode); + private, page, pass > 2, mode, + reason); switch(rc) { case -ENOMEM: @@ -1746,7 +1749,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, flush_tlb_range(vma, mmun_start, mmun_end); /* Prepare a page as a migration target */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ @@ -1796,7 +1799,7 @@ fail_putback: */ flush_cache_range(vma, mmun_start, mmun_end); page_add_anon_rmap(new_page, vma, mmun_start); - pmdp_clear_flush_notify(vma, mmun_start, pmd); + pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); diff --git a/mm/mm_init.c b/mm/mm_init.c index 5f420f7fafa1..fdadf918de76 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -11,6 +11,7 @@ #include <linux/export.h> #include <linux/memory.h> #include <linux/notifier.h> +#include <linux/sched.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -130,14 +131,6 @@ void __init mminit_verify_pageflags_layout(void) BUG_ON(or_mask != add_mask); } -void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, - unsigned long nid, unsigned long pfn) -{ - BUG_ON(page_to_nid(page) != nid); - BUG_ON(page_zonenum(page) != zone); - BUG_ON(page_to_pfn(page) != pfn); -} - static __init int set_mminit_loglevel(char *str) { get_option(&str, &mminit_loglevel); diff --git a/mm/mmap.c b/mm/mmap.c index bb50cacc3ea5..9c1c5b9de59c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1280,7 +1280,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -ENOMEM; /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if ((pgoff + (len >> PAGE_SHIFT) - 1) < pgoff) return -EOVERFLOW; /* Too many mappings? */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 88584838e704..e7d6f1171ecb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,6 +29,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + /* * For a prot_numa update we only hold mmap_sem for read so there is a * potential race with faulting where a pmd was temporarily none. This @@ -322,6 +324,15 @@ success: change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + /* + * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major + * fault on access. 
+ */ + if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && + (newflags & VM_WRITE)) { + populate_vma_page_range(vma, start, end, NULL); + } + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); diff --git a/mm/mremap.c b/mm/mremap.c index 034e2d360652..a7c93eceb1c8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include <linux/mmu_notifier.h> #include <linux/sched/sysctl.h> #include <linux/uaccess.h> +#include <linux/mm-arch-hooks.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); - if (err < 0) { - move_page_tables(new_vma, new_addr, vma, old_addr, - moved_len, true); - return err; + } else { + if (vma->vm_file && vma->vm_file->f_op->mremap) { + err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + if (err < 0) { + move_page_tables(new_vma, new_addr, vma, + old_addr, moved_len, true); + return err; + } } + arch_remap(mm, old_addr, old_addr + old_len, + new_addr, new_addr + new_len); } /* Conceal VM_ACCOUNT so old reservation is not undone */ diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 90b50468333e..bae652713ee5 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -77,7 +77,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -92,7 +92,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) while (start + (1UL << order) > end) order--; - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); start += (1UL << order); } @@ -121,6 +121,9 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2fd31aebef30..afd5459c0420 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/interrupt.h> +#include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> @@ -61,6 +62,7 @@ #include <linux/hugetlb.h> #include <linux/sched/rt.h> #include <linux/page_owner.h> +#include <linux/kthread.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -235,6 +237,77 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ + pgdat->first_deferred_pfn = ULONG_MAX; +} + +/* Returns true if the struct page for the pfn is uninitialised */ +static inline bool __meminit early_page_uninitialised(unsigned long pfn) +{ + int nid = early_pfn_to_nid(pfn); + + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +/* + * Returns false when the 
remaining initialisation should be deferred until + * later in the boot cycle when it can be parallelised. + */ +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + /* Always populate low zones for address-contrained allocations */ + if (zone_end < pgdat_end_pfn(pgdat)) + return true; + + /* Initialise at least 2G of the highest zone */ + (*nr_initialised)++; + if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + pgdat->first_deferred_pfn = pfn; + return false; + } + + return true; +} +#else +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ +} + +static inline bool early_page_uninitialised(unsigned long pfn) +{ + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + return false; +} + +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + return true; +} +#endif + + void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled && @@ -373,6 +446,7 @@ void prep_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); + p->mapping = TAIL_MAPPING; p->first_page = page; /* Make sure p->first_page is always valid for PageTail() */ smp_wmb(); @@ -380,20 +454,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -static inline void prep_zero_page(struct page *page, unsigned int order, - gfp_t gfp_flags) -{ - int i; - - /* - * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO - * and __GFP_HIGHMEM from hard or soft interrupt context. - */ - VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); -} - #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly; @@ -765,6 +825,12 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + page->mapping = NULL; + return 1; + } + page->mapping = NULL; if (!IS_ENABLED(CONFIG_DEBUG_VM)) return 0; if (unlikely(!PageTail(page))) { @@ -778,6 +844,75 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return 0; } +static void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid) +{ + set_page_links(page, zone, nid, pfn); + init_page_count(page); + page_mapcount_reset(page); + page_cpupid_reset_last(page); + + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. 
*/ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif +} + +static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, + int nid) +{ + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void init_reserved_page(unsigned long pfn) +{ + pg_data_t *pgdat; + int nid, zid; + + if (!early_page_uninitialised(pfn)) + return; + + nid = early_pfn_to_nid(pfn); + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + break; + } + __init_single_pfn(pfn, zid, nid); +} +#else +static inline void init_reserved_page(unsigned long pfn) +{ +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +/* + * Initialised pages do not have PageReserved set. This function is + * called for each range allocated by the bootmem allocator and + * marks the pages PageReserved. The remaining valid pages are later + * sent to the buddy page allocator. + */ +void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + for (; start_pfn < end_pfn; start_pfn++) { + if (pfn_valid(start_pfn)) { + struct page *page = pfn_to_page(start_pfn); + + init_reserved_page(start_pfn); + SetPageReserved(page); + } + } +} + static bool free_pages_prepare(struct page *page, unsigned int order) { bool compound = PageCompound(page); @@ -832,7 +967,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -void __init __free_pages_bootmem(struct page *page, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, + unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -852,6 +988,223 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order) __free_pages(page, order); } +#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ + defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +/* Only safe to use early in boot when initialisation is single-threaded */ +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + /* The system will behave unpredictably otherwise */ + BUG_ON(system_state != SYSTEM_BOOTING); + + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; +} +#endif + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + int nid; + + nid = __early_pfn_to_nid(pfn, state); + if (nid >= 0 && nid != node) + return false; + return true; +} + +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + +#else + +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return true; +} +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + return true; +} +#endif + + +void __init __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order) +{ + if (early_page_uninitialised(pfn)) + return; + return __free_pages_boot_core(page, pfn, order); +} + +#ifdef 
CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void __init deferred_free_range(struct page *page, + unsigned long pfn, int nr_pages) +{ + int i; + + if (!page) + return; + + /* Free a large naturally-aligned chunk if possible */ + if (nr_pages == MAX_ORDER_NR_PAGES && + (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __free_pages_boot_core(page, pfn, MAX_ORDER-1); + return; + } + + for (i = 0; i < nr_pages; i++, page++, pfn++) + __free_pages_boot_core(page, pfn, 0); +} + +static __initdata DECLARE_RWSEM(pgdat_init_rwsem); + +/* Initialise remaining memory on a node */ +static int __init deferred_init_memmap(void *data) +{ + pg_data_t *pgdat = data; + int nid = pgdat->node_id; + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long start = jiffies; + unsigned long nr_pages = 0; + unsigned long walk_start, walk_end; + int i, zid; + struct zone *zone; + unsigned long first_init_pfn = pgdat->first_deferred_pfn; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (first_init_pfn == ULONG_MAX) { + up_read(&pgdat_init_rwsem); + return 0; + } + + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + /* Sanity check boundaries */ + BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); + BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); + pgdat->first_deferred_pfn = ULONG_MAX; + + /* Only the highest zone is deferred so find it */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + zone = pgdat->node_zones + zid; + if (first_init_pfn < zone_end_pfn(zone)) + break; + } + + for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { + unsigned long pfn, end_pfn; + struct page *page = NULL; + struct page *free_base_page = NULL; + unsigned long free_base_pfn = 0; + int nr_to_free = 0; + + end_pfn = min(walk_end, zone_end_pfn(zone)); + pfn = first_init_pfn; + if (pfn < walk_start) + pfn = walk_start; + if (pfn < zone->zone_start_pfn) + pfn = zone->zone_start_pfn; + + for (; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + goto free_range; + + /* + * Ensure pfn_valid is checked every + * MAX_ORDER_NR_PAGES for memory holes + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(pfn)) { + page = NULL; + goto free_range; + } + } + + if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + page = NULL; + goto free_range; + } + + /* Minimise pfn page lookups and scheduler checks */ + if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + page++; + } else { + nr_pages += nr_to_free; + deferred_free_range(free_base_page, + free_base_pfn, nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + + page = pfn_to_page(pfn); + cond_resched(); + } + + if (page->flags) { + VM_BUG_ON(page_zone(page) != zone); + goto free_range; + } + + __init_single_page(page, pfn, zid, nid); + if (!free_base_page) { + free_base_page = page; + free_base_pfn = pfn; + nr_to_free = 0; + } + nr_to_free++; + + /* Where possible, batch up pages for a single free */ + continue; +free_range: + /* Free the current block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, + nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + } + + first_init_pfn = max(end_pfn, first_init_pfn); + } + + /* Sanity check that the next zone really is unpopulated */ + WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); + + pr_info("node %d initialised, %lu pages in %ums\n", nid, 
nr_pages, + jiffies_to_msecs(jiffies - start)); + up_read(&pgdat_init_rwsem); + return 0; +} + +void __init page_alloc_init_late(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + down_read(&pgdat_init_rwsem); + kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); + } + + /* Block until all are initialised */ + down_write(&pgdat_init_rwsem); + up_write(&pgdat_init_rwsem); +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) @@ -975,7 +1328,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -4203,6 +4557,9 @@ static void setup_zone_migrate_reserve(struct zone *zone) zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) + return; + if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); @@ -4265,15 +4622,16 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - struct page *page; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; + unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &NODE_DATA(nid)->node_zones[zone]; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -4285,14 +4643,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; if (!early_pfn_in_nid(pfn, nid)) continue; + if (!update_defer_init(pgdat, pfn, end_pfn, + &nr_initialised)) + break; } - page = pfn_to_page(pfn); - set_page_links(page, zone, nid, pfn); - mminit_verify_page_links(page, zone, nid, pfn); - init_page_count(page); - page_mapcount_reset(page); - page_cpupid_reset_last(page); - SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -4307,17 +4662,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. */ - if ((z->zone_start_pfn <= pfn) - && (pfn < zone_end_pfn(z)) - && !(pfn & (pageblock_nr_pages - 1))) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + if (!(pfn & (pageblock_nr_pages - 1))) { + struct page *page = pfn_to_page(pfn); - INIT_LIST_HEAD(&page->lru); -#ifdef WANT_PAGE_VIRTUAL - /* The shift won't overflow because ZONE_NORMAL is below 4G. */ - if (!is_highmem_idx(zone)) - set_page_address(page, __va(pfn << PAGE_SHIFT)); -#endif + __init_single_page(page, pfn, zone, nid); + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } else { + __init_single_pfn(pfn, zone, nid); + } } } @@ -4575,57 +4927,30 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 
*/ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; int nid; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. - */ - static unsigned long __meminitdata last_start_pfn, last_end_pfn; - static int __meminitdata last_nid; - if (last_start_pfn <= pfn && pfn < last_end_pfn) - return last_nid; + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); if (nid != -1) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; } return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -int __meminit early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; -} - -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0 && nid != node) - return false; - return true; -} -#endif - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. @@ -5144,6 +5469,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6111,9 +6437,9 @@ out: return ret; } +#ifdef CONFIG_NUMA int hashdist = HASHDIST_DEFAULT; -#ifdef CONFIG_NUMA static int __init set_hashdist(char *str) { if (!str) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 755a42c76eb4..0e69d259beb7 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); - if (!is_migrate_isolate_page(buddy)) { + if (pfn_valid_within(page_to_pfn(buddy)) && + !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); kernel_map_pages(page, (1 << order), 1); set_page_refcounted(page); @@ -177,8 +178,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo: for (pfn = start_pfn; pfn < undo_pfn; - pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn), migratetype); + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page) + unset_migratetype_isolate(page, migratetype); + } return -EBUSY; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c25f94b33811..af063ad47d55 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -119,14 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, } #endif -#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); 
+ VM_BUG_ON(!pmd_trans_huge(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } diff --git a/mm/rmap.c b/mm/rmap.c index 24dd3f9fee27..9c045940ed10 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -625,7 +625,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pmd = pmd_offset(pud, address); /* - * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ @@ -712,6 +712,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) } struct page_referenced_arg { + int dirtied; int mapcount; int referenced; unsigned long vm_flags; @@ -726,6 +727,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; int referenced = 0; + int dirty = 0; struct page_referenced_arg *pra = arg; if (unlikely(PageTransHuge(page))) { @@ -749,6 +751,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; + + /* + * Use pmd_freeable instead of raw pmd_dirty because in some + * of architecture, pmd_dirty is not defined unless + * CONFIG_TRANSPARENT_HUGEPAGE is enabled + */ + if (!pmd_freeable(*pmd)) + dirty++; + spin_unlock(ptl); } else { pte_t *pte; @@ -778,6 +789,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } + + if (pte_dirty(*pte)) + dirty++; + pte_unmap_unlock(pte, ptl); } @@ -786,6 +801,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, pra->vm_flags |= vma->vm_flags; } + if (dirty) + pra->dirtied++; + pra->mapcount--; if (!pra->mapcount) return SWAP_SUCCESS; /* To break the loop */ @@ -810,6 +828,7 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) * @is_locked: caller holds lock on the page * @memcg: target memory cgroup * @vm_flags: collect encountered vma->vm_flags who actually referenced the page + * @is_pte_dirty: ptes which have marked dirty bit - used for lazyfree page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. 
@@ -817,7 +836,8 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) int page_referenced(struct page *page, int is_locked, struct mem_cgroup *memcg, - unsigned long *vm_flags) + unsigned long *vm_flags, + int *is_pte_dirty) { int ret; int we_locked = 0; @@ -832,6 +852,9 @@ int page_referenced(struct page *page, }; *vm_flags = 0; + if (is_pte_dirty) + *is_pte_dirty = 0; + if (!page_mapped(page)) return 0; @@ -859,6 +882,9 @@ int page_referenced(struct page *page, if (we_locked) unlock_page(page); + if (is_pte_dirty) + *is_pte_dirty = pra.dirtied; + return pra.referenced; } @@ -950,7 +976,12 @@ void page_move_anon_rmap(struct page *page, VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; + /* + * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written + * simultaneously, so a concurrent reader (eg page_referenced()'s + * PageAnon()) will not see one without the other. + */ + WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); } /** @@ -1187,6 +1218,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spinlock_t *ptl; int ret = SWAP_AGAIN; enum ttu_flags flags = (enum ttu_flags)arg; + int dirty = 0; pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) @@ -1216,7 +1248,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) + dirty = pte_dirty(pteval); + if (dirty) set_page_dirty(page); /* Update high watermark before we lower rss */ @@ -1245,6 +1278,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; + if (flags & TTU_FREE) { + VM_BUG_ON_PAGE(PageSwapCache(page), page); + if (!dirty && !PageDirty(page)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } else { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } + } + if (PageSwapCache(page)) { /* * Store the swap location in the pte. 
@@ -1285,6 +1331,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } else dec_mm_counter(mm, MM_FILEPAGES); +discard: page_remove_rmap(page); page_cache_release(page); diff --git a/mm/shmem.c b/mm/shmem.c index a59087edf728..b03960be82d6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -981,7 +981,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __set_page_locked(newpage); + __SetPageLocked(newpage); SetPageUptodate(newpage); SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); @@ -1173,7 +1173,7 @@ repeat: } __SetPageSwapBacked(page); - __set_page_locked(page); + __SetPageLocked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); diff --git a/mm/slab.c b/mm/slab.c index 7eb38dd1cefa..504adb18522d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1454,6 +1454,7 @@ void __init kmem_cache_init(void) kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; + setup_kmalloc_cache_index_table(); slab_early_init = 0; @@ -3415,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + #ifdef CONFIG_TRACING void * kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) diff --git a/mm/slab.h b/mm/slab.h index 4c3ac12dd644..88b55497738c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags, #ifndef CONFIG_SLOB /* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(unsigned long); /* Find the kmalloc slab corresponding for a certain size */ @@ -162,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the objecct listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_MEMCG_KMEM /* * Iterate over all memcg caches of the given root cache. 
The caller must hold diff --git a/mm/slab_common.c b/mm/slab_common.c index 999bb3424d44..8873985f905e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -105,6 +105,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) } #endif +void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) + kmem_cache_free(s, p[i]); +} + +bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, + void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + if (!x) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + } + return true; +} + #ifdef CONFIG_MEMCG_KMEM void slab_init_memcg_params(struct kmem_cache *s) { @@ -784,25 +807,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) } /* - * Create the kmalloc array. Some of the regular kmalloc arrays - * may already have been created because they were needed to - * enable allocations for slab creation. + * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. + * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is + * kmalloc-67108864. */ -void __init create_kmalloc_caches(unsigned long flags) +static struct { + const char *name; + unsigned long size; +} const kmalloc_info[] __initconst = { + {NULL, 0}, {"kmalloc-96", 96}, + {"kmalloc-192", 192}, {"kmalloc-8", 8}, + {"kmalloc-16", 16}, {"kmalloc-32", 32}, + {"kmalloc-64", 64}, {"kmalloc-128", 128}, + {"kmalloc-256", 256}, {"kmalloc-512", 512}, + {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, + {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, + {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, + {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, + {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, + {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, + {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, + {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, + {"kmalloc-67108864", 67108864} +}; + +/* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ +void __init setup_kmalloc_cache_index_table(void) { int i; - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. - * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); @@ -833,39 +876,41 @@ void __init create_kmalloc_caches(unsigned long flags) for (i = 128 + 8; i <= 192; i += 8) size_index[size_index_elem(i)] = 8; } - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { +} + +/* + * Create the kmalloc array. Some of the regular kmalloc arrays + * may already have been created because they were needed to + * enable allocations for slab creation. 
+ */ +void __init create_kmalloc_caches(unsigned long flags) +{ + int i; + + for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[i]) { - kmalloc_caches[i] = create_kmalloc_cache(NULL, - 1 << i, flags); + kmalloc_caches[i] = create_kmalloc_cache( + kmalloc_info[i].name, + kmalloc_info[i].size, + flags); } /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches + * "i == 2" is the "kmalloc-192" case which is the last special + * case for initialization and it's the point to jump to + * allocate the minimize size of the object. In slab allocator, + * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4 + * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is + * defined, it may be larger than 2^5 and here is also the + * trick to skip the empty gap. */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); - - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); + if (i == 2) + i = (KMALLOC_SHIFT_LOW - 1); } /* Kmalloc array is now usable */ slab_state = UP; - for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; - char *n; - - if (s) { - n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); - - BUG_ON(!n); - s->name = n; - } - } - #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { struct kmem_cache *s = kmalloc_caches[i]; diff --git a/mm/slob.c b/mm/slob.c index 4765f65019c7..495df8e006ec 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/mm/slub.c b/mm/slub.c index 54c0876b43d5..f920dc583f5d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x) */ static __always_inline void slab_lock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); bit_spin_lock(PG_locked, &page->flags); } static __always_inline void slab_unlock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); } @@ -2750,6 +2752,72 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + if (!kmem_cache_debug(s)) { + struct kmem_cache_cpu *c; + + /* Drain objects in the per cpu slab */ + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + while (size) { + void *object = c->freelist; + + if (unlikely(!object)) { + /* + * Check if there are remotely freed objects + * available in the page. + */ + object = get_freelist(s, c->page); + + if (!object) { + /* + * All objects in use. Let's check if + * we have other per cpu partial pages + * that have available objects. 
+ */ + c->page = c->partial; + if (!c->page) { + /* No per cpu objects left */ + c->freelist = NULL; + break; + } + + /* Next per cpu partial page */ + c->partial = c->page->next; + c->freelist = get_freelist(s, c->page); + continue; + } + + } + + *p++ = object; + size--; + + if (unlikely(flags & __GFP_ZERO)) + memset(object, 0, s->object_size); + + c->freelist = get_freepointer(s, object); + + } + c->tid = next_tid(c->tid); + + local_irq_enable(); + } + + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -3700,6 +3768,7 @@ void __init kmem_cache_init(void) kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ + setup_kmalloc_cache_index_table(); create_kmalloc_caches(0); #ifdef CONFIG_SMP diff --git a/mm/swap.c b/mm/swap.c index a7251a8ed532..e980f52186b9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -44,6 +44,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -131,7 +132,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page) * here, see the comment above this function. */ VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); if (put_page_testzero(page_head)) { /* * If this is the tail of a slab THP page, @@ -210,8 +210,6 @@ out_put_single: */ if (put_page_testzero(page_head)) VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); atomic_dec(&page->_mapcount); VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); @@ -797,6 +795,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } + +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_event(PGDEACTIVATE); + update_page_reclaim_stat(lruvec, file, 0); + } +} + /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -823,6 +839,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + activate_page_drain(cpu); } @@ -852,6 +872,26 @@ void deactivate_file_page(struct page *page) } } +/** + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. 
+ */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + void lru_add_drain(void) { lru_add_drain_cpu(get_cpu()); @@ -881,6 +921,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); diff --git a/mm/swap_state.c b/mm/swap_state.c index 8bc8e66138da..a2611ce55413 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -357,7 +357,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { @@ -371,7 +371,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } radix_tree_preload_end(); ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); + __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. diff --git a/mm/util.c b/mm/util.c index 68ff8a5361e7..c7434060039b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -3,6 +3,7 @@ #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> +#include <linux/ctype.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/security.h> @@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) EXPORT_SYMBOL(kstrndup); /** + * kstrimdup - Trim and copy a %NUL terminated string. + * @s: the string to trim and duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Returns an address, which the caller must kfree, containing + * a duplicate of the passed string with leading and/or trailing + * whitespace (as defined by isspace) removed. 
+ */ +char *kstrimdup(const char *s, gfp_t gfp) +{ + char *buf; + char *begin = skip_spaces(s); + size_t len = strlen(begin); + + while (len && isspace(begin[len - 1])) + len--; + + buf = kmalloc_track_caller(len + 1, gfp); + if (!buf) + return NULL; + + memcpy(buf, begin, len); + buf[len] = '\0'; + + return buf; +} +EXPORT_SYMBOL(kstrimdup); + +/** * kmemdup - duplicate region of memory * * @src: memory region to duplicate @@ -355,7 +385,9 @@ struct anon_vma *page_anon_vma(struct page *page) struct address_space *page_mapping(struct page *page) { - unsigned long mapping; + struct address_space *mapping; + + page = compound_head(page); /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -368,10 +400,10 @@ struct address_space *page_mapping(struct page *page) return swap_address_space(entry); } - mapping = (unsigned long)page->mapping; - if (mapping & PAGE_MAPPING_FLAGS) + mapping = page->mapping; + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return page->mapping; + return mapping; } int overcommit_ratio_handler(struct ctl_table *table, int write, diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e8eadd71bac..37e90db1520b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -754,13 +754,17 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, - struct scan_control *sc) + struct scan_control *sc, + bool *freeable) { int referenced_ptes, referenced_page; unsigned long vm_flags; + int pte_dirty; + + VM_BUG_ON_PAGE(!PageLocked(page), page); referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, - &vm_flags); + &vm_flags, &pte_dirty); referenced_page = TestClearPageReferenced(page); /* @@ -801,6 +805,10 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_KEEP; } + if (PageAnon(page) && !pte_dirty && !PageSwapCache(page) && + !PageDirty(page)) + *freeable = true; + /* Reclaim if clean, defer dirty pages to writeback */ if (referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; @@ -869,6 +877,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + bool freeable = false; cond_resched(); @@ -992,7 +1001,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (!force_reclaim) - references = page_check_references(page, sc); + references = page_check_references(page, sc, + &freeable); switch (references) { case PAGEREF_ACTIVATE: @@ -1009,22 +1019,31 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (!add_to_swap(page, page_list)) - goto activate_locked; - may_enter_fs = 1; - - /* Adding to swap updated mapping */ - mapping = page_mapping(page); + if (!freeable) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (!add_to_swap(page, page_list)) + goto activate_locked; + may_enter_fs = 1; + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } else { + if (likely(!PageTransHuge(page))) + goto unmap; + /* try_to_unmap isn't aware of THP page */ + if (unlikely(split_huge_page_to_list(page, + page_list))) + goto keep_locked; + } } - +unmap: /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. 
*/ - if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + if (page_mapped(page) && (mapping || freeable)) { + switch (try_to_unmap(page, + freeable ? TTU_FREE : ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1032,7 +1051,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, case SWAP_MLOCK: goto cull_mlocked; case SWAP_SUCCESS: - ; /* try to free the page below */ + /* try to free the page below */ + if (!freeable) + break; + /* + * Freeable anon page doesn't have mapping + * due to skipping of swapcache so we free + * page in here rather than __remove_mapping. + */ + VM_BUG_ON_PAGE(PageSwapCache(page), page); + if (!page_freeze_refs(page, 1)) + goto keep_locked; + __ClearPageLocked(page); + count_vm_event(PGLAZYFREED); + goto free_it; } } @@ -1142,7 +1174,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * we obviously don't have to worry about waking up a process * waiting on the page lock, because there are no references. */ - __clear_page_locked(page); + __ClearPageLocked(page); free_it: nr_reclaimed++; @@ -1401,6 +1433,32 @@ int isolate_lru_page(struct page *page) return ret; } +static int __too_many_isolated(struct zone *zone, int file, + struct scan_control *sc, int safe) +{ + unsigned long inactive, isolated; + + if (safe) { + inactive = zone_page_state_snapshot(zone, + NR_INACTIVE_ANON + 2 * file); + isolated = zone_page_state_snapshot(zone, + NR_ISOLATED_ANON + file); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file); + isolated = zone_page_state(zone, NR_ISOLATED_ANON + file); + } + + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + + return isolated > inactive; +} + /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get resheduled. When there are massive number of tasks doing page @@ -1409,33 +1467,24 @@ int isolate_lru_page(struct page *page) * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct zone *zone, int file, - struct scan_control *sc) + struct scan_control *sc) { - unsigned long inactive, isolated; - if (current_is_kswapd()) return 0; if (!global_reclaim(sc)) return 0; - if (file) { - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - isolated = zone_page_state(zone, NR_ISOLATED_FILE); - } else { - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_ANON); - } - /* - * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they - * won't get blocked by normal direct-reclaimers, forming a circular - * deadlock. + * __too_many_isolated(safe=0) is fast but inaccurate, because it + * doesn't account for the vm_stat_diff[] counters. So if it looks + * like too_many_isolated() is about to return true, fall back to the + * slower, more accurate zone_page_state_snapshot(). 
*/ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) - inactive >>= 3; + if (unlikely(__too_many_isolated(zone, file, sc, 0))) + return __too_many_isolated(zone, file, sc, 1); - return isolated > inactive; + return 0; } static noinline_for_stack void @@ -1772,7 +1821,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (page_referenced(page, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags, NULL)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -2646,7 +2695,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone)) + if (!populated_zone(zone) || + zone_reclaimable_pages(zone) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -3596,7 +3646,7 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -3638,12 +3688,12 @@ static long zone_pagecache_reclaimable(struct zone *zone) long delta = 0; /* - * If RECLAIM_SWAP is set, then all file pages are considered + * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and zone_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_SWAP) + if (zone_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); else nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); @@ -3674,15 +3724,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .order = order, .priority = ZONE_RECLAIM_PRIORITY, .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, }; cond_resched(); /* - * We need to be able to allocate from the reserves for RECLAIM_SWAP + * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_SWAP. + * and RECLAIM_UNMAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); diff --git a/mm/vmstat.c b/mm/vmstat.c index 4f5cd974e11a..1fd0886a389f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -759,6 +759,7 @@ const char * const vmstat_text[] = { "pgfault", "pgmajfault", + "pglazyfreed", TEXTS_FOR_ZONES("pgrefill") TEXTS_FOR_ZONES("pgsteal_kswapd") diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 08bd7a3d464a..33d512646379 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -45,10 +45,6 @@ * */ -#ifdef CONFIG_ZSMALLOC_DEBUG -#define DEBUG -#endif - #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> diff --git a/mm/zswap.c b/mm/zswap.c index 4249e82ff934..f8583f1fc938 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -490,7 +490,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, } /* May fail (-ENOMEM) if radix-tree node allocation failed. 
*/ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { @@ -501,7 +501,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, } radix_tree_preload_end(); ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); + __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index d7016279ec2b..fc169fd2a3cc 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -186,6 +186,27 @@ if (-f $conf) { unshift(@ARGV, @conf_args) if @conf_args; } +my @ignore_emails = (); +my $ignore_file = which_conf(".get_maintainer.ignore"); +if (-f $ignore_file) { + open(my $ignore, '<', "$ignore_file") + or warn "$P: Can't find a readable .get_maintainer.ignore file $!\n"; + while (<$ignore>) { + my $line = $_; + + $line =~ s/\s*\n?$//; + $line =~ s/^\s*//; + $line =~ s/\s+$//; + $line =~ s/#.*$//; + + next if ($line =~ m/^\s*$/); + if (rfc822_valid($line)) { + push(@ignore_emails, $line); + } + } + close($ignore); +} + if (!GetOptions( 'email!' => \$email, 'git!' => \$email_git, @@ -513,6 +534,16 @@ if ($web) { exit($exit); +sub ignore_email_address { + my ($address) = @_; + + foreach my $ignore (@ignore_emails) { + return 1 if ($ignore eq $address); + } + + return 0; +} + sub range_is_maintained { my ($start, $end) = @_; @@ -1868,6 +1899,7 @@ sub vcs_assign { my $percent = $sign_offs * 100 / $divisor; $percent = 100 if ($percent > 100); + next if (ignore_email_address($line)); $count++; last if ($sign_offs < $email_git_min_signatures || $count > $email_git_max_maintainers || |