diff options
173 files changed, 6787 insertions, 1803 deletions
@@ -1381,6 +1381,9 @@ S: 17 rue Danton S: F - 94270 Le Kremlin-Bicêtre S: France +N: Jack Hammer +D: IBM ServeRAID RAID (ips) driver maintenance + N: Greg Hankins E: gregh@cc.gatech.edu D: fixed keyboard driver to separate LED and locking status @@ -1691,6 +1694,10 @@ S: Reading S: RG6 2NU S: United Kingdom +N: Dave Jeffery +E: dhjeffery@gmail.com +D: SCSI hacks and IBM ServeRAID RAID driver maintenance + N: Jakub Jelinek E: jakub@redhat.com W: http://sunsite.mff.cuni.cz/~jj diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index d773d5697cf5..3187a18af6da 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -53,3 +53,11 @@ stable_pages_required (read-only) If set, the backing device requires that all pages comprising a write request must not be changed until writeout is complete. + +strictlimit (read-write) + + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index ce1126aceed8..223c32171dcc 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block <bool>: 0,1,yes,no,true,false +LIMITATION +--------------------------------------------------------------------- +* The fallocated region of file is discarded at umount/evict time + when using fallocate with FALLOC_FL_KEEP_SIZE. + So, User should assume that fallocated region can be discarded at + last close if there is memory pressure resulting in eviction of + the inode from the memory. As a result, for any dependency on + the fallocated region, user should make sure to recheck fallocate + after reopening the file. + TODO ---------------------------------------------------------------------- * Need to get rid of the raw scanning stuff. Instead, always use diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index f34a8ee6f860..9735c87ca363 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -87,6 +87,13 @@ pages_sharing - how many more sites are sharing them i.e. how much saved pages_unshared - how many pages unique but repeatedly checked for merging pages_volatile - how many pages changing too fast to be placed in a tree full_scans - how many times all mergeable areas have been scanned +deferrable_timer - whether to use deferrable timers or not + e.g. "echo 1 > /sys/kernel/mm/ksm/deferrable_timer" + Default: 0 (means, we are not using deferrable timers. Users + might want to set deferrable_timer option if they donot want + ksm thread to wakeup CPU to carryout ksm activities thus + gaining on battery while compromising slightly on memory + that could have been saved.) A high ratio of pages_sharing to pages_shared indicates good sharing, but a high ratio of pages_unshared to pages_sharing indicates wasted effort. diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt index 560e4363a55d..f609142f406a 100644 --- a/Documentation/vm/remap_file_pages.txt +++ b/Documentation/vm/remap_file_pages.txt @@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit virtual address space. This use-case is not critical anymore since 64-bit systems are widely available. -The plan is to deprecate the syscall and replace it with an emulation. -The emulation will create new VMAs instead of nonlinear mappings. It's -going to work slower for rare users of remap_file_pages() but ABI is -preserved. +The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. One side effect of emulation (apart from performance) is that user can hit vm.max_map_count limit more easily due to additional VMAs. See comment for diff --git a/MAINTAINERS b/MAINTAINERS index 226c8025688d..c3346aabdfb8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -597,7 +597,7 @@ AMD GEODE CS5536 USB DEVICE CONTROLLER DRIVER M: Thomas Dahlmann <dahlmann.thomas@arcor.de> L: linux-geode@lists.infradead.org (moderated for non-subscribers) S: Supported -F: drivers/usb/gadget/amd5536udc.* +F: drivers/usb/gadget/udc/amd5536udc.* AMD GEODE PROCESSOR/CHIPSET SUPPORT P: Andres Salomon <dilinger@queued.net> @@ -621,7 +621,7 @@ AMD MICROCODE UPDATE SUPPORT M: Andreas Herrmann <herrmann.der.user@googlemail.com> L: amd64-microcode@amd64.org S: Maintained -F: arch/x86/kernel/microcode_amd.c +F: arch/x86/kernel/cpu/microcode/amd* AMD XGBE DRIVER M: Tom Lendacky <thomas.lendacky@amd.com> @@ -911,7 +911,7 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://git.kernel.org/pub/scm/linux/kernel/git/baohua/linux.git S: Maintained F: arch/arm/mach-prima2/ -F: drivers/clk/clk-prima2.c +F: drivers/clk/sirf/ F: drivers/clocksource/timer-prima2.c F: drivers/clocksource/timer-marco.c N: [^a-z]sirf @@ -1164,6 +1164,7 @@ M: Linus Walleij <linus.walleij@linaro.org> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-nomadik/ +F: drivers/pinctrl/nomadik/ F: drivers/i2c/busses/i2c-nomadik.c T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-nomadik.git @@ -1185,8 +1186,7 @@ F: drivers/mmc/host/msm_sdcc.h F: drivers/tty/serial/msm_serial.h F: drivers/tty/serial/msm_serial.c F: drivers/*/pm8???-* -F: drivers/mfd/ssbi/ -F: include/linux/mfd/pm8xxx/ +F: drivers/mfd/ssbi.c T: git git://git.kernel.org/pub/scm/linux/kernel/git/davidb/linux-msm.git S: Maintained @@ -1443,7 +1443,8 @@ F: drivers/mfd/abx500* F: drivers/mfd/ab8500* F: drivers/mfd/dbx500* F: drivers/mfd/db8500* -F: drivers/pinctrl/pinctrl-nomadik* +F: drivers/pinctrl/nomadik/pinctrl-ab* +F: drivers/pinctrl/nomadik/pinctrl-nomadik* F: drivers/rtc/rtc-ab8500.c F: drivers/rtc/rtc-pl031.c T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git @@ -1699,7 +1700,7 @@ ATMEL USBA UDC DRIVER M: Nicolas Ferre <nicolas.ferre@atmel.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported -F: drivers/usb/gadget/atmel_usba_udc.* +F: drivers/usb/gadget/udc/atmel_usba_udc.* ATMEL WIRELESS DRIVER M: Simon Kelley <simon@thekelleys.org.uk> @@ -1991,7 +1992,7 @@ F: arch/arm/boot/dts/bcm113* F: arch/arm/boot/dts/bcm216* F: arch/arm/boot/dts/bcm281* F: arch/arm/configs/bcm_defconfig -F: drivers/mmc/host/sdhci_bcm_kona.c +F: drivers/mmc/host/sdhci-bcm-kona.c F: drivers/clocksource/bcm_kona_timer.c BROADCOM BCM2835 ARM ARCHICTURE @@ -2341,12 +2342,6 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/cirrus/ep93xx_eth.c -CIRRUS LOGIC EP93XX OHCI USB HOST DRIVER -M: Lennert Buytenhek <kernel@wantstofly.org> -L: linux-usb@vger.kernel.org -S: Maintained -F: drivers/usb/host/ohci-ep93xx.c - CIRRUS LOGIC AUDIO CODEC DRIVERS M: Brian Austin <brian.austin@cirrus.com> M: Paul Handrigan <Paul.Handrigan@cirrus.com> @@ -2431,7 +2426,7 @@ W: http://linux-cifs.samba.org/ Q: http://patchwork.ozlabs.org/project/linux-cifs-client/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git S: Supported -F: Documentation/filesystems/cifs.txt +F: Documentation/filesystems/cifs/ F: fs/cifs/ COMPACTPCI HOTPLUG CORE @@ -2966,7 +2961,9 @@ L: linux-media@vger.kernel.org L: dri-devel@lists.freedesktop.org L: linaro-mm-sig@lists.linaro.org F: drivers/dma-buf/ -F: include/linux/dma-buf* include/linux/reservation.h include/linux/*fence.h +F: include/linux/dma-buf* +F: include/linux/reservation.h +F: include/linux/*fence.h F: Documentation/dma-buf-sharing.txt T: git git://git.linaro.org/people/sumitsemwal/linux-dma-buf.git @@ -3061,7 +3058,6 @@ L: dri-devel@lists.freedesktop.org T: git git://people.freedesktop.org/~agd5f/linux S: Supported F: drivers/gpu/drm/radeon/ -F: include/drm/radeon* F: include/uapi/drm/radeon* DRM PANEL DRIVERS @@ -3255,26 +3251,12 @@ T: git git://linuxtv.org/anttip/media_tree.git S: Maintained F: drivers/media/tuners/e4000* -EATA-DMA SCSI DRIVER -M: Michael Neuffer <mike@i-Connect.Net> -L: linux-eata@i-connect.net -L: linux-scsi@vger.kernel.org -S: Maintained -F: drivers/scsi/eata* - EATA ISA/EISA/PCI SCSI DRIVER M: Dario Ballabio <ballabio_dario@emc.com> L: linux-scsi@vger.kernel.org S: Maintained F: drivers/scsi/eata.c -EATA-PIO SCSI DRIVER -M: Michael Neuffer <mike@i-Connect.Net> -L: linux-eata@i-connect.net -L: linux-scsi@vger.kernel.org -S: Maintained -F: drivers/scsi/eata_pio.* - EC100 MEDIA DRIVER M: Antti Palosaari <crope@iki.fi> L: linux-media@vger.kernel.org @@ -3449,7 +3431,7 @@ M: Matt Fleming <matt.fleming@intel.com> L: linux-efi@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi.git S: Maintained -F: Documentation/x86/efi-stub.txt +F: Documentation/efi-stub.txt F: arch/ia64/kernel/efi.c F: arch/x86/boot/compressed/eboot.[ch] F: arch/x86/include/asm/efi.h @@ -3836,7 +3818,7 @@ M: Li Yang <leoli@freescale.com> L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org S: Maintained -F: drivers/usb/gadget/fsl* +F: drivers/usb/gadget/udc/fsl* FREESCALE QUICC ENGINE UCC ETHERNET DRIVER M: Li Yang <leoli@freescale.com> @@ -4526,10 +4508,7 @@ S: Supported F: drivers/scsi/ibmvscsi/ibmvfc* IBM ServeRAID RAID DRIVER -P: Jack Hammer -M: Dave Jeffery <ipslinux@adaptec.com> -W: http://www.developer.ibm.com/welcome/netfinity/serveraid.html -S: Supported +S: Orphan F: drivers/scsi/ips.* ICH LPC AND GPIO DRIVER @@ -4726,8 +4705,8 @@ F: drivers/platform/x86/intel_menlow.c INTEL IA32 MICROCODE UPDATE SUPPORT M: Tigran Aivazian <tigran@aivazian.fsnet.co.uk> S: Maintained -F: arch/x86/kernel/microcode_core.c -F: arch/x86/kernel/microcode_intel.c +F: arch/x86/kernel/cpu/microcode/core* +F: arch/x86/kernel/cpu/microcode/intel* INTEL I/OAT DMA DRIVER M: Dan Williams <dan.j.williams@intel.com> @@ -5186,7 +5165,6 @@ L: linux-nfs@vger.kernel.org W: http://nfs.sourceforge.net/ S: Supported F: fs/nfsd/ -F: include/linux/nfsd/ F: include/uapi/linux/nfsd/ F: fs/lockd/ F: fs/nfs_common/ @@ -5907,7 +5885,6 @@ F: drivers/clocksource/metag_generic.c F: drivers/irqchip/irq-metag.c F: drivers/irqchip/irq-metag-ext.c F: drivers/tty/metag_da.c -F: fs/imgdafs/ MICROBLAZE ARCHITECTURE M: Michal Simek <monstr@monstr.eu> @@ -7006,9 +6983,9 @@ M: Jamie Iles <jamie@jamieiles.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://github.com/jamieiles/linux-2.6-ji.git S: Supported +F: arch/arm/boot/dts/picoxcell* F: arch/arm/mach-picoxcell/ -F: drivers/*/picoxcell* -F: drivers/*/*/picoxcell* +F: drivers/crypto/picoxcell* PIN CONTROL SUBSYSTEM M: Linus Walleij <linus.walleij@linaro.org> @@ -7232,7 +7209,7 @@ F: drivers/ptp/* F: include/linux/ptp_cl* PTRACE SUPPORT -M: Roland McGrath <roland@redhat.com> +M: Roland McGrath <roland@hack.frob.com> M: Oleg Nesterov <oleg@redhat.com> S: Maintained F: include/asm-generic/syscall.h @@ -7282,7 +7259,7 @@ S: Maintained F: arch/arm/mach-pxa/ F: drivers/pcmcia/pxa2xx* F: drivers/spi/spi-pxa2xx* -F: drivers/usb/gadget/pxa2* +F: drivers/usb/gadget/udc/pxa2* F: include/sound/pxa2xx-lib.h F: sound/arm/pxa* F: sound/soc/pxa/ @@ -7291,7 +7268,7 @@ PXA3xx NAND FLASH DRIVER M: Ezequiel Garcia <ezequiel.garcia@free-electrons.com> L: linux-mtd@lists.infradead.org S: Maintained -F: drivers/mtd/nand/pxa3xx-nand.c +F: drivers/mtd/nand/pxa3xx_nand.c MMP SUPPORT M: Eric Miao <eric.y.miao@gmail.com> @@ -9636,8 +9613,8 @@ USB WEBCAM GADGET M: Laurent Pinchart <laurent.pinchart@ideasonboard.com> L: linux-usb@vger.kernel.org S: Maintained -F: drivers/usb/gadget/*uvc*.c -F: drivers/usb/gadget/webcam.c +F: drivers/usb/gadget/function/*uvc*.c +F: drivers/usb/gadget/legacy/webcam.c USB WIRELESS RNDIS DRIVER (rndis_wlan) M: Jussi Kivilinna <jussi.kivilinna@iki.fi> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 32cbbd565902..c49a775937db 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1983,6 +1983,8 @@ config XIP_PHYS_ADDR config KEXEC bool "Kexec system call (EXPERIMENTAL)" depends on (!SMP || PM_SLEEP_SMP) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/arm/mach-omap2/board-omap3touchbook.c b/arch/arm/mach-omap2/board-omap3touchbook.c index 7da48bc42bbf..70b904c010c6 100644 --- a/arch/arm/mach-omap2/board-omap3touchbook.c +++ b/arch/arm/mach-omap2/board-omap3touchbook.c @@ -336,7 +336,7 @@ static int __init early_touchbook_revision(char *p) if (!p) return 0; - return strict_strtoul(p, 10, &touchbook_revision); + return kstrtoul(p, 10, &touchbook_revision); } early_param("tbr", early_touchbook_revision); diff --git a/arch/arm/mach-omap2/mux.c b/arch/arm/mach-omap2/mux.c index f62f7537d899..ac8a249779f2 100644 --- a/arch/arm/mach-omap2/mux.c +++ b/arch/arm/mach-omap2/mux.c @@ -681,29 +681,19 @@ static ssize_t omap_mux_dbg_signal_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - char buf[OMAP_MUX_MAX_ARG_CHAR]; struct seq_file *seqf; struct omap_mux *m; - unsigned long val; - int buf_size, ret; + u16 val; + int ret; struct omap_mux_partition *partition; if (count > OMAP_MUX_MAX_ARG_CHAR) return -EINVAL; - memset(buf, 0, sizeof(buf)); - buf_size = min(count, sizeof(buf) - 1); - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - ret = strict_strtoul(buf, 0x10, &val); + ret = kstrtou16_from_user(user_buf, count, 0x10, &val); if (ret < 0) return ret; - if (val > 0xffff) - return -EINVAL; - seqf = file->private_data; m = seqf->private; @@ -711,7 +701,7 @@ static ssize_t omap_mux_dbg_signal_write(struct file *file, if (!partition) return -ENODEV; - omap_mux_write(partition, (u16)val, m->reg_offset); + omap_mux_write(partition, val, m->reg_offset); *ppos += count; return count; @@ -917,14 +907,14 @@ static void __init omap_mux_set_cmdline_signals(void) while ((token = strsep(&next_opt, ",")) != NULL) { char *keyval, *name; - unsigned long val; + u16 val; keyval = token; name = strsep(&keyval, "="); if (name) { int res; - res = strict_strtoul(keyval, 0x10, &val); + res = kstrtou16(keyval, 0x10, &val); if (res < 0) continue; diff --git a/arch/arm/mach-pxa/balloon3.c b/arch/arm/mach-pxa/balloon3.c index 43596e0ed051..d897292712eb 100644 --- a/arch/arm/mach-pxa/balloon3.c +++ b/arch/arm/mach-pxa/balloon3.c @@ -90,7 +90,7 @@ int __init parse_balloon3_features(char *arg) if (!arg) return 0; - return strict_strtoul(arg, 0, &balloon3_features_present); + return kstrtoul(arg, 0, &balloon3_features_present); } early_param("balloon3_features", parse_balloon3_features); diff --git a/arch/arm/mach-pxa/viper.c b/arch/arm/mach-pxa/viper.c index 41f27f667ca8..de3b08073fe7 100644 --- a/arch/arm/mach-pxa/viper.c +++ b/arch/arm/mach-pxa/viper.c @@ -769,7 +769,7 @@ static unsigned long viper_tpm; static int __init viper_tpm_setup(char *str) { - return strict_strtoul(str, 10, &viper_tpm) >= 0; + return kstrtoul(str, 10, &viper_tpm) >= 0; } __setup("tpm=", viper_tpm_setup); diff --git a/arch/arm/mach-s3c24xx/mach-jive.c b/arch/arm/mach-s3c24xx/mach-jive.c index e647b47244a9..7804d3c6991b 100644 --- a/arch/arm/mach-s3c24xx/mach-jive.c +++ b/arch/arm/mach-s3c24xx/mach-jive.c @@ -242,7 +242,7 @@ static int __init jive_mtdset(char *options) if (options == NULL || options[0] == '\0') return 0; - if (strict_strtoul(options, 10, &set)) { + if (kstrtoul(options, 10, &set)) { printk(KERN_ERR "failed to parse mtdset=%s\n", options); return 0; } diff --git a/arch/arm/mach-w90x900/cpu.c b/arch/arm/mach-w90x900/cpu.c index b1eabaad50a5..213230ee57d1 100644 --- a/arch/arm/mach-w90x900/cpu.c +++ b/arch/arm/mach-w90x900/cpu.c @@ -178,7 +178,8 @@ static int __init nuc900_set_cpufreq(char *str) if (!*str) return 0; - strict_strtoul(str, 0, &cpufreq); + if (kstrtoul(str, 0, &cpufreq)) + return 0; nuc900_clock_source(NULL, "ext"); diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 7a3f462133b0..22b16232bd60 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -28,9 +28,6 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -/* We do define AT_SYSINFO_EHDR but don't use the gate mechanism */ -#define __HAVE_ARCH_GATE_AREA 1 - /* * The idmap and swapper page tables need some space reserved in the kernel * image. Both require pgd, pud (4 levels only) and pmd tables to (section) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index a81a446a5786..32aeea083d93 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -195,25 +195,6 @@ up_fail: } /* - * We define AT_SYSINFO_EHDR, so we need these function stubs to keep - * Linux happy. - */ -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -/* * Update the vDSO data page to keep in sync with kernel timekeeping. */ void update_vsyscall(struct timekeeper *tk) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index c84c88bbbbd7..64aefb76bd69 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -549,6 +549,8 @@ source "drivers/sn/Kconfig" config KEXEC bool "kexec system call" depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index f1e1b2e3cdb3..1f1bf144fe62 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -231,4 +231,6 @@ get_order (unsigned long size) #define PERCPU_ADDR (-PERCPU_PAGE_SIZE) #define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE) +#define __HAVE_ARCH_GATE_AREA 1 + #endif /* _ASM_IA64_PAGE_H */ diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 892d43e32f3b..6b3345758d3e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -278,6 +278,37 @@ setup_gate (void) ia64_patch_gate(); } +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + + return 0; +} +__initcall(gate_vma_init); + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return &gate_vma; +} + +int in_gate_area_no_mm(unsigned long addr) +{ + if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) + return 1; + return 0; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + return in_gate_area_no_mm(addr); +} + void ia64_mmu_init(void *my_cpu_data) { unsigned long pta, impl_va_bits; diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 87b7c7581b1d..3ff8c9a25335 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -91,6 +91,8 @@ config MMU_SUN3 config KEXEC bool "kexec system call" depends on M68KCLASSIC + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 900c7e5333b6..df51e78a72cc 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2396,6 +2396,8 @@ source "kernel/Kconfig.preempt" config KEXEC bool "Kexec system call" + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4bc7b62fb4b6..a577609f8ed6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -399,6 +399,8 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config KEXEC bool "kexec system call" depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 32e4e212b9c1..26fe1ae15212 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -48,9 +48,6 @@ extern unsigned int HPAGE_SHIFT; #define HUGE_MAX_HSTATE (MMU_PAGE_COUNT-1) #endif -/* We do define AT_SYSINFO_EHDR but don't use the gate mechanism */ -#define __HAVE_ARCH_GATE_AREA 1 - /* * Subtle: (1 << PAGE_SHIFT) is an int, not an unsigned long. So if we * assign PAGE_MASK to a larger type it gets extended the way we want diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index d0225572faa1..75d62d63fe68 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -149,13 +149,13 @@ static void check_smt_enabled(void) else if (!strcmp(smt_enabled_cmdline, "off")) smt_enabled_at_boot = 0; else { - long smt; + int smt; int rc; - rc = strict_strtol(smt_enabled_cmdline, 10, &smt); + rc = kstrtoint(smt_enabled_cmdline, 10, &smt); if (!rc) smt_enabled_at_boot = - min(threads_per_core, (int)smt); + min(threads_per_core, smt); } } else { dn = of_find_node_by_path("/options"); diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index ce74c335a6a4..f174351842cf 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -840,19 +840,3 @@ static int __init vdso_init(void) return 0; } arch_initcall(vdso_init); - -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 904c66128fae..5bfdab9047be 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -977,7 +977,7 @@ static ssize_t viodev_cmo_desired_set(struct device *dev, size_t new_desired; int ret; - ret = strict_strtoul(buf, 10, &new_desired); + ret = kstrtoul(buf, 10, &new_desired); if (ret) return ret; diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 2d0b4d68a40a..a2450b8a50a5 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -400,10 +400,10 @@ out: static ssize_t dlpar_cpu_probe(const char *buf, size_t count) { struct device_node *dn, *parent; - unsigned long drc_index; + u32 drc_index; int rc; - rc = strict_strtoul(buf, 0, &drc_index); + rc = kstrtou32(buf, 0, &drc_index); if (rc) return -EINVAL; diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index d146fef038b8..e7cb6d4a871a 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -320,7 +320,7 @@ static ssize_t migrate_store(struct class *class, struct class_attribute *attr, u64 streamid; int rc; - rc = strict_strtoull(buf, 0, &streamid); + rc = kstrtou64(buf, 0, &streamid); if (rc) return rc; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 05c78bb5f570..ab39ceb89ecf 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -48,6 +48,8 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KEXEC def_bool y + select CRYPTO + select CRYPTO_SHA256 config AUDIT_ARCH def_bool y diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 114258eeaacd..7b2ac6e44166 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -162,6 +162,4 @@ static inline int devmem_is_allowed(unsigned long pfn) #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> -#define __HAVE_ARCH_GATE_AREA 1 - #endif /* _S390_PAGE_H */ diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 613649096783..0bbb7e027c5a 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -316,18 +316,3 @@ static int __init vdso_init(void) return 0; } early_initcall(vdso_init); - -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index aa2df3eaeb29..453fa5c09550 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -595,6 +595,8 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" depends on SUPERH32 && MMU + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h index 15d970328f71..fe20d14ae051 100644 --- a/arch/sh/include/asm/page.h +++ b/arch/sh/include/asm/page.h @@ -186,11 +186,6 @@ typedef struct page *pgtable_t; #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> -/* vDSO support */ -#ifdef CONFIG_VSYSCALL -#define __HAVE_ARCH_GATE_AREA -#endif - /* * Some drivers need to perform DMA into kmalloc'ed buffers * and so we have to increase the kmalloc minalign for this. diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 5ca579720a09..ea2aa1393b87 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -92,18 +92,3 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -int in_gate_area(struct mm_struct *mm, unsigned long address) -{ - return 0; -} - -int in_gate_area_no_mm(unsigned long address) -{ - return 0; -} diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 7fcd492adbfc..a3ffe2dd4832 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -191,6 +191,8 @@ source "kernel/Kconfig.hz" config KEXEC bool "kexec system call" + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h index 672768008618..a213a8d84a95 100644 --- a/arch/tile/include/asm/page.h +++ b/arch/tile/include/asm/page.h @@ -39,12 +39,6 @@ #define HPAGE_MASK (~(HPAGE_SIZE - 1)) /* - * We do define AT_SYSINFO_EHDR to support vDSO, - * but don't use the gate mechanism. - */ -#define __HAVE_ARCH_GATE_AREA 1 - -/* * If the Kconfig doesn't specify, set a maximum zone order that * is enough so that we can create huge pages from small pages given * the respective sizes of the two page types. See <linux/mmzone.h>. diff --git a/arch/tile/kernel/vdso.c b/arch/tile/kernel/vdso.c index 1533af24106e..5bc51d7dfdcb 100644 --- a/arch/tile/kernel/vdso.c +++ b/arch/tile/kernel/vdso.c @@ -121,21 +121,6 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -int in_gate_area(struct mm_struct *mm, unsigned long address) -{ - return 0; -} - -int in_gate_area_no_mm(unsigned long address) -{ - return 0; -} - int setup_vdso_pages(void) { struct page **pagelist; diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 5ff53d9185f7..71c5d132062a 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -119,4 +119,9 @@ extern unsigned long uml_physmem; #include <asm-generic/getorder.h> #endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_X86_32 +#define __HAVE_ARCH_GATE_AREA 1 +#endif + #endif /* __UM_PAGE_H */ diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index e5287d8517aa..61b6d51866f8 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -16,3 +16,7 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ obj-y += net/ + +ifeq ($(CONFIG_X86_64),y) +obj-$(CONFIG_KEXEC) += purgatory/ +endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fcbe79ee86a2..541e7cc43324 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1584,6 +1584,9 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" + select BUILD_BIN2C + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1598,6 +1601,28 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_VERIFY_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on KEXEC + ---help--- + This option makes kernel signature verification mandatory for + kexec_file_load() syscall. If kernel is signature can not be + verified, kexec_file_load() will fail. + + This option enforces signature verification at generic level. + One needs to enable signature verification for type of kernel + image being loaded to make sure it works. For example, enable + bzImage signature verification option to be able to load and + verify signatures of bzImage. Otherwise kernel loading will fail. + +config KEXEC_BZIMAGE_VERIFY_SIG + bool "Enable bzImage signature verification support" + depends on KEXEC_VERIFY_SIG + depends on SIGNED_PE_FILE_VERIFICATION + select SYSTEM_TRUSTED_KEYRING + ---help--- + Enable bzImage signature verification support. + config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c65fd9650467..c1aa36887843 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -183,6 +183,14 @@ archscripts: scripts_basic archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all +archprepare: +ifeq ($(CONFIG_KEXEC),y) +# Build only for 64bit. No loaders for 32bit yet. + ifeq ($(CONFIG_X86_64),y) + $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c + endif +endif + ### # Kernel objects diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h new file mode 100644 index 000000000000..f498411f2500 --- /dev/null +++ b/arch/x86/include/asm/crash.h @@ -0,0 +1,9 @@ +#ifndef _ASM_X86_CRASH_H +#define _ASM_X86_CRASH_H + +int crash_load_segments(struct kimage *image); +int crash_copy_backup_region(struct kimage *image); +int crash_setup_memmap_entries(struct kimage *image, + struct boot_params *params); + +#endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h new file mode 100644 index 000000000000..d1b5d194e31d --- /dev/null +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -0,0 +1,6 @@ +#ifndef _ASM_KEXEC_BZIMAGE64_H +#define _ASM_KEXEC_BZIMAGE64_H + +extern struct kexec_file_ops kexec_bzImage64_ops; + +#endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 17483a492f18..d2434c1cad05 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -23,6 +23,9 @@ #include <asm/page.h> #include <asm/ptrace.h> +#include <asm/bootparam.h> + +struct kimage; /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. @@ -61,6 +64,10 @@ # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif +/* Memory to backup during crash kdump */ +#define KEXEC_BACKUP_SRC_START (0UL) +#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ + /* * CPU does not save ss and sp on stack if execution is already * running in kernel mode at the time of NMI occurrence. This code @@ -160,6 +167,44 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; + /* Details of backup region */ + unsigned long backup_src_start; + unsigned long backup_src_sz; + + /* Physical address of backup segment */ + unsigned long backup_load_addr; + + /* Core ELF header buffer */ + void *elf_headers; + unsigned long elf_headers_sz; + unsigned long elf_load_addr; +}; +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 +/* + * Number of elements and order of elements in this structure should match + * with the ones in arch/x86/purgatory/entry64.S. If you make a change here + * make an appropriate change in purgatory too. + */ +struct kexec_entry64_regs { + uint64_t rax; + uint64_t rcx; + uint64_t rdx; + uint64_t rbx; + uint64_t rsp; + uint64_t rbp; + uint64_t rsi; + uint64_t rdi; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rip; }; #endif diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 775873d3be55..802dde30c928 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -70,7 +70,6 @@ extern bool __virt_addr_valid(unsigned long kaddr); #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> -#define __HAVE_ARCH_GATE_AREA 1 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 0f1ddee6a0ce..f408caf73430 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,4 +39,6 @@ void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ +#define __HAVE_ARCH_GATE_AREA 1 + #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bde3993624f1..b5ea75c4a4b4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -118,4 +118,5 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o + obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 9c8f7394c612..c7035073dfc1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -461,7 +461,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - if (strict_strtoul(buf, 10, &val) < 0) + if (kstrtoul(buf, 10, &val) < 0) return -EINVAL; err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); @@ -511,7 +511,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; - if (strict_strtoul(buf, 16, &val) < 0) + if (kstrtoul(buf, 16, &val) < 0) return -EINVAL; if (amd_set_subcaches(cpu, val)) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4fc57975acc1..bd9ccda8087f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2136,7 +2136,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; attr_to_bank(attr)->ctl = new; @@ -2174,7 +2174,7 @@ static ssize_t set_ignore_ce(struct device *s, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; if (mca_cfg.ignore_ce ^ !!new) { @@ -2198,7 +2198,7 @@ static ssize_t set_cmci_disabled(struct device *s, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; if (mca_cfg.cmci_disabled ^ !!new) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 603df4f74640..1e49f8f41276 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -353,7 +353,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) if (!b->interrupt_capable) return -EINVAL; - if (strict_strtoul(buf, 0, &new) < 0) + if (kstrtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; @@ -372,7 +372,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) struct thresh_restart tr; unsigned long new; - if (strict_strtoul(buf, 0, &new) < 0) + if (kstrtoul(buf, 0, &new) < 0) return -EINVAL; if (new > THRESHOLD_MAX) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 507de8066594..0553a34fa0df 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -4,9 +4,14 @@ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) * * Copyright (C) IBM Corporation, 2004. All rights reserved. + * Copyright (C) Red Hat Inc., 2014. All rights reserved. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> * */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/smp.h> @@ -16,6 +21,7 @@ #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/module.h> +#include <linux/slab.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -28,6 +34,45 @@ #include <asm/reboot.h> #include <asm/virtext.h> +/* Alignment required for elf header segment */ +#define ELF_CORE_HEADER_ALIGN 4096 + +/* This primarily represents number of split ranges due to exclusion */ +#define CRASH_MAX_RANGES 16 + +struct crash_mem_range { + u64 start, end; +}; + +struct crash_mem { + unsigned int nr_ranges; + struct crash_mem_range ranges[CRASH_MAX_RANGES]; +}; + +/* Misc data about ram ranges needed to prepare elf headers */ +struct crash_elf_data { + struct kimage *image; + /* + * Total number of ram ranges we have after various adjustments for + * GART, crash reserved region etc. + */ + unsigned int max_nr_ranges; + unsigned long gart_start, gart_end; + + /* Pointer to elf header */ + void *ehdr; + /* Pointer to next phdr */ + void *bufp; + struct crash_mem mem; +}; + +/* Used while preparing memory map entries for second kernel */ +struct crash_memmap_data { + struct boot_params *params; + /* Type of memory */ + unsigned int type; +}; + int in_crash_kexec; /* @@ -39,6 +84,7 @@ int in_crash_kexec; */ crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); +unsigned long crash_zero_bytes; static inline void cpu_crash_vmclear_loaded_vmcss(void) { @@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif crash_save_cpu(regs, safe_smp_processor_id()); } + +#ifdef CONFIG_X86_64 + +static int get_nr_ram_ranges_callback(unsigned long start_pfn, + unsigned long nr_pfn, void *arg) +{ + int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int get_gart_ranges_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + + ced->gart_start = start; + ced->gart_end = end; + + /* Not expecting more than 1 gart aperture */ + return 1; +} + + +/* Gather all the required information to prepare elf headers for ram regions */ +static void fill_up_crash_elf_data(struct crash_elf_data *ced, + struct kimage *image) +{ + unsigned int nr_ranges = 0; + + ced->image = image; + + walk_system_ram_range(0, -1, &nr_ranges, + get_nr_ram_ranges_callback); + + ced->max_nr_ranges = nr_ranges; + + /* + * We don't create ELF headers for GART aperture as an attempt + * to dump this memory in second kernel leads to hang/crash. + * If gart aperture is present, one needs to exclude that region + * and that could lead to need of extra phdr. + */ + walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, + ced, get_gart_ranges_callback); + + /* + * If we have gart region, excluding that could potentially split + * a memory range, resulting in extra header. Account for that. + */ + if (ced->gart_end) + ced->max_nr_ranges++; + + /* Exclusion of crash region could split memory ranges */ + ced->max_nr_ranges++; + + /* If crashk_low_res is not 0, another range split possible */ + if (crashk_low_res.end != 0) + ced->max_nr_ranges++; +} + +static int exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happend, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == CRASH_MAX_RANGES - 1) { + pr_err("Too many crash ranges after split\n"); + return -ENOMEM; + } + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +/* + * Look for any unwanted ranges between mstart, mend and remove them. This + * might lead to split and split ranges are put in ced->mem.ranges[] array + */ +static int elf_header_exclude_ranges(struct crash_elf_data *ced, + unsigned long long mstart, unsigned long long mend) +{ + struct crash_mem *cmem = &ced->mem; + int ret = 0; + + memset(cmem->ranges, 0, sizeof(cmem->ranges)); + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude crashkernel region */ + ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + return ret; + + ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + return ret; + + /* Exclude GART region */ + if (ced->gart_end) { + ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); + if (ret) + return ret; + } + + return ret; +} + +static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long mstart, mend; + struct kimage *image = ced->image; + struct crash_mem *cmem; + int ret, i; + + ehdr = ced->ehdr; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(ced, start, end); + if (ret) + return ret; + + /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */ + cmem = &ced->mem; + + for (i = 0; i < cmem->nr_ranges; i++) { + mstart = cmem->ranges[i].start; + mend = cmem->ranges[i].end; + + phdr = ced->bufp; + ced->bufp += sizeof(Elf64_Phdr); + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + /* + * If a range matches backup region, adjust offset to backup + * segment. + */ + if (mstart == image->arch.backup_src_start && + (mend - mstart + 1) == image->arch.backup_src_sz) + phdr->p_offset = image->arch.backup_load_addr; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + return ret; +} + +static int prepare_elf64_headers(struct crash_elf_data *ced, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf, *bufp; + unsigned int cpu; + unsigned long long notes_addr; + int ret; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += ced->max_nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area on x86_64 (ffffffff80000000 - ffffffffa0000000). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. + */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + bufp = buf; + ehdr = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + (ehdr->e_phnum)++; + +#ifdef CONFIG_X86_64 + /* Prepare PT_LOAD type program header for kernel text region */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + (ehdr->e_phnum)++; +#endif + + /* Prepare PT_LOAD headers for system ram chunks. */ + ced->ehdr = ehdr; + ced->bufp = bufp; + ret = walk_system_ram_res(0, -1, ced, + prepare_elf64_ram_headers_callback); + if (ret < 0) + return ret; + + *addr = buf; + *sz = elf_sz; + return 0; +} + +/* Prepare elf headers. Return addr and size */ +static int prepare_elf_headers(struct kimage *image, void **addr, + unsigned long *sz) +{ + struct crash_elf_data *ced; + int ret; + + ced = kzalloc(sizeof(*ced), GFP_KERNEL); + if (!ced) + return -ENOMEM; + + fill_up_crash_elf_data(ced, image); + + /* By default prepare 64bit headers */ + ret = prepare_elf64_headers(ced, addr, sz); + kfree(ced); + return ret; +} + +static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = params->e820_entries; + if (nr_e820_entries >= E820MAX) + return 1; + + memcpy(¶ms->e820_map[nr_e820_entries], entry, + sizeof(struct e820entry)); + params->e820_entries++; + return 0; +} + +static int memmap_entry_callback(u64 start, u64 end, void *arg) +{ + struct crash_memmap_data *cmd = arg; + struct boot_params *params = cmd->params; + struct e820entry ei; + + ei.addr = start; + ei.size = end - start + 1; + ei.type = cmd->type; + add_e820_entry(params, &ei); + + return 0; +} + +static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, + unsigned long long mstart, + unsigned long long mend) +{ + unsigned long start, end; + int ret = 0; + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude Backup region */ + start = image->arch.backup_load_addr; + end = start + image->arch.backup_src_sz - 1; + ret = exclude_mem_range(cmem, start, end); + if (ret) + return ret; + + /* Exclude elf header region */ + start = image->arch.elf_load_addr; + end = start + image->arch.elf_headers_sz - 1; + return exclude_mem_range(cmem, start, end); +} + +/* Prepare memory map for crash dump kernel */ +int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) +{ + int i, ret = 0; + unsigned long flags; + struct e820entry ei; + struct crash_memmap_data cmd; + struct crash_mem *cmem; + + cmem = vzalloc(sizeof(struct crash_mem)); + if (!cmem) + return -ENOMEM; + + memset(&cmd, 0, sizeof(struct crash_memmap_data)); + cmd.params = params; + + /* Add first 640K segment */ + ei.addr = image->arch.backup_src_start; + ei.size = image->arch.backup_src_sz; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + + /* Add ACPI tables */ + cmd.type = E820_ACPI; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add ACPI Non-volatile Storage */ + cmd.type = E820_NVS; + walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add crashk_low_res region */ + if (crashk_low_res.end) { + ei.addr = crashk_low_res.start; + ei.size = crashk_low_res.end - crashk_low_res.start + 1; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + + /* Exclude some ranges from crashk_res and add rest to memmap */ + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, + crashk_res.end); + if (ret) + goto out; + + for (i = 0; i < cmem->nr_ranges; i++) { + ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; + + /* If entry is less than a page, skip it */ + if (ei.size < PAGE_SIZE) + continue; + ei.addr = cmem->ranges[i].start; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + +out: + vfree(cmem); + return ret; +} + +static int determine_backup_region(u64 start, u64 end, void *arg) +{ + struct kimage *image = arg; + + image->arch.backup_src_start = start; + image->arch.backup_src_sz = end - start + 1; + + /* Expecting only one range for backup region */ + return 1; +} + +int crash_load_segments(struct kimage *image) +{ + unsigned long src_start, src_sz, elf_sz; + void *elf_addr; + int ret; + + /* + * Determine and load a segment for backup area. First 640K RAM + * region is backup source + */ + + ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, + image, determine_backup_region); + + /* Zero or postive return values are ok */ + if (ret < 0) + return ret; + + src_start = image->arch.backup_src_start; + src_sz = image->arch.backup_src_sz; + + /* Add backup segment. */ + if (src_sz) { + /* + * Ideally there is no source for backup segment. This is + * copied in purgatory after crash. Just add a zero filled + * segment for now to make sure checksum logic works fine. + */ + ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, + sizeof(crash_zero_bytes), src_sz, + PAGE_SIZE, 0, -1, 0, + &image->arch.backup_load_addr); + if (ret) + return ret; + pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", + image->arch.backup_load_addr, src_start, src_sz); + } + + /* Prepare elf headers and add a segment */ + ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + if (ret) + return ret; + + image->arch.elf_headers = elf_addr; + image->arch.elf_headers_sz = elf_sz; + + ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, + ELF_CORE_HEADER_ALIGN, 0, -1, 0, + &image->arch.elf_load_addr); + if (ret) { + vfree((void *)image->arch.elf_headers); + return ret; + } + pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->arch.elf_load_addr, elf_sz, elf_sz); + + return ret; +} + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c new file mode 100644 index 000000000000..9642b9b33655 --- /dev/null +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -0,0 +1,553 @@ +/* + * Kexec bzImage loader + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) "kexec-bzImage64: " fmt + +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/efi.h> +#include <linux/verify_pefile.h> +#include <keys/system_keyring.h> + +#include <asm/bootparam.h> +#include <asm/setup.h> +#include <asm/crash.h> +#include <asm/efi.h> + +#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ + +/* + * Defines lowest physical address for various segments. Not sure where + * exactly these limits came from. Current bzimage64 loader in kexec-tools + * uses these so I am retaining it. It can be changed over time as we gain + * more insight. + */ +#define MIN_PURGATORY_ADDR 0x3000 +#define MIN_BOOTPARAM_ADDR 0x3000 +#define MIN_KERNEL_LOAD_ADDR 0x100000 +#define MIN_INITRD_LOAD_ADDR 0x1000000 + +/* + * This is a place holder for all boot loader specific data structure which + * gets allocated in one call but gets freed much later during cleanup + * time. Right now there is only one field but it can grow as need be. + */ +struct bzimage64_data { + /* + * Temporary buffer to hold bootparams buffer. This should be + * freed once the bootparam segment has been loaded. + */ + void *bootparams_buf; +}; + +static int setup_initrd(struct boot_params *params, + unsigned long initrd_load_addr, unsigned long initrd_len) +{ + params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL; + params->hdr.ramdisk_size = initrd_len & 0xffffffffUL; + + params->ext_ramdisk_image = initrd_load_addr >> 32; + params->ext_ramdisk_size = initrd_len >> 32; + + return 0; +} + +static int setup_cmdline(struct kimage *image, struct boot_params *params, + unsigned long bootparams_load_addr, + unsigned long cmdline_offset, char *cmdline, + unsigned long cmdline_len) +{ + char *cmdline_ptr = ((char *)params) + cmdline_offset; + unsigned long cmdline_ptr_phys, len; + uint32_t cmdline_low_32, cmdline_ext_32; + + memcpy(cmdline_ptr, cmdline, cmdline_len); + if (image->type == KEXEC_TYPE_CRASH) { + len = sprintf(cmdline_ptr + cmdline_len - 1, + " elfcorehdr=0x%lx", image->arch.elf_load_addr); + cmdline_len += len; + } + cmdline_ptr[cmdline_len - 1] = '\0'; + + pr_debug("Final command line is: %s\n", cmdline_ptr); + cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; + cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; + cmdline_ext_32 = cmdline_ptr_phys >> 32; + + params->hdr.cmd_line_ptr = cmdline_low_32; + if (cmdline_ext_32) + params->ext_cmd_line_ptr = cmdline_ext_32; + + return 0; +} + +static int setup_e820_entries(struct boot_params *params) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = e820_saved.nr_map; + + /* TODO: Pass entries more than E820MAX in bootparams setup data */ + if (nr_e820_entries > E820MAX) + nr_e820_entries = E820MAX; + + params->e820_entries = nr_e820_entries; + memcpy(¶ms->e820_map, &e820_saved.map, + nr_e820_entries * sizeof(struct e820entry)); + + return 0; +} + +#ifdef CONFIG_EFI +static int setup_efi_info_memmap(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, + unsigned int efi_map_sz) +{ + void *efi_map = (void *)params + efi_map_offset; + unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset; + struct efi_info *ei = ¶ms->efi_info; + + if (!efi_map_sz) + return 0; + + efi_runtime_map_copy(efi_map, efi_map_sz); + + ei->efi_memmap = efi_map_phys_addr & 0xffffffff; + ei->efi_memmap_hi = efi_map_phys_addr >> 32; + ei->efi_memmap_size = efi_map_sz; + + return 0; +} + +static int +prepare_add_efi_setup_data(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_setup_data_offset) +{ + unsigned long setup_data_phys; + struct setup_data *sd = (void *)params + efi_setup_data_offset; + struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); + + esd->fw_vendor = efi.fw_vendor; + esd->runtime = efi.runtime; + esd->tables = efi.config_table; + esd->smbios = efi.smbios; + + sd->type = SETUP_EFI; + sd->len = sizeof(struct efi_setup_data); + + /* Add setup data */ + setup_data_phys = params_load_addr + efi_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; + + return 0; +} + +static int +setup_efi_state(struct boot_params *params, unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + struct efi_info *current_ei = &boot_params.efi_info; + struct efi_info *ei = ¶ms->efi_info; + + if (!current_ei->efi_memmap_size) + return 0; + + /* + * If 1:1 mapping is not enabled, second kernel can not setup EFI + * and use EFI run time services. User space will have to pass + * acpi_rsdp=<addr> on kernel command line to make second kernel boot + * without efi. + */ + if (efi_enabled(EFI_OLD_MEMMAP)) + return 0; + + ei->efi_loader_signature = current_ei->efi_loader_signature; + ei->efi_systab = current_ei->efi_systab; + ei->efi_systab_hi = current_ei->efi_systab_hi; + + ei->efi_memdesc_version = current_ei->efi_memdesc_version; + ei->efi_memdesc_size = efi_get_runtime_map_desc_size(); + + setup_efi_info_memmap(params, params_load_addr, efi_map_offset, + efi_map_sz); + prepare_add_efi_setup_data(params, params_load_addr, + efi_setup_data_offset); + return 0; +} +#endif /* CONFIG_EFI */ + +static int +setup_boot_parameters(struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + unsigned int nr_e820_entries; + unsigned long long mem_k, start, end; + int i, ret = 0; + + /* Get subarch from existing bootparams */ + params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; + + /* Copying screen_info will do? */ + memcpy(¶ms->screen_info, &boot_params.screen_info, + sizeof(struct screen_info)); + + /* Fill in memsize later */ + params->screen_info.ext_mem_k = 0; + params->alt_mem_k = 0; + + /* Default APM info */ + memset(¶ms->apm_bios_info, 0, sizeof(params->apm_bios_info)); + + /* Default drive info */ + memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); + memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); + + /* Default sysdesc table */ + params->sys_desc_table.length = 0; + + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_setup_memmap_entries(image, params); + if (ret) + return ret; + } else + setup_e820_entries(params); + + nr_e820_entries = params->e820_entries; + + for (i = 0; i < nr_e820_entries; i++) { + if (params->e820_map[i].type != E820_RAM) + continue; + start = params->e820_map[i].addr; + end = params->e820_map[i].addr + params->e820_map[i].size - 1; + + if ((start <= 0x100000) && end > 0x100000) { + mem_k = (end >> 10) - (0x100000 >> 10); + params->screen_info.ext_mem_k = mem_k; + params->alt_mem_k = mem_k; + if (mem_k > 0xfc00) + params->screen_info.ext_mem_k = 0xfc00; /* 64M*/ + if (mem_k > 0xffffffff) + params->alt_mem_k = 0xffffffff; + } + } + +#ifdef CONFIG_EFI + /* Setup EFI state */ + setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, + efi_setup_data_offset); +#endif + + /* Setup EDD info */ + memcpy(params->eddbuf, boot_params.eddbuf, + EDDMAXNR * sizeof(struct edd_info)); + params->eddbuf_entries = boot_params.eddbuf_entries; + + memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, + EDD_MBR_SIG_MAX * sizeof(unsigned int)); + + return ret; +} + +int bzImage64_probe(const char *buf, unsigned long len) +{ + int ret = -ENOEXEC; + struct setup_header *header; + + /* kernel should be atleast two sectors long */ + if (len < 2 * 512) { + pr_err("File is too short to be a bzImage\n"); + return ret; + } + + header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr)); + if (memcmp((char *)&header->header, "HdrS", 4) != 0) { + pr_err("Not a bzImage\n"); + return ret; + } + + if (header->boot_flag != 0xAA55) { + pr_err("No x86 boot sector present\n"); + return ret; + } + + if (header->version < 0x020C) { + pr_err("Must be at least protocol version 2.12\n"); + return ret; + } + + if (!(header->loadflags & LOADED_HIGH)) { + pr_err("zImage not a bzImage\n"); + return ret; + } + + if (!(header->xloadflags & XLF_KERNEL_64)) { + pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) { + pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n"); + return ret; + } + + /* + * Can't handle 32bit EFI as it does not allow loading kernel + * above 4G. This should be handled by 32bit bzImage loader + */ + if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) { + pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n"); + return ret; + } + + /* I've got a bzImage */ + pr_debug("It's a relocatable bzImage64\n"); + ret = 0; + + return ret; +} + +void *bzImage64_load(struct kimage *image, char *kernel, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + + struct setup_header *header; + int setup_sects, kern16_size, ret = 0; + unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + struct boot_params *params; + unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; + unsigned long purgatory_load_addr; + unsigned long kernel_bufsz, kernel_memsz, kernel_align; + char *kernel_buf; + struct bzimage64_data *ldata; + struct kexec_entry64_regs regs64; + void *stack; + unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); + unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + + header = (struct setup_header *)(kernel + setup_hdr_offset); + setup_sects = header->setup_sects; + if (setup_sects == 0) + setup_sects = 4; + + kern16_size = (setup_sects + 1) * 512; + if (kernel_len < kern16_size) { + pr_err("bzImage truncated\n"); + return ERR_PTR(-ENOEXEC); + } + + if (cmdline_len > header->cmdline_size) { + pr_err("Kernel command line too long\n"); + return ERR_PTR(-EINVAL); + } + + /* + * In case of crash dump, we will append elfcorehdr=<addr> to + * command line. Make sure it does not overflow + */ + if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { + pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } + + /* Allocate and load backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_load_segments(image); + if (ret) + return ERR_PTR(ret); + } + + /* + * Load purgatory. For 64bit entry point, purgatory code can be + * anywhere. + */ + ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, + &purgatory_load_addr); + if (ret) { + pr_err("Loading purgatory failed\n"); + return ERR_PTR(ret); + } + + pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + + + /* + * Load Bootparams and cmdline and space for efi stuff. + * + * Allocate memory together for multiple data structures so + * that they all can go in single area/segment and we don't + * have to create separate segment for each. Keeps things + * little bit simple + */ + efi_map_sz = efi_get_runtime_map_size(); + efi_map_sz = ALIGN(efi_map_sz, 16); + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + + MAX_ELFCOREHDR_STR_LEN; + params_cmdline_sz = ALIGN(params_cmdline_sz, 16); + params_misc_sz = params_cmdline_sz + efi_map_sz + + sizeof(struct setup_data) + + sizeof(struct efi_setup_data); + + params = kzalloc(params_misc_sz, GFP_KERNEL); + if (!params) + return ERR_PTR(-ENOMEM); + efi_map_offset = params_cmdline_sz; + efi_setup_data_offset = efi_map_offset + efi_map_sz; + + /* Copy setup header onto bootparams. Documentation/x86/boot.txt */ + setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; + + /* Is there a limit on setup header size? */ + memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); + + ret = kexec_add_buffer(image, (char *)params, params_misc_sz, + params_misc_sz, 16, MIN_BOOTPARAM_ADDR, + ULONG_MAX, 1, &bootparam_load_addr); + if (ret) + goto out_free_params; + pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, params_misc_sz, params_misc_sz); + + /* Load kernel */ + kernel_buf = kernel + kern16_size; + kernel_bufsz = kernel_len - kern16_size; + kernel_memsz = PAGE_ALIGN(header->init_size); + kernel_align = header->kernel_alignment; + + ret = kexec_add_buffer(image, kernel_buf, + kernel_bufsz, kernel_memsz, kernel_align, + MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, + &kernel_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kernel_memsz, kernel_memsz); + + /* Load initrd high */ + if (initrd) { + ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, + PAGE_SIZE, MIN_INITRD_LOAD_ADDR, + ULONG_MAX, 1, &initrd_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); + + setup_initrd(params, initrd_load_addr, initrd_len); + } + + setup_cmdline(image, params, bootparam_load_addr, + sizeof(struct boot_params), cmdline, cmdline_len); + + /* bootloader info. Do we need a separate ID for kexec kernel loader? */ + params->hdr.type_of_loader = 0x0D << 4; + params->hdr.loadflags = 0; + + /* Setup purgatory regs for entry */ + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 1); + if (ret) + goto out_free_params; + + regs64.rbx = 0; /* Bootstrap Processor */ + regs64.rsi = bootparam_load_addr; + regs64.rip = kernel_load_addr + 0x200; + stack = kexec_purgatory_get_symbol_addr(image, "stack_end"); + if (IS_ERR(stack)) { + pr_err("Could not find address of symbol stack_end\n"); + ret = -EINVAL; + goto out_free_params; + } + + regs64.rsp = (unsigned long)stack; + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 0); + if (ret) + goto out_free_params; + + ret = setup_boot_parameters(image, params, bootparam_load_addr, + efi_map_offset, efi_map_sz, + efi_setup_data_offset); + if (ret) + goto out_free_params; + + /* Allocate loader specific data */ + ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); + if (!ldata) { + ret = -ENOMEM; + goto out_free_params; + } + + /* + * Store pointer to params so that it could be freed after loading + * params segment has been loaded and contents have been copied + * somewhere else. + */ + ldata->bootparams_buf = params; + return ldata; + +out_free_params: + kfree(params); + return ERR_PTR(ret); +} + +/* This cleanup function is called after various segments have been loaded */ +int bzImage64_cleanup(void *loader_data) +{ + struct bzimage64_data *ldata = loader_data; + + if (!ldata) + return 0; + + kfree(ldata->bootparams_buf); + ldata->bootparams_buf = NULL; + + return 0; +} + +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG +int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) +{ + bool trusted; + int ret; + + ret = verify_pefile_signature(kernel, kernel_len, + system_trusted_keyring, &trusted); + if (ret < 0) + return ret; + if (!trusted) + return -EKEYREJECTED; + return 0; +} +#endif + +struct kexec_file_ops kexec_bzImage64_ops = { + .probe = bzImage64_probe, + .load = bzImage64_load, + .cleanup = bzImage64_cleanup, +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG + .verify_sig = bzImage64_verify_sig, +#endif +}; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 679cef0791cd..8b04018e5d1f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> @@ -21,6 +23,11 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/debugreg.h> +#include <asm/kexec-bzimage64.h> + +static struct kexec_file_ops *kexec_file_loaders[] = { + &kexec_bzImage64_ops, +}; static void free_transition_pgtable(struct kimage *image) { @@ -171,6 +178,38 @@ static void load_segments(void) ); } +/* Update purgatory as needed after various image segments have been prepared */ +static int arch_update_purgatory(struct kimage *image) +{ + int ret = 0; + + if (!image->file_mode) + return 0; + + /* Setup copying of backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + &image->arch.backup_load_addr, + sizeof(image->arch.backup_load_addr), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_src", + &image->arch.backup_src_start, + sizeof(image->arch.backup_src_start), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + &image->arch.backup_src_sz, + sizeof(image->arch.backup_src_sz), 0); + if (ret) + return ret; + } + + return ret; +} + int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -184,6 +223,11 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; + /* update purgatory as needed */ + result = arch_update_purgatory(image); + if (result) + return result; + return 0; } @@ -283,3 +327,198 @@ void arch_crash_save_vmcoreinfo(void) (unsigned long)&_text - __START_KERNEL); } +/* arch-dependent functionality related to kexec file-based syscall */ + +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + int i, ret = -ENOEXEC; + struct kexec_file_ops *fops; + + for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { + fops = kexec_file_loaders[i]; + if (!fops || !fops->probe) + continue; + + ret = fops->probe(buf, buf_len); + if (!ret) { + image->fops = fops; + return ret; + } + } + + return ret; +} + +void *arch_kexec_kernel_image_load(struct kimage *image) +{ + vfree(image->arch.elf_headers); + image->arch.elf_headers = NULL; + + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); +} + +int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, + unsigned long kernel_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification."); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(kernel, kernel_len); +} + +/* + * Apply purgatory relocations. + * + * ehdr: Pointer to elf headers + * sechdrs: Pointer to section headers. + * relsec: section index of SHT_RELA section. + * + * TODO: Some of the code belongs to generic code. Move that in kexec.c. + */ +int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, + Elf64_Shdr *sechdrs, unsigned int relsec) +{ + unsigned int i; + Elf64_Rela *rel; + Elf64_Sym *sym; + void *location; + Elf64_Shdr *section, *symtabsec; + unsigned long address, sec_base, value; + const char *strtab, *name, *shstrtab; + + /* + * ->sh_offset has been modified to keep the pointer to section + * contents in memory + */ + rel = (void *)sechdrs[relsec].sh_offset; + + /* Section to which relocations apply */ + section = &sechdrs[sechdrs[relsec].sh_info]; + + pr_debug("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + + /* Associated symbol table */ + symtabsec = &sechdrs[sechdrs[relsec].sh_link]; + + /* String table */ + if (symtabsec->sh_link >= ehdr->e_shnum) { + /* Invalid strtab section number */ + pr_err("Invalid string table section index %d\n", + symtabsec->sh_link); + return -ENOEXEC; + } + + strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + + /* section header string table */ + shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + + /* + * rel[i].r_offset contains byte offset from beginning + * of section to the storage unit affected. + * + * This is location to update (->sh_offset). This is temporary + * buffer where section is currently loaded. This will finally + * be loaded to a different address later, pointed to by + * ->sh_addr. kexec takes care of moving it + * (kexec_load_segment()). + */ + location = (void *)(section->sh_offset + rel[i].r_offset); + + /* Final address of the location */ + address = section->sh_addr + rel[i].r_offset; + + /* + * rel[i].r_info contains information about symbol table index + * w.r.t which relocation must be made and type of relocation + * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get + * these respectively. + */ + sym = (Elf64_Sym *)symtabsec->sh_offset + + ELF64_R_SYM(rel[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", + name, sym->st_info, sym->st_shndx, sym->st_value, + sym->st_size); + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = sechdrs[sym->st_shndx].sh_addr; + + value = sym->st_value; + value += sec_base; + value += rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)location = value; + break; + case R_X86_64_32: + *(u32 *)location = value; + if (value != *(u32 *)location) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)location = value; + if ((s64)value != *(s32 *)location) + goto overflow; + break; + case R_X86_64_PC32: + value -= (u64)address; + *(u32 *)location = value; + break; + default: + pr_err("Unknown rela relocation: %llu\n", + ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("Overflow in relocation type %d value 0x%lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), value); + return -ENOEXEC; +} diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 1185fe7a7f47..9ade5cfb5a4c 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -273,7 +273,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp) int ret; unsigned long enable; - ret = strict_strtoul(val, 10, &enable); + ret = kstrtoul(val, 10, &enable); if (ret < 0) return -EINVAL; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index ed161c6e278b..3968d67d366b 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1479,7 +1479,7 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, return count; } - if (strict_strtol(optstr, 10, &input_arg) < 0) { + if (kstrtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; } diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile new file mode 100644 index 000000000000..7fde9ee438a4 --- /dev/null +++ b/arch/x86/purgatory/Makefile @@ -0,0 +1,30 @@ +purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o + +targets += $(purgatory-y) +PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) + +LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib +targets += purgatory.ro + +# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That +# in turn leaves some undefined symbols like __fentry__ in purgatory and not +# sure how to relocate those. Like kexec-tools, use custom flags. + +KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large + +$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE + $(call if_changed,ld) + +targets += kexec-purgatory.c + +quiet_cmd_bin2c = BIN2C $@ + cmd_bin2c = cat $(obj)/purgatory.ro | $(objtree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c + +$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE + $(call if_changed,bin2c) + + +# No loaders for 32bits yet. +ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_KEXEC) += kexec-purgatory.o +endif diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S new file mode 100644 index 000000000000..d1a4291d3568 --- /dev/null +++ b/arch/x86/purgatory/entry64.S @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) + * Copyright (C) 2014 Red Hat Inc. + + * Author(s): Vivek Goyal <vgoyal@redhat.com> + * + * This code has been taken from kexec-tools. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + .text + .balign 16 + .code64 + .globl entry64, entry64_regs + + +entry64: + /* Setup a gdt that should be preserved */ + lgdt gdt(%rip) + + /* load the data segments */ + movl $0x18, %eax /* data segment */ + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + movl %eax, %fs + movl %eax, %gs + + /* Setup new stack */ + leaq stack_init(%rip), %rsp + pushq $0x10 /* CS */ + leaq new_cs_exit(%rip), %rax + pushq %rax + lretq +new_cs_exit: + + /* Load the registers */ + movq rax(%rip), %rax + movq rbx(%rip), %rbx + movq rcx(%rip), %rcx + movq rdx(%rip), %rdx + movq rsi(%rip), %rsi + movq rdi(%rip), %rdi + movq rsp(%rip), %rsp + movq rbp(%rip), %rbp + movq r8(%rip), %r8 + movq r9(%rip), %r9 + movq r10(%rip), %r10 + movq r11(%rip), %r11 + movq r12(%rip), %r12 + movq r13(%rip), %r13 + movq r14(%rip), %r14 + movq r15(%rip), %r15 + + /* Jump to the new code... */ + jmpq *rip(%rip) + + .section ".rodata" + .balign 4 +entry64_regs: +rax: .quad 0x0 +rcx: .quad 0x0 +rdx: .quad 0x0 +rbx: .quad 0x0 +rsp: .quad 0x0 +rbp: .quad 0x0 +rsi: .quad 0x0 +rdi: .quad 0x0 +r8: .quad 0x0 +r9: .quad 0x0 +r10: .quad 0x0 +r11: .quad 0x0 +r12: .quad 0x0 +r13: .quad 0x0 +r14: .quad 0x0 +r15: .quad 0x0 +rip: .quad 0x0 + .size entry64_regs, . - entry64_regs + + /* GDT */ + .section ".rodata" + .balign 16 +gdt: + /* 0x00 unusable segment + * 0x08 unused + * so use them as gdt ptr + */ + .word gdt_end - gdt - 1 + .quad gdt + .word 0, 0, 0 + + /* 0x10 4GB flat code segment */ + .word 0xFFFF, 0x0000, 0x9A00, 0x00AF + + /* 0x18 4GB flat data segment */ + .word 0xFFFF, 0x0000, 0x9200, 0x00CF +gdt_end: +stack: .quad 0, 0 +stack_init: diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c new file mode 100644 index 000000000000..25e068ba3382 --- /dev/null +++ b/arch/x86/purgatory/purgatory.c @@ -0,0 +1,72 @@ +/* + * purgatory: Runs between two kernels + * + * Copyright (C) 2014 Red Hat Inc. + * + * Author: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include "sha256.h" +#include "../boot/string.h" + +struct sha_region { + unsigned long start; + unsigned long len; +}; + +unsigned long backup_dest = 0; +unsigned long backup_src = 0; +unsigned long backup_sz = 0; + +u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 }; + +struct sha_region sha_regions[16] = {}; + +/* + * On x86, second kernel requries first 640K of memory to boot. Copy + * first 640K to a backup region in reserved memory range so that second + * kernel can use first 640K. + */ +static int copy_backup_region(void) +{ + if (backup_dest) + memcpy((void *)backup_dest, (void *)backup_src, backup_sz); + + return 0; +} + +int verify_sha256_digest(void) +{ + struct sha_region *ptr, *end; + u8 digest[SHA256_DIGEST_SIZE]; + struct sha256_state sctx; + + sha256_init(&sctx); + end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])]; + for (ptr = sha_regions; ptr < end; ptr++) + sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); + + sha256_final(&sctx, digest); + + if (memcmp(digest, sha256_digest, sizeof(digest))) + return 1; + + return 0; +} + +void purgatory(void) +{ + int ret; + + ret = verify_sha256_digest(); + if (ret) { + /* loop forever */ + for (;;) + ; + } + copy_backup_region(); +} diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S new file mode 100644 index 000000000000..fe3c91ba1bd0 --- /dev/null +++ b/arch/x86/purgatory/setup-x86_64.S @@ -0,0 +1,58 @@ +/* + * purgatory: setup code + * + * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) + * Copyright (C) 2014 Red Hat Inc. + * + * This code has been taken from kexec-tools. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + .text + .globl purgatory_start + .balign 16 +purgatory_start: + .code64 + + /* Load a gdt so I know what the segment registers are */ + lgdt gdt(%rip) + + /* load the data segments */ + movl $0x18, %eax /* data segment */ + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + movl %eax, %fs + movl %eax, %gs + + /* Setup a stack */ + leaq lstack_end(%rip), %rsp + + /* Call the C code */ + call purgatory + jmp entry64 + + .section ".rodata" + .balign 16 +gdt: /* 0x00 unusable segment + * 0x08 unused + * so use them as the gdt ptr + */ + .word gdt_end - gdt - 1 + .quad gdt + .word 0, 0, 0 + + /* 0x10 4GB flat code segment */ + .word 0xFFFF, 0x0000, 0x9A00, 0x00AF + + /* 0x18 4GB flat data segment */ + .word 0xFFFF, 0x0000, 0x9200, 0x00CF +gdt_end: + + .bss + .balign 4096 +lstack: + .skip 4096 +lstack_end: diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c new file mode 100644 index 000000000000..548ca675a14a --- /dev/null +++ b/arch/x86/purgatory/sha256.c @@ -0,0 +1,283 @@ +/* + * SHA-256, as specified in + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + * + * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * Copyright (c) 2014 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/bitops.h> +#include <asm/byteorder.h> +#include "sha256.h" +#include "../boot/string.h" + +static inline u32 Ch(u32 x, u32 y, u32 z) +{ + return z ^ (x & (y ^ z)); +} + +static inline u32 Maj(u32 x, u32 y, u32 z) +{ + return (x & y) | (z & (x | y)); +} + +#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) +#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) +#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) +#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) + +static inline void LOAD_OP(int I, u32 *W, const u8 *input) +{ + W[I] = __be32_to_cpu(((__be32 *)(input))[I]); +} + +static inline void BLEND_OP(int I, u32 *W) +{ + W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; +} + +static void sha256_transform(u32 *state, const u8 *input) +{ + u32 a, b, c, d, e, f, g, h, t1, t2; + u32 W[64]; + int i; + + /* load the input */ + for (i = 0; i < 16; i++) + LOAD_OP(i, W, input); + + /* now blend */ + for (i = 16; i < 64; i++) + BLEND_OP(i, W); + + /* load the state into our registers */ + a = state[0]; b = state[1]; c = state[2]; d = state[3]; + e = state[4]; f = state[5]; g = state[6]; h = state[7]; + + /* now iterate */ + t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + state[0] += a; state[1] += b; state[2] += c; state[3] += d; + state[4] += e; state[5] += f; state[6] += g; state[7] += h; + + /* clear any sensitive info... */ + a = b = c = d = e = f = g = h = t1 = t2 = 0; + memset(W, 0, 64 * sizeof(u32)); +} + +int sha256_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) +{ + unsigned int partial, done; + const u8 *src; + + partial = sctx->count & 0x3f; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) > 63) { + if (partial) { + done = -partial; + memcpy(sctx->buf + partial, data, done + 64); + src = sctx->buf; + } + + do { + sha256_transform(sctx->state, src); + done += 64; + src = data + done; + } while (done + 63 < len); + + partial = 0; + } + memcpy(sctx->buf + partial, src, len - done); + + return 0; +} + +int sha256_final(struct sha256_state *sctx, u8 *out) +{ + __be32 *dst = (__be32 *)out; + __be64 bits; + unsigned int index, pad_len; + int i; + static const u8 padding[64] = { 0x80, }; + + /* Save number of bits */ + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64. */ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + sha256_update(sctx, padding, pad_len); + + /* Append length (before padding) */ + sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Zeroize sensitive information. */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h new file mode 100644 index 000000000000..bd15a4127735 --- /dev/null +++ b/arch/x86/purgatory/sha256.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 Red Hat Inc. + * + * Author: Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#ifndef SHA256_H +#define SHA256_H + + +#include <linux/types.h> +#include <crypto/sha.h> + +extern int sha256_init(struct sha256_state *sctx); +extern int sha256_update(struct sha256_state *sctx, const u8 *input, + unsigned int length); +extern int sha256_final(struct sha256_state *sctx, u8 *hash); + +#endif /* SHA256_H */ diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S new file mode 100644 index 000000000000..3cefba1fefc8 --- /dev/null +++ b/arch/x86/purgatory/stack.S @@ -0,0 +1,19 @@ +/* + * purgatory: stack + * + * Copyright (C) 2014 Red Hat Inc. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + /* A stack for the loaded kernel. + * Seperate and in the data section so it can be prepopulated. + */ + .data + .balign 4096 + .globl stack, stack_end + +stack: + .skip 4096 +stack_end: diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c new file mode 100644 index 000000000000..d886b1fa36f0 --- /dev/null +++ b/arch/x86/purgatory/string.c @@ -0,0 +1,13 @@ +/* + * Simple string functions. + * + * Copyright (C) 2014 Red Hat Inc. + * + * Author: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include "../boot/string.c" diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index d1b4a119d4a5..028b78168d85 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -362,3 +362,4 @@ 353 i386 renameat2 sys_renameat2 354 i386 seccomp sys_seccomp 355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 252c804bb1aa..35dd922727b9 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -325,6 +325,8 @@ 316 common renameat2 sys_renameat2 317 common seccomp sys_seccomp 318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 0feee2fd5077..25a1022dd793 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -216,6 +216,5 @@ extern long elf_aux_hwcap; #define ELF_HWCAP (elf_aux_hwcap) #define SET_PERSONALITY(ex) do ; while(0) -#define __HAVE_ARCH_GATE_AREA 1 #endif diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c index c6492e75797b..f8fecaddcc0d 100644 --- a/arch/x86/um/mem_64.c +++ b/arch/x86/um/mem_64.c @@ -9,18 +9,3 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index fd57829b30d8..0224987556ce 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -109,16 +109,18 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, /* Validate mapping addresses. */ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { - if (!syms[i]) + INT_BITS symval = syms[special_pages[i]]; + + if (!symval) continue; /* The mapping isn't used; ignore it. */ - if (syms[i] % 4096) + if (symval % 4096) fail("%s must be a multiple of 4096\n", required_syms[i].name); - if (syms[sym_vvar_start] > syms[i] + 4096) - fail("%s underruns begin_vvar\n", + if (symval + 4096 < syms[sym_vvar_start]) + fail("%s underruns vvar_start\n", required_syms[i].name); - if (syms[i] + 4096 > 0) + if (symval + 4096 > 0) fail("%s is on the wrong side of the vdso text\n", required_syms[i].name); } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index e4f7781ee162..e904c270573b 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -115,23 +115,6 @@ static __init int ia32_binfmt_init(void) return 0; } __initcall(ia32_binfmt_init); -#endif - -#else /* CONFIG_X86_32 */ - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} +#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_X86_64 */ diff --git a/drivers/atm/he.c b/drivers/atm/he.c index aa6be2698669..c39702bc279d 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -533,14 +533,13 @@ static void he_init_tx_lbfp(struct he_dev *he_dev) static int he_init_tpdrq(struct he_dev *he_dev) { - he_dev->tpdrq_base = pci_alloc_consistent(he_dev->pci_dev, - CONFIG_TPDRQ_SIZE * sizeof(struct he_tpdrq), &he_dev->tpdrq_phys); + he_dev->tpdrq_base = pci_zalloc_consistent(he_dev->pci_dev, + CONFIG_TPDRQ_SIZE * sizeof(struct he_tpdrq), + &he_dev->tpdrq_phys); if (he_dev->tpdrq_base == NULL) { hprintk("failed to alloc tpdrq\n"); return -ENOMEM; } - memset(he_dev->tpdrq_base, 0, - CONFIG_TPDRQ_SIZE * sizeof(struct he_tpdrq)); he_dev->tpdrq_tail = he_dev->tpdrq_base; he_dev->tpdrq_head = he_dev->tpdrq_base; @@ -804,13 +803,13 @@ static int he_init_group(struct he_dev *he_dev, int group) goto out_free_rbpl_virt; } - he_dev->rbpl_base = pci_alloc_consistent(he_dev->pci_dev, - CONFIG_RBPL_SIZE * sizeof(struct he_rbp), &he_dev->rbpl_phys); + he_dev->rbpl_base = pci_zalloc_consistent(he_dev->pci_dev, + CONFIG_RBPL_SIZE * sizeof(struct he_rbp), + &he_dev->rbpl_phys); if (he_dev->rbpl_base == NULL) { hprintk("failed to alloc rbpl_base\n"); goto out_destroy_rbpl_pool; } - memset(he_dev->rbpl_base, 0, CONFIG_RBPL_SIZE * sizeof(struct he_rbp)); INIT_LIST_HEAD(&he_dev->rbpl_outstanding); @@ -843,13 +842,13 @@ static int he_init_group(struct he_dev *he_dev, int group) /* rx buffer ready queue */ - he_dev->rbrq_base = pci_alloc_consistent(he_dev->pci_dev, - CONFIG_RBRQ_SIZE * sizeof(struct he_rbrq), &he_dev->rbrq_phys); + he_dev->rbrq_base = pci_zalloc_consistent(he_dev->pci_dev, + CONFIG_RBRQ_SIZE * sizeof(struct he_rbrq), + &he_dev->rbrq_phys); if (he_dev->rbrq_base == NULL) { hprintk("failed to allocate rbrq\n"); goto out_free_rbpl; } - memset(he_dev->rbrq_base, 0, CONFIG_RBRQ_SIZE * sizeof(struct he_rbrq)); he_dev->rbrq_head = he_dev->rbrq_base; he_writel(he_dev, he_dev->rbrq_phys, G0_RBRQ_ST + (group * 16)); @@ -867,13 +866,13 @@ static int he_init_group(struct he_dev *he_dev, int group) /* tx buffer ready queue */ - he_dev->tbrq_base = pci_alloc_consistent(he_dev->pci_dev, - CONFIG_TBRQ_SIZE * sizeof(struct he_tbrq), &he_dev->tbrq_phys); + he_dev->tbrq_base = pci_zalloc_consistent(he_dev->pci_dev, + CONFIG_TBRQ_SIZE * sizeof(struct he_tbrq), + &he_dev->tbrq_phys); if (he_dev->tbrq_base == NULL) { hprintk("failed to allocate tbrq\n"); goto out_free_rbpq_base; } - memset(he_dev->tbrq_base, 0, CONFIG_TBRQ_SIZE * sizeof(struct he_tbrq)); he_dev->tbrq_head = he_dev->tbrq_base; @@ -1460,13 +1459,13 @@ static int he_start(struct atm_dev *dev) /* host status page */ - he_dev->hsp = pci_alloc_consistent(he_dev->pci_dev, - sizeof(struct he_hsp), &he_dev->hsp_phys); + he_dev->hsp = pci_zalloc_consistent(he_dev->pci_dev, + sizeof(struct he_hsp), + &he_dev->hsp_phys); if (he_dev->hsp == NULL) { hprintk("failed to allocate host status page\n"); return -ENOMEM; } - memset(he_dev->hsp, 0, sizeof(struct he_hsp)); he_writel(he_dev, he_dev->hsp_phys, HSP_BA); /* initialize framer */ diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index b621f56a36be..2b24ed056728 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -641,13 +641,11 @@ alloc_scq(struct idt77252_dev *card, int class) scq = kzalloc(sizeof(struct scq_info), GFP_KERNEL); if (!scq) return NULL; - scq->base = pci_alloc_consistent(card->pcidev, SCQ_SIZE, - &scq->paddr); + scq->base = pci_zalloc_consistent(card->pcidev, SCQ_SIZE, &scq->paddr); if (scq->base == NULL) { kfree(scq); return NULL; } - memset(scq->base, 0, SCQ_SIZE); scq->next = scq->base; scq->last = scq->base + (SCQ_ENTRIES - 1); @@ -972,13 +970,12 @@ init_rsq(struct idt77252_dev *card) { struct rsq_entry *rsqe; - card->rsq.base = pci_alloc_consistent(card->pcidev, RSQSIZE, - &card->rsq.paddr); + card->rsq.base = pci_zalloc_consistent(card->pcidev, RSQSIZE, + &card->rsq.paddr); if (card->rsq.base == NULL) { printk("%s: can't allocate RSQ.\n", card->name); return -1; } - memset(card->rsq.base, 0, RSQSIZE); card->rsq.last = card->rsq.base + RSQ_NUM_ENTRIES - 1; card->rsq.next = card->rsq.last; @@ -3400,14 +3397,14 @@ static int init_card(struct atm_dev *dev) writel(0, SAR_REG_GP); /* Initialize RAW Cell Handle Register */ - card->raw_cell_hnd = pci_alloc_consistent(card->pcidev, 2 * sizeof(u32), - &card->raw_cell_paddr); + card->raw_cell_hnd = pci_zalloc_consistent(card->pcidev, + 2 * sizeof(u32), + &card->raw_cell_paddr); if (!card->raw_cell_hnd) { printk("%s: memory allocation failure.\n", card->name); deinit_card(card); return -1; } - memset(card->raw_cell_hnd, 0, 2 * sizeof(u32)); writel(card->raw_cell_paddr, SAR_REG_RAWHND); IPRINTK("%s: raw cell handle is at 0x%p.\n", card->name, card->raw_cell_hnd); diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 125d84505738..811e11c82f32 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -6741,11 +6741,11 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, ErrorCode = -ENOMEM; if (DataTransferLength > 0) { - DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, - DataTransferLength, &DataTransferBufferDMA); + DataTransferBuffer = pci_zalloc_consistent(Controller->PCIDevice, + DataTransferLength, + &DataTransferBufferDMA); if (DataTransferBuffer == NULL) break; - memset(DataTransferBuffer, 0, DataTransferLength); } else if (DataTransferLength < 0) { @@ -6877,11 +6877,11 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, ErrorCode = -ENOMEM; if (DataTransferLength > 0) { - DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, - DataTransferLength, &DataTransferBufferDMA); + DataTransferBuffer = pci_zalloc_consistent(Controller->PCIDevice, + DataTransferLength, + &DataTransferBufferDMA); if (DataTransferBuffer == NULL) break; - memset(DataTransferBuffer, 0, DataTransferLength); } else if (DataTransferLength < 0) { @@ -6899,14 +6899,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, RequestSenseLength = UserCommand.RequestSenseLength; if (RequestSenseLength > 0) { - RequestSenseBuffer = pci_alloc_consistent(Controller->PCIDevice, - RequestSenseLength, &RequestSenseBufferDMA); + RequestSenseBuffer = pci_zalloc_consistent(Controller->PCIDevice, + RequestSenseLength, + &RequestSenseBufferDMA); if (RequestSenseBuffer == NULL) { ErrorCode = -ENOMEM; goto Failure2; } - memset(RequestSenseBuffer, 0, RequestSenseLength); } spin_lock_irqsave(&Controller->queue_lock, flags); while ((Command = DAC960_AllocateCommand(Controller)) == NULL) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 4595c22f33f7..ff20f192b0f6 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1014,24 +1014,21 @@ static CommandList_struct *cmd_special_alloc(ctlr_info_t *h) u64bit temp64; dma_addr_t cmd_dma_handle, err_dma_handle; - c = (CommandList_struct *) pci_alloc_consistent(h->pdev, - sizeof(CommandList_struct), &cmd_dma_handle); + c = pci_zalloc_consistent(h->pdev, sizeof(CommandList_struct), + &cmd_dma_handle); if (c == NULL) return NULL; - memset(c, 0, sizeof(CommandList_struct)); c->cmdindex = -1; - c->err_info = (ErrorInfo_struct *) - pci_alloc_consistent(h->pdev, sizeof(ErrorInfo_struct), - &err_dma_handle); + c->err_info = pci_zalloc_consistent(h->pdev, sizeof(ErrorInfo_struct), + &err_dma_handle); if (c->err_info == NULL) { pci_free_consistent(h->pdev, sizeof(CommandList_struct), c, cmd_dma_handle); return NULL; } - memset(c->err_info, 0, sizeof(ErrorInfo_struct)); INIT_LIST_HEAD(&c->list); c->busaddr = (__u32) cmd_dma_handle; diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 608532d3f8c9..f0a089df85cc 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4112,16 +4112,14 @@ static int skd_cons_skcomp(struct skd_device *skdev) skdev->name, __func__, __LINE__, nbytes, SKD_N_COMPLETION_ENTRY); - skcomp = pci_alloc_consistent(skdev->pdev, nbytes, - &skdev->cq_dma_address); + skcomp = pci_zalloc_consistent(skdev->pdev, nbytes, + &skdev->cq_dma_address); if (skcomp == NULL) { rc = -ENOMEM; goto err_out; } - memset(skcomp, 0, nbytes); - skdev->skcomp_table = skcomp; skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp + sizeof(*skcomp) * @@ -4304,15 +4302,14 @@ static int skd_cons_skspcl(struct skd_device *skdev) nbytes = SKD_N_SPECIAL_FITMSG_BYTES; - skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes, - &skspcl->mb_dma_address); + skspcl->msg_buf = + pci_zalloc_consistent(skdev->pdev, nbytes, + &skspcl->mb_dma_address); if (skspcl->msg_buf == NULL) { rc = -ENOMEM; goto err_out; } - memset(skspcl->msg_buf, 0, nbytes); - skspcl->req.sg = kzalloc(sizeof(struct scatterlist) * SKD_N_SG_PER_SPECIAL, GFP_KERNEL); if (skspcl->req.sg == NULL) { @@ -4353,25 +4350,21 @@ static int skd_cons_sksb(struct skd_device *skdev) nbytes = SKD_N_INTERNAL_BYTES; - skspcl->data_buf = pci_alloc_consistent(skdev->pdev, nbytes, - &skspcl->db_dma_address); + skspcl->data_buf = pci_zalloc_consistent(skdev->pdev, nbytes, + &skspcl->db_dma_address); if (skspcl->data_buf == NULL) { rc = -ENOMEM; goto err_out; } - memset(skspcl->data_buf, 0, nbytes); - nbytes = SKD_N_SPECIAL_FITMSG_BYTES; - skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes, - &skspcl->mb_dma_address); + skspcl->msg_buf = pci_zalloc_consistent(skdev->pdev, nbytes, + &skspcl->mb_dma_address); if (skspcl->msg_buf == NULL) { rc = -ENOMEM; goto err_out; } - memset(skspcl->msg_buf, 0, nbytes); - skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1, &skspcl->req.sksg_dma_address); if (skspcl->req.sksg_list == NULL) { diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 12fea3e22348..8d2a7728434d 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -2617,14 +2617,13 @@ static int hifn_probe(struct pci_dev *pdev, const struct pci_device_id *id) } } - dev->desc_virt = pci_alloc_consistent(pdev, sizeof(struct hifn_dma), - &dev->desc_dma); + dev->desc_virt = pci_zalloc_consistent(pdev, sizeof(struct hifn_dma), + &dev->desc_dma); if (!dev->desc_virt) { dprintk("Failed to allocate descriptor rings.\n"); err = -ENOMEM; goto err_out_unmap_bars; } - memset(dev->desc_virt, 0, sizeof(struct hifn_dma)); dev->pdev = pdev; dev->irq = pdev->irq; diff --git a/drivers/firmware/efi/runtime-map.c b/drivers/firmware/efi/runtime-map.c index 97cdd16a2169..018c29a26615 100644 --- a/drivers/firmware/efi/runtime-map.c +++ b/drivers/firmware/efi/runtime-map.c @@ -138,6 +138,27 @@ add_sysfs_runtime_map_entry(struct kobject *kobj, int nr) return entry; } +int efi_get_runtime_map_size(void) +{ + return nr_efi_runtime_map * efi_memdesc_size; +} + +int efi_get_runtime_map_desc_size(void) +{ + return efi_memdesc_size; +} + +int efi_runtime_map_copy(void *buf, size_t bufsz) +{ + size_t sz = efi_get_runtime_map_size(); + + if (sz > bufsz) + sz = bufsz; + + memcpy(buf, efi_runtime_map, sz); + return 0; +} + void efi_runtime_map_setup(void *map, int nr_entries, u32 desc_size) { efi_runtime_map = map; diff --git a/drivers/gpio/gpio-zevio.c b/drivers/gpio/gpio-zevio.c index 54e54e4cc6c4..cd71e5769a76 100644 --- a/drivers/gpio/gpio-zevio.c +++ b/drivers/gpio/gpio-zevio.c @@ -18,6 +18,10 @@ #include <linux/slab.h> #include <linux/gpio.h> +#ifndef IOMEM +#define IOMEM(x) ((void __force __iomem *)(x)) +#endif + /* * Memory layout: * This chip has four gpio sections, each controls 8 GPIOs. diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c index e88bac1d781f..bae897de9468 100644 --- a/drivers/gpu/drm/i810/i810_dma.c +++ b/drivers/gpu/drm/i810/i810_dma.c @@ -393,15 +393,14 @@ static int i810_dma_initialize(struct drm_device *dev, /* Program Hardware Status Page */ dev_priv->hw_status_page = - pci_alloc_consistent(dev->pdev, PAGE_SIZE, - &dev_priv->dma_status_page); + pci_zalloc_consistent(dev->pdev, PAGE_SIZE, + &dev_priv->dma_status_page); if (!dev_priv->hw_status_page) { dev->dev_private = (void *)dev_priv; i810_dma_cleanup(dev); DRM_ERROR("Can not allocate hardware status page\n"); return -ENOMEM; } - memset(dev_priv->hw_status_page, 0, PAGE_SIZE); DRM_DEBUG("hw status page @ %p\n", dev_priv->hw_status_page); I810_WRITE(0x02080, dev_priv->dma_status_page); diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c index 00400c352c1a..766a71ccefed 100644 --- a/drivers/infiniband/hw/amso1100/c2.c +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -604,16 +604,14 @@ static int c2_up(struct net_device *netdev) tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc); c2_port->mem_size = tx_size + rx_size; - c2_port->mem = pci_alloc_consistent(c2dev->pcidev, c2_port->mem_size, - &c2_port->dma); + c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size, + &c2_port->dma); if (c2_port->mem == NULL) { pr_debug("Unable to allocate memory for " "host descriptor rings\n"); return -ENOMEM; } - memset(c2_port->mem, 0, c2_port->mem_size); - /* Create the Rx host descriptor ring */ if ((ret = c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma, diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 90200245c5eb..02120d340d50 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -1003,13 +1003,13 @@ int nes_init_cqp(struct nes_device *nesdev) (sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) + sizeof(struct nes_hw_cqp_qp_context); - nesdev->cqp_vbase = pci_alloc_consistent(nesdev->pcidev, nesdev->cqp_mem_size, - &nesdev->cqp_pbase); + nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev, + nesdev->cqp_mem_size, + &nesdev->cqp_pbase); if (!nesdev->cqp_vbase) { nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n"); return -ENOMEM; } - memset(nesdev->cqp_vbase, 0, nesdev->cqp_mem_size); /* Allocate a twice the number of CQP requests as the SQ size */ nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) * @@ -1691,13 +1691,13 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) (NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) + sizeof(struct nes_hw_nic_qp_context); - nesvnic->nic_vbase = pci_alloc_consistent(nesdev->pcidev, nesvnic->nic_mem_size, - &nesvnic->nic_pbase); + nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev, + nesvnic->nic_mem_size, + &nesvnic->nic_pbase); if (!nesvnic->nic_vbase) { nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n"); return -ENOMEM; } - memset(nesvnic->nic_vbase, 0, nesvnic->nic_mem_size); nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n", nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 218dd3574285..fef067c959fc 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1616,8 +1616,8 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, entries, nescq->cq_mem_size, nescq->hw_cq.cq_number); /* allocate the physical buffer space */ - mem = pci_alloc_consistent(nesdev->pcidev, nescq->cq_mem_size, - &nescq->hw_cq.cq_pbase); + mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size, + &nescq->hw_cq.cq_pbase); if (!mem) { printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n"); nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); @@ -1625,7 +1625,6 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, return ERR_PTR(-ENOMEM); } - memset(mem, 0, nescq->cq_mem_size); nescq->hw_cq.cq_vbase = mem; nescq->hw_cq.cq_head = 0; nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n", diff --git a/drivers/media/common/saa7146/saa7146_core.c b/drivers/media/common/saa7146/saa7146_core.c index 34b0d0ddeef3..97afee672d07 100644 --- a/drivers/media/common/saa7146/saa7146_core.c +++ b/drivers/media/common/saa7146/saa7146_core.c @@ -421,23 +421,20 @@ static int saa7146_init_one(struct pci_dev *pci, const struct pci_device_id *ent err = -ENOMEM; /* get memory for various stuff */ - dev->d_rps0.cpu_addr = pci_alloc_consistent(pci, SAA7146_RPS_MEM, - &dev->d_rps0.dma_handle); + dev->d_rps0.cpu_addr = pci_zalloc_consistent(pci, SAA7146_RPS_MEM, + &dev->d_rps0.dma_handle); if (!dev->d_rps0.cpu_addr) goto err_free_irq; - memset(dev->d_rps0.cpu_addr, 0x0, SAA7146_RPS_MEM); - dev->d_rps1.cpu_addr = pci_alloc_consistent(pci, SAA7146_RPS_MEM, - &dev->d_rps1.dma_handle); + dev->d_rps1.cpu_addr = pci_zalloc_consistent(pci, SAA7146_RPS_MEM, + &dev->d_rps1.dma_handle); if (!dev->d_rps1.cpu_addr) goto err_free_rps0; - memset(dev->d_rps1.cpu_addr, 0x0, SAA7146_RPS_MEM); - dev->d_i2c.cpu_addr = pci_alloc_consistent(pci, SAA7146_RPS_MEM, - &dev->d_i2c.dma_handle); + dev->d_i2c.cpu_addr = pci_zalloc_consistent(pci, SAA7146_RPS_MEM, + &dev->d_i2c.dma_handle); if (!dev->d_i2c.cpu_addr) goto err_free_rps1; - memset(dev->d_i2c.cpu_addr, 0x0, SAA7146_RPS_MEM); /* the rest + print status message */ diff --git a/drivers/media/common/saa7146/saa7146_fops.c b/drivers/media/common/saa7146/saa7146_fops.c index d9e1d6395ed9..6c47f3fe9b0f 100644 --- a/drivers/media/common/saa7146/saa7146_fops.c +++ b/drivers/media/common/saa7146/saa7146_fops.c @@ -520,14 +520,15 @@ int saa7146_vv_init(struct saa7146_dev* dev, struct saa7146_ext_vv *ext_vv) configuration data) */ dev->ext_vv_data = ext_vv; - vv->d_clipping.cpu_addr = pci_alloc_consistent(dev->pci, SAA7146_CLIPPING_MEM, &vv->d_clipping.dma_handle); + vv->d_clipping.cpu_addr = + pci_zalloc_consistent(dev->pci, SAA7146_CLIPPING_MEM, + &vv->d_clipping.dma_handle); if( NULL == vv->d_clipping.cpu_addr ) { ERR("out of memory. aborting.\n"); kfree(vv); v4l2_ctrl_handler_free(hdl); return -1; } - memset(vv->d_clipping.cpu_addr, 0x0, SAA7146_CLIPPING_MEM); saa7146_video_uops.init(dev,vv); if (dev->ext_vv_data->capabilities & V4L2_CAP_VBI_CAPTURE) diff --git a/drivers/media/pci/bt8xx/bt878.c b/drivers/media/pci/bt8xx/bt878.c index d0c281f41a0a..11765835d7b2 100644 --- a/drivers/media/pci/bt8xx/bt878.c +++ b/drivers/media/pci/bt8xx/bt878.c @@ -101,28 +101,20 @@ static int bt878_mem_alloc(struct bt878 *bt) if (!bt->buf_cpu) { bt->buf_size = 128 * 1024; - bt->buf_cpu = - pci_alloc_consistent(bt->dev, bt->buf_size, - &bt->buf_dma); - + bt->buf_cpu = pci_zalloc_consistent(bt->dev, bt->buf_size, + &bt->buf_dma); if (!bt->buf_cpu) return -ENOMEM; - - memset(bt->buf_cpu, 0, bt->buf_size); } if (!bt->risc_cpu) { bt->risc_size = PAGE_SIZE; - bt->risc_cpu = - pci_alloc_consistent(bt->dev, bt->risc_size, - &bt->risc_dma); - + bt->risc_cpu = pci_zalloc_consistent(bt->dev, bt->risc_size, + &bt->risc_dma); if (!bt->risc_cpu) { bt878_mem_free(bt); return -ENOMEM; } - - memset(bt->risc_cpu, 0, bt->risc_size); } return 0; diff --git a/drivers/media/pci/ngene/ngene-core.c b/drivers/media/pci/ngene/ngene-core.c index 826228c3800e..4930b55fd5f4 100644 --- a/drivers/media/pci/ngene/ngene-core.c +++ b/drivers/media/pci/ngene/ngene-core.c @@ -1075,12 +1075,11 @@ static int AllocCommonBuffers(struct ngene *dev) dev->ngenetohost = dev->FWInterfaceBuffer + 256; dev->EventBuffer = dev->FWInterfaceBuffer + 512; - dev->OverflowBuffer = pci_alloc_consistent(dev->pci_dev, - OVERFLOW_BUFFER_SIZE, - &dev->PAOverflowBuffer); + dev->OverflowBuffer = pci_zalloc_consistent(dev->pci_dev, + OVERFLOW_BUFFER_SIZE, + &dev->PAOverflowBuffer); if (!dev->OverflowBuffer) return -ENOMEM; - memset(dev->OverflowBuffer, 0, OVERFLOW_BUFFER_SIZE); for (i = STREAM_VIDEOIN1; i < MAX_STREAM; i++) { int type = dev->card_info->io_type[i]; diff --git a/drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c b/drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c index f166ffc9800a..cef7a00099ea 100644 --- a/drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c +++ b/drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c @@ -803,11 +803,9 @@ static int ttusb_alloc_iso_urbs(struct ttusb *ttusb) { int i; - ttusb->iso_buffer = pci_alloc_consistent(NULL, - ISO_FRAME_SIZE * - FRAMES_PER_ISO_BUF * - ISO_BUF_COUNT, - &ttusb->iso_dma_handle); + ttusb->iso_buffer = pci_zalloc_consistent(NULL, + ISO_FRAME_SIZE * FRAMES_PER_ISO_BUF * ISO_BUF_COUNT, + &ttusb->iso_dma_handle); if (!ttusb->iso_buffer) { dprintk("%s: pci_alloc_consistent - not enough memory\n", @@ -815,9 +813,6 @@ static int ttusb_alloc_iso_urbs(struct ttusb *ttusb) return -ENOMEM; } - memset(ttusb->iso_buffer, 0, - ISO_FRAME_SIZE * FRAMES_PER_ISO_BUF * ISO_BUF_COUNT); - for (i = 0; i < ISO_BUF_COUNT; i++) { struct urb *urb; diff --git a/drivers/media/usb/ttusb-dec/ttusb_dec.c b/drivers/media/usb/ttusb-dec/ttusb_dec.c index 29724af9b9ab..15ab584cf265 100644 --- a/drivers/media/usb/ttusb-dec/ttusb_dec.c +++ b/drivers/media/usb/ttusb-dec/ttusb_dec.c @@ -1151,11 +1151,9 @@ static int ttusb_dec_alloc_iso_urbs(struct ttusb_dec *dec) dprintk("%s\n", __func__); - dec->iso_buffer = pci_alloc_consistent(NULL, - ISO_FRAME_SIZE * - (FRAMES_PER_ISO_BUF * - ISO_BUF_COUNT), - &dec->iso_dma_handle); + dec->iso_buffer = pci_zalloc_consistent(NULL, + ISO_FRAME_SIZE * (FRAMES_PER_ISO_BUF * ISO_BUF_COUNT), + &dec->iso_dma_handle); if (!dec->iso_buffer) { dprintk("%s: pci_alloc_consistent - not enough memory\n", @@ -1163,9 +1161,6 @@ static int ttusb_dec_alloc_iso_urbs(struct ttusb_dec *dec) return -ENOMEM; } - memset(dec->iso_buffer, 0, - ISO_FRAME_SIZE * (FRAMES_PER_ISO_BUF * ISO_BUF_COUNT)); - for (i = 0; i < ISO_BUF_COUNT; i++) { struct urb *urb; diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index e7cc9174e364..4a8fdc4721d5 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -481,37 +481,32 @@ static void pcnet32_realloc_tx_ring(struct net_device *dev, dma_addr_t *new_dma_addr_list; struct pcnet32_tx_head *new_tx_ring; struct sk_buff **new_skb_list; + unsigned int entries = BIT(size); pcnet32_purge_tx_ring(dev); - new_tx_ring = pci_alloc_consistent(lp->pci_dev, - sizeof(struct pcnet32_tx_head) * - (1 << size), - &new_ring_dma_addr); - if (new_tx_ring == NULL) { - netif_err(lp, drv, dev, "Consistent memory allocation failed\n"); + new_tx_ring = + pci_zalloc_consistent(lp->pci_dev, + sizeof(struct pcnet32_tx_head) * entries, + &new_ring_dma_addr); + if (new_tx_ring == NULL) return; - } - memset(new_tx_ring, 0, sizeof(struct pcnet32_tx_head) * (1 << size)); - new_dma_addr_list = kcalloc(1 << size, sizeof(dma_addr_t), - GFP_ATOMIC); + new_dma_addr_list = kcalloc(entries, sizeof(dma_addr_t), GFP_ATOMIC); if (!new_dma_addr_list) goto free_new_tx_ring; - new_skb_list = kcalloc(1 << size, sizeof(struct sk_buff *), - GFP_ATOMIC); + new_skb_list = kcalloc(entries, sizeof(struct sk_buff *), GFP_ATOMIC); if (!new_skb_list) goto free_new_lists; kfree(lp->tx_skbuff); kfree(lp->tx_dma_addr); pci_free_consistent(lp->pci_dev, - sizeof(struct pcnet32_tx_head) * - lp->tx_ring_size, lp->tx_ring, - lp->tx_ring_dma_addr); + sizeof(struct pcnet32_tx_head) * lp->tx_ring_size, + lp->tx_ring, lp->tx_ring_dma_addr); - lp->tx_ring_size = (1 << size); + lp->tx_ring_size = entries; lp->tx_mod_mask = lp->tx_ring_size - 1; lp->tx_len_bits = (size << 12); lp->tx_ring = new_tx_ring; @@ -524,8 +519,7 @@ free_new_lists: kfree(new_dma_addr_list); free_new_tx_ring: pci_free_consistent(lp->pci_dev, - sizeof(struct pcnet32_tx_head) * - (1 << size), + sizeof(struct pcnet32_tx_head) * entries, new_tx_ring, new_ring_dma_addr); } @@ -549,17 +543,14 @@ static void pcnet32_realloc_rx_ring(struct net_device *dev, struct pcnet32_rx_head *new_rx_ring; struct sk_buff **new_skb_list; int new, overlap; - unsigned int entries = 1 << size; + unsigned int entries = BIT(size); - new_rx_ring = pci_alloc_consistent(lp->pci_dev, - sizeof(struct pcnet32_rx_head) * - entries, - &new_ring_dma_addr); - if (new_rx_ring == NULL) { - netif_err(lp, drv, dev, "Consistent memory allocation failed\n"); + new_rx_ring = + pci_zalloc_consistent(lp->pci_dev, + sizeof(struct pcnet32_rx_head) * entries, + &new_ring_dma_addr); + if (new_rx_ring == NULL) return; - } - memset(new_rx_ring, 0, sizeof(struct pcnet32_rx_head) * entries); new_dma_addr_list = kcalloc(entries, sizeof(dma_addr_t), GFP_ATOMIC); if (!new_dma_addr_list) diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 4345332533ad..316e0c3fe048 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -831,17 +831,14 @@ static int atl1e_setup_ring_resources(struct atl1e_adapter *adapter) /* real ring DMA buffer */ size = adapter->ring_size; - adapter->ring_vir_addr = pci_alloc_consistent(pdev, - adapter->ring_size, &adapter->ring_dma); - + adapter->ring_vir_addr = pci_zalloc_consistent(pdev, adapter->ring_size, + &adapter->ring_dma); if (adapter->ring_vir_addr == NULL) { netdev_err(adapter->netdev, "pci_alloc_consistent failed, size = D%d\n", size); return -ENOMEM; } - memset(adapter->ring_vir_addr, 0, adapter->ring_size); - rx_page_desc = rx_ring->rx_page_desc; /* Init TPD Ring */ diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.c b/drivers/net/ethernet/cisco/enic/vnic_dev.c index 5abc496bcf29..37472ce4fac3 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_dev.c +++ b/drivers/net/ethernet/cisco/enic/vnic_dev.c @@ -432,14 +432,12 @@ int vnic_dev_fw_info(struct vnic_dev *vdev, int err = 0; if (!vdev->fw_info) { - vdev->fw_info = pci_alloc_consistent(vdev->pdev, - sizeof(struct vnic_devcmd_fw_info), - &vdev->fw_info_pa); + vdev->fw_info = pci_zalloc_consistent(vdev->pdev, + sizeof(struct vnic_devcmd_fw_info), + &vdev->fw_info_pa); if (!vdev->fw_info) return -ENOMEM; - memset(vdev->fw_info, 0, sizeof(struct vnic_devcmd_fw_info)); - a0 = vdev->fw_info_pa; a1 = sizeof(struct vnic_devcmd_fw_info); diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 69693384b58c..59915144aabb 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -1622,11 +1622,10 @@ static int sky2_alloc_buffers(struct sky2_port *sky2) if (!sky2->tx_ring) goto nomem; - sky2->rx_le = pci_alloc_consistent(hw->pdev, RX_LE_BYTES, - &sky2->rx_le_map); + sky2->rx_le = pci_zalloc_consistent(hw->pdev, RX_LE_BYTES, + &sky2->rx_le_map); if (!sky2->rx_le) goto nomem; - memset(sky2->rx_le, 0, RX_LE_BYTES); sky2->rx_ring = kcalloc(sky2->rx_pending, sizeof(struct rx_ring_info), GFP_KERNEL); diff --git a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c index 064a48d0c368..cd5f106306d9 100644 --- a/drivers/net/ethernet/micrel/ksz884x.c +++ b/drivers/net/ethernet/micrel/ksz884x.c @@ -4409,14 +4409,13 @@ static int ksz_alloc_desc(struct dev_info *adapter) DESC_ALIGNMENT; adapter->desc_pool.alloc_virt = - pci_alloc_consistent( - adapter->pdev, adapter->desc_pool.alloc_size, - &adapter->desc_pool.dma_addr); + pci_zalloc_consistent(adapter->pdev, + adapter->desc_pool.alloc_size, + &adapter->desc_pool.dma_addr); if (adapter->desc_pool.alloc_virt == NULL) { adapter->desc_pool.alloc_size = 0; return 1; } - memset(adapter->desc_pool.alloc_virt, 0, adapter->desc_pool.alloc_size); /* Align to the next cache line boundary. */ offset = (((ulong) adapter->desc_pool.alloc_virt % DESC_ALIGNMENT) ? diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c index 6f6be57f4690..b8d5270359cd 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c @@ -129,14 +129,12 @@ netxen_get_minidump_template(struct netxen_adapter *adapter) return NX_RCODE_INVALID_ARGS; } - addr = pci_alloc_consistent(adapter->pdev, size, &md_template_addr); - + addr = pci_zalloc_consistent(adapter->pdev, size, &md_template_addr); if (!addr) { dev_err(&adapter->pdev->dev, "Unable to allocate dmable memory for template.\n"); return -ENOMEM; } - memset(addr, 0, size); memset(&cmd, 0, sizeof(cmd)); memset(&cmd.rsp, 1, sizeof(struct _cdrp_cmd)); cmd.req.cmd = NX_CDRP_CMD_GET_TEMP_HDR; diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index b40050e03a56..d836ace52277 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -2727,23 +2727,22 @@ static void ql_free_shadow_space(struct ql_adapter *qdev) static int ql_alloc_shadow_space(struct ql_adapter *qdev) { qdev->rx_ring_shadow_reg_area = - pci_alloc_consistent(qdev->pdev, - PAGE_SIZE, &qdev->rx_ring_shadow_reg_dma); + pci_zalloc_consistent(qdev->pdev, PAGE_SIZE, + &qdev->rx_ring_shadow_reg_dma); if (qdev->rx_ring_shadow_reg_area == NULL) { netif_err(qdev, ifup, qdev->ndev, "Allocation of RX shadow space failed.\n"); return -ENOMEM; } - memset(qdev->rx_ring_shadow_reg_area, 0, PAGE_SIZE); + qdev->tx_ring_shadow_reg_area = - pci_alloc_consistent(qdev->pdev, PAGE_SIZE, - &qdev->tx_ring_shadow_reg_dma); + pci_zalloc_consistent(qdev->pdev, PAGE_SIZE, + &qdev->tx_ring_shadow_reg_dma); if (qdev->tx_ring_shadow_reg_area == NULL) { netif_err(qdev, ifup, qdev->ndev, "Allocation of TX shadow space failed.\n"); goto err_wqp_sh_area; } - memset(qdev->tx_ring_shadow_reg_area, 0, PAGE_SIZE); return 0; err_wqp_sh_area: diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c index 485006604bbc..58ef59469dd0 100644 --- a/drivers/net/irda/vlsi_ir.c +++ b/drivers/net/irda/vlsi_ir.c @@ -485,13 +485,13 @@ static int vlsi_create_hwif(vlsi_irda_dev_t *idev) idev->virtaddr = NULL; idev->busaddr = 0; - ringarea = pci_alloc_consistent(idev->pdev, HW_RING_AREA_SIZE, &idev->busaddr); + ringarea = pci_zalloc_consistent(idev->pdev, HW_RING_AREA_SIZE, + &idev->busaddr); if (!ringarea) { IRDA_ERROR("%s: insufficient memory for descriptor rings\n", __func__); goto out; } - memset(ringarea, 0, HW_RING_AREA_SIZE); hwmap = (struct ring_descr_hw *)ringarea; idev->rx_ring = vlsi_alloc_ring(idev->pdev, hwmap, ringsize[1], diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c index dfc6dfc56d52..1ab8e500fb77 100644 --- a/drivers/net/wireless/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/ipw2x00/ipw2100.c @@ -3449,8 +3449,9 @@ static int ipw2100_msg_allocate(struct ipw2100_priv *priv) return -ENOMEM; for (i = 0; i < IPW_COMMAND_POOL_SIZE; i++) { - v = pci_alloc_consistent(priv->pci_dev, - sizeof(struct ipw2100_cmd_header), &p); + v = pci_zalloc_consistent(priv->pci_dev, + sizeof(struct ipw2100_cmd_header), + &p); if (!v) { printk(KERN_ERR DRV_NAME ": " "%s: PCI alloc failed for msg " @@ -3459,8 +3460,6 @@ static int ipw2100_msg_allocate(struct ipw2100_priv *priv) break; } - memset(v, 0, sizeof(struct ipw2100_cmd_header)); - priv->msg_buffers[i].type = COMMAND; priv->msg_buffers[i].info.c_struct.cmd = (struct ipw2100_cmd_header *)v; @@ -4336,16 +4335,12 @@ static int status_queue_allocate(struct ipw2100_priv *priv, int entries) IPW_DEBUG_INFO("enter\n"); q->size = entries * sizeof(struct ipw2100_status); - q->drv = - (struct ipw2100_status *)pci_alloc_consistent(priv->pci_dev, - q->size, &q->nic); + q->drv = pci_zalloc_consistent(priv->pci_dev, q->size, &q->nic); if (!q->drv) { IPW_DEBUG_WARNING("Can not allocate status queue.\n"); return -ENOMEM; } - memset(q->drv, 0, q->size); - IPW_DEBUG_INFO("exit\n"); return 0; @@ -4374,13 +4369,12 @@ static int bd_queue_allocate(struct ipw2100_priv *priv, q->entries = entries; q->size = entries * sizeof(struct ipw2100_bd); - q->drv = pci_alloc_consistent(priv->pci_dev, q->size, &q->nic); + q->drv = pci_zalloc_consistent(priv->pci_dev, q->size, &q->nic); if (!q->drv) { IPW_DEBUG_INFO ("can't allocate shared memory for buffer descriptors\n"); return -ENOMEM; } - memset(q->drv, 0, q->size); IPW_DEBUG_INFO("exit\n"); diff --git a/drivers/net/wireless/mwl8k.c b/drivers/net/wireless/mwl8k.c index 9a3d4d6724f7..fc6cb215e761 100644 --- a/drivers/net/wireless/mwl8k.c +++ b/drivers/net/wireless/mwl8k.c @@ -1159,12 +1159,11 @@ static int mwl8k_rxq_init(struct ieee80211_hw *hw, int index) size = MWL8K_RX_DESCS * priv->rxd_ops->rxd_size; - rxq->rxd = pci_alloc_consistent(priv->pdev, size, &rxq->rxd_dma); + rxq->rxd = pci_zalloc_consistent(priv->pdev, size, &rxq->rxd_dma); if (rxq->rxd == NULL) { wiphy_err(hw->wiphy, "failed to alloc RX descriptors\n"); return -ENOMEM; } - memset(rxq->rxd, 0, size); rxq->buf = kcalloc(MWL8K_RX_DESCS, sizeof(*rxq->buf), GFP_KERNEL); if (rxq->buf == NULL) { @@ -1451,12 +1450,11 @@ static int mwl8k_txq_init(struct ieee80211_hw *hw, int index) size = MWL8K_TX_DESCS * sizeof(struct mwl8k_tx_desc); - txq->txd = pci_alloc_consistent(priv->pdev, size, &txq->txd_dma); + txq->txd = pci_zalloc_consistent(priv->pdev, size, &txq->txd_dma); if (txq->txd == NULL) { wiphy_err(hw->wiphy, "failed to alloc TX descriptors\n"); return -ENOMEM; } - memset(txq->txd, 0, size); txq->skb = kcalloc(MWL8K_TX_DESCS, sizeof(*txq->skb), GFP_KERNEL); if (txq->skb == NULL) { diff --git a/drivers/net/wireless/rtl818x/rtl8180/dev.c b/drivers/net/wireless/rtl818x/rtl8180/dev.c index 4b904f708184..fcc45e5bf50a 100644 --- a/drivers/net/wireless/rtl818x/rtl8180/dev.c +++ b/drivers/net/wireless/rtl818x/rtl8180/dev.c @@ -972,16 +972,13 @@ static int rtl8180_init_rx_ring(struct ieee80211_hw *dev) else priv->rx_ring_sz = sizeof(struct rtl8180_rx_desc); - priv->rx_ring = pci_alloc_consistent(priv->pdev, - priv->rx_ring_sz * 32, - &priv->rx_ring_dma); - + priv->rx_ring = pci_zalloc_consistent(priv->pdev, priv->rx_ring_sz * 32, + &priv->rx_ring_dma); if (!priv->rx_ring || (unsigned long)priv->rx_ring & 0xFF) { wiphy_err(dev->wiphy, "Cannot allocate RX ring\n"); return -ENOMEM; } - memset(priv->rx_ring, 0, priv->rx_ring_sz * 32); priv->rx_idx = 0; for (i = 0; i < 32; i++) { @@ -1040,14 +1037,14 @@ static int rtl8180_init_tx_ring(struct ieee80211_hw *dev, dma_addr_t dma; int i; - ring = pci_alloc_consistent(priv->pdev, sizeof(*ring) * entries, &dma); + ring = pci_zalloc_consistent(priv->pdev, sizeof(*ring) * entries, + &dma); if (!ring || (unsigned long)ring & 0xFF) { wiphy_err(dev->wiphy, "Cannot allocate TX ring (prio = %d)\n", prio); return -ENOMEM; } - memset(ring, 0, sizeof(*ring)*entries); priv->tx_ring[prio].desc = ring; priv->tx_ring[prio].dma = dma; priv->tx_ring[prio].idx = 0; diff --git a/drivers/net/wireless/rtlwifi/pci.c b/drivers/net/wireless/rtlwifi/pci.c index dae55257f0e8..67d1ee6edcad 100644 --- a/drivers/net/wireless/rtlwifi/pci.c +++ b/drivers/net/wireless/rtlwifi/pci.c @@ -1092,16 +1092,14 @@ static int _rtl_pci_init_tx_ring(struct ieee80211_hw *hw, u32 nextdescaddress; int i; - ring = pci_alloc_consistent(rtlpci->pdev, - sizeof(*ring) * entries, &dma); - + ring = pci_zalloc_consistent(rtlpci->pdev, sizeof(*ring) * entries, + &dma); if (!ring || (unsigned long)ring & 0xFF) { RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG, "Cannot allocate TX ring (prio = %d)\n", prio); return -ENOMEM; } - memset(ring, 0, sizeof(*ring) * entries); rtlpci->tx_ring[prio].desc = ring; rtlpci->tx_ring[prio].dma = dma; rtlpci->tx_ring[prio].idx = 0; @@ -1139,10 +1137,9 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw) for (rx_queue_idx = 0; rx_queue_idx < RTL_PCI_MAX_RX_QUEUE; rx_queue_idx++) { rtlpci->rx_ring[rx_queue_idx].desc = - pci_alloc_consistent(rtlpci->pdev, - sizeof(*rtlpci->rx_ring[rx_queue_idx]. - desc) * rtlpci->rxringcount, - &rtlpci->rx_ring[rx_queue_idx].dma); + pci_zalloc_consistent(rtlpci->pdev, + sizeof(*rtlpci->rx_ring[rx_queue_idx].desc) * rtlpci->rxringcount, + &rtlpci->rx_ring[rx_queue_idx].dma); if (!rtlpci->rx_ring[rx_queue_idx].desc || (unsigned long)rtlpci->rx_ring[rx_queue_idx].desc & 0xFF) { @@ -1151,10 +1148,6 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw) return -ENOMEM; } - memset(rtlpci->rx_ring[rx_queue_idx].desc, 0, - sizeof(*rtlpci->rx_ring[rx_queue_idx].desc) * - rtlpci->rxringcount); - rtlpci->rx_ring[rx_queue_idx].idx = 0; /* If amsdu_8k is disabled, set buffersize to 4096. This diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 4de346017e9f..6da6cec9a651 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -683,14 +683,13 @@ static int twl_allocate_memory(TW_Device_Extension *tw_dev, int size, int which) unsigned long *cpu_addr; int retval = 1; - cpu_addr = pci_alloc_consistent(tw_dev->tw_pci_dev, size*TW_Q_LENGTH, &dma_handle); + cpu_addr = pci_zalloc_consistent(tw_dev->tw_pci_dev, size * TW_Q_LENGTH, + &dma_handle); if (!cpu_addr) { TW_PRINTK(tw_dev->host, TW_DRIVER, 0x5, "Memory allocation failed"); goto out; } - memset(cpu_addr, 0, size*TW_Q_LENGTH); - for (i = 0; i < TW_Q_LENGTH; i++) { switch(which) { case 0: diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c index 522570d297ca..7e33a61c1ba4 100644 --- a/drivers/scsi/a100u2w.c +++ b/drivers/scsi/a100u2w.c @@ -1125,23 +1125,19 @@ static int inia100_probe_one(struct pci_dev *pdev, /* Get total memory needed for SCB */ sz = ORC_MAXQUEUE * sizeof(struct orc_scb); - host->scb_virt = pci_alloc_consistent(pdev, sz, - &host->scb_phys); + host->scb_virt = pci_zalloc_consistent(pdev, sz, &host->scb_phys); if (!host->scb_virt) { printk("inia100: SCB memory allocation error\n"); goto out_host_put; } - memset(host->scb_virt, 0, sz); /* Get total memory needed for ESCB */ sz = ORC_MAXQUEUE * sizeof(struct orc_extended_scb); - host->escb_virt = pci_alloc_consistent(pdev, sz, - &host->escb_phys); + host->escb_virt = pci_zalloc_consistent(pdev, sz, &host->escb_phys); if (!host->escb_virt) { printk("inia100: ESCB memory allocation error\n"); goto out_free_scb_array; } - memset(host->escb_virt, 0, sz); biosaddr = host->BIOScfg; biosaddr = (biosaddr << 4); diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index 56467df3d6de..eb3e3e619155 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -3538,10 +3538,9 @@ static int be_queue_alloc(struct beiscsi_hba *phba, struct be_queue_info *q, q->len = len; q->entry_size = entry_size; mem->size = len * entry_size; - mem->va = pci_alloc_consistent(phba->pcidev, mem->size, &mem->dma); + mem->va = pci_zalloc_consistent(phba->pcidev, mem->size, &mem->dma); if (!mem->va) return -ENOMEM; - memset(mem->va, 0, mem->size); return 0; } @@ -4320,9 +4319,9 @@ static int beiscsi_get_boot_info(struct beiscsi_hba *phba) "BM_%d : No boot session\n"); return ret; } - nonemb_cmd.va = pci_alloc_consistent(phba->ctrl.pdev, - sizeof(*session_resp), - &nonemb_cmd.dma); + nonemb_cmd.va = pci_zalloc_consistent(phba->ctrl.pdev, + sizeof(*session_resp), + &nonemb_cmd.dma); if (nonemb_cmd.va == NULL) { beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_INIT | BEISCSI_LOG_CONFIG, @@ -4332,7 +4331,6 @@ static int beiscsi_get_boot_info(struct beiscsi_hba *phba) return -ENOMEM; } - memset(nonemb_cmd.va, 0, sizeof(*session_resp)); tag = mgmt_get_session_info(phba, s_handle, &nonemb_cmd); if (!tag) { diff --git a/drivers/scsi/be2iscsi/be_mgmt.c b/drivers/scsi/be2iscsi/be_mgmt.c index a3e56487616c..665afcb74a56 100644 --- a/drivers/scsi/be2iscsi/be_mgmt.c +++ b/drivers/scsi/be2iscsi/be_mgmt.c @@ -900,13 +900,12 @@ free_cmd: static int mgmt_alloc_cmd_data(struct beiscsi_hba *phba, struct be_dma_mem *cmd, int iscsi_cmd, int size) { - cmd->va = pci_alloc_consistent(phba->ctrl.pdev, size, &cmd->dma); + cmd->va = pci_zalloc_consistent(phba->ctrl.pdev, size, &cmd->dma); if (!cmd->va) { beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_CONFIG, "BG_%d : Failed to allocate memory for if info\n"); return -ENOMEM; } - memset(cmd->va, 0, size); cmd->size = size; be_cmd_hdr_prepare(cmd->va, CMD_SUBSYSTEM_ISCSI, iscsi_cmd, size); return 0; diff --git a/drivers/scsi/csiostor/csio_wr.c b/drivers/scsi/csiostor/csio_wr.c index 4255ce264abf..773da14cfa14 100644 --- a/drivers/scsi/csiostor/csio_wr.c +++ b/drivers/scsi/csiostor/csio_wr.c @@ -232,7 +232,7 @@ csio_wr_alloc_q(struct csio_hw *hw, uint32_t qsize, uint32_t wrsize, q = wrm->q_arr[free_idx]; - q->vstart = pci_alloc_consistent(hw->pdev, qsz, &q->pstart); + q->vstart = pci_zalloc_consistent(hw->pdev, qsz, &q->pstart); if (!q->vstart) { csio_err(hw, "Failed to allocate DMA memory for " @@ -240,12 +240,6 @@ csio_wr_alloc_q(struct csio_hw *hw, uint32_t qsize, uint32_t wrsize, return -1; } - /* - * We need to zero out the contents, importantly for ingress, - * since we start with a generatiom bit of 1 for ingress. - */ - memset(q->vstart, 0, qsz); - q->type = type; q->owner = owner; q->pidx = q->cidx = q->inc_idx = 0; diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c index 03372cff38f3..813dd5c998e4 100644 --- a/drivers/scsi/eata.c +++ b/drivers/scsi/eata.c @@ -1238,8 +1238,8 @@ static int port_detect(unsigned long port_base, unsigned int j, struct eata_config *cf; dma_addr_t cf_dma_addr; - cf = pci_alloc_consistent(pdev, sizeof(struct eata_config), - &cf_dma_addr); + cf = pci_zalloc_consistent(pdev, sizeof(struct eata_config), + &cf_dma_addr); if (!cf) { printk @@ -1249,7 +1249,6 @@ static int port_detect(unsigned long port_base, unsigned int j, } /* Set board configuration */ - memset((char *)cf, 0, sizeof(struct eata_config)); cf->len = (ushort) H2DEV16((ushort) 510); cf->ocena = 1; diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 8545d1826725..6b35d0dfe64c 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -4732,23 +4732,21 @@ static struct CommandList *cmd_special_alloc(struct ctlr_info *h) union u64bit temp64; dma_addr_t cmd_dma_handle, err_dma_handle; - c = pci_alloc_consistent(h->pdev, sizeof(*c), &cmd_dma_handle); + c = pci_zalloc_consistent(h->pdev, sizeof(*c), &cmd_dma_handle); if (c == NULL) return NULL; - memset(c, 0, sizeof(*c)); c->cmd_type = CMD_SCSI; c->cmdindex = -1; - c->err_info = pci_alloc_consistent(h->pdev, sizeof(*c->err_info), - &err_dma_handle); + c->err_info = pci_zalloc_consistent(h->pdev, sizeof(*c->err_info), + &err_dma_handle); if (c->err_info == NULL) { pci_free_consistent(h->pdev, sizeof(*c), c, cmd_dma_handle); return NULL; } - memset(c->err_info, 0, sizeof(*c->err_info)); INIT_LIST_HEAD(&c->list); c->busaddr = (u32) cmd_dma_handle; diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c index e2237a97cb9d..531dce419c18 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.c +++ b/drivers/scsi/megaraid/megaraid_mbox.c @@ -998,8 +998,9 @@ megaraid_alloc_cmd_packets(adapter_t *adapter) * Allocate the common 16-byte aligned memory for the handshake * mailbox. */ - raid_dev->una_mbox64 = pci_alloc_consistent(adapter->pdev, - sizeof(mbox64_t), &raid_dev->una_mbox64_dma); + raid_dev->una_mbox64 = pci_zalloc_consistent(adapter->pdev, + sizeof(mbox64_t), + &raid_dev->una_mbox64_dma); if (!raid_dev->una_mbox64) { con_log(CL_ANN, (KERN_WARNING @@ -1007,7 +1008,6 @@ megaraid_alloc_cmd_packets(adapter_t *adapter) __LINE__)); return -1; } - memset(raid_dev->una_mbox64, 0, sizeof(mbox64_t)); /* * Align the mailbox at 16-byte boundary @@ -1026,8 +1026,8 @@ megaraid_alloc_cmd_packets(adapter_t *adapter) align; // Allocate memory for commands issued internally - adapter->ibuf = pci_alloc_consistent(pdev, MBOX_IBUF_SIZE, - &adapter->ibuf_dma_h); + adapter->ibuf = pci_zalloc_consistent(pdev, MBOX_IBUF_SIZE, + &adapter->ibuf_dma_h); if (!adapter->ibuf) { con_log(CL_ANN, (KERN_WARNING @@ -1036,7 +1036,6 @@ megaraid_alloc_cmd_packets(adapter_t *adapter) goto out_free_common_mbox; } - memset(adapter->ibuf, 0, MBOX_IBUF_SIZE); // Allocate memory for our SCSI Command Blocks and their associated // memory @@ -2972,8 +2971,8 @@ megaraid_mbox_product_info(adapter_t *adapter) * Issue an ENQUIRY3 command to find out certain adapter parameters, * e.g., max channels, max commands etc. */ - pinfo = pci_alloc_consistent(adapter->pdev, sizeof(mraid_pinfo_t), - &pinfo_dma_h); + pinfo = pci_zalloc_consistent(adapter->pdev, sizeof(mraid_pinfo_t), + &pinfo_dma_h); if (pinfo == NULL) { con_log(CL_ANN, (KERN_WARNING @@ -2982,7 +2981,6 @@ megaraid_mbox_product_info(adapter_t *adapter) return -1; } - memset(pinfo, 0, sizeof(mraid_pinfo_t)); mbox->xferaddr = (uint32_t)adapter->ibuf_dma_h; memset((void *)adapter->ibuf, 0, MBOX_IBUF_SIZE); diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 112799b131a9..22a04e37b70a 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -2038,9 +2038,9 @@ int megasas_sriov_start_heartbeat(struct megasas_instance *instance, if (initial) { instance->hb_host_mem = - pci_alloc_consistent(instance->pdev, - sizeof(struct MR_CTRL_HB_HOST_MEM), - &instance->hb_host_mem_h); + pci_zalloc_consistent(instance->pdev, + sizeof(struct MR_CTRL_HB_HOST_MEM), + &instance->hb_host_mem_h); if (!instance->hb_host_mem) { printk(KERN_DEBUG "megasas: SR-IOV: Couldn't allocate" " memory for heartbeat host memory for " @@ -2048,8 +2048,6 @@ int megasas_sriov_start_heartbeat(struct megasas_instance *instance, retval = -ENOMEM; goto out; } - memset(instance->hb_host_mem, 0, - sizeof(struct MR_CTRL_HB_HOST_MEM)); } memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE); diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index 7a6160f172ce..57a95e2c3442 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c @@ -1915,14 +1915,12 @@ static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) /* We use the PCI APIs for now until the generic one gets fixed * enough or until we get some macio-specific versions */ - dma_cmd_space = pci_alloc_consistent(macio_get_pci_dev(mdev), - ms->dma_cmd_size, - &dma_cmd_bus); + dma_cmd_space = pci_zalloc_consistent(macio_get_pci_dev(mdev), + ms->dma_cmd_size, &dma_cmd_bus); if (dma_cmd_space == NULL) { printk(KERN_ERR "mesh: can't allocate DMA table\n"); goto out_unmap; } - memset(dma_cmd_space, 0, ms->dma_cmd_size); ms->dma_cmds = (struct dbdma_cmd *) DBDMA_ALIGN(dma_cmd_space); ms->dma_cmd_space = dma_cmd_space; diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c index edbee8dc62c9..3e716b2f611a 100644 --- a/drivers/scsi/mvumi.c +++ b/drivers/scsi/mvumi.c @@ -142,8 +142,8 @@ static struct mvumi_res *mvumi_alloc_mem_resource(struct mvumi_hba *mhba, case RESOURCE_UNCACHED_MEMORY: size = round_up(size, 8); - res->virt_addr = pci_alloc_consistent(mhba->pdev, size, - &res->bus_addr); + res->virt_addr = pci_zalloc_consistent(mhba->pdev, size, + &res->bus_addr); if (!res->virt_addr) { dev_err(&mhba->pdev->dev, "unable to allocate consistent mem," @@ -151,7 +151,6 @@ static struct mvumi_res *mvumi_alloc_mem_resource(struct mvumi_hba *mhba, kfree(res); return NULL; } - memset(res->virt_addr, 0, size); break; default: @@ -258,12 +257,10 @@ static int mvumi_internal_cmd_sgl(struct mvumi_hba *mhba, struct mvumi_cmd *cmd, if (size == 0) return 0; - virt_addr = pci_alloc_consistent(mhba->pdev, size, &phy_addr); + virt_addr = pci_zalloc_consistent(mhba->pdev, size, &phy_addr); if (!virt_addr) return -1; - memset(virt_addr, 0, size); - m_sg = (struct mvumi_sgl *) &cmd->frame->payload[0]; cmd->frame->sg_counts = 1; cmd->data_buf = virt_addr; diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 34cea8291772..76570e6a547d 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -116,13 +116,12 @@ int pm8001_mem_alloc(struct pci_dev *pdev, void **virt_addr, u64 align_offset = 0; if (align) align_offset = (dma_addr_t)align - 1; - mem_virt_alloc = - pci_alloc_consistent(pdev, mem_size + align, &mem_dma_handle); + mem_virt_alloc = pci_zalloc_consistent(pdev, mem_size + align, + &mem_dma_handle); if (!mem_virt_alloc) { pm8001_printk("memory allocation error\n"); return -1; } - memset((void *)mem_virt_alloc, 0, mem_size+align); *pphys_addr = mem_dma_handle; phys_align = (*pphys_addr + align_offset) & ~align_offset; *virt_addr = (void *)mem_virt_alloc + phys_align - *pphys_addr; diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index 017f8b9554e5..6f3275d020a0 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -4213,9 +4213,9 @@ static ssize_t pmcraid_store_log_level( { struct Scsi_Host *shost; struct pmcraid_instance *pinstance; - unsigned long val; + u8 val; - if (strict_strtoul(buf, 10, &val)) + if (kstrtou8(buf, 10, &val)) return -EINVAL; /* log-level should be from 0 to 2 */ if (val > 2) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 406b3038bbad..8b4105a22ac2 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -910,9 +910,9 @@ sdev_store_queue_ramp_up_period(struct device *dev, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); - unsigned long period; + unsigned int period; - if (strict_strtoul(buf, 10, &period)) + if (kstrtouint(buf, 10, &period)) return -EINVAL; sdev->queue_ramp_up_period = msecs_to_jiffies(period); diff --git a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c index 2920e406030a..5729cf678765 100644 --- a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c +++ b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c @@ -2065,20 +2065,16 @@ static short rtl8192_alloc_rx_desc_ring(struct net_device *dev) int i, rx_queue_idx; for (rx_queue_idx = 0; rx_queue_idx < MAX_RX_QUEUE; rx_queue_idx++) { - priv->rx_ring[rx_queue_idx] = pci_alloc_consistent(priv->pdev, - sizeof(*priv->rx_ring[rx_queue_idx]) * - priv->rxringcount, - &priv->rx_ring_dma[rx_queue_idx]); - + priv->rx_ring[rx_queue_idx] = + pci_zalloc_consistent(priv->pdev, + sizeof(*priv->rx_ring[rx_queue_idx]) * priv->rxringcount, + &priv->rx_ring_dma[rx_queue_idx]); if (!priv->rx_ring[rx_queue_idx] || (unsigned long)priv->rx_ring[rx_queue_idx] & 0xFF) { RT_TRACE(COMP_ERR, "Cannot allocate RX ring\n"); return -ENOMEM; } - memset(priv->rx_ring[rx_queue_idx], 0, - sizeof(*priv->rx_ring[rx_queue_idx]) * - priv->rxringcount); priv->rx_idx[rx_queue_idx] = 0; for (i = 0; i < priv->rxringcount; i++) { @@ -2118,14 +2114,13 @@ static int rtl8192_alloc_tx_desc_ring(struct net_device *dev, dma_addr_t dma; int i; - ring = pci_alloc_consistent(priv->pdev, sizeof(*ring) * entries, &dma); + ring = pci_zalloc_consistent(priv->pdev, sizeof(*ring) * entries, &dma); if (!ring || (unsigned long)ring & 0xFF) { RT_TRACE(COMP_ERR, "Cannot allocate TX ring (prio = %d)\n", prio); return -ENOMEM; } - memset(ring, 0, sizeof(*ring)*entries); priv->tx_ring[prio].desc = ring; priv->tx_ring[prio].dma = dma; priv->tx_ring[prio].idx = 0; diff --git a/drivers/staging/rtl8192ee/pci.c b/drivers/staging/rtl8192ee/pci.c index f3abbcc9f3ba..0215aef1eacc 100644 --- a/drivers/staging/rtl8192ee/pci.c +++ b/drivers/staging/rtl8192ee/pci.c @@ -1224,10 +1224,10 @@ static int _rtl_pci_init_tx_ring(struct ieee80211_hw *hw, /* alloc tx buffer desc for new trx flow*/ if (rtlpriv->use_new_trx_flow) { - buffer_desc = pci_alloc_consistent(rtlpci->pdev, - sizeof(*buffer_desc) * entries, - &buffer_desc_dma); - + buffer_desc = + pci_zalloc_consistent(rtlpci->pdev, + sizeof(*buffer_desc) * entries, + &buffer_desc_dma); if (!buffer_desc || (unsigned long)buffer_desc & 0xFF) { RT_TRACE(COMP_ERR, DBG_EMERG, ("Cannot allocate TX ring (prio = %d)\n", @@ -1235,7 +1235,6 @@ static int _rtl_pci_init_tx_ring(struct ieee80211_hw *hw, return -ENOMEM; } - memset(buffer_desc, 0, sizeof(*buffer_desc) * entries); rtlpci->tx_ring[prio].buffer_desc = buffer_desc; rtlpci->tx_ring[prio].buffer_desc_dma = buffer_desc_dma; @@ -1245,16 +1244,14 @@ static int _rtl_pci_init_tx_ring(struct ieee80211_hw *hw, } /* alloc dma for this ring */ - desc = pci_alloc_consistent(rtlpci->pdev, - sizeof(*desc) * entries, &desc_dma); - + desc = pci_zalloc_consistent(rtlpci->pdev, sizeof(*desc) * entries, + &desc_dma); if (!desc || (unsigned long)desc & 0xFF) { RT_TRACE(COMP_ERR, DBG_EMERG, ("Cannot allocate TX ring (prio = %d)\n", prio)); return -ENOMEM; } - memset(desc, 0, sizeof(*desc) * entries); rtlpci->tx_ring[prio].desc = desc; rtlpci->tx_ring[prio].dma = desc_dma; @@ -1290,11 +1287,9 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw, int rxring_idx) struct rtl_rx_buffer_desc *entry = NULL; /* alloc dma for this ring */ rtlpci->rx_ring[rxring_idx].buffer_desc = - pci_alloc_consistent(rtlpci->pdev, - sizeof(*rtlpci->rx_ring[rxring_idx]. - buffer_desc) * - rtlpci->rxringcount, - &rtlpci->rx_ring[rxring_idx].dma); + pci_zalloc_consistent(rtlpci->pdev, + sizeof(*rtlpci->rx_ring[rxring_idx].buffer_desc) * rtlpci->rxringcount, + &rtlpci->rx_ring[rxring_idx].dma); if (!rtlpci->rx_ring[rxring_idx].buffer_desc || (unsigned long)rtlpci->rx_ring[rxring_idx].buffer_desc & 0xFF) { RT_TRACE(COMP_ERR, DBG_EMERG, @@ -1302,10 +1297,6 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw, int rxring_idx) return -ENOMEM; } - memset(rtlpci->rx_ring[rxring_idx].buffer_desc, 0, - sizeof(*rtlpci->rx_ring[rxring_idx].buffer_desc) * - rtlpci->rxringcount); - /* init every desc in this ring */ rtlpci->rx_ring[rxring_idx].idx = 0; @@ -1320,19 +1311,15 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw, int rxring_idx) u8 tmp_one = 1; /* alloc dma for this ring */ rtlpci->rx_ring[rxring_idx].desc = - pci_alloc_consistent(rtlpci->pdev, - sizeof(*rtlpci->rx_ring[rxring_idx]. - desc) * rtlpci->rxringcount, - &rtlpci->rx_ring[rxring_idx].dma); + pci_zalloc_consistent(rtlpci->pdev, + sizeof(*rtlpci->rx_ring[rxring_idx].desc) * rtlpci->rxringcount, + &rtlpci->rx_ring[rxring_idx].dma); if (!rtlpci->rx_ring[rxring_idx].desc || (unsigned long)rtlpci->rx_ring[rxring_idx].desc & 0xFF) { RT_TRACE(COMP_ERR, DBG_EMERG, ("Cannot allocate RX ring\n")); return -ENOMEM; } - memset(rtlpci->rx_ring[rxring_idx].desc, 0, - sizeof(*rtlpci->rx_ring[rxring_idx].desc) * - rtlpci->rxringcount); /* init every desc in this ring */ rtlpci->rx_ring[rxring_idx].idx = 0; diff --git a/drivers/staging/rtl8821ae/pci.c b/drivers/staging/rtl8821ae/pci.c index f9847d1fbdeb..26d7b2fc852a 100644 --- a/drivers/staging/rtl8821ae/pci.c +++ b/drivers/staging/rtl8821ae/pci.c @@ -1248,9 +1248,10 @@ static int _rtl_pci_init_tx_ring(struct ieee80211_hw *hw, /* alloc tx buffer desc for new trx flow*/ if (rtlpriv->use_new_trx_flow) { - buffer_desc = pci_alloc_consistent(rtlpci->pdev, - sizeof(*buffer_desc) * entries, - &buffer_desc_dma); + buffer_desc = + pci_zalloc_consistent(rtlpci->pdev, + sizeof(*buffer_desc) * entries, + &buffer_desc_dma); if (!buffer_desc || (unsigned long)buffer_desc & 0xFF) { RT_TRACE(COMP_ERR, DBG_EMERG, @@ -1259,7 +1260,6 @@ static int _rtl_pci_init_tx_ring(struct ieee80211_hw *hw, return -ENOMEM; } - memset(buffer_desc, 0, sizeof(*buffer_desc) * entries); rtlpci->tx_ring[prio].buffer_desc = buffer_desc; rtlpci->tx_ring[prio].buffer_desc_dma = buffer_desc_dma; @@ -1270,8 +1270,8 @@ static int _rtl_pci_init_tx_ring(struct ieee80211_hw *hw, } /* alloc dma for this ring */ - desc = pci_alloc_consistent(rtlpci->pdev, - sizeof(*desc) * entries, &desc_dma); + desc = pci_zalloc_consistent(rtlpci->pdev, sizeof(*desc) * entries, + &desc_dma); if (!desc || (unsigned long)desc & 0xFF) { RT_TRACE(COMP_ERR, DBG_EMERG, @@ -1279,7 +1279,6 @@ static int _rtl_pci_init_tx_ring(struct ieee80211_hw *hw, return -ENOMEM; } - memset(desc, 0, sizeof(*desc) * entries); rtlpci->tx_ring[prio].desc = desc; rtlpci->tx_ring[prio].dma = desc_dma; @@ -1316,21 +1315,15 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw, int rxring_idx) struct rtl_rx_buffer_desc *entry = NULL; /* alloc dma for this ring */ rtlpci->rx_ring[rxring_idx].buffer_desc = - pci_alloc_consistent(rtlpci->pdev, - sizeof(*rtlpci->rx_ring[rxring_idx]. - buffer_desc) * - rtlpci->rxringcount, - &rtlpci->rx_ring[rxring_idx].dma); + pci_zalloc_consistent(rtlpci->pdev, + sizeof(*rtlpci->rx_ring[rxring_idx].buffer_desc) * rtlpci->rxringcount, + &rtlpci->rx_ring[rxring_idx].dma); if (!rtlpci->rx_ring[rxring_idx].buffer_desc || (unsigned long)rtlpci->rx_ring[rxring_idx].buffer_desc & 0xFF) { RT_TRACE(COMP_ERR, DBG_EMERG, ("Cannot allocate RX ring\n")); return -ENOMEM; } - memset(rtlpci->rx_ring[rxring_idx].buffer_desc, 0, - sizeof(*rtlpci->rx_ring[rxring_idx].buffer_desc) * - rtlpci->rxringcount); - /* init every desc in this ring */ rtlpci->rx_ring[rxring_idx].idx = 0; for (i = 0; i < rtlpci->rxringcount; i++) { @@ -1344,10 +1337,9 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw, int rxring_idx) u8 tmp_one = 1; /* alloc dma for this ring */ rtlpci->rx_ring[rxring_idx].desc = - pci_alloc_consistent(rtlpci->pdev, - sizeof(*rtlpci->rx_ring[rxring_idx]. - desc) * rtlpci->rxringcount, - &rtlpci->rx_ring[rxring_idx].dma); + pci_zalloc_consistent(rtlpci->pdev, + sizeof(*rtlpci->rx_ring[rxring_idx].desc) * rtlpci->rxringcount, + &rtlpci->rx_ring[rxring_idx].dma); if (!rtlpci->rx_ring[rxring_idx].desc || (unsigned long)rtlpci->rx_ring[rxring_idx].desc & 0xFF) { RT_TRACE(COMP_ERR, DBG_EMERG, @@ -1355,10 +1347,6 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw, int rxring_idx) return -ENOMEM; } - memset(rtlpci->rx_ring[rxring_idx].desc, 0, - sizeof(*rtlpci->rx_ring[rxring_idx].desc) * - rtlpci->rxringcount); - /* init every desc in this ring */ rtlpci->rx_ring[rxring_idx].idx = 0; for (i = 0; i < rtlpci->rxringcount; i++) { diff --git a/drivers/staging/slicoss/slicoss.c b/drivers/staging/slicoss/slicoss.c index 50ece291fc6a..f35fa3dfe22c 100644 --- a/drivers/staging/slicoss/slicoss.c +++ b/drivers/staging/slicoss/slicoss.c @@ -1191,18 +1191,15 @@ static int slic_rspqueue_init(struct adapter *adapter) rspq->num_pages = SLIC_RSPQ_PAGES_GB; for (i = 0; i < rspq->num_pages; i++) { - rspq->vaddr[i] = pci_alloc_consistent(adapter->pcidev, - PAGE_SIZE, - &rspq->paddr[i]); + rspq->vaddr[i] = pci_zalloc_consistent(adapter->pcidev, + PAGE_SIZE, + &rspq->paddr[i]); if (!rspq->vaddr[i]) { dev_err(&adapter->pcidev->dev, "pci_alloc_consistent failed\n"); slic_rspqueue_free(adapter); return -ENOMEM; } - /* FIXME: - * do we really need this assertions (4K PAGE_SIZE aligned addr)? */ - memset(rspq->vaddr[i], 0, PAGE_SIZE); if (paddrh == 0) { slic_reg32_write(&slic_regs->slic_rbar, diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c index c78d06eff7ea..0b583a37f5b3 100644 --- a/drivers/staging/vt6655/device_main.c +++ b/drivers/staging/vt6655/device_main.c @@ -1111,25 +1111,17 @@ static bool device_init_rings(PSDevice pDevice) void *vir_pool; /*allocate all RD/TD rings a single pool*/ - vir_pool = pci_alloc_consistent(pDevice->pcid, - pDevice->sOpts.nRxDescs0 * sizeof(SRxDesc) + - pDevice->sOpts.nRxDescs1 * sizeof(SRxDesc) + - pDevice->sOpts.nTxDescs[0] * sizeof(STxDesc) + - pDevice->sOpts.nTxDescs[1] * sizeof(STxDesc), - &pDevice->pool_dma); - + vir_pool = pci_zalloc_consistent(pDevice->pcid, + pDevice->sOpts.nRxDescs0 * sizeof(SRxDesc) + + pDevice->sOpts.nRxDescs1 * sizeof(SRxDesc) + + pDevice->sOpts.nTxDescs[0] * sizeof(STxDesc) + + pDevice->sOpts.nTxDescs[1] * sizeof(STxDesc), + &pDevice->pool_dma); if (vir_pool == NULL) { DBG_PRT(MSG_LEVEL_ERR, KERN_ERR "%s : allocate desc dma memory failed\n", pDevice->dev->name); return false; } - memset(vir_pool, 0, - pDevice->sOpts.nRxDescs0 * sizeof(SRxDesc) + - pDevice->sOpts.nRxDescs1 * sizeof(SRxDesc) + - pDevice->sOpts.nTxDescs[0] * sizeof(STxDesc) + - pDevice->sOpts.nTxDescs[1] * sizeof(STxDesc) - ); - pDevice->aRD0Ring = vir_pool; pDevice->aRD1Ring = vir_pool + pDevice->sOpts.nRxDescs0 * sizeof(SRxDesc); @@ -1138,13 +1130,12 @@ static bool device_init_rings(PSDevice pDevice) pDevice->rd1_pool_dma = pDevice->rd0_pool_dma + pDevice->sOpts.nRxDescs0 * sizeof(SRxDesc); - pDevice->tx0_bufs = pci_alloc_consistent(pDevice->pcid, - pDevice->sOpts.nTxDescs[0] * PKT_BUF_SZ + - pDevice->sOpts.nTxDescs[1] * PKT_BUF_SZ + - CB_BEACON_BUF_SIZE + - CB_MAX_BUF_SIZE, - &pDevice->tx_bufs_dma0); - + pDevice->tx0_bufs = pci_zalloc_consistent(pDevice->pcid, + pDevice->sOpts.nTxDescs[0] * PKT_BUF_SZ + + pDevice->sOpts.nTxDescs[1] * PKT_BUF_SZ + + CB_BEACON_BUF_SIZE + + CB_MAX_BUF_SIZE, + &pDevice->tx_bufs_dma0); if (pDevice->tx0_bufs == NULL) { DBG_PRT(MSG_LEVEL_ERR, KERN_ERR "%s: allocate buf dma memory failed\n", pDevice->dev->name); pci_free_consistent(pDevice->pcid, @@ -1157,13 +1148,6 @@ static bool device_init_rings(PSDevice pDevice) return false; } - memset(pDevice->tx0_bufs, 0, - pDevice->sOpts.nTxDescs[0] * PKT_BUF_SZ + - pDevice->sOpts.nTxDescs[1] * PKT_BUF_SZ + - CB_BEACON_BUF_SIZE + - CB_MAX_BUF_SIZE - ); - pDevice->td0_pool_dma = pDevice->rd1_pool_dma + pDevice->sOpts.nRxDescs1 * sizeof(SRxDesc); diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index ba1dbcdf4609..0e8c39b6ccd4 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -3383,12 +3383,11 @@ static int alloc_desc(struct slgt_info *info) unsigned int pbufs; /* allocate memory to hold descriptor lists */ - info->bufs = pci_alloc_consistent(info->pdev, DESC_LIST_SIZE, &info->bufs_dma_addr); + info->bufs = pci_zalloc_consistent(info->pdev, DESC_LIST_SIZE, + &info->bufs_dma_addr); if (info->bufs == NULL) return -ENOMEM; - memset(info->bufs, 0, DESC_LIST_SIZE); - info->rbufs = (struct slgt_desc*)info->bufs; info->tbufs = ((struct slgt_desc*)info->bufs) + info->rbuf_count; diff --git a/drivers/vme/bridges/vme_ca91cx42.c b/drivers/vme/bridges/vme_ca91cx42.c index bfb2d3f06738..18078ecbfcc6 100644 --- a/drivers/vme/bridges/vme_ca91cx42.c +++ b/drivers/vme/bridges/vme_ca91cx42.c @@ -1555,16 +1555,14 @@ static int ca91cx42_crcsr_init(struct vme_bridge *ca91cx42_bridge, } /* Allocate mem for CR/CSR image */ - bridge->crcsr_kernel = pci_alloc_consistent(pdev, VME_CRCSR_BUF_SIZE, - &bridge->crcsr_bus); + bridge->crcsr_kernel = pci_zalloc_consistent(pdev, VME_CRCSR_BUF_SIZE, + &bridge->crcsr_bus); if (bridge->crcsr_kernel == NULL) { dev_err(&pdev->dev, "Failed to allocate memory for CR/CSR " "image\n"); return -ENOMEM; } - memset(bridge->crcsr_kernel, 0, VME_CRCSR_BUF_SIZE); - crcsr_addr = slot * (512 * 1024); iowrite32(bridge->crcsr_bus - crcsr_addr, bridge->base + VCSR_TO); diff --git a/drivers/vme/bridges/vme_tsi148.c b/drivers/vme/bridges/vme_tsi148.c index 61e706c0e00c..e07cfa8001bb 100644 --- a/drivers/vme/bridges/vme_tsi148.c +++ b/drivers/vme/bridges/vme_tsi148.c @@ -2275,16 +2275,14 @@ static int tsi148_crcsr_init(struct vme_bridge *tsi148_bridge, bridge = tsi148_bridge->driver_priv; /* Allocate mem for CR/CSR image */ - bridge->crcsr_kernel = pci_alloc_consistent(pdev, VME_CRCSR_BUF_SIZE, - &bridge->crcsr_bus); + bridge->crcsr_kernel = pci_zalloc_consistent(pdev, VME_CRCSR_BUF_SIZE, + &bridge->crcsr_bus); if (bridge->crcsr_kernel == NULL) { dev_err(tsi148_bridge->parent, "Failed to allocate memory for " "CR/CSR image\n"); return -ENOMEM; } - memset(bridge->crcsr_kernel, 0, VME_CRCSR_BUF_SIZE); - reg_split(bridge->crcsr_bus, &crcsr_bus_high, &crcsr_bus_low); iowrite32be(crcsr_bus_high, bridge->base + TSI148_LCSR_CROU); diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index 47249a30eae3..20f766afa4c7 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -91,8 +91,7 @@ static struct w1_master *w1_alloc_dev(u32 id, int slave_count, int slave_ttl, err = device_register(&dev->dev); if (err) { pr_err("Failed to register master device. err=%d\n", err); - memset(dev, 0, sizeof(struct w1_master)); - kfree(dev); + put_device(&dev->dev); dev = NULL; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index f704458ea5f5..3e0b6fcb0d2b 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -30,7 +30,7 @@ struct plock_op { struct plock_xop { struct plock_op xop; - void *callback; + int (*callback)(struct file_lock *fl, int result); void *fl; void *file; struct file_lock flc; @@ -144,23 +144,23 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); - if (xop->callback == NULL) { - rv = wait_event_killable(recv_wq, (op->done != 0)); - if (rv == -ERESTARTSYS) { - log_debug(ls, "dlm_posix_lock: wait killed %llx", - (unsigned long long)number); - spin_lock(&ops_lock); - list_del(&op->list); - spin_unlock(&ops_lock); - kfree(xop); - do_unlock_close(ls, number, file, fl); - goto out; - } - } else { + if (xop->callback) { rv = FILE_LOCK_DEFERRED; goto out; } + rv = wait_event_killable(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + log_debug(ls, "dlm_posix_lock: wait killed %llx", + (unsigned long long)number); + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + kfree(xop); + do_unlock_close(ls, number, file, fl); + goto out; + } + spin_lock(&ops_lock); if (!list_empty(&op->list)) { log_error(ls, "dlm_posix_lock: op on list %llx", @@ -190,7 +190,7 @@ static int dlm_plock_callback(struct plock_op *op) struct file *file; struct file_lock *fl; struct file_lock *flc; - int (*notify)(void *, void *, int) = NULL; + int (*notify)(struct file_lock *fl, int result) = NULL; struct plock_xop *xop = (struct plock_xop *)op; int rv = 0; @@ -209,7 +209,7 @@ static int dlm_plock_callback(struct plock_op *op) notify = xop->callback; if (op->info.rv) { - notify(fl, NULL, op->info.rv); + notify(fl, op->info.rv); goto out; } @@ -228,7 +228,7 @@ static int dlm_plock_callback(struct plock_op *op) (unsigned long long)op->info.number, file, fl); } - rv = notify(fl, NULL, 0); + rv = notify(fl, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ log_print("dlm_plock_callback: lock granted after lock request " diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 91ad9e1c9441..e26bc9a22ac9 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -303,6 +303,31 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } +static int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int cluster, offset; + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + + if (cluster < 0) + return cluster; + + else if (cluster) { + *bmap = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + + return 0; +} + int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create) { @@ -311,7 +336,6 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; sector_t last_block; - int cluster, offset; *phys = 0; *mapped_blocks = 0; @@ -329,25 +353,39 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, return 0; /* - * ->mmu_private can access on only allocation path. - * (caller must hold ->i_mutex) + * Both ->mmu_private and ->i_disksize can access + * on only allocation path. (caller must hold ->i_mutex) */ - last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) >> blocksize_bits; if (sector >= last_block) return 0; } - cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); - offset = sector & (sbi->sec_per_clus - 1); - cluster = fat_bmap_cluster(inode, cluster); - if (cluster < 0) - return cluster; - else if (cluster) { - *phys = fat_clus_to_blknr(sbi, cluster) + offset; - *mapped_blocks = sbi->sec_per_clus - offset; - if (*mapped_blocks > last_block - sector) - *mapped_blocks = last_block - sector; - } - return 0; + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + phys); +} + +int fat_bmap2(struct inode *inode, sector_t sector, + unsigned long *mapped_blocks, struct buffer_head *bh_result, + int create, sector_t *bmap) +{ + struct super_block *sb = inode->i_sb; + sector_t last_block; + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + + BUG_ON(create != 0); + + *bmap = 0; + *mapped_blocks = 0; + + last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) + >> blocksize_bits; + + if (sector >= last_block) + return 0; + + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + bmap); } diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e0c4ba39a377..13b7202bd651 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -119,7 +119,8 @@ struct msdos_inode_info { unsigned int cache_valid_id; /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */ - loff_t mmu_private; /* physically allocated size */ + loff_t mmu_private; /* physically allocated size (initialized) */ + loff_t i_disksize; /* physically allocated size (uninitialized) */ int i_start; /* first cluster or 0 */ int i_logstart; /* logical first cluster */ @@ -290,6 +291,9 @@ extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create); +extern int fat_bmap2(struct inode *inode, sector_t sector, + unsigned long *mapped_blocks, + struct buffer_head *bh_result, int create, sector_t *bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; diff --git a/fs/fat/file.c b/fs/fat/file.c index 85f79a89e747..92e9e753b554 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -17,8 +17,12 @@ #include <linux/blkdev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); + static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -182,6 +186,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -220,6 +225,75 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int cluster; + int nr_cluster; /* Number of clusters to be allocated */ + loff_t mm_bytes; /* Number of bytes to be allocated for file */ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* No support for dir */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if ((offset + len) <= MSDOS_I(inode)->i_disksize) + goto error; + + err = inode_newsize_ok(inode, (len + offset)); + if (err) + goto error; + + if (mode & FALLOC_FL_KEEP_SIZE) { + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - round_up(MSDOS_I(inode)->i_disksize, + sbi->cluster_size); + nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_alloc_clusters() error"); + goto error; + } + err = fat_chain_add(inode, cluster, 1); + if (err) { + fat_free_clusters(inode, cluster); + goto error; + } + MSDOS_I(inode)->i_disksize += sbi->cluster_size; + } + } else { + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + if (err) + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_cont_expand() error"); + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { @@ -300,8 +374,10 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) * This protects against truncating a file bigger than it was then * trying to write into the hole. */ - if (MSDOS_I(inode)->mmu_private > offset) + if (MSDOS_I(inode)->i_disksize > offset) { MSDOS_I(inode)->mmu_private = offset; + MSDOS_I(inode)->i_disksize = offset; + } nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 756aead10d96..c3b86c58ad88 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -116,6 +116,25 @@ static int fat_add_cluster(struct inode *inode) return err; } +static void check_fallocated_region(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, struct buffer_head *bh_result) +{ + struct super_block *sb = inode->i_sb; + sector_t last_block, disk_block; + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + + last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + disk_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) + >> blocksize_bits; + if (iblock >= last_block && iblock <= disk_block) { + MSDOS_I(inode)->mmu_private += *max_blocks << blocksize_bits; + set_buffer_new(bh_result); + } + +} + static inline int __fat_get_block(struct inode *inode, sector_t iblock, unsigned long *max_blocks, struct buffer_head *bh_result, int create) @@ -130,8 +149,11 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, if (err) return err; if (phys) { - map_bh(bh_result, sb, phys); *max_blocks = min(mapped_blocks, *max_blocks); + if (create) + check_fallocated_region(inode, iblock, max_blocks, + bh_result); + map_bh(bh_result, sb, phys); return 0; } if (!create) @@ -155,6 +177,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + MSDOS_I(inode)->i_disksize = MSDOS_I(inode)->mmu_private; err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); if (err) @@ -269,6 +292,13 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, loff_t size = offset + count; if (MSDOS_I(inode)->mmu_private < size) return 0; + + /* + * In case of writing in fallocated region, return 0 and + * fallback to buffered write. + */ + if (MSDOS_I(inode)->i_disksize > MSDOS_I(inode)->mmu_private) + return 0; } /* @@ -282,13 +312,36 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, return ret; } +static int fat_get_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + sector_t bmap; + unsigned long mapped_blocks; + + err = fat_bmap2(inode, iblock, &mapped_blocks, bh_result, create, + &bmap); + if (err) + return err; + + if (bmap) { + map_bh(bh_result, sb, bmap); + max_blocks = min(mapped_blocks, max_blocks); + } + + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); - blocknr = generic_block_bmap(mapping, block, fat_get_block); + blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; @@ -469,7 +522,6 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) error = fat_calc_dir_size(inode); if (error < 0) return error; - MSDOS_I(inode)->mmu_private = inode->i_size; set_nlink(inode, fat_subdirs(inode)); } else { /* not a directory */ @@ -484,8 +536,12 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_op = &fat_file_inode_operations; inode->i_fop = &fat_file_operations; inode->i_mapping->a_ops = &fat_aops; - MSDOS_I(inode)->mmu_private = inode->i_size; } + + MSDOS_I(inode)->mmu_private = inode->i_size; + MSDOS_I(inode)->i_disksize = round_up(inode->i_size, + inode->i_sb->s_blocksize); + if (de->attr & ATTR_SYS) { if (sbi->options.sys_immutable) inode->i_flags |= S_IMMUTABLE; @@ -550,12 +606,34 @@ out: EXPORT_SYMBOL_GPL(fat_build_inode); +static int __fat_write_inode(struct inode *inode, int wait); static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); + } else { + /* Release unwritten fallocated blocks on inode eviction. */ + if (MSDOS_I(inode)->i_disksize > + round_up(MSDOS_I(inode)->mmu_private, + inode->i_sb->s_blocksize)) { + int err; + fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); + /* Fallocate results in updating the i_start/iogstart + * for the zero byte file. So, make it return to + * original state during evict and commit it to avoid + * any corruption on the next access to the cluster + * chain for the file. + */ + err = __fat_write_inode(inode, inode_needs_sync(inode)); + if (err) { + fat_msg(inode->i_sb, KERN_WARNING, "Failed to " + "update on disk inode for unused fallocated " + "blocks, inode could be corrupted. Please run " + "fsck"); + } + } } invalidate_inode_buffers(inode); clear_inode(inode); @@ -1293,6 +1371,7 @@ static int fat_read_root(struct inode *inode) & ~((loff_t)sbi->cluster_size - 1)) >> 9; MSDOS_I(inode)->i_logstart = 0; MSDOS_I(inode)->mmu_private = inode->i_size; + MSDOS_I(inode)->i_disksize = inode->i_size; fat_save_attrs(inode, ATTR_DIR); inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; diff --git a/fs/fcntl.c b/fs/fcntl.c index 72c82f69b01b..22d1c3df61ac 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -21,6 +21,7 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/shmem_fs.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -336,6 +337,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, arg); break; + case F_ADD_SEALS: + case F_GET_SEALS: + err = shmem_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/fs/inode.c b/fs/inode.c index 5938f3928944..26753ba7b6d6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -165,6 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->backing_dev_info = &default_backing_dev_info; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index ab798a88ec1d..2a6170133c1d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -667,22 +667,16 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l * deferred rpc for GETLK and SETLK. */ static void -nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf, - int result) +nlmsvc_update_deferred_block(struct nlm_block *block, int result) { block->b_flags |= B_GOT_CALLBACK; if (result == 0) block->b_granted = 1; else block->b_flags |= B_TIMED_OUT; - if (conf) { - if (block->b_fl) - __locks_copy_lock(block->b_fl, conf); - } } -static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, - int result) +static int nlmsvc_grant_deferred(struct file_lock *fl, int result) { struct nlm_block *block; int rc = -ENOENT; @@ -697,7 +691,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, rc = -ENOLCK; break; } - nlmsvc_update_deferred_block(block, conf, result); + nlmsvc_update_deferred_block(block, result); } else if (result == 0) block->b_granted = 1; diff --git a/include/asm-generic/pci-dma-compat.h b/include/asm-generic/pci-dma-compat.h index 1437b7da09b2..c110843fc53b 100644 --- a/include/asm-generic/pci-dma-compat.h +++ b/include/asm-generic/pci-dma-compat.h @@ -19,6 +19,14 @@ pci_alloc_consistent(struct pci_dev *hwdev, size_t size, return dma_alloc_coherent(hwdev == NULL ? NULL : &hwdev->dev, size, dma_handle, GFP_ATOMIC); } +static inline void * +pci_zalloc_consistent(struct pci_dev *hwdev, size_t size, + dma_addr_t *dma_handle) +{ + return dma_zalloc_coherent(hwdev == NULL ? NULL : &hwdev->dev, + size, dma_handle, GFP_ATOMIC); +} + static inline void pci_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) diff --git a/include/linux/efi.h b/include/linux/efi.h index efc681fd5895..45cb4ffdea62 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1156,6 +1156,9 @@ int efivars_sysfs_init(void); #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); void efi_runtime_map_setup(void *, int, u32); +int efi_get_runtime_map_size(void); +int efi_get_runtime_map_desc_size(void); +int efi_runtime_map_copy(void *buf, size_t bufsz); #else static inline int efi_runtime_map_init(struct kobject *kobj) { @@ -1164,6 +1167,22 @@ static inline int efi_runtime_map_init(struct kobject *kobj) static inline void efi_runtime_map_setup(void *map, int nr_entries, u32 desc_size) {} + +static inline int efi_get_runtime_map_size(void) +{ + return 0; +} + +static inline int efi_get_runtime_map_desc_size(void) +{ + return 0; +} + +static inline int efi_runtime_map_copy(void *buf, size_t bufsz) +{ + return 0; +} + #endif /* prototypes shared between arch specific and generic stub code */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 1d9acf7cfdb3..98bfe7ecff78 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1,7 +1,6 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H - #include <linux/linkage.h> #include <linux/wait.h> #include <linux/kdev_t.h> @@ -50,12 +49,12 @@ struct seq_file; struct workqueue_struct; struct iov_iter; -extern void __init inode_init(void); -extern void __init inode_init_early(void); -extern void __init files_init(unsigned long); +void __init inode_init(void); +void __init inode_init_early(void); +void __init files_init(unsigned long); extern struct files_stat_struct files_stat; -extern unsigned long get_max_files(void); +unsigned long get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; @@ -64,9 +63,9 @@ extern int sysctl_protected_hardlinks; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); + struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, - ssize_t bytes, void *private); + ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 @@ -192,8 +191,8 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define READ 0 #define WRITE RW_MASK #define READA RWA_MASK -#define KERNEL_READ (READ|REQ_KERNEL) -#define KERNEL_WRITE (WRITE|REQ_KERNEL) +#define KERNEL_READ (READ | REQ_KERNEL) +#define KERNEL_WRITE (WRITE | REQ_KERNEL) #define READ_SYNC (READ | REQ_SYNC) #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) @@ -256,25 +255,25 @@ struct iattr { */ #include <linux/quota.h> -/** +/** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has - * completed, that the page is still locked, and - * should be considered active. The VM uses this hint - * to return the page to the active list -- it won't - * be a candidate for writeback again in the near - * future. Other callers must be careful to unlock - * the page if they get this return. Returned by - * writepage(); + * completed, that the page is still locked, and + * should be considered active. The VM uses this hint + * to return the page to the active list -- it won't + * be a candidate for writeback again in the near + * future. Other callers must be careful to unlock + * the page if they get this return. Returned by + * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has - * unlocked it and the page might have been truncated. - * The caller should back up to acquiring a new page and - * trying again. The aop will be taking reasonable - * precautions not to livelock. If the caller held a page - * reference, it should drop it before retrying. Returned - * by readpage(). + * unlocked it and the page might have been truncated. + * The caller should back up to acquiring a new page and + * trying again. The aop will be taking reasonable + * precautions not to livelock. If the caller held a page + * reference, it should drop it before retrying. Returned + * by readpage(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a @@ -320,7 +319,7 @@ typedef struct { } read_descriptor_t; typedef int (*read_actor_t)(read_descriptor_t *, struct page *, - unsigned long, unsigned long); + unsigned long, unsigned long); struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); @@ -333,38 +332,38 @@ struct address_space_operations { int (*set_page_dirty)(struct page *page); int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); + struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); - int (*releasepage) (struct page *, gfp_t); + void (*invalidatepage)(struct page *, unsigned int, unsigned int); + int (*releasepage)(struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); int (*get_xip_mem)(struct address_space *, pgoff_t, int, - void **, unsigned long *); + void **, unsigned long *); /* * migrate the contents of a page to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ - int (*migratepage) (struct address_space *, - struct page *, struct page *, enum migrate_mode); - int (*launder_page) (struct page *); - int (*is_partially_uptodate) (struct page *, unsigned long, - unsigned long); - void (*is_dirty_writeback) (struct page *, bool *, bool *); + int (*migratepage)(struct address_space *, + struct page *, struct page *, enum migrate_mode); + int (*launder_page)(struct page *); + int (*is_partially_uptodate)(struct page *, unsigned long, + unsigned long); + void (*is_dirty_writeback)(struct page *, bool *, bool *); int (*error_remove_page)(struct address_space *, struct page *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, - sector_t *span); + sector_t *span); void (*swap_deactivate)(struct file *file); }; @@ -375,19 +374,19 @@ extern const struct address_space_operations empty_aops; * to write into the pagecache. */ int pagecache_write_begin(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); int pagecache_write_end(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ - unsigned int i_mmap_writable;/* count VM_SHARED mappings */ + atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ @@ -402,35 +401,35 @@ struct address_space { struct list_head private_list; /* ditto */ void *private_data; /* ditto */ } __attribute__((aligned(sizeof(long)))); - /* - * On most architectures that alignment is already the case; but - * must be enforced here for CRIS, to let the least significant bit - * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. - */ +/* + * On most architectures that alignment is already the case; but + * must be enforced here for CRIS, to let the least significant bit + * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. + */ struct request_queue; struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ int bd_openers; - struct inode * bd_inode; /* will die */ - struct super_block * bd_super; + struct inode *bd_inode; /* will die */ + struct super_block *bd_super; struct mutex bd_mutex; /* open/close mutex */ struct list_head bd_inodes; - void * bd_claiming; - void * bd_holder; + void *bd_claiming; + void *bd_holder; int bd_holders; bool bd_write_holder; #ifdef CONFIG_SYSFS struct list_head bd_holder_disks; #endif - struct block_device * bd_contains; + struct block_device *bd_contains; unsigned bd_block_size; - struct hd_struct * bd_part; + struct hd_struct *bd_part; /* number of times partitions within this device have been opened. */ unsigned bd_part_count; int bd_invalidated; - struct gendisk * bd_disk; - struct request_queue * bd_queue; + struct gendisk *bd_disk; + struct request_queue *bd_queue; struct list_head bd_list; /* * Private data. You must have bd_claim'ed the block_device @@ -461,8 +460,8 @@ int mapping_tagged(struct address_space *mapping, int tag); */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap) || - !list_empty(&mapping->i_mmap_nonlinear); + return !RB_EMPTY_ROOT(&mapping->i_mmap) || + !list_empty(&mapping->i_mmap_nonlinear); } /* @@ -470,16 +469,41 @@ static inline int mapping_mapped(struct address_space *mapping) * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. + * + * If i_mmap_writable is negative, no new writable mappings are allowed. You + * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { - return mapping->i_mmap_writable != 0; + return atomic_read(&mapping->i_mmap_writable) > 0; +} + +static inline int mapping_map_writable(struct address_space *mapping) +{ + return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? + 0 : -EPERM; +} + +static inline void mapping_unmap_writable(struct address_space *mapping) +{ + atomic_dec(&mapping->i_mmap_writable); +} + +static inline int mapping_deny_writable(struct address_space *mapping) +{ + return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? + 0 : -EBUSY; +} + +static inline void mapping_allow_writable(struct address_space *mapping) +{ + atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. */ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) @@ -609,8 +633,7 @@ static inline int inode_unhashed(struct inode *inode) * The locking order between these classes is * parent -> child -> normal -> xattr -> second non-directory */ -enum inode_i_mutex_lock_class -{ +enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, @@ -633,7 +656,7 @@ void unlock_two_nondirectories(struct inode *, struct inode*); */ static inline loff_t i_size_read(const struct inode *inode) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; @@ -642,7 +665,7 @@ static inline loff_t i_size_read(const struct inode *inode) i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) +#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT) loff_t i_size; preempt_disable(); @@ -661,13 +684,13 @@ static inline loff_t i_size_read(const struct inode *inode) */ static inline void i_size_write(struct inode *inode, loff_t i_size) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) +#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT) preempt_disable(); inode->i_size = i_size; preempt_enable(); @@ -711,7 +734,7 @@ static inline unsigned imajor(const struct inode *inode) return MAJOR(inode->i_rdev); } -extern struct block_device *I_BDEV(struct inode *inode); +struct block_device *I_BDEV(struct inode *inode); struct fown_struct { rwlock_t lock; /* protects pid, uid, euid fields */ @@ -747,7 +770,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) struct file { union { struct llist_node fu_llist; - struct rcu_head fu_rcuhead; + struct rcu_head fu_rcuhead; } f_u; struct path f_path; #define f_dentry f_path.dentry @@ -760,7 +783,7 @@ struct file { */ spinlock_t f_lock; atomic_long_t f_count; - unsigned int f_flags; + unsigned int f_flags; fmode_t f_mode; struct mutex f_pos_lock; loff_t f_pos; @@ -800,12 +823,12 @@ static inline struct file *get_file(struct file *f) #define MAX_NON_LFS ((1UL<<31) - 1) -/* Page cache limit. The filesystems should put that into their s_maxbytes - limits, otherwise bad things can happen in VM. */ -#if BITS_PER_LONG==32 -#define MAX_LFS_FILESIZE (((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) -#elif BITS_PER_LONG==64 -#define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL) +/* Page cache limit. The filesystems should put that into their s_maxbytes + limits, otherwise bad things can happen in VM. */ +#if BITS_PER_LONG == 32 +#define MAX_LFS_FILESIZE (((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG - 1)) - 1) +#elif BITS_PER_LONG == 64 +#define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL) #endif #define FL_POSIX 1 @@ -836,17 +859,17 @@ static inline struct file *get_file(struct file *f) typedef void *fl_owner_t; struct file_lock_operations { - void (*fl_copy_lock)(struct file_lock *, struct file_lock *); - void (*fl_release_private)(struct file_lock *); + void (*fl_copy_lock)(struct file_lock *dst, struct file_lock *fl); + void (*fl_release_private)(struct file_lock *fl); }; struct lock_manager_operations { - int (*lm_compare_owner)(struct file_lock *, struct file_lock *); - unsigned long (*lm_owner_key)(struct file_lock *); - void (*lm_notify)(struct file_lock *); /* unblock callback */ - int (*lm_grant)(struct file_lock *, struct file_lock *, int); - void (*lm_break)(struct file_lock *); - int (*lm_change)(struct file_lock **, int); + int (*lm_compare_owner)(struct file_lock *fl1, struct file_lock *fl2); + unsigned long (*lm_owner_key)(struct file_lock *fl); + void (*lm_notify)(struct file_lock *fl); /* unblock callback */ + int (*lm_grant)(struct file_lock *fl, int result); + void (*lm_break)(struct file_lock *fl); + int (*lm_change)(struct file_lock **fl, int type); }; struct lock_manager { @@ -894,7 +917,7 @@ struct file_lock { loff_t fl_start; loff_t fl_end; - struct fasync_struct * fl_fasync; /* for lease break notifications */ + struct fasync_struct *fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; @@ -920,46 +943,47 @@ struct file_lock { #include <linux/fcntl.h> -extern void send_sigio(struct fown_struct *fown, int fd, int band); +void send_sigio(struct fown_struct *fown, int fd, int band); #ifdef CONFIG_FILE_LOCKING -extern int fcntl_getlk(struct file *, unsigned int, struct flock __user *); -extern int fcntl_setlk(unsigned int, struct file *, unsigned int, - struct flock __user *); +int fcntl_getlk(struct file *, unsigned int, struct flock __user *); +int fcntl_setlk(unsigned int, struct file *, unsigned int, + struct flock __user *); #if BITS_PER_LONG == 32 -extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 __user *); -extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, - struct flock64 __user *); +int fcntl_getlk64(struct file *, unsigned int, struct flock64 __user *); +int fcntl_setlk64(unsigned int, struct file *, unsigned int, + struct flock64 __user *); #endif -extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); -extern int fcntl_getlease(struct file *filp); +int fcntl_setlease(unsigned int fd, struct file *filp, long arg); +int fcntl_getlease(struct file *filp); /* fs/locks.c */ void locks_free_lock(struct file_lock *fl); -extern void locks_init_lock(struct file_lock *); -extern struct file_lock * locks_alloc_lock(void); -extern void locks_copy_lock(struct file_lock *, struct file_lock *); -extern void __locks_copy_lock(struct file_lock *, const struct file_lock *); -extern void locks_remove_posix(struct file *, fl_owner_t); -extern void locks_remove_file(struct file *); -extern void locks_release_private(struct file_lock *); -extern void posix_test_lock(struct file *, struct file_lock *); -extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); -extern int posix_lock_file_wait(struct file *, struct file_lock *); -extern int posix_unblock_lock(struct file_lock *); -extern int vfs_test_lock(struct file *, struct file_lock *); -extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); -extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); -extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); -extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); -extern void lease_get_mtime(struct inode *, struct timespec *time); -extern int generic_setlease(struct file *, long, struct file_lock **); -extern int vfs_setlease(struct file *, long, struct file_lock **); -extern int lease_modify(struct file_lock **, int); -extern int lock_may_read(struct inode *, loff_t start, unsigned long count); -extern int lock_may_write(struct inode *, loff_t start, unsigned long count); +void locks_init_lock(struct file_lock *); +struct file_lock *locks_alloc_lock(void); +void locks_copy_lock(struct file_lock *, struct file_lock *); +void __locks_copy_lock(struct file_lock *, const struct file_lock *); +void locks_remove_posix(struct file *, fl_owner_t); +void locks_remove_file(struct file *); +void locks_release_private(struct file_lock *); +void posix_test_lock(struct file *, struct file_lock *); +int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); +int posix_lock_file_wait(struct file *, struct file_lock *); +int posix_unblock_lock(struct file_lock *); +int vfs_test_lock(struct file *, struct file_lock *); +int vfs_lock_file(struct file *, unsigned int, struct file_lock *, + struct file_lock *); +int vfs_cancel_lock(struct file *filp, struct file_lock *fl); +int flock_lock_file_wait(struct file *filp, struct file_lock *fl); +int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); +void lease_get_mtime(struct inode *, struct timespec *time); +int generic_setlease(struct file *, long, struct file_lock **); +int vfs_setlease(struct file *, long, struct file_lock **); +int lease_modify(struct file_lock **, int); +int lock_may_read(struct inode *, loff_t start, unsigned long count); +int lock_may_write(struct inode *, loff_t start, unsigned long count); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -998,32 +1022,26 @@ static inline int fcntl_getlease(struct file *filp) static inline void locks_init_lock(struct file_lock *fl) { - return; } static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl) { - return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { - return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { - return; } static inline void locks_remove_file(struct file *filp) { - return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { - return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, @@ -1064,18 +1082,18 @@ static inline int flock_lock_file_wait(struct file *filp, return -ENOLCK; } -static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +static inline int __break_lease(struct inode *inode, unsigned int mode, + unsigned int type) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec *time) { - return; } static inline int generic_setlease(struct file *filp, long arg, - struct file_lock **flp) + struct file_lock **flp) { return -EINVAL; } @@ -1104,7 +1122,6 @@ static inline int lock_may_write(struct inode *inode, loff_t start, } #endif /* !CONFIG_FILE_LOCKING */ - struct fasync_struct { spinlock_t fa_lock; int magic; @@ -1117,20 +1134,22 @@ struct fasync_struct { #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ -extern int fasync_helper(int, struct file *, int, struct fasync_struct **); -extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); -extern int fasync_remove_entry(struct file *, struct fasync_struct **); -extern struct fasync_struct *fasync_alloc(void); -extern void fasync_free(struct fasync_struct *); +int fasync_helper(int, struct file *, int, struct fasync_struct **); +struct fasync_struct *fasync_insert_entry(int, struct file *, + struct fasync_struct **, + struct fasync_struct *); +int fasync_remove_entry(struct file *, struct fasync_struct **); +struct fasync_struct *fasync_alloc(void); +void fasync_free(struct fasync_struct *); /* can be called from interrupts */ -extern void kill_fasync(struct fasync_struct **, int, int); +void kill_fasync(struct fasync_struct **, int, int); -extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); -extern int f_setown(struct file *filp, unsigned long arg, int force); -extern void f_delown(struct file *filp); -extern pid_t f_getown(struct file *filp); -extern int send_sigurg(struct fown_struct *fown); +int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); +int f_setown(struct file *filp, unsigned long arg, int force); +void f_delown(struct file *filp); +pid_t f_getown(struct file *filp); +int send_sigurg(struct fown_struct *fown); struct mm_struct; @@ -1208,7 +1227,7 @@ struct super_block { char s_id[32]; /* Informational name */ u8 s_uuid[16]; /* UUID */ - void *s_fs_info; /* Filesystem private info */ + void *s_fs_info; /* Filesystem private info */ unsigned int s_max_links; fmode_t s_mode; @@ -1261,7 +1280,7 @@ struct super_block { struct rcu_head rcu; }; -extern struct timespec current_fs_time(struct super_block *sb); +struct timespec current_fs_time(struct super_block *sb); /* * Snapshotting support. @@ -1377,31 +1396,31 @@ static inline void sb_start_intwrite(struct super_block *sb) __sb_start_write(sb, SB_FREEZE_FS, true); } - -extern bool inode_owner_or_capable(const struct inode *inode); +bool inode_owner_or_capable(const struct inode *inode); /* * VFS helper functions.. */ -extern int vfs_create(struct inode *, struct dentry *, umode_t, bool); -extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); -extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); -extern int vfs_symlink(struct inode *, struct dentry *, const char *); -extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **); -extern int vfs_rmdir(struct inode *, struct dentry *); -extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); -extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +int vfs_create(struct inode *, struct dentry *, umode_t, bool); +int vfs_mkdir(struct inode *, struct dentry *, umode_t); +int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +int vfs_symlink(struct inode *, struct dentry *, const char *); +int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **); +int vfs_rmdir(struct inode *, struct dentry *); +int vfs_unlink(struct inode *, struct dentry *, struct inode **); +int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, + struct inode **, unsigned int); /* * VFS dentry helper functions. */ -extern void dentry_unhash(struct dentry *dentry); +void dentry_unhash(struct dentry *dentry); /* * VFS file helper functions. */ -extern void inode_init_owner(struct inode *inode, const struct inode *dir, - umode_t mode); +void inode_init_owner(struct inode *inode, const struct inode *dir, + umode_t mode); /* * VFS FS_IOC_FIEMAP helper definitions. */ @@ -1410,7 +1429,7 @@ struct fiemap_extent_info { unsigned int fi_extents_mapped; /* Number of mapped extents */ unsigned int fi_extents_max; /* Size of fiemap_extent array */ struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ + fiemap_extent array */ }; int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, u64 phys, u64 len, u32 flags); @@ -1456,31 +1475,38 @@ struct iov_iter; struct file_operations { struct module *owner; - loff_t (*llseek) (struct file *, loff_t, int); - ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); - ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); - ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); - ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); - ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); - ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iterate) (struct file *, struct dir_context *); - unsigned int (*poll) (struct file *, struct poll_table_struct *); - long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); - long (*compat_ioctl) (struct file *, unsigned int, unsigned long); - int (*mmap) (struct file *, struct vm_area_struct *); - int (*open) (struct inode *, struct file *); - int (*flush) (struct file *, fl_owner_t id); - int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, loff_t, loff_t, int datasync); - int (*aio_fsync) (struct kiocb *, int datasync); - int (*fasync) (int, struct file *, int); - int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + loff_t (*llseek)(struct file *, loff_t, int); + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + ssize_t (*aio_read)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + ssize_t (*aio_write)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + ssize_t (*read_iter)(struct kiocb *, struct iov_iter *); + ssize_t (*write_iter)(struct kiocb *, struct iov_iter *); + int (*iterate)(struct file *, struct dir_context *); + unsigned int (*poll)(struct file *, struct poll_table_struct *); + long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long); + long (*compat_ioctl)(struct file *, unsigned int, unsigned long); + int (*mmap)(struct file *, struct vm_area_struct *); + int (*open)(struct inode *, struct file *); + int (*flush)(struct file *, fl_owner_t id); + int (*release)(struct inode *, struct file *); + int (*fsync)(struct file *, loff_t, loff_t, int datasync); + int (*aio_fsync)(struct kiocb *, int datasync); + int (*fasync)(int, struct file *, int); + int (*lock)(struct file *, int, struct file_lock *); + ssize_t (*sendpage)(struct file *, struct page *, int, size_t, + loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, + unsigned long, unsigned long, + unsigned long); int (*check_flags)(int); - int (*flock) (struct file *, int, struct file_lock *); - ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); - ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); + int (*flock)(struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); @@ -1488,76 +1514,79 @@ struct file_operations { }; struct inode_operations { - struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - void * (*follow_link) (struct dentry *, struct nameidata *); - int (*permission) (struct inode *, int); + struct dentry *(*lookup)(struct inode *, struct dentry *, unsigned int); + void *(*follow_link)(struct dentry *, struct nameidata *); + int (*permission)(struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); - int (*readlink) (struct dentry *, char __user *,int); - void (*put_link) (struct dentry *, struct nameidata *, void *); - - int (*create) (struct inode *,struct dentry *, umode_t, bool); - int (*link) (struct dentry *,struct inode *,struct dentry *); - int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,umode_t); - int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *); - int (*rename2) (struct inode *, struct dentry *, - struct inode *, struct dentry *, unsigned int); - int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); - int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); - ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); - ssize_t (*listxattr) (struct dentry *, char *, size_t); - int (*removexattr) (struct dentry *, const char *); + int (*readlink)(struct dentry *, char __user *, int); + void (*put_link)(struct dentry *, struct nameidata *, void *); + + int (*create)(struct inode *, struct dentry *, umode_t, bool); + int (*link)(struct dentry *, struct inode *, struct dentry *); + int (*unlink)(struct inode *, struct dentry *); + int (*symlink)(struct inode *, struct dentry *, const char *); + int (*mkdir)(struct inode *, struct dentry *, umode_t); + int (*rmdir)(struct inode *, struct dentry *); + int (*mknod)(struct inode *, struct dentry *, umode_t, dev_t); + int (*rename)(struct inode *, struct dentry *, + struct inode *, struct dentry *); + int (*rename2)(struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); + int (*setattr)(struct dentry *, struct iattr *); + int (*getattr)(struct vfsmount *mnt, struct dentry *, struct kstat *); + int (*setxattr)(struct dentry *, const char *, const void *, size_t, + int); + ssize_t (*getxattr)(struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr)(struct dentry *, char *, size_t); + int (*removexattr)(struct dentry *, const char *); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); - int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*tmpfile)(struct inode *, struct dentry *, umode_t); int (*set_acl)(struct inode *, struct posix_acl *, int); } ____cacheline_aligned; -ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, +ssize_t rw_copy_check_uvector(int type, const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, struct iovec **ret_pointer); -extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); -extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); -extern ssize_t vfs_readv(struct file *, const struct iovec __user *, - unsigned long, loff_t *); -extern ssize_t vfs_writev(struct file *, const struct iovec __user *, - unsigned long, loff_t *); +ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); +ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); +ssize_t vfs_readv(struct file *, const struct iovec __user *, unsigned long, + loff_t *); +ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, + loff_t *); struct super_operations { - struct inode *(*alloc_inode)(struct super_block *sb); + struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); - void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, struct writeback_control *wbc); - int (*drop_inode) (struct inode *); - void (*evict_inode) (struct inode *); - void (*put_super) (struct super_block *); + void (*dirty_inode)(struct inode *, int flags); + int (*write_inode)(struct inode *, struct writeback_control *wbc); + int (*drop_inode)(struct inode *); + void (*evict_inode)(struct inode *); + void (*put_super)(struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_fs) (struct super_block *); - int (*unfreeze_fs) (struct super_block *); - int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); - void (*umount_begin) (struct super_block *); + int (*freeze_fs)(struct super_block *); + int (*unfreeze_fs)(struct super_block *); + int (*statfs)(struct dentry *, struct kstatfs *); + int (*remount_fs)(struct super_block *, int *, char *); + void (*umount_begin)(struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + ssize_t (*quota_read)(struct super_block *, int, char *, size_t, + loff_t); + ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, + loff_t); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); long (*nr_cached_objects)(struct super_block *, int); @@ -1597,12 +1626,12 @@ struct super_operations { #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) -#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ - ((inode)->i_flags & S_SYNC)) -#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ - ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) +#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ + ((inode)->i_flags & S_SYNC)) +#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS | MS_DIRSYNC) || \ + ((inode)->i_flags & (S_SYNC | S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) -#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) +#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY | MS_NOATIME) #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) @@ -1688,7 +1717,7 @@ struct super_operations { #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) -extern void __mark_inode_dirty(struct inode *, int); +void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY); @@ -1699,10 +1728,10 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } -extern void inc_nlink(struct inode *inode); -extern void drop_nlink(struct inode *inode); -extern void clear_nlink(struct inode *inode); -extern void set_nlink(struct inode *inode, unsigned int nlink); +void inc_nlink(struct inode *inode); +void drop_nlink(struct inode *inode); +void clear_nlink(struct inode *inode); +void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { @@ -1726,9 +1755,9 @@ static inline void inode_dec_link_count(struct inode *inode) static inline void inode_inc_iversion(struct inode *inode) { - spin_lock(&inode->i_lock); - inode->i_version++; - spin_unlock(&inode->i_lock); + spin_lock(&inode->i_lock); + inode->i_version++; + spin_unlock(&inode->i_lock); } enum file_time_flags { @@ -1738,7 +1767,7 @@ enum file_time_flags { S_VERSION = 8, }; -extern void touch_atime(const struct path *); +void touch_atime(const struct path *); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) @@ -1751,17 +1780,17 @@ int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; int fs_flags; -#define FS_REQUIRES_DEV 1 +#define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); - void (*kill_sb) (struct super_block *); + struct dentry *(*mount)(struct file_system_type *, int, + const char *, void *); + void (*kill_sb)(struct super_block *); struct module *owner; - struct file_system_type * next; + struct file_system_type *next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; @@ -1776,18 +1805,19 @@ struct file_system_type { #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) -extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, - void *data, int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); +struct dentry *mount_ns(struct file_system_type *fs_type, int flags, + void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct dentry *mount_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct dentry *mount_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct dentry *mount_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); @@ -1798,56 +1828,56 @@ int set_anon_super(struct super_block *s, void *data); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); struct super_block *sget(struct file_system_type *type, - int (*test)(struct super_block *,void *), - int (*set)(struct super_block *,void *), - int flags, void *data); -extern struct dentry *mount_pseudo(struct file_system_type *, char *, - const struct super_operations *ops, - const struct dentry_operations *dops, - unsigned long); + int (*test)(struct super_block *, void *), + int (*set)(struct super_block *, void *), + int flags, void *data); +struct dentry *mount_pseudo(struct file_system_type *, char *, + const struct super_operations *ops, + const struct dentry_operations *dops, + unsigned long); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ -#define fops_get(fops) \ +#define fops_get(fops) \ (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) -#define fops_put(fops) \ - do { if (fops) module_put((fops)->owner); } while(0) +#define fops_put(fops) \ + do { if (fops) module_put((fops)->owner); } while (0) /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies * should be sufficient to pin the caller down as well. */ -#define replace_fops(f, fops) \ - do { \ - struct file *__file = (f); \ - fops_put(__file->f_op); \ - BUG_ON(!(__file->f_op = (fops))); \ - } while(0) - -extern int register_filesystem(struct file_system_type *); -extern int unregister_filesystem(struct file_system_type *); -extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); +#define replace_fops(f, fops) \ + do { \ + struct file *__file = (f); \ + fops_put(__file->f_op); \ + BUG_ON(!(__file->f_op = (fops))); \ + } while (0) + +int register_filesystem(struct file_system_type *); +int unregister_filesystem(struct file_system_type *); +struct vfsmount *kern_mount_data(struct file_system_type *, void *data); #define kern_mount(type) kern_mount_data(type, NULL) -extern void kern_unmount(struct vfsmount *mnt); -extern int may_umount_tree(struct vfsmount *); -extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char *, const char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(struct path *); -extern void drop_collected_mounts(struct vfsmount *); -extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, - struct vfsmount *); -extern int vfs_statfs(struct path *, struct kstatfs *); -extern int user_statfs(const char __user *, struct kstatfs *); -extern int fd_statfs(int, struct kstatfs *); -extern int vfs_ustat(dev_t, struct kstatfs *); -extern int freeze_super(struct super_block *super); -extern int thaw_super(struct super_block *super); -extern bool our_mnt(struct vfsmount *mnt); -extern bool fs_fully_visible(struct file_system_type *); - -extern int current_umask(void); - -extern void ihold(struct inode * inode); -extern void iput(struct inode *); +void kern_unmount(struct vfsmount *mnt); +int may_umount_tree(struct vfsmount *); +int may_umount(struct vfsmount *); +long do_mount(const char *, const char *, const char *, unsigned long, void *); +struct vfsmount *collect_mounts(struct path *); +void drop_collected_mounts(struct vfsmount *); +int iterate_mounts(int (*)(struct vfsmount *, void *), void *, + struct vfsmount *); +int vfs_statfs(struct path *, struct kstatfs *); +int user_statfs(const char __user *, struct kstatfs *); +int fd_statfs(int, struct kstatfs *); +int vfs_ustat(dev_t, struct kstatfs *); +int freeze_super(struct super_block *super); +int thaw_super(struct super_block *super); +bool our_mnt(struct vfsmount *mnt); +bool fs_fully_visible(struct file_system_type *); + +int current_umask(void); + +void ihold(struct inode *inode); +void iput(struct inode *); static inline struct inode *file_inode(struct file *f) { @@ -1863,8 +1893,8 @@ extern struct kobject *fs_kobj; #define FLOCK_VERIFY_WRITE 2 #ifdef CONFIG_FILE_LOCKING -extern int locks_mandatory_locked(struct file *); -extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); +int locks_mandatory_locked(struct file *); +int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); /* * Candidates for mandatory locking have the setgid bit set @@ -1894,8 +1924,8 @@ static inline int locks_verify_locked(struct file *file) } static inline int locks_verify_truncate(struct inode *inode, - struct file *filp, - loff_t size) + struct file *filp, + loff_t size) { if (inode->i_flock && mandatory_lock(inode)) return locks_mandatory_area( @@ -1903,7 +1933,7 @@ static inline int locks_verify_truncate(struct inode *inode, size < inode->i_size ? size : inode->i_size, (size < inode->i_size ? inode->i_size - size : size - inode->i_size) - ); + ); return 0; } @@ -1933,11 +1963,12 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +static inline int try_break_deleg(struct inode *inode, + struct inode **delegated_inode) { int ret; - ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); + ret = break_deleg(inode, O_WRONLY | O_NONBLOCK); if (ret == -EWOULDBLOCK && delegated_inode) { *delegated_inode = inode; ihold(inode); @@ -1999,7 +2030,8 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +static inline int try_break_deleg(struct inode *inode, + struct inode **delegated_inode) { return 0; } @@ -2021,69 +2053,66 @@ struct filename { bool separate; /* should "name" be freed? */ }; -extern long vfs_truncate(struct path *, loff_t); -extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, - struct file *filp); -extern int do_fallocate(struct file *file, int mode, loff_t offset, - loff_t len); -extern long do_sys_open(int dfd, const char __user *filename, int flags, - umode_t mode); -extern struct file *file_open_name(struct filename *, int, umode_t); -extern struct file *filp_open(const char *, int, umode_t); -extern struct file *file_open_root(struct dentry *, struct vfsmount *, - const char *, int); -extern struct file * dentry_open(const struct path *, int, const struct cred *); -extern int filp_close(struct file *, fl_owner_t id); - -extern struct filename *getname(const char __user *); -extern struct filename *getname_kernel(const char *); +long vfs_truncate(struct path *, loff_t); +int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, + struct file *filp); +int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len); +long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); +struct file *file_open_name(struct filename *, int, umode_t); +struct file *filp_open(const char *, int, umode_t); +struct file *file_open_root(struct dentry *, struct vfsmount *, + const char *, int); +struct file *dentry_open(const struct path *, int, const struct cred *); +int filp_close(struct file *, fl_owner_t id); + +struct filename *getname(const char __user *); +struct filename *getname_kernel(const char *); enum { FILE_CREATED = 1, FILE_OPENED = 2 }; -extern int finish_open(struct file *file, struct dentry *dentry, - int (*open)(struct inode *, struct file *), - int *opened); -extern int finish_no_open(struct file *file, struct dentry *dentry); +int finish_open(struct file *file, struct dentry *dentry, + int (*open)(struct inode *, struct file *), int *opened); +int finish_no_open(struct file *file, struct dentry *dentry); /* fs/ioctl.c */ -extern int ioctl_preallocate(struct file *filp, void __user *argp); +int ioctl_preallocate(struct file *filp, void __user *argp); /* fs/dcache.c */ -extern void __init vfs_caches_init_early(void); -extern void __init vfs_caches_init(unsigned long); +void __init vfs_caches_init_early(void); +void __init vfs_caches_init(unsigned long); extern struct kmem_cache *names_cachep; -extern void final_putname(struct filename *name); +void final_putname(struct filename *name); #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) #ifndef CONFIG_AUDITSYSCALL #define putname(name) final_putname(name) #else -extern void putname(struct filename *name); +void putname(struct filename *name); #endif #ifdef CONFIG_BLOCK -extern int register_blkdev(unsigned int, const char *); -extern void unregister_blkdev(unsigned int, const char *); -extern struct block_device *bdget(dev_t); -extern struct block_device *bdgrab(struct block_device *bdev); -extern void bd_set_size(struct block_device *, loff_t size); -extern void bd_forget(struct inode *inode); -extern void bdput(struct block_device *); -extern void invalidate_bdev(struct block_device *); -extern void iterate_bdevs(void (*)(struct block_device *, void *), void *); -extern int sync_blockdev(struct block_device *bdev); -extern void kill_bdev(struct block_device *); -extern struct super_block *freeze_bdev(struct block_device *); -extern void emergency_thaw_all(void); -extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); -extern int fsync_bdev(struct block_device *); -extern int sb_is_blkdev_sb(struct super_block *sb); +int register_blkdev(unsigned int, const char *); +void unregister_blkdev(unsigned int, const char *); +struct block_device *bdget(dev_t); +struct block_device *bdgrab(struct block_device *bdev); +void bd_set_size(struct block_device *, loff_t size); +void bd_forget(struct inode *inode); +void bdput(struct block_device *); +void invalidate_bdev(struct block_device *); +void iterate_bdevs(void (*)(struct block_device *, void *), void *); +int sync_blockdev(struct block_device *bdev); +void kill_bdev(struct block_device *); +struct super_block *freeze_bdev(struct block_device *); +void emergency_thaw_all(void); +int thaw_bdev(struct block_device *bdev, struct super_block *sb); +int fsync_bdev(struct block_device *); +int sb_is_blkdev_sb(struct super_block *sb); #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } @@ -2109,24 +2138,23 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) return 0; } #endif -extern int sync_filesystem(struct super_block *); +int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; extern const struct file_operations bad_sock_fops; #ifdef CONFIG_BLOCK -extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); -extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); -extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); -extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); -extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder); -extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, - void *holder); -extern void blkdev_put(struct block_device *bdev, fmode_t mode); +int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); +int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); +long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder); +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, + void *holder); +void blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS -extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); -extern void bd_unlink_disk_holder(struct block_device *bdev, - struct gendisk *disk); +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) @@ -2142,15 +2170,15 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, /* fs/char_dev.c */ #define CHRDEV_MAJOR_HASH_SIZE 255 -extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); -extern int register_chrdev_region(dev_t, unsigned, const char *); -extern int __register_chrdev(unsigned int major, unsigned int baseminor, - unsigned int count, const char *name, - const struct file_operations *fops); -extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, - unsigned int count, const char *name); -extern void unregister_chrdev_region(dev_t, unsigned); -extern void chrdev_show(struct seq_file *,off_t); +int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); +int register_chrdev_region(dev_t, unsigned, const char *); +int __register_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name, + const struct file_operations *fops); +void __unregister_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name); +void unregister_chrdev_region(dev_t, unsigned); +void chrdev_show(struct seq_file *, off_t); static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) @@ -2169,20 +2197,20 @@ static inline void unregister_chrdev(unsigned int major, const char *name) #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_HASH_SIZE 255 -extern const char *__bdevname(dev_t, char *buffer); -extern const char *bdevname(struct block_device *bdev, char *buffer); -extern struct block_device *lookup_bdev(const char *); -extern void blkdev_show(struct seq_file *,off_t); +const char *__bdevname(dev_t, char *buffer); +const char *bdevname(struct block_device *bdev, char *buffer); +struct block_device *lookup_bdev(const char *); +void blkdev_show(struct seq_file *, off_t); #else #define BLKDEV_MAJOR_HASH_SIZE 0 #endif -extern void init_special_inode(struct inode *, umode_t, dev_t); +void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ -extern void make_bad_inode(struct inode *); -extern int is_bad_inode(struct inode *); +void make_bad_inode(struct inode *); +int is_bad_inode(struct inode *); #ifdef CONFIG_BLOCK /* @@ -2195,15 +2223,14 @@ extern int is_bad_inode(struct inode *); */ #define bio_data_dir(bio) ((bio)->bi_rw & 1) -extern void check_disk_size_change(struct gendisk *disk, - struct block_device *bdev); -extern int revalidate_disk(struct gendisk *); -extern int check_disk_change(struct block_device *); -extern int __invalidate_device(struct block_device *, bool); -extern int invalidate_partition(struct gendisk *, int); +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev); +int revalidate_disk(struct gendisk *); +int check_disk_change(struct block_device *); +int __invalidate_device(struct block_device *, bool); +int invalidate_partition(struct gendisk *, int); #endif unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end); + pgoff_t start, pgoff_t end); static inline void invalidate_remote_inode(struct inode *inode) { @@ -2211,26 +2238,24 @@ static inline void invalidate_remote_inode(struct inode *inode) S_ISLNK(inode->i_mode)) invalidate_mapping_pages(inode->i_mapping, 0, -1); } -extern int invalidate_inode_pages2(struct address_space *mapping); -extern int invalidate_inode_pages2_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); -extern int write_inode_now(struct inode *, int); -extern int filemap_fdatawrite(struct address_space *); -extern int filemap_flush(struct address_space *); -extern int filemap_fdatawait(struct address_space *); -extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, - loff_t lend); -extern int filemap_write_and_wait(struct address_space *mapping); -extern int filemap_write_and_wait_range(struct address_space *mapping, - loff_t lstart, loff_t lend); -extern int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); -extern int filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end); - -extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, - int datasync); -extern int vfs_fsync(struct file *file, int datasync); +int invalidate_inode_pages2(struct address_space *mapping); +int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); +int write_inode_now(struct inode *, int); +int filemap_fdatawrite(struct address_space *); +int filemap_flush(struct address_space *); +int filemap_fdatawait(struct address_space *); +int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); +int filemap_write_and_wait(struct address_space *mapping); +int filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend); +int __filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end, int sync_mode); +int filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end); + +int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); +int vfs_fsync(struct file *file, int datasync); static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count) { if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) @@ -2238,14 +2263,14 @@ static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count return vfs_fsync_range(file, pos, pos + count - 1, (file->f_flags & __O_SYNC) ? 0 : 1); } -extern void emergency_sync(void); -extern void emergency_remount(void); +void emergency_sync(void); +void emergency_remount(void); #ifdef CONFIG_BLOCK -extern sector_t bmap(struct inode *, sector_t); +sector_t bmap(struct inode *, sector_t); #endif -extern int notify_change(struct dentry *, struct iattr *, struct inode **); -extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int); +int notify_change(struct dentry *, struct iattr *, struct inode **); +int inode_permission(struct inode *, int); +int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) { @@ -2298,7 +2323,7 @@ static inline int deny_write_access(struct file *file) struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } -static inline void put_write_access(struct inode * inode) +static inline void put_write_access(struct inode *inode) { atomic_dec(&inode->i_writecount); } @@ -2325,146 +2350,157 @@ static inline void i_readcount_inc(struct inode *inode) #else static inline void i_readcount_dec(struct inode *inode) { - return; } static inline void i_readcount_inc(struct inode *inode) { - return; } #endif -extern int do_pipe_flags(int *, int); +int do_pipe_flags(int *, int); + +int kernel_read(struct file *, loff_t, char *, unsigned long); +ssize_t kernel_write(struct file *, const char *, size_t, loff_t); +ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); +struct file *open_exec(const char *); -extern int kernel_read(struct file *, loff_t, char *, unsigned long); -extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t); -extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); -extern struct file * open_exec(const char *); - /* fs/dcache.c -- generic fs support functions */ -extern int is_subdir(struct dentry *, struct dentry *); -extern int path_is_under(struct path *, struct path *); +int is_subdir(struct dentry *, struct dentry *); +int path_is_under(struct path *, struct path *); #include <linux/err.h> /* needed for stackable file system support */ -extern loff_t default_llseek(struct file *file, loff_t offset, int whence); +loff_t default_llseek(struct file *file, loff_t offset, int whence); -extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); +loff_t vfs_llseek(struct file *file, loff_t offset, int whence); -extern int inode_init_always(struct super_block *, struct inode *); -extern void inode_init_once(struct inode *); -extern void address_space_init_once(struct address_space *mapping); -extern struct inode * igrab(struct inode *); -extern ino_t iunique(struct super_block *, ino_t); -extern int inode_needs_sync(struct inode *inode); -extern int generic_delete_inode(struct inode *inode); +int inode_init_always(struct super_block *, struct inode *); +void inode_init_once(struct inode *); +void address_space_init_once(struct address_space *mapping); +struct inode *igrab(struct inode *); +ino_t iunique(struct super_block *, ino_t); +int inode_needs_sync(struct inode *inode); +int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } -extern struct inode *ilookup5_nowait(struct super_block *sb, - unsigned long hashval, int (*test)(struct inode *, void *), - void *data); -extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data); -extern struct inode *ilookup(struct super_block *sb, unsigned long ino); +struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data); +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data); +struct inode *ilookup(struct super_block *sb, unsigned long ino); -extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); -extern struct inode * iget_locked(struct super_block *, unsigned long); -extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); -extern int insert_inode_locked(struct inode *); +struct inode *iget5_locked(struct super_block *, unsigned long, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *); +struct inode *iget_locked(struct super_block *, unsigned long); +int insert_inode_locked4(struct inode *, unsigned long, + int (*test)(struct inode *, void *), void *); +int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void lockdep_annotate_inode_mutex_key(struct inode *inode); +void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif -extern void unlock_new_inode(struct inode *); -extern unsigned int get_next_ino(void); - -extern void __iget(struct inode * inode); -extern void iget_failed(struct inode *); -extern void clear_inode(struct inode *); -extern void __destroy_inode(struct inode *); -extern struct inode *new_inode_pseudo(struct super_block *sb); -extern struct inode *new_inode(struct super_block *sb); -extern void free_inode_nonrcu(struct inode *inode); -extern int should_remove_suid(struct dentry *); -extern int file_remove_suid(struct file *); - -extern void __insert_inode_hash(struct inode *, unsigned long hashval); +void unlock_new_inode(struct inode *); +unsigned int get_next_ino(void); + +void __iget(struct inode *inode); +void iget_failed(struct inode *); +void clear_inode(struct inode *); +void __destroy_inode(struct inode *); +struct inode *new_inode_pseudo(struct super_block *sb); +struct inode *new_inode(struct super_block *sb); +void free_inode_nonrcu(struct inode *inode); +int should_remove_suid(struct dentry *); +int file_remove_suid(struct file *); + +void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } -extern void __remove_inode_hash(struct inode *); +void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode)) __remove_inode_hash(inode); } -extern void inode_sb_list_add(struct inode *inode); +void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK -extern void submit_bio(int, struct bio *); -extern int bdev_read_only(struct block_device *); +void submit_bio(int, struct bio *); +int bdev_read_only(struct block_device *); #endif -extern int set_blocksize(struct block_device *, int); -extern int sb_set_blocksize(struct super_block *, int); -extern int sb_min_blocksize(struct super_block *, int); - -extern int generic_file_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, - unsigned long size, pgoff_t pgoff); -int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); -extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); -extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); -extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); -extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t); -extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); -extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); -extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); -extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); -extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); +int set_blocksize(struct block_device *, int); +int sb_set_blocksize(struct super_block *, int); +int sb_min_blocksize(struct super_block *, int); + +int generic_file_mmap(struct file *, struct vm_area_struct *); +int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); +static inline int generic_file_remap_pages(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} +int generic_write_checks(struct file *file, loff_t *pos, size_t *count, + int isblk); +ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); +ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); +ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); +ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t); +ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); +ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, + loff_t *ppos); +ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos); +ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, + loff_t *ppos); +ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos); /* fs/block_dev.c */ -extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from); -extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, - int datasync); -extern void block_sync_page(struct page *page); +ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from); +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync); +void block_sync_page(struct page *page); /* fs/splice.c */ -extern ssize_t generic_file_splice_read(struct file *, loff_t *, - struct pipe_inode_info *, size_t, unsigned int); -extern ssize_t default_file_splice_read(struct file *, loff_t *, - struct pipe_inode_info *, size_t, unsigned int); -extern ssize_t iter_file_splice_write(struct pipe_inode_info *, - struct file *, loff_t *, size_t, unsigned int); -extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, - struct file *out, loff_t *, size_t len, unsigned int flags); - -extern void +ssize_t generic_file_splice_read(struct file *, loff_t *, + struct pipe_inode_info *, size_t, + unsigned int); +ssize_t default_file_splice_read(struct file *, loff_t *, + struct pipe_inode_info *, size_t, + unsigned int); +ssize_t iter_file_splice_write(struct pipe_inode_info *, + struct file *, loff_t *, size_t, unsigned int); +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, + struct file *out, loff_t *, size_t len, + unsigned int flags); + +void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); -extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); -extern loff_t no_llseek(struct file *file, loff_t offset, int whence); -extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); -extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); -extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, - int whence, loff_t maxsize, loff_t eof); -extern loff_t fixed_size_llseek(struct file *file, loff_t offset, - int whence, loff_t size); -extern int generic_file_open(struct inode * inode, struct file * filp); -extern int nonseekable_open(struct inode * inode, struct file * filp); +loff_t noop_llseek(struct file *file, loff_t offset, int whence); +loff_t no_llseek(struct file *file, loff_t offset, int whence); +loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); +loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); +loff_t generic_file_llseek_size(struct file *file, loff_t offset, + int whence, loff_t maxsize, loff_t eof); +loff_t fixed_size_llseek(struct file *file, loff_t offset, + int whence, loff_t size); +int generic_file_open(struct inode *inode, struct file *filp); +int nonseekable_open(struct inode *inode, struct file *filp); #ifdef CONFIG_FS_XIP -extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, - loff_t *ppos); -extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); -extern ssize_t xip_file_write(struct file *filp, const char __user *buf, - size_t len, loff_t *ppos); -extern int xip_truncate_page(struct address_space *mapping, loff_t from); +ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *ppos); +int xip_file_mmap(struct file *file, struct vm_area_struct *vma); +ssize_t xip_file_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos); +int xip_truncate_page(struct address_space *mapping, loff_t from); #else static inline int xip_truncate_page(struct address_space *mapping, loff_t from) { @@ -2490,13 +2526,15 @@ enum { void dio_end_io(struct bio *bio, int error); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, loff_t offset, - get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags); + struct block_device *bdev, struct iov_iter *iter, + loff_t offset, get_block_t get_block, + dio_iodone_t end_io, dio_submit_t submit_io, + int flags); static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, - struct inode *inode, struct iov_iter *iter, loff_t offset, - get_block_t get_block) + struct inode *inode, + struct iov_iter *iter, loff_t offset, + get_block_t get_block) { return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iter, offset, get_block, NULL, NULL, @@ -2507,26 +2545,25 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, void inode_dio_wait(struct inode *inode); void inode_dio_done(struct inode *inode); -extern void inode_set_flags(struct inode *inode, unsigned int flags, - unsigned int mask); +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask); extern const struct file_operations generic_ro_fops; -#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) +#define special_file(m) (S_ISCHR(m) || S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m)) -extern int readlink_copy(char __user *, int, const char *); -extern int page_readlink(struct dentry *, char __user *, int); -extern void *page_follow_link_light(struct dentry *, struct nameidata *); -extern void page_put_link(struct dentry *, struct nameidata *, void *); -extern int __page_symlink(struct inode *inode, const char *symname, int len, - int nofs); -extern int page_symlink(struct inode *inode, const char *symname, int len); +int readlink_copy(char __user *, int, const char *); +int page_readlink(struct dentry *, char __user *, int); +void *page_follow_link_light(struct dentry *, struct nameidata *); +void page_put_link(struct dentry *, struct nameidata *, void *); +int __page_symlink(struct inode *inode, const char *symname, int len, int nofs); +int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; -extern void kfree_put_link(struct dentry *, struct nameidata *, void *); -extern int generic_readlink(struct dentry *, char __user *, int); -extern void generic_fillattr(struct inode *, struct kstat *); +void kfree_put_link(struct dentry *, struct nameidata *, void *); +int generic_readlink(struct dentry *, char __user *, int); +void generic_fillattr(struct inode *, struct kstat *); int vfs_getattr_nosec(struct path *path, struct kstat *stat); -extern int vfs_getattr(struct path *, struct kstat *); +int vfs_getattr(struct path *, struct kstat *); void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); @@ -2534,97 +2571,98 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -extern int vfs_readdir(struct file *, filldir_t, void *); -extern int iterate_dir(struct file *, struct dir_context *); - -extern int vfs_stat(const char __user *, struct kstat *); -extern int vfs_lstat(const char __user *, struct kstat *); -extern int vfs_fstat(unsigned int, struct kstat *); -extern int vfs_fstatat(int , const char __user *, struct kstat *, int); - -extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, - unsigned long arg); -extern int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, - get_block_t *get_block); -extern int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block); - -extern void get_filesystem(struct file_system_type *fs); -extern void put_filesystem(struct file_system_type *fs); -extern struct file_system_type *get_fs_type(const char *name); -extern struct super_block *get_super(struct block_device *); -extern struct super_block *get_super_thawed(struct block_device *); -extern struct super_block *get_active_super(struct block_device *bdev); -extern void drop_super(struct super_block *sb); -extern void iterate_supers(void (*)(struct super_block *, void *), void *); -extern void iterate_supers_type(struct file_system_type *, - void (*)(struct super_block *, void *), void *); - -extern int dcache_dir_open(struct inode *, struct file *); -extern int dcache_dir_close(struct inode *, struct file *); -extern loff_t dcache_dir_lseek(struct file *, loff_t, int); -extern int dcache_readdir(struct file *, struct dir_context *); -extern int simple_setattr(struct dentry *, struct iattr *); -extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int simple_statfs(struct dentry *, struct kstatfs *); -extern int simple_open(struct inode *inode, struct file *file); -extern int simple_link(struct dentry *, struct inode *, struct dentry *); -extern int simple_unlink(struct inode *, struct dentry *); -extern int simple_rmdir(struct inode *, struct dentry *); -extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -extern int noop_fsync(struct file *, loff_t, loff_t, int); -extern int simple_empty(struct dentry *); -extern int simple_readpage(struct file *file, struct page *page); -extern int simple_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); -extern int simple_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); -extern int always_delete_dentry(const struct dentry *); -extern struct inode *alloc_anon_inode(struct super_block *); +int vfs_readdir(struct file *, filldir_t, void *); +int iterate_dir(struct file *, struct dir_context *); + +int vfs_stat(const char __user *, struct kstat *); +int vfs_lstat(const char __user *, struct kstat *); +int vfs_fstat(unsigned int, struct kstat *); +int vfs_fstatat(int , const char __user *, struct kstat *, int); + +int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, + unsigned long arg); +int __generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + loff_t start, loff_t len, get_block_t *get_block); +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + u64 start, u64 len, get_block_t *get_block); + +void get_filesystem(struct file_system_type *fs); +void put_filesystem(struct file_system_type *fs); +struct file_system_type *get_fs_type(const char *name); +struct super_block *get_super(struct block_device *); +struct super_block *get_super_thawed(struct block_device *); +struct super_block *get_active_super(struct block_device *bdev); +void drop_super(struct super_block *sb); +void iterate_supers(void (*)(struct super_block *, void *), void *); +void iterate_supers_type(struct file_system_type *, + void (*)(struct super_block *, void *), void *); + +int dcache_dir_open(struct inode *, struct file *); +int dcache_dir_close(struct inode *, struct file *); +loff_t dcache_dir_lseek(struct file *, loff_t, int); +int dcache_readdir(struct file *, struct dir_context *); +int simple_setattr(struct dentry *, struct iattr *); +int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); +int simple_statfs(struct dentry *, struct kstatfs *); +int simple_open(struct inode *inode, struct file *file); +int simple_link(struct dentry *, struct inode *, struct dentry *); +int simple_unlink(struct inode *, struct dentry *); +int simple_rmdir(struct inode *, struct dentry *); +int simple_rename(struct inode *, struct dentry *, struct inode *, + struct dentry *); +int noop_fsync(struct file *, loff_t, loff_t, int); +int simple_empty(struct dentry *); +int simple_readpage(struct file *file, struct page *page); +int simple_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +int simple_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); +int always_delete_dentry(const struct dentry *); +struct inode *alloc_anon_inode(struct super_block *); extern const struct dentry_operations simple_dentry_operations; -extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); -extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); +struct dentry *simple_lookup(struct inode *, struct dentry *, + unsigned int flags); +ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; struct tree_descr { char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); -extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *); -extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); -extern void simple_release_fs(struct vfsmount **mount, int *count); +int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *); +int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, + int *count); +void simple_release_fs(struct vfsmount **mount, int *count); -extern ssize_t simple_read_from_buffer(void __user *to, size_t count, - loff_t *ppos, const void *from, size_t available); -extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, - const void __user *from, size_t count); +ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, + const void *from, size_t available); +ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, + const void __user *from, size_t count); -extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); -extern int generic_file_fsync(struct file *, loff_t, loff_t, int); +int __generic_file_fsync(struct file *, loff_t, loff_t, int); +int generic_file_fsync(struct file *, loff_t, loff_t, int); -extern int generic_check_addressable(unsigned, u64); +int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION -extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *, - enum migrate_mode); +int buffer_migrate_page(struct address_space *, struct page *, struct page *, + enum migrate_mode); #else #define buffer_migrate_page NULL #endif -extern int inode_change_ok(const struct inode *, struct iattr *); -extern int inode_newsize_ok(const struct inode *, loff_t offset); -extern void setattr_copy(struct inode *inode, const struct iattr *attr); +int inode_change_ok(const struct inode *, struct iattr *); +int inode_newsize_ok(const struct inode *, loff_t offset); +void setattr_copy(struct inode *inode, const struct iattr *attr); -extern int file_update_time(struct file *file); +int file_update_time(struct file *file); -extern int generic_show_options(struct seq_file *m, struct dentry *root); -extern void save_mount_options(struct super_block *sb, char *options); -extern void replace_mount_options(struct super_block *sb, char *options); +int generic_show_options(struct seq_file *m, struct dentry *root); +void save_mount_options(struct super_block *sb, char *options); +void replace_mount_options(struct super_block *sb, char *options); static inline ino_t parent_ino(struct dentry *dentry) { @@ -2654,7 +2692,7 @@ struct simple_transaction_argresp { #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) char *simple_transaction_get(struct file *file, const char __user *buf, - size_t size); + size_t size); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); @@ -2695,7 +2733,7 @@ static const struct file_operations __fops = { \ static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { - /* don't do anything, just let the compiler check the arguments; */ +/* don't do anything, just let the compiler check the arguments; */ } int simple_attr_open(struct inode *inode, struct file *file, @@ -2711,16 +2749,16 @@ struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void __user *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); -#define __FMODE_EXEC ((__force int) FMODE_EXEC) -#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) +#define __FMODE_EXEC ((__force int)FMODE_EXEC) +#define __FMODE_NONOTIFY ((__force int)FMODE_NONOTIFY) -#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) -#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ +#define ACC_MODE(x) ("\004\002\006\006"[(x) & O_ACCMODE]) +#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ (flag & __FMODE_NONOTIFY))) static inline int is_sxid(umode_t mode) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5e3a906cc089..142ec544167c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -237,6 +237,12 @@ extern int iomem_is_exclusive(u64 addr); extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); +extern int +walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)); +extern int +walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index aa2a0cb57f50..058880200678 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -376,10 +376,6 @@ extern unsigned long simple_strtoul(const char *,char **,unsigned int); extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); -#define strict_strtoul kstrtoul -#define strict_strtol kstrtol -#define strict_strtoull kstrtoull -#define strict_strtoll kstrtoll extern int num_to_str(char *buf, int size, unsigned long long num); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index a75641930049..4b2a0e11cc5b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -10,6 +10,7 @@ #include <linux/ioport.h> #include <linux/elfcore.h> #include <linux/elf.h> +#include <linux/module.h> #include <asm/kexec.h> /* Verify architecture specific macros are defined */ @@ -69,7 +70,18 @@ typedef unsigned long kimage_entry_t; #define IND_SOURCE 0x8 struct kexec_segment { - void __user *buf; + /* + * This pointer can point to user memory if kexec_load() system + * call is used or will point to kernel memory if + * kexec_file_load() system call is used. + * + * Use ->buf when expecting to deal with user memory and use ->kbuf + * when expecting to deal with kernel memory. + */ + union { + void __user *buf; + void *kbuf; + }; size_t bufsz; unsigned long mem; size_t memsz; @@ -84,6 +96,27 @@ struct compat_kexec_segment { }; #endif +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +struct purgatory_info { + /* Pointer to elf header of read only purgatory */ + Elf_Ehdr *ehdr; + + /* Pointer to purgatory sechdrs which are modifiable */ + Elf_Shdr *sechdrs; + /* + * Temporary buffer location where purgatory is loaded and relocated + * This memory can be freed post image load + */ + void *purgatory_buf; + + /* Address where purgatory is finally loaded and is executed from */ + unsigned long purgatory_load_addr; +}; + struct kimage { kimage_entry_t head; kimage_entry_t *entry; @@ -100,7 +133,7 @@ struct kimage { struct list_head control_pages; struct list_head dest_pages; - struct list_head unuseable_pages; + struct list_head unusable_pages; /* Address of next control page to allocate for crash kernels. */ unsigned long control_page; @@ -110,13 +143,63 @@ struct kimage { #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; + /* If set, we are using file mode kexec syscall */ + unsigned int file_mode:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; #endif + + /* Additional fields for file based kexec syscall */ + void *kernel_buf; + unsigned long kernel_buf_len; + + void *initrd_buf; + unsigned long initrd_buf_len; + + char *cmdline_buf; + unsigned long cmdline_buf_len; + + /* File operations provided by image loader */ + struct kexec_file_ops *fops; + + /* Image loader handling the kernel can store a pointer here */ + void *image_loader_data; + + /* Information for loading purgatory */ + struct purgatory_info purgatory_info; }; +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. + */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(void *loader_data); +typedef int (kexec_verify_sig_t)(const char *kernel_buf, + unsigned long kernel_len); + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; + kexec_verify_sig_t *verify_sig; +}; /* kexec interface functions */ extern void machine_kexec(struct kimage *image); @@ -127,8 +210,21 @@ extern asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); +extern int kexec_add_buffer(struct kimage *image, char *buffer, + unsigned long bufsz, unsigned long memsz, + unsigned long buf_align, unsigned long buf_min, + unsigned long buf_max, bool top_down, + unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +extern int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr); +extern int kexec_purgatory_get_set_symbol(struct kimage *image, + const char *name, void *buf, + unsigned int size, bool get_value); +extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, + const char *name); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); @@ -177,6 +273,10 @@ extern int kexec_load_disabled; #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT) #endif +/* List of defined/legal kexec file flags */ +#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ + KEXEC_FILE_NO_INITRAMFS) + #define VMCOREINFO_BYTES (4096) #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) diff --git a/include/linux/mm.h b/include/linux/mm.h index e03dd29145a0..8981cc882ed2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2014,13 +2014,20 @@ static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ #endif +#ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); -#ifdef __HAVE_ARCH_GATE_AREA -int in_gate_area_no_mm(unsigned long addr); -int in_gate_area(struct mm_struct *mm, unsigned long addr); +extern int in_gate_area_no_mm(unsigned long addr); +extern int in_gate_area(struct mm_struct *mm, unsigned long addr); #else -int in_gate_area_no_mm(unsigned long addr); -#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) +static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return NULL; +} +static inline int in_gate_area_no_mm(unsigned long addr) { return 0; } +static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} #endif /* __HAVE_ARCH_GATE_AREA */ #ifdef CONFIG_SYSCTL diff --git a/include/linux/sched.h b/include/linux/sched.h index 857ba40426ba..b9dc87203577 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -377,6 +377,8 @@ extern int in_sched_functions(unsigned long addr); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); +extern signed long +schedule_timeout_deferrable_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 4d1771c2d29f..50777b5b1e4c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -1,6 +1,7 @@ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H +#include <linux/file.h> #include <linux/swap.h> #include <linux/mempolicy.h> #include <linux/pagemap.h> @@ -11,6 +12,7 @@ struct shmem_inode_info { spinlock_t lock; + unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ union { @@ -65,4 +67,19 @@ static inline struct page *shmem_read_mapping_page( mapping_gfp_mask(mapping)); } +#ifdef CONFIG_TMPFS + +extern int shmem_add_seals(struct file *file, unsigned int seals); +extern int shmem_get_seals(struct file *file); +extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); + +#else + +static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) +{ + return -EINVAL; +} + +#endif + #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 701daff5d899..0f86d85a9ce4 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -317,6 +317,10 @@ asmlinkage long sys_restart_syscall(void); asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, unsigned long flags); +asmlinkage long sys_kexec_file_load(int kernel_fd, int initrd_fd, + unsigned long cmdline_len, + const char __user *cmdline_ptr, + unsigned long flags); asmlinkage long sys_exit(int error_code); asmlinkage long sys_exit_group(int error_code); @@ -802,6 +806,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); +asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 074b886c6be0..beed138bd359 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -28,6 +28,21 @@ #define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) /* + * Set/Get seals + */ +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +/* + * Types of seals + */ +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +/* (1U << 31) is reserved for signed error codes */ + +/* * Types of directory notifications that may be requested. */ #define DN_ACCESS 0x00000001 /* File accessed */ diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index d6629d49a243..6925f5b42f89 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -13,6 +13,17 @@ #define KEXEC_PRESERVE_CONTEXT 0x00000002 #define KEXEC_ARCH_MASK 0xffff0000 +/* + * Kexec file load interface flags. + * KEXEC_FILE_UNLOAD : Unload already loaded kexec/kdump image. + * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. + * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd + * fd field. + */ +#define KEXEC_FILE_UNLOAD 0x00000001 +#define KEXEC_FILE_ON_CRASH 0x00000002 +#define KEXEC_FILE_NO_INITRAMFS 0x00000004 + /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. */ diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h new file mode 100644 index 000000000000..534e364bda92 --- /dev/null +++ b/include/uapi/linux/memfd.h @@ -0,0 +1,8 @@ +#ifndef _UAPI_LINUX_MEMFD_H +#define _UAPI_LINUX_MEMFD_H + +/* flags for memfd_create(2) (unsigned int) */ +#define MFD_CLOEXEC 0x0001U +#define MFD_ALLOW_SEALING 0x0002U + +#endif /* _UAPI_LINUX_MEMFD_H */ diff --git a/init/Kconfig b/init/Kconfig index d3ef635b9779..1dfdd81ff9bd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -783,8 +783,13 @@ endchoice endmenu # "RCU Subsystem" +config BUILD_BIN2C + bool + default n + config IKCONFIG tristate "Kernel .config support" + select BUILD_BIN2C ---help--- This option enables the complete Linux kernel ".config" file contents to be saved in the kernel. It provides documentation diff --git a/init/main.c b/init/main.c index 5d8c83ae6c55..e3c4cdd94d5b 100644 --- a/init/main.c +++ b/init/main.c @@ -6,7 +6,7 @@ * GK 2/5/95 - Changed to support mounting root fs via NFS * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 - * Simplified starting of init: Michael A. Griffith <grif@acm.org> + * Simplified starting of init: Michael A. Griffith <grif@acm.org> */ #define DEBUG /* Enable initcall_debug */ @@ -136,7 +136,7 @@ static char *ramdisk_execute_command; * Used to generate warnings if static_key manipulation functions are used * before jump_label_init is called. */ -bool static_key_initialized __read_mostly = false; +bool static_key_initialized __read_mostly; EXPORT_SYMBOL_GPL(static_key_initialized); /* @@ -159,8 +159,8 @@ static int __init set_reset_devices(char *str) __setup("reset_devices", set_reset_devices); -static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; -const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; static const char *panic_later, *panic_param; extern const struct obs_kernel_param __setup_start[], __setup_end[]; @@ -199,7 +199,6 @@ static int __init obsolete_checksetup(char *line) * still work even if initially too large, it will just take slightly longer */ unsigned long loops_per_jiffy = (1<<12); - EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) @@ -376,8 +375,8 @@ static void __init setup_command_line(char *command_line) initcall_command_line = memblock_virt_alloc(strlen(boot_command_line) + 1, 0); static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); - strcpy (saved_command_line, boot_command_line); - strcpy (static_command_line, command_line); + strcpy(saved_command_line, boot_command_line); + strcpy(static_command_line, command_line); } /* @@ -445,8 +444,8 @@ void __init parse_early_options(char *cmdline) /* Arch code calls this early on, or if not, just before other parsing. */ void __init parse_early_param(void) { - static __initdata int done = 0; - static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; + static int done __initdata; + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; if (done) return; @@ -500,7 +499,8 @@ static void __init mm_init(void) asmlinkage __visible void __init start_kernel(void) { - char * command_line, *after_dashes; + char *command_line; + char *after_dashes; extern const struct kernel_param __start___param[], __stop___param[]; /* @@ -572,7 +572,8 @@ asmlinkage __visible void __init start_kernel(void) * fragile until we cpu_idle() for the first time. */ preempt_disable(); - if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) + if (WARN(!irqs_disabled(), + "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); idr_init_cache(); rcu_init(); diff --git a/kernel/Makefile b/kernel/Makefile index 0026cf531769..dc5c77544fd6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -105,7 +105,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/kernel/fork.c b/kernel/fork.c index fa9124322cd4..1380d8ace334 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -429,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c925884..0b49a0a58102 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> @@ -40,6 +42,9 @@ #include <asm/io.h> #include <asm/sections.h> +#include <crypto/hash.h> +#include <crypto/sha.h> + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. The caller is @@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + ret = copy_user_segment_list(image, nr_segments, segments); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + + /* Enable the special crash kernel control page allocation policy. */ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. */ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: - return result; + return ret; } -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { - int result; - struct kimage *image; - unsigned long i; + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; - image = NULL; - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; goto out; } - /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; goto out; + } - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; + if (bytes == 0) + break; + pos += bytes; } + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. */ - result = -ENOMEM; + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: kfree(image); -out: - return result; + return ret; } static int kimage_is_destination_range(struct kimage *image, @@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; @@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, /* Loading another kernel to reboot into */ if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); } if (result) @@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective archicture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 734e9a7d280b..81807b0b2c99 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -76,7 +76,7 @@ static bool kprobes_all_disarmed; /* This protects kprobe_table and optimizing_list */ static DEFINE_MUTEX(kprobe_mutex); -static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static DEFINE_PER_CPU(struct kprobe *, kprobe_instance); static struct { raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; @@ -297,9 +297,9 @@ static inline void reset_kprobe_instance(void) /* * This routine is called either: - * - under the kprobe_mutex - during kprobe_[un]register() - * OR - * - with preemption disabled - from arch/xxx/kernel/kprobes.c + * - under the kprobe_mutex - during kprobe_[un]register() + * OR + * - with preemption disabled - from arch/xxx/kernel/kprobes.c */ struct kprobe *get_kprobe(void *addr) { @@ -567,7 +567,8 @@ static void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); - while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { + while (!list_empty(&optimizing_list) || + !list_empty(&unoptimizing_list)) { mutex_unlock(&kprobe_mutex); /* this will also make optimizing_work execute immmediately */ @@ -676,8 +677,8 @@ static void reuse_unused_kprobe(struct kprobe *ap) */ op = container_of(ap, struct optimized_kprobe, kp); if (unlikely(list_empty(&op->list))) - printk(KERN_WARNING "Warning: found a stray unused " - "aggrprobe@%p\n", ap->addr); + pr_warn("Warning: found a stray unused aggrprobe@%p\n", + ap->addr); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ @@ -794,7 +795,7 @@ static void optimize_all_kprobes(void) if (!kprobe_disabled(p)) optimize_kprobe(p); } - printk(KERN_INFO "Kprobes globally optimized\n"); + pr_info("Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); } @@ -824,7 +825,7 @@ static void unoptimize_all_kprobes(void) /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); - printk(KERN_INFO "Kprobes globally unoptimized\n"); + pr_info("Kprobes globally unoptimized\n"); } static DEFINE_MUTEX(kprobe_sysctl_mutex); @@ -896,7 +897,7 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) /* There should be no unused kprobes can be reused without optimization */ static void reuse_unused_kprobe(struct kprobe *ap) { - printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); + pr_err("Error: There should be no unused kprobe here.\n"); BUG_ON(kprobe_unused(ap)); } @@ -955,7 +956,8 @@ static void disarm_kprobe_ftrace(struct kprobe *p) } ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); - WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", + p->addr, ret); } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) @@ -1389,7 +1391,7 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p) if (p != ap) { list_for_each_entry_rcu(list_p, &ap->list, list) if (list_p == p) - /* kprobe p is a valid probe */ + /* kprobe p is a valid probe */ goto valid; return NULL; } @@ -2018,8 +2020,8 @@ EXPORT_SYMBOL_GPL(enable_kprobe); void dump_kprobe(struct kprobe *kp) { - printk(KERN_WARNING "Dumping kprobe:\n"); - printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + pr_warn("Dumping kprobe:\n"); + pr_warn("Name: %s\nAddress: %p\nOffset: %x\n", kp->symbol_name, kp->addr, kp->offset); } NOKPROBE_SYMBOL(dump_kprobe); @@ -2132,7 +2134,7 @@ static int __init init_kprobes(void) kprobe_lookup_name(kretprobe_blacklist[i].name, kretprobe_blacklist[i].addr); if (!kretprobe_blacklist[i].addr) - printk("kretprobe: lookup failed: %s\n", + pr_warn("kretprobe: lookup failed: %s\n", kretprobe_blacklist[i].name); } } @@ -2314,7 +2316,7 @@ static void arm_all_kprobes(void) } kprobes_all_disarmed = false; - printk(KERN_INFO "Kprobes globally enabled\n"); + pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); @@ -2336,7 +2338,7 @@ static void disarm_all_kprobes(void) } kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); + pr_info("Kprobes globally disabled\n"); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..da14b8d09296 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name) BUG_ON(start >= end); read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + if (first_level_children_only) { + p = iomem_resource.child; + sibling_only = true; + } else + p = &iomem_resource; + + while ((p = next_resource(p, sibling_only))) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name) } /* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. + * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + +/* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. * Now, this function is only for "System RAM". @@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2904a2105914..391d4ddb6f4b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); @@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..a81083797e97 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(hash_lock); * SIGEV values. Here we put out an error if this assumption fails. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif @@ -254,7 +254,8 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, return 0; } -static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +static int posix_get_coarse_res(const clockid_t which_clock, + struct timespec *tp) { *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; @@ -335,14 +336,16 @@ static __init int init_posix_timers(void) posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, + &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, + &clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); posix_timers_register_clock(CLOCK_TAI, &clock_tai); posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC, NULL); return 0; } @@ -496,11 +499,11 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return ret; } -static struct pid *good_sigevent(sigevent_t * event) +static struct pid *good_sigevent(sigevent_t *event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && + if ((event->sigev_notify & SIGEV_THREAD_ID) && (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || !same_thread_group(rtn, current) || (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) @@ -517,18 +520,18 @@ void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { - printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + pr_warn("POSIX clock register failed for clock_id %d\n", clock_id); return; } if (!new_clock->clock_get) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + pr_warn("POSIX clock id %d lacks clock_get()\n", clock_id); return; } if (!new_clock->clock_getres) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", + pr_warn("POSIX clock id %d lacks clock_getres()\n", clock_id); return; } @@ -537,7 +540,7 @@ void posix_timers_register_clock(const clockid_t clock_id, } EXPORT_SYMBOL_GPL(posix_timers_register_clock); -static struct k_itimer * alloc_posix_timer(void) +static struct k_itimer *alloc_posix_timer(void) { struct k_itimer *tmr; tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); @@ -624,7 +627,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->it_overrun = -1; if (timer_event_spec) { - if (copy_from_user(&event, timer_event_spec, sizeof (event))) { + if (copy_from_user(&event, timer_event_spec, sizeof(event))) { error = -EFAULT; goto out; } @@ -649,7 +652,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->sigq->info.si_code = SI_TIMER; if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { + &new_timer_id, sizeof(new_timer_id))) { error = -EFAULT; goto out; } @@ -750,7 +753,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) */ if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, + iv); remaining = ktime_sub(hrtimer_get_expires(timer), now); /* Return 0 only, when the timer is expired and not pending */ @@ -787,7 +791,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, unlock_timer(timr, flags); - if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (!ret && copy_to_user(setting, &cur_setting, sizeof(cur_setting))) return -EFAULT; return ret; @@ -839,7 +843,7 @@ common_timer_set(struct k_itimer *timr, int flags, if (hrtimer_try_to_cancel(timer) < 0) return TIMER_RETRY; - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0; @@ -859,9 +863,8 @@ common_timer_set(struct k_itimer *timr, int flags, /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) { + if (mode == HRTIMER_MODE_REL) hrtimer_add_expires(timer, timer->base->get_time()); - } return 0; } @@ -884,7 +887,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (!new_setting) return -EINVAL; - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + if (copy_from_user(&new_spec, new_setting, sizeof(new_spec))) return -EFAULT; if (!timespec_valid(&new_spec.it_interval) || @@ -903,12 +906,12 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... + rtn = NULL; /* We already got the old time... */ goto retry; } if (old_setting && !error && - copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + copy_to_user(old_setting, &old_spec, sizeof(old_spec))) error = -EFAULT; return error; @@ -1010,14 +1013,14 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (!kc || !kc->clock_set) return -EINVAL; - if (copy_from_user(&new_tp, tp, sizeof (*tp))) + if (copy_from_user(&new_tp, tp, sizeof(*tp))) return -EFAULT; return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) + struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec kernel_tp; @@ -1028,7 +1031,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, error = kc->clock_get(which_clock, &kernel_tp); - if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + if (!error && copy_to_user(tp, &kernel_tp, sizeof(kernel_tp))) error = -EFAULT; return error; @@ -1069,7 +1072,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) error = -EFAULT; return error; @@ -1098,7 +1101,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!kc->nsleep) return -ENANOSLEEP_NOTSUP; - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + if (copy_from_user(&t, rqtp, sizeof(struct timespec))) return -EFAULT; if (!timespec_valid(&t)) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..e81a30181e41 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1432,7 +1432,7 @@ static void process_timeout(unsigned long __data) } /** - * schedule_timeout - sleep until timeout + * __schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * * Make the current task sleep until @timeout jiffies have @@ -1457,7 +1457,8 @@ static void process_timeout(unsigned long __data) * * In all cases the return value is guaranteed to be non-negative. */ -signed long __sched schedule_timeout(signed long timeout) +static signed long +__sched __schedule_timeout(signed long timeout, unsigned long flag) { struct timer_list timer; unsigned long expire; @@ -1493,7 +1494,13 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); + if (flag & TIMER_DEFERRABLE) + setup_deferrable_timer_on_stack(&timer, process_timeout, + (unsigned long)current); + else + setup_timer_on_stack(&timer, process_timeout, + (unsigned long)current); + __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); schedule(); del_singleshot_timer_sync(&timer); @@ -1506,12 +1513,26 @@ signed long __sched schedule_timeout(signed long timeout) out: return timeout < 0 ? 0 : timeout; } + +signed long __sched schedule_timeout(signed long timeout) +{ + return __schedule_timeout(timeout, 0); +} EXPORT_SYMBOL(schedule_timeout); /* * We can use __set_current_state() here because schedule_timeout() calls * schedule() unconditionally. */ + +signed long +__sched schedule_timeout_deferrable_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return __schedule_timeout(timeout, TIMER_DEFERRABLE); +} +EXPORT_SYMBOL(schedule_timeout_deferrable_interruptible); + signed long __sched schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); diff --git a/mm/Makefile b/mm/Makefile index 632ae77e6070..a96e3a1dea1f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := gup.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f0..382f8ece1cb2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -234,11 +234,46 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strictlimit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + break; + case 1: + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + break; + default: + return -EINVAL; + } + + return count; +} +static ssize_t strictlimit_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strictlimit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strictlimit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 72b8fa361433..000000000000 --- a/mm/fremap.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * linux/mm/fremap.c - * - * Explicit pagetable population and nonlinear (random) mappings support. - * - * started by Ingo Molnar, Copyright (C) 2002, 2003 - */ -#include <linux/export.h> -#include <linux/backing-dev.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/file.h> -#include <linux/mman.h> -#include <linux/pagemap.h> -#include <linux/swapops.h> -#include <linux/rmap.h> -#include <linux/syscalls.h> -#include <linux/mmu_notifier.h> - -#include <asm/mmu_context.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> - -#include "internal.h" - -static int mm_counter(struct page *page) -{ - return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; -} - -static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - struct page *page; - swp_entry_t entry; - - if (pte_present(pte)) { - flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush(vma, addr, ptep); - page = vm_normal_page(vma, addr, pte); - if (page) { - if (pte_dirty(pte)) - set_page_dirty(page); - update_hiwater_rss(mm); - dec_mm_counter(mm, mm_counter(page)); - page_remove_rmap(page); - page_cache_release(page); - } - } else { /* zap_pte() is not called when pte_none() */ - if (!pte_file(pte)) { - update_hiwater_rss(mm); - entry = pte_to_swp_entry(pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - dec_mm_counter(mm, mm_counter(page)); - } - } else { - free_swap_and_cache(entry); - dec_mm_counter(mm, MM_SWAPENTS); - } - } - pte_clear_not_present_full(mm, addr, ptep, 0); - } -} - -/* - * Install a file pte to a given virtual memory address, release any - * previously existing mapping. - */ -static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long pgoff, pgprot_t prot) -{ - int err = -ENOMEM; - pte_t *pte, ptfile; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - ptfile = pgoff_to_pte(pgoff); - - if (!pte_none(*pte)) - zap_pte(mm, vma, addr, pte); - - set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); - /* - * We don't need to run update_mmu_cache() here because the "file pte" - * being installed by install_file_pte() is not a real pte - it's a - * non-present entry (like a swap entry), noting what file offset should - * be mapped there when there's a fault (in a non-linear vma where - * that's not obvious). - */ - pte_unmap_unlock(pte, ptl); - err = 0; -out: - return err; -} - -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - struct mm_struct *mm = vma->vm_mm; - int err; - - do { - err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); - if (err) - return err; - - size -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } while (size); - - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - -/** - * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma - * @start: start of the remapped virtual memory range - * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range (see NOTE) - * @pgoff: to-be-mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. - * - * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma - * (shared backing store file). - * - * This syscall works purely via pagetables, so it's the most efficient - * way to map the same (large) file into a given virtual window. Unlike - * mmap()/mremap() it does not create any new vmas. The new mappings are - * also safe across swapout. - * - * NOTE: the @prot parameter right now is ignored (but must be zero), - * and the vma's default protection is used. Arbitrary protections - * might be implemented in the future. - */ -SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, - unsigned long, prot, unsigned long, pgoff, unsigned long, flags) -{ - struct mm_struct *mm = current->mm; - struct address_space *mapping; - struct vm_area_struct *vma; - int err = -EINVAL; - int has_write_lock = 0; - vm_flags_t vm_flags = 0; - - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); - - if (prot) - return err; - /* - * Sanitize the syscall parameters: - */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; - - /* Does the address range wrap, or is the span zero-sized? */ - if (start + size <= start) - return err; - - /* Does pgoff wrap? */ - if (pgoff + (size >> PAGE_SHIFT) < pgoff) - return err; - - /* Can we represent this offset inside this architecture's pte's? */ -#if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) - return err; -#endif - - /* We need down_write() to change vma->vm_flags. */ - down_read(&mm->mmap_sem); - retry: - vma = find_vma(mm, start); - - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma. - */ - if (!vma || !(vma->vm_flags & VM_SHARED)) - goto out; - - if (!vma->vm_ops || !vma->vm_ops->remap_pages) - goto out; - - if (start < vma->vm_start || start + size > vma->vm_end) - goto out; - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (!(vma->vm_flags & VM_NONLINEAR)) { - /* - * vm_private_data is used as a swapout cursor - * in a VM_NONLINEAR vma. - */ - if (vma->vm_private_data) - goto out; - - /* Don't need a nonlinear mapping, exit success */ - if (pgoff == linear_page_index(vma, start)) { - err = 0; - goto out; - } - - if (!has_write_lock) { -get_write_lock: - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - /* - * page_mkclean doesn't work on nonlinear vmas, so if - * dirty pages need to be accounted, emulate with linear - * vmas. - */ - if (mapping_cap_account_dirty(mapping)) { - unsigned long addr; - struct file *file = get_file(vma->vm_file); - /* mmap_region may free vma; grab the info now */ - vm_flags = vma->vm_flags; - - addr = mmap_region(file, start, size, vm_flags, pgoff); - fput(file); - if (IS_ERR_VALUE(addr)) { - err = addr; - } else { - BUG_ON(addr != start); - err = 0; - } - goto out_freed; - } - mutex_lock(&mapping->i_mmap_mutex); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_interval_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); - } - - if (vma->vm_flags & VM_LOCKED) { - /* - * drop PG_Mlocked flag for over-mapped range - */ - if (!has_write_lock) - goto get_write_lock; - vm_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = vm_flags; - } - - mmu_notifier_invalidate_range_start(mm, start, start + size); - err = vma->vm_ops->remap_pages(vma, start, size, pgoff); - mmu_notifier_invalidate_range_end(mm, start, start + size); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). - */ - -out: - if (vma) - vm_flags = vma->vm_flags; -out_freed: - if (likely(!has_write_lock)) - up_read(&mm->mmap_sem); - else - up_write(&mm->mmap_sem); - if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) - mm_populate(start, size); - - return err; -} @@ -223,6 +223,9 @@ static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; +/* Boolean to indicate whether to use deferrable timer or not */ +static bool use_deferrable_timer; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -1705,6 +1708,11 @@ static void ksm_do_scan(unsigned int scan_npages) } } +static void process_timeout(unsigned long __data) +{ + wake_up_process((struct task_struct *)__data); +} + static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); @@ -1725,8 +1733,13 @@ static int ksm_scan_thread(void *nothing) try_to_freeze(); if (ksmd_should_run()) { - schedule_timeout_interruptible( - msecs_to_jiffies(ksm_thread_sleep_millisecs)); + signed long to; + + to = msecs_to_jiffies(ksm_thread_sleep_millisecs); + if (use_deferrable_timer) + schedule_timeout_deferrable_interruptible(to); + else + schedule_timeout_interruptible(to); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); @@ -2175,6 +2188,29 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); +static ssize_t deferrable_timer_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, 8, "%d\n", use_deferrable_timer); +} + +static ssize_t deferrable_timer_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long enable; + int err; + + err = kstrtoul(buf, 10, &enable); + if (err < 0) + return err; + if (enable >= 1) + return -EINVAL; + use_deferrable_timer = enable; + return count; +} +KSM_ATTR(deferrable_timer); + #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -2287,6 +2323,7 @@ static struct attribute *ksm_attrs[] = { &pages_unshared_attr.attr, &pages_volatile_attr.attr, &full_scans_attr.attr, + &deferrable_timer_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif diff --git a/mm/memory.c b/mm/memory.c index 2476f5bdbfe3..042f8b3cabc1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3438,44 +3438,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -#if !defined(__HAVE_ARCH_GATE_AREA) - -#if defined(AT_SYSINFO_EHDR) -static struct vm_area_struct gate_vma; - -static int __init gate_vma_init(void) -{ - gate_vma.vm_mm = NULL; - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; - - return 0; -} -__initcall(gate_vma_init); -#endif - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef AT_SYSINFO_EHDR - return &gate_vma; -#else - return NULL; -#endif -} - -int in_gate_area_no_mm(unsigned long addr) -{ -#ifdef AT_SYSINFO_EHDR - if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) - return 1; -#endif - return 0; -} - -#endif /* __HAVE_ARCH_GATE_AREA */ - static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { diff --git a/mm/mmap.c b/mm/mmap.c index 64c9d736155c..80217c935608 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -221,7 +221,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable--; + mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -622,7 +622,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -1577,6 +1577,17 @@ munmap_back: if (error) goto free_vma; } + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto allow_write_and_free_vma; + } + + /* ->mmap() can change vma->vm_file, but must guarantee that + * vma_link() below can deny write-access if VM_DENYWRITE is set + * and map writably if VM_SHARED is set. This usually means the + * new file must not have been exposed to user-space, yet. + */ vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); if (error) @@ -1616,8 +1627,12 @@ munmap_back: vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); + if (file) { + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + } file = vma->vm_file; out: perf_event_mmap(vma); @@ -1646,14 +1661,17 @@ out: return addr; unmap_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); charged = 0; + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +allow_write_and_free_vma: + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: @@ -2586,6 +2604,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } + +/* + * Emulation of deprecated remap_file_pages() syscall. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; + struct file *file; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start < vma->vm_start || start + size > vma->vm_end) + goto out; + + if (pgoff == linear_page_index(vma, start)) { + ret = 0; + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) { + flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ + munlock_vma_pages_range(vma, start, start + size); + } + + file = get_file(vma->vm_file); + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); + fput(file); +out: + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM diff --git a/mm/nommu.c b/mm/nommu.c index 4a852f6c5709..026ac6375aaa 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1981,11 +1981,6 @@ error: return -ENOMEM; } -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} - int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); @@ -1999,14 +1994,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_map_pages); -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { diff --git a/mm/shmem.c b/mm/shmem.c index 5909f298c3e7..0e5fb225007c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,6 +66,9 @@ static struct vfsmount *shm_mnt; #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> +#include <linux/syscalls.h> +#include <linux/fcntl.h> +#include <uapi/linux/memfd.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -547,6 +550,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); int error; error = inode_change_ok(inode, attr); @@ -557,6 +561,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + if (newsize != oldsize) { error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); @@ -1412,6 +1421,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -1470,7 +1480,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + /* i_mutex is held by caller */ + if (unlikely(info->seals)) { + if (info->seals & F_SEAL_WRITE) + return -EPERM; + if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) + return -EPERM; + } + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1808,11 +1828,233 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on shmem. + */ +#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void shmem_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + } else if (page_count(page) - page_mapcount(page) > 1) { + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_set(&mapping->page_tree, iter.index, + SHMEM_TAG_PINNED); + spin_unlock_irq(&mapping->tree_lock); + } + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. + */ +static int shmem_wait_for_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void **slot; + pgoff_t start; + struct page *page; + int error, scan; + + shmem_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + start, SHMEM_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. + */ + error = -EBUSY; + } + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(&mapping->page_tree, + iter.index, SHMEM_TAG_PINNED); + spin_unlock_irq(&mapping->tree_lock); +continue_resched: + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + rcu_read_unlock(); + } + + return error; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +int shmem_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a shmem-file but restrict + * access to a specific subset of file operations. Seals can only be + * added, but never removed. This way, mutually untrusted parties can + * share common memory regions with a well-defined policy. A malicious + * peer can thus never perform unwanted operations on a shared object. + * + * Seals are only supported on special shmem-files and always affect + * the whole underlying inode. Once a seal is set, it may prevent some + * kinds of access to the file. Currently, the following seals are + * defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous shmem files support sealing. More importantly, seals are + * never written to disk. Therefore, there's no plan to support it on + * other file types. + */ + + if (file->f_op != &shmem_file_operations) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + mutex_lock(&inode->i_mutex); + + if (info->seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = shmem_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + info->seals |= seals; + error = 0; + +unlock: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(shmem_add_seals); + +int shmem_get_seals(struct file *file) +{ + if (file->f_op != &shmem_file_operations) + return -EINVAL; + + return SHMEM_I(file_inode(file))->seals; +} +EXPORT_SYMBOL_GPL(shmem_get_seals); + +long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = shmem_add_seals(file, arg); + break; + case F_GET_SEALS: + error = shmem_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end; int error; @@ -1828,6 +2070,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + error = -EPERM; + goto out; + } + shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; @@ -1854,6 +2102,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + start = offset >> PAGE_CACHE_SHIFT; end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ @@ -2617,6 +2870,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) shmem_show_mpol(seq, sbinfo->mpol); return 0; } + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct shmem_inode_info *info; + struct file *file; + int fd, error; + char *name; + long len; + + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + info = SHMEM_I(file_inode(file)); + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + if (flags & MFD_ALLOW_SEALING) + info->seals &= ~F_SEAL_SEAL; + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} + #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) diff --git a/mm/swap_state.c b/mm/swap_state.c index e160151da6b8..3e0ec83d000c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,6 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, } diff --git a/scripts/.gitignore b/scripts/.gitignore index fb070fa1038f..5ecfe93f2028 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -4,7 +4,6 @@ conmakehash kallsyms pnmtologo -bin2c unifdef ihex2fw recordmcount diff --git a/scripts/Makefile b/scripts/Makefile index 890df5c6adfb..72902b5f2721 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -13,7 +13,6 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash -hostprogs-$(CONFIG_IKCONFIG) += bin2c hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable hostprogs-$(CONFIG_ASN1) += asn1_compiler diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore index a776371a3502..9528ec9e5adc 100644 --- a/scripts/basic/.gitignore +++ b/scripts/basic/.gitignore @@ -1 +1,2 @@ fixdep +bin2c diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile index 4fcef87bb875..ec10d9345bc2 100644 --- a/scripts/basic/Makefile +++ b/scripts/basic/Makefile @@ -9,6 +9,7 @@ # fixdep: Used to generate dependency information during build process hostprogs-y := fixdep +hostprogs-$(CONFIG_BUILD_BIN2C) += bin2c always := $(hostprogs-y) # fixdep is needed to compile other host programs diff --git a/scripts/bin2c.c b/scripts/basic/bin2c.c index 96dd2bcbb407..af187e695345 100644 --- a/scripts/bin2c.c +++ b/scripts/basic/bin2c.c @@ -11,7 +11,7 @@ int main(int argc, char *argv[]) { - int ch, total=0; + int ch, total = 0; if (argc > 1) printf("const char %s[] %s=\n", @@ -19,10 +19,9 @@ int main(int argc, char *argv[]) do { printf("\t\""); - while ((ch = getchar()) != EOF) - { + while ((ch = getchar()) != EOF) { total++; - printf("\\x%02x",ch); + printf("\\x%02x", ch); if (total % 16 == 0) break; } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index d36d79e35c76..36ff2e4c7b6f 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -2,6 +2,7 @@ TARGETS = breakpoints TARGETS += cpu-hotplug TARGETS += efivarfs TARGETS += kcmp +TARGETS += memfd TARGETS += memory-hotplug TARGETS += mqueue TARGETS += mount diff --git a/tools/testing/selftests/memfd/.gitignore b/tools/testing/selftests/memfd/.gitignore new file mode 100644 index 000000000000..afe87c40ac80 --- /dev/null +++ b/tools/testing/selftests/memfd/.gitignore @@ -0,0 +1,4 @@ +fuse_mnt +fuse_test +memfd_test +memfd-test-file diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile new file mode 100644 index 000000000000..6816c491c5ff --- /dev/null +++ b/tools/testing/selftests/memfd/Makefile @@ -0,0 +1,41 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := X86 +endif +ifeq ($(ARCH),x86_64) + ARCH := X86 +endif + +CFLAGS += -D_FILE_OFFSET_BITS=64 +CFLAGS += -I../../../../arch/x86/include/generated/uapi/ +CFLAGS += -I../../../../arch/x86/include/uapi/ +CFLAGS += -I../../../../include/uapi/ +CFLAGS += -I../../../../include/ + +all: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) memfd_test.c -o memfd_test +else + echo "Not an x86 target, can't build memfd selftest" +endif + +run_tests: all +ifeq ($(ARCH),X86) + gcc $(CFLAGS) memfd_test.c -o memfd_test +endif + @./memfd_test || echo "memfd_test: [FAIL]" + +build_fuse: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) fuse_mnt.c `pkg-config fuse --cflags --libs` -o fuse_mnt + gcc $(CFLAGS) fuse_test.c -o fuse_test +else + echo "Not an x86 target, can't build memfd selftest" +endif + +run_fuse: build_fuse + @./run_fuse_test.sh || echo "fuse_test: [FAIL]" + +clean: + $(RM) memfd_test fuse_test diff --git a/tools/testing/selftests/memfd/fuse_mnt.c b/tools/testing/selftests/memfd/fuse_mnt.c new file mode 100644 index 000000000000..feacf1280fcd --- /dev/null +++ b/tools/testing/selftests/memfd/fuse_mnt.c @@ -0,0 +1,110 @@ +/* + * memfd test file-system + * This file uses FUSE to create a dummy file-system with only one file /memfd. + * This file is read-only and takes 1s per read. + * + * This file-system is used by the memfd test-cases to force the kernel to pin + * pages during reads(). Due to the 1s delay of this file-system, this is a + * nice way to test race-conditions against get_user_pages() in the kernel. + * + * We use direct_io==1 to force the kernel to use direct-IO for this + * file-system. + */ + +#define FUSE_USE_VERSION 26 + +#include <fuse.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> + +static const char memfd_content[] = "memfd-example-content"; +static const char memfd_path[] = "/memfd"; + +static int memfd_getattr(const char *path, struct stat *st) +{ + memset(st, 0, sizeof(*st)); + + if (!strcmp(path, "/")) { + st->st_mode = S_IFDIR | 0755; + st->st_nlink = 2; + } else if (!strcmp(path, memfd_path)) { + st->st_mode = S_IFREG | 0444; + st->st_nlink = 1; + st->st_size = strlen(memfd_content); + } else { + return -ENOENT; + } + + return 0; +} + +static int memfd_readdir(const char *path, + void *buf, + fuse_fill_dir_t filler, + off_t offset, + struct fuse_file_info *fi) +{ + if (strcmp(path, "/")) + return -ENOENT; + + filler(buf, ".", NULL, 0); + filler(buf, "..", NULL, 0); + filler(buf, memfd_path + 1, NULL, 0); + + return 0; +} + +static int memfd_open(const char *path, struct fuse_file_info *fi) +{ + if (strcmp(path, memfd_path)) + return -ENOENT; + + if ((fi->flags & 3) != O_RDONLY) + return -EACCES; + + /* force direct-IO */ + fi->direct_io = 1; + + return 0; +} + +static int memfd_read(const char *path, + char *buf, + size_t size, + off_t offset, + struct fuse_file_info *fi) +{ + size_t len; + + if (strcmp(path, memfd_path) != 0) + return -ENOENT; + + sleep(1); + + len = strlen(memfd_content); + if (offset < len) { + if (offset + size > len) + size = len - offset; + + memcpy(buf, memfd_content + offset, size); + } else { + size = 0; + } + + return size; +} + +static struct fuse_operations memfd_ops = { + .getattr = memfd_getattr, + .readdir = memfd_readdir, + .open = memfd_open, + .read = memfd_read, +}; + +int main(int argc, char *argv[]) +{ + return fuse_main(argc, argv, &memfd_ops, NULL); +} diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c new file mode 100644 index 000000000000..67908b18f035 --- /dev/null +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -0,0 +1,311 @@ +/* + * memfd GUP test-case + * This tests memfd interactions with get_user_pages(). We require the + * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This + * file-system delays _all_ reads by 1s and forces direct-IO. This means, any + * read() on files in that file-system will pin the receive-buffer pages for at + * least 1s via get_user_pages(). + * + * We use this trick to race ADD_SEALS against a write on a memfd object. The + * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use + * the read() syscall with our memory-mapped memfd object as receive buffer to + * force the kernel to write into our memfd object. + */ + +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <linux/falloc.h> +#include <linux/fcntl.h> +#include <linux/memfd.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/wait.h> +#include <unistd.h> + +#define MFD_DEF_SIZE 8192 +#define STACK_SIZE 65535 + +static int sys_memfd_create(const char *name, + unsigned int flags) +{ + return syscall(__NR_memfd_create, name, flags); +} + +static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) +{ + int r, fd; + + fd = sys_memfd_create(name, flags); + if (fd < 0) { + printf("memfd_create(\"%s\", %u) failed: %m\n", + name, flags); + abort(); + } + + r = ftruncate(fd, sz); + if (r < 0) { + printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz); + abort(); + } + + return fd; +} + +static __u64 mfd_assert_get_seals(int fd) +{ + long r; + + r = fcntl(fd, F_GET_SEALS); + if (r < 0) { + printf("GET_SEALS(%d) failed: %m\n", fd); + abort(); + } + + return r; +} + +static void mfd_assert_has_seals(int fd, __u64 seals) +{ + __u64 s; + + s = mfd_assert_get_seals(fd); + if (s != seals) { + printf("%llu != %llu = GET_SEALS(%d)\n", + (unsigned long long)seals, (unsigned long long)s, fd); + abort(); + } +} + +static void mfd_assert_add_seals(int fd, __u64 seals) +{ + long r; + __u64 s; + + s = mfd_assert_get_seals(fd); + r = fcntl(fd, F_ADD_SEALS, seals); + if (r < 0) { + printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n", + fd, (unsigned long long)s, (unsigned long long)seals); + abort(); + } +} + +static int mfd_busy_add_seals(int fd, __u64 seals) +{ + long r; + __u64 s; + + r = fcntl(fd, F_GET_SEALS); + if (r < 0) + s = 0; + else + s = r; + + r = fcntl(fd, F_ADD_SEALS, seals); + if (r < 0 && errno != EBUSY) { + printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n", + fd, (unsigned long long)s, (unsigned long long)seals); + abort(); + } + + return r; +} + +static void *mfd_assert_mmap_shared(int fd) +{ + void *p; + + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + +static void *mfd_assert_mmap_private(int fd) +{ + void *p; + + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + +static int global_mfd = -1; +static void *global_p = NULL; + +static int sealing_thread_fn(void *arg) +{ + int sig, r; + + /* + * This thread first waits 200ms so any pending operation in the parent + * is correctly started. After that, it tries to seal @global_mfd as + * SEAL_WRITE. This _must_ fail as the parent thread has a read() into + * that memory mapped object still ongoing. + * We then wait one more second and try sealing again. This time it + * must succeed as there shouldn't be anyone else pinning the pages. + */ + + /* wait 200ms for FUSE-request to be active */ + usleep(200000); + + /* unmount mapping before sealing to avoid i_mmap_writable failures */ + munmap(global_p, MFD_DEF_SIZE); + + /* Try sealing the global file; expect EBUSY or success. Current + * kernels will never succeed, but in the future, kernels might + * implement page-replacements or other fancy ways to avoid racing + * writes. */ + r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE); + if (r >= 0) { + printf("HURRAY! This kernel fixed GUP races!\n"); + } else { + /* wait 1s more so the FUSE-request is done */ + sleep(1); + + /* try sealing the global file again */ + mfd_assert_add_seals(global_mfd, F_SEAL_WRITE); + } + + return 0; +} + +static pid_t spawn_sealing_thread(void) +{ + uint8_t *stack; + pid_t pid; + + stack = malloc(STACK_SIZE); + if (!stack) { + printf("malloc(STACK_SIZE) failed: %m\n"); + abort(); + } + + pid = clone(sealing_thread_fn, + stack + STACK_SIZE, + SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, + NULL); + if (pid < 0) { + printf("clone() failed: %m\n"); + abort(); + } + + return pid; +} + +static void join_sealing_thread(pid_t pid) +{ + waitpid(pid, NULL, 0); +} + +int main(int argc, char **argv) +{ + static const char zero[MFD_DEF_SIZE]; + int fd, mfd, r; + void *p; + int was_sealed; + pid_t pid; + + if (argc < 2) { + printf("error: please pass path to file in fuse_mnt mount-point\n"); + abort(); + } + + /* open FUSE memfd file for GUP testing */ + printf("opening: %s\n", argv[1]); + fd = open(argv[1], O_RDONLY | O_CLOEXEC); + if (fd < 0) { + printf("cannot open(\"%s\"): %m\n", argv[1]); + abort(); + } + + /* create new memfd-object */ + mfd = mfd_assert_new("kern_memfd_fuse", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + /* mmap memfd-object for writing */ + p = mfd_assert_mmap_shared(mfd); + + /* pass mfd+mapping to a separate sealing-thread which tries to seal + * the memfd objects with SEAL_WRITE while we write into it */ + global_mfd = mfd; + global_p = p; + pid = spawn_sealing_thread(); + + /* Use read() on the FUSE file to read into our memory-mapped memfd + * object. This races the other thread which tries to seal the + * memfd-object. + * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s. + * This guarantees that the receive-buffer is pinned for 1s until the + * data is written into it. The racing ADD_SEALS should thus fail as + * the pages are still pinned. */ + r = read(fd, p, MFD_DEF_SIZE); + if (r < 0) { + printf("read() failed: %m\n"); + abort(); + } else if (!r) { + printf("unexpected EOF on read()\n"); + abort(); + } + + was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE; + + /* Wait for sealing-thread to finish and verify that it + * successfully sealed the file after the second try. */ + join_sealing_thread(pid); + mfd_assert_has_seals(mfd, F_SEAL_WRITE); + + /* *IF* the memfd-object was sealed at the time our read() returned, + * then the kernel did a page-replacement or canceled the read() (or + * whatever magic it did..). In that case, the memfd object is still + * all zero. + * In case the memfd-object was *not* sealed, the read() was successfull + * and the memfd object must *not* be all zero. + * Note that in real scenarios, there might be a mixture of both, but + * in this test-cases, we have explicit 200ms delays which should be + * enough to avoid any in-flight writes. */ + + p = mfd_assert_mmap_private(mfd); + if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) { + printf("memfd sealed during read() but data not discarded\n"); + abort(); + } else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) { + printf("memfd sealed after read() but data discarded\n"); + abort(); + } + + close(mfd); + close(fd); + + printf("fuse: DONE\n"); + + return 0; +} diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c new file mode 100644 index 000000000000..3634c909b1b0 --- /dev/null +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -0,0 +1,913 @@ +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <linux/falloc.h> +#include <linux/fcntl.h> +#include <linux/memfd.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <unistd.h> + +#define MFD_DEF_SIZE 8192 +#define STACK_SIZE 65535 + +static int sys_memfd_create(const char *name, + unsigned int flags) +{ + return syscall(__NR_memfd_create, name, flags); +} + +static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) +{ + int r, fd; + + fd = sys_memfd_create(name, flags); + if (fd < 0) { + printf("memfd_create(\"%s\", %u) failed: %m\n", + name, flags); + abort(); + } + + r = ftruncate(fd, sz); + if (r < 0) { + printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz); + abort(); + } + + return fd; +} + +static void mfd_fail_new(const char *name, unsigned int flags) +{ + int r; + + r = sys_memfd_create(name, flags); + if (r >= 0) { + printf("memfd_create(\"%s\", %u) succeeded, but failure expected\n", + name, flags); + close(r); + abort(); + } +} + +static __u64 mfd_assert_get_seals(int fd) +{ + long r; + + r = fcntl(fd, F_GET_SEALS); + if (r < 0) { + printf("GET_SEALS(%d) failed: %m\n", fd); + abort(); + } + + return r; +} + +static void mfd_assert_has_seals(int fd, __u64 seals) +{ + __u64 s; + + s = mfd_assert_get_seals(fd); + if (s != seals) { + printf("%llu != %llu = GET_SEALS(%d)\n", + (unsigned long long)seals, (unsigned long long)s, fd); + abort(); + } +} + +static void mfd_assert_add_seals(int fd, __u64 seals) +{ + long r; + __u64 s; + + s = mfd_assert_get_seals(fd); + r = fcntl(fd, F_ADD_SEALS, seals); + if (r < 0) { + printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n", + fd, (unsigned long long)s, (unsigned long long)seals); + abort(); + } +} + +static void mfd_fail_add_seals(int fd, __u64 seals) +{ + long r; + __u64 s; + + r = fcntl(fd, F_GET_SEALS); + if (r < 0) + s = 0; + else + s = r; + + r = fcntl(fd, F_ADD_SEALS, seals); + if (r >= 0) { + printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected\n", + fd, (unsigned long long)s, (unsigned long long)seals); + abort(); + } +} + +static void mfd_assert_size(int fd, size_t size) +{ + struct stat st; + int r; + + r = fstat(fd, &st); + if (r < 0) { + printf("fstat(%d) failed: %m\n", fd); + abort(); + } else if (st.st_size != size) { + printf("wrong file size %lld, but expected %lld\n", + (long long)st.st_size, (long long)size); + abort(); + } +} + +static int mfd_assert_dup(int fd) +{ + int r; + + r = dup(fd); + if (r < 0) { + printf("dup(%d) failed: %m\n", fd); + abort(); + } + + return r; +} + +static void *mfd_assert_mmap_shared(int fd) +{ + void *p; + + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + +static void *mfd_assert_mmap_private(int fd) +{ + void *p; + + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ, + MAP_PRIVATE, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + +static int mfd_assert_open(int fd, int flags, mode_t mode) +{ + char buf[512]; + int r; + + sprintf(buf, "/proc/self/fd/%d", fd); + r = open(buf, flags, mode); + if (r < 0) { + printf("open(%s) failed: %m\n", buf); + abort(); + } + + return r; +} + +static void mfd_fail_open(int fd, int flags, mode_t mode) +{ + char buf[512]; + int r; + + sprintf(buf, "/proc/self/fd/%d", fd); + r = open(buf, flags, mode); + if (r >= 0) { + printf("open(%s) didn't fail as expected\n"); + abort(); + } +} + +static void mfd_assert_read(int fd) +{ + char buf[16]; + void *p; + ssize_t l; + + l = read(fd, buf, sizeof(buf)); + if (l != sizeof(buf)) { + printf("read() failed: %m\n"); + abort(); + } + + /* verify PROT_READ *is* allowed */ + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ, + MAP_PRIVATE, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + munmap(p, MFD_DEF_SIZE); + + /* verify MAP_PRIVATE is *always* allowed (even writable) */ + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + munmap(p, MFD_DEF_SIZE); +} + +static void mfd_assert_write(int fd) +{ + ssize_t l; + void *p; + int r; + + /* verify write() succeeds */ + l = write(fd, "\0\0\0\0", 4); + if (l != 4) { + printf("write() failed: %m\n"); + abort(); + } + + /* verify PROT_READ | PROT_WRITE is allowed */ + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + *(char *)p = 0; + munmap(p, MFD_DEF_SIZE); + + /* verify PROT_WRITE is allowed */ + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_WRITE, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + *(char *)p = 0; + munmap(p, MFD_DEF_SIZE); + + /* verify PROT_READ with MAP_SHARED is allowed and a following + * mprotect(PROT_WRITE) allows writing */ + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); + if (r < 0) { + printf("mprotect() failed: %m\n"); + abort(); + } + + *(char *)p = 0; + munmap(p, MFD_DEF_SIZE); + + /* verify PUNCH_HOLE works */ + r = fallocate(fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, + MFD_DEF_SIZE); + if (r < 0) { + printf("fallocate(PUNCH_HOLE) failed: %m\n"); + abort(); + } +} + +static void mfd_fail_write(int fd) +{ + ssize_t l; + void *p; + int r; + + /* verify write() fails */ + l = write(fd, "data", 4); + if (l != -EPERM) { + printf("expected EPERM on write(), but got %d: %m\n", (int)l); + abort(); + } + + /* verify PROT_READ | PROT_WRITE is not allowed */ + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, + 0); + if (p != MAP_FAILED) { + printf("mmap() didn't fail as expected\n"); + abort(); + } + + /* verify PROT_WRITE is not allowed */ + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_WRITE, + MAP_SHARED, + fd, + 0); + if (p != MAP_FAILED) { + printf("mmap() didn't fail as expected\n"); + abort(); + } + + /* Verify PROT_READ with MAP_SHARED with a following mprotect is not + * allowed. Note that for r/w the kernel already prevents the mmap. */ + p = mmap(NULL, + MFD_DEF_SIZE, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p != MAP_FAILED) { + r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); + if (r >= 0) { + printf("mmap()+mprotect() didn't fail as expected\n"); + abort(); + } + } + + /* verify PUNCH_HOLE fails */ + r = fallocate(fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, + MFD_DEF_SIZE); + if (r >= 0) { + printf("fallocate(PUNCH_HOLE) didn't fail as expected\n"); + abort(); + } +} + +static void mfd_assert_shrink(int fd) +{ + int r, fd2; + + r = ftruncate(fd, MFD_DEF_SIZE / 2); + if (r < 0) { + printf("ftruncate(SHRINK) failed: %m\n"); + abort(); + } + + mfd_assert_size(fd, MFD_DEF_SIZE / 2); + + fd2 = mfd_assert_open(fd, + O_RDWR | O_CREAT | O_TRUNC, + S_IRUSR | S_IWUSR); + close(fd2); + + mfd_assert_size(fd, 0); +} + +static void mfd_fail_shrink(int fd) +{ + int r; + + r = ftruncate(fd, MFD_DEF_SIZE / 2); + if (r >= 0) { + printf("ftruncate(SHRINK) didn't fail as expected\n"); + abort(); + } + + mfd_fail_open(fd, + O_RDWR | O_CREAT | O_TRUNC, + S_IRUSR | S_IWUSR); +} + +static void mfd_assert_grow(int fd) +{ + int r; + + r = ftruncate(fd, MFD_DEF_SIZE * 2); + if (r < 0) { + printf("ftruncate(GROW) failed: %m\n"); + abort(); + } + + mfd_assert_size(fd, MFD_DEF_SIZE * 2); + + r = fallocate(fd, + 0, + 0, + MFD_DEF_SIZE * 4); + if (r < 0) { + printf("fallocate(ALLOC) failed: %m\n"); + abort(); + } + + mfd_assert_size(fd, MFD_DEF_SIZE * 4); +} + +static void mfd_fail_grow(int fd) +{ + int r; + + r = ftruncate(fd, MFD_DEF_SIZE * 2); + if (r >= 0) { + printf("ftruncate(GROW) didn't fail as expected\n"); + abort(); + } + + r = fallocate(fd, + 0, + 0, + MFD_DEF_SIZE * 4); + if (r >= 0) { + printf("fallocate(ALLOC) didn't fail as expected\n"); + abort(); + } +} + +static void mfd_assert_grow_write(int fd) +{ + static char buf[MFD_DEF_SIZE * 8]; + ssize_t l; + + l = pwrite(fd, buf, sizeof(buf), 0); + if (l != sizeof(buf)) { + printf("pwrite() failed: %m\n"); + abort(); + } + + mfd_assert_size(fd, MFD_DEF_SIZE * 8); +} + +static void mfd_fail_grow_write(int fd) +{ + static char buf[MFD_DEF_SIZE * 8]; + ssize_t l; + + l = pwrite(fd, buf, sizeof(buf), 0); + if (l == sizeof(buf)) { + printf("pwrite() didn't fail as expected\n"); + abort(); + } +} + +static int idle_thread_fn(void *arg) +{ + sigset_t set; + int sig; + + /* dummy waiter; SIGTERM terminates us anyway */ + sigemptyset(&set); + sigaddset(&set, SIGTERM); + sigwait(&set, &sig); + + return 0; +} + +static pid_t spawn_idle_thread(unsigned int flags) +{ + uint8_t *stack; + pid_t pid; + + stack = malloc(STACK_SIZE); + if (!stack) { + printf("malloc(STACK_SIZE) failed: %m\n"); + abort(); + } + + pid = clone(idle_thread_fn, + stack + STACK_SIZE, + SIGCHLD | flags, + NULL); + if (pid < 0) { + printf("clone() failed: %m\n"); + abort(); + } + + return pid; +} + +static void join_idle_thread(pid_t pid) +{ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +/* + * Test memfd_create() syscall + * Verify syscall-argument validation, including name checks, flag validation + * and more. + */ +static void test_create(void) +{ + char buf[2048]; + int fd; + + /* test NULL name */ + mfd_fail_new(NULL, 0); + + /* test over-long name (not zero-terminated) */ + memset(buf, 0xff, sizeof(buf)); + mfd_fail_new(buf, 0); + + /* test over-long zero-terminated name */ + memset(buf, 0xff, sizeof(buf)); + buf[sizeof(buf) - 1] = 0; + mfd_fail_new(buf, 0); + + /* verify "" is a valid name */ + fd = mfd_assert_new("", 0, 0); + close(fd); + + /* verify invalid O_* open flags */ + mfd_fail_new("", 0x0100); + mfd_fail_new("", ~MFD_CLOEXEC); + mfd_fail_new("", ~MFD_ALLOW_SEALING); + mfd_fail_new("", ~0); + mfd_fail_new("", 0x80000000U); + + /* verify MFD_CLOEXEC is allowed */ + fd = mfd_assert_new("", 0, MFD_CLOEXEC); + close(fd); + + /* verify MFD_ALLOW_SEALING is allowed */ + fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); + close(fd); + + /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ + fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); + close(fd); +} + +/* + * Test basic sealing + * A very basic sealing test to see whether setting/retrieving seals works. + */ +static void test_basic(void) +{ + int fd; + + fd = mfd_assert_new("kern_memfd_basic", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + /* add basic seals */ + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_SHRINK | + F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_SHRINK | + F_SEAL_WRITE); + + /* add them again */ + mfd_assert_add_seals(fd, F_SEAL_SHRINK | + F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_SHRINK | + F_SEAL_WRITE); + + /* add more seals and seal against sealing */ + mfd_assert_add_seals(fd, F_SEAL_GROW | F_SEAL_SEAL); + mfd_assert_has_seals(fd, F_SEAL_SHRINK | + F_SEAL_GROW | + F_SEAL_WRITE | + F_SEAL_SEAL); + + /* verify that sealing no longer works */ + mfd_fail_add_seals(fd, F_SEAL_GROW); + mfd_fail_add_seals(fd, 0); + + close(fd); + + /* verify sealing does not work without MFD_ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_basic", + MFD_DEF_SIZE, + MFD_CLOEXEC); + mfd_assert_has_seals(fd, F_SEAL_SEAL); + mfd_fail_add_seals(fd, F_SEAL_SHRINK | + F_SEAL_GROW | + F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_SEAL); + close(fd); +} + +/* + * Test SEAL_WRITE + * Test whether SEAL_WRITE actually prevents modifications. + */ +static void test_seal_write(void) +{ + int fd; + + fd = mfd_assert_new("kern_memfd_seal_write", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE); + + mfd_assert_read(fd); + mfd_fail_write(fd); + mfd_assert_shrink(fd); + mfd_assert_grow(fd); + mfd_fail_grow_write(fd); + + close(fd); +} + +/* + * Test SEAL_SHRINK + * Test whether SEAL_SHRINK actually prevents shrinking + */ +static void test_seal_shrink(void) +{ + int fd; + + fd = mfd_assert_new("kern_memfd_seal_shrink", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_SHRINK); + mfd_assert_has_seals(fd, F_SEAL_SHRINK); + + mfd_assert_read(fd); + mfd_assert_write(fd); + mfd_fail_shrink(fd); + mfd_assert_grow(fd); + mfd_assert_grow_write(fd); + + close(fd); +} + +/* + * Test SEAL_GROW + * Test whether SEAL_GROW actually prevents growing + */ +static void test_seal_grow(void) +{ + int fd; + + fd = mfd_assert_new("kern_memfd_seal_grow", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_GROW); + mfd_assert_has_seals(fd, F_SEAL_GROW); + + mfd_assert_read(fd); + mfd_assert_write(fd); + mfd_assert_shrink(fd); + mfd_fail_grow(fd); + mfd_fail_grow_write(fd); + + close(fd); +} + +/* + * Test SEAL_SHRINK | SEAL_GROW + * Test whether SEAL_SHRINK | SEAL_GROW actually prevents resizing + */ +static void test_seal_resize(void) +{ + int fd; + + fd = mfd_assert_new("kern_memfd_seal_resize", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW); + mfd_assert_has_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW); + + mfd_assert_read(fd); + mfd_assert_write(fd); + mfd_fail_shrink(fd); + mfd_fail_grow(fd); + mfd_fail_grow_write(fd); + + close(fd); +} + +/* + * Test sharing via dup() + * Test that seals are shared between dupped FDs and they're all equal. + */ +static void test_share_dup(void) +{ + int fd, fd2; + + fd = mfd_assert_new("kern_memfd_share_dup", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_has_seals(fd, 0); + + fd2 = mfd_assert_dup(fd); + mfd_assert_has_seals(fd2, 0); + + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd2, F_SEAL_WRITE); + + mfd_assert_add_seals(fd2, F_SEAL_SHRINK); + mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); + mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK); + + mfd_assert_add_seals(fd, F_SEAL_SEAL); + mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL); + mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL); + + mfd_fail_add_seals(fd, F_SEAL_GROW); + mfd_fail_add_seals(fd2, F_SEAL_GROW); + mfd_fail_add_seals(fd, F_SEAL_SEAL); + mfd_fail_add_seals(fd2, F_SEAL_SEAL); + + close(fd2); + + mfd_fail_add_seals(fd, F_SEAL_GROW); + close(fd); +} + +/* + * Test sealing with active mmap()s + * Modifying seals is only allowed if no other mmap() refs exist. + */ +static void test_share_mmap(void) +{ + int fd; + void *p; + + fd = mfd_assert_new("kern_memfd_share_mmap", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_has_seals(fd, 0); + + /* shared/writable ref prevents sealing WRITE, but allows others */ + p = mfd_assert_mmap_shared(fd); + mfd_fail_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_SHRINK); + mfd_assert_has_seals(fd, F_SEAL_SHRINK); + munmap(p, MFD_DEF_SIZE); + + /* readable ref allows sealing */ + p = mfd_assert_mmap_private(fd); + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); + munmap(p, MFD_DEF_SIZE); + + close(fd); +} + +/* + * Test sealing with open(/proc/self/fd/%d) + * Via /proc we can get access to a separate file-context for the same memfd. + * This is *not* like dup(), but like a real separate open(). Make sure the + * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR. + */ +static void test_share_open(void) +{ + int fd, fd2; + + fd = mfd_assert_new("kern_memfd_share_open", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_has_seals(fd, 0); + + fd2 = mfd_assert_open(fd, O_RDWR, 0); + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd2, F_SEAL_WRITE); + + mfd_assert_add_seals(fd2, F_SEAL_SHRINK); + mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); + mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK); + + close(fd); + fd = mfd_assert_open(fd2, O_RDONLY, 0); + + mfd_fail_add_seals(fd, F_SEAL_SEAL); + mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); + mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK); + + close(fd2); + fd2 = mfd_assert_open(fd, O_RDWR, 0); + + mfd_assert_add_seals(fd2, F_SEAL_SEAL); + mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL); + mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL); + + close(fd2); + close(fd); +} + +/* + * Test sharing via fork() + * Test whether seal-modifications work as expected with forked childs. + */ +static void test_share_fork(void) +{ + int fd; + pid_t pid; + + fd = mfd_assert_new("kern_memfd_share_fork", + MFD_DEF_SIZE, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_has_seals(fd, 0); + + pid = spawn_idle_thread(0); + mfd_assert_add_seals(fd, F_SEAL_SEAL); + mfd_assert_has_seals(fd, F_SEAL_SEAL); + + mfd_fail_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_SEAL); + + join_idle_thread(pid); + + mfd_fail_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_SEAL); + + close(fd); +} + +int main(int argc, char **argv) +{ + pid_t pid; + + printf("memfd: CREATE\n"); + test_create(); + printf("memfd: BASIC\n"); + test_basic(); + + printf("memfd: SEAL-WRITE\n"); + test_seal_write(); + printf("memfd: SEAL-SHRINK\n"); + test_seal_shrink(); + printf("memfd: SEAL-GROW\n"); + test_seal_grow(); + printf("memfd: SEAL-RESIZE\n"); + test_seal_resize(); + + printf("memfd: SHARE-DUP\n"); + test_share_dup(); + printf("memfd: SHARE-MMAP\n"); + test_share_mmap(); + printf("memfd: SHARE-OPEN\n"); + test_share_open(); + printf("memfd: SHARE-FORK\n"); + test_share_fork(); + + /* Run test-suite in a multi-threaded environment with a shared + * file-table. */ + pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM); + printf("memfd: SHARE-DUP (shared file-table)\n"); + test_share_dup(); + printf("memfd: SHARE-MMAP (shared file-table)\n"); + test_share_mmap(); + printf("memfd: SHARE-OPEN (shared file-table)\n"); + test_share_open(); + printf("memfd: SHARE-FORK (shared file-table)\n"); + test_share_fork(); + join_idle_thread(pid); + + printf("memfd: DONE\n"); + + return 0; +} diff --git a/tools/testing/selftests/memfd/run_fuse_test.sh b/tools/testing/selftests/memfd/run_fuse_test.sh new file mode 100644 index 000000000000..69b930e1e041 --- /dev/null +++ b/tools/testing/selftests/memfd/run_fuse_test.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +if test -d "./mnt" ; then + fusermount -u ./mnt + rmdir ./mnt +fi + +set -e + +mkdir mnt +./fuse_mnt ./mnt +./fuse_test ./mnt/memfd +fusermount -u ./mnt +rmdir ./mnt |