author     Stephen Rothwell <sfr@canb.auug.org.au>  2013-05-08 13:35:13 +1000
committer  Stephen Rothwell <sfr@canb.auug.org.au>  2013-05-08 13:35:13 +1000
commit     248aa85d2a5e2fa672dc7ac666188df3e7512c4e (patch)
tree       6fca355f27e52b5ed823f9ae81959d14076d3b23
parent     8443a7e62b4bdde2b6c7b964e40a99266e850b47 (diff)
parent     d72588eba1b10afa45d317d3debeee8aad399c18 (diff)
Merge branch 'akpm/master'
267 files changed, 6325 insertions, 2406 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 09027a9fece5..1178e2357acb 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -72,6 +72,7 @@ Brief summary of control files.
 memory.move_charge_at_immigrate # set/show controls of moving charges
 memory.oom_control # set/show oom controls.
 memory.numa_stat # show the number of memory usage per numa node
+ memory.dangling_memcgs # show debugging information about dangling groups
 memory.kmem.limit_in_bytes # set/show hard limit for kernel memory
 memory.kmem.usage_in_bytes # show current kernel memory allocation
@@ -579,6 +580,21 @@ unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
 
 And we have total = file + anon + unevictable.
+
+5.7 dangling_memcgs
+
+This file is only ever present in the root cgroup, and only if the option
+CONFIG_MEMCG_DEBUG_ASYNC_DESTROY is set. When a memcg is destroyed, the memory
+consumed by it may not be freed immediately. This is because when some
+extensions are used, such as swap or kernel memory, objects can outlive the
+group and hold a reference to it.
+
+If this is the case, the dangling_memcgs file will show which memcgs are still
+alive and which references are still preventing them from being freed. There
+is nothing wrong with that, but it is very useful when debugging to know where
+this memory is being held. This is a developer-oriented debugging facility
+only, and no guarantees of interface stability will be given. The file is
+read-only, and has the sole purpose of displaying information.
+
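As a usage illustration (editorial note, not part of the patch; the mount
point below is an assumption, since the memory controller is commonly, but
not necessarily, mounted at /sys/fs/cgroup/memory):

	# root cgroup only; requires CONFIG_MEMCG_DEBUG_ASYNC_DESTROY=y
	cat /sys/fs/cgroup/memory/memory.dangling_memcgs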
 6. Hierarchy support
 
 The memory controller supports a deep hierarchy and hierarchical accounting.
diff --git a/Documentation/devicetree/bindings/video/simple-framebuffer.txt b/Documentation/devicetree/bindings/video/simple-framebuffer.txt
new file mode 100644
index 000000000000..3ea460583111
--- /dev/null
+++ b/Documentation/devicetree/bindings/video/simple-framebuffer.txt
@@ -0,0 +1,25 @@
+Simple Framebuffer
+
+A simple framebuffer describes a raw memory region that may be rendered to,
+with the assumption that the display hardware has already been set up to scan
+out from that buffer.
+
+Required properties:
+- compatible: "simple-framebuffer"
+- reg: Should contain the location and size of the framebuffer memory.
+- width: The width of the framebuffer in pixels.
+- height: The height of the framebuffer in pixels.
+- stride: The number of bytes in each line of the framebuffer.
+- format: The format of the framebuffer surface. Valid values are:
+  - r5g6b5 (16-bit pixels, d[15:11]=r, d[10:5]=g, d[4:0]=b).
+
+Example:
+
+	framebuffer {
+		compatible = "simple-framebuffer";
+		reg = <0x1d385000 (1600 * 1200 * 2)>;
+		width = <1600>;
+		height = <1200>;
+		stride = <(1600 * 2)>;
+		format = "r5g6b5";
+	};
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d594fc7..488c0940f3d8 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -473,7 +473,8 @@
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 
 The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
-bits on both physical and virtual pages associated with a process.
+bits on both physical and virtual pages associated with a process, and the
+soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
 
 To clear the bits for all the pages associated with the process
 > echo 1 > /proc/PID/clear_refs
@@ -482,11 +483,17 @@ To clear the bits for the anonymous pages associated with the process
 > echo 2 > /proc/PID/clear_refs
 
 To clear the bits for the file mapped pages associated with the process
 > echo 3 > /proc/PID/clear_refs
+
+To clear the soft-dirty bit
+ > echo 4 > /proc/PID/clear_refs
+
 Any other value written to /proc/PID/clear_refs will have no effect.
 
 The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
 using /proc/kpageflags and number of times a page is mapped using
 /proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
+(There is also a /proc/pid/pagemap2 file, which is the second version of the
+ pagemap one.)
 
 1.2 Kernel data
 ---------------
diff --git a/Documentation/rapidio/rapidio.txt b/Documentation/rapidio/rapidio.txt
index c75694b35d08..d97576f5b082 100644
--- a/Documentation/rapidio/rapidio.txt
+++ b/Documentation/rapidio/rapidio.txt
@@ -79,20 +79,64 @@ master port that is used to communicate with devices within the network.
 In order to initialize the RapidIO subsystem, a platform must initialize and
 register at least one master port within the RapidIO network. To register mport
 within the subsystem controller driver initialization code calls function
-rio_register_mport() for each available master port. After all active master
-ports are registered with a RapidIO subsystem, the rio_init_mports() routine
-is called to perform enumeration and discovery.
+rio_register_mport() for each available master port.
 
-In the current PowerPC-based implementation a subsys_initcall() is specified to
-perform controller initialization and mport registration. At the end it directly
-calls rio_init_mports() to execute RapidIO enumeration and discovery.
+The RapidIO subsystem uses subsys_initcall() or device_initcall() to perform
+controller initialization (depending on the controller device type).
+
+After all active master ports are registered with the RapidIO subsystem,
+an enumeration and/or discovery routine may be called automatically or
+by a user-space command.
 
 4. Enumeration and Discovery
 ----------------------------
 
-When rio_init_mports() is called it scans a list of registered master ports and
-calls an enumeration or discovery routine depending on the configured role of a
-master port: host or agent.
+4.1 Overview
+------------
+
+RapidIO subsystem configuration options allow users to specify enumeration and
+discovery methods as built-in components or loadable modules.
+For built-in options, only one method can be selected for all mports in a
+system. Selecting a modular build option for enumeration/discovery allows
+using different enumerators, attached to individual mport devices.
+
+Depending on the selected enumeration/discovery build configuration, there are
+several methods to initiate the enumeration and/or discovery process:
+
+ (a) The built-in enumeration and discovery process can be started
+ automatically during kernel initialization. This is the original method used
+ since the introduction of the RapidIO subsystem. It relies on a kernel
+ initcall that calls the rio_init_mports() routine after all available mports
+ have been registered. When automatic start of enumeration/discovery is used,
+ a user has to ensure that all discovering endpoints are started before the
+ enumerating endpoint and are waiting for enumeration to be completed.
+ The configuration option CONFIG_RAPIDIO_DISC_TIMEOUT defines the time that a
+ discovering endpoint waits for enumeration to be completed.
+ If the specified timeout expires, the discovery process is terminated
+ without obtaining RapidIO network information. NOTE: a timed-out discovery
+ process may be restarted later using a user-space command (as described
+ below) if the given endpoint was enumerated successfully.
+
+ (b) The built-in enumeration and discovery process can be started by a
+ command from user space. This initiation method provides more freedom for
+ system startup compared to option (a) above. After all participating
+ endpoints have been successfully booted, an enumeration process shall be
+ started first by issuing a user-space command; as soon as enumeration is
+ completed, a discovery process can be started on all remaining endpoints.
+ The configuration option CONFIG_RAPIDIO_ENUM_AUTO defines which method will
+ be used to start a built-in enumeration and discovery process.
+
+ (c) The modular enumeration and discovery process can be started by a
+ command from user space. After an enumeration/discovery module is loaded, a
+ network scan process can be started by issuing a user-space command.
+ Similar to option (b) above, an enumerator has to be started first.
+
+ (d) The modular enumeration and discovery process can be started by a module
+ initialization routine. As in the cases above, an enumerating module shall
+ be loaded first.
+
+When a network scan process is started, it calls an enumeration or discovery
+routine depending on the configured role of a master port: host or agent.
 
 Enumeration is performed by a master port if it is configured as a host port by
 assigning a host device ID greater than or equal to zero. A host device ID is
@@ -104,8 +148,57 @@ for it.
 
 The enumeration and discovery routines use RapidIO maintenance transactions
 to access the configuration space of devices.
 
-The enumeration process is implemented according to the enumeration algorithm
-outlined in the RapidIO Interconnect Specification: Annex I [1].
+4.2 Automatic Start of Enumeration and Discovery
+------------------------------------------------
+
+The automatic enumeration/discovery start method is applicable only to the
+built-in enumeration/discovery configuration. To enable automatic
+enumeration/discovery start, set CONFIG_RAPIDIO_ENUM_AUTO=y.
+
+This configuration requires a synchronized start of all RapidIO endpoints
+that form the network to be enumerated/discovered. Discovering endpoints have
+to be started before enumeration starts, to ensure that all RapidIO
+controllers have been initialized and are ready to be discovered. The
+configuration parameter CONFIG_RAPIDIO_DISC_TIMEOUT defines the time (in
+seconds) which a discovering endpoint will wait for enumeration to be
+completed.
+
+When the automatic enumeration/discovery start method is selected, the
+RapidIO subsystem core uses device_initcall_sync() to invoke the
+rio_init_mports() routine, which performs enumeration or discovery for all
+known mport devices.
+
+Depending on the RapidIO network size and configuration, this automatic start
+method may be difficult to use due to the requirement for a synchronized
+start of all endpoints.
+
+4.3 User-space Start of Enumeration and Discovery
+-------------------------------------------------
+
+User-space start of enumeration and discovery can be used with both built-in
+and modular build configurations. For user-space controlled start, the
+RapidIO subsystem creates the write-only sysfs attribute file
+'/sys/bus/rapidio/scan'.
+To initiate an enumeration or discovery process on a specific mport device, a
+user needs to write the mport_ID (not a RapidIO destination ID) into that
+file. The mport_ID is a sequential number (0 ... RIO_MAX_MPORTS) assigned
+during mport device registration. For example, on a machine with a single
+RapidIO controller, the mport_ID for that controller will always be 0.
+
+To initiate RapidIO enumeration/discovery on all available mports, a user may
+write '-1' (or RIO_MPORT_ANY) into the scan attribute file.
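For example, on a small network where this node's single controller is
configured as the enumerating host (an editorial illustration of the
interface described above, not part of the patch):

	# on the enumerating (host) endpoint:
	echo 0 > /sys/bus/rapidio/scan

	# on each discovering (agent) endpoint, once enumeration completes:
	echo 0 > /sys/bus/rapidio/scan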
+
+4.4 Basic Enumeration Method
+----------------------------
+
+This is the original enumeration/discovery method, available since the first
+release of the RapidIO subsystem code. The enumeration process is implemented
+according to the enumeration algorithm outlined in the RapidIO Interconnect
+Specification: Annex I [1].
+
+This method can be configured as built-in code or as a loadable module. When
+built as a kernel module, this method offers a "scan" parameter which allows
+the enumeration/discovery process to be triggered from the module
+initialization routine.
+
+This enumeration/discovery method can be started only once and does not
+support unloading if it is built as a module.
 
 The enumeration process traverses the network using a recursive depth-first
 algorithm. When a new device is found, the enumerator takes ownership of that
@@ -160,6 +253,28 @@ time period. If this wait time period expires before enumeration is
 completed, an agent skips RapidIO discovery and continues with remaining
 kernel initialization.
 
+4.5 Adding New Enumeration/Discovery Method
+-------------------------------------------
+
+The RapidIO subsystem code organization allows the addition of new
+enumeration/discovery methods as new configuration options without
+significant impact on the core RapidIO code.
+
+RapidIO developers can add new enumeration/discovery methods as built-in code
+or as loadable kernel modules. In the case of built-in methods, only one
+method may be configured in; having multiple built-in enumeration/discovery
+methods is not supported by the mport role assignment mechanism.
+
+To be registered with the RapidIO subsystem, a new built-in method must
+provide a global data structure named "rio_scan_ops" (see the definition of
+'struct rio_scan' in the include/linux/rio.h file). An example of this
+structure is available in the existing implementation of the basic RapidIO
+enumeration method (see rio-scan.c).
+
+If a new enumeration/discovery method is implemented as a module, its module
+initialization routine has to call the rio_register_scan() routine to attach
+a loadable enumerator to a specified mport device. The basic enumerator
+implementation demonstrates this process as well.
+
 5. References
 -------------
diff --git a/Documentation/rapidio/sysfs.txt b/Documentation/rapidio/sysfs.txt
index 97f71ce575d6..c3b7d18ebdff 100644
--- a/Documentation/rapidio/sysfs.txt
+++ b/Documentation/rapidio/sysfs.txt
@@ -88,3 +88,20 @@ that exports additional attributes.
 IDT_GEN2:
 errlog - reads contents of device error log until it is empty.
+
+
+5. RapidIO Bus Attributes
+-------------------------
+
+The RapidIO bus subdirectory /sys/bus/rapidio implements the following
+bus-specific attribute:
+
+ scan - allows triggering the enumeration/discovery process from user space.
+        This is a write-only attribute. To initiate an enumeration or
+        discovery process on a specific mport device, a user needs to write
+        the mport_ID (not a RapidIO destination ID) into this file. The
+        mport_ID is a sequential number (0 ... RIO_MAX_MPORTS) assigned to
+        the mport device. For example, on a machine with a single RapidIO
+        controller, the mport_ID for that controller will always be 0. To
+        initiate RapidIO enumeration/discovery on all available mports, a
+        user must write '-1' (or RIO_MPORT_ANY) into this attribute file.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index dcc75a9ed919..a5717c38834a 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -169,18 +169,39 @@ Setting this to zero disables periodic writeback altogether.
 
 drop_caches
 
-Writing to this will cause the kernel to drop clean caches, dentries and
-inodes from memory, causing that memory to become free.
+Writing to this will cause the kernel to drop clean caches, as well as
+reclaimable slab objects like dentries and inodes. Once dropped, their
+memory becomes free.
 
 To free pagecache:
 	echo 1 > /proc/sys/vm/drop_caches
-To free dentries and inodes:
+To free reclaimable slab objects (includes dentries and inodes):
 	echo 2 > /proc/sys/vm/drop_caches
-To free pagecache, dentries and inodes:
+To free slab objects and pagecache:
 	echo 3 > /proc/sys/vm/drop_caches
 
-As this is a non-destructive operation and dirty objects are not freeable, the
-user should run `sync' first.
+This is a non-destructive operation and will not free any dirty objects.
+To increase the number of objects freed by this operation, the user may run
+`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the
+number of dirty objects on the system and create more candidates to be
+dropped.
+
+This file is not a means to control the growth of the various kernel caches
+(inodes, dentries, pagecache, etc.). These objects are automatically
+reclaimed by the kernel when memory is needed elsewhere on the system.
+
+Use of this file can cause performance problems. Since it discards cached
+objects, it may cost a significant amount of I/O and CPU to recreate the
+dropped objects, especially if they were under heavy use. Because of this,
+use outside of a testing or debugging environment is not recommended.
+
+You may see informational messages in your kernel log when this file is
+used:
+
+	cat (1234): dropped kernel caches: 3
+
+These are informational only. They do not mean that anything is wrong
+with your system.
 
==============================================================
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 7587493c67f1..394cc03c8df6 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -30,6 +30,11 @@ There are three components to pagemap:
 determine which areas of memory are actually mapped and llseek to
 skip over unmapped regions.
 
+ * /proc/pid/pagemap2. This file provides the same info as pagemap does,
+   but bits 56-60 are reserved for future use and thus zero.
+
+   Bit 55 means the pte is soft-dirty (see Documentation/vm/soft-dirty.txt).
+
 * /proc/kpagecount. This file contains a 64-bit count of the number of
 times each page is mapped, indexed by PFN.
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt
new file mode 100644
index 000000000000..9a12a5956bc0
--- /dev/null
+++ b/Documentation/vm/soft-dirty.txt
@@ -0,0 +1,36 @@
+ SOFT-DIRTY PTEs
+
+ Soft-dirty is a bit on a PTE which helps to track which pages a task
+writes to. In order to do this tracking, one should
+
+ 1. Clear the soft-dirty bits from the task's PTEs.
+
+    This is done by writing "4" into the /proc/PID/clear_refs file of the
+    task in question.
+
+ 2. Wait some time.
+
+ 3. Read the soft-dirty bits from the PTEs.
+
+    This is done by reading from /proc/PID/pagemap. Bit 55 of the 64-bit
+    qword is the soft-dirty one. If set, the respective PTE was written to
+    since step 1.
+
+
+ Internally, to do this tracking, the writable bit is cleared from PTEs
+when the soft-dirty bit is cleared. So, after this, when the task tries to
+modify a page at some virtual address, the #PF occurs and the kernel sets
+the soft-dirty bit on the respective PTE.
+
+ Note that although all of the task's address space is marked as r/o after
+the soft-dirty bits are cleared, the #PFs that occur after that are processed
+quickly. This is because the pages are still mapped to physical memory, and
+thus all the kernel does is note this fact and set both the writable and
+soft-dirty bits on the PTE.
+
+
+ This feature is actively used by the checkpoint-restore project. You
+can find more details about it at http://criu.org
+
+
+-- Pavel Emelyanov, Apr 9, 2013
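The three steps above map directly onto a small user-space sketch (an
editorial illustration, not part of the patch; error handling is omitted
for brevity):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		static char page[4096];
		long psize = sysconf(_SC_PAGESIZE);
		int clear = open("/proc/self/clear_refs", O_WRONLY);
		int pagemap = open("/proc/self/pagemap", O_RDONLY);
		uint64_t ent;

		/* step 1: clear the soft-dirty bits of this task's PTEs */
		write(clear, "4", 1);

		/* step 2: write to the page we are interested in */
		memset(page, 0x5a, sizeof(page));

		/* step 3: read the page's 64-bit pagemap entry, test bit 55 */
		pread(pagemap, &ent, sizeof(ent),
		      ((uintptr_t)page / psize) * sizeof(ent));
		if (ent & (1ULL << 55))
			printf("page was written to since the clear\n");

		close(pagemap);
		close(clear);
		return 0;
	}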
diff --git a/arch/Kconfig b/arch/Kconfig
index a4429bcd609e..651a9e747ceb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -332,6 +332,10 @@ config HAVE_ARCH_SECCOMP_FILTER
 	  - secure_computing return value is checked and a return value of -1
 	    results in the system call being skipped immediately.
 
+# Used by archs to indicate that they support SECCOMP_FILTER_JIT
+config HAVE_SECCOMP_FILTER_JIT
+	bool
+
 config SECCOMP_FILTER
 	def_bool y
 	depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
@@ -342,6 +346,16 @@ config SECCOMP_FILTER
 
 	  See Documentation/prctl/seccomp_filter.txt for details.
 
+config SECCOMP_FILTER_JIT
+	bool "enable Seccomp filter Just In Time compiler"
+	depends on HAVE_SECCOMP_FILTER_JIT && BPF_JIT && SECCOMP_FILTER
+	help
+	  Seccomp syscall filtering capabilities are normally handled
+	  by an interpreter. This option allows the kernel to generate
+	  native code when a filter is loaded into memory, which should
+	  speed up syscall filtering. Note: an admin should enable this
+	  feature by changing /proc/sys/net/core/bpf_jit_enable.
+
 config HAVE_CONTEXT_TRACKING
 	bool
 	help
@@ -365,6 +379,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	bool
 
+config HAVE_ARCH_SOFT_DIRTY
+	bool
+
 config HAVE_MOD_ARCH_SPECIFIC
 	bool
 	help
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d00a7336061c..f95674a76af0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -25,6 +25,7 @@ config ARM
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_BPF_JIT
+	select HAVE_SECCOMP_FILTER_JIT
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_API_DEBUG
@@ -39,6 +40,7 @@ config ARM
 	select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7))
 	select HAVE_IDE if PCI || ISA || PCMCIA
 	select HAVE_KERNEL_GZIP
+	select HAVE_KERNEL_LZ4
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_XZ
diff --git a/arch/arm/boot/compressed/.gitignore b/arch/arm/boot/compressed/.gitignore
index f79a08efe000..47279aa96a6a 100644
--- a/arch/arm/boot/compressed/.gitignore
+++ b/arch/arm/boot/compressed/.gitignore
@@ -6,6 +6,7 @@ piggy.gzip
 piggy.lzo
 piggy.lzma
 piggy.xzkern
+piggy.lz4
 vmlinux
 vmlinux.lds
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 3580d57ea218..001a13a0cf6f 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -27,6 +27,9 @@ OBJS += misc.o decompress.o
 ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y)
 OBJS += debug.o
 endif
+ifeq ($(CONFIG_KERNEL_LZ4),y)
+CFLAGS_decompress.o := -Os
+endif
 FONTC = $(srctree)/drivers/video/console/font_acorn_8x8.c
 
 # string library code (-Os is enforced to keep it much smaller)
@@ -91,6 +94,7 @@ suffix_$(CONFIG_KERNEL_GZIP) = gzip
 suffix_$(CONFIG_KERNEL_LZO) = lzo
 suffix_$(CONFIG_KERNEL_LZMA) = lzma
 suffix_$(CONFIG_KERNEL_XZ) = xzkern
+suffix_$(CONFIG_KERNEL_LZ4) = lz4
 
 # Borrowed libfdt files for the ATAG compatibility mode
@@ -115,7 +119,7 @@ targets := vmlinux vmlinux.lds \
 font.o font.c head.o misc.o $(OBJS)
 
 # Make sure files are removed during clean
-extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern \
+extra-y += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern piggy.lz4 \
 	lib1funcs.S ashldi3.S $(libfdt) $(libfdt_hdrs)
 
 ifeq ($(CONFIG_FUNCTION_TRACER),y)
diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c
index 24b0475cb8bf..bd245d34952d 100644
--- a/arch/arm/boot/compressed/decompress.c
+++ b/arch/arm/boot/compressed/decompress.c
@@ -51,6 +51,10 @@ extern char * strstr(const char * s1, const char *s2);
 #include "../../../../lib/decompress_unxz.c"
 #endif
 
+#ifdef CONFIG_KERNEL_LZ4
+#include "../../../../lib/decompress_unlz4.c"
+#endif
+
 int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
 {
 	return decompress(input, len, NULL, NULL, output, NULL, error);
diff --git a/arch/arm/boot/compressed/piggy.lz4.S b/arch/arm/boot/compressed/piggy.lz4.S
new file mode 100644
index 000000000000..3d9a575618a3
--- /dev/null
+++ b/arch/arm/boot/compressed/piggy.lz4.S
@@ -0,0 +1,6 @@
+	.section .piggydata,#alloc
+	.globl	input_data
+input_data:
+	.incbin	"arch/arm/boot/compressed/piggy.lz4"
+	.globl	input_data_end
+input_data_end:
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 10062ceadd1c..0c6356255fe3 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 		mm->get_unmapped_area
= arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 1a643ee8e082..c5ef845ff89e 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -55,7 +55,8 @@ #define FLAG_NEED_X_RESET (1 << 0) struct jit_ctx { - const struct sk_filter *skf; + unsigned short prog_len; + struct sock_filter *prog_insns; unsigned idx; unsigned prologue_bytes; int ret0_fp_idx; @@ -131,8 +132,8 @@ static u16 saved_regs(struct jit_ctx *ctx) { u16 ret = 0; - if ((ctx->skf->len > 1) || - (ctx->skf->insns[0].code == BPF_S_RET_A)) + if ((ctx->prog_len > 1) || + (ctx->prog_insns[0].code == BPF_S_RET_A)) ret |= 1 << r_A; #ifdef CONFIG_FRAME_POINTER @@ -181,7 +182,7 @@ static inline bool is_load_to_a(u16 inst) static void build_prologue(struct jit_ctx *ctx) { u16 reg_set = saved_regs(ctx); - u16 first_inst = ctx->skf->insns[0].code; + u16 first_inst = ctx->prog_insns[0].code; u16 off; #ifdef CONFIG_FRAME_POINTER @@ -279,7 +280,7 @@ static u16 imm_offset(u32 k, struct jit_ctx *ctx) ctx->imms[i] = k; /* constants go just after the epilogue */ - offset = ctx->offsets[ctx->skf->len]; + offset = ctx->offsets[ctx->prog_len]; offset += ctx->prologue_bytes; offset += ctx->epilogue_bytes; offset += i * 4; @@ -419,7 +420,7 @@ static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx) emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx); } else { _emit(cond, ARM_MOV_I(ARM_R0, 0), ctx); - _emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx); + _emit(cond, ARM_B(b_imm(ctx->prog_len, ctx)), ctx); } } @@ -469,14 +470,13 @@ static inline void update_on_xread(struct jit_ctx *ctx) static int build_body(struct jit_ctx *ctx) { void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; - const struct sk_filter *prog = ctx->skf; const struct sock_filter *inst; unsigned i, load_order, off, condt; int imm12; u32 k; - for (i = 0; i < prog->len; i++) { - inst = &(prog->insns[i]); + for (i = 0; i < ctx->prog_len; i++) { + inst = &(ctx->prog_insns[i]); /* K as an immediate value operand */ k = inst->k; @@ -548,6 +548,15 @@ load_common: emit_err_ret(ARM_COND_NE, ctx); emit(ARM_MOV_R(r_A, ARM_R0), ctx); break; +#ifdef CONFIG_SECCOMP_FILTER_JIT + case BPF_S_ANC_SECCOMP_LD_W: + ctx->seen |= SEEN_CALL; + emit_mov_i(ARM_R3, (u32)seccomp_bpf_load, ctx); + emit_mov_i(ARM_R0, k, ctx); + emit_blx_r(ARM_R3, ctx); + emit(ARM_MOV_R(r_A, ARM_R0), ctx); + break; +#endif case BPF_S_LD_W_IND: load_order = 2; goto load_ind; @@ -769,8 +778,8 @@ cmp_x: ctx->ret0_fp_idx = i; emit_mov_i(ARM_R0, k, ctx); b_epilogue: - if (i != ctx->skf->len - 1) - emit(ARM_B(b_imm(prog->len, ctx)), ctx); + if (i != ctx->prog_len - 1) + emit(ARM_B(b_imm(ctx->prog_len, ctx)), ctx); break; case BPF_S_MISC_TAX: /* X = A */ @@ -858,7 +867,7 @@ b_epilogue: } -void bpf_jit_compile(struct sk_filter *fp) +static void __bpf_jit_compile(struct jit_ctx *out_ctx) { struct jit_ctx ctx; unsigned tmp_idx; @@ -867,11 +876,10 @@ void bpf_jit_compile(struct sk_filter *fp) if (!bpf_jit_enable) return; - memset(&ctx, 0, sizeof(ctx)); - ctx.skf = fp; + ctx = *out_ctx; ctx.ret0_fp_idx = -1; - ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL); + ctx.offsets = kzalloc(4 * (ctx.prog_len + 1), GFP_KERNEL); if (ctx.offsets == NULL) return; @@ -920,13 +928,26 @@ void bpf_jit_compile(struct sk_filter *fp) if (bpf_jit_enable > 1) /* there are 2 passes here */ 
bpf_jit_dump(ctx.prog_len, alloc_size, 2, ctx.target); - - fp->bpf_func = (void *)ctx.target; out: kfree(ctx.offsets); + + *out_ctx = ctx; return; } +void bpf_jit_compile(struct sk_filter *fp) +{ + struct jit_ctx ctx; + + memset(&ctx, 0, sizeof(ctx)); + ctx.prog_len = fp->len; + ctx.prog_insns = fp->insns; + + __bpf_jit_compile(&ctx); + if (ctx.target) + fp->bpf_func = (void *)ctx.target; +} + static void bpf_jit_free_worker(struct work_struct *work) { module_free(NULL, work); @@ -937,9 +958,45 @@ void bpf_jit_free(struct sk_filter *fp) struct work_struct *work; if (fp->bpf_func != sk_run_filter) { + /* + * bpf_jit_free() can be called from softirq; module_free() + * requires process context. + */ work = (struct work_struct *)fp->bpf_func; INIT_WORK(work, bpf_jit_free_worker); schedule_work(work); } } + +#ifdef CONFIG_SECCOMP_FILTER_JIT +void seccomp_jit_compile(struct seccomp_filter *fp) +{ + struct jit_ctx ctx; + + memset(&ctx, 0, sizeof(ctx)); + ctx.prog_len = seccomp_filter_get_len(fp); + ctx.prog_insns = seccomp_filter_get_insns(fp); + + __bpf_jit_compile(&ctx); + if (ctx.target) + seccomp_filter_set_bpf_func(fp, (void *)ctx.target); +} + +void seccomp_jit_free(struct seccomp_filter *fp) +{ + struct work_struct *work; + void *bpf_func = seccomp_filter_get_bpf_func(fp); + + if (bpf_func != sk_run_filter) { + /* + * seccomp_jit_free() can be called from softirq; module_free() + * requires process context. + */ + work = (struct work_struct *)bpf_func; + + INIT_WORK(work, bpf_jit_free_worker); + schedule_work(work); + } +} +#endif diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 7c7be7855638..8ed6cb1a900f 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -90,11 +90,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 7e5fe2790d8a..f1baadd56e82 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -158,11 +158,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c index 67a42ed0d2fc..cb8bdbe4972f 100644 --- a/arch/powerpc/mm/mmap_64.c +++ b/arch/powerpc/mm/mmap_64.c @@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index c427ae36374a..7ab1a6655d89 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -699,6 +699,10 @@ static void jit_free_defer(struct work_struct *arg) void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) { + /* + * bpf_jit_free() can be called from
softirq; module_free() + * requires process context. + */ struct work_struct *work = (struct work_struct *)fp->bpf_func; INIT_WORK(work, jit_free_defer); diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 5f7d7ba2874c..7a539f4f5e30 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/mount.h> +#include <linux/aio.h> #include <asm/ebcdic.h> #include "hypfs.h" diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 06bafec00278..40023290ee5b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -91,11 +91,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } @@ -176,11 +174,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = s390_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = s390_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 82f165f8078c..e199efb50f0a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -818,6 +818,10 @@ void bpf_jit_free(struct sk_filter *fp) if (fp->bpf_func == sk_run_filter) return; + /* + * bpf_jit_free() can be called from softirq; module_free() requires + * process context. + */ work = (struct work_struct *)fp->bpf_func; INIT_WORK(work, jit_free_defer); schedule_work(work); diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c index 9b40c9c12a0c..6cfc1b09ec25 100644 --- a/arch/sparc/kernel/leon_smp.c +++ b/arch/sparc/kernel/leon_smp.c @@ -253,24 +253,15 @@ void __init leon_smp_done(void) /* Free unneeded trap tables */ if (!cpu_present(1)) { - ClearPageReserved(virt_to_page(&trapbase_cpu1)); - init_page_count(virt_to_page(&trapbase_cpu1)); - free_page((unsigned long)&trapbase_cpu1); - totalram_pages++; + free_reserved_page(virt_to_page(&trapbase_cpu1)); num_physpages++; } if (!cpu_present(2)) { - ClearPageReserved(virt_to_page(&trapbase_cpu2)); - init_page_count(virt_to_page(&trapbase_cpu2)); - free_page((unsigned long)&trapbase_cpu2); - totalram_pages++; + free_reserved_page(virt_to_page(&trapbase_cpu2)); num_physpages++; } if (!cpu_present(3)) { - ClearPageReserved(virt_to_page(&trapbase_cpu3)); - init_page_count(virt_to_page(&trapbase_cpu3)); - free_page((unsigned long)&trapbase_cpu3); - totalram_pages++; + free_reserved_page(virt_to_page(&trapbase_cpu3)); num_physpages++; } /* Ok, they are spinning and ready to go. 
*/ diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 2daaaa6eda23..51561b8b15ba 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -290,7 +290,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) sysctl_legacy_va_layout) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { /* We know it's 32-bit */ unsigned long task_size = STACK_TOP32; @@ -302,7 +301,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 4490c397bb5b..af472cf7c69a 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -366,45 +366,14 @@ void __init mem_init(void) void free_initmem (void) { - unsigned long addr; - unsigned long freed; - - addr = (unsigned long)(&__init_begin); - freed = (unsigned long)(&__init_end) - addr; - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { - struct page *p; - - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - p = virt_to_page(addr); - - ClearPageReserved(p); - init_page_count(p); - __free_page(p); - totalram_pages++; - num_physpages++; - } - printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n", - freed >> 10); + num_physpages += free_initmem_default(POISON_FREE_INITMEM); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start < end) - printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", - (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - struct page *p; - - memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE); - p = virt_to_page(start); - - ClearPageReserved(p); - init_page_count(p); - __free_page(p); - totalram_pages++; - num_physpages++; - } + num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index cf72a8a5b3aa..a7171997adfd 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2059,8 +2059,7 @@ void __init mem_init(void) /* We subtract one to account for the mem_map_zero page * allocated below. 
*/ - totalram_pages -= 1; - num_physpages = totalram_pages; + num_physpages = totalram_pages - 1; /* * Set up the zero page, mark it reserved, so that page count @@ -2071,7 +2070,7 @@ void __init mem_init(void) prom_printf("paging_init: Cannot alloc zero page.\n"); prom_halt(); } - SetPageReserved(mem_map_zero); + mark_page_reserved(mem_map_zero); codepages = (((unsigned long) _etext) - ((unsigned long) _start)); codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT; @@ -2111,37 +2110,22 @@ void free_initmem(void) initend = (unsigned long)(__init_end) & PAGE_MASK; for (; addr < initend; addr += PAGE_SIZE) { unsigned long page; - struct page *p; page = (addr + ((unsigned long) __va(kern_base)) - ((unsigned long) KERNBASE)); memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - if (do_free) { - p = virt_to_page(page); - - ClearPageReserved(p); - init_page_count(p); - __free_page(p); - totalram_pages++; - } + if (do_free) + free_reserved_page(virt_to_page(page)); } } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start < end) - printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - struct page *p = virt_to_page(start); - - ClearPageReserved(p); - init_page_count(p); - __free_page(p); - totalram_pages++; - } + num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index d36a85ebb5e0..4bd140a8536c 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -817,6 +817,10 @@ static void jit_free_defer(struct work_struct *arg) void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) { + /* + * bpf_jit_free() can be called from softirq; module_free() + * requires process context. 
+ */ struct work_struct *work = (struct work_struct *)fp->bpf_func; INIT_WORK(work, jit_free_defer); diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c index f96f4cec602a..d67d91ebf63e 100644 --- a/arch/tile/mm/mmap.c +++ b/arch/tile/mm/mmap.c @@ -66,10 +66,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(mm); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a154a91c7e7..08bffb2bb962 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -65,6 +65,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO + select HAVE_KERNEL_LZ4 select HAVE_HW_BREAKPOINT select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS @@ -102,6 +103,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT select GENERIC_CMOS_UPDATE + select HAVE_ARCH_SOFT_DIRTY select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA if X86_64 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5ef205c5f37b..dcd90df10ab4 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,8 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo +targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ + vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -63,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE $(call if_changed,xzkern) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzo) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lz4) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_LZ4) := lz4 quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7cb56c6ca351..0319c88290a5 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -145,6 +145,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzo.c" #endif +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + static void scroll(void) { int i; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 805078e08013..fbeba82c703b 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -308,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index cccd07fa5e3a..b8e9224f0b45 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -17,6 +17,8 @@ extern 
unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void e820_add_region(u64 start, u64 size, int type); +extern void e820_add_limit_region(u64 start, u64 size, int type); +extern void e820_adjust_region(u64 *start, u64 *size); extern void e820_print_map(char *who); extern int sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1e672234c4ff..ebf937362479 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -207,7 +207,7 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY); + return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -271,7 +271,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd) static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY); + return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } static inline pmd_t pmd_mkhuge(pmd_t pmd) @@ -294,6 +294,26 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } +static inline int pte_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; +} + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e6423002c10b..c98ac63aae48 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -55,6 +55,18 @@ #define _PAGE_HIDDEN (_AT(pteval_t, 0)) #endif +/* + * The same hidden bit is used by kmemcheck, but since kmemcheck + * works on kernel pages while the soft-dirty engine works on user-space + * pages, they do not conflict with each other. + */ + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d32abeabbda5..0d5bb689649a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe; #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif +static u64 mem_limit = ~0ULL; /* * This function checks if any part of the range <start,end> is mapped @@ -108,7 +109,7 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) * Add a memory region to the kernel e820 map.
*/ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, - int type) + int type, bool limited) { int x = e820x->nr_map; @@ -119,6 +120,22 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, return; } + if (limited) { + if (start >= mem_limit) { + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long)start, + (unsigned long long)(start + size - 1)); + return; + } + + if (mem_limit - start < size) { + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long)mem_limit, + (unsigned long long)(start + size - 1)); + size = mem_limit - start; + } + } + e820x->map[x].addr = start; e820x->map[x].size = size; e820x->map[x].type = type; @@ -127,7 +144,37 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(&e820, start, size, type, false); +} + +/* + * do_add_efi_memmap() calls this function. + * + * Note: BOOT_SERVICES_{CODE,DATA} regions on some efi machines are marked + * as E820_RAM, and they need to be mapped. Please use e820_add_region() + * to add BOOT_SERVICES_{CODE,DATA} regions. + */ +void __init e820_add_limit_region(u64 start, u64 size, int type) +{ + /* + * efi_init() is called after finish_e820_parsing(), so we should + * check whether [start, start + size) contains addresses above + * mem_limit if the type is E820_RAM. + */ + __e820_add_region(&e820, start, size, type, type == E820_RAM); +} + +void __init e820_adjust_region(u64 *start, u64 *size) +{ + if (*start >= mem_limit) { + *size = 0; + return; + } + + if (mem_limit - *start < *size) + *size = mem_limit - *start; + + return; } static void __init e820_print_type(u32 type) @@ -455,8 +502,9 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, /* new range is totally covered? */ if (ei->addr < start && ei_end > end) { - __e820_add_region(e820x, start, size, new_type); - __e820_add_region(e820x, end, ei_end - end, ei->type); + __e820_add_region(e820x, start, size, new_type, false); + __e820_add_region(e820x, end, ei_end - end, ei->type, + false); ei->size = start - ei->addr; real_updated_size += size; continue; @@ -469,7 +517,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, continue; __e820_add_region(e820x, final_start, final_end - final_start, - new_type); + new_type, false); real_updated_size += final_end - final_start; @@ -809,7 +857,7 @@ static int userdef __initdata; /* "mem=nopentium" disables the 4MB page tables.
*/ static int __init parse_memopt(char *p) { - u64 mem_size; + char *oldp; if (!p) return -EINVAL; @@ -825,11 +873,11 @@ static int __init parse_memopt(char *p) } userdef = 1; - mem_size = memparse(p, &p); + oldp = p; + mem_limit = memparse(p, &p); /* don't remove all of memory when handling "mem={invalid}" param */ - if (mem_size == 0) + if (mem_limit == 0 || p == oldp) return -EINVAL; - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; } @@ -895,6 +943,12 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { + if (mem_limit != ~0ULL) { + userdef = 1; + e820_remove_range(mem_limit, ULLONG_MAX - mem_limit, + E820_RAM, 1); + } + if (userdef) { u32 nr = e820.nr_map; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 845df6835f9f..62c29a5bfe26 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f66b54086ce5..96598170c074 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -749,6 +749,10 @@ static void jit_free_defer(struct work_struct *arg) void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) { + /* + * bpf_jit_free() can be called from softirq; module_free() + * requires process context. + */ struct work_struct *work = (struct work_struct *)fp->bpf_func; INIT_WORK(work, jit_free_defer); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 55856b2310d3..83e5cc472752 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -419,10 +419,17 @@ static void __init do_add_efi_memmap(void) int e820_type; switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: + /* EFI_BOOT_SERVICES_{CODE,DATA} needs to be mapped */ + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + e820_add_region(start, size, e820_type); + continue; + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: case EFI_CONVENTIONAL_MEMORY: if (md->attribute & EFI_MEMORY_WB) e820_type = E820_RAM; @@ -447,7 +454,7 @@ static void __init do_add_efi_memmap(void) e820_type = E820_RESERVED; break; } - e820_add_region(start, size, e820_type); + e820_add_limit_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } @@ -555,6 +562,8 @@ void __init efi_free_boot_services(void) md->type != EFI_BOOT_SERVICES_DATA) continue; + e820_adjust_region(&start, &size); + /* Could not reserve boot area */ if (!size) continue; diff --git a/block/blk-core.c b/block/blk-core.c index 33c33bc99ddd..94aa4e75d626 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -153,7 +153,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, int error) + unsigned int nbytes, int error, + struct batch_complete *batch) { if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); @@ -167,7 +168,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* don't actually finish bio if it's part of flush sequence */ 
if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); + bio_endio_batch(bio, error, batch); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -2281,7 +2282,8 @@ EXPORT_SYMBOL(blk_fetch_request); * %false - this request doesn't have any more data * %true - this request has more data **/ -bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) +bool blk_update_request(struct request *req, int error, unsigned int nr_bytes, + struct batch_complete *batch) { int total_bytes; @@ -2337,7 +2339,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (bio_bytes == bio->bi_size) req->bio = bio->bi_next; - req_bio_endio(req, bio, bio_bytes, error); + req_bio_endio(req, bio, bio_bytes, error, batch); total_bytes += bio_bytes; nr_bytes -= bio_bytes; @@ -2390,14 +2392,15 @@ EXPORT_SYMBOL_GPL(blk_update_request); static bool blk_update_bidi_request(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes) + unsigned int bidi_bytes, + struct batch_complete *batch) { - if (blk_update_request(rq, error, nr_bytes)) + if (blk_update_request(rq, error, nr_bytes, batch)) return true; /* Bidi request must be completed as a whole */ if (unlikely(blk_bidi_rq(rq)) && - blk_update_request(rq->next_rq, error, bidi_bytes)) + blk_update_request(rq->next_rq, error, bidi_bytes, batch)) return true; if (blk_queue_add_random(rq->q)) @@ -2480,7 +2483,7 @@ static bool blk_end_bidi_request(struct request *rq, int error, struct request_queue *q = rq->q; unsigned long flags; - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, NULL)) return true; spin_lock_irqsave(q->queue_lock, flags); @@ -2506,9 +2509,11 @@ static bool blk_end_bidi_request(struct request *rq, int error, * %true - still buffers pending for this request **/ bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) + unsigned int nr_bytes, + unsigned int bidi_bytes, + struct batch_complete *batch) { - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes, batch)) return true; blk_finish_request(rq, error); @@ -2609,7 +2614,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_err); **/ bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { - return __blk_end_bidi_request(rq, error, nr_bytes, 0); + return __blk_end_bidi_request(rq, error, nr_bytes, 0, NULL); } EXPORT_SYMBOL(__blk_end_request); @@ -2621,7 +2626,8 @@ EXPORT_SYMBOL(__blk_end_request); * Description: * Completely finish @rq. Must be called with queue lock held. */ -void __blk_end_request_all(struct request *rq, int error) +void blk_end_request_all_batch(struct request *rq, int error, + struct batch_complete *batch) { bool pending; unsigned int bidi_bytes = 0; @@ -2629,10 +2635,11 @@ void __blk_end_request_all(struct request *rq, int error) if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); - pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); + pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), + bidi_bytes, batch); BUG_ON(pending); } -EXPORT_SYMBOL(__blk_end_request_all); +EXPORT_SYMBOL(blk_end_request_all_batch); /** * __blk_end_request_cur - Helper function to finish the current request chunk. 
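For driver authors, the visible change in this series is the extra batch
argument on the bio completion callback. A minimal sketch of an updated
bi_end_io handler, modeled on the conversions later in this diff (my_end_io
and its bi_private completion are illustrative names, not part of the patch):

	#include <linux/bio.h>
	#include <linux/completion.h>

	static void my_end_io(struct bio *bio, int error,
			      struct batch_complete *batch)
	{
		/*
		 * The batch pointer is threaded through by the block core
		 * for batched completion; a handler that does not
		 * participate in batching can simply ignore it.
		 */
		complete((struct completion *)bio->bi_private);
	}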
diff --git a/block/blk-flush.c b/block/blk-flush.c index cc2b827a853c..ab0ed2358947 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -316,7 +316,7 @@ void blk_insert_flush(struct request *rq) * complete the request. */ if (!policy) { - __blk_end_bidi_request(rq, 0, 0, 0); + __blk_end_bidi_request(rq, 0, 0, 0, NULL); return; } @@ -384,7 +384,8 @@ void blk_abort_flushes(struct request_queue *q) } } -static void bio_end_flush(struct bio *bio, int err) +static void bio_end_flush(struct bio *bio, int err, + struct batch_complete *batch) { if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); diff --git a/block/blk-lib.c b/block/blk-lib.c index d6f50d572565..279f9de415be 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -15,7 +15,8 @@ struct bio_batch { struct completion *wait; }; -static void bio_batch_end_io(struct bio *bio, int err) +static void bio_batch_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct bio_batch *bb = bio->bi_private; diff --git a/block/blk.h b/block/blk.h index e837b8f619b7..dc8fee6d41d6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -31,7 +31,8 @@ void blk_queue_bypass_end(struct request_queue *q); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes); + unsigned int nr_bytes, unsigned int bidi_bytes, + struct batch_complete *batch); void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); diff --git a/block/genhd.c b/block/genhd.c index 20625eed5511..5a9f8931b0e6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9a87daa6f4fb..a5ffcc988f0b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -27,6 +27,7 @@ #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/times.h> +#include <linux/uio.h> #include <asm/uaccess.h> #include <scsi/scsi.h> diff --git a/crypto/Kconfig b/crypto/Kconfig index 622d8a48cbe9..aa3a34907fd0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1359,6 +1359,22 @@ config CRYPTO_842 help This is the 842 algorithm. +config CRYPTO_LZ4 + tristate "LZ4 compression algorithm" + select CRYPTO_ALGAPI + select LZ4_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 algorithm. + +config CRYPTO_LZ4HC + tristate "LZ4HC compression algorithm" + select CRYPTO_ALGAPI + select LZ4HC_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 high compression mode algorithm. 
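Once CONFIG_CRYPTO_LZ4/CONFIG_CRYPTO_LZ4HC are enabled, the algorithms
registered by the new crypto/lz4.c and crypto/lz4hc.c files below are
reachable through the kernel's generic compression API. A hedged in-kernel
sketch (lz4_compress_buf is an illustrative name, not part of the patch;
error handling is trimmed):

	#include <linux/crypto.h>
	#include <linux/err.h>

	static int lz4_compress_buf(const u8 *src, unsigned int slen,
				    u8 *dst, unsigned int *dlen)
	{
		/* "lz4hc" can be requested the same way */
		struct crypto_comp *tfm = crypto_alloc_comp("lz4", 0, 0);
		int err;

		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		/* on success, *dlen is updated to the compressed length */
		err = crypto_comp_compress(tfm, src, slen, dst, dlen);
		crypto_free_comp(tfm);
		return err;
	}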
+ comment "Random Number Generation" config CRYPTO_ANSI_CPRNG diff --git a/crypto/Makefile b/crypto/Makefile index a8e9b0fefbe9..2ba0df2f908f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -85,6 +85,8 @@ obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o obj-$(CONFIG_CRYPTO_CRC32) += crc32.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o +obj-$(CONFIG_CRYPTO_LZ4) += lz4.o +obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o obj-$(CONFIG_CRYPTO_842) += 842.o obj-$(CONFIG_CRYPTO_RNG2) += rng.o obj-$(CONFIG_CRYPTO_RNG2) += krng.o diff --git a/crypto/lz4.c b/crypto/lz4.c new file mode 100644 index 000000000000..4586dd15b0d8 --- /dev/null +++ b/crypto/lz4.c @@ -0,0 +1,106 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2013 Chanho Min <chanho.min@lge.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/vmalloc.h> +#include <linux/lz4.h> + +struct lz4_ctx { + void *lz4_comp_mem; +}; + +static int lz4_init(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4_comp_mem = vmalloc(LZ4_MEM_COMPRESS); + if (!ctx->lz4_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4_exit(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + vfree(ctx->lz4_comp_mem); +} + +static int lz4_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4_compress(src, slen, dst, &tmp_len, ctx->lz4_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + size_t __slen = slen; + + err = lz4_decompress(src, &__slen, dst, tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4 = { + .cra_name = "lz4", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4.cra_list), + .cra_init = lz4_init, + .cra_exit = lz4_exit, + .cra_u = { .compress = { + .coa_compress = lz4_compress_crypto, + .coa_decompress = lz4_decompress_crypto } } +}; + +static int __init lz4_mod_init(void) +{ + return crypto_register_alg(&alg_lz4); +} + +static void __exit lz4_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4); +} + +module_init(lz4_mod_init); +module_exit(lz4_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Compression Algorithm"); diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c new file mode 100644 index 000000000000..151ba31d34e3 --- /dev/null +++ b/crypto/lz4hc.c @@ -0,0 +1,106 @@ +/* + * Cryptographic API. 
+ * + * Copyright (c) 2013 Chanho Min <chanho.min@lge.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/vmalloc.h> +#include <linux/lz4.h> + +struct lz4hc_ctx { + void *lz4hc_comp_mem; +}; + +static int lz4hc_init(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4hc_comp_mem = vmalloc(LZ4HC_MEM_COMPRESS); + if (!ctx->lz4hc_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4hc_exit(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + vfree(ctx->lz4hc_comp_mem); +} + +static int lz4hc_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4hc_compress(src, slen, dst, &tmp_len, ctx->lz4hc_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + size_t __slen = slen; + + err = lz4_decompress(src, &__slen, dst, tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4hc = { + .cra_name = "lz4hc", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4hc_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4hc.cra_list), + .cra_init = lz4hc_init, + .cra_exit = lz4hc_exit, + .cra_u = { .compress = { + .coa_compress = lz4hc_compress_crypto, + .coa_decompress = lz4hc_decompress_crypto } } +}; + +static int __init lz4hc_mod_init(void) +{ + return crypto_register_alg(&alg_lz4hc); +} + +static void __exit lz4hc_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4hc); +} + +module_init(lz4hc_mod_init); +module_exit(lz4hc_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC Compression Algorithm"); diff --git a/drivers/block/blockconsole.c b/drivers/block/blockconsole.c index 65f8ace6dc02..656c5a6c1f4e 100644 --- a/drivers/block/blockconsole.c +++ b/drivers/block/blockconsole.c @@ -164,7 +164,8 @@ static void bcon_advance_console_bytes(struct blockconsole *bc, int bytes) } while (cmpxchg64(&bc->console_bytes, old, new) != old); } -static void request_complete(struct bio *bio, int err) +static void request_complete(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } @@ -289,7 +290,7 @@ static void bcon_unregister(struct work_struct *work) } #define BCON_MAX_ERRORS 10 -static void bcon_end_io(struct bio *bio, int err) +static void bcon_end_io(struct bio *bio, int err, struct batch_complete *batch) { struct bcon_bio *bcon_bio = container_of(bio, struct bcon_bio, bio); struct blockconsole *bc = bio->bi_private; diff --git 
a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 64fbb8385cdc..046aa1793514 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -948,7 +948,8 @@ static void bm_aio_ctx_destroy(struct kref *kref) } /* bv_page may be a copy, or may be the original */ -static void bm_async_io_complete(struct bio *bio, int error) +static void bm_async_io_complete(struct bio *bio, int error, + struct batch_complete *batch) { struct bm_aio_ctx *ctx = bio->bi_private; struct drbd_conf *mdev = ctx->mdev; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 891c0ecaa292..04a80af8fddb 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -64,7 +64,8 @@ rwlock_t global_state_lock; /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ -void drbd_md_io_complete(struct bio *bio, int error) +void drbd_md_io_complete(struct bio *bio, int error, + struct batch_complete *batch) { struct drbd_md_io *md_io; struct drbd_conf *mdev; @@ -167,7 +168,8 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver. */ -void drbd_peer_request_endio(struct bio *bio, int error) +void drbd_peer_request_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct drbd_peer_request *peer_req = bio->bi_private; struct drbd_conf *mdev = peer_req->w.mdev; @@ -203,7 +205,8 @@ void drbd_peer_request_endio(struct bio *bio, int error) /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ -void drbd_request_endio(struct bio *bio, int error) +void drbd_request_endio(struct bio *bio, int error, + struct batch_complete *batch) { unsigned long flags; struct drbd_request *req = bio->bi_private; diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h index 328f18e4b4ee..d443dc06b854 100644 --- a/drivers/block/drbd/drbd_wrappers.h +++ b/drivers/block/drbd/drbd_wrappers.h @@ -20,9 +20,12 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) /* bi_end_io handlers */ -extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_peer_request_endio(struct bio *bio, int error); -extern void drbd_request_endio(struct bio *bio, int error); +extern void drbd_md_io_complete(struct bio *bio, int error, + struct batch_complete *batch); +extern void drbd_peer_request_endio(struct bio *bio, int error, + struct batch_complete *batch); +extern void drbd_request_endio(struct bio *bio, int error, + struct batch_complete *batch); /* * used to submit our private bio diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 83232639034e..629b6d506cd0 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3748,7 +3748,8 @@ static unsigned int floppy_check_events(struct gendisk *disk, * a disk in the drive, and whether that disk is writable. 
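Before the driver-by-driver churn continues, the shape of the change is worth stating once: every bi_end_io callback gains a struct batch_complete * argument, apparently from the block/AIO batch-completion work this tree carries, so that drivers completing many bios in one pass can defer the endio calls and run them together. A sketch of the intended pattern, reconstructed from the mtip32xx and virtio_blk hunks further down (everything except the batch helpers is a hypothetical name):

        static void my_end_io(struct bio *bio, int err,
                              struct batch_complete *batch)
        {
                /* per-bio work, exactly as before the signature change */
                complete(bio->bi_private);
        }

        static void my_complete_many(struct bio **done, unsigned int n)
        {
                struct batch_complete batch;
                unsigned int i;

                batch_complete_init(&batch);
                for (i = 0; i < n; i++)
                        bio_endio_batch(done[i], 0, &batch);    /* defer */
                batch_complete(&batch); /* run every deferred completion */
        }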
*/ -static void floppy_rb0_complete(struct bio *bio, int err) +static void floppy_rb0_complete(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 847107ef0cce..126232104173 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -151,6 +151,9 @@ static void mtip_command_cleanup(struct driver_data *dd) struct mtip_cmd *command; struct mtip_port *port = dd->port; static int in_progress; + struct batch_complete batch; + + batch_complete_init(&batch); if (in_progress) return; @@ -166,11 +169,9 @@ static void mtip_command_cleanup(struct driver_data *dd) command = &port->commands[commandindex]; if (atomic_read(&command->active) - && (command->async_callback)) { - command->async_callback(command->async_data, - -ENODEV); - command->async_callback = NULL; - command->async_data = NULL; + && (command->bio)) { + bio_endio_batch(command->bio, -ENODEV, &batch); + command->bio = NULL; } dma_unmap_sg(&port->dd->pdev->dev, @@ -178,9 +179,10 @@ static void mtip_command_cleanup(struct driver_data *dd) command->scatter_ents, command->direction); } + up(&port->cmd_slot); } - up(&port->cmd_slot); + batch_complete(&batch); set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); in_progress = 0; @@ -580,6 +582,9 @@ static void mtip_timeout_function(unsigned long int data) unsigned int bit, group; unsigned int num_command_slots; unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; + struct batch_complete batch; + + batch_complete_init(&batch); if (unlikely(!port)) return; @@ -622,11 +627,9 @@ static void mtip_timeout_function(unsigned long int data) writel(1 << bit, port->completed[group]); /* Call the async completion callback. 
*/ - if (likely(command->async_callback)) - command->async_callback(command->async_data, - -EIO); - command->async_callback = NULL; - command->comp_func = NULL; + if (likely(command->bio)) + bio_endio_batch(command->bio, -EIO, &batch); + command->bio = NULL; /* Unmap the DMA scatter list entries */ dma_unmap_sg(&port->dd->pdev->dev, @@ -645,6 +648,8 @@ static void mtip_timeout_function(unsigned long int data) } } + batch_complete(&batch); + if (cmdto_cnt) { print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { @@ -695,7 +700,8 @@ static void mtip_timeout_function(unsigned long int data) static void mtip_async_complete(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { struct mtip_cmd *command; struct driver_data *dd = data; @@ -712,11 +718,10 @@ static void mtip_async_complete(struct mtip_port *port, } /* Upper layer callback */ - if (likely(command->async_callback)) - command->async_callback(command->async_data, cb_status); + if (likely(command->bio)) + bio_endio_batch(command->bio, cb_status, batch); - command->async_callback = NULL; - command->comp_func = NULL; + command->bio = NULL; /* Unmap the DMA scatter list entries */ dma_unmap_sg(&dd->pdev->dev, @@ -752,24 +757,22 @@ static void mtip_async_complete(struct mtip_port *port, static void mtip_completion(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { - struct mtip_cmd *command = &port->commands[tag]; struct completion *waiting = data; if (unlikely(status == PORT_IRQ_TF_ERR)) dev_warn(&port->dd->pdev->dev, "Internal command %d completed with TFE\n", tag); - command->async_callback = NULL; - command->comp_func = NULL; - complete(waiting); } static void mtip_null_completion(struct mtip_port *port, int tag, void *data, - int status) + int status, + struct batch_complete *batch) { return; } @@ -798,6 +801,7 @@ static void mtip_handle_tfe(struct driver_data *dd) unsigned char *buf; char *fail_reason = NULL; int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0; + struct batch_complete batch; dev_warn(&dd->pdev->dev, "Taskfile error\n"); @@ -815,13 +819,14 @@ static void mtip_handle_tfe(struct driver_data *dd) atomic_inc(&cmd->active); /* active > 1 indicates error */ if (cmd->comp_data && cmd->comp_func) { cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, PORT_IRQ_TF_ERR); + cmd->comp_data, PORT_IRQ_TF_ERR, NULL); } goto handle_tfe_exit; } /* clear the tag accumulator */ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + batch_complete_init(&batch); /* Loop through all the groups */ for (group = 0; group < dd->slot_groups; group++) { @@ -848,7 +853,7 @@ static void mtip_handle_tfe(struct driver_data *dd) cmd->comp_func(port, tag, cmd->comp_data, - 0); + 0, &batch); } else { dev_err(&port->dd->pdev->dev, "Missing completion func for tag %d", @@ -861,6 +866,7 @@ static void mtip_handle_tfe(struct driver_data *dd) } } } + batch_complete(&batch); print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt); @@ -902,6 +908,7 @@ static void mtip_handle_tfe(struct driver_data *dd) /* clear the tag accumulator */ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + batch_complete_init(&batch); /* Loop through all the groups */ for (group = 0; group < dd->slot_groups; group++) { @@ -935,7 +942,7 @@ static void mtip_handle_tfe(struct driver_data *dd) if (cmd->comp_func) { cmd->comp_func(port, tag, cmd->comp_data, - -ENODATA); + -ENODATA, &batch); } continue; } @@ -965,13 
+972,15 @@ static void mtip_handle_tfe(struct driver_data *dd) port, tag, cmd->comp_data, - PORT_IRQ_TF_ERR); + PORT_IRQ_TF_ERR, &batch); else dev_warn(&port->dd->pdev->dev, "Bad completion for tag %d\n", tag); } } + + batch_complete(&batch); print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); handle_tfe_exit: @@ -992,6 +1001,9 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, struct driver_data *dd = port->dd; int tag, bit; struct mtip_cmd *command; + struct batch_complete batch; + + batch_complete_init(&batch); if (!completed) { WARN_ON_ONCE(!completed); @@ -1016,7 +1028,8 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, port, tag, command->comp_data, - 0); + 0, + &batch); } else { dev_warn(&dd->pdev->dev, "Null completion " @@ -1026,13 +1039,16 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, if (mtip_check_surprise_removal( dd->pdev)) { mtip_command_cleanup(dd); - return; + goto out; } } } completed >>= 1; } +out: + batch_complete(&batch); + /* If last, re-enable interrupts */ if (atomic_dec_return(&dd->irq_workers_active) == 0) writel(0xffffffff, dd->mmio + HOST_IRQ_STAT); @@ -1053,7 +1069,7 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd->comp_data, - 0); + 0, NULL); return; } } @@ -2561,8 +2577,8 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * None */ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, - int nsect, int nents, int tag, void *callback, - void *data, int dir, int unaligned) + int nsect, int nents, int tag, + struct bio *bio, int dir, int unaligned) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; @@ -2621,12 +2637,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, command->comp_func = mtip_async_complete; command->direction = dma_dir; - /* - * Set the completion function and data for the command passed - * from the upper layer. - */ - command->async_data = data; - command->async_callback = callback; + command->bio = bio; /* * To prevent this command from being issued @@ -3934,7 +3945,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) bio_sectors(bio), nents, tag, - bio_endio, bio, bio_data_dir(bio), unaligned); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 3bb8a295fbe4..7a2ddfdde9f4 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -328,11 +328,9 @@ struct mtip_cmd { void (*comp_func)(struct mtip_port *port, int tag, void *data, - int status); - /* Additional callback function that may be called by comp_func() */ - void (*async_callback)(void *data, int status); - - void *async_data; /* Addl. 
data passed to async_callback() */ + int status, + struct batch_complete *batch); + struct bio *bio; int scatter_ents; /* Number of scatter list entries used */ diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index dcb18a3d3314..ce42c14808ac 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -980,7 +980,8 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec) } } -static void pkt_end_io_read(struct bio *bio, int err) +static void pkt_end_io_read(struct bio *bio, int err, + struct batch_complete *batch) { struct packet_data *pkt = bio->bi_private; struct pktcdvd_device *pd = pkt->pd; @@ -998,7 +999,8 @@ static void pkt_end_io_read(struct bio *bio, int err) pkt_bio_finished(pd); } -static void pkt_end_io_packet_write(struct bio *bio, int err) +static void pkt_end_io_packet_write(struct bio *bio, int err, + struct batch_complete *batch) { struct packet_data *pkt = bio->bi_private; struct pktcdvd_device *pd = pkt->pd; @@ -2339,7 +2341,8 @@ static int pkt_close(struct gendisk *disk, fmode_t mode) } -static void pkt_end_io_read_cloned(struct bio *bio, int err) +static void pkt_end_io_read_cloned(struct bio *bio, int err, + struct batch_complete *batch) { struct packet_stacked_data *psd = bio->bi_private; struct pktcdvd_device *pd = psd->pd; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 758f2ac878cf..deb722d63d68 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -775,7 +775,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) if (intr & ERROR_INTR) { n = fs->scount - 1 - resid / 512; if (n > 0) { - blk_update_request(req, 0, n << 9); + blk_update_request(req, 0, n << 9, NULL); fs->req_sector += n; } if (fs->retries < 5) { diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 64723953e1c9..49d0ec2472c5 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -217,7 +217,8 @@ static void virtblk_bio_send_flush_work(struct work_struct *work) virtblk_bio_send_flush(vbr); } -static inline void virtblk_request_done(struct virtblk_req *vbr) +static inline void virtblk_request_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; struct request *req = vbr->req; @@ -231,11 +232,12 @@ static inline void virtblk_request_done(struct virtblk_req *vbr) req->errors = (error != 0); } - __blk_end_request_all(req, error); + blk_end_request_all_batch(req, error, batch); mempool_free(vbr, vblk->pool); } -static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) +static inline void virtblk_bio_flush_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; @@ -244,12 +246,13 @@ static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) INIT_WORK(&vbr->work, virtblk_bio_send_data_work); queue_work(virtblk_wq, &vbr->work); } else { - bio_endio(vbr->bio, virtblk_result(vbr)); + bio_endio_batch(vbr->bio, virtblk_result(vbr), batch); mempool_free(vbr, vblk->pool); } } -static inline void virtblk_bio_data_done(struct virtblk_req *vbr) +static inline void virtblk_bio_data_done(struct virtblk_req *vbr, + struct batch_complete *batch) { struct virtio_blk *vblk = vbr->vblk; @@ -259,17 +262,18 @@ static inline void virtblk_bio_data_done(struct virtblk_req *vbr) INIT_WORK(&vbr->work, virtblk_bio_send_flush_work); queue_work(virtblk_wq, &vbr->work); } else { - bio_endio(vbr->bio, virtblk_result(vbr)); + bio_endio_batch(vbr->bio, virtblk_result(vbr), batch); mempool_free(vbr, 
vblk->pool); } } -static inline void virtblk_bio_done(struct virtblk_req *vbr) +static inline void virtblk_bio_done(struct virtblk_req *vbr, + struct batch_complete *batch) { if (unlikely(vbr->flags & VBLK_IS_FLUSH)) - virtblk_bio_flush_done(vbr); + virtblk_bio_flush_done(vbr, batch); else - virtblk_bio_data_done(vbr); + virtblk_bio_data_done(vbr, batch); } static void virtblk_done(struct virtqueue *vq) @@ -279,16 +283,19 @@ static void virtblk_done(struct virtqueue *vq) struct virtblk_req *vbr; unsigned long flags; unsigned int len; + struct batch_complete batch; + + batch_complete_init(&batch); spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); do { virtqueue_disable_cb(vq); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { if (vbr->bio) { - virtblk_bio_done(vbr); + virtblk_bio_done(vbr, &batch); bio_done = true; } else { - virtblk_request_done(vbr); + virtblk_request_done(vbr, &batch); req_done = true; } } @@ -298,6 +305,8 @@ static void virtblk_done(struct virtqueue *vq) blk_start_queue(vblk->disk->queue); spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); + batch_complete(&batch); + if (bio_done) wake_up(&vblk->queue_wait); } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index dd5b2fed97e9..990c1d81849f 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -741,7 +741,8 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) /* * bio callback. */ -static void end_block_io_op(struct bio *bio, int error) +static void end_block_io_op(struct bio *bio, int error, + struct batch_complete *batch) { __end_block_io_op(bio->bi_private, error); bio_put(bio); diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 2c644afbcdd4..1ccbe9482faa 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -28,6 +28,7 @@ #include <linux/pfn.h> #include <linux/export.h> #include <linux/io.h> +#include <linux/aio.h> #include <asm/uaccess.h> @@ -627,6 +628,18 @@ static ssize_t write_null(struct file *file, const char __user *buf, return count; } +static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return 0; +} + +static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return iov_length(iov, nr_segs); +} + static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) { @@ -670,6 +683,24 @@ static ssize_t read_zero(struct file *file, char __user *buf, return written ? written : -EFAULT; } +static ssize_t aio_read_zero(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + size_t written = 0; + unsigned long i; + ssize_t ret; + + for (i = 0; i < nr_segs; i++) { + ret = read_zero(iocb->ki_filp, iov[i].iov_base, iov[i].iov_len, + &pos); + if (ret < 0) + break; + written += ret; + } + + return written ? 
written : -EFAULT; +} + static int mmap_zero(struct file *file, struct vm_area_struct *vma) { #ifndef CONFIG_MMU @@ -738,6 +769,7 @@ static int open_port(struct inode *inode, struct file *filp) #define full_lseek null_lseek #define write_zero write_null #define read_full read_zero +#define aio_write_zero aio_write_null #define open_mem open_port #define open_kmem open_mem #define open_oldmem open_mem @@ -766,6 +798,8 @@ static const struct file_operations null_fops = { .llseek = null_lseek, .read = read_null, .write = write_null, + .aio_read = aio_read_null, + .aio_write = aio_write_null, .splice_write = splice_write_null, }; @@ -782,6 +816,8 @@ static const struct file_operations zero_fops = { .llseek = zero_lseek, .read = read_zero, .write = write_zero, + .aio_read = aio_read_zero, + .aio_write = aio_write_zero, .mmap = mmap_zero, }; diff --git a/drivers/char/mwave/tp3780i.c b/drivers/char/mwave/tp3780i.c index c68969708068..04e6d6a27994 100644 --- a/drivers/char/mwave/tp3780i.c +++ b/drivers/char/mwave/tp3780i.c @@ -479,6 +479,7 @@ int tp3780I_QueryAbilities(THINKPAD_BD_DATA * pBDData, MW_ABILITIES * pAbilities PRINTK_2(TRACE_TP3780I, "tp3780i::tp3780I_QueryAbilities entry pBDData %p\n", pBDData); + memset(pAbilities, 0, sizeof(*pAbilities)); /* fill out standard constant fields */ pAbilities->instr_per_sec = pBDData->rDspSettings.uIps; pAbilities->data_size = pBDData->rDspSettings.uDStoreSize; diff --git a/drivers/char/random.c b/drivers/char/random.c index cd9a6211dcad..35487e8ded59 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -865,16 +865,24 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min, if (r->entropy_count / 8 < min + reserved) { nbytes = 0; } else { + int entropy_count, orig; +retry: + entropy_count = orig = ACCESS_ONCE(r->entropy_count); /* If limited, never pull more than available */ - if (r->limit && nbytes + reserved >= r->entropy_count / 8) - nbytes = r->entropy_count/8 - reserved; - - if (r->entropy_count / 8 >= nbytes + reserved) - r->entropy_count -= nbytes*8; - else - r->entropy_count = reserved; + if (r->limit && nbytes + reserved >= entropy_count / 8) + nbytes = entropy_count/8 - reserved; + + if (entropy_count / 8 >= nbytes + reserved) { + entropy_count -= nbytes*8; + if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig) + goto retry; + } else { + entropy_count = reserved; + if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig) + goto retry; + } - if (r->entropy_count < random_write_wakeup_thresh) + if (entropy_count < random_write_wakeup_thresh) wakeup_write = 1; } @@ -957,10 +965,23 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, { ssize_t ret = 0, i; __u8 tmp[EXTRACT_SIZE]; + unsigned long flags; /* if last_data isn't primed, we need EXTRACT_SIZE extra bytes */ - if (fips_enabled && !r->last_data_init) - nbytes += EXTRACT_SIZE; + if (fips_enabled) { + spin_lock_irqsave(&r->lock, flags); + if (!r->last_data_init) { + r->last_data_init = true; + spin_unlock_irqrestore(&r->lock, flags); + trace_extract_entropy(r->name, EXTRACT_SIZE, + r->entropy_count, _RET_IP_); + xfer_secondary_pool(r, EXTRACT_SIZE); + extract_buf(r, tmp); + spin_lock_irqsave(&r->lock, flags); + memcpy(r->last_data, tmp, EXTRACT_SIZE); + } + spin_unlock_irqrestore(&r->lock, flags); + } trace_extract_entropy(r->name, nbytes, r->entropy_count, _RET_IP_); xfer_secondary_pool(r, nbytes); @@ -970,19 +991,6 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, extract_buf(r, tmp); if (fips_enabled) { - 
unsigned long flags; - - - /* prime last_data value if need be, per fips 140-2 */ - if (!r->last_data_init) { - spin_lock_irqsave(&r->lock, flags); - memcpy(r->last_data, tmp, EXTRACT_SIZE); - r->last_data_init = true; - nbytes -= EXTRACT_SIZE; - spin_unlock_irqrestore(&r->lock, flags); - extract_buf(r, tmp); - } - spin_lock_irqsave(&r->lock, flags); if (!memcmp(tmp, r->last_data, EXTRACT_SIZE)) panic("Hardware RNG duplicated output!\n"); diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index b78cbe74dadf..442a15443fa5 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -399,6 +399,14 @@ static void drm_fb_helper_dpms(struct fb_info *info, int dpms_mode) return; /* + * fbdev->blank can be called from irq context in case of a panic. + * Since we already have our own special panic handler which will + * restore the fbdev console mode completely, just bail out early. + */ + if (oops_in_progress) + return; + + /* * For each CRTC in this fb, turn the connectors on/off. */ drm_modeset_lock_all(dev); diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index aed8afee56da..6d7f453b4d05 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -40,6 +40,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/io.h> +#include <linux/aio.h> #include <linux/jiffies.h> #include <linux/cpu.h> #include <asm/pgtable.h> diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 4f7aa301b3b1..b56c9428f3c5 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -39,7 +39,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/io.h> -#include <linux/uio.h> +#include <linux/aio.h> #include <linux/jiffies.h> #include <asm/pgtable.h> #include <linux/delay.h> diff --git a/drivers/leds/leds-ot200.c b/drivers/leds/leds-ot200.c index ee14662ed5ce..98cae529373f 100644 --- a/drivers/leds/leds-ot200.c +++ b/drivers/leds/leds-ot200.c @@ -47,37 +47,37 @@ static struct ot200_led leds[] = { { .name = "led_1", .port = 0x49, - .mask = BIT(7), + .mask = BIT(6), }, { .name = "led_2", .port = 0x49, - .mask = BIT(6), + .mask = BIT(5), }, { .name = "led_3", .port = 0x49, - .mask = BIT(5), + .mask = BIT(4), }, { .name = "led_4", .port = 0x49, - .mask = BIT(4), + .mask = BIT(3), }, { .name = "led_5", .port = 0x49, - .mask = BIT(3), + .mask = BIT(2), }, { .name = "led_6", .port = 0x49, - .mask = BIT(2), + .mask = BIT(1), }, { .name = "led_7", .port = 0x49, - .mask = BIT(1), + .mask = BIT(0), } }; diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 048f2947e08b..1f75edd7f329 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -156,7 +156,8 @@ static void discard_finish(struct work_struct *w) closure_put(&ca->set->cl); } -static void discard_endio(struct bio *bio, int error) +static void discard_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct discard *d = container_of(bio, struct discard, bio); schedule_work(&d->work); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e62..36688d6bb061 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -134,7 +134,8 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -static void btree_bio_endio(struct bio *bio, int error) +static void 
btree_bio_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; struct btree *b = container_of(cl, struct btree, io.cl); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 89fd5204924e..3a32b063040b 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -177,7 +177,8 @@ void bch_btree_verify(struct btree *b, struct bset *new) mutex_unlock(&b->c->verify_lock); } -static void data_verify_endio(struct bio *bio, int error) +static void data_verify_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; closure_put(cl); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4dea645..29f344b9e408 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -9,7 +9,8 @@ #include "bset.h" #include "debug.h" -static void bch_bi_idx_hack_endio(struct bio *bio, int error) +static void bch_bi_idx_hack_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct bio *p = bio->bi_private; @@ -206,7 +207,8 @@ static void bch_bio_submit_split_done(struct closure *cl) mempool_free(s, s->p->bio_split_hook); } -static void bch_bio_submit_split_endio(struct bio *bio, int error) +static void bch_bio_submit_split_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdcd9d4c..bff194bb3e08 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -22,7 +22,8 @@ * bit. */ -static void journal_read_endio(struct bio *bio, int error) +static void journal_read_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; closure_put(cl); @@ -390,7 +391,8 @@ found: #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) -static void journal_discard_endio(struct bio *bio, int error) +static void journal_discard_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct journal_device *ja = container_of(bio, struct journal_device, discard_bio); @@ -535,7 +537,8 @@ void bch_journal_next(struct journal *j) pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); } -static void journal_write_endio(struct bio *bio, int error) +static void journal_write_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct journal_write *w = bio->bi_private; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 8589512c972e..8bf7ae12d4b0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -61,7 +61,8 @@ static void write_moving_finish(struct closure *cl) closure_return_with_destructor(cl, moving_io_destructor); } -static void read_moving_endio(struct bio *bio, int error) +static void read_moving_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct moving_io *io = container_of(bio->bi_private, struct moving_io, s.cl); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..bc837edd52ea 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -456,7 +456,8 @@ static void bch_insert_data_error(struct closure *cl) bch_journal(cl); } -static void bch_insert_data_endio(struct bio *bio, int error) +static void bch_insert_data_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; struct 
btree_op *op = container_of(cl, struct btree_op, cl); @@ -621,7 +622,8 @@ void bch_btree_insert_async(struct closure *cl) /* Common code for the make_request functions */ -static void request_endio(struct bio *bio, int error) +static void request_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct closure *cl = bio->bi_private; @@ -636,7 +638,8 @@ static void request_endio(struct bio *bio, int error) closure_put(cl); } -void bch_cache_read_endio(struct bio *bio, int error) +void bch_cache_read_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct bbio *b = container_of(bio, struct bbio, bio); struct closure *cl = bio->bi_private; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 254d9ab5707c..3b794625c4c1 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -29,11 +29,10 @@ struct search { struct btree_op op; }; -void bch_cache_read_endio(struct bio *, int); +void bch_cache_read_endio(struct bio *, int, struct batch_complete *batch); int bch_get_congested(struct cache_set *); void bch_insert_data(struct closure *cl); void bch_btree_insert_async(struct closure *); -void bch_cache_read_endio(struct bio *, int); void bch_open_buckets_free(struct cache_set *); int bch_open_buckets_alloc(struct cache_set *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c8046bc4aa57..76c7f6cc5c56 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -224,7 +224,8 @@ err: return err; } -static void write_bdev_super_endio(struct bio *bio, int error) +static void write_bdev_super_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct cached_dev *dc = bio->bi_private; /* XXX: error checking */ @@ -285,7 +286,8 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_return(cl); } -static void write_super_endio(struct bio *bio, int error) +static void write_super_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct cache *ca = bio->bi_private; @@ -326,7 +328,7 @@ void bcache_write_super(struct cache_set *c) /* UUID io */ -static void uuid_endio(struct bio *bio, int error) +static void uuid_endio(struct bio *bio, int error, struct batch_complete *batch) { struct closure *cl = bio->bi_private; struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); @@ -490,7 +492,8 @@ static struct uuid_entry *uuid_find_empty(struct cache_set *c) * disk. 
*/ -static void prio_endio(struct bio *bio, int error) +static void prio_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct cache *ca = bio->bi_private; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 93e7e31a4bd3..daf9347833be 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -253,7 +253,8 @@ static void write_dirty_finish(struct closure *cl) closure_return_with_destructor(cl, dirty_io_destructor); } -static void dirty_endio(struct bio *bio, int error) +static void dirty_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; @@ -281,7 +282,8 @@ static void write_dirty(struct closure *cl) continue_at(cl, write_dirty_finish, dirty_wq); } -static void read_dirty_endio(struct bio *bio, int error) +static void read_dirty_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; @@ -289,7 +291,7 @@ static void read_dirty_endio(struct bio *bio, int error) bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), error, "reading dirty data from cache"); - dirty_endio(bio, error); + dirty_endio(bio, error, NULL); } static void read_dirty_submit(struct closure *cl) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 0387e05cdb98..d489dfd306e6 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -494,7 +494,7 @@ static void dmio_complete(unsigned long error, void *context) { struct dm_buffer *b = context; - b->bio.bi_end_io(&b->bio, error ? -EIO : 0); + b->bio.bi_end_io(&b->bio, error ? -EIO : 0, NULL); } static void use_dmio(struct dm_buffer *b, int rw, sector_t block, @@ -525,7 +525,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, r = dm_io(&io_req, 1, ®ion, NULL); if (r) - end_io(&b->bio, r); + end_io(&b->bio, r, NULL); } static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, @@ -592,7 +592,8 @@ static void submit_io(struct dm_buffer *b, int rw, sector_t block, * Set the error, clear B_WRITING bit and wake anyone who was waiting on * it. */ -static void write_endio(struct bio *bio, int error) +static void write_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); @@ -965,7 +966,7 @@ found_buffer: * The endio routine for reading: set the error, clear the bit and wake up * anyone waiting on the buffer. 
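One convention recurs in the bcache writeback and dm-bufio conversions just above: callers with nothing to batch pass NULL, which evidently means "complete immediately", as in the dmio_complete() hunk:

        /* no batch available: the endio call happens right away */
        b->bio.bi_end_io(&b->bio, error ? -EIO : 0, NULL);

so handlers written against the new signature should treat the batch argument as optional and never dereference it themselves; read_dirty_endio() chaining to dirty_endio(bio, error, NULL) follows the same rule.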
*/ -static void read_endio(struct bio *bio, int error) +static void read_endio(struct bio *bio, int error, struct batch_complete *batch) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index df44b60e66f2..53fb7b25d7eb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -653,7 +653,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) wake_worker(cache); } -static void writethrough_endio(struct bio *bio, int err) +static void writethrough_endio(struct bio *bio, int err, + struct batch_complete *batch) { struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); bio->bi_end_io = pb->saved_bi_end_io; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6d2d41ae9e32..ec0e3c0883b5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -929,7 +929,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * The work is done per CPU global for all dm-crypt instances. * They should not depend on each other and do not block. */ -static void crypt_endio(struct bio *clone, int error) +static void crypt_endio(struct bio *clone, int error, + struct batch_complete *batch) { struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->cc; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea49834377c8..a727b267f86d 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -136,7 +136,7 @@ static void dec_count(struct io *io, unsigned int region, int error) } } -static void endio(struct bio *bio, int error) +static void endio(struct bio *bio, int error, struct batch_complete *batch) { struct io *io; unsigned region; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c434e5aab2df..fb3ea3c63fd1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1486,7 +1486,8 @@ static void start_copy(struct dm_snap_pending_exception *pe) dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } -static void full_bio_end_io(struct bio *bio, int error) +static void full_bio_end_io(struct bio *bio, int error, + struct batch_complete *batch) { void *callback_data = bio->bi_private; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 759cffc45cab..0390a0370503 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -553,7 +553,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) spin_unlock_irqrestore(&pool->lock, flags); } -static void overwrite_endio(struct bio *bio, int err) +static void overwrite_endio(struct bio *bio, int err, + struct batch_complete *batch) { unsigned long flags; struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index b948fd864d45..b373bb7d1c2d 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -413,7 +413,8 @@ static void verity_work(struct work_struct *w) verity_finish_io(io, verity_verify_io(io)); } -static void verity_end_io(struct bio *bio, int error) +static void verity_end_io(struct bio *bio, int error, + struct batch_complete *batch) { struct dm_verity_io *io = bio->bi_private; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9a0bdad9ad8f..0baa5e75683e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -617,7 +617,8 @@ static void dec_pending(struct dm_io *io, int error) } } -static void clone_endio(struct bio *bio, int error) +static void clone_endio(struct bio *bio, int error, + struct 
batch_complete *batch) { int r = 0; struct dm_target_io *tio = bio->bi_private; @@ -652,7 +653,8 @@ static void clone_endio(struct bio *bio, int error) /* * Partial completion handling for request-based dm */ -static void end_clone_bio(struct bio *clone, int error) +static void end_clone_bio(struct bio *clone, int error, + struct batch_complete *batch) { struct dm_rq_clone_bio_info *info = clone->bi_private; struct dm_rq_target_io *tio = info->tio; @@ -696,7 +698,7 @@ static void end_clone_bio(struct bio *clone, int error) * Do not use blk_end_request() here, because it may complete * the original request before the clone, and break the ordering. */ - blk_update_request(tio->orig, 0, nr_bytes); + blk_update_request(tio->orig, 0, nr_bytes, NULL); } /* diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 3193aefe982b..ac8af5253fb6 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -70,7 +70,8 @@ #include <linux/seq_file.h> -static void faulty_fail(struct bio *bio, int error) +static void faulty_fail(struct bio *bio, int error, + struct batch_complete *batch) { struct bio *b = bio->bi_private; diff --git a/drivers/md/md.c b/drivers/md/md.c index 9a11e3e49a9b..e2fc596504ec 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -379,7 +379,8 @@ EXPORT_SYMBOL(mddev_congested); * Generic flush handling for md */ -static void md_end_flush(struct bio *bio, int err) +static void md_end_flush(struct bio *bio, int err, + struct batch_complete *batch) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; @@ -756,7 +757,8 @@ void md_rdev_clear(struct md_rdev *rdev) } EXPORT_SYMBOL_GPL(md_rdev_clear); -static void super_written(struct bio *bio, int error) +static void super_written(struct bio *bio, int error, + struct batch_complete *batch) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; @@ -807,7 +809,8 @@ void md_super_wait(struct mddev *mddev) finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) +static void bi_complete(struct bio *bio, int error, + struct batch_complete *batch) { complete((struct completion*)bio->bi_private); } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 1642eae75a33..fecad70f53f6 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -83,7 +83,8 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) mempool_free(mp_bh, conf->pool); } -static void multipath_end_request(struct bio *bio, int error) +static void multipath_end_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct multipath_bh *mp_bh = bio->bi_private; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 55951182af73..d55d0d9ab360 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -294,7 +294,8 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) return mirror; } -static void raid1_end_read_request(struct bio *bio, int error) +static void raid1_end_read_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r1bio *r1_bio = bio->bi_private; @@ -379,7 +380,8 @@ static void r1_bio_write_done(struct r1bio *r1_bio) } } -static void raid1_end_write_request(struct bio *bio, int error) +static void raid1_end_write_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r1bio *r1_bio = bio->bi_private; @@ -1612,7 
+1614,8 @@ abort: } -static void end_sync_read(struct bio *bio, int error) +static void end_sync_read(struct bio *bio, int error, + struct batch_complete *batch) { struct r1bio *r1_bio = bio->bi_private; @@ -1630,7 +1633,8 @@ static void end_sync_read(struct bio *bio, int error) reschedule_retry(r1_bio); } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_write(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r1bio *r1_bio = bio->bi_private; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 59d4daa5f4c7..6c634061ec71 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -101,7 +101,8 @@ static int enough(struct r10conf *conf, int ignore); static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped); static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); -static void end_reshape_write(struct bio *bio, int error); +static void end_reshape_write(struct bio *bio, int error, + struct batch_complete *batch); static void end_reshape(struct r10conf *conf); static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) @@ -358,7 +359,8 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, return r10_bio->devs[slot].devnum; } -static void raid10_end_read_request(struct bio *bio, int error) +static void raid10_end_read_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; @@ -441,7 +443,8 @@ static void one_write_done(struct r10bio *r10_bio) } } -static void raid10_end_write_request(struct bio *bio, int error) +static void raid10_end_write_request(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; @@ -1912,7 +1915,8 @@ abort: } -static void end_sync_read(struct bio *bio, int error) +static void end_sync_read(struct bio *bio, int error, + struct batch_complete *batch) { struct r10bio *r10_bio = bio->bi_private; struct r10conf *conf = r10_bio->mddev->private; @@ -1973,7 +1977,8 @@ static void end_sync_request(struct r10bio *r10_bio) } } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_write(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; @@ -4598,7 +4603,8 @@ static int handle_reshape_read_error(struct mddev *mddev, return 0; } -static void end_reshape_write(struct bio *bio, int error) +static void end_reshape_write(struct bio *bio, int error, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r10bio *r10_bio = bio->bi_private; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0e8c9cedf7f0..f745a1b86e1d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -593,9 +593,11 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) } static void -raid5_end_read_request(struct bio *bi, int error); +raid5_end_read_request(struct bio *bi, int error, + struct batch_complete *batch); static void -raid5_end_write_request(struct bio *bi, int error); +raid5_end_write_request(struct bio *bi, int error, + struct batch_complete *batch); static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { @@ -1774,7 +1776,8 @@ static void shrink_stripes(struct r5conf *conf) conf->slab_cache = 
NULL; } -static void raid5_end_read_request(struct bio * bi, int error) +static void raid5_end_read_request(struct bio *bi, int error, + struct batch_complete *batch) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; @@ -1894,7 +1897,8 @@ static void raid5_end_read_request(struct bio * bi, int error) release_stripe(sh); } -static void raid5_end_write_request(struct bio *bi, int error) +static void raid5_end_write_request(struct bio *bi, int error, + struct batch_complete *batch) { struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; @@ -3972,7 +3976,8 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * first). * If the read failed.. */ -static void raid5_align_endio(struct bio *bi, int error) +static void raid5_align_endio(struct bio *bi, int error, + struct batch_complete *batch) { struct bio* raid_bi = bi->bi_private; struct mddev *mddev; diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index 40649a8bf390..6b0dc131b20e 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -4085,7 +4085,7 @@ static int cnic_cm_alloc_mem(struct cnic_dev *dev) if (!cp->csk_tbl) return -ENOMEM; - port_id = random32(); + port_id = prandom_u32(); port_id %= CNIC_LOCAL_PORT_RANGE; if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE, CNIC_LOCAL_PORT_MIN, port_id)) { @@ -4145,7 +4145,7 @@ static int cnic_cm_init_bnx2_hw(struct cnic_dev *dev) { u32 seed; - seed = random32(); + seed = prandom_u32(); cnic_ctx_wr(dev, 45, 0, seed); return 0; } diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 49b8b58fc5c6..484f77ec2ce1 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -449,7 +449,7 @@ static int transmit(struct baycom_state *bc, int cnt, unsigned char stat) if ((--bc->hdlctx.slotcnt) > 0) return 0; bc->hdlctx.slotcnt = bc->ch_params.slottime; - if ((random32() % 256) > bc->ch_params.ppersist) + if ((prandom_u32() % 256) > bc->ch_params.ppersist) return 0; } } diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index a4a3516b6bbf..3169252613fa 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -389,7 +389,7 @@ void hdlcdrv_arbitrate(struct net_device *dev, struct hdlcdrv_state *s) if ((--s->hdlctx.slotcnt) > 0) return; s->hdlctx.slotcnt = s->ch_params.slottime; - if ((random32() % 256) > s->ch_params.ppersist) + if ((prandom_u32() % 256) > s->ch_params.ppersist) return; start_tx(dev, s); } diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index b2d863f2ea42..0721e72f9299 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -638,7 +638,7 @@ static void yam_arbitrate(struct net_device *dev) yp->slotcnt = yp->slot / 10; /* is random > persist ? 
*/ - if ((random32() % 256) > yp->pers) + if ((prandom_u32() % 256) > yp->pers) return; yam_start_tx(dev, yp); diff --git a/drivers/net/team/team_mode_random.c b/drivers/net/team/team_mode_random.c index 9eabfaa22f3e..5ca14d463ba7 100644 --- a/drivers/net/team/team_mode_random.c +++ b/drivers/net/team/team_mode_random.c @@ -18,7 +18,7 @@ static u32 random_N(unsigned int N) { - return reciprocal_divide(random32(), N); + return reciprocal_divide(prandom_u32(), N); } static bool rnd_transmit(struct team *team, struct sk_buff *skb) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c index 2b90da0d85f3..e7a1a4770996 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c @@ -1117,7 +1117,7 @@ static void brcmf_p2p_afx_handler(struct work_struct *work) if (afx_hdl->is_listen && afx_hdl->my_listen_chan) /* 100ms ~ 300ms */ err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan, - 100 * (1 + (random32() % 3))); + 100 * (1 + prandom_u32() % 3)); else err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan); diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c index a0cb0770d319..d3c8ece980d8 100644 --- a/drivers/net/wireless/mwifiex/cfg80211.c +++ b/drivers/net/wireless/mwifiex/cfg80211.c @@ -216,7 +216,7 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, mwifiex_form_mgmt_frame(skb, buf, len); mwifiex_queue_tx_pkt(priv, skb); - *cookie = random32() | 1; + *cookie = prandom_u32() | 1; cfg80211_mgmt_tx_status(wdev, *cookie, buf, len, true, GFP_ATOMIC); wiphy_dbg(wiphy, "info: management frame transmitted\n"); @@ -271,7 +271,7 @@ mwifiex_cfg80211_remain_on_channel(struct wiphy *wiphy, duration); if (!ret) { - *cookie = random32() | 1; + *cookie = prandom_u32() | 1; priv->roc_cfg.cookie = *cookie; priv->roc_cfg.chan = *chan; diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig index 6194d35ebb97..8b7f92d5e14c 100644 --- a/drivers/rapidio/Kconfig +++ b/drivers/rapidio/Kconfig @@ -47,4 +47,32 @@ config RAPIDIO_DEBUG If you are unsure about this, say N here. +choice + prompt "Enumeration method" + depends on RAPIDIO + help + There are different enumeration and discovery mechanisms offered + for the RapidIO subsystem. You may select a single built-in method or + any number of methods to be built as modules. + Selecting a built-in method disables use of loadable methods. + + If unsure, select Basic built-in. + +config RAPIDIO_ENUM_BASIC + tristate "Basic" + help + This option includes a basic RapidIO fabric enumeration and discovery + mechanism similar to the one described in the RapidIO specification, + Annex 1. + +endchoice + +config RAPIDIO_ENUM_AUTO + bool "Automatic RapidIO enumeration/discovery start" + depends on RAPIDIO && RAPIDIO_ENUM_BASIC = y + help + Say Y if you want the RapidIO subsystem to start fabric enumeration and + discovery automatically during kernel initialization. + If you are unsure, say N here. You will be able to start RapidIO + fabric enumeration or discovery later by a user request.
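The machinery these options select is visible later in this diff: the basic method in rio-scan.c now publishes its entry points through a struct rio_scan and registers them with the core, and rio-sysfs.c grows a bus-level scan attribute so enumeration can also be started on user request. As a sketch of what any alternative method would do, using only names that appear in the hunks below (my_enum_mport and my_disc_mport are hypothetical):

        static struct rio_scan my_scan_ops = {
                .enumerate = my_enum_mport,     /* used when this host owns the mport */
                .discover  = my_disc_mport,
        };

        static int __init my_method_init(void)
        {
                /* bind to every available mport, as rio_basic_attach() does */
                return rio_register_scan(RIO_MPORT_ANY, &my_scan_ops);
        }

From userspace, writing an mport id to /sys/bus/rapidio/scan then triggers enumeration or discovery for that port, and writing -1 (assuming that is RIO_MPORT_ANY, as the range check in bus_scan_store() suggests) reruns rio_init_mports() for everything.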
+ source "drivers/rapidio/switches/Kconfig" diff --git a/drivers/rapidio/Makefile b/drivers/rapidio/Makefile index ec3fb8121004..3036702ffe8b 100644 --- a/drivers/rapidio/Makefile +++ b/drivers/rapidio/Makefile @@ -1,7 +1,8 @@ # # Makefile for RapidIO interconnect services # -obj-y += rio.o rio-access.o rio-driver.o rio-scan.o rio-sysfs.o +obj-y += rio.o rio-access.o rio-driver.o rio-sysfs.o +obj-$(CONFIG_RAPIDIO_ENUM_BASIC) += rio-scan.o obj-$(CONFIG_RAPIDIO) += switches/ obj-$(CONFIG_RAPIDIO) += devices/ diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index 0f4a53bdaa3c..a0c875563d76 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -164,6 +164,13 @@ void rio_unregister_driver(struct rio_driver *rdrv) driver_unregister(&rdrv->driver); } +void rio_attach_device(struct rio_dev *rdev) +{ + rdev->dev.bus = &rio_bus_type; + rdev->dev.parent = &rio_bus; +} +EXPORT_SYMBOL_GPL(rio_attach_device); + /** * rio_match_bus - Tell if a RIO device structure has a matching RIO driver device id structure * @dev: the standard device structure to match against @@ -200,6 +207,7 @@ struct bus_type rio_bus_type = { .name = "rapidio", .match = rio_match_bus, .dev_attrs = rio_dev_attrs, + .bus_attrs = rio_bus_attrs, .probe = rio_device_probe, .remove = rio_device_remove, }; diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index a965acd3c0e4..3e371b1d0aab 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -37,12 +37,8 @@ #include "rio.h" -LIST_HEAD(rio_devices); - static void rio_init_em(struct rio_dev *rdev); -DEFINE_SPINLOCK(rio_global_list_lock); - static int next_destid = 0; static int next_comptag = 1; @@ -327,127 +323,6 @@ static int rio_is_switch(struct rio_dev *rdev) } /** - * rio_switch_init - Sets switch operations for a particular vendor switch - * @rdev: RIO device - * @do_enum: Enumeration/Discovery mode flag - * - * Searches the RIO switch ops table for known switch types. If the vid - * and did match a switch table entry, then call switch initialization - * routine to setup switch-specific routines. - */ -static void rio_switch_init(struct rio_dev *rdev, int do_enum) -{ - struct rio_switch_ops *cur = __start_rio_switch_ops; - struct rio_switch_ops *end = __end_rio_switch_ops; - - while (cur < end) { - if ((cur->vid == rdev->vid) && (cur->did == rdev->did)) { - pr_debug("RIO: calling init routine for %s\n", - rio_name(rdev)); - cur->init_hook(rdev, do_enum); - break; - } - cur++; - } - - if ((cur >= end) && (rdev->pef & RIO_PEF_STD_RT)) { - pr_debug("RIO: adding STD routing ops for %s\n", - rio_name(rdev)); - rdev->rswitch->add_entry = rio_std_route_add_entry; - rdev->rswitch->get_entry = rio_std_route_get_entry; - rdev->rswitch->clr_table = rio_std_route_clr_table; - } - - if (!rdev->rswitch->add_entry || !rdev->rswitch->get_entry) - printk(KERN_ERR "RIO: missing routing ops for %s\n", - rio_name(rdev)); -} - -/** - * rio_add_device- Adds a RIO device to the device model - * @rdev: RIO device - * - * Adds the RIO device to the global device list and adds the RIO - * device to the RIO device list. Creates the generic sysfs nodes - * for an RIO device. 
- */ -static int rio_add_device(struct rio_dev *rdev) -{ - int err; - - err = device_add(&rdev->dev); - if (err) - return err; - - spin_lock(&rio_global_list_lock); - list_add_tail(&rdev->global_list, &rio_devices); - spin_unlock(&rio_global_list_lock); - - rio_create_sysfs_dev_files(rdev); - - return 0; -} - -/** - * rio_enable_rx_tx_port - enable input receiver and output transmitter of - * given port - * @port: Master port associated with the RIO network - * @local: local=1 select local port otherwise a far device is reached - * @destid: Destination ID of the device to check host bit - * @hopcount: Number of hops to reach the target - * @port_num: Port (-number on switch) to enable on a far end device - * - * Returns 0 or 1 from on General Control Command and Status Register - * (EXT_PTR+0x3C) - */ -inline int rio_enable_rx_tx_port(struct rio_mport *port, - int local, u16 destid, - u8 hopcount, u8 port_num) { -#ifdef CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS - u32 regval; - u32 ext_ftr_ptr; - - /* - * enable rx input tx output port - */ - pr_debug("rio_enable_rx_tx_port(local = %d, destid = %d, hopcount = " - "%d, port_num = %d)\n", local, destid, hopcount, port_num); - - ext_ftr_ptr = rio_mport_get_physefb(port, local, destid, hopcount); - - if (local) { - rio_local_read_config_32(port, ext_ftr_ptr + - RIO_PORT_N_CTL_CSR(0), - ®val); - } else { - if (rio_mport_read_config_32(port, destid, hopcount, - ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), ®val) < 0) - return -EIO; - } - - if (regval & RIO_PORT_N_CTL_P_TYP_SER) { - /* serial */ - regval = regval | RIO_PORT_N_CTL_EN_RX_SER - | RIO_PORT_N_CTL_EN_TX_SER; - } else { - /* parallel */ - regval = regval | RIO_PORT_N_CTL_EN_RX_PAR - | RIO_PORT_N_CTL_EN_TX_PAR; - } - - if (local) { - rio_local_write_config_32(port, ext_ftr_ptr + - RIO_PORT_N_CTL_CSR(0), regval); - } else { - if (rio_mport_write_config_32(port, destid, hopcount, - ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), regval) < 0) - return -EIO; - } -#endif - return 0; -} - -/** * rio_setup_device- Allocates and sets up a RIO device * @net: RIO network * @port: Master port to send transactions @@ -587,8 +462,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, rdev->destid); } - rdev->dev.bus = &rio_bus_type; - rdev->dev.parent = &rio_bus; + rio_attach_device(rdev); device_initialize(&rdev->dev); rdev->dev.release = rio_release_dev; @@ -1260,19 +1134,30 @@ static void rio_pw_enable(struct rio_mport *port, int enable) /** * rio_enum_mport- Start enumeration through a master port * @mport: Master port to send transactions + * @flags: Enumeration control flags * * Starts the enumeration process. If somebody has enumerated our * master port device, then give up. If not and we have an active * link, then start recursive peer enumeration. Returns %0 if * enumeration succeeds or %-EBUSY if enumeration fails. */ -int rio_enum_mport(struct rio_mport *mport) +int rio_enum_mport(struct rio_mport *mport, u32 flags) { struct rio_net *net = NULL; int rc = 0; printk(KERN_INFO "RIO: enumerate master port %d, %s\n", mport->id, mport->name); + + /* + * To avoid multiple start requests (repeat enumeration is not supported + * by this method) check if enumeration/discovery was performed for this + * mport: if mport was added into the list of mports for a net exit + * with error. + */ + if (mport->nnode.next || mport->nnode.prev) + return -EBUSY; + /* If somebody else enumerated our master port device, bail. 
*/ if (rio_enum_host(mport) < 0) { printk(KERN_INFO @@ -1362,14 +1247,16 @@ static void rio_build_route_tables(struct rio_net *net) /** * rio_disc_mport- Start discovery through a master port * @mport: Master port to send transactions + * @flags: discovery control flags * * Starts the discovery process. If we have an active link, - * then wait for the signal that enumeration is complete. + * then wait for the signal that enumeration is complete (if wait + * is allowed). * When enumeration completion is signaled, start recursive * peer discovery. Returns %0 if discovery succeeds or %-EBUSY * on failure. */ -int rio_disc_mport(struct rio_mport *mport) +int rio_disc_mport(struct rio_mport *mport, u32 flags) { struct rio_net *net = NULL; unsigned long to_end; @@ -1379,6 +1266,11 @@ int rio_disc_mport(struct rio_mport *mport) /* If master port has an active link, allocate net and discover peers */ if (rio_mport_is_active(mport)) { + if (rio_enum_complete(mport)) + goto enum_done; + else if (flags & RIO_SCAN_ENUM_NO_WAIT) + return -EAGAIN; + pr_debug("RIO: wait for enumeration to complete...\n"); to_end = jiffies + CONFIG_RAPIDIO_DISC_TIMEOUT * HZ; @@ -1421,3 +1313,46 @@ enum_done: bail: return -EBUSY; } + +struct rio_scan rio_scan_ops = { + .enumerate = rio_enum_mport, + .discover = rio_disc_mport, +}; + + +#ifdef MODULE + +static bool scan; +module_param(scan, bool, 0); +MODULE_PARM_DESC(scan, "Start RapidIO network enumeration/discovery " + "(default = 0)"); + +/** + * rio_basic_attach: + * + * When this enumeration/discovery method is loaded as a module this function + * registers its specific enumeration and discovery routines for all available + * RapidIO mport devices. The "scan" command line parameter controls the ability + * of the module to start RapidIO enumeration/discovery automatically. + * + * Returns 0 for success or -EIO if unable to register itself. + * + * This enumeration/discovery method cannot be unloaded and therefore does not + * provide a matching cleanup_module routine.
+ */ + +int __init rio_basic_attach(void) +{ + if (rio_register_scan(RIO_MPORT_ANY, &rio_scan_ops)) + return -EIO; + if (scan) + rio_init_mports(); + return 0; +} + +module_init(rio_basic_attach); + +MODULE_DESCRIPTION("Basic RapidIO enumeration/discovery"); +MODULE_LICENSE("GPL"); + +#endif /* MODULE */ diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c index 4dbe360989be..66d4acd5e18f 100644 --- a/drivers/rapidio/rio-sysfs.c +++ b/drivers/rapidio/rio-sysfs.c @@ -285,3 +285,48 @@ void rio_remove_sysfs_dev_files(struct rio_dev *rdev) rdev->rswitch->sw_sysfs(rdev, RIO_SW_SYSFS_REMOVE); } } + +static ssize_t bus_scan_store(struct bus_type *bus, const char *buf, + size_t count) +{ + long val; + struct rio_mport *port = NULL; + int rc; + + if (kstrtol(buf, 0, &val) < 0) + return -EINVAL; + + if (val == RIO_MPORT_ANY) { + rc = rio_init_mports(); + goto exit; + } + + if (val < 0 || val >= RIO_MAX_MPORTS) + return -EINVAL; + + port = rio_find_mport((int)val); + + if (!port) { + pr_debug("RIO: %s: mport_%d not available\n", + __func__, (int)val); + return -EINVAL; + } + + if (!port->nscan) + return -EINVAL; + + if (port->host_deviceid >= 0) + rc = port->nscan->enumerate(port, 0); + else + rc = port->nscan->discover(port, RIO_SCAN_ENUM_NO_WAIT); +exit: + if (!rc) + rc = count; + + return rc; +} + +struct bus_attribute rio_bus_attrs[] = { + __ATTR(scan, (S_IWUSR|S_IWGRP), NULL, bus_scan_store), + __ATTR_NULL +}; diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index d553b5d13722..3bfbcecaae0b 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -31,6 +31,9 @@ #include "rio.h" +LIST_HEAD(rio_devices); +DEFINE_SPINLOCK(rio_global_list_lock); + static LIST_HEAD(rio_mports); static unsigned char next_portid; static DEFINE_SPINLOCK(rio_mmap_lock); @@ -53,6 +56,32 @@ u16 rio_local_get_device_id(struct rio_mport *port) } /** + * rio_add_device- Adds a RIO device to the device model + * @rdev: RIO device + * + * Adds the RIO device to the global device list and adds the RIO + * device to the RIO device list. Creates the generic sysfs nodes + * for an RIO device. + */ +int rio_add_device(struct rio_dev *rdev) +{ + int err; + + err = device_add(&rdev->dev); + if (err) + return err; + + spin_lock(&rio_global_list_lock); + list_add_tail(&rdev->global_list, &rio_devices); + spin_unlock(&rio_global_list_lock); + + rio_create_sysfs_dev_files(rdev); + + return 0; +} +EXPORT_SYMBOL_GPL(rio_add_device); + +/** * rio_request_inb_mbox - request inbound mailbox service * @mport: RIO master port from which to allocate the mailbox resource * @dev_id: Device specific pointer to pass on event @@ -489,6 +518,7 @@ rio_mport_get_physefb(struct rio_mport *port, int local, return ext_ftr_ptr; } +EXPORT_SYMBOL_GPL(rio_mport_get_physefb); /** * rio_get_comptag - Begin or continue searching for a RIO device by component tag @@ -521,6 +551,7 @@ exit: spin_unlock(&rio_global_list_lock); return rdev; } +EXPORT_SYMBOL_GPL(rio_get_comptag); /** * rio_set_port_lockout - Sets/clears LOCKOUT bit (RIO EM 1.3) for a switch port. @@ -545,6 +576,107 @@ int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock) regval); return 0; } +EXPORT_SYMBOL_GPL(rio_set_port_lockout); + +/** + * rio_switch_init - Sets switch operations for a particular vendor switch + * @rdev: RIO device + * @do_enum: Enumeration/Discovery mode flag + * + * Searches the RIO switch ops table for known switch types. 
If the vid + * and did match a switch table entry, then call switch initialization + * routine to setup switch-specific routines. + */ +void rio_switch_init(struct rio_dev *rdev, int do_enum) +{ + struct rio_switch_ops *cur = __start_rio_switch_ops; + struct rio_switch_ops *end = __end_rio_switch_ops; + + while (cur < end) { + if ((cur->vid == rdev->vid) && (cur->did == rdev->did)) { + pr_debug("RIO: calling init routine for %s\n", + rio_name(rdev)); + cur->init_hook(rdev, do_enum); + break; + } + cur++; + } + + if ((cur >= end) && (rdev->pef & RIO_PEF_STD_RT)) { + pr_debug("RIO: adding STD routing ops for %s\n", + rio_name(rdev)); + rdev->rswitch->add_entry = rio_std_route_add_entry; + rdev->rswitch->get_entry = rio_std_route_get_entry; + rdev->rswitch->clr_table = rio_std_route_clr_table; + } + + if (!rdev->rswitch->add_entry || !rdev->rswitch->get_entry) + printk(KERN_ERR "RIO: missing routing ops for %s\n", + rio_name(rdev)); +} +EXPORT_SYMBOL_GPL(rio_switch_init); + +/** + * rio_enable_rx_tx_port - enable input receiver and output transmitter of + * given port + * @port: Master port associated with the RIO network + * @local: local=1 select local port otherwise a far device is reached + * @destid: Destination ID of the device to check host bit + * @hopcount: Number of hops to reach the target + * @port_num: Port (-number on switch) to enable on a far end device + * + * Returns 0 or 1 from the General Control Command and Status Register + * (EXT_PTR+0x3C) + */ +int rio_enable_rx_tx_port(struct rio_mport *port, + int local, u16 destid, + u8 hopcount, u8 port_num) +{ +#ifdef CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS + u32 regval; + u32 ext_ftr_ptr; + + /* + * enable rx input tx output port + */ + pr_debug("rio_enable_rx_tx_port(local = %d, destid = %d, hopcount = " + "%d, port_num = %d)\n", local, destid, hopcount, port_num); + + ext_ftr_ptr = rio_mport_get_physefb(port, local, destid, hopcount); + + if (local) { + rio_local_read_config_32(port, ext_ftr_ptr + + RIO_PORT_N_CTL_CSR(0), + &regval); + } else { + if (rio_mport_read_config_32(port, destid, hopcount, + ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), &regval) < 0) + return -EIO; + } + + if (regval & RIO_PORT_N_CTL_P_TYP_SER) { + /* serial */ + regval = regval | RIO_PORT_N_CTL_EN_RX_SER + | RIO_PORT_N_CTL_EN_TX_SER; + } else { + /* parallel */ + regval = regval | RIO_PORT_N_CTL_EN_RX_PAR + | RIO_PORT_N_CTL_EN_TX_PAR; + } + + if (local) { + rio_local_write_config_32(port, ext_ftr_ptr + + RIO_PORT_N_CTL_CSR(0), regval); + } else { + if (rio_mport_write_config_32(port, destid, hopcount, + ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), regval) < 0) + return -EIO; + } +#endif + return 0; +} +EXPORT_SYMBOL_GPL(rio_enable_rx_tx_port); + /** * rio_chk_dev_route - Validate route to the specified device. @@ -610,6 +742,7 @@ rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid, u8 hopcount) return 0; } +EXPORT_SYMBOL_GPL(rio_mport_chk_dev_access); /** * rio_chk_dev_access - Validate access to the specified device.
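
With the scan hooks and these helper exports in place, an enumeration method no longer has to live inside the RapidIO core. A minimal sketch of an alternative enumerator registering itself through the new interface (not part of this patch; my_enumerate()/my_discover() are placeholder names, and it assumes struct rio_scan and RIO_MPORT_ANY are visible to the module, as they are to the basic method above):

  #include <linux/module.h>
  #include <linux/rio.h>

  static int my_enumerate(struct rio_mport *mport, u32 flags)
  {
          /* Walk the fabric, assign destids, register devices
           * with rio_add_device(), etc. */
          return 0;
  }

  static int my_discover(struct rio_mport *mport, u32 flags)
  {
          /* Passively discover an already-enumerated fabric. */
          return 0;
  }

  static struct rio_scan my_scan_ops = {
          .enumerate = my_enumerate,
          .discover = my_discover,
  };

  static int __init my_scan_init(void)
  {
          /* Attach to every mport that has no method attached yet. */
          return rio_register_scan(RIO_MPORT_ANY, &my_scan_ops);
  }
  module_init(my_scan_init);
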
@@ -941,6 +1074,7 @@ rio_mport_get_efb(struct rio_mport *port, int local, u16 destid, return RIO_GET_BLOCK_ID(reg_val); } } +EXPORT_SYMBOL_GPL(rio_mport_get_efb); /** * rio_mport_get_feature - query for devices' extended features @@ -997,6 +1131,7 @@ rio_mport_get_feature(struct rio_mport * port, int local, u16 destid, return 0; } +EXPORT_SYMBOL_GPL(rio_mport_get_feature); /** * rio_get_asm - Begin or continue searching for a RIO device by vid/did/asm_vid/asm_did @@ -1246,6 +1381,87 @@ EXPORT_SYMBOL_GPL(rio_dma_prep_slave_sg); #endif /* CONFIG_RAPIDIO_DMA_ENGINE */ +/** + * rio_find_mport - find RIO mport by its ID + * @mport_id: number (ID) of mport device + * + * Given a RIO mport number, the desired mport is located + * in the global list of mports. If the mport is found, a pointer to its + * data structure is returned. If no mport is found, %NULL is returned. + */ +struct rio_mport *rio_find_mport(int mport_id) +{ + struct rio_mport *port = NULL; + + list_for_each_entry(port, &rio_mports, node) { + if (port->id == mport_id) + return port; + } + + return NULL; +} + +/** + * rio_register_scan - enumeration/discovery method registration interface + * @mport_id: mport device ID for which fabric scan routine has to be set + * (RIO_MPORT_ANY = set for all available mports) + * @scan_ops: enumeration/discovery control structure + * + * Assigns enumeration or discovery method to the specified mport device (or all + * available mports if RIO_MPORT_ANY is specified). + * Returns an error if the mport already has an enumerator attached to it. + * In case of RIO_MPORT_ANY it ignores ports with valid scan routines and returns + * an error if it was unable to find at least one available mport. + */ +int rio_register_scan(int mport_id, struct rio_scan *scan_ops) +{ + struct rio_mport *port; + int rc = -EBUSY; + + list_for_each_entry(port, &rio_mports, node) { + if (port->id == mport_id || mport_id == RIO_MPORT_ANY) { + if (port->nscan && mport_id == RIO_MPORT_ANY) + continue; + else if (port->nscan) + break; + + port->nscan = scan_ops; + rc = 0; + + if (mport_id != RIO_MPORT_ANY) + break; + } + } + + return rc; +} +EXPORT_SYMBOL_GPL(rio_register_scan); + +/** + * rio_unregister_scan - removes enumeration/discovery method from mport + * @mport_id: mport device ID for which fabric scan routine has to be + * unregistered (RIO_MPORT_ANY = set for all available mports) + * + * Removes enumeration or discovery method assigned to the specified mport + * device (or all available mports if RIO_MPORT_ANY is specified). + */ +int rio_unregister_scan(int mport_id) +{ + struct rio_mport *port; + + list_for_each_entry(port, &rio_mports, node) { + if (port->id == mport_id || mport_id == RIO_MPORT_ANY) { + if (port->nscan) + port->nscan = NULL; + if (mport_id != RIO_MPORT_ANY) + break; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(rio_unregister_scan); + static void rio_fixup_device(struct rio_dev *dev) { } @@ -1274,7 +1490,7 @@ static void disc_work_handler(struct work_struct *_work) work = container_of(_work, struct rio_disc_work, work); pr_debug("RIO: discovery work for mport %d %s\n", work->mport->id, work->mport->name); - rio_disc_mport(work->mport); + work->mport->nscan->discover(work->mport, 0); } int rio_init_mports(void) @@ -1291,9 +1507,10 @@ int rio_init_mports(void) * on any of the registered mports.
*/ list_for_each_entry(port, &rio_mports, node) { - if (port->host_deviceid >= 0) - rio_enum_mport(port); - else + if (port->host_deviceid >= 0) { + if (port->nscan) + port->nscan->enumerate(port, 0); + } else n++; } @@ -1323,7 +1540,7 @@ int rio_init_mports(void) n = 0; list_for_each_entry(port, &rio_mports, node) { - if (port->host_deviceid < 0) { + if (port->host_deviceid < 0 && port->nscan) { work[n].mport = port; INIT_WORK(&work[n].work, disc_work_handler); queue_work(rio_wq, &work[n].work); @@ -1342,7 +1559,9 @@ no_disc: return 0; } +#ifdef CONFIG_RAPIDIO_ENUM_AUTO device_initcall_sync(rio_init_mports); +#endif static int hdids[RIO_MAX_MPORTS + 1]; @@ -1362,6 +1581,9 @@ static int rio_hdid_setup(char *str) __setup("riohdid=", rio_hdid_setup); +/* Enumerator structure that links built-in enumerator */ +extern struct rio_scan __weak rio_scan_ops; + int rio_register_mport(struct rio_mport *port) { if (next_portid >= RIO_MAX_MPORTS) { @@ -1371,6 +1593,7 @@ int rio_register_mport(struct rio_mport *port) port->id = next_portid++; port->host_deviceid = rio_get_hdid(port->id); + port->nscan = &rio_scan_ops; list_add_tail(&port->node, &rio_mports); return 0; } @@ -1386,3 +1609,4 @@ EXPORT_SYMBOL_GPL(rio_request_inb_mbox); EXPORT_SYMBOL_GPL(rio_release_inb_mbox); EXPORT_SYMBOL_GPL(rio_request_outb_mbox); EXPORT_SYMBOL_GPL(rio_release_outb_mbox); +EXPORT_SYMBOL_GPL(rio_init_mports); diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index b1af414f15e6..bdd339e884b8 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -15,6 +15,7 @@ #include <linux/rio.h> #define RIO_MAX_CHK_RETRY 3 +#define RIO_MPORT_ANY (-1) /* Functions internal to the RIO core code */ @@ -27,8 +28,6 @@ extern u32 rio_mport_get_efb(struct rio_mport *port, int local, u16 destid, extern int rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid, u8 hopcount); extern int rio_create_sysfs_dev_files(struct rio_dev *rdev); -extern int rio_enum_mport(struct rio_mport *mport); -extern int rio_disc_mport(struct rio_mport *mport); extern int rio_std_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount, u16 table, u16 route_destid, u8 route_port); @@ -39,9 +38,18 @@ extern int rio_std_route_clr_table(struct rio_mport *mport, u16 destid, u8 hopcount, u16 table); extern int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock); extern struct rio_dev *rio_get_comptag(u32 comp_tag, struct rio_dev *from); +extern int rio_add_device(struct rio_dev *rdev); +extern void rio_switch_init(struct rio_dev *rdev, int do_enum); +extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid, + u8 hopcount, u8 port_num); +extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops); +extern int rio_unregister_scan(int mport_id); +extern void rio_attach_device(struct rio_dev *rdev); +extern struct rio_mport *rio_find_mport(int mport_id); /* Structures internal to the RIO core code */ extern struct device_attribute rio_dev_attrs[]; +extern struct bus_attribute rio_bus_attrs[]; extern spinlock_t rio_global_list_lock; extern struct rio_switch_ops __start_rio_switch_ops[]; diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index ed037ae91c5f..a2073eb968a2 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -19,6 +19,7 @@ * */ +#include <linux/delay.h> #include <linux/init.h> #include <linux/platform_device.h> #include <linux/module.h> @@ -80,22 +81,29 @@ #define RYAR1 0x1c #define RTCPICR 0x34 #define PIAR 0x38 +#define PSBR_RTC 0x00 #define rtc_readl(pxa_rtc, 
reg) \ __raw_readl((pxa_rtc)->base + (reg)) #define rtc_writel(pxa_rtc, reg, value) \ __raw_writel((value), (pxa_rtc)->base + (reg)) +#define rtc_readl_psbr(pxa_rtc, reg) \ + __raw_readl((pxa_rtc)->base_psbr + (reg)) +#define rtc_writel_psbr(pxa_rtc, reg, value) \ + __raw_writel((value), (pxa_rtc)->base_psbr + (reg)) struct pxa_rtc { struct resource *ress; + struct resource *ress_psbr; void __iomem *base; + void __iomem *base_psbr; int irq_1Hz; int irq_Alrm; struct rtc_device *rtc; spinlock_t lock; /* Protects this structure */ }; - +static struct pxa_rtc *rtc_info; static u32 ryxr_calc(struct rtc_time *tm) { return ((tm->tm_year + 1900) << RYxR_YEAR_S) @@ -117,7 +125,7 @@ static void tm_calc(u32 rycr, u32 rdcr, struct rtc_time *tm) tm->tm_year = ((rycr & RYxR_YEAR_MASK) >> RYxR_YEAR_S) - 1900; tm->tm_mon = (((rycr & RYxR_MONTH_MASK) >> RYxR_MONTH_S)) - 1; tm->tm_mday = (rycr & RYxR_DAY_MASK); - tm->tm_wday = ((rycr & RDxR_DOW_MASK) >> RDxR_DOW_S) - 1; + tm->tm_wday = ((rdcr & RDxR_DOW_MASK) >> RDxR_DOW_S) - 1; tm->tm_hour = (rdcr & RDxR_HOUR_MASK) >> RDxR_HOUR_S; tm->tm_min = (rdcr & RDxR_MIN_MASK) >> RDxR_MIN_S; tm->tm_sec = rdcr & RDxR_SEC_MASK; @@ -175,7 +183,6 @@ static irqreturn_t pxa_rtc_irq(int irq, void *dev_id) /* enable back rtc interrupts */ rtc_writel(pxa_rtc, RTSR, rtsr & ~RTSR_TRIG_MASK); - spin_unlock(&pxa_rtc->lock); return IRQ_HANDLED; } @@ -250,12 +257,45 @@ static int pxa_rtc_read_time(struct device *dev, struct rtc_time *tm) static int pxa_rtc_set_time(struct device *dev, struct rtc_time *tm) { struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); - + /* sequence to write pxa rtc registers RCNR RDCR RYCR is + *1. set PSBR[RWE] bit, take 2x32-khz to complete + *2. write to RTC register, take 2x32-khz to complete + *3. clear PSBR[RWE] bit, take 2x32-khz to complete + */ + if ((tm->tm_year < 70) || (tm->tm_year > 138)) + return -EINVAL; + rtc_writel_psbr(rtc_info, PSBR_RTC, 0x01); + udelay(100); rtc_writel(pxa_rtc, RYCR, ryxr_calc(tm)); rtc_writel(pxa_rtc, RDCR, rdxr_calc(tm)); + udelay(100); + rtc_writel_psbr(rtc_info, PSBR_RTC, 0x00); + udelay(100); + pxa_rtc_read_time(dev, tm); + dev_info(dev, "tm.year = %d, tm.month = %d, tm.day = %d\n", + tm->tm_year + 1900, tm->tm_mon, tm->tm_mday); + return 0; +} +int pxa_rtc_sync_time(unsigned int ticks) +{ + /* sequence to write pxa rtc registers RCNR RDCR RYCR is + *1. set PSBR[RWE] bit, take 2x32-khz to complete + *2. write to RTC register, take 2x32-khz to complete + *3. clear PSBR[RWE] bit, take 2x32-khz to complete + */ + struct rtc_time tm; + rtc_time_to_tm(ticks, &tm); + rtc_writel_psbr(rtc_info, PSBR_RTC, 0x01); + udelay(100); + rtc_writel(rtc_info, RYCR, ryxr_calc(&tm)); + rtc_writel(rtc_info, RDCR, rdxr_calc(&tm)); + udelay(100); + rtc_writel_psbr(rtc_info, PSBR_RTC, 0x00); + udelay(100); return 0; } +EXPORT_SYMBOL(pxa_rtc_sync_time); static int pxa_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { @@ -324,10 +364,10 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) int ret; u32 rttr; - pxa_rtc = kzalloc(sizeof(struct pxa_rtc), GFP_KERNEL); + pxa_rtc = devm_kzalloc(dev, sizeof(struct pxa_rtc), GFP_KERNEL); if (!pxa_rtc) return -ENOMEM; - + rtc_info = pxa_rtc; spin_lock_init(&pxa_rtc->lock); platform_set_drvdata(pdev, pxa_rtc); @@ -335,28 +375,38 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) pxa_rtc->ress = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!pxa_rtc->ress) { dev_err(dev, "No I/O memory resource defined\n"); - goto err_ress; + return ret; } + pxa_rtc->ress_psbr = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (!pxa_rtc->ress_psbr) { + dev_err(dev, "No I/O memory resource defined\n"); + return ret; } pxa_rtc->irq_1Hz = platform_get_irq(pdev, 0); if (pxa_rtc->irq_1Hz < 0) { dev_err(dev, "No 1Hz IRQ resource defined\n"); - goto err_ress; + return ret; } pxa_rtc->irq_Alrm = platform_get_irq(pdev, 1); if (pxa_rtc->irq_Alrm < 0) { dev_err(dev, "No alarm IRQ resource defined\n"); - goto err_ress; + return ret; } - pxa_rtc_open(dev); ret = -ENOMEM; - pxa_rtc->base = ioremap(pxa_rtc->ress->start, + pxa_rtc->base = devm_ioremap(&pdev->dev, pxa_rtc->ress->start, resource_size(pxa_rtc->ress)); if (!pxa_rtc->base) { dev_err(&pdev->dev, "Unable to map pxa RTC I/O memory\n"); - goto err_map; + return ret; } + pxa_rtc->base_psbr = devm_ioremap(&pdev->dev, pxa_rtc->ress_psbr->start, + resource_size(pxa_rtc->ress_psbr)); + if (!pxa_rtc->base_psbr) { + dev_err(&pdev->dev, "Unable to map pxa RTC PSBR I/O memory\n"); + return ret; + } /* * If the clock divider is uninitialized then reset it to the * default value to get the 1Hz clock.
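
The PSBR write-enable sequence described in the comments above is open-coded twice, in pxa_rtc_set_time() and pxa_rtc_sync_time(). As a sketch only (built from the accessors this patch adds, not something the patch itself does), the duplication could be factored into a helper:

  /* Write RYCR/RDCR under PSBR[RWE]; each step needs ~2 32-kHz cycles,
   * for which the patch uses udelay(100). */
  static void pxa_rtc_write_counters(struct pxa_rtc *rtc, u32 rycr, u32 rdcr)
  {
          rtc_writel_psbr(rtc, PSBR_RTC, 0x01);   /* 1. set PSBR[RWE] */
          udelay(100);
          rtc_writel(rtc, RYCR, rycr);            /* 2. write RTC registers */
          rtc_writel(rtc, RDCR, rdcr);
          udelay(100);
          rtc_writel_psbr(rtc, PSBR_RTC, 0x00);   /* 3. clear PSBR[RWE] */
          udelay(100);
  }
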
@@ -370,41 +420,24 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_RDALE1 | RTSR_HZE); - pxa_rtc->rtc = rtc_device_register("pxa-rtc", &pdev->dev, &pxa_rtc_ops, - THIS_MODULE); + pxa_rtc->rtc = devm_rtc_device_register(&pdev->dev, "pxa-rtc", + &pxa_rtc_ops, THIS_MODULE); ret = PTR_ERR(pxa_rtc->rtc); if (IS_ERR(pxa_rtc->rtc)) { dev_err(dev, "Failed to register RTC device -> %d\n", ret); - goto err_rtc_reg; + return ret; } device_init_wakeup(dev, 1); - + pxa_rtc_open(dev); return 0; - -err_rtc_reg: - iounmap(pxa_rtc->base); -err_ress: -err_map: - kfree(pxa_rtc); - return ret; } static int __exit pxa_rtc_remove(struct platform_device *pdev) { - struct pxa_rtc *pxa_rtc = platform_get_drvdata(pdev); - struct device *dev = &pdev->dev; pxa_rtc_release(dev); - rtc_device_unregister(pxa_rtc->rtc); - - spin_lock_irq(&pxa_rtc->lock); - iounmap(pxa_rtc->base); - spin_unlock_irq(&pxa_rtc->lock); - - kfree(pxa_rtc); - return 0; } diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 6e0cba8f47d5..d87878eee8c1 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -49,18 +49,13 @@ struct v3020_chip_ops { #define V3020_RD 2 #define V3020_IO 3 -struct v3020_gpio { - const char *name; - unsigned int gpio; -}; - struct v3020 { /* MMIO access */ void __iomem *ioaddress; int leftshift; /* GPIO access */ - struct v3020_gpio *gpio; + struct gpio *gpio; struct v3020_chip_ops *ops; @@ -107,48 +102,40 @@ static struct v3020_chip_ops v3020_mmio_ops = { .write_bit = v3020_mmio_write_bit, }; -static struct v3020_gpio v3020_gpio[] = { - { "RTC CS", 0 }, - { "RTC WR", 0 }, - { "RTC RD", 0 }, - { "RTC IO", 0 }, +static struct gpio v3020_gpio[] = { + { 0, GPIOF_OUT_INIT_HIGH, "RTC CS"}, + { 0, GPIOF_OUT_INIT_HIGH, "RTC WR"}, + { 0, GPIOF_OUT_INIT_HIGH, "RTC RD"}, + { 0, GPIOF_OUT_INIT_HIGH, "RTC IO"}, }; static int v3020_gpio_map(struct v3020 *chip, struct platform_device *pdev, struct v3020_platform_data *pdata) { - int i, err; + int err; v3020_gpio[V3020_CS].gpio = pdata->gpio_cs; v3020_gpio[V3020_WR].gpio = pdata->gpio_wr; v3020_gpio[V3020_RD].gpio = pdata->gpio_rd; v3020_gpio[V3020_IO].gpio = pdata->gpio_io; - for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) { - err = gpio_request(v3020_gpio[i].gpio, v3020_gpio[i].name); - if (err) - goto err_request; - - gpio_direction_output(v3020_gpio[i].gpio, 1); - } + err = gpio_request_array(v3020_gpio, ARRAY_SIZE(v3020_gpio)); + if (err) + goto err_request; chip->gpio = v3020_gpio; return 0; err_request: - while (--i >= 0) - gpio_free(v3020_gpio[i].gpio); + gpio_free_array(v3020_gpio, ARRAY_SIZE(v3020_gpio)); return err; } static void v3020_gpio_unmap(struct v3020 *chip) { - int i; - - for (i = 0; i < ARRAY_SIZE(v3020_gpio); i++) - gpio_free(v3020_gpio[i].gpio); + gpio_free_array(v3020_gpio, ARRAY_SIZE(v3020_gpio)); } static void v3020_gpio_write_bit(struct v3020 *chip, unsigned char bit) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9f0c46547459..df5e961484e1 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -35,6 +35,7 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/aio.h> #include <linux/errno.h> #include <linux/mtio.h> #include <linux/ioctl.h> diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c index b14a55742559..b040200a5a55 100644 --- a/drivers/staging/android/logger.c +++ b/drivers/staging/android/logger.c @@ -28,6 +28,7 @@ 
#include <linux/slab.h> #include <linux/time.h> #include <linux/vmalloc.h> +#include <linux/aio.h> #include "logger.h" #include <asm/ioctls.h> diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 07f5f94634bb..4e842cb161c6 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -271,7 +271,8 @@ static void iblock_complete_cmd(struct se_cmd *cmd) kfree(ibr); } -static void iblock_bio_done(struct bio *bio, int err) +static void iblock_bio_done(struct bio *bio, int err, + struct batch_complete *batch) { struct se_cmd *cmd = bio->bi_private; struct iblock_req *ibr = cmd->priv; @@ -335,7 +336,8 @@ static void iblock_submit_bios(struct bio_list *list, int rw) blk_finish_plug(&plug); } -static void iblock_end_io_flush(struct bio *bio, int err) +static void iblock_end_io_flush(struct bio *bio, int err, + struct batch_complete *batch) { struct se_cmd *cmd = bio->bi_private; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index e992b27aa090..1e9873179860 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -835,7 +835,8 @@ static ssize_t pscsi_show_configfs_dev_params(struct se_device *dev, char *b) return bl; } -static void pscsi_bi_endio(struct bio *bio, int error) +static void pscsi_bi_endio(struct bio *bio, int error, + struct batch_complete *batch) { bio_put(bio); } diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index f52dcfe8f545..24bd363ca351 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -3099,7 +3099,7 @@ static int init_dma_pools(struct udc *dev) } /* DMA setup */ - dev->data_requests = dma_pool_create("data_requests", NULL, + dev->data_requests = dma_pool_create("data_requests", &dev->pdev->dev, sizeof(struct udc_data_dma), 0, 0); if (!dev->data_requests) { DBG(dev, "can't get request data pool\n"); @@ -3111,7 +3111,7 @@ static int init_dma_pools(struct udc *dev) dev->ep[UDC_EP0IN_IX].dma = &dev->regs->ctl; /* dma desc for setup data */ - dev->stp_requests = dma_pool_create("setup requests", NULL, + dev->stp_requests = dma_pool_create("setup requests", &dev->pdev->dev, sizeof(struct udc_stp_dma), 0, 0); if (!dev->stp_requests) { DBG(dev, "can't get stp request pool\n"); diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index dda0dc4a5567..570c005062ab 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -24,6 +24,8 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/poll.h> +#include <linux/mmu_context.h> +#include <linux/aio.h> #include <linux/device.h> #include <linux/moduleparam.h> @@ -513,6 +515,9 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) struct kiocb_priv { struct usb_request *req; struct ep_data *epdata; + struct kiocb *iocb; + struct mm_struct *mm; + struct work_struct work; void *buf; const struct iovec *iv; unsigned long nr_segs; @@ -528,7 +533,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) local_irq_disable(); epdata = priv->epdata; // spin_lock(&epdata->dev->lock); - kiocbSetCancelled(iocb); if (likely(epdata && epdata->ep && priv->req)) value = usb_ep_dequeue (epdata->ep, priv->req); else @@ -540,15 +544,12 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) return value; } -static ssize_t ep_aio_read_retry(struct kiocb *iocb) +static ssize_t ep_copy_to_user(struct kiocb_priv *priv) { - struct kiocb_priv *priv = iocb->private; 
ssize_t len, total; void *to_copy; int i; - /* we "retry" to get the right mm context for this: */ - /* copy stuff into user buffers */ total = priv->actual; len = 0; @@ -568,9 +569,26 @@ static ssize_t ep_aio_read_retry(struct kiocb *iocb) if (total == 0) break; } + + return len; +} + +static void ep_user_copy_worker(struct work_struct *work) +{ + struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work); + struct mm_struct *mm = priv->mm; + struct kiocb *iocb = priv->iocb; + size_t ret; + + use_mm(mm); + ret = ep_copy_to_user(priv); + unuse_mm(mm); + + /* completing the iocb can drop the ctx and mm, don't touch mm after */ + aio_complete(iocb, ret, ret); + kfree(priv->buf); kfree(priv); - return len; } static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) @@ -596,14 +614,14 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) aio_complete(iocb, req->actual ? req->actual : req->status, req->status); } else { - /* retry() won't report both; so we hide some faults */ + /* ep_copy_to_user() won't report both; we hide some faults */ if (unlikely(0 != req->status)) DBG(epdata->dev, "%s fault %d len %d\n", ep->name, req->status, req->actual); priv->buf = req->buf; priv->actual = req->actual; - kick_iocb(iocb); + schedule_work(&priv->work); } spin_unlock(&epdata->dev->lock); @@ -633,8 +651,10 @@ fail: return value; } iocb->private = priv; + priv->iocb = iocb; priv->iv = iv; priv->nr_segs = nr_segs; + INIT_WORK(&priv->work, ep_user_copy_worker); value = get_ready_ep(iocb->ki_filp->f_flags, epdata); if (unlikely(value < 0)) { @@ -642,10 +662,11 @@ fail: goto fail; } - iocb->ki_cancel = ep_aio_cancel; + kiocb_set_cancel_fn(iocb, ep_aio_cancel); get_ep(epdata); priv->epdata = epdata; priv->actual = 0; + priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */ /* each kiocb is coupled to one usb_request, but we can't * allocate or submit those if the host disconnected. @@ -674,7 +695,7 @@ fail: kfree(priv); put_ep(epdata); } else - value = (iv ? -EIOCBRETRY : -EIOCBQUEUED); + value = -EIOCBQUEUED; return value; } @@ -692,7 +713,6 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov, if (unlikely(!buf)) return -ENOMEM; - iocb->ki_retry = ep_aio_read_retry; return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs); } diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index 1534cf3c1423..d8ecc5242cd3 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -2453,6 +2453,23 @@ config FB_HYPERV help This framebuffer driver supports Microsoft Hyper-V Synthetic Video. +config FB_SIMPLE + bool "Simple framebuffer support" + depends on (FB = y) && OF + select FB_CFB_FILLRECT + select FB_CFB_COPYAREA + select FB_CFB_IMAGEBLIT + help + Say Y if you want support for a simple frame-buffer. + + This driver assumes that the display hardware has been initialized + before the kernel boots, and the kernel will simply render to the + pre-allocated frame buffer surface. + + Configuration re: surface address, size, and format must be provided + through device tree, or potentially plain old platform data in the + future. 
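
An implied invariant of this configuration is that the framebuffer memory region must cover at least height * stride bytes. The initial driver below does not verify this at probe time, so the following is purely an illustrative sketch (simplefb_check_mem() is a hypothetical helper using the driver's own simplefb_params):

  static int simplefb_check_mem(struct resource *mem,
                                struct simplefb_params *params)
  {
          u64 needed = (u64)params->stride * params->height;

          /* the declared geometry must fit in the reserved region */
          if (resource_size(mem) < needed)
                  return -EINVAL;
          return 0;
  }
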
+ source "drivers/video/omap/Kconfig" source "drivers/video/omap2/Kconfig" source "drivers/video/exynos/Kconfig" diff --git a/drivers/video/Makefile b/drivers/video/Makefile index 7234e4a959e8..e8bae8dd4804 100644 --- a/drivers/video/Makefile +++ b/drivers/video/Makefile @@ -166,6 +166,7 @@ obj-$(CONFIG_FB_MX3) += mx3fb.o obj-$(CONFIG_FB_DA8XX) += da8xx-fb.o obj-$(CONFIG_FB_MXS) += mxsfb.o obj-$(CONFIG_FB_SSD1307) += ssd1307fb.o +obj-$(CONFIG_FB_SIMPLE) += simplefb.o # the test framebuffer is last obj-$(CONFIG_FB_VIRTUAL) += vfb.o diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c index 57886787ead0..e78d9f2233b8 100644 --- a/drivers/video/cyber2000fb.c +++ b/drivers/video/cyber2000fb.c @@ -518,6 +518,9 @@ static void cyber2000fb_set_timing(struct cfb_info *cfb, struct par_info *hw) cyber2000_grphw(0xb9, 0x00, cfb); spin_unlock(&cfb->reg_b0_lock); + /* wait (for the PLL?) to avoid palette corruption at higher clocks */ + msleep(1000); + cfb->ramdac_ctrl = hw->ramdac; cyber2000fb_write_ramdac_ctrl(cfb); diff --git a/drivers/video/simplefb.c b/drivers/video/simplefb.c new file mode 100644 index 000000000000..e2e9e3e61b72 --- /dev/null +++ b/drivers/video/simplefb.c @@ -0,0 +1,234 @@ +/* + * Simplest possible simple frame-buffer driver, as a platform device + * + * Copyright (c) 2013, Stephen Warren + * + * Based on q40fb.c, which was: + * Copyright (C) 2001 Richard Zidlicky <rz@linux-m68k.org> + * + * Also based on offb.c, which was: + * Copyright (C) 1997 Geert Uytterhoeven + * Copyright (C) 1996 Paul Mackerras + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include <linux/errno.h> +#include <linux/fb.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/platform_device.h> + +static struct fb_fix_screeninfo simplefb_fix = { + .id = "simple", + .type = FB_TYPE_PACKED_PIXELS, + .visual = FB_VISUAL_TRUECOLOR, + .accel = FB_ACCEL_NONE, +}; + +static struct fb_var_screeninfo simplefb_var = { + .height = -1, + .width = -1, + .activate = FB_ACTIVATE_NOW, + .vmode = FB_VMODE_NONINTERLACED, +}; + +static int simplefb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, + u_int transp, struct fb_info *info) +{ + u32 *pal = info->pseudo_palette; + u32 cr = red >> (16 - info->var.red.length); + u32 cg = green >> (16 - info->var.green.length); + u32 cb = blue >> (16 - info->var.blue.length); + u32 value; + + if (regno >= 16) + return -EINVAL; + + value = (cr << info->var.red.offset) | + (cg << info->var.green.offset) | + (cb << info->var.blue.offset); + if (info->var.transp.length > 0) { + u32 mask = (1 << info->var.transp.length) - 1; + mask <<= info->var.transp.offset; + value |= mask; + } + pal[regno] = value; + + return 0; +} + +static struct fb_ops simplefb_ops = { + .owner = THIS_MODULE, + .fb_setcolreg = simplefb_setcolreg, + .fb_fillrect = cfb_fillrect, + .fb_copyarea = cfb_copyarea, + .fb_imageblit = cfb_imageblit, +}; + +struct simplefb_format { + const char *name; + u32 bits_per_pixel; + struct fb_bitfield red; + struct fb_bitfield green; + struct fb_bitfield blue; + struct fb_bitfield transp; +}; + +static struct simplefb_format simplefb_formats[] = { + { "r5g6b5", 16, {11, 5}, {5, 6}, {0, 5}, {0, 0} }, +}; + +struct simplefb_params { + u32 width; + u32 height; + u32 stride; + struct simplefb_format *format; +}; + +static int simplefb_parse_dt(struct platform_device *pdev, + struct simplefb_params *params) +{ + struct device_node *np = pdev->dev.of_node; + int ret; + const char *format; + int i; + + ret = of_property_read_u32(np, "width", &params->width); + if (ret) { + dev_err(&pdev->dev, "Can't parse width property\n"); + return ret; + } + + ret = of_property_read_u32(np, "height", &params->height); + if (ret) { + dev_err(&pdev->dev, "Can't parse height property\n"); + return ret; + } + + ret = of_property_read_u32(np, "stride", &params->stride); + if (ret) { + dev_err(&pdev->dev, "Can't parse stride property\n"); + return ret; + } + + ret = of_property_read_string(np, "format", &format); + if (ret) { + dev_err(&pdev->dev, "Can't parse format property\n"); + return ret; + } + params->format = NULL; + for (i = 0; i < ARRAY_SIZE(simplefb_formats); i++) { + if (strcmp(format, simplefb_formats[i].name)) + continue; + params->format = &simplefb_formats[i]; + break; + } + if (!params->format) { + dev_err(&pdev->dev, "Invalid format value\n"); + return -EINVAL; + } + + return 0; +} + +static int simplefb_probe(struct platform_device *pdev) +{ + int ret; + struct simplefb_params params; + struct fb_info *info; + struct resource *mem; + + if (fb_get_options("simplefb", NULL)) + return -ENODEV; + + ret = simplefb_parse_dt(pdev, &params); + if (ret) + return ret; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem) { + dev_err(&pdev->dev, "No memory resource\n"); + return -EINVAL; + } + + info = framebuffer_alloc(sizeof(u32) * 16, &pdev->dev); + if (!info) + return -ENOMEM; + platform_set_drvdata(pdev, info); + + info->fix = simplefb_fix; + info->fix.smem_start = mem->start; + info->fix.smem_len = resource_size(mem); + info->fix.line_length = params.stride; + + info->var = simplefb_var; + info->var.xres = params.width; +
info->var.yres = params.height; + info->var.xres_virtual = params.width; + info->var.yres_virtual = params.height; + info->var.bits_per_pixel = params.format->bits_per_pixel; + info->var.red = params.format->red; + info->var.green = params.format->green; + info->var.blue = params.format->blue; + info->var.transp = params.format->transp; + + info->fbops = &simplefb_ops; + info->flags = FBINFO_DEFAULT; + info->screen_base = devm_ioremap(&pdev->dev, info->fix.smem_start, + info->fix.smem_len); + if (!info->screen_base) { + framebuffer_release(info); + return -ENODEV; + } + info->pseudo_palette = (void *)(info + 1); + + ret = register_framebuffer(info); + if (ret < 0) { + dev_err(&pdev->dev, "Unable to register simplefb: %d\n", ret); + framebuffer_release(info); + return ret; + } + + dev_info(&pdev->dev, "fb%d: simplefb registered!\n", info->node); + + return 0; +} + +static int simplefb_remove(struct platform_device *pdev) +{ + struct fb_info *info = platform_get_drvdata(pdev); + + unregister_framebuffer(info); + framebuffer_release(info); + + return 0; +} + +static const struct of_device_id simplefb_of_match[] = { + { .compatible = "simple-framebuffer", }, + { }, +}; +MODULE_DEVICE_TABLE(of, simplefb_of_match); + +static struct platform_driver simplefb_driver = { + .driver = { + .name = "simple-framebuffer", + .owner = THIS_MODULE, + .of_match_table = simplefb_of_match, + }, + .probe = simplefb_probe, + .remove = simplefb_remove, +}; +module_platform_driver(simplefb_driver); + +MODULE_AUTHOR("Stephen Warren <swarren@wwwdotorg.org>"); +MODULE_DESCRIPTION("Simple framebuffer driver"); +MODULE_LICENSE("GPL v2"); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 0ad61c6a65a5..055562c580b4 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,6 +33,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/aio.h> #include <net/9p/9p.h> #include <net/9p/client.h> diff --git a/fs/afs/write.c b/fs/afs/write.c index 7e03eadb40c0..a890db4b9898 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/aio.h> #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, @@ -8,6 +8,8 @@ * * See ../COPYING for licensing terms. */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> @@ -18,14 +20,14 @@ #include <linux/backing-dev.h> #include <linux/uio.h> -#define DEBUG 0 - #include <linux/sched.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/bio.h> #include <linux/mmu_context.h> +#include <linux/percpu.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> @@ -35,15 +37,111 @@ #include <linux/eventfd.h> #include <linux/blkdev.h> #include <linux/compat.h> +#include <linux/percpu-refcount.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> -#if DEBUG > 1 -#define dprintk printk -#else -#define dprintk(x...) 
do { ; } while (0) -#endif +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define AIO_RING_PAGES 8 + +struct kioctx_cpu { + unsigned reqs_available; +}; + +struct kioctx { + struct percpu_ref users; + + /* This needs improving */ + unsigned long user_id; + struct hlist_node list; + + struct __percpu kioctx_cpu *cpu; + + /* + * For percpu reqs_available, number of slots we move to/from global + * counter at a time: + */ + unsigned req_batch; + /* + * This is what userspace passed to io_setup(), it's not used for + * anything but counting against the global max_reqs quota. + * + * The real limit is nr_events - 1, which will be larger (see + * aio_setup_ring()) + */ + unsigned max_reqs; + + /* Size of ringbuffer, in units of struct io_event */ + unsigned nr_events; + + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + long nr_pages; + + struct rcu_head rcu_head; + struct work_struct rcu_work; + + struct { + /* + * This counts the number of available slots in the ringbuffer, + * so we avoid overflowing it: it's decremented (if positive) + * when allocating a kiocb and incremented when the resulting + * io_event is pulled off the ringbuffer. + * + * We batch accesses to it with a percpu version. + */ + atomic_t reqs_available; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t ctx_lock; + struct list_head active_reqs; /* used for cancellation */ + } ____cacheline_aligned_in_smp; + + struct { + struct mutex ring_lock; + wait_queue_head_t wait; + + /* + * Copy of the real tail - to reduce cacheline bouncing. Updated + * by aio_complete() whenever it updates the real tail. + */ + unsigned shadow_tail; + } ____cacheline_aligned_in_smp; + + struct { + /* + * This is the canonical copy of the tail pointer, updated by + * aio_complete(). But aio_complete() also uses it as a lock, so + * other code can't use it; aio_complete() keeps shadow_tail in + * sync with the real value of the tail pointer for other code + * to use. + */ + unsigned tail; + } ____cacheline_aligned_in_smp; + + struct page *internal_pages[AIO_RING_PAGES]; +}; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); @@ -54,11 +152,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; -static struct workqueue_struct *aio_wq; - -static void aio_kick_handler(struct work_struct *); -static void aio_queue_work(struct kioctx *); - /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. 
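
The ring sizing used by aio_setup_ring() below follows from these definitions: the first page loses sizeof(struct aio_ring) bytes to the header, and every page otherwise holds PAGE_SIZE / sizeof(struct io_event) events. A sketch of the capacity calculation (illustrative only; with 4096-byte pages, the 32-byte struct io_event and the 32-byte header defined above, this works out to 127 events on the first page and 128 on each page after it):

  static unsigned ring_capacity(unsigned nr_pages)
  {
          /* bytes in the ring minus the header, in io_event slots */
          return (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
                  / sizeof(struct io_event);
  }
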
@@ -68,10 +161,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ - BUG_ON(!aio_wq); - - pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); return 0; } @@ -79,28 +169,23 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - struct aio_ring_info *info = &ctx->ring_info; long i; - for (i=0; i<info->nr_pages; i++) - put_page(info->ring_pages[i]); + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); - if (info->mmap_size) { - BUG_ON(ctx->mm != current->mm); - vm_munmap(info->mmap_base, info->mmap_size); - } + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (info->ring_pages && info->ring_pages != info->internal_pages) - kfree(info->ring_pages); - info->ring_pages = NULL; - info->nr = 0; + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + kfree(ctx->ring_pages); } static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; - struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; + struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; @@ -116,46 +201,44 @@ static int aio_setup_ring(struct kioctx *ctx) nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; - info->ring_pages = info->internal_pages; + ctx->nr_events = 0; + ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!info->ring_pages) + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) return -ENOMEM; } - info->mmap_size = nr_pages * PAGE_SIZE; - dprintk("attempting mmap of %lu bytes\n", info->mmap_size); - down_write(&ctx->mm->mmap_sem); - info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, - &populate); - if (IS_ERR((void *)info->mmap_base)) { - up_write(&ctx->mm->mmap_sem); - info->mmap_size = 0; + ctx->mmap_size = nr_pages * PAGE_SIZE; + pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); + ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + if (IS_ERR((void *)ctx->mmap_base)) { + up_write(&mm->mmap_sem); + ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } - dprintk("mmap address: 0x%08lx\n", info->mmap_base); - info->nr_pages = get_user_pages(current, ctx->mm, - info->mmap_base, nr_pages, - 1, 0, info->ring_pages, NULL); - up_write(&ctx->mm->mmap_sem); + pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, + 1, 0, ctx->ring_pages, NULL); + up_write(&mm->mmap_sem); - if (unlikely(info->nr_pages != nr_pages)) { + if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } if (populate) - mm_populate(info->mmap_base, populate); + mm_populate(ctx->mmap_base, populate); - ctx->user_id = info->mmap_base; + ctx->user_id = ctx->mmap_base; + ctx->nr_events = nr_events; /* trusted copy */ - info->nr = nr_events; /* trusted copy */ - - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = 
ctx->user_id; ring->head = ring->tail = 0; @@ -164,72 +247,145 @@ static int aio_setup_ring(struct kioctx *ctx) ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); return 0; } - -/* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(). Release the pointer with put_aio_ring_event(); - */ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ - __event += pos % AIO_EVENTS_PER_PAGE; \ - __event; \ -}) - -#define put_aio_ring_event(event) do { \ - struct io_event *__event = (event); \ - (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ -} while(0) - -static void ctx_rcu_free(struct rcu_head *head) +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +{ + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (!req->ki_list.next) + list_add(&req->ki_list, &ctx->active_reqs); + + req->ki_cancel = cancel; + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} +EXPORT_SYMBOL(kiocb_set_cancel_fn); + +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, + struct io_event *res) +{ + kiocb_cancel_fn *old, *cancel; + int ret = -EINVAL; + + /* + * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it + * actually has a cancel function, hence the cmpxchg() + */ + + cancel = ACCESS_ONCE(kiocb->ki_cancel); + do { + if (!cancel || cancel == KIOCB_CANCELLED) + return ret; + + old = cancel; + cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); + } while (cancel != old); + + atomic_inc(&kiocb->ki_users); + spin_unlock_irq(&ctx->ctx_lock); + + memset(res, 0, sizeof(*res)); + res->obj = (u64)(unsigned long)kiocb->ki_obj.user; + res->data = kiocb->ki_user_data; + ret = cancel(kiocb, res); + + spin_lock_irq(&ctx->ctx_lock); + + return ret; +} + +static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + + free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } -/* __put_ioctx - * Called when the last user of an aio context has gone away, - * and the struct needs to be freed. +/* + * When this function runs, the kioctx has been removed from the "hash table" + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - + * now it's safe to cancel any that need to be. 
*/ -static void __put_ioctx(struct kioctx *ctx) +static void free_ioctx(struct kioctx *ctx) { - unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); + struct aio_ring *ring; + struct io_event res; + struct kiocb *req; + unsigned cpu, head, avail; - cancel_delayed_work_sync(&ctx->wq); - aio_free_ring(ctx); - mmdrop(ctx->mm); - ctx->mm = NULL; - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); + spin_lock_irq(&ctx->ctx_lock); + + while (!list_empty(&ctx->active_reqs)) { + req = list_first_entry(&ctx->active_reqs, + struct kiocb, ki_list); + + list_del_init(&req->ki_list); + kiocb_cancel(ctx, req, &res); } - pr_debug("__put_ioctx: freeing %p\n", ctx); - call_rcu(&ctx->rcu_head, ctx_rcu_free); -} -static inline int try_get_ioctx(struct kioctx *kioctx) -{ - return atomic_inc_not_zero(&kioctx->users); + spin_unlock_irq(&ctx->ctx_lock); + + for_each_possible_cpu(cpu) { + struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); + + atomic_add(kcpu->reqs_available, &ctx->reqs_available); + kcpu->reqs_available = 0; + } + + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + while (atomic_read(&ctx->reqs_available) < ctx->nr_events - 1) { + wait_event(ctx->wait, + (head != ctx->shadow_tail) || + (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)); + + avail = (head <= ctx->shadow_tail + ? ctx->shadow_tail : ctx->nr_events) - head; + + atomic_add(avail, &ctx->reqs_available); + head += avail; + head %= ctx->nr_events; + } + + WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); + + aio_free_ring(ctx); + + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + pr_debug("freeing %p\n", ctx); + + /* + * Here the call_rcu() is between the wait_event() for reqs_active to + * hit 0, and freeing the ioctx. + * + * aio_complete() decrements reqs_active, but it has to touch the ioctx + * after to issue a wakeup so we use rcu. + */ + call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static inline void put_ioctx(struct kioctx *kioctx) +static void put_ioctx(struct kioctx *ctx) { - BUG_ON(atomic_read(&kioctx->users) <= 0); - if (unlikely(atomic_dec_and_test(&kioctx->users))) - __put_ioctx(kioctx); + if (percpu_ref_put(&ctx->users)) + free_ioctx(ctx); } /* ioctx_alloc @@ -237,10 +393,22 @@ static inline void put_ioctx(struct kioctx *kioctx) */ static struct kioctx *ioctx_alloc(unsigned nr_events) { - struct mm_struct *mm; + struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; + /* + * We keep track of the number of available ringbuffer slots, to prevent + * overflow (reqs_available), and we also use percpu counters for this. 
+ * + * So since up to half the slots might be on other cpu's percpu counters + * and unavailable, double nr_events so userspace sees what they + * expected: additionally, we move req_batch slots to/from percpu + * counters at a time, so make sure that isn't 0: + */ + nr_events = max(nr_events, num_possible_cpus() * 4); + nr_events *= 2; + /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || (nr_events > (0x10000000U / sizeof(struct kiocb)))) { @@ -256,21 +424,29 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-ENOMEM); ctx->max_reqs = nr_events; - mm = ctx->mm = current->mm; - atomic_inc(&mm->mm_count); - atomic_set(&ctx->users, 2); + percpu_ref_init(&ctx->users); + rcu_read_lock(); + percpu_ref_get(&ctx->users); + rcu_read_unlock(); + spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); + mutex_init(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); - INIT_LIST_HEAD(&ctx->run_list); - INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); - if (aio_setup_ring(ctx) < 0) + ctx->cpu = alloc_percpu(struct kioctx_cpu); + if (!ctx->cpu) goto out_freectx; + if (aio_setup_ring(ctx) < 0) + goto out_freepcpu; + + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); + BUG_ON(!ctx->req_batch); + /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > aio_max_nr || @@ -286,64 +462,58 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); spin_unlock(&mm->ioctx_lock); - dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", - ctx, ctx->user_id, current->mm, ctx->ring_info.nr); + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, mm, ctx->nr_events); return ctx; out_cleanup: err = -EAGAIN; aio_free_ring(ctx); +out_freepcpu: + free_percpu(ctx->cpu); out_freectx: - mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - dprintk("aio: error allocating ioctx %d\n", err); + pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } -/* kill_ctx - * Cancels all outstanding aio requests on an aio context. Used - * when the processes owning a context have all exited to encourage - * the rapid destruction of the kioctx. 
- */ -static void kill_ctx(struct kioctx *ctx) +static void kill_ioctx_work(struct work_struct *work) { - int (*cancel)(struct kiocb *, struct io_event *); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - struct io_event res; + struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - spin_lock_irq(&ctx->ctx_lock); - ctx->dead = 1; - while (!list_empty(&ctx->active_reqs)) { - struct list_head *pos = ctx->active_reqs.next; - struct kiocb *iocb = list_kiocb(pos); - list_del_init(&iocb->ki_list); - cancel = iocb->ki_cancel; - kiocbSetCancelled(iocb); - if (cancel) { - iocb->ki_users++; - spin_unlock_irq(&ctx->ctx_lock); - cancel(iocb, &res); - spin_lock_irq(&ctx->ctx_lock); - } - } + wake_up_all(&ctx->wait); + put_ioctx(ctx); +} - if (!ctx->reqs_active) - goto out; +static void kill_ioctx_rcu(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - add_wait_queue(&ctx->wait, &wait); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (ctx->reqs_active) { - spin_unlock_irq(&ctx->ctx_lock); - io_schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&ctx->ctx_lock); - } - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); + INIT_WORK(&ctx->rcu_work, kill_ioctx_work); + schedule_work(&ctx->rcu_work); +} -out: - spin_unlock_irq(&ctx->ctx_lock); +/* kill_ioctx + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static void kill_ioctx(struct kioctx *ctx) +{ + if (percpu_ref_kill(&ctx->users)) { + hlist_del_rcu(&ctx->list); + /* Between hlist_del_rcu() and dropping the initial ref */ + synchronize_rcu(); + + /* + * We can't punt to workqueue here because put_ioctx() -> + * free_ioctx() will unmap the ringbuffer, and that has to be + * done in the original process's context. kill_ioctx_rcu/work() + * exist for exit_aio(), as in that path free_ioctx() won't do + * the unmap. + */ + kill_ioctx_work(&ctx->rcu_work); + } } /* wait_on_sync_kiocb: @@ -351,9 +521,9 @@ out: ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { - while (iocb->ki_users) { + while (atomic_read(&iocb->ki_users)) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!iocb->ki_users) + if (!atomic_read(&iocb->ki_users)) break; io_schedule(); } @@ -362,28 +532,20 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) } EXPORT_SYMBOL(wait_on_sync_kiocb); -/* exit_aio: called when the last user of mm goes away. At this point, - * there is no way for any new requests to be submited or any of the - * io_* syscalls to be called on the context. However, there may be - * outstanding requests which hold references to the context; as they - * go away, they will call put_ioctx and release any pinned memory - * associated with the request (held via struct page * references). +/* + * exit_aio: called when the last user of mm goes away. At this point, there is + * no way for any new requests to be submitted or any of the io_* syscalls to be + * called on the context. + * + * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on + * them.
*/ void exit_aio(struct mm_struct *mm) { struct kioctx *ctx; + struct hlist_node *n; - while (!hlist_empty(&mm->ioctx_list)) { - ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); - hlist_del_rcu(&ctx->list); - - kill_ctx(ctx); - - if (1 != atomic_read(&ctx->users)) - printk(KERN_DEBUG - "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), ctx->dead, - ctx->reqs_active); + hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -391,150 +553,95 @@ void exit_aio(struct mm_struct *mm) * as indicator that it needs to unmap the area, * just set it to 0; aio_free_ring() is the only * place that uses ->mmap_size, so it's safe. - * That way we get all munmap done to current->mm - - * all other callers have ctx->mm == current->mm. */ - ctx->ring_info.mmap_size = 0; - put_ioctx(ctx); + ctx->mmap_size = 0; + + if (percpu_ref_kill(&ctx->users)) { + hlist_del_rcu(&ctx->list); + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + } } } -/* aio_get_req - * Allocate a slot for an aio request. Increments the users count - * of the kioctx so that the kioctx stays around until all requests are - * complete. Returns NULL if no requests are free. - * - * Returns with kiocb->users set to 2. The io submit code path holds - * an extra reference while submitting the i/o. - * This prevents races between the aio code path referencing the - * req (after submitting it) and aio_complete() freeing the req. - */ -static struct kiocb *__aio_get_req(struct kioctx *ctx) +static void put_reqs_available(struct kioctx *ctx, unsigned nr) { - struct kiocb *req = NULL; + struct kioctx_cpu *kcpu; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); - if (unlikely(!req)) - return NULL; + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); - req->ki_flags = 0; - req->ki_users = 2; - req->ki_key = 0; - req->ki_ctx = ctx; - req->ki_cancel = NULL; - req->ki_retry = NULL; - req->ki_dtor = NULL; - req->private = NULL; - req->ki_iovec = NULL; - INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = NULL; + kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { + kcpu->reqs_available -= ctx->req_batch; + atomic_add(ctx->req_batch, &ctx->reqs_available); + } - return req; + preempt_enable(); } -/* - * struct kiocb's are allocated in batches to reduce the number of - * times the ctx lock is acquired and released. - */ -#define KIOCB_BATCH_SIZE 32L -struct kiocb_batch { - struct list_head head; - long count; /* number of requests left to allocate */ -}; - -static void kiocb_batch_init(struct kiocb_batch *batch, long total) +static bool get_reqs_available(struct kioctx *ctx) { - INIT_LIST_HEAD(&batch->head); - batch->count = total; -} + struct kioctx_cpu *kcpu; + bool ret = false; -static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) -{ - struct kiocb *req, *n; + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); - if (list_empty(&batch->head)) - return; + if (!kcpu->reqs_available) { + int old, avail = atomic_read(&ctx->reqs_available); - spin_lock_irq(&ctx->ctx_lock); - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - list_del(&req->ki_list); - kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - } - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * Allocate a batch of kiocbs. 
This avoids taking and dropping the - * context lock a lot during setup. - */ -static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) -{ - unsigned short allocated, to_alloc; - long avail; - struct kiocb *req, *n; - struct aio_ring *ring; - - to_alloc = min(batch->count, KIOCB_BATCH_SIZE); - for (allocated = 0; allocated < to_alloc; allocated++) { - req = __aio_get_req(ctx); - if (!req) - /* allocation failed, go with what we've got */ - break; - list_add(&req->ki_batch, &batch->head); - } - - if (allocated == 0) - goto out; + do { + if (avail < ctx->req_batch) + goto out; - spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0]); - - avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; - BUG_ON(avail < 0); - if (avail < allocated) { - /* Trim back the number of requests. */ - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - kmem_cache_free(kiocb_cachep, req); - if (--allocated <= avail) - break; - } - } + old = avail; + avail = atomic_cmpxchg(&ctx->reqs_available, + avail, avail - ctx->req_batch); + } while (avail != old); - batch->count -= allocated; - list_for_each_entry(req, &batch->head, ki_batch) { - list_add(&req->ki_list, &ctx->active_reqs); - ctx->reqs_active++; + kcpu->reqs_available += ctx->req_batch; } - kunmap_atomic(ring); - spin_unlock_irq(&ctx->ctx_lock); - + ret = true; + kcpu->reqs_available--; out: - return allocated; + preempt_enable(); + return ret; } -static inline struct kiocb *aio_get_req(struct kioctx *ctx, - struct kiocb_batch *batch) +/* aio_get_req + * Allocate a slot for an aio request. Increments the ki_users count + * of the kioctx so that the kioctx stays around until all requests are + * complete. Returns NULL if no requests are free. + * + * Returns with kiocb->ki_users set to 2. The io submit code path holds + * an extra reference while submitting the i/o. + * This prevents races between the aio code path referencing the + * req (after submitting it) and aio_complete() freeing the req. + */ +static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (list_empty(&batch->head)) - if (kiocb_batch_refill(ctx, batch) == 0) - return NULL; - req = list_first_entry(&batch->head, struct kiocb, ki_batch); - list_del(&req->ki_batch); + if (!get_reqs_available(ctx)) + return NULL; + + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + if (unlikely(!req)) + goto out_put; + + atomic_set(&req->ki_users, 2); + req->ki_ctx = ctx; return req; +out_put: + put_reqs_available(ctx, 1); + return NULL; } -static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +static void kiocb_free(struct kiocb *req) { - assert_spin_locked(&ctx->ctx_lock); - + if (req->ki_filp) + fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) @@ -542,48 +649,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) if (req->ki_iovec != &req->ki_inline_vec) kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); } -/* __aio_put_req - * Returns true if this put was the last user of the request. 
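The get_reqs_available()/put_reqs_available() pair above amortizes contention on the shared reqs_available counter: each CPU pulls req_batch slots at a time into a local cache and returns excess slots in batches, so the shared atomic is touched once per batch rather than once per request. A single-threaded userspace model of that arithmetic, using C11 atomics in place of atomic_cmpxchg() (REQ_BATCH and the function names are made up for the sketch):

/* Userspace model of the reqs_available batching. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define REQ_BATCH 4

static atomic_uint reqs_available = 64; /* shared, per-kioctx in the kernel */
static unsigned int local_reqs;         /* per-CPU cache in the kernel */

static bool get_req(void)
{
        if (!local_reqs) {
                unsigned int avail = atomic_load(&reqs_available);

                do {
                        if (avail < REQ_BATCH)
                                return false;
                        /* on failure, avail is reloaded with the current value */
                } while (!atomic_compare_exchange_weak(&reqs_available, &avail,
                                                       avail - REQ_BATCH));
                local_reqs = REQ_BATCH;
        }
        local_reqs--;
        return true;
}

static void put_reqs(unsigned int nr)
{
        local_reqs += nr;
        while (local_reqs >= REQ_BATCH * 2) {   /* return excess in batches */
                local_reqs -= REQ_BATCH;
                atomic_fetch_add(&reqs_available, REQ_BATCH);
        }
}

int main(void)
{
        unsigned int got = 0;

        while (get_req())
                got++;
        printf("allocated %u requests, shared counter now %u\n",
               got, atomic_load(&reqs_available));
        put_reqs(got);
        printf("after release: local cache %u, shared counter %u\n",
               local_reqs, atomic_load(&reqs_available));
        return 0;
}

Note that, as in the kernel code, up to 2 * REQ_BATCH - 1 slots can sit stranded in a local cache; that is the deliberate price of touching the shared counter less often.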
- */ -static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +void aio_put_req(struct kiocb *req) { - dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", - req, atomic_long_read(&req->ki_filp->f_count)); - - assert_spin_locked(&ctx->ctx_lock); - - req->ki_users--; - BUG_ON(req->ki_users < 0); - if (likely(req->ki_users)) - return 0; - list_del(&req->ki_list); /* remove from active_reqs */ - req->ki_cancel = NULL; - req->ki_retry = NULL; - - fput(req->ki_filp); - req->ki_filp = NULL; - really_put_req(ctx, req); - return 1; -} - -/* aio_put_req - * Returns true if this put was the last user of the kiocb, - * false if the request is still in use. - */ -int aio_put_req(struct kiocb *req) -{ - struct kioctx *ctx = req->ki_ctx; - int ret; - spin_lock_irq(&ctx->ctx_lock); - ret = __aio_put_req(ctx, req); - spin_unlock_irq(&ctx->ctx_lock); - return ret; + if (atomic_dec_and_test(&req->ki_users)) + kiocb_free(req); } EXPORT_SYMBOL(aio_put_req); @@ -595,13 +666,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - /* - * RCU protects us against accessing freed memory but - * we have to be careful not to get a reference when the - * reference count already dropped to 0 (ctx->dead test - * is unreliable because of races). - */ - if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ + if (ctx->user_id == ctx_id) { + percpu_ref_get(&ctx->users); ret = ctx; break; } @@ -611,610 +677,332 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) return ret; } -/* - * Queue up a kiocb to be retried. Assumes that the kiocb - * has already been marked as kicked, and places it on - * the retry run list for the corresponding ioctx, if it - * isn't already queued. Returns 1 if it actually queued - * the kiocb (to tell the caller to activate the work - * queue to process it), or 0, if it found that it was - * already queued. - */ -static inline int __queue_kicked_iocb(struct kiocb *iocb) +static inline unsigned kioctx_ring_put(struct kioctx *ctx, struct kiocb *req, + unsigned tail) { - struct kioctx *ctx = iocb->ki_ctx; - - assert_spin_locked(&ctx->ctx_lock); + struct io_event *ev_page, *event; + unsigned pos = tail + AIO_EVENTS_OFFSET; - if (list_empty(&iocb->ki_run_list)) { - list_add_tail(&iocb->ki_run_list, - &ctx->run_list); - return 1; - } - return 0; -} + if (++tail >= ctx->nr_events) + tail = 0; -/* aio_run_iocb - * This is the core aio execution routine. It is - * invoked both for initial i/o submission and - * subsequent retries via the aio_kick_handler. - * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reacquired - * as needed during processing. - * - * Calls the iocb retry method (already setup for the - * iocb on initial submission) for operation specific - * handling, but takes care of most of common retry - * execution details for a given iocb. The retry method - * needs to be non-blocking as far as possible, to avoid - * holding up other iocbs waiting to be serviced by the - * retry kernel thread. - * - * The trickier parts in this code have to do with - * ensuring that only one retry instance is in progress - * for a given iocb at any time. Providing that guarantee - * simplifies the coding of individual aio operations as - * it avoids various potential races. 
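With ki_users now an atomic_t, aio_put_req() above no longer needs ctx_lock: whichever of the submit and completion paths drops the last reference frees the kiocb. A hedged userspace model of that pattern (struct req and the helpers are illustrative, not the kernel structures):

/* Userspace model of the new aio_put_req(): the request is freed by
 * whichever path drops the last reference, with no lock involved. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
        atomic_int users;
};

static struct req *req_alloc(void)
{
        struct req *req = malloc(sizeof(*req));

        if (req)
                atomic_init(&req->users, 2);    /* submit path + I/O each hold one */
        return req;
}

static void req_put(struct req *req)
{
        /* atomic_fetch_sub() returns the old value; 1 means we were last. */
        if (atomic_fetch_sub(&req->users, 1) == 1) {
                printf("last put, freeing\n");
                free(req);
        }
}

int main(void)
{
        struct req *req = req_alloc();

        if (!req)
                return 1;
        req_put(req);   /* completion drops its reference */
        req_put(req);   /* submit path drops the extra reference; frees */
        return 0;
}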
- */ -static ssize_t aio_run_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - ssize_t (*retry)(struct kiocb *); - ssize_t ret; + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; - if (!(retry = iocb->ki_retry)) { - printk("aio_run_iocb: iocb->ki_retry = NULL\n"); - return 0; - } + event->obj = (u64)(unsigned long)req->ki_obj.user; + event->data = req->ki_user_data; + event->res = req->ki_res; + event->res2 = req->ki_res2; - /* - * We don't want the next retry iteration for this - * operation to start until this one has returned and - * updated the iocb state. However, wait_queue functions - * can trigger a kick_iocb from interrupt context in the - * meantime, indicating that data is available for the next - * iteration. We want to remember that and enable the - * next retry iteration _after_ we are through with - * this one. - * - * So, in order to be able to register a "kick", but - * prevent it from being queued now, we clear the kick - * flag, but make the kick code *think* that the iocb is - * still on the run list until we are actually done. - * When we are done with this iteration, we check if - * the iocb was kicked in the meantime and if so, queue - * it up afresh. - */ + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); - kiocbClearKicked(iocb); + pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", + ctx, tail, req, req->ki_obj.user, req->ki_user_data, + req->ki_res, req->ki_res2); - /* - * This is so that aio_complete knows it doesn't need to - * pull the iocb off the run list (We can't just call - * INIT_LIST_HEAD because we don't want a kick_iocb to - * queue this on the run list yet) - */ - iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; - spin_unlock_irq(&ctx->ctx_lock); + return tail; +} - /* Quit retrying if the i/o has been cancelled */ - if (kiocbIsCancelled(iocb)) { - ret = -EINTR; - aio_complete(iocb, ret, 0); - /* must not access the iocb after this */ - goto out; - } +static inline unsigned kioctx_ring_lock(struct kioctx *ctx) +{ + unsigned tail; /* - * Now we are all set to call the retry method in async - * context. + * ctx->tail is both our lock and the canonical version of the tail + * pointer. */ - ret = retry(iocb); - - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { - /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. - */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(iocb, ret, 0); - } -out: - spin_lock_irq(&ctx->ctx_lock); - - if (-EIOCBRETRY == ret) { - /* - * OK, now that we are done with this iteration - * and know that there is more left to go, - * this is where we let go so that a subsequent - * "kick" can start the next iteration - */ + while ((tail = xchg(&ctx->tail, UINT_MAX)) == UINT_MAX) + cpu_relax(); - /* will make __queue_kicked_iocb succeed from here on */ - INIT_LIST_HEAD(&iocb->ki_run_list); - /* we must queue the next iteration ourselves, if it - * has already been kicked */ - if (kiocbIsKicked(iocb)) { - __queue_kicked_iocb(iocb); - - /* - * __queue_kicked_iocb will always return 1 here, because - * iocb->ki_run_list is empty at this point so it should - * be safe to unconditionally queue the context into the - * work queue. 
- */ - aio_queue_work(ctx); - } - } - return ret; + return tail; } -/* - * __aio_run_iocbs: - * Process all pending retries queued on the ioctx - * run list. - * Assumes it is operating within the aio issuer's mm - * context. - */ -static int __aio_run_iocbs(struct kioctx *ctx) +static inline void kioctx_ring_unlock(struct kioctx *ctx, unsigned tail) { - struct kiocb *iocb; - struct list_head run_list; + struct aio_ring *ring; - assert_spin_locked(&ctx->ctx_lock); + if (!ctx) + return; - list_replace_init(&ctx->run_list, &run_list); - while (!list_empty(&run_list)) { - iocb = list_entry(run_list.next, struct kiocb, - ki_run_list); - list_del(&iocb->ki_run_list); - /* - * Hold an extra reference while retrying i/o. - */ - iocb->ki_users++; /* grab extra reference */ - aio_run_iocb(iocb); - __aio_put_req(ctx, iocb); - } - if (!list_empty(&ctx->run_list)) - return 1; - return 0; -} + smp_wmb(); + /* make event visible before updating tail */ -static void aio_queue_work(struct kioctx * ctx) -{ - unsigned long timeout; - /* - * if someone is waiting, get the work started right - * away, otherwise, use a longer delay - */ - smp_mb(); - if (waitqueue_active(&ctx->wait)) - timeout = 1; - else - timeout = HZ/10; - queue_delayed_work(aio_wq, &ctx->wq, timeout); -} + ctx->shadow_tail = tail; -/* - * aio_run_all_iocbs: - * Process all pending retries queued on the ioctx - * run list, and keep running them until the list - * stays empty. - * Assumes it is operating within the aio issuer's mm context. - */ -static inline void aio_run_all_iocbs(struct kioctx *ctx) -{ - spin_lock_irq(&ctx->ctx_lock); - while (__aio_run_iocbs(ctx)) - ; - spin_unlock_irq(&ctx->ctx_lock); -} + ring = kmap_atomic(ctx->ring_pages[0]); + ring->tail = tail; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); -/* - * aio_kick_handler: - * Work queue handler triggered to process pending - * retries on an ioctx. Takes on the aio issuer's - * mm context before running the iocbs, so that - * copy_xxx_user operates on the issuer's address - * space. - * Run on aiod's context. 
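kioctx_ring_lock()/kioctx_ring_unlock() above make the tail pointer double as its own lock: UINT_MAX in the tail word means "held", xchg() claims the word and returns the real tail in a single step, and the unlock is simply the store of the new tail. A userspace sketch of the idea, with C11 release ordering standing in for the kernel's smp_wmb()/smp_mb() pairing (the names are illustrative):

/* Userspace model of the tail word doubling as a lock. */
#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint tail;

static unsigned int ring_lock(void)
{
        unsigned int t;

        /* UINT_MAX means another owner holds the tail; spin until we
         * exchange in the sentinel and get a real value back. */
        while ((t = atomic_exchange(&tail, UINT_MAX)) == UINT_MAX)
                ;       /* cpu_relax() in the kernel */
        return t;
}

static void ring_unlock(unsigned int new_tail)
{
        /* Publishing the new tail is the unlock. */
        atomic_store_explicit(&tail, new_tail, memory_order_release);
}

int main(void)
{
        unsigned int t = ring_lock();

        printf("locked, tail was %u\n", t);
        ring_unlock(t + 1);     /* publish one new event */
        printf("unlocked, tail now %u\n", atomic_load(&tail));
        return 0;
}

The design choice being modeled: no separate lock word means no extra cache line, and a reader of the tail that sees the sentinel knows a completion is mid-flight.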
- */ -static void aio_kick_handler(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, wq.work); - mm_segment_t oldfs = get_fs(); - struct mm_struct *mm; - int requeue; + /* unlock, make new tail visible before checking waitlist */ + smp_mb(); - set_fs(USER_DS); - use_mm(ctx->mm); - spin_lock_irq(&ctx->ctx_lock); - requeue =__aio_run_iocbs(ctx); - mm = ctx->mm; - spin_unlock_irq(&ctx->ctx_lock); - unuse_mm(mm); - set_fs(oldfs); - /* - * we're in a worker thread already; no point using non-zero delay - */ - if (requeue) - queue_delayed_work(aio_wq, &ctx->wq, 0); -} + ctx->tail = tail; + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} -/* - * Called by kick_iocb to queue the kiocb for retry - * and if required activate the aio work queue to process - * it - */ -static void try_queue_kicked_iocb(struct kiocb *iocb) +void batch_complete_aio(struct batch_complete *batch) { - struct kioctx *ctx = iocb->ki_ctx; + struct kioctx *ctx = NULL; + struct eventfd_ctx *eventfd = NULL; + struct rb_node *n; unsigned long flags; - int run = 0; + unsigned tail = 0; - spin_lock_irqsave(&ctx->ctx_lock, flags); - /* set this inside the lock so that we can't race with aio_run_iocb() - * testing it and putting the iocb on the run list under the lock */ - if (!kiocbTryKick(iocb)) - run = __queue_kicked_iocb(iocb); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - if (run) - aio_queue_work(ctx); -} - -/* - * kick_iocb: - * Called typically from a wait queue callback context - * to trigger a retry of the iocb. - * The retry is usually executed by aio workqueue - * threads (See aio_kick_handler). - */ -void kick_iocb(struct kiocb *iocb) -{ - /* sync iocbs are easy: they can only ever be executing from a - * single context. */ - if (is_sync_kiocb(iocb)) { - kiocbSetKicked(iocb); - wake_up_process(iocb->ki_obj.tsk); + if (RB_EMPTY_ROOT(&batch->kiocb)) return; - } - - try_queue_kicked_iocb(iocb); -} -EXPORT_SYMBOL(kick_iocb); - -/* aio_complete - * Called when the io request on the given iocb is complete. - * Returns true if this is the last user of the request. The - * only other user of the request can be the cancellation code. - */ -int aio_complete(struct kiocb *iocb, long res, long res2) -{ - struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring_info *info; - struct aio_ring *ring; - struct io_event *event; - unsigned long flags; - unsigned long tail; - int ret; /* - * Special case handling for sync iocbs: - * - events go directly into the iocb for fast handling - * - the sync task with the iocb in its stack holds the single iocb - * ref, no other paths have a way to get another ref - * - the sync task helpfully left a reference to itself in the iocb + * Take rcu_read_lock() in case the kioctx is being destroyed, as we + * need to issue a wakeup after incrementing reqs_available. */ - if (is_sync_kiocb(iocb)) { - BUG_ON(iocb->ki_users != 1); - iocb->ki_user_data = res; - iocb->ki_users = 0; - wake_up_process(iocb->ki_obj.tsk); - return 1; - } - - info = &ctx->ring_info; + rcu_read_lock(); + local_irq_save(flags); - /* add a completion event to the ring buffer. - * must be done holding ctx->ctx_lock to prevent - * other code from messing with the tail - * pointer since we might be called from irq - * context. 
- */ - spin_lock_irqsave(&ctx->ctx_lock, flags); + n = rb_first(&batch->kiocb); + while (n) { + struct kiocb *req = container_of(n, struct kiocb, ki_node); - if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) - list_del_init(&iocb->ki_run_list); + if (n->rb_right) { + n->rb_right->__rb_parent_color = n->__rb_parent_color; + n = n->rb_right; - /* - * cancelled requests don't get events, userland was given one - * when the event got cancelled. - */ - if (kiocbIsCancelled(iocb)) - goto put_rq; + while (n->rb_left) + n = n->rb_left; + } else { + n = rb_parent(n); + } - ring = kmap_atomic(info->ring_pages[0]); + if (unlikely(req->ki_eventfd != eventfd)) { + if (eventfd) { + /* Make event visible */ + kioctx_ring_unlock(ctx, tail); + ctx = NULL; - tail = info->tail; - event = aio_ring_event(info, tail); - if (++tail >= info->nr) - tail = 0; - - event->obj = (u64)(unsigned long)iocb->ki_obj.user; - event->data = iocb->ki_user_data; - event->res = res; - event->res2 = res2; + eventfd_signal(eventfd, 1); + eventfd_ctx_put(eventfd); + } - dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, - res, res2); + eventfd = req->ki_eventfd; + req->ki_eventfd = NULL; + } - /* after flagging the request as done, we - * must never even look at it again - */ - smp_wmb(); /* make event visible before updating tail */ + if (unlikely(req->ki_ctx != ctx)) { + kioctx_ring_unlock(ctx, tail); - info->tail = tail; - ring->tail = tail; + ctx = req->ki_ctx; + tail = kioctx_ring_lock(ctx); + } - put_aio_ring_event(event); - kunmap_atomic(ring); + tail = kioctx_ring_put(ctx, req, tail); + aio_put_req(req); + } - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + kioctx_ring_unlock(ctx, tail); + local_irq_restore(flags); + rcu_read_unlock(); /* * Check if the user asked us to deliver the result through an * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (iocb->ki_eventfd != NULL) - eventfd_signal(iocb->ki_eventfd, 1); + if (eventfd) { + eventfd_signal(eventfd, 1); + eventfd_ctx_put(eventfd); + } +} +EXPORT_SYMBOL(batch_complete_aio); -put_rq: - /* everything turned out well, dispose of the aiocb. */ - ret = __aio_put_req(ctx, iocb); +/* aio_complete_batch + * Called when the io request on the given iocb is complete; @batch may be + * NULL. + */ +void aio_complete_batch(struct kiocb *req, long res, long res2, + struct batch_complete *batch) +{ + req->ki_res = res; + req->ki_res2 = res2; + + if (req->ki_list.next) { + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&req->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + } /* - * We have to order our ring_info tail store above and test - * of the wait list below outside the wait lock. This is - * like in wake_up_bit() where clearing a bit has to be - * ordered with the unlocked test. 
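batch_complete_aio() above walks the rbtree in sorted order precisely so that runs of completions sharing a kioctx (or an eventfd) take the ring lock and signal the eventfd once per run rather than once per request. A userspace sketch of that grouping over a pre-sorted array, which stands in for the rbtree walk (struct completion and flush_batch are illustrative):

/* Userspace sketch of the grouping idea in batch_complete_aio(). */
#include <stddef.h>
#include <stdio.h>

struct completion {
        int ctx;        /* stands in for req->ki_ctx */
        int res;
};

static void flush_batch(const struct completion *c, size_t nr)
{
        int cur_ctx = -1;

        for (size_t i = 0; i < nr; i++) {
                if (c[i].ctx != cur_ctx) {
                        if (cur_ctx != -1)
                                printf("unlock ring of ctx %d\n", cur_ctx);
                        cur_ctx = c[i].ctx;
                        printf("lock ring of ctx %d\n", cur_ctx);
                }
                printf("  deliver res=%d\n", c[i].res);
        }
        if (cur_ctx != -1)
                printf("unlock ring of ctx %d\n", cur_ctx);
}

int main(void)
{
        /* Already sorted by ctx, as the rbtree walk would produce. */
        struct completion batch[] = {
                { .ctx = 1, .res = 512 },
                { .ctx = 1, .res = 4096 },
                { .ctx = 2, .res = -5 },
        };

        flush_batch(batch, sizeof(batch) / sizeof(batch[0]));
        return 0;
}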
+ * Special case handling for sync iocbs: + * - events go directly into the iocb for fast handling + * - the sync task with the iocb in its stack holds the single iocb + * ref, no other paths have a way to get another ref + * - the sync task helpfully left a reference to itself in the iocb */ - smp_mb(); + if (is_sync_kiocb(req)) { + BUG_ON(atomic_read(&req->ki_users) != 1); + req->ki_user_data = req->ki_res; + atomic_set(&req->ki_users, 0); + wake_up_process(req->ki_obj.tsk); + } else if (batch) { + int res; + struct kiocb *t; + struct rb_node **n = &batch->kiocb.rb_node, *parent = NULL; + + while (*n) { + parent = *n; + t = container_of(*n, struct kiocb, ki_node); + + res = req->ki_ctx != t->ki_ctx + ? req->ki_ctx < t->ki_ctx + : req->ki_eventfd != t->ki_eventfd + ? req->ki_eventfd < t->ki_eventfd + : req < t; + + n = res ? &(*n)->rb_left : &(*n)->rb_right; + } - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + rb_link_node(&req->ki_node, parent, n); + rb_insert_color(&req->ki_node, &batch->kiocb); + } else { + struct batch_complete batch_stack; - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - return ret; + memset(&req->ki_node, 0, sizeof(req->ki_node)); + batch_stack.kiocb.rb_node = &req->ki_node; + + batch_complete_aio(&batch_stack); + } } -EXPORT_SYMBOL(aio_complete); +EXPORT_SYMBOL(aio_complete_batch); -/* aio_read_evt - * Pull an event off of the ioctx's event ring. Returns the number of - * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. - * TODO: make the ringbuffer user mmap()able (requires FIXME). +/* aio_read_events + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched */ -static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +static long aio_read_events_ring(struct kioctx *ctx, + struct io_event __user *event, long nr) { - struct aio_ring_info *info = &ioctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; - - ring = kmap_atomic(info->ring_pages[0]); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; + unsigned head, pos; + long ret = 0; + int copy_ret; - spin_lock(&info->ring_lock); - - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head); - *ent = *evp; - head = (head + 1) % info->nr; - smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; - ret = 1; - put_aio_ring_event(evp); - } - spin_unlock(&info->ring_lock); + mutex_lock(&ctx->ring_lock); -out: - dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, - (unsigned long)ring->head, (unsigned long)ring->tail); + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; kunmap_atomic(ring); - return ret; -} - -struct aio_timeout { - struct timer_list timer; - int timed_out; - struct task_struct *p; -}; -static void timeout_func(unsigned long data) -{ - struct aio_timeout *to = (struct aio_timeout *)data; + pr_debug("h%u t%u m%u\n", head, ctx->shadow_tail, ctx->nr_events); - to->timed_out = 1; - wake_up_process(to->p); -} + if (head == ctx->shadow_tail) + goto out; -static inline void init_timeout(struct aio_timeout *to) -{ - setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); - to->timed_out = 0; - to->p = current; -} + while (ret < nr) { + long avail; + struct io_event *ev; + struct page *page; -static inline void set_timeout(long start_jiffies, struct aio_timeout *to, - const struct timespec *ts) -{ 
- to->timer.expires = start_jiffies + timespec_to_jiffies(ts); - if (time_after(to->timer.expires, jiffies)) - add_timer(&to->timer); - else - to->timed_out = 1; -} + avail = (head <= ctx->shadow_tail ? + ctx->shadow_tail : ctx->nr_events) - head; + if (head == ctx->shadow_tail) + break; -static inline void clear_timeout(struct aio_timeout *to) -{ - del_singleshot_timer_sync(&to->timer); -} + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - + ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); -static int read_events(struct kioctx *ctx, - long min_nr, long nr, - struct io_event __user *event, - struct timespec __user *timeout) -{ - long start_jiffies = jiffies; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - int ret; - int i = 0; - struct io_event ent; - struct aio_timeout to; - int retry = 0; - - /* needed to zero any padding within an entry (there shouldn't be - * any, but C is fun! - */ - memset(&ent, 0, sizeof(ent)); -retry: - ret = 0; - while (likely(i < nr)) { - ret = aio_read_evt(ctx, &ent); - if (unlikely(ret <= 0)) - break; + pos = head + AIO_EVENTS_OFFSET; + page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; + pos %= AIO_EVENTS_PER_PAGE; - dprintk("read event: %Lx %Lx %Lx %Lx\n", - ent.data, ent.obj, ent.res, ent.res2); + ev = kmap(page); + copy_ret = copy_to_user(event + ret, ev + pos, + sizeof(*ev) * avail); + kunmap(page); - /* Could we split the check in two? */ - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; + if (unlikely(copy_ret)) { + ret = -EFAULT; + goto out; } - ret = 0; - /* Good, event copied to userland, update counts. */ - event ++; - i ++; + ret += avail; + head += avail; + head %= ctx->nr_events; } - if (min_nr <= i) - return i; - if (ret) - return ret; - - /* End fast path */ + ring = kmap_atomic(ctx->ring_pages[0]); + ring->head = head; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); - /* racey check, but it gets redone */ - if (!retry && unlikely(!list_empty(&ctx->run_list))) { - retry = 1; - aio_run_all_iocbs(ctx); - goto retry; - } + pr_debug("%li h%u t%u\n", ret, head, ctx->shadow_tail); - init_timeout(&to); - if (timeout) { - struct timespec ts; - ret = -EFAULT; - if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - goto out; - - set_timeout(start_jiffies, &to, &ts); - } + put_reqs_available(ctx, ret); +out: + mutex_unlock(&ctx->ring_lock); - while (likely(i < nr)) { - add_wait_queue_exclusive(&ctx->wait, &wait); - do { - set_task_state(tsk, TASK_INTERRUPTIBLE); - ret = aio_read_evt(ctx, &ent); - if (ret) - break; - if (min_nr <= i) - break; - if (unlikely(ctx->dead)) { - ret = -EINVAL; - break; - } - if (to.timed_out) /* Only check after read evt */ - break; - /* Try to only show up in io wait if there are ops - * in flight */ - if (ctx->reqs_active) - io_schedule(); - else - schedule(); - if (signal_pending(tsk)) { - ret = -EINTR; - break; - } - /*ret = aio_read_evt(ctx, &ent);*/ - } while (1) ; + return ret; +} - set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); +static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, long *i) +{ + long ret = aio_read_events_ring(ctx, event + *i, nr - *i); - if (unlikely(ret <= 0)) - break; + if (ret > 0) + *i += ret; - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; - } + if 
(unlikely(percpu_ref_dead(&ctx->users))) + ret = -EINVAL; - /* Good, event copied to userland, update counts. */ - event ++; - i ++; - } + if (!*i) + *i = ret; - if (timeout) - clear_timeout(&to); -out: - destroy_timer_on_stack(&to.timer); - return i ? i : ret; + return ret < 0 || *i >= min_nr; } -/* Take an ioctx and remove it from the list of ioctx's. Protects - * against races with itself via ->dead. - */ -static void io_destroy(struct kioctx *ioctx) +static long read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, + struct timespec __user *timeout) { - struct mm_struct *mm = current->mm; - int was_dead; + ktime_t until = { .tv64 = KTIME_MAX }; + long ret = 0; - /* delete the entry from the list is someone else hasn't already */ - spin_lock(&mm->ioctx_lock); - was_dead = ioctx->dead; - ioctx->dead = 1; - hlist_del_rcu(&ioctx->list); - spin_unlock(&mm->ioctx_lock); + if (timeout) { + struct timespec ts; - dprintk("aio_release(%p)\n", ioctx); - if (likely(!was_dead)) - put_ioctx(ioctx); /* twice for the list */ + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + return -EFAULT; - kill_ctx(ioctx); + until = timespec_to_ktime(ts); + } /* - * Wake up any waiters. The setting of ctx->dead must be seen - * by other CPUs at this point. Right now, we rely on the - * locking done by the above calls to ensure this consistency. + * Note that aio_read_events() is being called as the conditional - i.e. + * we're calling it after prepare_to_wait() has set task state to + * TASK_INTERRUPTIBLE. + * + * But aio_read_events() can block, and if it blocks it's going to flip + * the task state back to TASK_RUNNING. + * + * This should be ok, provided it doesn't flip the state back to + * TASK_RUNNING and return 0 too much - that causes us to spin. That + * will only happen if the mutex_lock() call blocks, and we then find + * the ringbuffer empty. So in practice we should be ok, but it's + * something to be aware of when touching this code. 
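The new aio_read_events_ring() above consumes events between head and shadow_tail in contiguous runs, wrapping at the ring size, and publishes the new head back only after the copy-out. A userspace model of just the head/tail arithmetic (fixed-size int events stand in for struct io_event; there is no locking, page mapping, or copy_to_user here):

/* Userspace model of the ring consumption in aio_read_events_ring(). */
#include <stdio.h>

#define NR_EVENTS 8

static int ring[NR_EVENTS];
static unsigned int head, tail;         /* tail is ctx->shadow_tail in the kernel */

static long read_events(int *out, long nr)
{
        long ret = 0;

        while (ret < nr && head != tail) {
                /* Longest contiguous run before the buffer wraps. */
                unsigned int avail = (head <= tail ? tail : NR_EVENTS) - head;

                if (avail > nr - ret)
                        avail = nr - ret;
                for (unsigned int i = 0; i < avail; i++)
                        out[ret + i] = ring[head + i];
                ret += avail;
                head = (head + avail) % NR_EVENTS;
        }
        return ret;     /* the kernel writes head back to the ring page here */
}

int main(void)
{
        int out[NR_EVENTS];

        /* Producer left three events and wrapped past slot 7. */
        ring[6] = 1; ring[7] = 2; ring[0] = 3;
        head = 6; tail = 1;

        long n = read_events(out, NR_EVENTS);
        printf("read %ld events: %d %d %d\n", n, out[0], out[1], out[2]);
        return 0;
}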
*/ - wake_up_all(&ioctx->wait); + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), until); + + if (!ret && signal_pending(current)) + ret = -EINTR; + + return ret; } /* sys_io_setup: @@ -1252,7 +1040,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); } @@ -1270,7 +1058,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); return 0; } @@ -1301,30 +1089,21 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) BUG_ON(ret > 0 && iocb->ki_left == 0); } -static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - ssize_t (*rw_op)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); ssize_t ret = 0; - unsigned short opcode; - - if ((iocb->ki_opcode == IOCB_CMD_PREADV) || - (iocb->ki_opcode == IOCB_CMD_PREAD)) { - rw_op = file->f_op->aio_read; - opcode = IOCB_CMD_PREADV; - } else { - rw_op = file->f_op->aio_write; - opcode = IOCB_CMD_PWRITEV; - } /* This matches the pread()/pwrite() logic */ if (iocb->ki_pos < 0) return -EINVAL; - if (opcode == IOCB_CMD_PWRITEV) + if (rw == WRITE) file_start_write(file); do { ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], @@ -1336,9 +1115,9 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* retry all partial writes. retry partial reads as long as its a * regular file. */ } while (ret > 0 && iocb->ki_left > 0 && - (opcode == IOCB_CMD_PWRITEV || + (rw == WRITE || (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); - if (opcode == IOCB_CMD_PWRITEV) + if (rw == WRITE) file_end_write(file); /* This means we must have transferred all that we could */ @@ -1348,81 +1127,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* If we managed to write some out we return that, rather than * the eventual error. 
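The retry loop above encodes a policy worth spelling out: keep reissuing while forward progress is being made (always for writes, and for reads only on regular files), and if an error arrives after some bytes have already been transferred, report the byte count rather than the error. A userspace analogue of the same policy around plain write(2) (write_full is illustrative, not kernel code):

/* Userspace analogue of the partial-transfer retry policy. */
#include <errno.h>
#include <unistd.h>

static ssize_t write_full(int fd, const char *buf, size_t len)
{
        size_t done = 0;

        while (done < len) {
                ssize_t ret = write(fd, buf + done, len - done);

                if (ret > 0) {
                        done += ret;    /* partial write: retry the rest */
                        continue;
                }
                if (ret < 0 && errno == EINTR)
                        continue;       /* no progress, but retryable */
                /* Error: prefer reporting the bytes already written. */
                return done ? (ssize_t)done : ret;
        }
        return done;
}

int main(void)
{
        const char msg[] = "hello\n";

        return write_full(STDOUT_FILENO, msg, sizeof(msg) - 1) < 0;
}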
*/ - if (opcode == IOCB_CMD_PWRITEV - && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY + if (rw == WRITE + && ret < 0 && ret != -EIOCBQUEUED && iocb->ki_nbytes - iocb->ki_left) ret = iocb->ki_nbytes - iocb->ki_left; return ret; } -static ssize_t aio_fdsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 1); - return ret; -} - -static ssize_t aio_fsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 0); - return ret; -} - -static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) { ssize_t ret; + kiocb->ki_nr_segs = kiocb->ki_nbytes; + #ifdef CONFIG_COMPAT if (compat) - ret = compat_rw_copy_check_uvector(type, + ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); else #endif - ret = rw_copy_check_uvector(type, + ret = rw_copy_check_uvector(rw, (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); if (ret < 0) - goto out; - - ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); - if (ret < 0) - goto out; + return ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; - kiocb->ki_cur_seg = 0; - /* ki_nbytes/left now reflect bytes instead of segs */ + /* ki_nbytes now reflect bytes instead of segs */ kiocb->ki_nbytes = ret; - kiocb->ki_left = ret; - - ret = 0; -out: - return ret; + return 0; } -static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) { - int bytes; - - bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); - if (bytes < 0) - return bytes; + if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + return -EFAULT; kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = bytes; + kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; kiocb->ki_nr_segs = 1; - kiocb->ki_cur_seg = 0; return 0; } @@ -1431,96 +1178,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. 
*/ -static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, bool compat) { - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; + struct file *file = req->ki_filp; + ssize_t ret; + int rw; + fmode_t mode; + aio_rw_op *rw_op; - switch (kiocb->ki_opcode) { + switch (req->ki_opcode) { case IOCB_CMD_PREAD: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(READ, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; - case IOCB_CMD_PWRITE: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(WRITE, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; - break; case IOCB_CMD_PREADV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = aio_setup_vectored_rw(READ, kiocb, compat); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; + mode = FMODE_READ; + rw = READ; + rw_op = file->f_op->aio_read; + goto rw_common; + + case IOCB_CMD_PWRITE: case IOCB_CMD_PWRITEV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = aio_setup_vectored_rw(WRITE, kiocb, compat); + mode = FMODE_WRITE; + rw = WRITE; + rw_op = file->f_op->aio_write; + goto rw_common; +rw_common: + if (unlikely(!(file->f_mode & mode))) + return -EBADF; + + if (!rw_op) + return -EINVAL; + + ret = (req->ki_opcode == IOCB_CMD_PREADV || + req->ki_opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(rw, req, compat) + : aio_setup_single_vector(rw, req); if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; + return ret; + + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (ret < 0) + return ret; + + req->ki_nbytes = ret; + req->ki_left = ret; + + ret = aio_rw_vect_retry(req, rw, rw_op); break; + case IOCB_CMD_FDSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fdsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 1); break; + case IOCB_CMD_FSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 0); break; + default: - dprintk("EINVAL: io_submit: no operation provided\n"); - ret = -EINVAL; + pr_debug("EINVAL: no operation provided\n"); + return -EINVAL; } - if (!kiocb->ki_retry) - return ret; + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. 
+ */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || + ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(req, ret, 0); + } return 0; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, struct kiocb_batch *batch, - bool compat) + struct iocb *iocb, bool compat) { struct kiocb *req; - struct file *file; ssize_t ret; /* enforce forwards compatibility on users */ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { - pr_debug("EINVAL: io_submit: reserve field set\n"); + pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1534,16 +1280,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } - file = fget(iocb->aio_fildes); - if (unlikely(!file)) - return -EBADF; - - req = aio_get_req(ctx, batch); /* returns with 2 references to req */ - if (unlikely(!req)) { - fput(file); + req = aio_get_req(ctx); + if (unlikely(!req)) return -EAGAIN; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) { + ret = -EBADF; + goto out_put_req; } - req->ki_filp = file; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an @@ -1559,9 +1305,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } } - ret = put_user(req->ki_key, &user_iocb->aio_key); + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { - dprintk("EFAULT: aio_key\n"); + pr_debug("EFAULT: aio_key\n"); goto out_put_req; } @@ -1573,41 +1319,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_setup_iocb(req, compat); - + ret = aio_run_iocb(req, compat); if (ret) goto out_put_req; - spin_lock_irq(&ctx->ctx_lock); - /* - * We could have raced with io_destroy() and are currently holding a - * reference to ctx which should be destroyed. We cannot submit IO - * since ctx gets freed as soon as io_submit() puts its reference. The - * check here is reliable: io_destroy() sets ctx->dead before waiting - * for outstanding IO and the barrier between these two is realized by - * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we - * increment ctx->reqs_active before checking for ctx->dead and the - * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we - * don't see ctx->dead set here, io_destroy() waits for our IO to - * finish. 
- */ - if (ctx->dead) { - spin_unlock_irq(&ctx->ctx_lock); - ret = -EINVAL; - goto out_put_req; - } - aio_run_iocb(req); - if (!list_empty(&ctx->run_list)) { - /* drain the run list */ - while (__aio_run_iocbs(ctx)) - ; - } - spin_unlock_irq(&ctx->ctx_lock); - aio_put_req(req); /* drop extra ref to req */ return 0; - out_put_req: + put_reqs_available(ctx, 1); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; @@ -1620,7 +1339,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, long ret = 0; int i = 0; struct blk_plug plug; - struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1633,12 +1351,10 @@ long do_io_submit(aio_context_t ctx_id, long nr, ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { - pr_debug("EINVAL: io_submit: invalid context id\n"); + pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } - kiocb_batch_init(&batch, nr); - blk_start_plug(&plug); /* @@ -1659,13 +1375,12 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, compat); if (ret) break; } blk_finish_plug(&plug); - kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? i : ret; } @@ -1698,10 +1413,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, assert_spin_locked(&ctx->ctx_lock); + if (key != KIOCB_KEY) + return NULL; + /* TODO: use a hash or array, this sucks. */ list_for_each(pos, &ctx->active_reqs) { struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) + if (kiocb->ki_obj.user == iocb) return kiocb; } return NULL; @@ -1720,7 +1438,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1735,32 +1453,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb && kiocb->ki_cancel) { - cancel = kiocb->ki_cancel; - kiocb->ki_users ++; - kiocbSetCancelled(kiocb); - } else - cancel = NULL; + if (kiocb) + ret = kiocb_cancel(ctx, kiocb, &res); + else + ret = -EINVAL; + spin_unlock_irq(&ctx->ctx_lock); - if (NULL != cancel) { - struct io_event tmp; - pr_debug("calling cancel\n"); - memset(&tmp, 0, sizeof(tmp)); - tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; - tmp.data = kiocb->ki_user_data; - ret = cancel(kiocb, &tmp); - if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. - */ - if (copy_to_user(result, &tmp, sizeof(tmp))) - ret = -EFAULT; - } - } else - ret = -EINVAL; + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. 
+ */ + if (copy_to_user(result, &res, sizeof(res))) + ret = -EFAULT; + } put_ioctx(ctx); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index bce87694f7b0..89dec7f789a4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8a0b0efda44..53b6d624c7a9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -140,6 +140,25 @@ static int padzero(unsigned long elf_bss) #define ELF_BASE_PLATFORM NULL #endif +/* + * Use get_random_int() to implement AT_RANDOM while avoiding depletion + * of the entropy pool. + */ +static void get_atrandom_bytes(unsigned char *buf, size_t nbytes) +{ + unsigned char *p = buf; + + while (nbytes) { + unsigned int random_variable; + size_t chunk = min(nbytes, sizeof(random_variable)); + + random_variable = get_random_int(); + memcpy(p, &random_variable, chunk); + p += chunk; + nbytes -= chunk; + } +} + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -201,7 +220,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, /* * Generate 16 random bytes for userspace PRNG seeding. */ - get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes)); u_rand_bytes = (elf_addr_t __user *) STACK_ALLOC(p, sizeof(k_rand_bytes)); if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) @@ -738,8 +757,6 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Do this so that we can load the interpreter, if need be. We will change some of these later */ - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) { diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 8fb42916d8a2..69f6f802b09e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -510,7 +510,8 @@ static void bio_integrity_verify_fn(struct work_struct *work) * in process context. This function postpones completion * accordingly. 
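The get_atrandom_bytes() helper added to fs/binfmt_elf.c above fills the 16-byte AT_RANDOM seed from repeated 32-bit draws, copying min(remaining, sizeof(int)) bytes per iteration, so any buffer size works without draining the entropy pool. A userspace equivalent of the chunking (rand() stands in for get_random_int(); fill_random_bytes is illustrative and of course not a cryptographic source):

/* Userspace equivalent of the chunked fill in get_atrandom_bytes(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void fill_random_bytes(unsigned char *buf, size_t nbytes)
{
        unsigned char *p = buf;

        while (nbytes) {
                unsigned int random_variable = (unsigned int)rand();
                size_t chunk = nbytes < sizeof(random_variable) ?
                               nbytes : sizeof(random_variable);

                memcpy(p, &random_variable, chunk);
                p += chunk;
                nbytes -= chunk;
        }
}

int main(void)
{
        unsigned char seed[16]; /* AT_RANDOM is 16 bytes */

        fill_random_bytes(seed, sizeof(seed));
        for (size_t i = 0; i < sizeof(seed); i++)
                printf("%02x", seed[i]);
        printf("\n");
        return 0;
}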
*/ -void bio_integrity_endio(struct bio *bio, int error) +void bio_integrity_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct bio_integrity_payload *bip = bio->bi_integrity; @@ -19,6 +19,7 @@ #include <linux/swap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> @@ -27,6 +28,7 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> +#include <linux/aio.h> #include <scsi/sg.h> /* for struct sg_iovec */ #include <trace/events/block.h> @@ -759,7 +761,8 @@ struct submit_bio_ret { int error; }; -static void submit_bio_wait_endio(struct bio *bio, int error) +static void submit_bio_wait_endio(struct bio *bio, int error, + struct batch_complete *batch) { struct submit_bio_ret *ret = bio->bi_private; @@ -1413,7 +1416,8 @@ void bio_unmap_user(struct bio *bio) } EXPORT_SYMBOL(bio_unmap_user); -static void bio_map_kern_endio(struct bio *bio, int err) +static void bio_map_kern_endio(struct bio *bio, int err, + struct batch_complete *batch) { bio_put(bio); } @@ -1485,7 +1489,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, } EXPORT_SYMBOL(bio_map_kern); -static void bio_copy_kern_endio(struct bio *bio, int err) +static void bio_copy_kern_endio(struct bio *bio, int err, + struct batch_complete *batch) { struct bio_vec *bvec; const int read = bio_data_dir(bio) == READ; @@ -1684,31 +1689,40 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif -/** - * bio_endio - end I/O on a bio - * @bio: bio - * @error: error, if any - * - * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. 
- **/ -void bio_endio(struct bio *bio, int error) +static inline void __bio_endio(struct bio *bio, struct batch_complete *batch) { - if (error) + if (bio->bi_error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + bio->bi_error = -EIO; if (bio->bi_end_io) - bio->bi_end_io(bio, error); + bio->bi_end_io(bio, bio->bi_error, batch); +} + +void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch) +{ + if (error) + bio->bi_error = error; + + if (batch) + bio_list_add(&batch->bio, bio); + else + __bio_endio(bio, batch); + +} +EXPORT_SYMBOL(bio_endio_batch); + +void batch_complete(struct batch_complete *batch) +{ + struct bio *bio; + + while ((bio = bio_list_pop(&batch->bio))) + __bio_endio(bio, batch); + + batch_complete_aio(batch); } -EXPORT_SYMBOL(bio_endio); +EXPORT_SYMBOL(batch_complete); void bio_pair_release(struct bio_pair *bp) { @@ -1721,7 +1735,8 @@ void bio_pair_release(struct bio_pair *bp) } EXPORT_SYMBOL(bio_pair_release); -static void bio_pair_end_1(struct bio *bi, int err) +static void bio_pair_end_1(struct bio *bi, int err, + struct batch_complete *batch) { struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); @@ -1731,7 +1746,8 @@ static void bio_pair_end_1(struct bio *bi, int err) bio_pair_release(bp); } -static void bio_pair_end_2(struct bio *bi, int err) +static void bio_pair_end_2(struct bio *bi, int err, + struct batch_complete *batch) { struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); diff --git a/fs/block_dev.c b/fs/block_dev.c index e5571ea15673..61675d7b9828 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 18af6f48781a..3c617b3244c0 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -323,7 +323,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); -static void btrfsic_complete_bio_end_io(struct bio *bio, int err); +static void btrfsic_complete_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch); static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, @@ -336,7 +337,8 @@ static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status, + struct batch_complete *batch); static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, const struct btrfsic_block *block, @@ -1751,7 +1753,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, return block_ctx->len; } -static void btrfsic_complete_bio_end_io(struct bio *bio, int err) +static void btrfsic_complete_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } @@ -2294,7 +2297,8 @@ continue_loop: goto again; } -static void 
btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status, + struct batch_complete *batch) { struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; int iodone_w_error; @@ -2342,7 +2346,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) block = next_block; } while (NULL != block); - bp->bi_end_io(bp, bio_error_status); + bp->bi_end_io(bp, bio_error_status, batch); } static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 15b94089abc4..74ae115edba0 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -153,7 +153,8 @@ fail: * The compressed pages are freed here, and it must be run * in process context */ -static void end_compressed_bio_read(struct bio *bio, int err) +static void end_compressed_bio_read(struct bio *bio, int err, + struct batch_complete *batch) { struct compressed_bio *cb = bio->bi_private; struct inode *inode; @@ -263,7 +264,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start, * This also calls the writeback end hooks for the file pages so that * metadata and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct bio *bio, int err) +static void end_compressed_bio_write(struct bio *bio, int err, + struct batch_complete *batch) { struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d19a0a554aa..8e7250029f64 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -669,7 +669,8 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) return -EIO; /* we fixed nothing */ } -static void end_workqueue_bio(struct bio *bio, int err) +static void end_workqueue_bio(struct bio *bio, int err, + struct batch_complete *batch) { struct end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; @@ -2957,7 +2958,8 @@ static int write_dev_supers(struct btrfs_device *device, * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done */ -static void btrfs_end_empty_barrier(struct bio *bio, int err) +static void btrfs_end_empty_barrier(struct bio *bio, int err, + struct batch_complete *batch) { if (err) { if (err == -EOPNOTSUPP) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 73f2bfe3ac93..fbf0d44851e4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1937,7 +1937,8 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, return err; } -static void repair_io_failure_callback(struct bio *bio, int err) +static void repair_io_failure_callback(struct bio *bio, int err, + struct batch_complete *batch) { complete(bio->bi_private); } @@ -2317,7 +2318,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio, int err) +static void end_bio_extent_writepage(struct bio *bio, int err, + struct batch_complete *batch) { struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct extent_io_tree *tree; @@ -2363,7 +2365,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. 
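The bio_endio_batch()/batch_complete() pair introduced in fs/bio.c above defines the contract behind the long run of end_io signature changes in this and the following hunks: with a batch supplied, completion is only queued, and batch_complete() later runs every callback in one pass. A userspace sketch of that defer-then-flush contract, using a simple linked list in place of struct bio_list (the struct definitions here are simplified stand-ins, not the kernel's):

/* Userspace sketch of the bio batching contract. */
#include <stdio.h>

struct bio {
        int error;
        void (*end_io)(struct bio *bio);
        struct bio *next;
};

struct batch { struct bio *head; };

static void bio_endio_batch(struct bio *bio, int error, struct batch *batch)
{
        bio->error = error;
        if (batch) {            /* defer: just queue it */
                bio->next = batch->head;
                batch->head = bio;
        } else if (bio->end_io) {
                bio->end_io(bio);
        }
}

static void batch_complete(struct batch *batch)
{
        struct bio *bio;

        while ((bio = batch->head)) {
                batch->head = bio->next;
                if (bio->end_io)
                        bio->end_io(bio);
        }
}

static void my_end_io(struct bio *bio)
{
        printf("bio done, error=%d\n", bio->error);
}

int main(void)
{
        struct bio a = { .end_io = my_end_io }, b = { .end_io = my_end_io };
        struct batch batch = { .head = 0 };

        bio_endio_batch(&a, 0, &batch);
        bio_endio_batch(&b, -5, &batch);        /* -EIO in the kernel */
        batch_complete(&batch);                 /* both callbacks fire here */
        return 0;
}

This is why every driver and filesystem end_io callback in the remainder of the diff gains a struct batch_complete * argument even when it ignores it: the extra parameter is the plumbing that lets completions be deferred and flushed together.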
*/ -static void end_bio_extent_readpage(struct bio *bio, int err) +static void end_bio_extent_readpage(struct bio *bio, int err, + struct batch_complete *batch) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -3186,7 +3189,8 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } -static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +static void end_bio_extent_buffer_writepage(struct bio *bio, int err, + struct batch_complete *batch) { int uptodate = err == 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bb8b7a0e28a6..bc4d54c465a0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09c58a35b429..00313057e444 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> +#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -6913,7 +6914,8 @@ struct btrfs_dio_private { struct bio *orig_bio; }; -static void btrfs_endio_direct_read(struct bio *bio, int err) +static void btrfs_endio_direct_read(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_dio_private *dip = bio->bi_private; struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -6967,10 +6969,11 @@ failed: /* If we had a csum failure make sure to clear the uptodate flag */ if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + dio_end_io(bio, err, batch); } -static void btrfs_endio_direct_write(struct bio *bio, int err) +static void btrfs_endio_direct_write(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; @@ -7012,7 +7015,7 @@ out_done: /* If we had an error make sure to clear the uptodate flag */ if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + dio_end_io(bio, err, batch); } static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, @@ -7026,7 +7029,8 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } -static void btrfs_end_dio_bio(struct bio *bio, int err) +static void btrfs_end_dio_bio(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_dio_private *dip = bio->bi_private; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9a79fb790adb..6df1ac8d0adf 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -850,7 +850,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) * end io function used by finish_rmw. 
When we finally * get here, we've written a full stripe */ -static void raid_write_end_io(struct bio *bio, int err) +static void raid_write_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1384,7 +1385,8 @@ static void set_bio_pages_uptodate(struct bio *bio) * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid_rmw_end_io(struct bio *bio, int err) +static void raid_rmw_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1905,7 +1907,8 @@ cleanup_io: * This is called only for stripes we've read from disk to * reconstruct the parity. */ -static void raid_recover_end_io(struct bio *bio, int err) +static void raid_recover_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_raid_bio *rbio = bio->bi_private; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 85e072b956d5..fc29a119436a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -200,7 +200,8 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int is_metadata, int have_csum, const u8 *csum, u64 generation, u16 csum_size); -static void scrub_complete_bio_end_io(struct bio *bio, int err); +static void scrub_complete_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write); @@ -223,7 +224,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, int force, u64 physical_for_dev_replace); -static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, @@ -240,7 +242,8 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); -static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); @@ -1386,7 +1389,8 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, sblock->checksum_error = 1; } -static void scrub_complete_bio_end_io(struct bio *bio, int err) +static void scrub_complete_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } @@ -1586,7 +1590,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) btrfsic_submit_bio(WRITE, sbio->bio); } -static void scrub_wr_bio_end_io(struct bio *bio, int err) +static void scrub_wr_bio_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; @@ -2056,7 +2061,8 @@ leave_nomem: return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_bio_end_io(struct bio *bio, int err, + struct 
batch_complete *batch) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 678977226570..0182898a6c88 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5029,7 +5029,8 @@ static unsigned int extract_stripe_index_from_bio_private(void *bi_private) return (unsigned int)((uintptr_t)bi_private) & 3; } -static void btrfs_end_bio(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err, + struct batch_complete *batch) { struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); int is_orig_bio = 0; @@ -5086,7 +5087,7 @@ static void btrfs_end_bio(struct bio *bio, int err) } kfree(bbio); - bio_endio(bio, err); + bio_endio_batch(bio, err, batch); } else if (!is_orig_bio) { bio_put(bio); } diff --git a/fs/buffer.c b/fs/buffer.c index d2a4d1bb2d57..c41042274ac7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2882,7 +2882,8 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, } EXPORT_SYMBOL(generic_block_bmap); -static void end_bio_bh_io_sync(struct bio *bio, int err) +static void end_bio_bh_io_sync(struct bio *bio, int err, + struct batch_complete *batch) { struct buffer_head *bh = bio->bi_private; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d70830c66833..656e16907430 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,6 +7,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "super.h" #include "mds_client.h" diff --git a/fs/compat.c b/fs/compat.c index 93f7d021b716..fc3b55dce184 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -47,6 +47,7 @@ #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> diff --git a/fs/direct-io.c b/fs/direct-io.c index 4925c1b51abb..b4dd97c8cea3 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,6 +37,7 @@ #include <linux/uio.h> #include <linux/atomic.h> #include <linux/prefetch.h> +#include <linux/aio.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -229,7 +230,8 @@ static inline struct page *dio_get_page(struct dio *dio, * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, + bool is_async, struct batch_complete *batch) { ssize_t transferred = 0; @@ -263,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is } else { inode_dio_done(dio->inode); if (is_async) - aio_complete(dio->iocb, ret, 0); + aio_complete_batch(dio->iocb, ret, 0, batch); } return ret; @@ -273,7 +275,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. 
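
Every handler converted in these hunks follows the same recipe: the bio_end_io_t grows a struct batch_complete * argument, which is either ignored or, as in btrfs_end_bio() above, forwarded through bio_endio_batch() when a parent bio is ended. A minimal sketch of the forwarding case, with hypothetical names (my_ctx, my_end_io) and the 3.9-era bio fields used throughout this diff:

	struct my_ctx {				/* hypothetical per-I/O context */
		atomic_t pending;		/* sub-bios still in flight */
		struct bio *orig_bio;		/* parent bio to complete */
	};

	static void my_end_io(struct bio *bio, int err,
			      struct batch_complete *batch)
	{
		struct my_ctx *ctx = bio->bi_private;

		if (err)
			clear_bit(BIO_UPTODATE, &bio->bi_flags);

		if (atomic_dec_and_test(&ctx->pending))
			/* Forward the batch so completions can be coalesced. */
			bio_endio_batch(ctx->orig_bio, err, batch);
		else
			bio_put(bio);
	}
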
*/ -static void dio_bio_end_aio(struct bio *bio, int error) +static void dio_bio_end_aio(struct bio *bio, int error, + struct batch_complete *batch) { struct dio *dio = bio->bi_private; unsigned long remaining; @@ -289,7 +292,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - dio_complete(dio, dio->iocb->ki_pos, 0, true); + dio_complete(dio, dio->iocb->ki_pos, 0, true, batch); kmem_cache_free(dio_cache, dio); } } @@ -323,12 +326,12 @@ static void dio_bio_end_io(struct bio *bio, int error) * so that the DIO specific endio actions are dealt with after the filesystem * has done it's completion work. */ -void dio_end_io(struct bio *bio, int error) +void dio_end_io(struct bio *bio, int error, struct batch_complete *batch) { struct dio *dio = bio->bi_private; if (dio->is_async) - dio_bio_end_aio(bio, error); + dio_bio_end_aio(bio, error, batch); else dio_bio_end_io(bio, error); } @@ -349,10 +352,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_bdev = bdev; bio->bi_sector = first_sector; - if (dio->is_async) - bio->bi_end_io = dio_bio_end_aio; - else - bio->bi_end_io = dio_bio_end_io; + bio->bi_end_io = dio_end_io; sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; @@ -1267,7 +1267,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, offset, retval, false); + retval = dio_complete(dio, offset, retval, false, NULL); kmem_cache_free(dio_cache, dio); } else BUG_ON(retval != -EIOCBQUEUED); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index c00e055b6282..f23d2a7ed438 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -58,6 +58,8 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, if (ret) return ret; if (write) { + printk(KERN_NOTICE "%s (%d): dropped kernel caches: %d\n", + current->comm, task_pid_nr(current), sysctl_drop_caches); if (sysctl_drop_caches & 1) iterate_supers(drop_pagecache_sb, NULL); if (sysctl_drop_caches & 2) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 63b1f54b6a1f..201f0a0d6b0a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> +#include <linux/aio.h> #include "ecryptfs_kernel.h" /** diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fe60cc1117d8..0a87bb10998d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,6 +31,7 @@ #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext2.h" #include "acl.h" #include "xip.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d706dbfa6220..23c712825640 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,6 +27,7 @@ #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1bc4523541d6..b1b4d51b5d86 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/aio.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include "ext4.h" diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 98be6f697463..b8d5d351e24f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,6 +20,7 @@ * (sct@redhat.com), 1993, 1998 */ +#include <linux/aio.h> #include 
"ext4_jbd2.h" #include "truncate.h" #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 793d44b84d7f..0723774bdfb5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/aio.h> #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5929cd0baa20..0f5670970915 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,6 +18,7 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> @@ -257,7 +258,8 @@ static void buffer_io_error(struct buffer_head *bh) (unsigned long long)bh->b_blocknr); } -static void ext4_end_bio(struct bio *bio, int error) +static void ext4_end_bio(struct bio *bio, int error, + struct batch_complete *batch) { ext4_io_end_t *io_end = bio->bi_private; struct inode *inode; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2db9380f5dda..d5038e5688f6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> @@ -329,7 +330,7 @@ repeat: return page; } -static void read_end_io(struct bio *bio, int err) +static void read_end_io(struct bio *bio, int err, struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d8e84e49a5c3..e36793c00caf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -633,7 +633,8 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -static void f2fs_end_io_write(struct bio *bio, int err) +static void f2fs_end_io_write(struct bio *bio, int err, + struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; diff --git a/fs/fat/file.c b/fs/fat/file.c index b0b632e50ddb..73264395f705 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -17,8 +17,11 @@ #include <linux/blkdev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -140,6 +143,22 @@ static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, static int fat_file_release(struct inode *inode, struct file *filp) { + + struct super_block *sb = inode->i_sb; + loff_t mmu_private_ideal; + + /* + * Release unwritten fallocated blocks on file release. + * Do this only when the last open file descriptor is closed. 
+ */ + mutex_lock(&inode->i_mutex); + mmu_private_ideal = round_up(inode->i_size, sb->s_blocksize); + + if (mmu_private_ideal < MSDOS_I(inode)->mmu_private && + filp->f_dentry->d_count == 1) + fat_truncate_blocks(inode, inode->i_size); + mutex_unlock(&inode->i_mutex); + if ((filp->f_mode & FMODE_WRITE) && MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); @@ -174,6 +193,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -212,6 +232,88 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from the sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. The + * allocated clusters are freed in fat_file_release(). + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int cluster, fclus, dclus; + int nr_cluster; /* Number of clusters to be allocated */ + loff_t nr_bytes; /* Number of bytes to be allocated */ + loff_t free_bytes; /* Unused bytes in the last cluster of file */ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if ((offset + len) <= MSDOS_I(inode)->mmu_private) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): Blocks already allocated"); + err = -EINVAL; + goto error; + } + + if (mode & FALLOC_FL_KEEP_SIZE) { + /* First compute the number of clusters to be allocated */ + if (inode->i_size > 0) { + err = fat_get_cluster(inode, FAT_ENT_EOF, + &fclus, &dclus); + if (err < 0) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_get_cluster() error"); + goto error; + } + free_bytes = ((fclus + 1) << sbi->cluster_bits) - + inode->i_size; + nr_bytes = offset + len - inode->i_size - free_bytes; + MSDOS_I(inode)->mmu_private = (fclus + 1) << + sbi->cluster_bits; + } else + nr_bytes = offset + len - inode->i_size; + + nr_cluster = (nr_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation. We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_alloc_clusters() error"); + goto error; + } + err = fat_chain_add(inode, cluster, 1); + if (err) { + fat_free_clusters(inode, cluster); + goto error; + } + MSDOS_I(inode)->mmu_private += sbi->cluster_size; + } + } else { + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_cont_expand() error"); + } + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { @@ -378,6 +480,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; unsigned int ia_valid; int error; + loff_t mmu_private_ideal; + + mmu_private_ideal = round_up(inode->i_size, dentry->d_sb->s_blocksize); /* Check for setting the inode time.
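
The cluster arithmetic in fat_fallocate() is easy to verify by hand; a self-contained check with assumed geometry (4 KiB clusters, so cluster_bits = 12; a 10000-byte file spanning three clusters; a 20000-byte request from offset 0):

	/* Worked example of the round-up-to-clusters math used above. */
	#include <assert.h>

	int main(void)
	{
		long long cluster_bits = 12, cluster_size = 1 << 12;
		long long i_size = 10000, fclus = 2;	/* last cluster index */
		long long offset = 0, len = 20000;
		long long free_bytes, nr_bytes, nr_cluster;

		/* Slack in the tail cluster: 3 * 4096 - 10000 = 2288. */
		free_bytes = ((fclus + 1) << cluster_bits) - i_size;
		/* Bytes that still need backing clusters: 7712. */
		nr_bytes = offset + len - i_size - free_bytes;
		/* Round up: (7712 + 4095) >> 12 = 2 new clusters. */
		nr_cluster = (nr_bytes + (cluster_size - 1)) >> cluster_bits;
		assert(nr_cluster == 2);
		return 0;
	}
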
*/ ia_valid = attr->ia_valid; @@ -403,7 +508,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { inode_dio_wait(inode); - if (attr->ia_size > inode->i_size) { + if (attr->ia_size > inode->i_size && + MSDOS_I(inode)->mmu_private <= mmu_private_ideal) { error = fat_cont_expand(inode, attr->ia_size); if (error || attr->ia_valid == ATTR_SIZE) goto out; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4ff901632b26..3300da0ce0e5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -19,6 +19,7 @@ #include <linux/mpage.h> #include <linux/buffer_head.h> #include <linux/mount.h> +#include <linux/aio.h> #include <linux/vfs.h> #include <linux/parser.h> #include <linux/uio.h> @@ -151,11 +152,65 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) } } +static int fat_zero_falloc_area(struct file *file, + struct address_space *mapping, loff_t pos) +{ + struct page *page; + struct inode *inode = mapping->host; + loff_t curpos = i_size_read(inode); + size_t count = pos - curpos; + int err; + + do { + unsigned offset; + size_t bytes; + void *fsdata; + + offset = (curpos & (PAGE_CACHE_SIZE - 1)); + bytes = PAGE_CACHE_SIZE - offset; + bytes = min(bytes, count); + + err = pagecache_write_begin(NULL, mapping, curpos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + break; + + zero_user(page, offset, bytes); + + err = pagecache_write_end(NULL, mapping, curpos, bytes, bytes, + page, fsdata); + if (err < 0) + break; + curpos += bytes; + count -= bytes; + err = 0; + } while (count); + + return err; +} + static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int err; + loff_t mmu_private_ideal, mmu_private_actual; + loff_t size; + struct inode *inode = mapping->host; + struct super_block *sb = inode->i_sb; + + size = i_size_read(inode); + mmu_private_actual = MSDOS_I(inode)->mmu_private; + mmu_private_ideal = round_up(size, sb->s_blocksize); + if ((mmu_private_actual > mmu_private_ideal) && (pos > size)) { + err = fat_zero_falloc_area(file, mapping, pos); + if (err) { + fat_msg(sb, KERN_ERR, + "Error (%d) zeroing fallocated area", err); + return err; + } + } *pagep = NULL; err = cont_write_begin(file, mapping, pos, len, flags, diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b3aaf7b3578b..aef34b1e635e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,6 +38,7 @@ #include <linux/device.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/aio.h> #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/list.h> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a6c1664e330b..1d55f9465400 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,6 +19,7 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> +#include <linux/aio.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4655e59d545b..d1c9b85b3f58 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/compat.h> #include <linux/swap.h> +#include <linux/aio.h> static const struct file_operations fuse_direct_io_file_operations; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9883694f1e7c..0bad69ed6336 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,6 +20,7 @@ #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> +#include <linux/aio.h> #include "gfs2.h" #include 
"incore.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d79c2dadc536..acd16764b133 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,6 +25,7 @@ #include <asm/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index c5fa758fd844..91a5ebb614ca 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -200,7 +200,8 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, * */ -static void gfs2_end_log_write(struct bio *bio, int error) +static void gfs2_end_log_write(struct bio *bio, int error, + struct batch_complete *batch) { struct gfs2_sbd *sdp = bio->bi_private; struct bio_vec *bvec; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 60ede2a0f43f..86eb657aeaca 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -155,7 +155,8 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } -static void end_bio_io_page(struct bio *bio, int error) +static void end_bio_io_page(struct bio *bio, int error, + struct batch_complete *batch) { struct page *page = bio->bi_private; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 716e1aafb2e2..f9299d8a64e3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfs_fs.h" #include "btree.h" diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7faaa964968e..f833d35630ab 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b51a6079108d..96375a5124b2 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -24,7 +24,8 @@ struct hfsplus_wd { u16 embed_count; }; -static void hfsplus_end_io_sync(struct bio *bio, int err) +static void hfsplus_end_io_sync(struct bio *bio, int err, + struct batch_complete *batch) { if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 77554b61d124..730f24e282a6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index c57499dca89c..b02926c80b84 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2153,7 +2153,7 @@ static void lbmStartIO(struct lbuf * bp) /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_size = 0; - lbmIODone(bio, 0); + lbmIODone(bio, 0, NULL); } else { submit_bio(WRITE_SYNC, bio); INCREMENT(lmStat.submitted); @@ -2191,7 +2191,7 @@ static int lbmIOWait(struct lbuf * bp, int flag) * * executed at INTIODONE level */ -static void lbmIODone(struct bio *bio, int error) +static void lbmIODone(struct bio *bio, int error, struct batch_complete *batch) { struct lbuf *bp = bio->bi_private; struct lbuf *nextbp, *tail; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 6740d34cd82b..6ba675782e9f 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -283,7 +283,8 @@ static void last_read_complete(struct page *page) unlock_page(page); } -static void metapage_read_end_io(struct bio *bio, int err) +static void 
metapage_read_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct page *page = bio->bi_private; @@ -338,7 +339,8 @@ static void last_write_complete(struct page *page) end_page_writeback(page); } -static void metapage_write_end_io(struct bio *bio, int err) +static void metapage_write_end_io(struct bio *bio, int err, + struct batch_complete *batch) { struct page *page = bio->bi_private; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 550475ca6a0e..0ae2254f74bf 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -14,7 +14,8 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static void request_complete(struct bio *bio, int err) +static void request_complete(struct bio *bio, int err, + struct batch_complete *batch) { complete((struct completion *)bio->bi_private); } @@ -64,7 +65,8 @@ static int bdev_readpage(void *_sb, struct page *page) static DECLARE_WAIT_QUEUE_HEAD(wq); -static void writeseg_end_io(struct bio *bio, int err) +static void writeseg_end_io(struct bio *bio, int err, + struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -168,7 +170,7 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) } -static void erase_end_io(struct bio *bio, int err) +static void erase_end_io(struct bio *bio, int err, struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct super_block *sb = bio->bi_private; diff --git a/fs/mpage.c b/fs/mpage.c index 0face1c4d4c6..a4089bbfee0a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -41,7 +41,7 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. 
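
mpage_end_io() and the logfs/f2fs handlers above share one shape, which the new signature leaves untouched: sample BIO_UPTODATE once, then walk the bio_vec array backwards, propagating the result into each page. Condensed into a read-side sketch with a hypothetical handler name:

	static void example_read_end_io(struct bio *bio, int err,
					struct batch_complete *batch)
	{
		const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
		struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

		do {
			struct page *page = bvec->bv_page;

			if (--bvec >= bio->bi_io_vec)
				prefetchw(&bvec->bv_page->flags);
			if (uptodate)
				SetPageUptodate(page);
			else
				SetPageError(page);
			unlock_page(page);
		} while (bvec >= bio->bi_io_vec);

		bio_put(bio);
	}
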
*/ -static void mpage_end_io(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio, int err, struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 434b93ec0970..76cf69557051 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -143,7 +143,7 @@ bl_submit_bio(int rw, struct bio *bio) static struct bio *bl_alloc_init_bio(int npg, sector_t isect, struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), + bio_end_io_t *end_io, struct parallel_io *par) { struct bio *bio; @@ -167,7 +167,7 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), + bio_end_io_t *end_io, struct parallel_io *par, unsigned int offset, int len) { @@ -190,7 +190,7 @@ retry: static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), + bio_end_io_t *end_io, struct parallel_io *par) { return do_add_page_to_bio(bio, npg, rw, isect, page, be, @@ -198,7 +198,8 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, } /* This is basically copied from mpage_end_io_read */ -static void bl_end_io_read(struct bio *bio, int err) +static void bl_end_io_read(struct bio *bio, int err, + struct batch_complete *batch) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -380,7 +381,8 @@ static void mark_extents_written(struct pnfs_block_layout *bl, } } -static void bl_end_io_write_zero(struct bio *bio, int err) +static void bl_end_io_write_zero(struct bio *bio, int err, + struct batch_complete *batch) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -408,7 +410,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err) put_parallel(par); } -static void bl_end_io_write(struct bio *bio, int err) +static void bl_end_io_write(struct bio *bio, int err, + struct batch_complete *batch) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -487,7 +490,7 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) } static void -bl_read_single_end_io(struct bio *bio, int error) +bl_read_single_end_io(struct bio *bio, int error, struct batch_complete *batch) { struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf02f5530713..689fb608648e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -25,7 +25,7 @@ #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/writeback.h> -#include <linux/uio.h> +#include <linux/aio.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc9a913784ab..680b65b8a74d 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,7 +338,8 @@ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) /* * BIO operations */ -static void nilfs_end_bio_write(struct bio *bio, int err) +static void nilfs_end_bio_write(struct bio *bio, int err, + struct batch_complete *batch) { const 
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da4b81e6f76..c5670b8d198c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -27,6 +27,7 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/writeback.h> +#include <linux/aio.h> #include <asm/page.h> #include <asm/uaccess.h> diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d3e118cc6ffa..2778b0255dc6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,6 +28,7 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/log2.h> +#include <linux/aio.h> #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index ffb2da370a99..f671e49beb34 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,6 +22,8 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H +#include <linux/aio.h> + handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 8c3318bf2252..0cc19d0417bf 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -372,8 +372,8 @@ static void o2hb_wait_on_io(struct o2hb_region *reg, wait_for_completion(&wc->wc_io_complete); } -static void o2hb_bio_end_io(struct bio *bio, - int error) +static void o2hb_bio_end_io(struct bio *bio, int error, + struct batch_complete *batch) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 12ae194ac943..3a44a648dae7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags, subclass, _RET_IP_); if (status < 0) { - if (status != -EAGAIN && status != -EIOCBRETRY) + if (status != -EAGAIN) mlog_errno(status); goto bail; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 88924a3133fa..c765bdf6d60e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -28,6 +28,8 @@ #include "extent_map.h" +struct iocb; + /* OCFS2 Inode Private Data */ struct ocfs2_inode_info { diff --git a/fs/pipe.c b/fs/pipe.c index a029a14bacf1..d2c45e14e6d8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,7 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/ioctls.h> diff --git a/fs/proc/base.c b/fs/proc/base.c index dd51e50001fe..4d3ebd6fd710 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2673,6 +2673,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap2", S_IRUGO, proc_pagemap2_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -3029,6 +3030,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_tid_smaps_operations), REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap2", S_IRUGO, proc_pagemap2_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d600fb098b6a..f417d4398b68 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -290,6 +290,7 
@@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_tid_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_pagemap2_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3e636d864d56..3240a497926c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -688,10 +688,41 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_LAST, +}; + +struct clear_refs_private { + struct vm_area_struct *vma; + enum clear_refs_types type; +}; + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -706,6 +737,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(ptent)) continue; + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -719,10 +755,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -730,7 +762,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - int type; + enum clear_refs_types type; + int itype; int rv; memset(buffer, 0, sizeof(buffer)); @@ -738,23 +771,28 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &type); + rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + .type = type, + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* @@ -794,6 +832,7 @@ typedef struct { struct pagemapread { int pos, len; pagemap_entry_t *buffer; + bool v2; }; #define 
PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -807,14 +846,17 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in pagemap2 pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 val) @@ -837,7 +879,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); for (addr = start; addr < end; addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); @@ -847,11 +889,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame, flags; struct page *page = NULL; + int flags2 = 0; if (pte_present(pte)) { frame = pte_pfn(pte); @@ -866,19 +909,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { /* * Currently pmd for thp is always present because thp can not be @@ -887,13 +932,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { } #endif @@ -905,17 +950,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + int pmd_flags2; + + pmd_flags2 = (pmd_soft_dirty(*pmd) ? 
__PM_SOFT_DIRTY : 0); for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -932,7 +980,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT); + pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* check that 'vma' actually covers this address, @@ -940,7 +988,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, vma, addr, *pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -955,14 +1003,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pte_t pte, int offset) { if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* This function walks within one hugetlb entry in the single call */ @@ -976,7 +1024,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -1012,8 +1060,8 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. 
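
Combined with the clear_refs extension above (CLEAR_REFS_SOFT_DIRTY is 4, per the enum), the intended workflow can be sketched from userspace. The soft-dirty bit is assumed to sit at bit 55 here, derived from PM_PSHIFT_BITS = 6 and the usual PM_STATUS_OFFSET of 61; PM_STATUS_OFFSET itself is defined outside the hunks shown:

	/* Sketch: re-arm soft-dirty tracking, dirty a page, read it back
	 * via the new pagemap2 file. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	#define PM2_SOFT_DIRTY	(1ULL << 55)	/* assumed bit position */

	int main(void)
	{
		static char buf[4096];
		long psize = sysconf(_SC_PAGESIZE);
		uint64_t ent = 0;
		int clear = open("/proc/self/clear_refs", O_WRONLY);
		int map = open("/proc/self/pagemap2", O_RDONLY);

		if (clear < 0 || map < 0)
			return 1;
		write(clear, "4", 1);		/* CLEAR_REFS_SOFT_DIRTY */
		buf[0] = 1;			/* dirty the page again */
		if (pread(map, &ent, sizeof(ent),
			  ((uintptr_t)buf / psize) * 8) != sizeof(ent))
			return 1;
		printf("soft-dirty: %d\n", !!(ent & PM2_SOFT_DIRTY));
		return 0;
	}
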
*/ -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t do_pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, bool v2) { struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; @@ -1038,6 +1086,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; + pm.v2 = v2; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1110,10 +1159,27 @@ out: return ret; } +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, false); +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, }; + +static ssize_t pagemap2_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return do_pagemap_read(file, buf, count, ppos, true); +} + +const struct file_operations proc_pagemap2_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap2_read, +}; #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA diff --git a/fs/read_write.c b/fs/read_write.c index 90ba3b350e50..03430008704e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,6 +9,7 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> +#include <linux/aio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> @@ -329,16 +330,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; } -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - if (!kiocbIsKicked(iocb)) - schedule(); - else - kiocbClearKicked(iocb); - __set_current_state(TASK_RUNNING); -} - ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; @@ -350,13 +341,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -406,13 +391,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -592,13 +571,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ea5061fd4f3e..77d6d47abc83 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,6 +18,7 @@ #include <linux/writeback.h> #include 
<linux/quotaops.h> #include <linux/swap.h> +#include <linux/aio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f12189d2db1d..14374530784c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,6 +50,7 @@ */ #include "ubifs.h" +#include <linux/aio.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/slab.h> diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7a12e48ad819..b6d15d349810 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> +#include <linux/aio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3244c988d379..f64ee7130509 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> @@ -379,7 +380,8 @@ xfs_imap_valid( STATIC void xfs_end_bio( struct bio *bio, - int error) + int error, + struct batch_complete *batch) { xfs_ioend_t *ioend = bio->bi_private; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 82b70bda9f47..cee0e42b5389 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1224,7 +1224,8 @@ _xfs_buf_ioend( STATIC void xfs_buf_bio_end_io( struct bio *bio, - int error) + int error, + struct batch_complete *batch) { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 054d60c0ac57..a5f2042aec8b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include "xfs_ioctl.h" #include "xfs_trace.h" +#include <linux/aio.h> #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a59ff51b0166..347fb7bfcbe2 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -396,6 +396,28 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #define arch_start_context_switch(prev) do {} while (0) #endif +#ifndef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline int pte_soft_dirty(pte_t pte) +{ + return 0; +} + +static inline int pmd_soft_dirty(pmd_t pmd) +{ + return 0; +} + +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) +{ + return pmd; +} +#endif + #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of diff --git a/include/linux/aio.h b/include/linux/aio.h index 31ff6dba4872..a7e4c595825e 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -6,94 +6,38 @@ #include <linux/aio_abi.h> #include <linux/uio.h> #include <linux/rcupdate.h> - #include <linux/atomic.h> - -#define AIO_MAXSEGS 4 -#define AIO_KIOGRP_NR_ATOMIC 8 +#include <linux/batch_complete.h> struct kioctx; +struct kiocb; +struct batch_complete; -/* Notes on cancelling a kiocb: - * If a kiocb is cancelled, aio_complete may return 0 to indicate - * that cancel has not yet disposed of the kiocb. All cancel - * operations *must* call aio_put_req to dispose of the kiocb - * to guard against races with the completion code. 
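
The scheme replacing these notes (described in the new comment just below) folds the cancelled state into ki_cancel itself: completion and cancellation race to claim the pointer with cmpxchg(), and only the winner may run the callback or dispose of the request. An illustrative sketch, not the exact fs/aio.c body:

	/* Lockless claim of ki_cancel; KIOCB_CANCELLED and kiocb_cancel_fn
	 * are the definitions added by this patch. */
	static int kiocb_cancel_sketch(struct kiocb *kiocb, struct io_event *res)
	{
		kiocb_cancel_fn *old, *cancel;

		cancel = ACCESS_ONCE(kiocb->ki_cancel);
		do {
			if (!cancel || cancel == KIOCB_CANCELLED)
				return -EINVAL;	/* nothing to do, or done */
			old = cancel;
			cancel = cmpxchg(&kiocb->ki_cancel, old,
					 KIOCB_CANCELLED);
		} while (cancel != old);

		return old(kiocb, res);		/* we own the cancellation */
	}
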
- */ -#define KIOCB_C_CANCELLED 0x01 -#define KIOCB_C_COMPLETE 0x02 - -#define KIOCB_SYNC_KEY (~0U) +#define KIOCB_KEY 0 -/* ki_flags bits */ /* - * This may be used for cancel/retry serialization in the future, but - * for now it's unused and we probably don't want modules to even - * think they can use it. + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either + * cancelled or completed (this makes a certain amount of sense because + * successful cancellation - io_cancel() - does deliver the completion to + * userspace). + * + * And since most things don't implement kiocb cancellation and we'd really like + * kiocb completion to be lockless when possible, we use ki_cancel to + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). */ -/* #define KIF_LOCKED 0 */ -#define KIF_KICKED 1 -#define KIF_CANCELLED 2 - -#define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags) -#define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags) +#define KIOCB_CANCELLED ((void *) (~0ULL)) -#define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags) -#define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags) -#define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags) +typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *); -#define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags) -#define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags) -#define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags) - -#define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags) -#define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags) -#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags) - -/* is there a better place to document function pointer methods? */ -/** - * ki_retry - iocb forward progress callback - * @kiocb: The kiocb struct to advance by performing an operation. - * - * This callback is called when the AIO core wants a given AIO operation - * to make forward progress. The kiocb argument describes the operation - * that is to be performed. As the operation proceeds, perhaps partially, - * ki_retry is expected to update the kiocb with progress made. Typically - * ki_retry is set in the AIO core and it itself calls file_operations - * helpers. - * - * ki_retry's return value determines when the AIO operation is completed - * and an event is generated in the AIO event ring. Except the special - * return values described below, the value that is returned from ki_retry - * is transferred directly into the completion ring as the operation's - * resulting status. Once this has happened ki_retry *MUST NOT* reference - * the kiocb pointer again. - * - * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete() - * will be called on the kiocb pointer in the future. The AIO core will - * not ask the method again -- ki_retry must ensure forward progress. - * aio_complete() must be called once and only once in the future, multiple - * calls may result in undefined behaviour. - * - * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb() - * will be called on the kiocb pointer in the future. This may happen - * through generic helpers that associate kiocb->ki_wait with a wait - * queue head that ki_retry uses via current->io_wait. 
It can also happen - * with custom tracking and manual calls to kick_iocb(), though that is - * discouraged. In either case, kick_iocb() must be called once and only - * once. ki_retry must ensure forward progress, the AIO core will wait - * indefinitely for kick_iocb() to be called. - */ struct kiocb { - struct list_head ki_run_list; - unsigned long ki_flags; - int ki_users; - unsigned ki_key; /* id of this request */ + struct rb_node ki_node; + + atomic_t ki_users; struct file *ki_filp; - struct kioctx *ki_ctx; /* may be NULL for sync ops */ - int (*ki_cancel)(struct kiocb *, struct io_event *); - ssize_t (*ki_retry)(struct kiocb *); + struct kioctx *ki_ctx; /* NULL for sync ops */ + kiocb_cancel_fn *ki_cancel; void (*ki_dtor)(struct kiocb *); union { @@ -102,6 +46,9 @@ struct kiocb { } ki_obj; __u64 ki_user_data; /* user's data for completion */ + long ki_res; + long ki_res2; + loff_t ki_pos; void *private; @@ -117,7 +64,6 @@ struct kiocb { struct list_head ki_list; /* the aio core uses this * for cancellation */ - struct list_head ki_batch; /* batch allocation */ /* * If the aio_resfd field of the userspace iocb is not zero, @@ -128,108 +74,55 @@ struct kiocb { static inline bool is_sync_kiocb(struct kiocb *kiocb) { - return kiocb->ki_key == KIOCB_SYNC_KEY; + return kiocb->ki_ctx == NULL; } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { - .ki_users = 1, - .ki_key = KIOCB_SYNC_KEY, + .ki_users = ATOMIC_INIT(1), + .ki_ctx = NULL, .ki_filp = filp, .ki_obj.tsk = current, }; } -#define AIO_RING_MAGIC 0xa10a10a1 -#define AIO_RING_COMPAT_FEATURES 1 -#define AIO_RING_INCOMPAT_FEATURES 0 -struct aio_ring { - unsigned id; /* kernel internal index number */ - unsigned nr; /* number of io_events */ - unsigned head; - unsigned tail; - - unsigned magic; - unsigned compat_features; - unsigned incompat_features; - unsigned header_length; /* size of aio_ring */ - - - struct io_event io_events[0]; -}; /* 128 bytes + ring size */ - -#define AIO_RING_PAGES 8 -struct aio_ring_info { - unsigned long mmap_base; - unsigned long mmap_size; - - struct page **ring_pages; - spinlock_t ring_lock; - long nr_pages; - - unsigned nr, tail; - - struct page *internal_pages[AIO_RING_PAGES]; -}; - -static inline unsigned aio_ring_avail(struct aio_ring_info *info, - struct aio_ring *ring) -{ - return (ring->head + info->nr - 1 - ring->tail) % info->nr; -} - -struct kioctx { - atomic_t users; - int dead; - struct mm_struct *mm; - - /* This needs improving */ - unsigned long user_id; - struct hlist_node list; - - wait_queue_head_t wait; - - spinlock_t ctx_lock; - - int reqs_active; - struct list_head active_reqs; /* used for cancellation */ - struct list_head run_list; /* used for kicked reqs */ - - /* sys_io_setup currently limits this to an unsigned int */ - unsigned max_reqs; - - struct aio_ring_info ring_info; - - struct delayed_work wq; - - struct rcu_head rcu_head; -}; - /* prototypes */ -extern unsigned aio_max_size; - #ifdef CONFIG_AIO extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); -extern int aio_put_req(struct kiocb *iocb); -extern void kick_iocb(struct kiocb *iocb); -extern int aio_complete(struct kiocb *iocb, long res, long res2); +extern void aio_put_req(struct kiocb *iocb); +extern void batch_complete_aio(struct batch_complete *batch); +extern void aio_complete_batch(struct kiocb *iocb, long res, long res2, + struct batch_complete *batch); struct mm_struct; extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, 
long nr, struct iocb __user *__user *iocbpp, bool compat); +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } -static inline int aio_put_req(struct kiocb *iocb) { return 0; } -static inline void kick_iocb(struct kiocb *iocb) { } -static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; } +static inline void aio_put_req(struct kiocb *iocb) { } + +static inline void batch_complete_aio(struct batch_complete *batch) { } +static inline void aio_complete_batch(struct kiocb *iocb, long res, long res2, + struct batch_complete *batch) +{ + return; +} struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } static inline long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user * __user *iocbpp, bool compat) { return 0; } +static inline void kiocb_set_cancel_fn(struct kiocb *req, + kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ +static inline void aio_complete(struct kiocb *iocb, long res, long res2) +{ + aio_complete_batch(iocb, res, res2, NULL); +} + static inline struct kiocb *list_kiocb(struct list_head *h) { return list_entry(h, struct kiocb, ki_list); diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f7f1d7169b11..6fd5cc80f62f 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -213,8 +213,15 @@ static inline bool balloon_compaction_check(void) return true; } +static inline void balloon_event_count(enum vm_event_item item) +{ + count_vm_event(item); +} #else /* !CONFIG_BALLOON_COMPACTION */ +/* A macro, to avoid generating references to the undefined COMPACTBALLOON* */ +#define balloon_event_count(item) do { } while (0) + static inline void *balloon_mapping_alloc(void *balloon_device, const struct address_space_operations *a_ops) { diff --git a/include/linux/batch_complete.h b/include/linux/batch_complete.h new file mode 100644 index 000000000000..8167a9d306fb --- /dev/null +++ b/include/linux/batch_complete.h @@ -0,0 +1,23 @@ +#ifndef _LINUX_BATCH_COMPLETE_H +#define _LINUX_BATCH_COMPLETE_H + +#include <linux/rbtree.h> + +/* + * Common stuff to the aio and block code for batch completion. Everything + * important is elsewhere: + */ + +struct bio; + +struct bio_list { + struct bio *head; + struct bio *tail; +}; + +struct batch_complete { + struct bio_list bio; + struct rb_root kiocb; +}; + +#endif diff --git a/include/linux/bio.h b/include/linux/bio.h index ef24466d8f82..5db8a51eebb1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -24,6 +24,7 @@ #include <linux/mempool.h> #include <linux/ioprio.h> #include <linux/bug.h> +#include <linux/batch_complete.h> #ifdef CONFIG_BLOCK @@ -69,6 +70,8 @@ #define bio_sectors(bio) ((bio)->bi_size >> 9) #define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors((bio))) +void bio_endio_batch(struct bio *bio, int error, struct batch_complete *batch); + static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio->bi_vcnt) @@ -252,7 +255,25 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } -extern void bio_endio(struct bio *, int); +/** + * bio_endio - end I/O on a bio + * @bio: bio + * @error: error, if any + * + * Description: + * bio_endio() will end I/O on the whole bio. bio_endio() is the + * preferred way to end I/O on a bio; it takes care of clearing + * BIO_UPTODATE on error. @error is 0 on success, and one of the
@error is 0 on success, and and one of the + * established -Exxxx (-EIO, for instance) error values in case + * something went wrong. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io + * function. + **/ +static inline void bio_endio(struct bio *bio, int error) +{ + bio_endio_batch(bio, error, NULL); +} + struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -404,10 +425,6 @@ static inline bool bio_mergeable(struct bio *bio) * member of the bio. The bio_list also caches the last list member to allow * fast access to the tail. */ -struct bio_list { - struct bio *head; - struct bio *tail; -}; static inline int bio_list_empty(const struct bio_list *bl) { @@ -554,6 +571,15 @@ struct biovec_slab { */ #define BIO_SPLIT_ENTRIES 2 +static inline void batch_complete_init(struct batch_complete *batch) +{ + bio_list_init(&batch->bio); + batch->kiocb = RB_ROOT; +} + +void batch_complete(struct batch_complete *batch); + + #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) @@ -580,7 +606,7 @@ extern int bio_integrity_enabled(struct bio *bio); extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); extern int bio_integrity_prep(struct bio *); -extern void bio_integrity_endio(struct bio *, int); +extern void bio_integrity_endio(struct bio *, int, struct batch_complete *); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); extern void bio_integrity_split(struct bio *, struct bio_pair *, int); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fa1abeb45b76..9d3cafa6bbcd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -16,7 +16,8 @@ struct page; struct block_device; struct io_context; struct cgroup_subsys_state; -typedef void (bio_end_io_t) (struct bio *, int); +struct batch_complete; +typedef void (bio_end_io_t) (struct bio *, int, struct batch_complete *); typedef void (bio_destructor_t) (struct bio *); /* @@ -42,6 +43,7 @@ struct bio { * top bits priority */ + short bi_error; unsigned short bi_vcnt; /* how many bio_vec's */ unsigned short bi_idx; /* current index into bvl_vec */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6189bf26b53d..ff636bd8fde1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -883,7 +883,8 @@ extern struct request *blk_fetch_request(struct request_queue *q); * This prevents code duplication in drivers. 
*/ extern bool blk_update_request(struct request *rq, int error, - unsigned int nr_bytes); + unsigned int nr_bytes, + struct batch_complete *batch); extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); @@ -891,10 +892,17 @@ extern bool blk_end_request_cur(struct request *rq, int error); extern bool blk_end_request_err(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); -extern void __blk_end_request_all(struct request *rq, int error); extern bool __blk_end_request_cur(struct request *rq, int error); extern bool __blk_end_request_err(struct request *rq, int error); +extern void blk_end_request_all_batch(struct request *rq, int error, + struct batch_complete *batch); + +static inline void __blk_end_request_all(struct request *rq, int error) +{ + blk_end_request_all_batch(rq, error, NULL); +} + extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3bff9ce09cf7..5047355b9a0f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -28,6 +28,7 @@ struct cgroup_subsys; struct inode; struct cgroup; struct css_id; +struct eventfd_ctx; extern int cgroup_init_early(void); extern int cgroup_init(void); diff --git a/include/linux/decompress/unlz4.h b/include/linux/decompress/unlz4.h new file mode 100644 index 000000000000..d5b68bf3ec92 --- /dev/null +++ b/include/linux/decompress/unlz4.h @@ -0,0 +1,10 @@ +#ifndef DECOMPRESS_UNLZ4_H +#define DECOMPRESS_UNLZ4_H + +int unlz4(unsigned char *inbuf, int len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *pos, + void(*error)(char *x)); +#endif diff --git a/include/linux/errno.h b/include/linux/errno.h index f6bf082d4d4f..89627b9187f9 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h @@ -28,6 +28,5 @@ #define EBADTYPE 527 /* Type not supported by server */ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ -#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */ #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index b5a24ba83b6f..8b1aab600327 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2453,7 +2453,7 @@ enum { DIO_SKIP_HOLES = 0x02, }; -void dio_end_io(struct bio *bio, int error); +void dio_end_io(struct bio *bio, int error, struct batch_complete *batch); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index c1d6555d2567..f3cec6856a4b 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -94,6 +94,11 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) +/* + * Are we in nmi, irq, or softirq context?
+ */ +#define in_serving_irq() (in_nmi() || in_irq() || in_serving_softirq()) + #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_CHECK_OFFSET 1 #else diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f1e877b79ed8..cfc2f119779a 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -365,7 +365,7 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) -#else /* !LOCKDEP */ +#else /* !CONFIG_LOCKDEP */ static inline void lockdep_off(void) { @@ -479,82 +479,36 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# else -# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# endif -# define spin_release(l, n, i) lock_release(l, n, i) +#ifdef CONFIG_PROVE_LOCKING + #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) + #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 2, n, i) + #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 2, n, i) #else -# define spin_acquire(l, s, t, i) do { } while (0) -# define spin_release(l, n, i) do { } while (0) + #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) + #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) + #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, NULL, i) -# else -# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, NULL, i) -# endif -# define rwlock_release(l, n, i) lock_release(l, n, i) -#else -# define rwlock_acquire(l, s, t, i) do { } while (0) -# define rwlock_acquire_read(l, s, t, i) do { } while (0) -# define rwlock_release(l, n, i) do { } while (0) -#endif +#define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define spin_release(l, n, i) lock_release(l, n, i) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# else -# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) -# endif -# define mutex_release(l, n, i) lock_release(l, n, i) -#else -# define mutex_acquire(l, s, t, i) do { } while (0) -# define mutex_acquire_nest(l, s, t, n, i) do { } while (0) -# define mutex_release(l, n, i) do { } while (0) -#endif +#define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) +#define rwlock_release(l, n, i) lock_release(l, n, i) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) -# define 
rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) -# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i) -# else -# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) -# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) -# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i) -# endif +#define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define mutex_release(l, n, i) lock_release(l, n, i) + +#define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) +#define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) +#define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) # define rwsem_release(l, n, i) lock_release(l, n, i) -#else -# define rwsem_acquire(l, s, t, i) do { } while (0) -# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0) -# define rwsem_acquire_read(l, s, t, i) do { } while (0) -# define rwsem_release(l, n, i) do { } while (0) -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_PROVE_LOCKING -# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_) -# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_) -# else -# define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_) -# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_) -# endif +#define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) +#define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) # define lock_map_release(l) lock_release(l, 1, _THIS_IP_) -#else -# define lock_map_acquire(l) do { } while (0) -# define lock_map_acquire_read(l) do { } while (0) -# define lock_map_release(l) do { } while (0) -#endif #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ diff --git a/include/linux/lz4.h b/include/linux/lz4.h new file mode 100644 index 000000000000..d21c13f10a64 --- /dev/null +++ b/include/linux/lz4.h @@ -0,0 +1,87 @@ +#ifndef __LZ4_H__ +#define __LZ4_H__ +/* + * LZ4 Kernel Interface + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *)) +#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *)) + +/* + * lz4_compressbound() + * Provides the maximum size that LZ4 may output in a "worst case" scenario + * (input data not compressible) + */ +static inline size_t lz4_compressbound(size_t isize) +{ + return isize + (isize / 255) + 16; +} + +/* + * lz4_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. 
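[A hedged round-trip sketch of the lz4 API declared in this header; it is not part of this patch, assumes kernel context (<linux/slab.h> for kmalloc/kfree), and abbreviates error handling.

    static int lz4_roundtrip(const unsigned char *data, size_t len)
    {
            size_t clen = 0, slen;
            void *wrkmem = kmalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
            unsigned char *cbuf = kmalloc(lz4_compressbound(len), GFP_KERNEL);
            char *dbuf = kmalloc(len, GFP_KERNEL);
            int ret = -ENOMEM;

            if (!wrkmem || !cbuf || !dbuf)
                    goto out;

            ret = lz4_compress(data, len, cbuf, &clen, wrkmem);
            if (ret < 0)
                    goto out;

            slen = clen;    /* in: compressed size; out: bytes consumed */
            ret = lz4_decompress((const char *)cbuf, &slen, dbuf, len);
    out:
            kfree(dbuf);
            kfree(cbuf);
            kfree(wrkmem);
            return ret;
    }

Note that lz4_decompress() is the variant to use when the uncompressed size is already known; otherwise lz4_decompress_unknownoutputsize() below applies.]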
+ */ +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + +/* + * lz4hc_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + +/* + * lz4_decompress() + * src : source address of the compressed data + * src_len : is the input size, which is returned after decompress done + * dest : output buffer address of the decompressed data + * actual_dest_len: is the size of uncompressed data, supposing it's known + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + * slightly faster than lz4_decompress_unknownoutputsize() + */ +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len); + +/* + * lz4_decompress_unknownoutputsize() + * src : source address of the compressed data + * src_len : is the input size, therefore the compressed size + * dest : output buffer address of the decompressed data + * dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after + * decompress done + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + */ +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len); +#endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ace9a5f01c64..fb425aa16c01 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -330,12 +330,9 @@ struct mm_struct { unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); - void (*unmap_area) (struct mm_struct *mm, unsigned long addr); #endif unsigned long mmap_base; /* base of mmap area */ unsigned long task_size; /* size of task vm space */ - unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ - unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c76737d836b..8c9f859ab558 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -815,7 +815,10 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/ -#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +static inline enum zone_type zone_idx(struct zone *zone) +{ + return zone - zone->zone_pgdat->node_zones; +} static inline int populated_zone(struct zone *zone) { @@ -856,25 +859,18 @@ static inline int is_normal_idx(enum zone_type idx) */ static inline int is_highmem(struct zone *zone) { -#ifdef CONFIG_HIGHMEM - int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; - return zone_off == ZONE_HIGHMEM * sizeof(*zone) || - (zone_off == ZONE_MOVABLE * sizeof(*zone) && - zone_movable_is_highmem()); -#else - return 0; -#endif + return is_highmem_idx(zone_idx(zone)); } static inline int is_normal(struct zone *zone) { - return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; + return zone_idx(zone) == ZONE_NORMAL; } static inline int is_dma32(struct zone *zone) { #ifdef CONFIG_ZONE_DMA32 - return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; + return zone_idx(zone) == ZONE_DMA32; #else return 0; #endif @@ -883,7 +879,7 @@ static inline int is_dma32(struct zone *zone) static inline int is_dma(struct zone *zone) { #ifdef CONFIG_ZONE_DMA - return zone == zone->zone_pgdat->node_zones + ZONE_DMA; + return zone_idx(zone) == ZONE_DMA; #else return 0; #endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e3dea75a078b..8352e0300001 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -187,7 +187,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) { VM_BUG_ON(in_interrupt()); -#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h new file mode 100644 index 000000000000..d0cf8872dc43 --- /dev/null +++ b/include/linux/percpu-refcount.h @@ -0,0 +1,114 @@ +/* + * Dynamic percpu refcounts: + * (C) 2012 Google, Inc. + * Author: Kent Overstreet <koverstreet@google.com> + * + * This implements a refcount with similar semantics to atomic_t - atomic_inc(), + * atomic_dec_and_test() - but potentially percpu. + * + * There's one important difference between percpu refs and normal atomic_t + * refcounts; you have to keep track of your initial refcount, and then when you + * start shutting down you call percpu_ref_kill() _before_ dropping the initial + * refcount. + * + * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the + * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() + * puts the ref back in single atomic_t mode, collecting the per cpu refs and + * issuing the appropriate barriers, and then marks the ref as shutting down so + * that percpu_ref_put() will check for the ref hitting 0. After it returns, + * it's safe to drop the initial ref. + * + * BACKGROUND: + * + * Percpu refcounts are quite useful for performance, but if we blindly + * converted all refcounts to percpu counters we'd waste quite a bit of memory. + * + * Think about all the refcounts embedded in kobjects, files, etc. most of which + * aren't used much. These start out as simple atomic counters - a little bigger + * than a bare atomic_t, 16 bytes instead of 4 - but if we exceed some arbitrary + * number of gets in one second, we then switch to percpu counters. 
+ * + * This heuristic isn't perfect because it'll fire if the refcount was only + * being used on one cpu; ideally we'd be able to count the number of cache + * misses on percpu_ref_get() or something similar, but that'd make the non + * percpu path significantly heavier/more complex. We can count the number of + * gets() without any extra atomic instructions on arches that support + * atomic64_t - simply by changing the atomic_inc() to atomic_add_return(). + * + * USAGE: + * + * See fs/aio.c for some example usage; it's used there for struct kioctx, which + * is created when userspace calls io_setup(), and destroyed when userspace + * calls io_destroy() or the process exits. + * + * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it + * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove + * the kioctx from the process's list of kioctxs - after that, there can't be + * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop + * the initial ref with percpu_ref_put(). + * + * Code that does a two stage shutdown like this often needs some kind of + * explicit synchronization to ensure the initial refcount can only be dropped + * once - percpu_ref_kill() does this for you, it returns true once and false if + * someone else already called it. The aio code uses it this way, but it's not + * necessary if the code has some other mechanism to synchronize teardown. + * + * As mentioned previously, we decide when to convert a ref to percpu counters + * in percpu_ref_get(). However, since percpu_ref_get() will often be called + * with rcu_read_lock() held, it's not done there - percpu_ref_get() returns + * true if the ref should be converted to percpu counters. + * + * The caller should then call percpu_ref_alloc() after dropping + * rcu_read_lock(); if there is an uncommonly used codepath where it's + * inconvenient to call percpu_ref_alloc() after get(), it may be safely skipped + * and percpu_ref_get() will return true again the next time the counter wraps + * around. + */ + +#ifndef _LINUX_PERCPU_REFCOUNT_H +#define _LINUX_PERCPU_REFCOUNT_H + +#include <linux/atomic.h> +#include <linux/percpu.h> + +struct percpu_ref { + atomic64_t count; + unsigned long pcpu_count; +}; + +void percpu_ref_init(struct percpu_ref *ref); +void __percpu_ref_get(struct percpu_ref *ref, bool alloc); +int percpu_ref_put(struct percpu_ref *ref); + +int percpu_ref_kill(struct percpu_ref *ref); +int percpu_ref_dead(struct percpu_ref *ref); + +/** + * percpu_ref_get - increment a dynamic percpu refcount + * + * Increments @ref and possibly converts it to percpu counters. Must be called + * with rcu_read_lock() held, and may potentially drop/reacquire rcu_read_lock() + * to allocate percpu counters - if sleeping/allocation isn't safe for some + * other reason (e.g. a spinlock), see percpu_ref_get_noalloc(). + * + * Analogous to atomic_inc(). + */ +static inline void percpu_ref_get(struct percpu_ref *ref) +{ + __percpu_ref_get(ref, true); +} + +/** + * percpu_ref_get_noalloc - increment a dynamic percpu refcount + * + * Increments @ref, to be used when it's not safe to allocate percpu counters. + * Must be called with rcu_read_lock() held. + * + * Analogous to atomic_inc().
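[A condensed lifecycle sketch of the pattern described above. The object type and free logic are hypothetical, and treating a nonzero return from percpu_ref_put() as "final drop" is an assumption inferred from the declarations, not stated by this patch.

    struct my_obj {
            struct percpu_ref ref;
            /* ... payload ... */
    };

    /* Lookup path: take a reference under RCU, as the rules above require. */
    static void my_obj_get(struct my_obj *obj)
    {
            rcu_read_lock();
            percpu_ref_get(&obj->ref);
            rcu_read_unlock();
    }

    /* Two-stage teardown: kill exactly once, then drop the initial ref. */
    static void my_obj_teardown(struct my_obj *obj)
    {
            if (percpu_ref_kill(&obj->ref) &&  /* true only for the first caller */
                percpu_ref_put(&obj->ref))     /* assumed nonzero on final drop */
                    kfree(obj);
    }
]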
+ */ +static inline void percpu_ref_get_noalloc(struct percpu_ref *ref) +{ + __percpu_ref_get(ref, false); +} + +#endif diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 731e4ecee3bd..e2772666f004 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/bug.h> #include <linux/mm.h> +#include <linux/workqueue.h> #include <linux/threads.h> #include <linux/nsproxy.h> #include <linux/kref.h> diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 7794d75ed155..907f3fd191ac 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -7,14 +7,20 @@ #include <linux/timex.h> #include <linux/alarmtimer.h> -union cpu_time_count { - cputime_t cpu; - unsigned long long sched; -}; + +static inline unsigned long long cputime_to_expires(cputime_t expires) +{ + return (__force unsigned long long)expires; +} + +static inline cputime_t expires_to_cputime(unsigned long long expires) +{ + return (__force cputime_t)expires; +} struct cpu_timer_list { struct list_head entry; - union cpu_time_count expires, incr; + unsigned long long expires, incr; struct task_struct *task; int firing; }; diff --git a/include/linux/random.h b/include/linux/random.h index 347ce553a306..3b9377d6b7a5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -29,13 +29,6 @@ u32 prandom_u32(void); void prandom_bytes(void *buf, int nbytes); void prandom_seed(u32 seed); -/* - * These macros are preserved for backward compatibility and should be - * removed as soon as a transition is finished. - */ -#define random32() prandom_u32() -#define srandom32(seed) prandom_seed(seed) - u32 prandom_u32_state(struct rnd_state *); void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); diff --git a/include/linux/rio.h b/include/linux/rio.h index a3e784278667..a29a35bb649d 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -237,6 +237,7 @@ enum rio_phy_type { * @name: Port name string * @priv: Master port private data * @dma: DMA device associated with mport + * @nscan: RapidIO network enumeration/discovery operations */ struct rio_mport { struct list_head dbells; /* list of doorbell events */ @@ -262,8 +263,14 @@ struct rio_mport { #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct dma_device dma; #endif + struct rio_scan *nscan; }; + +/* + * Enumeration/discovery control flags + */ +#define RIO_SCAN_ENUM_NO_WAIT 0x00000001 /* Do not wait for enumeration to complete */ + struct rio_id_table { u16 start; /* logical minimal id */ u32 max; /* max number of IDs in table */ @@ -460,6 +467,16 @@ static inline struct rio_mport *dma_to_mport(struct dma_device *ddev) } #endif /* CONFIG_RAPIDIO_DMA_ENGINE */ +/** + * struct rio_scan - RIO enumeration and discovery operations + * @enumerate: Callback to perform RapidIO fabric enumeration. + * @discover: Callback to perform RapidIO fabric discovery.
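[A sketch of what a fabric scanner implementing these operations could look like. The callback bodies and the direct nscan assignment are illustrative only; this patch does not show the registration path.

    static int my_enumerate(struct rio_mport *mport, u32 flags)
    {
            /* Actively enumerate the fabric; may honour RIO_SCAN_ENUM_NO_WAIT. */
            return 0;
    }

    static int my_discover(struct rio_mport *mport, u32 flags)
    {
            /* Passively discover a fabric already enumerated by another host. */
            return 0;
    }

    static struct rio_scan my_scan_ops = {
            .enumerate = my_enumerate,
            .discover  = my_discover,
    };
    /* Attachment shown for illustration only: mport->nscan = &my_scan_ops; */
]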
+ */ +struct rio_scan { + int (*enumerate)(struct rio_mport *mport, u32 flags); + int (*discover)(struct rio_mport *mport, u32 flags); +}; + /* Architecture and hardware-specific functions */ extern int rio_register_mport(struct rio_mport *); extern int rio_open_inb_mbox(struct rio_mport *, void *, int, int); diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index b75c05920ab5..5059994fe297 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -433,5 +433,6 @@ extern u16 rio_local_get_device_id(struct rio_mport *port); extern struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from); extern struct rio_dev *rio_get_asm(u16 vid, u16 did, u16 asm_vid, u16 asm_did, struct rio_dev *from); +extern int rio_init_mports(void); #endif /* LINUX_RIO_DRV_H */ diff --git a/include/linux/rtc-pxa.h b/include/linux/rtc-pxa.h new file mode 100644 index 000000000000..71bc45f060fc --- /dev/null +++ b/include/linux/rtc-pxa.h @@ -0,0 +1,18 @@ +/* + * include/linux/rtc-pxa.h + * + * RTC PXA Header file + * + * Copyright (C) 2010 Marvell International Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_RTC_PXA_H +#define __LINUX_RTC_PXA_H + +extern int pxa_rtc_sync_time(unsigned int ticks); + +#endif /* __LINUX_RTC_PXA_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 420c2874325e..7eefe794f4cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -313,8 +313,6 @@ extern void schedule_preempt_disabled(void); struct nsproxy; struct user_namespace; -#include <linux/aio.h> - #ifdef CONFIG_MMU extern void arch_pick_mmap_layout(struct mm_struct *mm); extern unsigned long @@ -324,8 +322,6 @@ extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -extern void arch_unmap_area(struct mm_struct *, unsigned long); -extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 6f19cfd1840e..2793607bf6b8 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -47,6 +47,24 @@ static inline int seccomp_mode(struct seccomp *s) return s->mode; } +#ifdef CONFIG_SECCOMP_FILTER_JIT +extern void seccomp_jit_compile(struct seccomp_filter *fp); +extern void seccomp_jit_free(struct seccomp_filter *fp); + +/* + * accessors used by seccomp JIT code to avoid having to declare the + * seccomp_filter struct here. 
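[A sketch of how an arch JIT might use these accessors without ever seeing the seccomp_filter layout; the my_arch_bpf_jit() backend is hypothetical.

    /* Hypothetical arch-side definition of the hook declared above. */
    void seccomp_jit_compile(struct seccomp_filter *fp)
    {
            unsigned short len = seccomp_filter_get_len(fp);
            struct sock_filter *insns = seccomp_filter_get_insns(fp);
            void *func = my_arch_bpf_jit(insns, len);  /* hypothetical backend */

            /* On failure, leave bpf_func alone and keep using the interpreter. */
            if (func)
                    seccomp_filter_set_bpf_func(fp, func);
    }
]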
+ */ +unsigned short seccomp_filter_get_len(struct seccomp_filter *fp); +struct sock_filter *seccomp_filter_get_insns(struct seccomp_filter *fp); +void seccomp_filter_set_bpf_func(struct seccomp_filter *fp, void *func); +void *seccomp_filter_get_bpf_func(struct seccomp_filter *fp); + +#else +static inline void seccomp_jit_compile(struct seccomp_filter *fp) { } +static inline void seccomp_jit_free(struct seccomp_filter *fp) { } +#endif + #else /* CONFIG_SECCOMP */ #include <linux/errno.h> diff --git a/include/linux/swap.h b/include/linux/swap.h index 1701ce4be746..ca031f7abee4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -330,11 +330,14 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); -extern void end_swap_bio_write(struct bio *bio, int err); +extern void end_swap_bio_write(struct bio *bio, int err, + struct batch_complete *batch); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, - void (*end_write_func)(struct bio *, int)); + void (*end_write_func)(struct bio *bio, int err, + struct batch_complete *batch)); extern int swap_set_page_dirty(struct page *page); -extern void end_swap_bio_read(struct bio *bio, int err); +extern void end_swap_bio_read(struct bio *bio, int err, + struct batch_complete *batch); int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index bd6cf61142be..d4b7a184f08c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -50,7 +50,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, -#endif +#ifdef CONFIG_BALLOON_COMPACTION + COMPACTBALLOONISOLATED, /* isolated from balloon pagelist */ + COMPACTBALLOONMIGRATED, /* balloon page successfully migrated */ + COMPACTBALLOONRETURNED, /* putback to pagelist, not-migrated */ +#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_COMPACTION */ #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif diff --git a/include/linux/wait.h b/include/linux/wait.h index 7cb64d4b499d..ac38be2692d8 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -330,6 +330,92 @@ do { \ __ret; \ }) +#define __wait_event_hrtimeout(wq, condition, timeout, state) \ +({ \ + int __ret = 0; \ + DEFINE_WAIT(__wait); \ + struct hrtimer_sleeper __t; \ + \ + hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ + HRTIMER_MODE_REL); \ + hrtimer_init_sleeper(&__t, current); \ + if ((timeout).tv64 != KTIME_MAX) \ + hrtimer_start_range_ns(&__t.timer, timeout, \ + current->timer_slack_ns, \ + HRTIMER_MODE_REL); \ + \ + for (;;) { \ + prepare_to_wait(&wq, &__wait, state); \ + if (condition) \ + break; \ + if (state == TASK_INTERRUPTIBLE && \ + signal_pending(current)) { \ + __ret = -ERESTARTSYS; \ + break; \ + } \ + if (!__t.task) { \ + __ret = -ETIME; \ + break; \ + } \ + schedule(); \ + } \ + \ + hrtimer_cancel(&__t.timer); \ + destroy_hrtimer_on_stack(&__t.timer); \ + finish_wait(&wq, &__wait); \ + __ret; \ +}) + +/** + * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, as a ktime_t + * + * The process is put
to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true or the timeout elapses. + * The @condition is checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * The function returns 0 if @condition became true, or -ETIME if the timeout + * elapsed. + */ +#define wait_event_hrtimeout(wq, condition, timeout) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __ret = __wait_event_hrtimeout(wq, condition, timeout, \ + TASK_UNINTERRUPTIBLE); \ + __ret; \ +}) + +/** + * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, as a ktime_t + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true or a signal is received. + * The @condition is checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * The function returns 0 if @condition became true, -ERESTARTSYS if it was + * interrupted by a signal, or -ETIME if the timeout elapsed. + */ +#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ +({ \ + long __ret = 0; \ + if (!(condition)) \ + __ret = __wait_event_hrtimeout(wq, condition, timeout, \ + TASK_INTERRUPTIBLE); \ + __ret; \ +}) + #define __wait_event_interruptible_exclusive(wq, condition, ret) \ do { \ DEFINE_WAIT(__wait); \ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9a9367c0c076..579a5007c696 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -5,6 +5,7 @@ #define WRITEBACK_H #include <linux/sched.h> +#include <linux/workqueue.h> #include <linux/fs.h> DECLARE_PER_CPU(int, dirty_throttle_leaks); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4c062ccff9aa..4405886980c7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -905,7 +905,7 @@ struct ip_vs_app { struct ipvs_master_sync_state { struct list_head sync_queue; struct ip_vs_sync_buff *sync_buff; - int sync_queue_len; + unsigned long sync_queue_len; unsigned int sync_queue_delay; struct task_struct *master_thread; struct delayed_work master_wakeup_work; @@ -998,7 +998,7 @@ struct netns_ipvs { int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; - int sysctl_sync_qlen_max; + unsigned long sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; @@ -1085,7 +1085,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return ACCESS_ONCE(ipvs->sysctl_sync_ports); } -static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) +static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return ipvs->sysctl_sync_qlen_max; } @@ -1138,7 +1138,7 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) return 1; } -static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) +static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs) { return IPVS_SYNC_QLEN_MAX; } diff --git a/init/Kconfig b/init/Kconfig index 9d3a7887a6d3..3fdc9ff1f3e8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -98,10 +98,13 @@ config HAVE_KERNEL_XZ config HAVE_KERNEL_LZO bool +config HAVE_KERNEL_LZ4 + bool + choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 ||
HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -168,6 +171,18 @@ config KERNEL_LZO size is about 10% bigger than gzip; however its speed (both compression and decompression) is the fastest. +config KERNEL_LZ4 + bool "LZ4" + depends on HAVE_KERNEL_LZ4 + help + LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding. + A preliminary version of the LZ4 de/compression tool is available at + <https://code.google.com/p/lz4/>. + + Its compression ratio is worse than LZO's; the kernel is about + 8% bigger than with LZO, but decompression is faster. + endchoice config DEFAULT_HOSTNAME @@ -931,6 +946,23 @@ config MEMCG_KMEM the kmem extension can use it to guarantee that no group of processes will ever exhaust kernel resources alone. +config MEMCG_DEBUG_ASYNC_DESTROY + bool "Memory Resource Controller Debug asynchronous object destruction" + depends on MEMCG_KMEM || MEMCG_SWAP + default n + help + When a memcg is destroyed, the memory consumed by it may not be + immediately freed. This is because when some extensions are used, such + as swap or kernel memory, objects can outlive the group and hold a + reference to it. + + If this is the case, the dangling_memcgs file will show information + about which memcgs are still alive, and which references are still + preventing them from being freed. There is nothing wrong with that, but + it is very useful when debugging to know where this memory is being + held. This is a developer-oriented debugging facility only, and no + guarantees of interface stability will be given.
+ config CGROUP_HUGETLB bool "HugeTLB Resource Controller for Control Groups" depends on RESOURCE_COUNTERS && HUGETLB_PAGE diff --git a/ipc/msg.c b/ipc/msg.c index d0c6d967b390..29787212d097 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -827,16 +827,15 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, struct ipc_namespace *ns; struct msg_msg *copy = NULL; - ns = current->nsproxy->ipc_ns; - if (msqid < 0 || (long) bufsz < 0) return -EINVAL; if (msgflg & MSG_COPY) { - copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); + copy = prepare_copy(buf, bufsz); if (IS_ERR(copy)) return PTR_ERR(copy); } mode = convert_mode(&msgtyp, msgflg); + ns = current->nsproxy->ipc_ns; msq = msg_lock_check(ns, msqid); if (IS_ERR(msq)) { diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3f..9fd17f057ce5 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -621,6 +621,7 @@ void audit_trim_trees(void) skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); + put_tree(tree); } list_del(&cursor); mutex_unlock(&audit_filter_mutex); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 267436826c3b..b93206c29243 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,6 +423,10 @@ exit_nofree: return entry; exit_free: + if (entry->rule.watch) + audit_put_watch(entry->rule.watch); /* matches initial get */ + if (entry->rule.tree) + audit_put_tree(entry->rule.tree); /* that's the temporary one */ audit_free_rule(entry); return ERR_PTR(err); } diff --git a/kernel/fork.c b/kernel/fork.c index 0519c9b3261c..6fcc2e24561e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -70,6 +70,7 @@ #include <linux/khugepaged.h> #include <linux/signalfd.h> #include <linux/uprobes.h> +#include <linux/aio.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -364,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -539,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->nr_ptes = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_lock(lock); } @@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_unlock(lock); preempt_enable(); @@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_lock(lock); } @@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + 
lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_unlock(lock); preempt_enable(); @@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) int i; preempt_disable(); - rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); @@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) { int i; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e0..c7f31aa272f7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline union cpu_time_count +static inline unsigned long long timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) { - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { - ret.cpu = timespec_to_cputime(tp); + ret = cputime_to_expires(timespec_to_cputime(tp)); } return ret; } static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, + unsigned long long expires, struct timespec *tp) { if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); + *tp = ns_to_timespec(expires); else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return now.cpu < then.cpu; - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched += val.sched; - } else { - acc->cpu += val.cpu; - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu -= b.cpu; - } - return a; + cputime_to_timespec((__force cputime_t)expires, tp); } /* @@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, * given the current clock sample. */ static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) + unsigned long long now) { int i; + unsigned long long delta, incr; - if (timer->it.cpu.incr.sched == 0) + if (timer->it.cpu.incr == 0) return; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; + if (now < timer->it.cpu.expires) + return; - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. 
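[The overflow comment above is the heart of bump_cpu_timer(): it scales the increment by powers of two instead of multiplying, so no intermediate product can wrap. The same idea in isolation, as a standalone sketch assuming delta >= incr on entry (which the caller guarantees, since delta = now + incr - expires with now >= expires):

    static unsigned long long count_periods(unsigned long long delta,
                                            unsigned long long incr)
    {
            unsigned long long periods = 0;
            int i;

            /* Double incr until one more doubling would exceed delta. */
            for (i = 0; incr < delta - incr; i++)
                    incr <<= 1;

            /* Walk back down, taking each scaled step that still fits. */
            for (; i >= 0; incr >>= 1, i--) {
                    if (delta < incr)
                            continue;
                    delta -= incr;
                    periods += 1ULL << i;
            }

            return periods;
    }

For example, count_periods(10, 3) doubles to incr = 6 (i = 1), takes 6 (two periods) then 3 (one period), returning 3; the kernel code accumulates the same quantity into it_overrun while advancing expires.]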
*/ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; - if (now.cpu < timer->it.cpu.expires.cpu) - return; - incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.cpu += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; } } @@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline cputime_t prof_ticks(struct task_struct *p) +static inline unsigned long long prof_ticks(struct task_struct *p) { cputime_t utime, stime; task_cputime(p, &utime, &stime); - return utime + stime; + return cputime_to_expires(utime + stime); } -static inline cputime_t virt_ticks(struct task_struct *p) +static inline unsigned long long virt_ticks(struct task_struct *p) { cputime_t utime; task_cputime(p, &utime, NULL); - return utime; + return cputime_to_expires(utime); } static int @@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) * Sample a per-thread clock for the given task. */ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); + *sample = prof_ticks(p); break; case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); + *sample = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + *sample = task_sched_runtime(p); break; } return 0; @@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: thread_group_cputime(p, &cputime); - cpu->sched = cputime.sum_exec_runtime; + *sample = cputime.sum_exec_runtime; break; } return 0; @@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; - union cpu_time_count rtn; + unsigned long long rtn; if (pid == 0) { /* @@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } +static void cleanup_timers_list(struct list_head *head, + unsigned long long curr) +{ + struct cpu_timer_list *timer, 
*next; + + list_for_each_entry_safe(timer, next, head, entry) + list_del_init(&timer->entry); +} + /* * Clean out CPU timers still ticking when a thread exited. The task * pointer is cleared, and the expiry time is replaced with the residual @@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, unsigned long long sum_exec_runtime) { - struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; - - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - timer->expires.cpu = 0; - } else { - timer->expires.cpu -= ptime; - } - } - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; - } else { - timer->expires.cpu -= utime; - } - } + cputime_t ptime = utime + stime; - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; - } else { - timer->expires.sched -= sum_exec_runtime; - } - } + cleanup_timers_list(head, cputime_to_expires(ptime)); + cleanup_timers_list(++head, cputime_to_expires(utime)); + cleanup_timers_list(++head, sum_exec_runtime); } /* @@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) { + struct cpu_timer_list *timer = &itimer->it.cpu; + /* * That's all for this thread or process. * We leave our residual in expires to be reported. */ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); + put_task_struct(timer->task); + timer->task = NULL; + if (timer->expires < now) { + timer->expires = 0; + } else { + timer->expires -= now; + } } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer) listpos = head; list_for_each_entry(next, head, entry) { - if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + if (nt->expires < next->expires) break; listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { - union cpu_time_count *exp = &nt->expires; + unsigned long long exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp->cpu)) - cputime_expires->prof_exp = exp->cpu; + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp->cpu)) - cputime_expires->virt_exp = exp->cpu; + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); break; case CPUCLOCK_SCHED: if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp->sched) - cputime_expires->sched_exp = exp->sched; + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; break; } } @@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * User don't want any signal. 
*/ - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { /* * One-shot timer. Clear it as soon as it's fired. */ posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, old_incr, val; + unsigned long long old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, } if (old) { - if (old_expires.sched == 0) { + if (old_expires == 0) { old->it_value.tv_sec = 0; old->it_value.tv_nsec = 0; } else { @@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * new setting. */ bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; sample_to_timespec(timer->it_clock, old_expires, &old->it_value); @@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, goto out; } - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); + if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + new_expires += val; } /* @@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * arm the timer (we'll just fake it for timer_gettime). 
*/ timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires.sched != 0 && - !cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && !(val < new_expires)) { /* * The designated time already passed, so we notify * immediately, even if the thread never runs to @@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { - union cpu_time_count now; + unsigned long long now; struct task_struct *p = timer->it.cpu.task; int clear_dead; @@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) sample_to_timespec(timer->it_clock, timer->it.cpu.incr, &itp->it_interval); - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; return; } @@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) */ put_task_struct(p); timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto dead; } else { @@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + if (now < timer->it.cpu.expires) { sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), + timer->it.cpu.expires - now, &itp->it_value); } else { /* @@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } } +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. 
Here we update the @@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; unsigned long soft; - maxfire = 20; - tsk->cputime_expires.prof_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.prof_exp = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.virt_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.virt_exp = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.sched_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->cputime_expires.sched_exp = t->expires.sched; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. @@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig) static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - cputime_t *expires, cputime_t cur_time, int signo) + unsigned long long *expires, + unsigned long long cur_time, int signo) { if (!it->expires) return; @@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, ptime, virt_expires, prof_expires; + unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; @@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. 
*/ thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; - maxfire = 20; - prof_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || ptime < tl->expires.cpu) { - prof_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - ++timers; - maxfire = 20; - virt_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || utime < tl->expires.cpu) { - virt_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); /* * Check for the special case process timers. @@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk, } } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk, void posix_cpu_timer_schedule(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; + unsigned long long now; if (unlikely(p == NULL)) /* @@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ put_task_struct(p); timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* @@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * not yet reaped. Take this opportunity to * drop our task ref. */ + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead_task(timer, now); goto out_unlock; } @@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - union cpu_time_count now; + unsigned long long now; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (*oldval <= now.cpu) { + if (*oldval <= now) { /* Just about to fire. 
*/ *oldval = cputime_one_jiffy; } else { - *oldval -= now.cpu; + *oldval -= now; } } if (!*newval) goto out; - *newval += now.cpu; + *newval += now; } /* @@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { + if (timer.it.cpu.expires == 0) { /* * Our timer fired and was reset, below * deletion can not fail. diff --git a/kernel/printk.c b/kernel/printk.c index 59d354e95763..a1bdf61d38ba 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/memblock.h> +#include <linux/aio.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -362,6 +363,46 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ + return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + goto ok; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + goto ok; + /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ + if (capable(CAP_SYS_ADMIN)) { + printk_once(KERN_WARNING "%s (%d): " + "Attempt to access syslog with CAP_SYS_ADMIN " + "but no CAP_SYSLOG (deprecated).\n", + current->comm, task_pid_nr(current)); + goto ok; + } + return -EPERM; + } +ok: + return security_syslog(type); +} + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -437,10 +478,16 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, char cont = '-'; size_t len; ssize_t ret; + int err; if (!user) return -EBADF; + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_FILE); + if (err) + return err; + ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; @@ -619,7 +666,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_OPEN, SYSLOG_FROM_FILE); if (err) return err; @@ -812,45 +859,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. 
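[Note: the effect of relocating check_syslog_permissions() into the /dev/kmsg read path is easiest to see from userspace: with kernel.dmesg_restrict=1 and neither CAP_SYSLOG nor CAP_SYS_ADMIN, a read-all request is refused. A minimal hypothetical probe using the glibc klogctl() wrapper:

    #include <stdio.h>
    #include <sys/klog.h>

    int main(void)
    {
            char buf[4096];
            /* 3 == SYSLOG_ACTION_READ_ALL */
            int n = klogctl(3, buf, sizeof(buf));

            if (n < 0)
                    perror("klogctl");  /* expect EPERM when restricted */
            else
                    printf("read %d bytes of kernel log\n", n);
            return 0;
    }
]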
- */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1126,10 +1134,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (error) goto out; - error = security_syslog(type); - if (error) - return error; - switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 17ae54da0ec2..aed981a3f69c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -17,6 +17,7 @@ #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> +#include <linux/uio.h> #include <linux/audit.h> #include <linux/pid_namespace.h> #include <linux/syscalls.h> diff --git a/kernel/relay.c b/kernel/relay.c index b91488ba2e5a..e03440ba0f20 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -339,6 +339,10 @@ static void wakeup_readers(unsigned long data) { struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); + /* + * Stupid polling for now: + */ + mod_timer(&buf->timer, jiffies + 1); } /** @@ -356,6 +360,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + mod_timer(&buf->timer, jiffies + 1); } else del_timer_sync(&buf->timer); @@ -739,15 +744,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) else buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. - */ - mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..237741439ae9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -39,6 +39,8 @@ * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter * @len: the number of instructions in the program + * @bpf_func: points to either sk_run_filter or the code generated + * by the BPF JIT. * @insns: the BPF program instructions to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev @@ -55,6 +57,8 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; unsigned short len; /* Instruction count */ + unsigned int (*bpf_func)(const struct sk_buff *skb, + const struct sock_filter *filter); struct sock_filter insns[]; }; @@ -213,7 +217,7 @@ static u32 seccomp_run_filters(int syscall) * value always takes priority (ignoring the DATA). 
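[Note: the precedence the comment above relies on comes straight from the action encoding in the seccomp uapi header: numerically smaller SECCOMP_RET_ACTION values are stronger, so a KILL from any filter in the chain beats an ALLOW from another:

    #define SECCOMP_RET_KILL        0x00000000U     /* strongest */
    #define SECCOMP_RET_TRAP        0x00030000U
    #define SECCOMP_RET_ERRNO       0x00050000U
    #define SECCOMP_RET_TRACE       0x7ff00000U
    #define SECCOMP_RET_ALLOW       0x7fff0000U     /* weakest */
]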
*/ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter(NULL, f->insns); + u32 cur_ret = f->bpf_func(NULL, f->insns); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -275,6 +279,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (ret) goto fail; + filter->bpf_func = sk_run_filter; + seccomp_jit_compile(filter); + /* * If there is an existing filter, make it the prev and don't drop its * task reference. @@ -332,10 +339,34 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; + seccomp_jit_free(freeme); kfree(freeme); } } +/* + * seccomp filter accessors for JIT code. + */ +unsigned short seccomp_filter_get_len(struct seccomp_filter *fp) +{ + return fp->len; +} + +struct sock_filter *seccomp_filter_get_insns(struct seccomp_filter *fp) +{ + return fp->insns; +} + +void seccomp_filter_set_bpf_func(struct seccomp_filter *fp, void *func) +{ + fp->bpf_func = func; +} + +void *seccomp_filter_get_bpf_func(struct seccomp_filter *fp) +{ + return fp->bpf_func; +} + /** * seccomp_send_sigsys - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland diff --git a/kernel/smp.c b/kernel/smp.c index 4dba0f7b72ad..97084cfc61ca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -12,6 +12,7 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <linux/hardirq.h> #include "smpboot.h" @@ -240,8 +241,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + WARN_ON_ONCE(cpu_online(this_cpu) + && (irqs_disabled() || in_serving_irq()) + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -378,8 +380,9 @@ void smp_call_function_many(const struct cpumask *mask, * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress && !early_boot_irqs_disabled); + WARN_ON_ONCE(cpu_online(this_cpu) + && (irqs_disabled() || in_serving_irq()) + && !oops_in_progress && !early_boot_irqs_disabled); /* Try to fastpath. So, what's a CPU they want? Ignoring this one. 
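[Note: the strengthened warnings matter because a synchronous cross-CPU call from hard-interrupt context can deadlock even when local interrupts are enabled. A sketch of the misuse now caught -- a hypothetical handler; do_work and the target CPU are placeholders:

    #include <linux/interrupt.h>
    #include <linux/smp.h>

    static void do_work(void *info)
    {
    }

    static irqreturn_t bad_handler(int irq, void *dev_id)
    {
            /*
             * wait == 1 from a handler: previously only warned when IRQs
             * were disabled; with in_serving_irq() checked as well, this
             * warns regardless of the local interrupt state.
             */
            smp_call_function_single(0, do_work, NULL, 1);
            return IRQ_HANDLED;
    }
]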
*/ cpu = cpumask_first_and(mask, cpu_online_mask); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 05039e348f07..ea741c32d596 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -239,10 +239,12 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) + if (hardlockup_panic) { + trigger_all_cpu_backtrace(); panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else + } else { WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + } __this_cpu_write(hard_watchdog_warn, true); return; @@ -323,8 +325,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); - if (softlockup_panic) + if (softlockup_panic) { + trigger_all_cpu_backtrace(); panic("softlockup: hung tasks"); + } __this_cpu_write(soft_watchdog_warn, true); } else __this_cpu_write(soft_watchdog_warn, false); diff --git a/lib/Kconfig b/lib/Kconfig index fe01d418b09a..d526e109b147 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -189,6 +189,15 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +config LZ4_COMPRESS + tristate + +config LZ4HC_COMPRESS + tristate + +config LZ4_DECOMPRESS + tristate + source "lib/xz/Kconfig" # @@ -213,6 +222,10 @@ config DECOMPRESS_LZO select LZO_DECOMPRESS tristate +config DECOMPRESS_LZ4 + select LZ4_DECOMPRESS + tristate + # # Generic allocator support is selected if needed # diff --git a/lib/Makefile b/lib/Makefile index e9c52e1b853a..af79e8c25c1b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o + earlycpio.o percpu-refcount.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o @@ -75,6 +75,9 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_LZ4_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ @@ -83,6 +86,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o +lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o diff --git a/lib/decompress.c b/lib/decompress.c index f8fdedaf7b3d..4d1cd0397aab 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -11,6 +11,7 @@ #include <linux/decompress/unxz.h> #include <linux/decompress/inflate.h> #include <linux/decompress/unlzo.h> +#include <linux/decompress/unlz4.h> #include <linux/types.h> #include <linux/string.h> @@ -31,6 +32,9 @@ #ifndef CONFIG_DECOMPRESS_LZO # define unlzo NULL #endif +#ifndef CONFIG_DECOMPRESS_LZ4 +# define unlz4 NULL +#endif struct compress_format { unsigned char magic[2]; @@ -45,6 +49,7 @@ static const struct compress_format compressed_formats[] __initconst = { { {0x5d, 0x00}, "lzma", unlzma }, { {0xfd, 0x37}, "xz", unxz }, { {0x89, 0x4c}, "lzo", unlzo }, + { {0x02, 0x21}, "lz4", unlz4 }, { {0, 0}, NULL, NULL } }; diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c new file mode 100644 index 000000000000..3e67cfad16ad --- 
/dev/null +++ b/lib/decompress_unlz4.c @@ -0,0 +1,187 @@ +/* + * Wrapper for decompressing LZ4-compressed kernel, initramfs, and initrd + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifdef STATIC +#define PREBOOT +#include "lz4/lz4_decompress.c" +#else +#include <linux/decompress/unlz4.h> +#endif +#include <linux/types.h> +#include <linux/lz4.h> +#include <linux/decompress/mm.h> +#include <linux/compiler.h> + +#include <asm/unaligned.h> + +/* + * Note: Uncompressed chunk size is used in the compressor side + * (userspace side for compression). + * It is hardcoded because there is not proper way to extract it + * from the binary stream which is generated by the preliminary + * version of LZ4 tool so far. + */ +#define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20) +#define ARCHIVE_MAGICNUMBER 0x184C2102 + +STATIC inline int INIT unlz4(u8 *input, int in_len, + int (*fill) (void *, unsigned int), + int (*flush) (void *, unsigned int), + u8 *output, int *posp, + void (*error) (char *x)) +{ + int ret = -1; + size_t chunksize = 0; + size_t uncomp_chunksize = LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE; + u8 *inp; + u8 *inp_start; + u8 *outp; + int size = in_len; +#ifdef PREBOOT + size_t out_len = get_unaligned_le32(input + in_len); +#endif + size_t dest_len; + + + if (output) { + outp = output; + } else if (!flush) { + error("NULL output pointer and no flush function provided"); + goto exit_0; + } else { + outp = large_malloc(uncomp_chunksize); + if (!outp) { + error("Could not allocate output buffer"); + goto exit_0; + } + } + + if (input && fill) { + error("Both input pointer and fill function provided,"); + goto exit_1; + } else if (input) { + inp = input; + } else if (!fill) { + error("NULL input pointer and missing fill function"); + goto exit_1; + } else { + inp = large_malloc(lz4_compressbound(uncomp_chunksize)); + if (!inp) { + error("Could not allocate input buffer"); + goto exit_1; + } + } + inp_start = inp; + + if (posp) + *posp = 0; + + if (fill) + fill(inp, 4); + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + } else { + error("invalid header"); + goto exit_2; + } + + if (posp) + *posp += 4; + + for (;;) { + + if (fill) + fill(inp, 4); + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + if (posp) + *posp += 4; + continue; + } + inp += 4; + size -= 4; + + if (posp) + *posp += 4; + + if (fill) { + if (chunksize > lz4_compressbound(uncomp_chunksize)) { + error("chunk length is longer than allocated"); + goto exit_2; + } + fill(inp, chunksize); + } +#ifdef PREBOOT + if (out_len >= uncomp_chunksize) { + dest_len = uncomp_chunksize; + out_len -= dest_len; + } else + dest_len = out_len; + ret = lz4_decompress(inp, &chunksize, outp, dest_len); +#else + dest_len = uncomp_chunksize; + ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp, + &dest_len); +#endif + if (ret < 0) { + error("Decoding failed"); + goto exit_2; + } + + if (flush && flush(outp, dest_len) != dest_len) + goto exit_2; + if (output) + outp += dest_len; + if (posp) + *posp += chunksize; + + size -= chunksize; + + if (size == 0) + break; + else if (size < 0) { + error("data corrupted"); + goto exit_2; + } + + inp += chunksize; + if (fill) + inp = inp_start; + } + + ret = 0; 
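[Note: the loop above consumes the legacy chunked format: a 0x184C2102 magic word followed by repeated [le32 compressed length][compressed bytes] records, each holding at most 8 MiB of uncompressed data; for the PREBOOT/kernel-image case a le32 total uncompressed size is additionally appended, which is why decompress() passes in_len - 4. A userspace sketch of a conforming writer, assuming the reference liblz4 -- LZ4_compress_default() and LZ4_COMPRESSBOUND() come from that library, not from this patch:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <lz4.h>

    #define CHUNK (8 << 20)     /* LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE */

    static void put_le32(FILE *f, uint32_t v)
    {
            uint8_t b[4] = { v, v >> 8, v >> 16, v >> 24 };
            fwrite(b, 1, sizeof(b), f);
    }

    int write_lz4_stream(FILE *out, const char *data, size_t len)
    {
            char *cbuf = malloc(LZ4_COMPRESSBOUND(CHUNK));

            if (!cbuf)
                    return -1;
            put_le32(out, 0x184C2102);      /* ARCHIVE_MAGICNUMBER */
            while (len) {
                    int n = len > CHUNK ? CHUNK : (int)len;
                    int c = LZ4_compress_default(data, cbuf, n,
                                                 LZ4_COMPRESSBOUND(CHUNK));
                    if (c <= 0) {
                            free(cbuf);
                            return -1;
                    }
                    put_le32(out, (uint32_t)c);
                    fwrite(cbuf, 1, c, out);
                    data += n;
                    len -= n;
            }
            free(cbuf);
            return 0;
    }
]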
+exit_2: + if (!input) + large_free(inp_start); +exit_1: + if (!output) + large_free(outp); +exit_0: + return ret; +} + +#ifdef PREBOOT +STATIC int INIT decompress(unsigned char *buf, int in_len, + int(*fill)(void*, unsigned int), + int(*flush)(void*, unsigned int), + unsigned char *output, + int *posp, + void(*error)(char *x) + ) +{ + return unlz4(buf, in_len - 4, fill, flush, output, posp, error); +} +#endif diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile new file mode 100644 index 000000000000..8085d04e9309 --- /dev/null +++ b/lib/lz4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c new file mode 100644 index 000000000000..fd94058bd7f9 --- /dev/null +++ b/lib/lz4/lz4_compress.c @@ -0,0 +1,443 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min <chanho.min@lge.com> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/lz4.h> +#include <asm/unaligned.h> +#include "lz4defs.h" + +/* + * LZ4_compressCtx : + * ----------------- + * Compress 'isize' bytes from 'source' into an output buffer 'dest' of + * maximum size 'maxOutputSize'. * If it cannot achieve it, compression + * will stop, and result of the function will be zero. 
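[Note: lz4_compressbound(), used when sizing buffers both here and in the decompress wrapper above, bounds the worst-case expansion of incompressible input. The kernel header is not shown in this hunk, but it presumably mirrors the reference LZ4_COMPRESSBOUND():

    static inline size_t lz4_compressbound(size_t isize)
    {
            /* worst case: one extra byte per 255-byte run, plus header */
            return isize + (isize / 255) + 16;
    }
]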
+ * return : the number of bytes written in buffer 'dest', or 0 if the + * compression fails + */ +static inline int lz4_compressctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + HTYPE *hashtable = (HTYPE *)ctx; + const u8 *ip = (u8 *)source; +#if LZ4_ARCH64 + const BYTE * const base = ip; +#else + const int base = 0; +#endif + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardh = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (unlikely(forwardip > mflimit)) + goto _last_literals; + + forwardh = LZ4_HASH_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = ip - base; + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + + if (length >= (int)RUN_MASK) { + int len; + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + /* Encode MatchLength */ + length = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend)) + return 0; + if (length >= (int)ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length -= 510) { + *op++ = 255; + *op++ = 255; + } + if (length > 254) { + length -= 255; + *op++ = 255; + } + *op++ = (u8)length; + } else + *token += length; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + /* Test next position */ + ref = base + hashtable[LZ4_HASH_VALUE(ip)]; + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop 
*/ + anchor = ip++; + forwardh = LZ4_HASH_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (((char *)op - dest) + lastrun + 1 + + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize) + return 0; + + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + + /* End */ + return (int)(((char *)op) - dest); +} + +static inline int lz4_compress64kctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + u16 *hashtable = (u16 *)ctx; + const u8 *ip = (u8 *) source; + const u8 *anchor = ip; + const u8 *const base = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int len, length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (forwardip > mflimit) + goto _last_literals; + + forwardh = LZ4_HASH64K_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = (u16)(ip - base); + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) + && (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + + while (ip < MATCHLIMIT - (STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + + /* Encode MatchLength */ + len = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return 0; + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (u8)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + 
hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base); + + /* Test next position */ + ref = base + hashtable[LZ4_HASH64K_VALUE(ip)]; + hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base); + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend) + return 0; + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int)(((char *)op) - dest); +} + +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + if (src_len < LZ4_64KLIMIT) + out_len = lz4_compress64kctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + else + out_len = lz4_compressctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + + return 0; +exit: + return ret; +} +EXPORT_SYMBOL_GPL(lz4_compress); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c new file mode 100644 index 000000000000..d3414eae73a1 --- /dev/null +++ b/lib/lz4/lz4_decompress.c @@ -0,0 +1,326 @@ +/* + * LZ4 Decompressor for Linux kernel + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * Based on LZ4 implementation by Yann Collet. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
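[Note: it helps to have the on-wire sequence layout in mind before the decoder body; this is the standard LZ4 block format that lz4_uncompress() below walks, with the constants (ML_BITS, RUN_MASK, MINMATCH) defined in lz4defs.h later in this patch. The diagram is illustrative, not part of the patch:

    /*
     * One LZ4 sequence:
     *
     *  +-------+-----------------+----------+---------+----------------+
     *  | token | literal len ext | literals | offset  | match len ext  |
     *  | 1 B   | 0..n B          | 0..n B   | 2 B, LE | 0..n B         |
     *  +-------+-----------------+----------+---------+----------------+
     *
     * token >> ML_BITS (4)  literal run length; RUN_MASK (15) means "keep
     *                       adding extension bytes, stop on a byte < 255"
     * token & ML_MASK       match length minus MINMATCH (4), same scheme
     * offset                distance back into already-decoded output
     */
]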
+ * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#ifndef STATIC +#include <linux/module.h> +#include <linux/kernel.h> +#endif +#include <linux/lz4.h> + +#include <asm/unaligned.h> + +#include "lz4defs.h" + +static int lz4_uncompress(const char *source, char *dest, int osize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *ref; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + osize; + BYTE *cpy; + unsigned token; + size_t length; + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + while (1) { + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + size_t len; + + len = *ip++; + for (; len == 255; length += 255) + len = *ip++; + length += len; + } + + /* copy literals */ + cpy = op + length; + if (unlikely(cpy > oend - COPYLENGTH)) { + /* + * Error: not enough place for another match + * (min 4) + 5 literals + */ + if (cpy != oend) + goto _output_error; + + memcpy(op, ip, length); + ip += length; + break; /* EOF */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + + /* Error: offset create reference outside destination buffer */ + if (unlikely(ref < (BYTE *const) dest)) + goto _output_error; + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + for (; *ip == 255; length += 255) + ip++; + length += *ip++; + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > (oend - COPYLENGTH)) { + + /* Error: request to write beyond destination buffer */ + if (cpy > oend) + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *)ip) - source); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *)ip) - source)); +} + +static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, + int isize, size_t maxoutputsize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxoutputsize; + BYTE *cpy; + + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + /* Main Loop */ + while (ip < iend) { + + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + length += s; + } + } + /* copy literals */ + cpy = op + length; + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + + if (cpy > oend) + goto _output_error;/* writes beyond buffer */ + + if (ip + 
length != iend) + goto _output_error;/* + * Error: LZ4 format requires + * to consume all input + * at this stage + */ + memcpy(op, ip, length); + op += length; + break;/* Necessarily EOF, due to parsing restrictions */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + goto _output_error; + /* + * Error : offset creates reference + * outside of destination buffer + */ + + /* get matchlength */ + length = (token & ML_MASK); + if (length == ML_MASK) { + while (ip < iend) { + int s = *ip++; + length += s; + if (s == 255) + continue; + break; + } + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op - ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE-4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + goto _output_error; /* write outside of buf */ + + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *) op) - dest); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *) ip) - source)); +} + +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len) +{ + int ret = -1; + int input_len = 0; + + input_len = lz4_uncompress(src, dest, actual_dest_len); + if (input_len < 0) + goto exit_0; + *src_len = input_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress); +#endif + +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len) +{ + int ret = -1; + int out_len = 0; + + out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, + *dest_len); + if (out_len < 0) + goto exit_0; + *dest_len = out_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress_unknownoutputsize); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Decompressor"); +#endif diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h new file mode 100644 index 000000000000..abcecdc2d0f2 --- /dev/null +++ b/lib/lz4/lz4defs.h @@ -0,0 +1,156 @@ +/* + * lz4defs.h -- architecture specific defines + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* + * Detects 64 bits mode + */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \ + || defined(__ppc64__) || defined(__LP64__)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Architecture-specific macros + */ +#define BYTE u8 +typedef struct _U16_S { u16 v; } U16_S; +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ + && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) + +#define PUT4(s, d) (A32(d) = A32(s)) +#define PUT8(s, d) (A64(d) = A64(s)) +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + A16(p) = v; \ + p += 2; \ + } while (0) +#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + +#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v)) +#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) +#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) + +#define PUT4(s, d) \ + put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) +#define PUT8(s, d) \ + put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) + +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + put_unaligned(v, (u16 *)(p)); \ + p += 2; \ + } while (0) +#endif + +#define COPYLENGTH 8 +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) +#define MEMORY_USAGE 14 +#define MINMATCH 4 +#define SKIPSTRENGTH 6 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) +#define MAXD_LOG 16 +#define MAXD (1 << MAXD_LOG) +#define MAXD_MASK (u32)(MAXD - 1) +#define MAX_DISTANCE (MAXD - 1) +#define HASH_LOG (MAXD_LOG - 1) +#define HASHTABLESIZE (1 << HASH_LOG) +#define MAX_NB_ATTEMPTS 256 +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) +#define HASHLOG64K ((MEMORY_USAGE - 2) + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - (MEMORY_USAGE-2))) +#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASHLOG64K)) +#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASH_LOG)) + +#if LZ4_ARCH64/* 64-bit */ +#define STEPSIZE 8 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT8(s, d); \ + d += 8; \ + s += 8; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) + +#define LZ4_SECURECOPY(s, d, e) \ + do { \ + if (d < e) { \ + LZ4_WILDCOPY(s, d, e); \ + } \ + } while (0) +#define HTYPE u32 + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) +#endif + +#else /* 32-bit */ +#define STEPSIZE 4 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT4(s, d); \ + d += 4; \ + s += 4; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) \ + do { \ + LZ4_COPYSTEP(s, d); \ + LZ4_COPYSTEP(s, d); \ + } while (0) + +#define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const u8* + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) +#endif + +#endif + +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + (d = s - get_unaligned_le16(p)) + +#define LZ4_WILDCOPY(s, d, e) \ + do { \ + LZ4_COPYPACKET(s, d); \ + } while (d < e) + +#define LZ4_BLINDCOPY(s, d, l) \ + do { \ + u8 *e = (d) + l; \ + 
LZ4_WILDCOPY(s, d, e); \ + d = e; \ + } while (0) diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c new file mode 100644 index 000000000000..eb1a74f5e368 --- /dev/null +++ b/lib/lz4/lz4hc_compress.c @@ -0,0 +1,539 @@ +/* + * LZ4 HC - High Compression Mode of LZ4 + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min <chanho.min@lge.com> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/lz4.h> +#include <asm/unaligned.h> +#include "lz4defs.h" + +struct lz4hc_data { + const u8 *base; + HTYPE hashtable[HASHTABLESIZE]; + u16 chaintable[MAXD]; + const u8 *nexttoupdate; +} __attribute__((__packed__)); + +static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) +{ + memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); + memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); + +#if LZ4_ARCH64 + hc4->nexttoupdate = base + 1; +#else + hc4->nexttoupdate = base; +#endif + hc4->base = base; + return 1; +} + +/* Update chains up to ip (excluded) */ +static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) +{ + u16 *chaintable = hc4->chaintable; + HTYPE *hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + + while (hc4->nexttoupdate < ip) { + const u8 *p = hc4->nexttoupdate; + size_t delta = p - (hashtable[HASH_VALUE(p)] + base); + if (delta > MAX_DISTANCE) + delta = MAX_DISTANCE; + chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; + hashtable[HASH_VALUE(p)] = (p) - base; + hc4->nexttoupdate++; + } +} + +static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2, + const u8 *const matchlimit) +{ + const u8 *p1t = p1; + + while (p1t < matchlimit - (STEPSIZE - 1)) { +#if LZ4_ARCH64 + u64 diff = A64(p2) ^ A64(p1t); +#else + u32 diff = A32(p2) ^ A32(p1t); +#endif + if (!diff) { + p1t += STEPSIZE; + p2 += STEPSIZE; + continue; + } + p1t += LZ4_NBCOMMONBYTES(diff); + return p1t - p1; + } 
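[Note: the word-at-a-time loop above locates the first mismatching byte via XOR plus count-trailing-zeros (count-leading-zeros on big-endian, per LZ4_NBCOMMONBYTES), dividing the bit position by eight. A standalone userspace illustration, little-endian assumed:

    #include <assert.h>
    #include <stdint.h>

    static int common_bytes(uint64_t a, uint64_t b)
    {
            uint64_t diff = a ^ b;

            /* bytes compare equal up to the lowest differing bit's byte */
            return diff ? __builtin_ctzll(diff) >> 3 : 8;
    }

    int main(void)
    {
            /* the 4 low-order bytes (first in memory on LE) match */
            assert(common_bytes(0x11223344aabbccddULL,
                                0x99887766aabbccddULL) == 4);
            return 0;
    }
]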
+#if LZ4_ARCH64 + if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) { + p1t += 4; + p2 += 4; + } +#endif + + if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) { + p1t += 2; + p2 += 2; + } + if ((p1t < matchlimit) && (*p2 == *p1t)) + p1t++; + return p1t - p1; +} + +static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; + const u8 *ref; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + int nbattempts = MAX_NB_ATTEMPTS; + size_t repl = 0, ml = 0; + u16 delta; + + /* HC4 match finder */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + /* potential repetition */ + if (ref >= ip-4) { + /* confirmed */ + if (A32(ref) == A32(ip)) { + delta = (u16)(ip-ref); + repl = ml = lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + *matchpos = ref; + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + while ((ref >= ip - MAX_DISTANCE) && nbattempts) { + nbattempts--; + if (*(ref + ml) == *(ip + ml)) { + if (A32(ref) == A32(ip)) { + size_t mlt = + lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + if (mlt > ml) { + ml = mlt; + *matchpos = ref; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + /* Complete table */ + if (repl) { + const BYTE *ptr = ip; + const BYTE *end; + end = ip + repl - (MINMATCH-1); + /* Pre-Load */ + while (ptr < end - delta) { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + ptr++; + } + do { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + /* Head of chain */ + hashtable[HASH_VALUE(ptr)] = (ptr) - base; + ptr++; + } while (ptr < end); + hc4->nexttoupdate = end; + } + + return (int)ml; +} + +static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, + const u8 **matchpos, const u8 **startpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + const u8 *ref; + int nbattempts = MAX_NB_ATTEMPTS; + int delta = (int)(ip - startlimit); + + /* First Match */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) + && (nbattempts)) { + nbattempts--; + if (*(startlimit + longest) == *(ref - delta + longest)) { + if (A32(ref) == A32(ip)) { + const u8 *reft = ref + MINMATCH; + const u8 *ipt = ip + MINMATCH; + const u8 *startt = ip; + + while (ipt < matchlimit-(STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(reft) ^ A64(ipt); + #else + u32 diff = A32(reft) ^ A32(ipt); + #endif + + if (!diff) { + ipt += STEPSIZE; + reft += STEPSIZE; + continue; + } + ipt += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ipt < (matchlimit - 3)) + && (A32(reft) == A32(ipt))) { + ipt += 4; + reft += 4; + } + ipt += 2; + #endif + if ((ipt < (matchlimit - 1)) + && (A16(reft) == A16(ipt))) { + reft += 2; + } + if ((ipt < matchlimit) && (*reft == *ipt)) + ipt++; +_endcount: + reft = ref; + + while ((startt > startlimit) + && (reft > hc4->base) + && (startt[-1] == reft[-1])) { + startt--; + reft--; + } + + if ((ipt - startt) > longest) { + longest = (int)(ipt - startt); + *matchpos = reft; + *startpos = startt; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + 
} + return longest; +} + +static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, + int ml, const u8 *ref) +{ + int length, len; + u8 *token; + + /* Encode Literal length */ + length = (int)(*ip - *anchor); + token = (*op)++; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *(*op)++ = 255; + *(*op)++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(*anchor, *op, length); + + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref)); + + /* Encode MatchLength */ + len = (int)(ml - MINMATCH); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *(*op)++ = 255; + *(*op)++ = 255; + } + if (len > 254) { + len -= 255; + *(*op)++ = 255; + } + *(*op)++ = (u8)len; + } else + *token += len; + + /* Prepare next loop */ + *ip += ml; + *anchor = *ip; + + return 0; +} + +static int lz4_compresshcctx(struct lz4hc_data *ctx, + const char *source, + char *dest, + int isize) +{ + const u8 *ip = (const u8 *)source; + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + const u8 *const matchlimit = (iend - LASTLITERALS); + + u8 *op = (u8 *)dest; + + int ml, ml2, ml3, ml0; + const u8 *ref = NULL; + const u8 *start2 = NULL; + const u8 *ref2 = NULL; + const u8 *start3 = NULL; + const u8 *ref3 = NULL; + const u8 *start0; + const u8 *ref0; + int lastrun; + + ip++; + + /* Main Loop */ + while (ip < mflimit) { + ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); + if (!ml) { + ip++; + continue; + } + + /* saved, in case we would skip too much */ + start0 = ip; + ref0 = ref; + ml0 = ml; +_search2: + if (ip+ml < mflimit) + ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, + ip + 1, matchlimit, ml, &ref2, &start2); + else + ml2 = ml; + /* No better match */ + if (ml2 == ml) { + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + continue; + } + + if (start0 < ip) { + /* empirical */ + if (start2 < ip + ml0) { + ip = start0; + ref = ref0; + ml = ml0; + } + } + /* + * Here, start0==ip + * First Match too small : removed + */ + if ((start2 - ip) < 3) { + ml = ml2; + ip = start2; + ref = ref2; + goto _search2; + } + +_search3: + /* + * Currently we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) + */ + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) + new_ml = OPTIMAL_ML; + if (ip + new_ml > start2 + ml2 - MINMATCH) + new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* + * Now, we have start2 = ip+new_ml, + * with new_ml=min(ml, OPTIMAL_ML=18) + */ + if (start2 + ml2 < mflimit) + ml3 = lz4hc_insertandgetwidermatch(ctx, + start2 + ml2 - 3, start2, matchlimit, + ml2, &ref3, &start3); + else + ml3 = ml2; + + /* No better match : 2 sequences to encode */ + if (ml3 == ml2) { + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) + ml = (int)(start2 - ip); + + /* Now, encode 2 sequences */ + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start2; + lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); + continue; + } + + /* Not enough space for match 2 : remove it */ + if (start3 < ip + ml + 3) { + /* + * can write Seq1 immediately ==> Seq2 is removed, + * so Seq3 becomes Seq1 + */ + if (start3 >= (ip + ml)) { + if (start2 < ip + ml) { + int 
correction = + (int)(ip + ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _search3; + } + + /* + * OK, now we have 3 ascending matches; let's write at least + * the first one ip & ref are known; Now for ml + */ + if (start2 < ip + ml) { + if ((start2 - ip) < (int)ML_MASK) { + int correction; + if (ml > OPTIMAL_ML) + ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) + ml = (int)(start2 - ip) + ml2 + - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } else + ml = (int)(start2 - ip); + } + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + + ip = start2; + ref = ref2; + ml = ml2; + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + + goto _search3; + } + + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8) lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int) (((char *)op) - dest); +} + +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; + lz4hc_init(hc4, (const u8 *)src); + out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, + (char *)dst, (int)src_len); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + return 0; + +exit: + return ret; +} +EXPORT_SYMBOL_GPL(lz4hc_compress); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC compressor"); diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c new file mode 100644 index 000000000000..b2bfa1057f7b --- /dev/null +++ b/lib/percpu-refcount.c @@ -0,0 +1,251 @@ +#define pr_fmt(fmt) "%s: " fmt "\n", __func__ + +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/percpu-refcount.h> +#include <linux/rcupdate.h> + +/* + * A percpu refcount can be in 4 different modes. The state is tracked in the + * low two bits of percpu_ref->pcpu_count: + * + * PCPU_REF_NONE - the initial state, no percpu counters allocated. + * + * PCPU_REF_PTR - using percpu counters for the refcount. + * + * PCPU_REF_DYING - we're shutting down so get()/put() should use the embedded + * atomic counter, but we're not finished updating the atomic counter from the + * percpu counters - this means that percpu_ref_put() can't check for the ref + * hitting 0 yet. + * + * PCPU_REF_DEAD - we've finished the teardown sequence, percpu_ref_put() should + * now check for the ref hitting 0. + * + * In PCPU_REF_NONE mode, we need to count the number of times percpu_ref_get() + * is called; this is done with the high bits of the raw atomic counter. We also + * track the time, in jiffies, when the get count last wrapped - this is done + * with the remaining bits of percpu_ref->percpu_count. 
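[Note: condensing the design comment above into the concrete bit layout, using PCPU_COUNT_BITS and the status bits defined just below:

    /*
     * atomic64_t ref->count:
     *   bits 63..50  times percpu_ref_get() ran (overflow of this field
     *                drives the switch-to-percpu heuristic)
     *   bits 49..0   the reference count itself (PCPU_COUNT_MASK)
     *
     * unsigned long ref->pcpu_count:
     *   bits  1..0   state: PCPU_REF_{PTR,NONE,DYING,DEAD}
     *   bits 63..2   percpu pointer (PTR mode) or the jiffies value of
     *                the last get-count wrap (NONE mode)
     */
]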
+ * + * So, when percpu_ref_get() is called it increments the get count and checks if + * it wrapped; if it did, it checks if the last time it wrapped was less than + * one second ago; if so, we want to allocate percpu counters. + * + * PCPU_COUNT_BITS determines the threshold where we convert to percpu: of the + * raw 64 bit counter, we use PCPU_COUNT_BITS for the refcount, and the + * remaining (high) bits to count the number of times percpu_ref_get() has been + * called. It's currently (completely arbitrarily) 16384 times in one second. + * + * Percpu mode (PCPU_REF_PTR): + * + * In percpu mode all we do on get and put is increment or decrement the cpu + * local counter, which is a 32 bit unsigned int. + * + * Note that all the gets() could be happening on one cpu, and all the puts() on + * another - the individual cpu counters can wrap (potentially many times). + * + * But this is fine because we don't need to check for the ref hitting 0 in + * percpu mode; before we set the state to PCPU_REF_DEAD we simply sum up all + * the percpu counters and add them to the atomic counter. Since addition and + * subtraction in modular arithmatic is still associative, the result will be + * correct. + */ + +#define PCPU_COUNT_BITS 50 +#define PCPU_COUNT_MASK ((1LL << PCPU_COUNT_BITS) - 1) + +#define PCPU_STATUS_BITS 2 +#define PCPU_STATUS_MASK ((1 << PCPU_STATUS_BITS) - 1) + +#define PCPU_REF_PTR 0 +#define PCPU_REF_NONE 1 +#define PCPU_REF_DYING 2 +#define PCPU_REF_DEAD 3 + +#define REF_STATUS(count) (count & PCPU_STATUS_MASK) + +/** + * percpu_ref_init - initialize a dynamic percpu refcount + * + * Initializes the refcount in single atomic counter mode with a refcount of 1; + * analagous to atomic_set(ref, 1). + */ +void percpu_ref_init(struct percpu_ref *ref) +{ + unsigned long now = jiffies; + + atomic64_set(&ref->count, 1); + + now <<= PCPU_STATUS_BITS; + now |= PCPU_REF_NONE; + + ref->pcpu_count = now; +} + +static void percpu_ref_alloc(struct percpu_ref *ref, unsigned long pcpu_count) +{ + unsigned long new, now = jiffies; + + now <<= PCPU_STATUS_BITS; + now |= PCPU_REF_NONE; + + if (now - pcpu_count <= HZ << PCPU_STATUS_BITS) { + rcu_read_unlock(); + new = (unsigned long) alloc_percpu(unsigned); + rcu_read_lock(); + + if (!new) + goto update_time; + + BUG_ON(new & PCPU_STATUS_MASK); + + if (cmpxchg(&ref->pcpu_count, pcpu_count, new) != pcpu_count) + free_percpu((void __percpu *) new); + else + pr_debug("created"); + } else { +update_time: + new = now; + cmpxchg(&ref->pcpu_count, pcpu_count, new); + } +} + +void __percpu_ref_get(struct percpu_ref *ref, bool alloc) +{ + unsigned long pcpu_count; + uint64_t v; + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + if (REF_STATUS(pcpu_count) == PCPU_REF_PTR) { + /* for rcu - we're not using rcu_dereference() */ + smp_read_barrier_depends(); + __this_cpu_inc(*((unsigned __percpu *) pcpu_count)); + } else { + v = atomic64_add_return(1 + (1ULL << PCPU_COUNT_BITS), + &ref->count); + + /* + * The high bits of the counter count the number of gets() that + * have occured; we check for overflow to call + * percpu_ref_alloc() every (1 << (64 - PCPU_COUNT_BITS)) + * iterations. + */ + + if (!(v >> PCPU_COUNT_BITS) && + REF_STATUS(pcpu_count) == PCPU_REF_NONE && alloc) + percpu_ref_alloc(ref, pcpu_count); + } +} + +/** + * percpu_ref_put - decrement a dynamic percpu refcount + * + * Returns true if the result is 0, otherwise false; only checks for the ref + * hitting 0 after percpu_ref_kill() has been called. Analagous to + * atomic_dec_and_test(). 
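[Note: a sketch of the intended lifecycle for a hypothetical user, assuming the header provides the usual percpu_ref_get() wrapper around __percpu_ref_get(); put() only reports zero after kill(), exactly as documented above:

    #include <linux/percpu-refcount.h>
    #include <linux/slab.h>

    struct my_obj {                     /* hypothetical user */
            struct percpu_ref ref;
    };

    static void get_obj(struct my_obj *o)
    {
            percpu_ref_get(&o->ref);    /* may switch to percpu mode */
    }

    static void put_obj(struct my_obj *o)
    {
            /* only returns true once percpu_ref_kill() has run */
            if (percpu_ref_put(&o->ref))
                    kfree(o);
    }

    static void shutdown_obj(struct my_obj *o)
    {
            if (percpu_ref_kill(&o->ref))   /* true for first caller only */
                    put_obj(o);             /* drop the initial reference */
    }
]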
+ */ +int percpu_ref_put(struct percpu_ref *ref) +{ + unsigned long pcpu_count; + uint64_t v; + int ret = 0; + + rcu_read_lock(); + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + switch (REF_STATUS(pcpu_count)) { + case PCPU_REF_PTR: + /* for rcu - we're not using rcu_dereference() */ + smp_read_barrier_depends(); + __this_cpu_dec(*((unsigned __percpu *) pcpu_count)); + break; + case PCPU_REF_NONE: + case PCPU_REF_DYING: + atomic64_dec(&ref->count); + break; + case PCPU_REF_DEAD: + v = atomic64_dec_return(&ref->count); + v &= PCPU_COUNT_MASK; + + ret = v == 0; + break; + } + + rcu_read_unlock(); + + return ret; +} + +/** + * percpu_ref_kill - prepare a dynamic percpu refcount for teardown + * + * Must be called before dropping the initial ref, so that percpu_ref_put() + * knows to check for the refcount hitting 0. If the refcount was in percpu + * mode, converts it back to single atomic counter mode. + * + * Returns true the first time called on @ref and false if @ref is already + * shutting down, so it may be used by the caller for synchronizing other parts + * of a two stage shutdown. + */ +int percpu_ref_kill(struct percpu_ref *ref) +{ + unsigned long old, new, status, pcpu_count; + + pcpu_count = ACCESS_ONCE(ref->pcpu_count); + + do { + status = REF_STATUS(pcpu_count); + + switch (status) { + case PCPU_REF_PTR: + new = PCPU_REF_DYING; + break; + case PCPU_REF_NONE: + new = PCPU_REF_DEAD; + break; + case PCPU_REF_DYING: + case PCPU_REF_DEAD: + return 0; + } + + old = pcpu_count; + pcpu_count = cmpxchg(&ref->pcpu_count, old, new); + } while (pcpu_count != old); + + if (status == PCPU_REF_PTR) { + unsigned count = 0, cpu; + + synchronize_rcu(); + + for_each_possible_cpu(cpu) + count += *per_cpu_ptr((unsigned __percpu *)pcpu_count, + cpu); + + pr_debug("global %lli pcpu %i", + atomic64_read(&ref->count) & PCPU_COUNT_MASK, + (int) count); + + atomic64_add((int) count, &ref->count); + smp_wmb(); + /* Between setting global count and setting PCPU_REF_DEAD */ + ref->pcpu_count = PCPU_REF_DEAD; + + free_percpu((unsigned __percpu *) pcpu_count); + } + + return 1; +} + +/** + * percpu_ref_dead - check if a dynamic percpu refcount is shutting down + * + * Returns true if percpu_ref_kill() has been called on @ref, false otherwise. + */ +int percpu_ref_dead(struct percpu_ref *ref) +{ + unsigned status = REF_STATUS(ref->pcpu_count); + + return status == PCPU_REF_DYING || + status == PCPU_REF_DEAD; +} diff --git a/mm/Kconfig b/mm/Kconfig index e742d06285b7..b0a7dad663f3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -477,3 +477,15 @@ config FRONTSWAP and swap data is stored as normal on the matching swap device. If unsure, say Y to enable frontswap. + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. 
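[Note: the workflow this option enables, per Documentation/vm/soft-dirty.txt: write "4" to /proc/PID/clear_refs to clear the soft-dirty bits, let the task run, then test bit 55 of each page's pagemap entry. A hypothetical userspace probe:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int page_was_written(pid_t pid, uintptr_t vaddr, long pagesize)
    {
            char path[64];
            uint64_t ent = 0;
            int fd;

            snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
            fd = open(path, O_RDONLY);
            if (fd < 0)
                    return -1;
            /* pagemap holds one 8-byte entry per virtual page */
            if (pread(fd, &ent, sizeof(ent),
                      (vaddr / pagesize) * sizeof(ent)) != sizeof(ent)) {
                    close(fd);
                    return -1;
            }
            close(fd);
            return (int)((ent >> 55) & 1);  /* soft-dirty bit */
    }
]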
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 07dbc8ec46cf..2c8ce496804f 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -242,6 +242,7 @@ bool balloon_page_isolate(struct page *page) if (__is_movable_balloon_page(page) && page_count(page) == 2) { __isolate_balloon_page(page); + balloon_event_count(COMPACTBALLOONISOLATED); unlock_page(page); return true; } @@ -265,6 +266,7 @@ void balloon_page_putback(struct page *page) __putback_balloon_page(page); /* drop the extra ref count taken for page isolation */ put_page(page); + balloon_event_count(COMPACTBALLOONRETURNED); } else { WARN_ON(1); dump_page(page); diff --git a/mm/bounce.c b/mm/bounce.c index c9f0a4339a7d..708c1e99f57b 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -147,12 +147,14 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) bio_put(bio); } -static void bounce_end_io_write(struct bio *bio, int err) +static void bounce_end_io_write(struct bio *bio, int err, + struct batch_complete *batch) { bounce_end_io(bio, page_pool, err); } -static void bounce_end_io_write_isa(struct bio *bio, int err) +static void bounce_end_io_write_isa(struct bio *bio, int err, + struct batch_complete *batch) { bounce_end_io(bio, isa_page_pool, err); @@ -168,12 +170,14 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) bounce_end_io(bio, pool, err); } -static void bounce_end_io_read(struct bio *bio, int err) +static void bounce_end_io_read(struct bio *bio, int err, + struct batch_complete *batch) { __bounce_end_io_read(bio, page_pool, err); } -static void bounce_end_io_read_isa(struct bio *bio, int err) +static void bounce_end_io_read_isa(struct bio *bio, int err, + struct batch_complete *batch) { __bounce_end_io_read(bio, isa_page_pool, err); } diff --git a/mm/dmapool.c b/mm/dmapool.c index c69781e97cf9..668f26316e2e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -132,6 +132,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; + int node; if (align == 0) { align = 1; @@ -156,7 +157,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, return NULL; } - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + node = WARN_ON(!dev) ? -1 : dev_to_node(dev); + + retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03a89a2f464b..d252c6e0fd03 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1429,7 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (ret == 1) { pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(&mm->page_table_lock); } out: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f1d92163f30..fe4f123f1170 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -319,14 +319,31 @@ struct mem_cgroup { /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; - /* For oom notifier event fd */ - struct list_head oom_notify; + union { + /* For oom notifier event fd */ + struct list_head oom_notify; + /* + * we can only trigger an oom event if the memcg is alive. + * so we will reuse this field to hook the memcg in the list + * of dead memcgs. 
+	 */
+	struct list_head dead;
+	};
 
-	/*
-	 * Should we move charges of a task when a task is moved into this
-	 * mem_cgroup ? And what type of charges should we move ?
-	 */
-	unsigned long move_charge_at_immigrate;
+	union {
+		/*
+		 * Should we move charges of a task when a task is moved into
+		 * this mem_cgroup ? And what type of charges should we move ?
+		 */
+		unsigned long move_charge_at_immigrate;
+
+		/*
+		 * We are no longer concerned about moving charges after memcg
+		 * is dead. So we will fill this up with its name, to aid
+		 * debugging.
+		 */
+		char *memcg_name;
+	};
 	/*
 	 * set > 0 if pages under this cgroup are moving to other cgroup.
 	 */
@@ -380,6 +397,55 @@ static size_t memcg_size(void)
 		nr_node_ids * sizeof(struct mem_cgroup_per_node);
 }
 
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static LIST_HEAD(dangling_memcgs);
+static DEFINE_MUTEX(dangling_memcgs_mutex);
+
+static inline void memcg_dangling_free(struct mem_cgroup *memcg)
+{
+	mutex_lock(&dangling_memcgs_mutex);
+	list_del(&memcg->dead);
+	mutex_unlock(&dangling_memcgs_mutex);
+	free_pages((unsigned long)memcg->memcg_name, 0);
+}
+
+static inline void memcg_dangling_add(struct mem_cgroup *memcg)
+{
+	/*
+	 * cgroup.c will do page-sized allocations most of the time,
+	 * so we'll just follow the pattern. Also, __get_free_pages
+	 * is a better interface than kmalloc for us here, because
+	 * we'd like this memory to be always billed to the root cgroup,
+	 * not to the process removing the memcg. While kmalloc would
+	 * require us to wrap it into memcg_stop/resume_kmem_account,
+	 * with __get_free_pages we just don't pass the memcg flag.
+	 */
+	memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0);
+
+	/*
+	 * we will, in general, just ignore failures. No need to go crazy,
+	 * since this is just a debugging interface. It is nice to copy a
+	 * memcg name over, but if we (unlikely) can't, just the address
+	 * will do.
+	 */
+	if (!memcg->memcg_name)
+		goto add_list;
+
+	if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) {
+		free_pages((unsigned long)memcg->memcg_name, 0);
+		memcg->memcg_name = NULL;
+	}
+
+add_list:
+	INIT_LIST_HEAD(&memcg->dead);
+	mutex_lock(&dangling_memcgs_mutex);
+	list_add(&memcg->dead, &dangling_memcgs);
+	mutex_unlock(&dangling_memcgs_mutex);
+}
+#else
+static inline void memcg_dangling_free(struct mem_cgroup *memcg) {}
+static inline void memcg_dangling_add(struct mem_cgroup *memcg) {}
+#endif
+
 /* internal only representation about the status of kmem accounting. */
 enum {
 	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
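The name captured by memcg_dangling_add() above is what the read side (added in the next hunk) prints, one stanza per dead group, followed by its swap, tcp and kmem residues. A hypothetical read of memory.dangling_memcgs, with all values and cache names invented for illustration, would look like:

	/removed/group:
		1048576 swap bytes
		16384 tcp bytes, in 2 sockets
		65536 kmem bytes in caches dentry kmalloc-256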
@@ -5076,6 +5142,107 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static void
+mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_SWAP
+	u64 kmem;
+	u64 memsw;
+
+	/*
+	 * kmem will also propagate here, so we are only interested in the
+	 * difference. See comment in mem_cgroup_reparent_charges for details.
+	 *
+	 * We could save this value for later consumption by kmem reports, but
+	 * it is not much of a problem if the figures differ slightly.
+	 */
+	kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+	memsw = res_counter_read_u64(&memcg->memsw, RES_USAGE) - kmem;
+	seq_printf(m, "\t%llu swap bytes\n", memsw);
+#endif
+}
+
+
+static void
+mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+	struct tcp_memcontrol *tcp = &memcg->tcp_mem;
+	s64 tcp_socks;
+	u64 tcp_bytes;
+
+	tcp_socks = percpu_counter_sum_positive(&tcp->tcp_sockets_allocated);
+	tcp_bytes = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
+	seq_printf(m, "\t%llu tcp bytes", tcp_bytes);
+	/*
+	 * if tcp_bytes == 0, tcp_socks != 0 is a bug. One more reason to print
+	 * it!
+	 */
+	if (tcp_bytes || tcp_socks)
+		seq_printf(m, ", in %lld sockets", tcp_socks);
+	seq_printf(m, "\n");
+
+#endif
+}
+
+static void
+mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_KMEM
+	u64 kmem;
+	struct memcg_cache_params *params;
+
+	kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+	seq_printf(m, "\t%llu kmem bytes", kmem);
+
+	/* list below may not be initialized, so don't even try */
+	if (!kmem)
+		return;
+
+	seq_printf(m, " in caches");
+	mutex_lock(&memcg->slab_caches_mutex);
+	list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+		struct kmem_cache *s = memcg_params_to_cache(params);
+
+		seq_printf(m, " %s", s->name);
+	}
+	mutex_unlock(&memcg->slab_caches_mutex);
+	seq_printf(m, "\n");
+#endif
+}
+
+/*
+ * After a memcg is destroyed, it may still be kept around in memory.
+ * Currently, the two main reasons for it are swap entries and kernel memory.
+ * Because they will be freed asynchronously, they will pin the memcg structure
+ * and its resources until the last reference goes away.
+ *
+ * This root-only file will show information about which users are still
+ * holding references to each dead memcg.
+ */
+static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft,
+					struct seq_file *m)
+{
+	struct mem_cgroup *memcg;
+
+	mutex_lock(&dangling_memcgs_mutex);
+
+	list_for_each_entry(memcg, &dangling_memcgs, dead) {
+		if (memcg->memcg_name)
+			seq_printf(m, "%s:\n", memcg->memcg_name);
+		else
+			seq_printf(m, "%p (name lost):\n", memcg);
+
+		mem_cgroup_dangling_swap(memcg, m);
+		mem_cgroup_dangling_tcp(memcg, m);
+		mem_cgroup_dangling_kmem(memcg, m);
+	}
+
+	mutex_unlock(&dangling_memcgs_mutex);
+	return 0;
+}
+#endif
+
 static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 {
 	int ret = -EINVAL;
@@ -5977,6 +6144,14 @@ static struct cftype mem_cgroup_files[] = {
 	},
 #endif
 #endif
+
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+	{
+		.name = "dangling_memcgs",
+		.read_seq_string = mem_cgroup_dangling_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+#endif
 	{ },	/* terminate */
 };
 
@@ -6126,6 +6301,8 @@ static void free_work(struct work_struct *work)
 	struct mem_cgroup *memcg;
 
 	memcg = container_of(work, struct mem_cgroup, work_freeing);
+
+	memcg_dangling_free(memcg);
 	__mem_cgroup_free(memcg);
 }
 
@@ -6320,6 +6497,7 @@ static void mem_cgroup_css_free(struct cgroup *cont)
 
 	kmem_cgroup_destroy(memcg);
 
+	memcg_dangling_add(memcg);
 	mem_cgroup_put(memcg);
 }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 27ed22579fd9..c9c5eeebe5a4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -876,6 +876,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 			dec_zone_page_state(page, NR_ISOLATED_ANON +
 					    page_is_file_cache(page));
 			balloon_page_free(page);
+			balloon_event_count(COMPACTBALLOONMIGRATED);
 			return MIGRATEPAGE_SUCCESS;
 		}
 out:
diff --git a/mm/mmap.c b/mm/mmap.c
index 
da3e9c04bf37..eb390ee646c5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1867,15 +1867,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } #endif -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the lowest possible address? - */ - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) - mm->free_area_cache = addr; -} - /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): @@ -1932,19 +1923,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, } #endif -void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the highest possible address? - */ - if (addr > mm->free_area_cache) - mm->free_area_cache = addr; - - /* dont allow allocations above current base */ - if (mm->free_area_cache > mm->mmap_base) - mm->free_area_cache = mm->mmap_base; -} - unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -2365,7 +2343,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; - unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; @@ -2382,11 +2359,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - if (mm->unmap_area == arch_unmap_area) - addr = prev ? prev->vm_end : mm->mmap_base; - else - addr = vma ? vma->vm_start : mm->mmap_base; - mm->unmap_area(mm, addr); mm->mmap_cache = NULL; /* Kill the cache. */ } diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4ed355..8a8cd0265e52 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -14,9 +14,6 @@ * use_mm * Makes the calling kernel thread take on the specified * mm context. - * Called by the retry thread execute retries within the - * iocb issuer's mm context, so that copy_from/to_user - * operations work seamlessly for aio. * (Note: this routine is intended to be called only * from a kernel thread context) */ diff --git a/mm/mremap.c b/mm/mremap.c index 463a25705ac6..3708655378e9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); - set_pte_at(mm, new_addr, new_pte, pte); + set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte)); } arch_leave_lazy_mmu_mode(); diff --git a/mm/nommu.c b/mm/nommu.c index 298884dcd6e7..886e07ce4d1d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1869,10 +1869,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 98cbdf6e5532..76350aacdfe5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3926,8 +3926,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * exist on hotplugged memory. 
*/ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { + pfn = ALIGN(pfn + MAX_ORDER_NR_PAGES, + MAX_ORDER_NR_PAGES) - 1; continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; } diff --git a/mm/page_io.c b/mm/page_io.c index e9a3f6334352..a29407666467 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/frontswap.h> +#include <linux/aio.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -41,7 +42,8 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, return bio; } -void end_swap_bio_write(struct bio *bio, int err) +void end_swap_bio_write(struct bio *bio, int err, + struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -67,7 +69,7 @@ void end_swap_bio_write(struct bio *bio, int err) bio_put(bio); } -void end_swap_bio_read(struct bio *bio, int err) +void end_swap_bio_read(struct bio *bio, int err, struct batch_complete *batch) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -202,7 +204,8 @@ out: } int __swap_writepage(struct page *page, struct writeback_control *wbc, - void (*end_write_func)(struct bio *, int)) + void (*end_write_func)(struct bio *bio, int err, + struct batch_complete *batch)) { struct bio *bio; int ret = 0, rw = WRITE; diff --git a/mm/shmem.c b/mm/shmem.c index 39b2a0b86fe8..5e6a8422658b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,6 +31,7 @@ #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> +#include <linux/aio.h> static struct vfsmount *shm_mnt; diff --git a/mm/swap.c b/mm/swap.c index acd40bfffa82..dfd7d71d6841 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,6 +30,7 @@ #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> +#include <linux/uio.h> #include "internal.h" diff --git a/mm/util.c b/mm/util.c index ab1424dbe2e6..7441c41d00f6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index f42745e65780..7a35116462a7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -794,7 +794,14 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", -#endif + +#ifdef CONFIG_BALLOON_COMPACTION + "compact_balloon_isolated", + "compact_balloon_migrated", + "compact_balloon_returned", +#endif /* CONFIG_BALLOON_COMPACTION */ + +#endif /* CONFIG_COMPACTION */ #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5b142fb16480..70146496e73a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1716,9 +1716,9 @@ static struct ctl_table vs_vars[] = { }, { .procname = "sync_qlen_max", - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "sync_sock_size", diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 51bb3de680b6..a0ab6d7599f1 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -311,6 +311,11 @@ cmd_lzo = (cat $(filter-out FORCE,$^) | \ lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) +quiet_cmd_lz4 
= LZ4     $@
+cmd_lz4 = (cat $(filter-out FORCE,$^) | \
+	lz4demo -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+	(rm -f $@ ; false)
+
 # U-Boot mkimage
 # ---------------------------------------------------------------------------
 
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 8bbefc3b55d4..d4f1468b9b50 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -16,6 +16,8 @@
 #include <linux/key-type.h>
 #include <linux/task_work.h>
 
+struct iovec;
+
 #ifdef __KDEBUG
 #define kenter(FMT, ...) \
 	printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 4b5c948eb414..33cfd27b4de2 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -22,6 +22,7 @@
 #include <linux/err.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/uio.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 23e3c46cd0a4..ccfa383f1fda 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -25,7 +25,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/pm_qos.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
 #include <linux/dma-mapping.h>
 #include <sound/core.h>
 #include <sound/control.h>
diff --git a/sound/soc/codecs/si476x.c b/sound/soc/codecs/si476x.c
index 721587c9cd84..73e205c892a0 100644
--- a/sound/soc/codecs/si476x.c
+++ b/sound/soc/codecs/si476x.c
@@ -38,9 +38,9 @@ enum si476x_digital_io_output_format {
 	SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT = 8,
 };
 
-#define SI476X_DIGITAL_IO_OUTPUT_WIDTH_MASK	((0b111 << SI476X_DIGITAL_IO_SLOT_SIZE_SHIFT) | \
-						 (0b111 << SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT))
-#define SI476X_DIGITAL_IO_OUTPUT_FORMAT_MASK	(0b1111110)
+#define SI476X_DIGITAL_IO_OUTPUT_WIDTH_MASK	((0x7 << SI476X_DIGITAL_IO_SLOT_SIZE_SHIFT) | \
+						 (0x7 << SI476X_DIGITAL_IO_SAMPLE_SIZE_SHIFT))
+#define SI476X_DIGITAL_IO_OUTPUT_FORMAT_MASK	(0x7e)
 
 enum si476x_daudio_formats {
 	SI476X_DAUDIO_MODE_I2S	= (0x0 << 1),
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index d4abc59ce1d9..a2c214e39fd4 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -2,6 +2,7 @@ TARGETS = breakpoints
 TARGETS += cpu-hotplug
 TARGETS += efivarfs
 TARGETS += kcmp
+TARGETS += timers
 TARGETS += memory-hotplug
 TARGETS += mqueue
 TARGETS += net
diff --git a/tools/testing/selftests/timers/Makefile b/tools/testing/selftests/timers/Makefile
new file mode 100644
index 000000000000..eb2859f4ad21
--- /dev/null
+++ b/tools/testing/selftests/timers/Makefile
@@ -0,0 +1,8 @@
+all:
+	gcc posix_timers.c -o posix_timers -lrt
+
+run_tests: all
+	./posix_timers
+
+clean:
+	rm -f ./posix_timers
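The run_tests target just executes the binary. Going by the printf() calls in the test source that follows, a fully passing run should print output along these lines (illustrative only):

	Testing posix timers. False negatives may happen for CPU-time
	based timers if other threads run on the CPU...
	Check itimer virtual... [OK]
	Check itimer prof... [OK]
	Check itimer real... [OK]
	Check timer_create() per thread... [OK]
	Check timer_create() per process... [OK]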
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
new file mode 100644
index 000000000000..4fa655d68a81
--- /dev/null
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2013 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Licensed under the terms of the GNU GPL License version 2
+ *
+ * Selftests for a few posix timer interfaces.
+ *
+ * Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com>
+ */
+
+#include <sys/time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+
+#define DELAY 2
+#define USECS_PER_SEC 1000000
+
+static volatile int done;
+
+/* Busy loop in userspace to elapse ITIMER_VIRTUAL */
+static void user_loop(void)
+{
+	while (!done);
+}
+
+/*
+ * Try to spend as much time as possible in kernelspace
+ * to elapse ITIMER_PROF.
+ */
+static void kernel_loop(void)
+{
+	void *addr = sbrk(0);
+
+	while (!done) {
+		brk(addr + 4096);
+		brk(addr);
+	}
+}
+
+/*
+ * Sleep until ITIMER_REAL expiration.
+ */
+static void idle_loop(void)
+{
+	pause();
+}
+
+static void sig_handler(int nr)
+{
+	done = 1;
+}
+
+/*
+ * Check the expected timer expiration matches the GTOD elapsed delta since
+ * we armed the timer. Keep a 0.5 sec error margin due to various jitter.
+ */
+static int check_diff(struct timeval start, struct timeval end)
+{
+	long long diff;
+
+	diff = end.tv_usec - start.tv_usec;
+	diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC;
+
+	if (llabs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) {
+		printf("Diff too high: %lld..", diff);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int check_itimer(int which)
+{
+	int err;
+	struct timeval start, end;
+	struct itimerval val = {
+		.it_value.tv_sec = DELAY,
+	};
+
+	printf("Check itimer ");
+
+	if (which == ITIMER_VIRTUAL)
+		printf("virtual... ");
+	else if (which == ITIMER_PROF)
+		printf("prof... ");
+	else if (which == ITIMER_REAL)
+		printf("real... ");
+
+	fflush(stdout);
+
+	done = 0;
+
+	if (which == ITIMER_VIRTUAL)
+		signal(SIGVTALRM, sig_handler);
+	else if (which == ITIMER_PROF)
+		signal(SIGPROF, sig_handler);
+	else if (which == ITIMER_REAL)
+		signal(SIGALRM, sig_handler);
+
+	err = gettimeofday(&start, NULL);
+	if (err < 0) {
+		perror("Can't call gettimeofday()\n");
+		return -1;
+	}
+
+	err = setitimer(which, &val, NULL);
+	if (err < 0) {
+		perror("Can't set timer\n");
+		return -1;
+	}
+
+	if (which == ITIMER_VIRTUAL)
+		user_loop();
+	else if (which == ITIMER_PROF)
+		kernel_loop();
+	else if (which == ITIMER_REAL)
+		idle_loop();
+
+	err = gettimeofday(&end, NULL);
+	if (err < 0) {
+		perror("Can't call gettimeofday()\n");
+		return -1;
+	}
+
+	if (!check_diff(start, end))
+		printf("[OK]\n");
+	else
+		printf("[FAIL]\n");
+
+	return 0;
+}
+
+static int check_timer_create(int which)
+{
+	int err;
+	timer_t id;
+	struct timeval start, end;
+	struct itimerspec val = {
+		.it_value.tv_sec = DELAY,
+	};
+
+	printf("Check timer_create() ");
+	if (which == CLOCK_THREAD_CPUTIME_ID) {
+		printf("per thread... ");
+	} else if (which == CLOCK_PROCESS_CPUTIME_ID) {
+		printf("per process... ");
+	}
+	fflush(stdout);
+
+	done = 0;
+	err = timer_create(which, NULL, &id);
+	if (err < 0) {
+		perror("Can't create timer\n");
+		return -1;
+	}
+	signal(SIGALRM, sig_handler);
+
+	err = gettimeofday(&start, NULL);
+	if (err < 0) {
+		perror("Can't call gettimeofday()\n");
+		return -1;
+	}
+
+	err = timer_settime(id, 0, &val, NULL);
+	if (err < 0) {
+		perror("Can't set timer\n");
+		return -1;
+	}
+
+	user_loop();
+
+	err = gettimeofday(&end, NULL);
+	if (err < 0) {
+		perror("Can't call gettimeofday()\n");
+		return -1;
+	}
+
+	if (!check_diff(start, end))
+		printf("[OK]\n");
+	else
+		printf("[FAIL]\n");
+
+	return 0;
+}
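check_timer_create() above passes a NULL sigevent, which on Linux behaves like an explicit request for SIGEV_SIGNAL with SIGALRM and the timer ID in sigev_value. For reference, a standalone sketch of roughly that explicit form; this is illustrative only, not part of the selftest, and links with -lrt as the Makefile above does:

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <time.h>

	int main(void)
	{
		timer_t id;
		struct sigevent sev;

		/* roughly what a NULL sigevent defaults to (the default
		   additionally carries the timer ID in sev.sigev_value) */
		memset(&sev, 0, sizeof(sev));
		sev.sigev_notify = SIGEV_SIGNAL;
		sev.sigev_signo = SIGALRM;

		if (timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &id) < 0) {
			perror("timer_create");
			return 1;
		}
		timer_delete(id);
		return 0;
	}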
+
+int main(int argc, char **argv)
+{
+	printf("Testing posix timers. False negatives may happen for CPU-time\n");
+	printf("based timers if other threads run on the CPU...\n");
+
+	if (check_itimer(ITIMER_VIRTUAL) < 0)
+		return -1;
+
+	if (check_itimer(ITIMER_PROF) < 0)
+		return -1;
+
+	if (check_itimer(ITIMER_REAL) < 0)
+		return -1;
+
+	if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0)
+		return -1;
+
+	/*
+	 * It's unfortunately hard to reliably test a timer expiration
+	 * on parallel multithread cputime. We could arm it to expire
+	 * on DELAY * nr_threads, with nr_threads busy looping, then wait
+	 * the normal DELAY since the time is elapsing nr_threads faster.
+	 * But for that we need to ensure we have real physical free CPUs
+	 * to ensure true parallelism. So test only one thread until we
+	 * find a better solution.
+	 */
+	if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
+		return -1;
+
+	return 0;
+}
diff --git a/usr/Kconfig b/usr/Kconfig
index 085872bb2bb5..642f503d3e9f 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -90,6 +90,15 @@ config RD_LZO
 	  Support loading of a LZO encoded initial ramdisk or cpio buffer
 	  If unsure, say N.
 
+config RD_LZ4
+	bool "Support initial ramdisks compressed using LZ4" if EXPERT
+	default !EXPERT
+	depends on BLK_DEV_INITRD
+	select DECOMPRESS_LZ4
+	help
+	  Support loading of a LZ4 encoded initial ramdisk or cpio buffer.
+	  If unsure, say N.
+
 choice
 	prompt "Built-in initramfs compression mode" if INITRAMFS_SOURCE!=""
 	help