diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2019-05-14 13:37:22 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2019-05-14 13:37:22 +1000 |
commit | c32d7d0b94c4a228969f4c0cd88c078316b4c322 (patch) | |
tree | c2daf214671787d9c2bea8f8919c44fd39c4b689 | |
parent | c372a7379a1ce0a00ee21ef44272499825365a8d (diff) | |
parent | b78fc46079e444c5bdf3c2b843cccc2a195d1f75 (diff) |
Merge branch 'akpm-current/current'
354 files changed, 10010 insertions, 3858 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 29601d93a1c2..d742c6cfdffb 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -106,6 +106,15 @@ Description: are from ZONE_DMA. Available when CONFIG_ZONE_DMA is enabled. +What: /sys/kernel/slab/cache/cache_dma32 +Date: December 2018 +KernelVersion: 4.21 +Contact: Nicolas Boichat <drinkcat@chromium.org> +Description: + The cache_dma32 file is read-only and specifies whether objects + are from ZONE_DMA32. + Available when CONFIG_ZONE_DMA32 is enabled. + What: /sys/kernel/slab/cache/cpu_slabs Date: May 2007 KernelVersion: 2.6.22 diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index 7e71c9c1d8e9..5cbe5659e3b7 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt @@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time spikes which wouldn't necessarily make a dent in the time averages, or to average trends over custom time frames. +Monitoring for pressure thresholds +================================== + +Users can register triggers and use poll() to be woken up when resource +pressure exceeds certain thresholds. + +A trigger describes the maximum cumulative stall time over a specific +time window, e.g. 100ms of total stall time within any 500ms window to +generate a wakeup event. + +To register a trigger user has to open psi interface file under +/proc/pressure/ representing the resource to be monitored and write the +desired threshold and time window. The open file descriptor should be +used to wait for trigger events using select(), poll() or epoll(). +The following format is used: + +<some|full> <stall amount in us> <time window in us> + +For example writing "some 150000 1000000" into /proc/pressure/memory +would add 150ms threshold for partial memory stall measured within +1sec time window. Writing "full 50000 1000000" into /proc/pressure/io +would add 50ms threshold for full io stall measured within 1sec time window. + +Triggers can be set on more than one psi metric and more than one trigger +for the same psi metric can be specified. However for each trigger a separate +file descriptor is required to be able to poll it separately from others, +therefore for each trigger a separate open() syscall should be made even +when opening the same psi interface file. + +Monitors activate only when system enters stall state for the monitored +psi metric and deactivates upon exit from the stall state. While system is +in the stall state psi signal growth is monitored at a rate of 10 times per +tracking window. + +The kernel accepts window sizes ranging from 500ms to 10s, therefore min +monitoring update interval is 50ms and max is 1s. Min limit is set to +prevent overly frequent polling. Max limit is chosen as a high enough number +after which monitors are most likely not needed and psi averages can be used +instead. + +When activated, psi monitor stays active for at least the duration of one +tracking window to avoid repeated activations/deactivations when system is +bouncing in and out of the stall state. + +Notifications to the userspace are rate-limited to one per tracking window. + +The trigger will de-register when the file descriptor used to define the +trigger is closed. + +Userspace monitor usage example +=============================== + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <poll.h> +#include <string.h> +#include <unistd.h> + +/* + * Monitor memory partial stall with 1s tracking window size + * and 150ms threshold. + */ +int main() { + const char trig[] = "some 150000 1000000"; + struct pollfd fds; + int n; + + fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK); + if (fds.fd < 0) { + printf("/proc/pressure/memory open error: %s\n", + strerror(errno)); + return 1; + } + fds.events = POLLPRI; + + if (write(fds.fd, trig, strlen(trig) + 1) < 0) { + printf("/proc/pressure/memory write error: %s\n", + strerror(errno)); + return 1; + } + + printf("waiting for events...\n"); + while (1) { + n = poll(&fds, 1, -1); + if (n < 0) { + printf("poll error: %s\n", strerror(errno)); + return 1; + } + if (fds.revents & POLLERR) { + printf("got POLLERR, event source is gone\n"); + return 0; + } + if (fds.revents & POLLPRI) { + printf("event triggered!\n"); + } else { + printf("unknown event received: 0x%x\n", fds.revents); + return 1; + } + } + + return 0; +} + Cgroup2 interface ================= @@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped into cgroups. Each subdirectory in the cgroupfs mountpoint contains cpu.pressure, memory.pressure, and io.pressure files; the format is the same as the /proc/pressure/ files. + +Per-cgroup psi monitors can be specified and used the same way as +system-wide ones. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 88e746074252..c55427269cc8 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -606,8 +606,8 @@ on an IO device and is an example of this type. Protections ----------- -A cgroup is protected to be allocated upto the configured amount of -the resource if the usages of all its ancestors are under their +A cgroup is protected upto the configured amount of the resource +as long as the usages of all its ancestors are under their protected levels. Protections can be hard guarantees or best effort soft boundaries. Protections can also be over-committed in which case only upto the amount available to the parent is protected among @@ -1047,7 +1047,10 @@ PAGE_SIZE multiple when read back. is within its effective min boundary, the cgroup's memory won't be reclaimed under any conditions. If there is no unprotected reclaimable memory available, OOM killer - is invoked. + is invoked. Above the effective min boundary (or + effective low boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective min boundary is limited by memory.min values of all ancestor cgroups. If there is memory.min overcommitment @@ -1069,7 +1072,10 @@ PAGE_SIZE multiple when read back. Best-effort memory protection. If the memory usage of a cgroup is within its effective low boundary, the cgroup's memory won't be reclaimed unless memory can be reclaimed - from unprotected cgroups. + from unprotected cgroups. Above the effective low boundary (or + effective min boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective low boundary is limited by memory.low values of all ancestor cgroups. If there is memory.low overcommitment @@ -2326,8 +2332,10 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it's within its low, -which makes delegation of subtrees possible. +reserve. A cgroup enjoys reclaim protection when it's within its +effective low, which makes delegation of subtrees possible. It also +enjoys having reclaim pressure proportional to its overage when +above its effective low. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 64e4df0c8488..268b10aa7a3d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1830,6 +1830,9 @@ ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt. + ipcmni_extend [KNL] Extend the maximum number of unique System V + IPC identifiers from 32,768 to 16,777,216. + irqaffinity= [SMP] Set the default irq affinity mask The argument is a cpu list, as described above. @@ -3146,6 +3149,16 @@ This will also cause panics on machine check exceptions. Useful together with panic=30 to trigger a reboot. + page_alloc.shuffle= + [KNL] Boolean flag to control whether the page allocator + should randomize its free lists. The randomization may + be automatically enabled if the kernel detects it is + running on a platform with a direct-mapped memory-side + cache, and this parameter can be used to + override/disable that behavior. The state of the flag + can be read from sysfs at: + /sys/module/page_alloc/parameters/shuffle. + page_owner= [KNL] Boot-time page_owner enabling option. Storage of the information about who allocated each page is disabled in default. With this switch, @@ -3171,6 +3184,7 @@ bit 2: print timer info bit 3: print locks info if CONFIG_LOCKDEP is on bit 4: print ftrace buffer + bit 5: print all printk messages in buffer panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). @@ -4026,7 +4040,9 @@ [[,]s[mp]#### \ [[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \ [[,]f[orce] - Where reboot_mode is one of warm (soft) or cold (hard) or gpio, + Where reboot_mode is one of warm (soft) or cold (hard) or gpio + (prefix with 'panic_' to set mode for panic + reboot only), reboot_type is one of bios, acpi, kbd, triple, efi, or pci, reboot_force is either force or not specified, reboot_cpu is s[mp]#### with #### being the processor diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst index 6b38a39fab24..a534cc7ebd05 100644 --- a/Documentation/core-api/genalloc.rst +++ b/Documentation/core-api/genalloc.rst @@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future. :functions: gen_pool_for_each_chunk .. kernel-doc:: lib/genalloc.c - :functions: addr_in_gen_pool + :functions: gen_pool_has_addr .. kernel-doc:: lib/genalloc.c :functions: gen_pool_avail diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 71f5d2fe39b7..a29c99d13331 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -147,10 +147,10 @@ Division Functions .. kernel-doc:: include/linux/math64.h :internal: -.. kernel-doc:: lib/div64.c +.. kernel-doc:: lib/math/div64.c :functions: div_s64_rem div64_u64_rem div64_u64 div64_s64 -.. kernel-doc:: lib/gcd.c +.. kernel-doc:: lib/math/gcd.c :export: UUID/GUID diff --git a/Documentation/dev-tools/gcov.rst b/Documentation/dev-tools/gcov.rst index 69a7d90c320a..46aae52a41d0 100644 --- a/Documentation/dev-tools/gcov.rst +++ b/Documentation/dev-tools/gcov.rst @@ -34,10 +34,6 @@ Configure the kernel with:: CONFIG_DEBUG_FS=y CONFIG_GCOV_KERNEL=y -select the gcc's gcov format, default is autodetect based on gcc version:: - - CONFIG_GCOV_FORMAT_AUTODETECT=y - and to get coverage data for the entire kernel:: CONFIG_GCOV_PROFILE_ALL=y @@ -169,6 +165,20 @@ b) gcov is run on the BUILD machine [user@build] gcov -o /tmp/coverage/tmp/out/init main.c +Note on compilers +----------------- + +GCC and LLVM gcov tools are not necessarily compatible. Use gcov_ to work with +GCC-generated .gcno and .gcda files, and use llvm-cov_ for Clang. + +.. _gcov: http://gcc.gnu.org/onlinedocs/gcc/Gcov.html +.. _llvm-cov: https://llvm.org/docs/CommandGuide/llvm-cov.html + +Build differences between GCC and Clang gcov are handled by Kconfig. It +automatically selects the appropriate gcov format depending on the detected +toolchain. + + Troubleshooting --------------- diff --git a/Documentation/devicetree/bindings/pps/pps-gpio.txt b/Documentation/devicetree/bindings/pps/pps-gpio.txt index 3683874832ae..9012a2a02e14 100644 --- a/Documentation/devicetree/bindings/pps/pps-gpio.txt +++ b/Documentation/devicetree/bindings/pps/pps-gpio.txt @@ -7,6 +7,10 @@ Required properties: - compatible: should be "pps-gpio" - gpios: one PPS GPIO in the format described by ../gpio/gpio.txt +Additional required properties for the PPS ECHO functionality: +- echo-gpios: one PPS ECHO GPIO in the format described by ../gpio/gpio.txt +- echo-active-ms: duration in ms of the active portion of the echo pulse + Optional properties: - assert-falling-edge: when present, assert is indicated by a falling edge (instead of by a rising edge) @@ -19,5 +23,8 @@ Example: gpios = <&gpio1 26 GPIO_ACTIVE_HIGH>; assert-falling-edge; + echo-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; + echo-active-ms = <100>; + compatible = "pps-gpio"; }; diff --git a/Documentation/filesystems/autofs-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt index 45edad6933cc..acc02fc57993 100644 --- a/Documentation/filesystems/autofs-mount-control.txt +++ b/Documentation/filesystems/autofs-mount-control.txt @@ -354,8 +354,10 @@ this ioctl is called until no further expire candidates are found. The call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to the descriptor obtained from the open call. In -addition an immediate expire, independent of the mount timeout, can be -requested by setting the how field of struct args_expire to 1. If no +addition an immediate expire that's independent of the mount timeout, +and a forced expire that's independent of whether the mount is busy, +can be requested by setting the how field of struct args_expire to +AUTOFS_EXP_IMMEDIATE or AUTOFS_EXP_FORCED, respectively . If no expire candidates can be found the ioctl returns -1 with errno set to EAGAIN. diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt index 373ad25852d3..3af38c7fd26d 100644 --- a/Documentation/filesystems/autofs.txt +++ b/Documentation/filesystems/autofs.txt @@ -116,7 +116,7 @@ that purpose there is another flag. **DCACHE_MANAGE_TRANSIT** If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but -related behaviors are invoked, both using the `d_op->d_manage()` +related behaviours are invoked, both using the `d_op->d_manage()` dentry operation. Firstly, before checking to see if any filesystem is mounted on the @@ -193,8 +193,8 @@ VFS remain in RCU-walk mode, but can only tell it to get out of RCU-walk mode by returning `-ECHILD`. So `d_manage()`, when called with `rcu_walk` set, should either return --ECHILD if there is any reason to believe it is unsafe to end the -mounted filesystem, and otherwise should return 0. +-ECHILD if there is any reason to believe it is unsafe to enter the +mounted filesystem, otherwise it should return 0. autofs will return `-ECHILD` if an expiry of the filesystem has been initiated or is being considered, otherwise it returns 0. @@ -210,7 +210,7 @@ mounts that were created by `d_automount()` returning a filesystem to be mounted. As autofs doesn't return such a filesystem but leaves the mounting to the automount daemon, it must involve the automount daemon in unmounting as well. This also means that autofs has more control -of expiry. +over expiry. The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to the `umount` system call. Unmounting with MNT_EXPIRE will fail unless @@ -225,7 +225,7 @@ unmount any filesystems mounted on the autofs filesystem or remove any symbolic links or empty directories any time it likes. If the unmount or removal is successful the filesystem will be returned to the state it was before the mount or creation, so that any access of the name -will trigger normal auto-mount processing. In particlar, `rmdir` and +will trigger normal auto-mount processing. In particular, `rmdir` and `unlink` do not leave negative entries in the dcache as a normal filesystem would, so an attempt to access a recently-removed object is passed to autofs for handling. @@ -240,11 +240,18 @@ Normally the daemon only wants to remove entries which haven't been used for a while. For this purpose autofs maintains a "`last_used`" time stamp on each directory or symlink. For symlinks it genuinely does record the last time the symlink was "used" or followed to find -out where it points to. For directories the field is a slight -misnomer. It actually records the last time that autofs checked if -the directory or one of its descendents was busy and found that it -was. This is just as useful and doesn't require updating the field so -often. +out where it points to. For directories the field is used slightly +differently. The field is updated at mount time and during expire +checks if it is found to be in use (ie. open file descriptor or +process working directory) and during path walks. The update done +during path walks prevents frequent expire and immediate mount of +frequently accessed automounts. But in the case where a GUI continually +access or an application frequently scans an autofs directory tree +there can be an accumulation of mounts that aren't actually being +used. To cater for this case the "`strictexpire`" autofs mount option +can be used to avoid the "`last_used`" update on path walk thereby +preventing this apparent inability to expire mounts that aren't +really in use. The daemon is able to ask autofs if anything is due to be expired, using an `ioctl` as discussed later. For a *direct* mount, autofs @@ -255,8 +262,12 @@ up. There is an option with indirect mounts to consider each of the leaves that has been mounted on instead of considering the top-level names. -This is intended for compatability with version 4 of autofs and should -be considered as deprecated. +This was originally intended for compatibility with version 4 of autofs +and should be considered as deprecated for Sun Format automount maps. +However, it may be used again for amd format mount maps (which are +generally indirect maps) because the amd automounter allows for the +setting of an expire timeout for individual mounts. But there are +some difficulties in making the needed changes for this. When autofs considers a directory it checks the `last_used` time and compares it with the "timeout" value set when the filesystem was @@ -273,7 +284,7 @@ mounts. If it finds something in the root directory to expire it will return the name of that thing. Once a name has been returned the automount daemon needs to unmount any filesystems mounted below the name normally. As described above, this is unsafe for non-toplevel -mounts in a version-5 autofs. For this reason the current `automountd` +mounts in a version-5 autofs. For this reason the current `automount(8)` does not use this ioctl. The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or @@ -345,7 +356,7 @@ The `wait_queue_token` is a unique number which can identify a particular request to be acknowledged. When a message is sent over the pipe the affected dentry is marked as either "active" or "expiring" and other accesses to it block until the message is -acknowledged using one of the ioctls below and the relevant +acknowledged using one of the ioctls below with the relevant `wait_queue_token`. Communicating with autofs: root directory ioctls @@ -367,15 +378,14 @@ The available ioctl commands are: This mode is also entered if a write to the pipe fails. - **AUTOFS_IOC_PROTOVER**: This returns the protocol version in use. - **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which - is really a version number for the implementation. It is - currently 2. + is really a version number for the implementation. - **AUTOFS_IOC_SETTIMEOUT**: This passes a pointer to an unsigned long. The value is used to set the timeout for expiry, and the current timeout value is stored back through the pointer. - **AUTOFS_IOC_ASKUMOUNT**: Returns, in the pointed-to `int`, 1 if the filesystem could be unmounted. This is only a hint as the situation could change at any instant. This call can be - use to avoid a more expensive full unmount attempt. + used to avoid a more expensive full unmount attempt. - **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is anything suitable to expire. A pointer to a packet: @@ -400,6 +410,11 @@ The available ioctl commands are: **AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored and objects are expired if the are not in use. + **AUTOFS_EXP_FORCED** causes the in use status to be ignored + and objects are expired ieven if they are in use. This assumes + that the daemon has requested this because it is capable of + performing the umount. + **AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level name to expire. This is only safe when *maxproto* is 4. @@ -415,7 +430,7 @@ which can be used to communicate directly with the autofs filesystem. It requires CAP_SYS_ADMIN for access. The `ioctl`s that can be used on this device are described in a separate -document `autofs-mount-control.txt`, and are summarized briefly here. +document `autofs-mount-control.txt`, and are summarised briefly here. Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: struct autofs_dev_ioctl { @@ -511,6 +526,21 @@ directories. Catatonic mode can only be left via the **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on the `/dev/autofs`. +The "ignore" mount option +------------------------- + +The "ignore" mount option can be used to provide a generic indicator +to applications that the mount entry should be ignored when displaying +mount information. + +In other OSes that provide autofs and that provide a mount list to user +space based on the kernel mount list a no-op mount option ("ignore" is +the one use on the most common OSes) is allowed so that autofs file +system users can optionally use it. + +This is intended to be used by user space programs to exclude autofs +mounts from consideration when reading the mounts list. + autofs, name spaces, and shared mounts -------------------------------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3f13d8599337..749322060f10 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - stat_refresh - numa_stat - swappiness +- unprivileged_userfaultfd - user_reserve_kbytes - vfs_cache_pressure - watermark_boost_factor @@ -818,6 +819,17 @@ The default value is 60. ============================================================== +unprivileged_userfaultfd + +This flag controls whether unprivileged users can use the userfaultfd +system calls. Set this to 1 to allow unprivileged users to use the +userfaultfd system calls, or set this to 0 to restrict userfaultfd to only +privileged users (with SYS_CAP_PTRACE capability). + +The default value is 1. + +============================================================== + - user_reserve_kbytes When overcommit_memory is set to 2, "never overcommit" mode, reserve diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 66bfd8396877..995da15b16ca 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -113,7 +113,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)'; my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; -my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -212,7 +212,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, "nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback", - "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep", + "nr_congested", "nr_immediate", "nr_activate_anon", + "nr_activate_file", "nr_ref_keep", "nr_unmap_fail", "priority", "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", @@ -407,7 +408,7 @@ EVENT_PROCESS: } my $nr_reclaimed = $3; - my $flags = $12; + my $flags = $13; my $file = 0; if ($flags =~ /RECLAIM_WB_FILE/) { $file = 1; diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 44205f0b671f..ec1efa32af3c 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -189,20 +189,10 @@ the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can use either:: - int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); - int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); - -The first one (hmm_vma_get_pfns()) will only fetch present CPU page table + long hmm_range_snapshot(struct hmm_range *range); + long hmm_range_fault(struct hmm_range *range, bool block); + +The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. The second one does trigger a page fault on missing or read-only entry if the write parameter is true. Page faults use the generic mm page fault code path @@ -220,25 +210,56 @@ respect in order to keep things properly synchronized. The usage pattern is:: { struct hmm_range range; ... + + range.start = ...; + range.end = ...; + range.pfns = ...; + range.flags = ...; + range.values = ...; + range.pfn_shift = ...; + hmm_range_register(&range); + + /* + * Just wait for range to be valid, safe to ignore return value as we + * will use the return value of hmm_range_snapshot() below under the + * mmap_sem to ascertain the validity of the range. + */ + hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); + again: - ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); - if (ret) + down_read(&mm->mmap_sem); + ret = hmm_range_snapshot(&range); + if (ret) { + up_read(&mm->mmap_sem); + if (ret == -EAGAIN) { + /* + * No need to check hmm_range_wait_until_valid() return value + * on retry we will get proper error with hmm_range_snapshot() + */ + hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); + goto again; + } + hmm_mirror_unregister(&range); return ret; + } take_lock(driver->update); - if (!hmm_vma_range_done(vma, &range)) { + if (!range.valid) { release_lock(driver->update); + up_read(&mm->mmap_sem); goto again; } // Use pfns array content to update device page table + hmm_mirror_unregister(&range); release_lock(driver->update); + up_read(&mm->mmap_sem); return 0; } The driver->update lock is the same lock that the driver takes inside its -update() callback. That lock must be held before hmm_vma_range_done() to avoid -any race with a concurrent CPU page table update. +update() callback. That lock must be held before checking the range.valid +field to avoid any race with a concurrent CPU page table update. HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing @@ -255,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this concurrently). +Leverage default_flags and pfn_flags_mask +========================================= + +The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows +to set fault or snapshot policy for a whole range instead of having to set them +for each entries in the range. + +For instance if the device flags for device entries are: + VALID (1 << 63) + WRITE (1 << 62) + +Now let say that device driver wants to fault with at least read a range then +it does set: + range->default_flags = (1 << 63) + range->pfn_flags_mask = 0; + +and calls hmm_range_fault() as described above. This will fill fault all page +in the range with at least read permission. + +Now let say driver wants to do the same except for one page in the range for +which its want to have write. Now driver set: + range->default_flags = (1 << 63); + range->pfn_flags_mask = (1 << 62); + range->pfns[index_of_write] = (1 << 62); + +With this HMM will fault in all page with at least read (ie valid) and for the +address == range->start + (index_of_write << PAGE_SHIFT) it will fault with +write permission ie if the CPU pte does not have write permission set then HMM +will call handle_mm_fault(). + +Note that HMM will populate the pfns array with write permission for any entry +that have write permission within the CPU pte no matter what are the values set +in default_flags or pfn_flags_mask. + + Represent and manage device memory from core kernel point of view ================================================================= diff --git a/arch/Kconfig b/arch/Kconfig index 5e43fcbad4ca..f11f0698b148 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -245,6 +245,13 @@ config ARCH_HAS_FORTIFY_SOURCE An architecture should select this when it can successfully build and run with CONFIG_FORTIFY_SOURCE. +# +# Select if the arch provides a historic keepinit alias for the retain_initrd +# command line option +# +config ARCH_HAS_KEEPINITRD + bool + # Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h config ARCH_HAS_SET_MEMORY bool diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index a42fc5c4db89..e2cbec3789e8 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -285,17 +285,3 @@ mem_init(void) memblock_free_all(); mem_init_print_info(NULL); } - -void -free_initmem(void) -{ - free_initmem_default(-1); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void -free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index e1ab2d7f1d64..02b7a3b20d7c 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -206,18 +206,3 @@ void __init mem_init(void) memblock_free_all(); mem_init_print_info(NULL); } - -/* - * free_initmem: Free all the __init memory. - */ -void __ref free_initmem(void) -{ - free_initmem_default(-1); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 901bf1b79138..8869742a85df 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -4,11 +4,11 @@ config ARM default y select ARCH_32BIT_OFF_T select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_KEEPINITRD select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE @@ -21,6 +21,7 @@ config ARM select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h index cba23eaa6072..7a88f160b1fb 100644 --- a/arch/arm/include/asm/hardirq.h +++ b/arch/arm/include/asm/hardirq.h @@ -6,6 +6,7 @@ #include <linux/threads.h> #include <asm/irq.h> +/* number of IPIS _not_ including IPI_CPU_BACKTRACE */ #define NR_IPI 7 typedef struct { diff --git a/arch/arm/kernel/atags.h b/arch/arm/kernel/atags.h index 201100226301..067e12edc341 100644 --- a/arch/arm/kernel/atags.h +++ b/arch/arm/kernel/atags.h @@ -5,7 +5,7 @@ void convert_to_tag_list(struct tag *tags); const struct machine_desc *setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr); #else -static inline const struct machine_desc * +static inline const struct machine_desc * __init __noreturn setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr) { early_print("no ATAGS support: can't continue\n"); diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index c6d37563610a..ebc53804d57b 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -70,6 +70,10 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_IRQ_WORK, IPI_COMPLETION, + /* + * CPU_BACKTRACE is special and not included in NR_IPI + * or tracable with trace_ipi_* + */ IPI_CPU_BACKTRACE, /* * SGI8-15 can be reserved by secure firmware, and thus may @@ -803,7 +807,7 @@ core_initcall(register_cpufreq_notifier); static void raise_nmi(cpumask_t *mask) { - smp_cross_call(mask, IPI_CPU_BACKTRACE); + __smp_cross_call(mask, IPI_CPU_BACKTRACE); } void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 43f46aa7ef33..12e0812e988c 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -565,7 +565,7 @@ static void *__alloc_from_pool(size_t size, struct page **ret_page) static bool __in_atomic_pool(void *start, size_t size) { - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } static int __free_from_pool(void *start, size_t size) @@ -1577,31 +1577,21 @@ static int __arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; struct page **pages = __iommu_get_pages(cpu_addr, attrs); unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; + int err; if (!pages) return -ENXIO; - if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off) + if (vma->vm_pgoff >= nr_pages) return -ENXIO; - pages += off; - - do { - int ret = vm_insert_page(vma, uaddr, *pages++); - if (ret) { - pr_err("Remapping memory failed: %d\n", ret); - return ret; - } - uaddr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); + err = vm_map_pages(vma, pages, nr_pages); + if (err) + pr_err("Remapping memory failed: %d\n", err); - return 0; + return err; } static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index b2fbb4a711fc..be0b42937888 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -680,27 +680,14 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd; - void free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) { - if (start == initrd_start) - start = round_down(start, PAGE_SIZE); - if (end == initrd_end) - end = round_up(end, PAGE_SIZE); - - poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area((void *)start, (void *)end, -1, "initrd"); - } -} + if (start == initrd_start) + start = round_down(start, PAGE_SIZE); + if (end == initrd_end) + end = round_up(end, PAGE_SIZE); -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; + poison_init_mem((void *)start, PAGE_ALIGN(end) - start); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } - -__setup("keepinitrd", keepinitrd_setup); #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9504cf52a7a3..42551c89a5a2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -19,8 +19,9 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV + select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS @@ -59,6 +60,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS @@ -84,6 +86,7 @@ config ARM64 select CRC32 select DCACHE_WORD_ACCESS select DMA_DIRECT_REMAP + select DYNAMIC_DEBUG_RELATIVE_POINTERS select EDAC_SUPPORT select FRAME_POINTER select GENERIC_ALLOCATOR diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index cc3f9d864224..50695e4d34c4 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += delay.h generic-y += div64.h generic-y += dma.h generic-y += dma-contiguous.h +generic-y += dynamic_debug.h generic-y += early_ioremap.h generic-y += emergency-restart.h generic-y += hw_irq.h diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f210bcf096f7..bc895c869892 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -401,7 +401,7 @@ unsigned long cpu_get_elf_hwcap2(void); #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) /* System capability check for constant caps */ -static inline bool __cpus_have_const_cap(int num) +static __always_inline bool __cpus_have_const_cap(int num) { if (num >= ARM64_NCAPS) return false; @@ -415,7 +415,7 @@ static inline bool cpus_have_cap(unsigned int num) return test_bit(num, cpu_hwcaps); } -static inline bool cpus_have_const_cap(int num) +static __always_inline bool cpus_have_const_cap(int num) { if (static_branch_likely(&arm64_const_caps_ready)) return __cpus_have_const_cap(num); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index c6a07a3b433e..4aad6382f631 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -70,8 +70,4 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, #include <asm-generic/hugetlb.h> -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif - #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 40e2d7e5efcb..007c05a4cce0 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -578,24 +578,11 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd __initdata; - void __init free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); - memblock_free(__virt_to_phys(start), end - start); - } -} - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; + free_reserved_area((void *)start, (void *)end, 0, "initrd"); + memblock_free(__virt_to_phys(start), end - start); } - -__setup("keepinitrd", keepinitrd_setup); #endif /* diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ef82312860ac..ef32d4839c3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1065,8 +1065,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { int flags = 0; @@ -1077,6 +1077,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, - altmap, want_memblock); + restrictions); } #endif diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index fe582c3a1794..573242b160e1 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -68,15 +68,3 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - -void __init free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 0f04a5e9aa4f..1eab16b1a0bc 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -102,17 +102,3 @@ void __init mem_init(void) mem_init_print_info(NULL); } - - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - -void -free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 3e54a53208d5..b7d404bbaa0f 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -22,7 +22,6 @@ config HEXAGON select GENERIC_IRQ_SHOW select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select ARCH_DISCARD_MEMBLOCK select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 1719ede9e9bd..41cf34243ea1 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -85,16 +85,6 @@ void __init mem_init(void) } /* - * free_initmem - frees memory used by stuff declared with __init - * - * Todo: free pages between __init_begin and __init_end; possibly - * some devtree related stuff as well. - */ -void __ref free_initmem(void) -{ -} - -/* * free_initrd_mem - frees... initrd memory. * @start - start of init memory * @end - end of init memory diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 73a26f04644e..7468d8e50467 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -33,7 +33,6 @@ config IA64 select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select VIRT_TO_BUS - select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index e49200e31750..d28e29103bdb 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -666,14 +666,14 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); @@ -682,20 +682,15 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - if (ret) - pr_warn("%s: Problem encountered in __remove_pages() as" - " ret=%d\n", __func__, ret); - - return ret; + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index fe5cc2da6d10..f8020d261e08 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -22,12 +22,11 @@ config M68K select ARCH_WANT_IPC_PARSE_VERSION select HAVE_FUTEX_CMPXCHG if MMU && FUTEX select HAVE_MOD_ARCH_SPECIFIC + select MMU_GATHER_NO_RANGE if MMU select MODULES_USE_ELF_REL select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select ARCH_DISCARD_MEMBLOCK - select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN def_bool y diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 8868a4c9adae..778cacb7d57b 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -147,10 +147,3 @@ void __init mem_init(void) init_pointer_tables(); mem_init_print_info(NULL); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 7e97d44f6538..a015a951c8b7 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -186,18 +186,6 @@ void __init setup_memory(void) paging_init(); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - -void free_initmem(void) -{ - free_initmem_default(-1); -} - void __init mem_init(void) { high_memory = (void *)__va(memory_start + lowmem_size - 1); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 25bd9cfea5d7..70d3200476bf 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -5,7 +5,6 @@ config MIPS select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index 830c93a010c3..9a466dde9b96 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -482,7 +482,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long * * Return the bit position (0..63) of the most significant 1 bit in a word * Returns -1 if no 1 bit exists */ -static inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long __fls(unsigned long word) { int num; @@ -548,7 +548,7 @@ static inline unsigned long __fls(unsigned long word) * Returns 0..SZLONG-1 * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long __ffs(unsigned long word) { return __fls(word & -word); } diff --git a/arch/mips/kernel/cpu-bugs64.c b/arch/mips/kernel/cpu-bugs64.c index bada74af7641..c04b97aace4a 100644 --- a/arch/mips/kernel/cpu-bugs64.c +++ b/arch/mips/kernel/cpu-bugs64.c @@ -42,8 +42,8 @@ static inline void align_mod(const int align, const int mod) : "n"(align), "n"(mod)); } -static inline void mult_sh_align_mod(long *v1, long *v2, long *w, - const int align, const int mod) +static __always_inline void mult_sh_align_mod(long *v1, long *v2, long *w, + const int align, const int mod) { unsigned long flags; int m1, m2; diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 0d14e0d8eacf..4c2b4483683c 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -289,7 +290,7 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - pages, write ? FOLL_WRITE : 0); + pages, gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index bbb196ad5f26..8a038b30d3c4 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -504,14 +504,6 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - void (*free_init_pages_eva)(void *begin, void *end) = NULL; void __ref free_initmem(void) diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index f32e40e36fe1..55703b03d172 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -252,18 +252,6 @@ void __init mem_init(void) return; } -void free_initmem(void) -{ - free_initmem_default(-1); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index ea37394ff3ea..c2a8b9a95865 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -16,6 +16,7 @@ config NIOS2 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_KGDB select IRQ_DOMAIN + select MMU_GATHER_NO_RANGE if MMU select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE @@ -23,8 +24,6 @@ config NIOS2 select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS - select ARCH_DISCARD_MEMBLOCK - select MMU_GATHER_NO_RANGE if MMU config GENERIC_CSUM def_bool y diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 16cea5776b87..2c609c2516b2 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,18 +82,6 @@ void __init mmu_init(void) flush_tlb_all(); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - -void __ref free_initmem(void) -{ - free_initmem_default(-1); -} - #define __page_aligned(order) __aligned(PAGE_SIZE << (order)) pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER); pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index e8a640c7e74c..e63cb4a91a3e 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -222,15 +222,3 @@ void __init mem_init(void) mem_init_done = 1; return; } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - -void free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 5a68185295a9..ddca8287d43b 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -920,10 +920,3 @@ void flush_tlb_all(void) spin_unlock(&sid_lock); } #endif - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d7996cfaceca..6a66a2da5b1a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX @@ -149,6 +150,7 @@ config PPC select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN + select DYNAMIC_DEBUG_RELATIVE_POINTERS if PPC64 select DYNAMIC_FTRACE if FUNCTION_TRACER select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index b9f6e72bf4e5..3efffcd675c2 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -3,6 +3,7 @@ generated-y += syscall_table_64.h generated-y += syscall_table_c32.h generated-y += syscall_table_spu.h generic-y += div64.h +generic-y += dynamic_debug.h generic-y += export.h generic-y += irq_regs.h generic-y += local64.h diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 56140d19c85f..12e150e615b7 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -36,8 +36,8 @@ static inline int hstate_get_psize(struct hstate *hstate) } } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) +#define __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +static inline bool gigantic_page_runtime_supported(void) { /* * We used gigantic page reservation with hypervisor assist in some case. @@ -49,7 +49,6 @@ static inline bool gigantic_page_supported(void) return true; } -#endif /* hugepd entry valid bit */ #define HUGEPD_VAL_BITS (0x8000000000000000UL) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 87da40129927..7dee64f1e4cb 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1306,16 +1306,6 @@ void show_user_instructions(struct pt_regs *regs) pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); - /* - * Make sure the NIP points at userspace, not kernel text/data or - * elsewhere. - */ - if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) { - pr_info("%s[%d]: Bad NIP, not dumping instructions.\n", - current->comm, current->pid); - return; - } - seq_buf_init(&s, buf, sizeof(buf)); while (n) { @@ -1326,7 +1316,7 @@ void show_user_instructions(struct pt_regs *regs) for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { int instr; - if (probe_kernel_address((const void *)pc, instr)) { + if (probe_user_read(&instr, (void __user *)pc, sizeof(instr))) { seq_buf_printf(&s, "XXXXXXXX "); continue; } diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 523bb99d7676..00682b8df330 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -628,14 +628,14 @@ static int __init prom_next_node(phandle *nodep) } } -static inline int prom_getprop(phandle node, const char *pname, - void *value, size_t valuelen) +static inline int __init prom_getprop(phandle node, const char *pname, + void *value, size_t valuelen) { return call_prom("getprop", 4, 1, node, ADDR(pname), (u32)(unsigned long) value, (u32) valuelen); } -static inline int prom_getproplen(phandle node, const char *pname) +static inline int __init prom_getproplen(phandle node, const char *pname) { return call_prom("getproplen", 2, 1, node, ADDR(pname)); } diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e8e93c2c7d03..7a1708875d27 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -610,7 +610,7 @@ SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2); SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3); SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4); SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5); -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC SYSFS_SPRSETUP(hid0, SPRN_HID0); SYSFS_SPRSETUP(hid1, SPRN_HID1); SYSFS_SPRSETUP(hid4, SPRN_HID4); @@ -639,7 +639,7 @@ SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0); SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1); SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2); SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3); -#endif /* CONFIG_DEBUG_KERNEL */ +#endif /* CONFIG_DEBUG_MISC */ #endif /* HAS_PPC_PMC_PA6T */ #ifdef HAS_PPC_PMC_IBM @@ -680,7 +680,7 @@ static struct device_attribute pa6t_attrs[] = { __ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3), __ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4), __ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5), -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC __ATTR(hid0, 0600, show_hid0, store_hid0), __ATTR(hid1, 0600, show_hid1, store_hid1), __ATTR(hid4, 0600, show_hid4, store_hid4), @@ -709,7 +709,7 @@ static struct device_attribute pa6t_attrs[] = { __ATTR(tsr1, 0600, show_tsr1, store_tsr1), __ATTR(tsr2, 0600, show_tsr2, store_tsr2), __ATTR(tsr3, 0600, show_tsr3, store_tsr3), -#endif /* CONFIG_DEBUG_KERNEL */ +#endif /* CONFIG_DEBUG_MISC */ }; #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 665f294725cb..83e59fdaa62d 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -179,7 +179,7 @@ extern void panic_flush_kmsg_end(void) kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(0); debug_locks_off(); - console_flush_on_panic(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); } static unsigned long oops_begin(struct pt_regs *regs) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index be7bc070eae5..ab3d484c5e2e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -600,7 +600,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* If writing != 0, then the HPTE must allow writing, if we get here */ write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, writing, pages); + npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); if (npages < 1) { /* Check if it's an I/O mapping */ down_read(¤t->mm->mmap_sem); @@ -1193,7 +1193,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) goto err; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, 1, pages); + npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); if (npages < 1) goto err; page = pages[0]; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 66270e07449a..38af1c2eb652 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -56,39 +56,31 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages) return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE; } -static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) +static long kvmppc_account_memlimit(unsigned long pages, bool inc) { long ret = 0; + s64 locked_vm; if (!current || !current->mm) return ret; /* process exited */ - down_write(¤t->mm->mmap_sem); - if (inc) { - unsigned long locked, lock_limit; + unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - locked = current->mm->locked_vm + stt_pages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + locked_vm = atomic64_add_return(pages, ¤t->mm->locked_vm); + if (locked_vm > lock_limit && !capable(CAP_IPC_LOCK)) { ret = -ENOMEM; - else - current->mm->locked_vm += stt_pages; + atomic64_sub(pages, ¤t->mm->locked_vm); + } } else { - if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm)) - stt_pages = current->mm->locked_vm; - - current->mm->locked_vm -= stt_pages; + locked_vm = atomic64_sub_return(pages, ¤t->mm->locked_vm); + WARN_ON_ONCE(locked_vm < 0); } - pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, - inc ? '+' : '-', - stt_pages << PAGE_SHIFT, - current->mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK), - ret ? " - exceeded" : ""); - - up_write(¤t->mm->mmap_sem); + pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%lu %lld/%lu%s\n", current->pid, + inc ? '+' : '-', pages << PAGE_SHIFT, + locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); return ret; } diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 24296f4cadc6..e0af53fd78c5 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, if (!pages) return -ENOMEM; - ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages); if (ret < 0) goto free_pages; diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 8330f135294f..cd370d66d797 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -54,33 +54,29 @@ struct mm_iommu_table_group_mem_t { static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, unsigned long npages, bool incr) { - long ret = 0, locked, lock_limit; + long ret = 0; + unsigned long lock_limit; + s64 locked_vm; if (!npages) return 0; - down_write(&mm->mmap_sem); - if (incr) { - locked = mm->locked_vm + npages; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + locked_vm = atomic64_add_return(npages, &mm->locked_vm); + if (locked_vm > lock_limit && !capable(CAP_IPC_LOCK)) { ret = -ENOMEM; - else - mm->locked_vm += npages; + atomic64_sub(npages, &mm->locked_vm); + } } else { - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; + locked_vm = atomic64_sub_return(npages, &mm->locked_vm); + WARN_ON_ONCE(locked_vm < 0); } - pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", - current ? current->pid : 0, - incr ? '+' : '-', - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, + pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%lu %lld/%lu\n", + current ? current->pid : 0, incr ? '+' : '-', + npages << PAGE_SHIFT, locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); return ret; } @@ -141,8 +137,9 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, for (entry = 0; entry < entries; entry += chunk) { unsigned long n = min(entries - entry, chunk); - ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, - FOLL_WRITE, mem->hpages + entry, NULL); + ret = get_user_pages(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE | FOLL_LONGTERM, + mem->hpages + entry, NULL); if (ret == n) { pinned += n; continue; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 6a23b9ebd2a1..14ff414d1545 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -90,8 +90,8 @@ void radix__tlbiel_all(unsigned int action) asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } -static inline void __tlbiel_pid(unsigned long pid, int set, - unsigned long ric) +static __always_inline void __tlbiel_pid(unsigned long pid, int set, + unsigned long ric) { unsigned long rb,rs,prs,r; @@ -106,7 +106,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set, trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -136,7 +136,7 @@ static inline void __tlbiel_lpid(unsigned long lpid, int set, trace_tlbie(lpid, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -239,7 +239,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid) /* * We use 128 set in radix mode and 256 set in hpt mode. */ -static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) +static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { int set; @@ -928,7 +928,7 @@ void radix__tlb_flush(struct mmu_gather *tlb) tlb->need_flush_all = 0; } -static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, +static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize, bool also_pwc) { diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b5d3578d9f65..8785def75916 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -304,12 +304,8 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && access_ok(nip, sizeof(*nip))) { unsigned int inst; - int res; - pagefault_disable(); - res = __get_user_inatomic(inst, nip); - pagefault_enable(); - if (!res) + if (!probe_user_read(&inst, nip, sizeof(inst))) return !store_updates_sp(inst); *must_retry = true; } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index cd525d709072..879872e72d40 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int __ref arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -127,12 +127,12 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altm } flush_inval_dcache_range(start, start + size); - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE -int __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -147,14 +147,13 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, if (altmap) page += vmem_altmap_offset(altmap); - ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); - if (ret) - return ret; + __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); flush_inval_dcache_range(start, start + size); ret = remove_section_mapping(start, start + size); + WARN_ON_ONCE(ret); /* Ensure all vmalloc mappings are flushed in case they also * hit that section of memory @@ -163,8 +162,6 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) pr_warn("Hash collision while resizing HPT\n"); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -338,13 +335,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* * This is called when a page has been modified by the kernel. * It just marks the page as not i-cache clean. We do the i-cache diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 0af051a1974e..0680efb2237b 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -159,12 +159,8 @@ static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) ((unsigned long)ptr & 7)) return -EFAULT; - pagefault_disable(); - if (!__get_user_inatomic(*ret, ptr)) { - pagefault_enable(); + if (!probe_user_read(ret, ptr, sizeof(*ret))) return 0; - } - pagefault_enable(); return read_user_stack_slow(ptr, ret, 8); } @@ -175,12 +171,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) ((unsigned long)ptr & 3)) return -EFAULT; - pagefault_disable(); - if (!__get_user_inatomic(*ret, ptr)) { - pagefault_enable(); + if (!probe_user_read(ret, ptr, sizeof(*ret))) return 0; - } - pagefault_enable(); return read_user_stack_slow(ptr, ret, 4); } @@ -307,17 +299,11 @@ static inline int current_is_64bit(void) */ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) { - int rc; - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || ((unsigned long)ptr & 3)) return -EFAULT; - pagefault_disable(); - rc = __get_user_inatomic(*ret, ptr); - pagefault_enable(); - - return rc; + return probe_user_read(ret, ptr, sizeof(*ret)); } static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a66fb9c01c9e..73db84d24491 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -420,7 +420,6 @@ static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) static __u64 power_pmu_bhrb_to(u64 addr) { unsigned int instr; - int ret; __u64 target; if (is_kernel_addr(addr)) { @@ -431,13 +430,8 @@ static __u64 power_pmu_bhrb_to(u64 addr) } /* Userspace: need copy instruction here then translate it */ - pagefault_disable(); - ret = __get_user_inatomic(instr, (unsigned int __user *)addr); - if (ret) { - pagefault_enable(); + if (probe_user_read(&instr, (unsigned int __user *)addr, sizeof(instr))) return 0; - } - pagefault_enable(); target = branch_target(&instr); if ((!target) || (instr & BRANCH_ABSOLUTE)) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d0e172d47574..2794235e9d3e 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 && HUGETLB_PAGE - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE select PPC_HAVE_KUEP select PPC_HAVE_KUAP default y diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index f49aec251a5a..37b0ba5f4ff9 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -1069,13 +1069,11 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs) addr += mfspr(SPRN_MCAR); if (is_in_pci_mem_space(addr)) { - if (user_mode(regs)) { - pagefault_disable(); - ret = get_user(inst, (__u32 __user *)regs->nip); - pagefault_enable(); - } else { + if (user_mode(regs)) + ret = probe_user_read(&inst, (void __user *)regs->nip, + sizeof(inst)); + else ret = probe_kernel_address((void *)regs->nip, inst); - } if (!ret && mcheck_handle_load(regs, inst)) { regs->nip += 4; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index bc7b77e34d09..8bf6f9c2d48c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -66,11 +66,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -void free_initmem(void) -{ - free_initmem_default(0); -} - #ifdef CONFIG_BLK_DEV_INITRD static void __init setup_initrd(void) { diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 07485582d027..109243fdb6ec 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -63,7 +63,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY @@ -100,6 +100,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_KEEP_MEMBLOCK select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index ce2770743cc5..27696755daa9 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -203,7 +203,7 @@ static inline int __cpacf_check_opcode(unsigned int opcode) } } -static inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +static __always_inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) { if (__cpacf_check_opcode(opcode)) { __cpacf_query(opcode, mask); diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d1afa58a4b6..bb59dd964590 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -116,7 +116,9 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif +static inline bool gigantic_page_runtime_supported(void) +{ + return true; +} + #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f30679a3a939..9dde4d7d8704 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2383,7 +2383,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) ret = -EFAULT; goto out; } - ret = get_user_pages_fast(map->addr, 1, 1, &map->page); + ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); if (ret < 0) goto out; BUG_ON(ret != 1); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 7cf48eefec8f..14d1eae9fe43 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -157,14 +157,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - unsigned long memory_block_size_bytes(void) { /* @@ -227,8 +219,8 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -238,21 +230,22 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock); + rc = __add_pages(nid, start_pfn, size_pages, restrictions); if (rc) vmem_remove_mapping(start, size); return rc; } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { /* * There is no hardware or firmware interface which could trigger a * hot memory remove on s390. So there is nothing that needs to be * implemented. */ - return -EBUSY; + BUG(); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 0be08d586d40..b77f512bb176 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -10,7 +10,6 @@ config SUPERH select DMA_DECLARE_COHERENT select HAVE_IDE if HAS_IOPORT_MAP select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK select HAVE_OPROFILE select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS @@ -53,6 +52,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH + select ARCH_HAS_GIGANTIC_PAGE help The SuperH is a RISC processor targeted for use in embedded systems diff --git a/arch/sh/boards/mach-dreamcast/irq.c b/arch/sh/boards/mach-dreamcast/irq.c index a929f764ae04..cc06e4cdb4cd 100644 --- a/arch/sh/boards/mach-dreamcast/irq.c +++ b/arch/sh/boards/mach-dreamcast/irq.c @@ -10,7 +10,6 @@ */ #include <linux/irq.h> #include <linux/io.h> -#include <linux/irq.h> #include <linux/export.h> #include <linux/err.h> #include <mach/sysasic.h> diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 3e27f6d1f1ec..277c882f7489 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -261,7 +262,7 @@ slow_irqon: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 70621324db41..b95e343e3c9d 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -403,28 +403,16 @@ void __init mem_init(void) mem_init_done = 1; } -void free_initmem(void) -{ - free_initmem_default(-1); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); @@ -441,20 +429,15 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - if (unlikely(ret)) - pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, - ret); - - return ret; + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index f6421c9ce5d3..7c93f3121ee6 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI + select ARCH_HAS_GIGANTIC_PAGE config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1393a8ac596b..22500c3be7a9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) -/* This macro must be updated when the size of struct page grows above 80 - * or reduces below 64. - * The idea that compiler optimizes out switch() statement, and only - * leaves clrx instructions - */ -#define mm_zero_struct_page(pp) do { \ - unsigned long *_pp = (void *)(pp); \ - \ - /* Check that struct page is either 64, 72, or 80 bytes */ \ - BUILD_BUG_ON(sizeof(struct page) & 7); \ - BUILD_BUG_ON(sizeof(struct page) < 64); \ - BUILD_BUG_ON(sizeof(struct page) > 80); \ - \ - switch (sizeof(struct page)) { \ - case 80: \ - _pp[9] = 0; /* fallthrough */ \ - case 72: \ - _pp[8] = 0; /* fallthrough */ \ - default: \ - _pp[7] = 0; \ - _pp[6] = 0; \ - _pp[5] = 0; \ - _pp[4] = 0; \ - _pp[3] = 0; \ - _pp[2] = 0; \ - _pp[1] = 0; \ - _pp[0] = 0; \ - } \ -} while (0) - /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. This is to handle systems where * the first physical page in the machine is at some huge physical address, diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index aee6dba83d0e..1e770a517d4a 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); @@ -324,7 +325,7 @@ slow: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index a8ff29821bdb..046ab116cc8c 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -294,19 +294,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -void free_initmem (void) -{ - free_initmem_default(POISON_FREE_INITMEM); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index bc2aaa47bc8a..4b099dd7a767 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2572,14 +2572,6 @@ void free_initmem(void) } } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - pgprot_t PAGE_KERNEL __read_mostly; EXPORT_SYMBOL(PAGE_KERNEL); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 99aa11bf53d1..a9c9a94c096f 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -188,13 +188,6 @@ void free_initmem(void) { } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* Allocate and free page tables. */ pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index e85f89f96eca..41fe944005f8 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -3,6 +3,7 @@ config UNICORE32 def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_KEEPINITRD select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_KERNEL_GZIP diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 74b6a2e29809..b4442f3060ce 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -287,27 +287,3 @@ void __init mem_init(void) sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; } } - -void free_initmem(void) -{ - free_initmem_default(-1); -} - -#ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd; - -void free_initrd_mem(unsigned long start, unsigned long end) -{ - if (!keep_initrd) - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; -} - -__setup("keepinitrd", keepinitrd_setup); -#endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e7212731cffb..98b86e819772 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY @@ -30,6 +30,7 @@ config X86_64 select NEED_DMA_MAP_STATE select SWIOTLB select ARCH_HAS_SYSCALL_WRAPPER + select DYNAMIC_DEBUG_RELATIVE_POINTERS # # Arch settings @@ -47,7 +48,6 @@ config X86 select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -306,9 +306,6 @@ config ZONE_DMA32 config AUDIT_ARCH def_bool y if X86_64 -config ARCH_SUPPORTS_OPTIMIZED_INLINING - def_bool y - config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 15d0fbe27872..f730680dc818 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -266,20 +266,6 @@ config CPA_DEBUG ---help--- Do change_page_attr() self-tests every 30 seconds. -config OPTIMIZE_INLINING - bool "Allow gcc to uninline functions marked 'inline'" - ---help--- - This option determines if the kernel forces gcc to inline the functions - developers have marked 'inline'. Doing so takes away freedom from gcc to - do what it thinks is best, which is desirable for the gcc 3.x series of - compilers. The gcc 4.x series have a rewritten inlining algorithm and - enabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc 4.x and above to make the - decision will become the default in the future. Until then this option - is there to test gcc for this. - - If unsure, say N. - config DEBUG_ENTRY bool "Debug low-level entry code" depends on DEBUG_KERNEL diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 9242b28418d5..9acec4426206 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -17,6 +17,7 @@ #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP #undef CONFIG_NR_CPUS +#undef CONFIG_DYNAMIC_DEBUG_RELATIVE_POINTERS #define CONFIG_X86_32 1 #define CONFIG_PGTABLE_LEVELS 2 diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index eebd05942e6c..60232bddbf88 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -7,6 +7,7 @@ generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h generic-y += dma-contiguous.h +generic-y += dynamic_debug.h generic-y += early_ioremap.h generic-y += export.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 7469d321f072..f65cfb48cfdd 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -17,8 +17,4 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif - #endif /* _ASM_X86_HUGETLB_H */ diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2bdbbbcfa393..cdf44aa9a501 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file contains definitions from Hyper-V Hypervisor Top-Level Functional diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bdca39829bc..08715034e315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t *table; struct page *page; - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); /* Check if the user is doing something meaningless. */ if (unlikely(npages != 1)) return -EFAULT; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 406b558abfef..6b92eaf4a3b1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1805,7 +1805,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 92e4c4b85bba..fab095362c50 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -203,7 +203,7 @@ static __init int setup_hugepagesz(char *opt) } __setup("hugepagesz=", setup_hugepagesz); -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 85c94f9a87f8..075e568098f2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -850,24 +850,25 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bccff68e3267..20d14254b686 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) } int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock) + struct mhp_restrictions *restrictions) { int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ @@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, return ret; } -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size); - return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return add_pages(nid, start_pfn, nr_pages, restrictions); } #define PAGE_INUSE 0xFD @@ -1141,24 +1141,20 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -int __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(start_pfn); struct zone *zone; - int ret; /* With altmap the first mapped page is offset from @start */ if (altmap) page += vmem_altmap_offset(altmap); zone = page_zone(page); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - WARN_ON_ONCE(ret); + __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index dfb6c4df639a..53277862de40 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -212,8 +212,6 @@ static void __init alloc_node_data(int nid) node_data[nid] = nd; memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - - node_set_online(nid); } /** @@ -566,7 +564,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return -EINVAL; /* Finally register nodes. */ - for_each_node_mask(nid, node_possible_map) { + for_each_node_mask(nid, numa_nodes_parsed) { u64 start = PFN_PHYS(max_pfn); u64 end = 0; @@ -577,9 +575,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) end = max(mi->blk[i].end, end); } - if (start >= end) - continue; - /* * Don't confuse VM with a node that doesn't have the * minimum amount of memory: @@ -588,6 +583,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) continue; alloc_node_data(nid); + if (end) + node_set_online(nid); } /* Dump memblock with node info and return. */ @@ -717,21 +714,6 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -static void __init init_memory_less_node(int nid) -{ - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; - - /* Allocate and initialize node data. Memory-less node is now online.*/ - alloc_node_data(nid); - free_area_init_node(nid, zones_size, 0, zholes_size); - - /* - * All zonelists will be built later in start_kernel() after per cpu - * areas are initialized. - */ -} - /* * Setup early cpu_to_node. * @@ -759,9 +741,6 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; - if (!node_online(node)) - init_memory_less_node(node); - numa_set_node(cpu, node); } } diff --git a/arch/xtensa/include/asm/irqflags.h b/arch/xtensa/include/asm/irqflags.h index 9b5e8526afe5..12890681587b 100644 --- a/arch/xtensa/include/asm/irqflags.h +++ b/arch/xtensa/include/asm/irqflags.h @@ -27,7 +27,7 @@ static inline unsigned long arch_local_irq_save(void) { unsigned long flags; #if XTENSA_FAKE_NMI -#if defined(CONFIG_DEBUG_KERNEL) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL +#if defined(CONFIG_DEBUG_MISC) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL unsigned long tmp; asm volatile("rsr %0, ps\t\n" diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index 3699d6d3e479..83b244ce61ee 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -126,7 +126,7 @@ void secondary_start_kernel(void) init_mmu(); -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC if (boot_secondary_processors == 0) { pr_debug("%s: boot_secondary_processors:%d; Hanging cpu:%d\n", __func__, boot_secondary_processors, cpu); diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index d49861099684..b51746f2b80b 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -216,11 +216,6 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void free_initmem(void) -{ - free_initmem_default(-1); -} - static void __init parse_memmap_one(char *p) { char *oldp; diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e49028a60429..f180427e48f4 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -231,13 +231,14 @@ static bool pages_correctly_probed(unsigned long start_pfn) * OK to have direct references to sparsemem variables in here. */ static int -memory_block_action(unsigned long phys_index, unsigned long action, int online_type) +memory_block_action(unsigned long start_section_nr, unsigned long action, + int online_type) { unsigned long start_pfn; unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; int ret; - start_pfn = section_nr_to_pfn(phys_index); + start_pfn = section_nr_to_pfn(start_section_nr); switch (action) { case MEM_ONLINE: @@ -251,7 +252,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " - "%ld\n", __func__, phys_index, action, action); + "%ld\n", __func__, start_section_nr, action, action); ret = -EINVAL; } @@ -733,16 +734,18 @@ unregister_memory(struct memory_block *memory) { BUG_ON(memory->dev.bus != &memory_subsys); - /* drop the ref. we got in remove_memory_section() */ + /* drop the ref. we got via find_memory_block() */ put_device(&memory->dev); device_unregister(&memory->dev); } -static int remove_memory_section(unsigned long node_id, - struct mem_section *section, int phys_device) +void unregister_memory_section(struct mem_section *section) { struct memory_block *mem; + if (WARN_ON_ONCE(!present_section(section))) + return; + mutex_lock(&mem_sysfs_mutex); /* @@ -763,15 +766,6 @@ static int remove_memory_section(unsigned long node_id, out_unlock: mutex_unlock(&mem_sysfs_mutex); - return 0; -} - -int unregister_memory_section(struct mem_section *section) -{ - if (!present_section(section)) - return -EINVAL; - - return remove_memory_section(0, section, 0); } #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c index 11b921727183..23fd996df8d2 100644 --- a/drivers/firewire/core-iso.c +++ b/drivers/firewire/core-iso.c @@ -107,19 +107,8 @@ EXPORT_SYMBOL(fw_iso_buffer_init); int fw_iso_buffer_map_vma(struct fw_iso_buffer *buffer, struct vm_area_struct *vma) { - unsigned long uaddr; - int i, err; - - uaddr = vma->vm_start; - for (i = 0; i < buffer->page_count; i++) { - err = vm_insert_page(vma, uaddr, buffer->pages[i]); - if (err) - return err; - - uaddr += PAGE_SIZE; - } - - return 0; + return vm_map_pages_zero(vma, buffer->pages, + buffer->page_count); } void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c index e18a786fc943..0bbc7142f1dc 100644 --- a/drivers/fpga/dfl-afu-dma-region.c +++ b/drivers/fpga/dfl-afu-dma-region.c @@ -35,45 +35,37 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata) * afu_dma_adjust_locked_vm - adjust locked memory * @dev: port device * @npages: number of pages - * @incr: increase or decrease locked memory * * Increase or decrease the locked memory size with npages input. * * Return 0 on success. * Return -ENOMEM if locked memory size is over the limit and no CAP_IPC_LOCK. */ -static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr) +static int afu_dma_adjust_locked_vm(struct device *dev, long pages) { - unsigned long locked, lock_limit; + unsigned long lock_limit; + s64 locked_vm; int ret = 0; /* the task is exiting. */ - if (!current->mm) + if (!current->mm || !pages) return 0; - down_write(¤t->mm->mmap_sem); - - if (incr) { - locked = current->mm->locked_vm + npages; + locked_vm = atomic64_add_return(pages, ¤t->mm->locked_vm); + WARN_ON_ONCE(locked_vm < 0); + if (pages > 0 && !capable(CAP_IPC_LOCK)) { lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + if (locked_vm > lock_limit) { ret = -ENOMEM; - else - current->mm->locked_vm += npages; - } else { - if (WARN_ON_ONCE(npages > current->mm->locked_vm)) - npages = current->mm->locked_vm; - current->mm->locked_vm -= npages; + atomic64_sub(pages, ¤t->mm->locked_vm); + } } - dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid, - incr ? '+' : '-', npages << PAGE_SHIFT, - current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK), + dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %lld/%lu%s\n", current->pid, + (pages > 0) ? '+' : '-', pages << PAGE_SHIFT, + locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK), ret ? "- exceeded" : ""); - up_write(¤t->mm->mmap_sem); - return ret; } @@ -92,7 +84,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata, struct device *dev = &pdata->dev->dev; int ret, pinned; - ret = afu_dma_adjust_locked_vm(dev, npages, true); + ret = afu_dma_adjust_locked_vm(dev, npages); if (ret) return ret; @@ -102,7 +94,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata, goto unlock_vm; } - pinned = get_user_pages_fast(region->user_addr, npages, 1, + pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE, region->pages); if (pinned < 0) { ret = pinned; @@ -121,7 +113,7 @@ put_pages: free_pages: kfree(region->pages); unlock_vm: - afu_dma_adjust_locked_vm(dev, npages, false); + afu_dma_adjust_locked_vm(dev, -npages); return ret; } @@ -141,7 +133,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata, put_all_pages(region->pages, npages); kfree(region->pages); - afu_dma_adjust_locked_vm(dev, npages, false); + afu_dma_adjust_locked_vm(dev, -npages); dev_dbg(dev, "%ld pages unpinned\n", npages); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3e6823fdd939..58ed401c5996 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(&amn->objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } @@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end = range->end - 1; - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(&amn->objects, range->start, end); @@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct amdgpu_mn_node *node; struct amdgpu_bo *bo; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 215bf3fef10c..8079ea3af103 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, while (it) { struct drm_i915_gem_object *obj; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; break; } diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index b3019505065a..c9bd1278f573 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * the tear down. */ - if (range->blockable) + if (mmu_notifier_range_blockable(range)) mutex_lock(&rmn->lock); else if (!mutex_trylock(&rmn->lock)) return -EAGAIN; @@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; goto out_unlock; } diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c index a8db758d523e..a2ebb08990e9 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c @@ -221,26 +221,13 @@ static int rockchip_drm_gem_object_mmap_iommu(struct drm_gem_object *obj, struct vm_area_struct *vma) { struct rockchip_gem_object *rk_obj = to_rockchip_obj(obj); - unsigned int i, count = obj->size >> PAGE_SHIFT; + unsigned int count = obj->size >> PAGE_SHIFT; unsigned long user_count = vma_pages(vma); - unsigned long uaddr = vma->vm_start; - unsigned long offset = vma->vm_pgoff; - unsigned long end = user_count + offset; - int ret; if (user_count == 0) return -ENXIO; - if (end > count) - return -ENXIO; - for (i = offset; i < end; i++) { - ret = vm_insert_page(vma, uaddr, rk_obj->pages[i]); - if (ret) - return ret; - uaddr += PAGE_SIZE; - } - - return 0; + return vm_map_pages(vma, rk_obj->pages, count); } static int rockchip_drm_gem_object_mmap_dma(struct drm_gem_object *obj, diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index 8bf3a7c23ed3..062067438f1d 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -243,7 +243,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) if (NULL == vsg->pages) return -ENOMEM; ret = get_user_pages_fast((unsigned long)xfer->mem_addr, - vsg->num_pages, vsg->direction == DMA_FROM_DEVICE, + vsg->num_pages, + vsg->direction == DMA_FROM_DEVICE ? FOLL_WRITE : 0, vsg->pages); if (ret != vsg->num_pages) { if (ret < 0) diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c index 53c376d55fcf..a24548489dde 100644 --- a/drivers/gpu/drm/xen/xen_drm_front_gem.c +++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c @@ -224,8 +224,7 @@ xen_drm_front_gem_import_sg_table(struct drm_device *dev, static int gem_mmap_obj(struct xen_gem_object *xen_obj, struct vm_area_struct *vma) { - unsigned long addr = vma->vm_start; - int i; + int ret; /* * clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the @@ -252,18 +251,11 @@ static int gem_mmap_obj(struct xen_gem_object *xen_obj, * FIXME: as we insert all the pages now then no .fault handler must * be called, so don't provide one */ - for (i = 0; i < xen_obj->num_pages; i++) { - int ret; - - ret = vm_insert_page(vma, addr, xen_obj->pages[i]); - if (ret < 0) { - DRM_ERROR("Failed to insert pages into vma: %d\n", ret); - return ret; - } + ret = vm_map_pages(vma, xen_obj->pages, xen_obj->num_pages); + if (ret < 0) + DRM_ERROR("Failed to map pages into vma: %d\n", ret); - addr += PAGE_SIZE; - } - return 0; + return ret; } int xen_drm_front_gem_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 0a23048db523..e7ea819fcb11 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -295,10 +295,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, while (npages) { down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(cur_base, + ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, NULL); + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) { up_read(&mm->mmap_sem); goto umem_release; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c7226cf52acc..f962b5bbfa40 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -152,7 +152,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (range->blockable) + if (mmu_notifier_range_blockable(range)) down_read(&per_mm->umem_rwsem); else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; @@ -170,7 +170,8 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, range->end, invalidate_range_start_trampoline, - range->blockable, NULL); + mmu_notifier_range_blockable(range), + NULL); } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 24b592c6522e..02eee8eff1db 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -104,8 +104,9 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np bool writable, struct page **pages) { int ret; + unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0); - ret = get_user_pages_fast(vaddr, npages, writable, pages); + ret = get_user_pages_fast(vaddr, npages, gup_flags, pages); if (ret < 0) return ret; diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 112d2f38e0de..8ff0e90d7564 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -472,7 +472,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, goto out; } - ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages); + ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, + FOLL_WRITE | FOLL_LONGTERM, pages); if (ret < 0) goto out; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 123ca8f64f75..f712fb7fa82f 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, down_read(¤t->mm->mmap_sem); for (got = 0; got < num_pages; got += ret) { - ret = get_user_pages_longterm(start_page + got * PAGE_SIZE, - num_pages - got, - FOLL_WRITE | FOLL_FORCE, - p + got, NULL); + ret = get_user_pages(start_page + got * PAGE_SIZE, + num_pages - got, + FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE, + p + got, NULL); if (ret < 0) { up_read(¤t->mm->mmap_sem); goto bail_release; diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index ef19d39a44b1..0c204776263f 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -670,7 +670,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, else j = npages; - ret = get_user_pages_fast(addr, j, 0, pages); + ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages); if (ret != j) { i = 0; j = ret; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index da35d6fdfc5e..e312f522a66d 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = 0; while (npages) { - ret = get_user_pages_longterm(cur_base, - min_t(unsigned long, npages, - PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) goto out; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 542b0005e492..129c4badf9ae 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -648,17 +648,7 @@ out_free_pages: int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma) { - unsigned long uaddr = vma->vm_start; - unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT; - int ret = -ENXIO; - - for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) { - ret = vm_insert_page(vma, uaddr, pages[i]); - if (ret) - break; - uaddr += PAGE_SIZE; - } - return ret; + return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index 7ebd58a1c431..3cf25abf5807 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -2201,6 +2201,13 @@ int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma) goto unlock; } + /* + * vm_pgoff is treated in V4L2 API as a 'cookie' to select a buffer, + * not as a in-buffer offset. We always want to mmap a whole buffer + * from its beginning. + */ + vma->vm_pgoff = 0; + ret = call_memop(vb, mmap, vb->planes[plane].mem_priv, vma); unlock: diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index 82389aead6ed..ecbef266130b 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -186,12 +186,6 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma) return -EINVAL; } - /* - * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to - * map whole buffer - */ - vma->vm_pgoff = 0; - ret = dma_mmap_attrs(buf->dev, vma, buf->cookie, buf->dma_addr, buf->size, buf->attrs); diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 270c3162fdcb..4a4c49d6085c 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -328,28 +328,18 @@ static unsigned int vb2_dma_sg_num_users(void *buf_priv) static int vb2_dma_sg_mmap(void *buf_priv, struct vm_area_struct *vma) { struct vb2_dma_sg_buf *buf = buf_priv; - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; - int i = 0; + int err; if (!buf) { printk(KERN_ERR "No memory to map\n"); return -EINVAL; } - do { - int ret; - - ret = vm_insert_page(vma, uaddr, buf->pages[i++]); - if (ret) { - printk(KERN_ERR "Remapping memory, error: %d\n", ret); - return ret; - } - - uaddr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); - + err = vm_map_pages(vma, buf->pages, buf->num_pages); + if (err) { + printk(KERN_ERR "Remapping memory, error: %d\n", err); + return err; + } /* * Use common vm_area operations to track buffer refcount. diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 08929c087e27..870a2a526e0b 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); - err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, - flags, dma->pages, NULL); + err = get_user_pages(data & PAGE_MASK, dma->nr_pages, + flags | FOLL_LONGTERM, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; - dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err, + dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages); return err < 0 ? err : -EINVAL; } diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index 25265fd0fd6e..89cff9d1012b 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr, /* pin user pages in memory */ rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */ m->nr_pages, - m->write, /* readable/writable */ + m->write ? FOLL_WRITE : 0, /* readable/writable */ m->page_list); /* ptrs to pages */ if (rc < 0) goto fail_get_user_pages; diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c index 426ad912b441..d054e2842a5f 100644 --- a/drivers/misc/sram-exec.c +++ b/drivers/misc/sram-exec.c @@ -96,7 +96,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src, if (!part) return NULL; - if (!addr_in_gen_pool(pool, (unsigned long)dst, size)) + if (!gen_pool_has_addr(pool, (unsigned long)dst, size)) return NULL; base = (unsigned long)part->base; diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c index 997f92543dd4..422d08da3244 100644 --- a/drivers/misc/vmw_vmci/vmci_host.c +++ b/drivers/misc/vmw_vmci/vmci_host.c @@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context, /* * Lock physical page backing a given user VA. */ - retval = get_user_pages_fast(uva, 1, 1, &context->notify_page); + retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page); if (retval != 1) { context->notify_page = NULL; return VMCI_ERROR_GENERIC; diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index f5f1aac9d163..1174735f003d 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -659,7 +659,8 @@ static int qp_host_get_user_memory(u64 produce_uva, int err = VMCI_SUCCESS; retval = get_user_pages_fast((uintptr_t) produce_uva, - produce_q->kernel_if->num_pages, 1, + produce_q->kernel_if->num_pages, + FOLL_WRITE, produce_q->kernel_if->u.h.header_page); if (retval < (int)produce_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(produce) failed (retval=%d)", @@ -671,7 +672,8 @@ static int qp_host_get_user_memory(u64 produce_uva, } retval = get_user_pages_fast((uintptr_t) consume_uva, - consume_q->kernel_if->num_pages, 1, + consume_q->kernel_if->num_pages, + FOLL_WRITE, consume_q->kernel_if->u.h.header_page); if (retval < (int)consume_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(consume) failed (retval=%d)", diff --git a/drivers/mtd/nand/raw/vf610_nfc.c b/drivers/mtd/nand/raw/vf610_nfc.c index 6d43ddb3332f..e4fe8c4bc711 100644 --- a/drivers/mtd/nand/raw/vf610_nfc.c +++ b/drivers/mtd/nand/raw/vf610_nfc.c @@ -364,7 +364,7 @@ static int vf610_nfc_cmd(struct nand_chip *chip, { const struct nand_op_instr *instr; struct vf610_nfc *nfc = chip_to_nfc(chip); - int op_id = -1, trfr_sz = 0, offset; + int op_id = -1, trfr_sz = 0, offset = 0; u32 col = 0, row = 0, cmd1 = 0, cmd2 = 0, code = 0; bool force8bit = false; diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 321bc673c417..cef0133aa47a 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page, *iter_last_page_size = last_page_size; } - ret = get_user_pages_fast(first_page, requested_pages, !is_write, + ret = get_user_pages_fast(first_page, requested_pages, + !is_write ? FOLL_WRITE : 0, pages); if (ret <= 0) return -EFAULT; diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c index dd5d1103e02b..4b6418039387 100644 --- a/drivers/pps/clients/pps-gpio.c +++ b/drivers/pps/clients/pps-gpio.c @@ -31,19 +31,25 @@ #include <linux/slab.h> #include <linux/pps_kernel.h> #include <linux/pps-gpio.h> -#include <linux/gpio.h> +#include <linux/gpio/consumer.h> #include <linux/list.h> #include <linux/of_device.h> #include <linux/of_gpio.h> +#include <linux/timer.h> +#include <linux/jiffies.h> /* Info for each registered platform device */ struct pps_gpio_device_data { int irq; /* IRQ used as PPS source */ struct pps_device *pps; /* PPS source device */ struct pps_source_info info; /* PPS source information */ + struct gpio_desc *gpio_pin; /* GPIO port descriptors */ + struct gpio_desc *echo_pin; + struct timer_list echo_timer; /* timer to reset echo active state */ bool assert_falling_edge; bool capture_clear; - unsigned int gpio_pin; + unsigned int echo_active_ms; /* PPS echo active duration */ + unsigned long echo_timeout; /* timer timeout value in jiffies */ }; /* @@ -61,18 +67,101 @@ static irqreturn_t pps_gpio_irq_handler(int irq, void *data) info = data; - rising_edge = gpio_get_value(info->gpio_pin); + rising_edge = gpiod_get_value(info->gpio_pin); if ((rising_edge && !info->assert_falling_edge) || (!rising_edge && info->assert_falling_edge)) - pps_event(info->pps, &ts, PPS_CAPTUREASSERT, NULL); + pps_event(info->pps, &ts, PPS_CAPTUREASSERT, data); else if (info->capture_clear && ((rising_edge && info->assert_falling_edge) || - (!rising_edge && !info->assert_falling_edge))) - pps_event(info->pps, &ts, PPS_CAPTURECLEAR, NULL); + (!rising_edge && !info->assert_falling_edge))) + pps_event(info->pps, &ts, PPS_CAPTURECLEAR, data); return IRQ_HANDLED; } +/* This function will only be called when an ECHO GPIO is defined */ +static void pps_gpio_echo(struct pps_device *pps, int event, void *data) +{ + /* add_timer() needs to write into info->echo_timer */ + struct pps_gpio_device_data *info = data; + + switch (event) { + case PPS_CAPTUREASSERT: + if (pps->params.mode & PPS_ECHOASSERT) + gpiod_set_value(info->echo_pin, 1); + break; + + case PPS_CAPTURECLEAR: + if (pps->params.mode & PPS_ECHOCLEAR) + gpiod_set_value(info->echo_pin, 1); + break; + } + + /* fire the timer */ + if (info->pps->params.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) { + info->echo_timer.expires = jiffies + info->echo_timeout; + add_timer(&info->echo_timer); + } +} + +/* Timer callback to reset the echo pin to the inactive state */ +static void pps_gpio_echo_timer_callback(struct timer_list *t) +{ + const struct pps_gpio_device_data *info; + + info = from_timer(info, t, echo_timer); + + gpiod_set_value(info->echo_pin, 0); +} + +static int pps_gpio_setup(struct platform_device *pdev) +{ + struct pps_gpio_device_data *data = platform_get_drvdata(pdev); + struct device_node *np = pdev->dev.of_node; + int ret; + u32 value; + + data->gpio_pin = devm_gpiod_get(&pdev->dev, + NULL, /* request "gpios" */ + GPIOD_IN); + if (IS_ERR(data->gpio_pin)) { + dev_err(&pdev->dev, + "failed to request PPS GPIO\n"); + return PTR_ERR(data->gpio_pin); + } + + data->echo_pin = devm_gpiod_get_optional(&pdev->dev, + "echo", + GPIOD_OUT_LOW); + if (data->echo_pin) { + if (IS_ERR(data->echo_pin)) { + dev_err(&pdev->dev, "failed to request ECHO GPIO\n"); + return PTR_ERR(data->echo_pin); + } + + ret = of_property_read_u32(np, + "echo-active-ms", + &value); + if (ret) { + dev_err(&pdev->dev, + "failed to get echo-active-ms from OF\n"); + return ret; + } + data->echo_active_ms = value; + /* sanity check on echo_active_ms */ + if (!data->echo_active_ms || data->echo_active_ms > 999) { + dev_err(&pdev->dev, + "echo-active-ms: %u - bad value from OF\n", + data->echo_active_ms); + return -EINVAL; + } + } + + if (of_property_read_bool(np, "assert-falling-edge")) + data->assert_falling_edge = true; + return 0; +} + static unsigned long get_irqf_trigger_flags(const struct pps_gpio_device_data *data) { @@ -90,53 +179,32 @@ get_irqf_trigger_flags(const struct pps_gpio_device_data *data) static int pps_gpio_probe(struct platform_device *pdev) { struct pps_gpio_device_data *data; - const char *gpio_label; int ret; int pps_default_params; const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data; - struct device_node *np = pdev->dev.of_node; /* allocate space for device info */ - data = devm_kzalloc(&pdev->dev, sizeof(struct pps_gpio_device_data), - GFP_KERNEL); + data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; + platform_set_drvdata(pdev, data); + /* GPIO setup */ if (pdata) { data->gpio_pin = pdata->gpio_pin; - gpio_label = pdata->gpio_label; + data->echo_pin = pdata->echo_pin; data->assert_falling_edge = pdata->assert_falling_edge; data->capture_clear = pdata->capture_clear; + data->echo_active_ms = pdata->echo_active_ms; } else { - ret = of_get_gpio(np, 0); - if (ret < 0) { - dev_err(&pdev->dev, "failed to get GPIO from device tree\n"); - return ret; - } - data->gpio_pin = ret; - gpio_label = PPS_GPIO_NAME; - - if (of_get_property(np, "assert-falling-edge", NULL)) - data->assert_falling_edge = true; - } - - /* GPIO setup */ - ret = devm_gpio_request(&pdev->dev, data->gpio_pin, gpio_label); - if (ret) { - dev_err(&pdev->dev, "failed to request GPIO %u\n", - data->gpio_pin); - return ret; - } - - ret = gpio_direction_input(data->gpio_pin); - if (ret) { - dev_err(&pdev->dev, "failed to set pin direction\n"); - return -EINVAL; + ret = pps_gpio_setup(pdev); + if (ret) + return -EINVAL; } /* IRQ setup */ - ret = gpio_to_irq(data->gpio_pin); + ret = gpiod_to_irq(data->gpio_pin); if (ret < 0) { dev_err(&pdev->dev, "failed to map GPIO to IRQ: %d\n", ret); return -EINVAL; @@ -152,6 +220,11 @@ static int pps_gpio_probe(struct platform_device *pdev) data->info.owner = THIS_MODULE; snprintf(data->info.name, PPS_MAX_NAME_LEN - 1, "%s.%d", pdev->name, pdev->id); + if (data->echo_pin) { + data->info.echo = pps_gpio_echo; + data->echo_timeout = msecs_to_jiffies(data->echo_active_ms); + timer_setup(&data->echo_timer, pps_gpio_echo_timer_callback, 0); + } /* register PPS source */ pps_default_params = PPS_CAPTUREASSERT | PPS_OFFSETASSERT; @@ -173,7 +246,6 @@ static int pps_gpio_probe(struct platform_device *pdev) return -EINVAL; } - platform_set_drvdata(pdev, data); dev_info(data->pps->dev, "Registered IRQ %d as PPS source\n", data->irq); @@ -185,6 +257,11 @@ static int pps_gpio_remove(struct platform_device *pdev) struct pps_gpio_device_data *data = platform_get_drvdata(pdev); pps_unregister_source(data->pps); + if (data->echo_pin) { + del_timer_sync(&data->echo_timer); + /* reset echo pin in any case */ + gpiod_set_value(data->echo_pin, 0); + } dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq); return 0; } @@ -209,4 +286,4 @@ MODULE_AUTHOR("Ricardo Martins <rasm@fe.up.pt>"); MODULE_AUTHOR("James Nuss <jamesnuss@nanometrics.ca>"); MODULE_DESCRIPTION("Use GPIO pin as PPS source"); MODULE_LICENSE("GPL"); -MODULE_VERSION("1.0.0"); +MODULE_VERSION("1.2.0"); diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 1e1f42e210a0..4a4a75fa26d5 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, pinned = get_user_pages_fast( (unsigned long)xfer->loc_addr & PAGE_MASK, - nr_pages, dir == DMA_FROM_DEVICE, page_list); + nr_pages, + dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0, + page_list); if (pinned != nr_pages) { if (pinned < 0) { diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c index cf45829585cb..b29fc258eeba 100644 --- a/drivers/rapidio/rio_cm.c +++ b/drivers/rapidio/rio_cm.c @@ -2147,6 +2147,14 @@ static int riocm_add_mport(struct device *dev, mutex_init(&cm->rx_lock); riocm_rx_fill(cm, RIOCM_RX_RING_SIZE); cm->rx_wq = create_workqueue(DRV_NAME "/rxq"); + if (!cm->rx_wq) { + riocm_error("failed to allocate IBMBOX_%d on %s", + cmbox, mport->name); + rio_release_outb_mbox(mport, cmbox); + kfree(cm); + return -ENOMEM; + } + INIT_WORK(&cm->rx_work, rio_ibmsg_handler); cm->tx_slot = 0; diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index acd9ba40eabe..8090dc9a1514 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p) dax_dbg("uva %p", va); - ret = get_user_pages_fast((unsigned long)va, 1, 1, p); + ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p); if (ret == 1) { dax_dbg("locked page %p, for VA %p", *p, va); return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 19c022e66d63..3c6a18ad9a87 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4922,7 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp, /* Try to fault in all of the necessary pages */ /* rw==READ means read from drive, write into memory area */ - res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages); + res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0, + pages); /* Errors and no page mapped should return here */ if (res < nr_pages) diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 3912526ead66..cdb613d38062 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -15,6 +15,7 @@ #include <linux/clk.h> #include <linux/dmaengine.h> +#include <linux/interrupt.h> #include <linux/module.h> #include <linux/of.h> #include <linux/pinctrl/consumer.h> diff --git a/drivers/staging/gasket/gasket_page_table.c b/drivers/staging/gasket/gasket_page_table.c index 600928f63577..d35c4fb19e28 100644 --- a/drivers/staging/gasket/gasket_page_table.c +++ b/drivers/staging/gasket/gasket_page_table.c @@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl, ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr + off + i * PAGE_SIZE; } else { - ret = get_user_pages_fast(page_addr - offset, 1, 1, - &page); + ret = get_user_pages_fast(page_addr - offset, 1, + FOLL_WRITE, &page); if (ret <= 0) { dev_err(pg_tbl->device, diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 0b9ab1d0dd45..49fd7312e2aa 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, goto err; } - rc = get_user_pages_fast(start, num_pages, 1, shm->pages); + rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages); if (rc > 0) shm->num_pages = rc; if (rc != num_pages) { diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 59e82e6d776d..573b2055173c 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -527,8 +527,12 @@ void __handle_sysrq(int key, bool check_mask) { struct sysrq_key_op *op_p; int orig_log_level; + int orig_suppress_printk; int i; + orig_suppress_printk = suppress_printk; + suppress_printk = 0; + rcu_sysrq_start(); rcu_read_lock(); /* @@ -574,6 +578,8 @@ void __handle_sysrq(int key, bool check_mask) } rcu_read_unlock(); rcu_sysrq_end(); + + suppress_printk = orig_suppress_printk; } void handle_sysrq(int key) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 6b64e45a5269..7073c1472984 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -36,7 +36,9 @@ static void tce_iommu_detach_group(void *iommu_data, static long try_increment_locked_vm(struct mm_struct *mm, long npages) { - long ret = 0, locked, lock_limit; + long ret = 0; + s64 locked; + unsigned long lock_limit; if (WARN_ON_ONCE(!mm)) return -EPERM; @@ -44,39 +46,32 @@ static long try_increment_locked_vm(struct mm_struct *mm, long npages) if (!npages) return 0; - down_write(&mm->mmap_sem); - locked = mm->locked_vm + npages; + locked = atomic64_add_return(npages, &mm->locked_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { ret = -ENOMEM; - else - mm->locked_vm += npages; - - pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid, - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK), - ret ? " - exceeded" : ""); + atomic64_sub(npages, &mm->locked_vm); + } - up_write(&mm->mmap_sem); + pr_debug("[%d] RLIMIT_MEMLOCK +%ld %lld/%lu%s\n", current->pid, + npages << PAGE_SHIFT, locked << PAGE_SHIFT, + lock_limit, ret ? " - exceeded" : ""); return ret; } static void decrement_locked_vm(struct mm_struct *mm, long npages) { + s64 locked; + if (!mm || !npages) return; - down_write(&mm->mmap_sem); - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; - pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid, - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, + locked = atomic64_sub_return(npages, &mm->locked_vm); + WARN_ON_ONCE(locked < 0); + pr_debug("[%d] RLIMIT_MEMLOCK -%ld %lld/%lu\n", current->pid, + npages << PAGE_SHIFT, locked << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); } /* @@ -532,7 +527,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) enum dma_data_direction direction = iommu_tce_direction(tce); if (get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, &page) != 1) + direction != DMA_TO_DEVICE ? FOLL_WRITE : 0, + &page) != 1) return -EFAULT; *hpa = __pa((unsigned long) page_address(page)); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 3be1db3501cc..0237ace12998 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -264,7 +264,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) { struct mm_struct *mm; - int ret; + s64 locked_vm; + int ret = 0; if (!npage) return 0; @@ -273,24 +274,15 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) if (!mm) return -ESRCH; /* process exited */ - ret = down_write_killable(&mm->mmap_sem); - if (!ret) { - if (npage > 0) { - if (!dma->lock_cap) { - unsigned long limit; - - limit = task_rlimit(dma->task, - RLIMIT_MEMLOCK) >> PAGE_SHIFT; + locked_vm = atomic64_add_return(npage, &mm->locked_vm); - if (mm->locked_vm + npage > limit) - ret = -ENOMEM; - } + if (npage > 0 && !dma->lock_cap) { + unsigned long limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> + PAGE_SHIFT; + if (locked_vm > limit) { + atomic64_sub(npage, &mm->locked_vm); + ret = -ENOMEM; } - - if (!ret) - mm->locked_vm += npage; - - up_write(&mm->mmap_sem); } if (async) @@ -358,7 +350,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); if (mm == current->mm) { - ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas); + ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, + vmas); } else { ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, vmas, NULL); @@ -408,6 +401,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, long ret, pinned = 0, lock_acct = 0; bool rsvd; dma_addr_t iova = vaddr - dma->vaddr + dma->iova; + atomic64_t *locked_vm = ¤t->mm->locked_vm; /* This code path is only user initiated */ if (!current->mm) @@ -425,7 +419,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, * pages are already counted against the user. */ if (!rsvd && !vfio_find_vpfn(dma, iova)) { - if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) { + if (!dma->lock_cap && atomic64_read(locked_vm) + 1 > limit) { put_pfn(*pfn_base, dma->prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); @@ -452,7 +446,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, if (!rsvd && !vfio_find_vpfn(dma, iova)) { if (!dma->lock_cap && - current->mm->locked_vm + lock_acct + 1 > limit) { + atomic64_read(locked_vm) + lock_acct + 1 > limit) { put_pfn(pfn, dma->prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 351af88231ad..1e3ed41ae1f3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1704,7 +1704,7 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, 1, &page); + r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 53b8ceea9bde..fb45f866b923 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -155,21 +155,6 @@ static const struct backlight_ops pwm_backlight_ops = { #ifdef CONFIG_OF #define PWM_LUMINANCE_SCALE 10000 /* luminance scale */ -/* An integer based power function */ -static u64 int_pow(u64 base, int exp) -{ - u64 result = 1; - - while (exp) { - if (exp & 1) - result *= base; - exp >>= 1; - base *= base; - } - - return result; -} - /* * CIE lightness to PWM conversion. * diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index dfed532ed606..4e4d6a0df978 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c @@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, if (!pages) return -ENOMEM; - ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages); + ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages); if (ret < nr_pages) { nr_pages = ret; ret = -EINVAL; diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 8ba726e600e9..6446bcab4185 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) /* Get the physical addresses of the source buffer */ num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset, - num_pages, param.source != -1, pages); + num_pages, param.source != -1 ? FOLL_WRITE : 0, pages); if (num_pinned != num_pages) { /* get_user_pages() failed */ diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 7cf9c51318aa..469dfbd6cf90 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -526,20 +526,20 @@ static int mn_invl_range_start(struct mmu_notifier *mn, struct gntdev_grant_map *map; int ret = 0; - if (range->blockable) + if (mmu_notifier_range_blockable(range)) mutex_lock(&priv->lock); else if (!mutex_trylock(&priv->lock)) return -EAGAIN; list_for_each_entry(map, &priv->maps, next) { ret = unmap_if_in_range(map, range->start, range->end, - range->blockable); + mmu_notifier_range_blockable(range)); if (ret) goto out_unlock; } list_for_each_entry(map, &priv->freeable_maps, next) { ret = unmap_if_in_range(map, range->start, range->end, - range->blockable); + mmu_notifier_range_blockable(range)); if (ret) goto out_unlock; } @@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, unsigned long xen_pfn; int ret; - ret = get_user_pages_fast(addr, 1, writeable, &page); + ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page); if (ret < 0) return ret; @@ -1084,7 +1084,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) int index = vma->vm_pgoff; int count = vma_pages(vma); struct gntdev_grant_map *map; - int i, err = -EINVAL; + int err = -EINVAL; if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) return -EINVAL; @@ -1145,12 +1145,9 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) goto out_put_map; if (!use_ptemod) { - for (i = 0; i < count; i++) { - err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, - map->pages[i]); - if (err) - goto out_put_map; - } + err = vm_map_pages(vma, map->pages, map->count); + if (err) + goto out_put_map; } else { #ifdef CONFIG_X86 /* diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c index a1c61e351d3f..dd5bbb6e1b6b 100644 --- a/drivers/xen/privcmd-buf.c +++ b/drivers/xen/privcmd-buf.c @@ -165,12 +165,8 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_priv->n_pages != count) ret = -ENOMEM; else - for (i = 0; i < vma_priv->n_pages; i++) { - ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, - vma_priv->pages[i]); - if (ret) - break; - } + ret = vm_map_pages_zero(vma, vma_priv->pages, + vma_priv->n_pages); if (ret) privcmd_buf_vmapriv_free(vma_priv); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7d09d125f148..fa9e99a962e0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -524,6 +524,19 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ +static inline int make_prot(u32 p_flags) +{ + int prot = 0; + + if (p_flags & PF_R) + prot |= PROT_READ; + if (p_flags & PF_W) + prot |= PROT_WRITE; + if (p_flags & PF_X) + prot |= PROT_EXEC; + return prot; +} + /* This is much more generalized than the library routine read function, so we keep this separate. Technically the library read function is only provided so that we can read a.out libraries that have @@ -563,16 +576,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; - int elf_prot = 0; + int elf_prot = make_prot(eppnt->p_flags); unsigned long vaddr = 0; unsigned long k, map_addr; - if (eppnt->p_flags & PF_R) - elf_prot = PROT_READ; - if (eppnt->p_flags & PF_W) - elf_prot |= PROT_WRITE; - if (eppnt->p_flags & PF_X) - elf_prot |= PROT_EXEC; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED_NOREPLACE; @@ -687,7 +694,6 @@ static int load_elf_binary(struct linux_binprm *bprm) struct file *interpreter = NULL; /* to shut gcc up */ unsigned long load_addr = 0, load_bias = 0; int load_addr_set = 0; - char * elf_interpreter = NULL; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; unsigned long elf_bss, elf_brk; @@ -698,13 +704,12 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; - struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; struct elfhdr interp_elf_ex; } *loc; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; - loff_t pos; + struct pt_regs *regs; loc = kmalloc(sizeof(*loc), GFP_KERNEL); if (!loc) { @@ -734,69 +739,66 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; elf_ppnt = elf_phdata; - elf_bss = 0; - elf_brk = 0; + for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + char *elf_interpreter; + loff_t pos; - start_code = ~0UL; - end_code = 0; - start_data = 0; - end_data = 0; + if (elf_ppnt->p_type != PT_INTERP) + continue; - for (i = 0; i < loc->elf_ex.e_phnum; i++) { - if (elf_ppnt->p_type == PT_INTERP) { - /* This is the program interpreter used for - * shared libraries - for now assume that this - * is an a.out format binary - */ - retval = -ENOEXEC; - if (elf_ppnt->p_filesz > PATH_MAX || - elf_ppnt->p_filesz < 2) - goto out_free_ph; - - retval = -ENOMEM; - elf_interpreter = kmalloc(elf_ppnt->p_filesz, - GFP_KERNEL); - if (!elf_interpreter) - goto out_free_ph; - - pos = elf_ppnt->p_offset; - retval = kernel_read(bprm->file, elf_interpreter, - elf_ppnt->p_filesz, &pos); - if (retval != elf_ppnt->p_filesz) { - if (retval >= 0) - retval = -EIO; - goto out_free_interp; - } - /* make sure path is NULL terminated */ - retval = -ENOEXEC; - if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') - goto out_free_interp; + /* + * This is the program interpreter used for shared libraries - + * for now assume that this is an a.out format binary. + */ + retval = -ENOEXEC; + if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2) + goto out_free_ph; - interpreter = open_exec(elf_interpreter); - retval = PTR_ERR(interpreter); - if (IS_ERR(interpreter)) - goto out_free_interp; + retval = -ENOMEM; + elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); + if (!elf_interpreter) + goto out_free_ph; - /* - * If the binary is not readable then enforce - * mm->dumpable = 0 regardless of the interpreter's - * permissions. - */ - would_dump(bprm, interpreter); - - /* Get the exec headers */ - pos = 0; - retval = kernel_read(interpreter, &loc->interp_elf_ex, - sizeof(loc->interp_elf_ex), &pos); - if (retval != sizeof(loc->interp_elf_ex)) { - if (retval >= 0) - retval = -EIO; - goto out_free_dentry; - } + pos = elf_ppnt->p_offset; + retval = kernel_read(bprm->file, elf_interpreter, + elf_ppnt->p_filesz, &pos); + if (retval != elf_ppnt->p_filesz) { + if (retval >= 0) + retval = -EIO; + goto out_free_interp; + } + /* make sure path is NULL terminated */ + retval = -ENOEXEC; + if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') + goto out_free_interp; - break; + interpreter = open_exec(elf_interpreter); + kfree(elf_interpreter); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) + goto out_free_ph; + + /* + * If the binary is not readable then enforce mm->dumpable = 0 + * regardless of the interpreter's permissions. + */ + would_dump(bprm, interpreter); + + /* Get the exec headers */ + pos = 0; + retval = kernel_read(interpreter, &loc->interp_elf_ex, + sizeof(loc->interp_elf_ex), &pos); + if (retval != sizeof(loc->interp_elf_ex)) { + if (retval >= 0) + retval = -EIO; + goto out_free_dentry; } - elf_ppnt++; + + break; + +out_free_interp: + kfree(elf_interpreter); + goto out_free_ph; } elf_ppnt = elf_phdata; @@ -819,7 +821,7 @@ static int load_elf_binary(struct linux_binprm *bprm) } /* Some simple consistency checks for the interpreter */ - if (elf_interpreter) { + if (interpreter) { retval = -ELIBBAD; /* Not an ELF interpreter */ if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) @@ -884,13 +886,19 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval < 0) goto out_free_dentry; - current->mm->start_stack = bprm->p; + elf_bss = 0; + elf_brk = 0; + + start_code = ~0UL; + end_code = 0; + start_data = 0; + end_data = 0; /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; + int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; unsigned long k, vaddr; unsigned long total_size = 0; @@ -931,12 +939,7 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_fixed = MAP_FIXED; } - if (elf_ppnt->p_flags & PF_R) - elf_prot |= PROT_READ; - if (elf_ppnt->p_flags & PF_W) - elf_prot |= PROT_WRITE; - if (elf_ppnt->p_flags & PF_X) - elf_prot |= PROT_EXEC; + elf_prot = make_prot(elf_ppnt->p_flags); elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; @@ -978,7 +981,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * independently randomized mmap region (0 load_bias * without MAP_FIXED). */ - if (elf_interpreter) { + if (interpreter) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); @@ -1076,7 +1079,7 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out_free_dentry; } - if (elf_interpreter) { + if (interpreter) { unsigned long interp_map_addr = 0; elf_entry = load_elf_interp(&loc->interp_elf_ex, @@ -1100,7 +1103,6 @@ static int load_elf_binary(struct linux_binprm *bprm) allow_write_access(interpreter); fput(interpreter); - kfree(elf_interpreter); } else { elf_entry = loc->elf_ex.e_entry; if (BAD_ADDR(elf_entry)) { @@ -1115,7 +1117,7 @@ static int load_elf_binary(struct linux_binprm *bprm) set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, !!elf_interpreter); + retval = arch_setup_additional_pages(bprm, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ @@ -1132,6 +1134,17 @@ static int load_elf_binary(struct linux_binprm *bprm) current->mm->start_stack = bprm->p; if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { + /* + * For architectures with ELF randomization, when executing + * a loader directly (i.e. no interpreter listed in ELF + * headers), move the brk area out of the mmap region + * (since it grows up, and may collide early with the stack + * growing down), and into the unused ELF_ET_DYN_BASE region. + */ + if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter) + current->mm->brk = current->mm->start_brk = + ELF_ET_DYN_BASE; + current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); #ifdef compat_brk_randomized @@ -1148,6 +1161,7 @@ static int load_elf_binary(struct linux_binprm *bprm) MAP_FIXED | MAP_PRIVATE, 0); } + regs = current_pt_regs(); #ifdef ELF_PLAT_INIT /* * The ABI may specify that certain registers be set up in special @@ -1176,8 +1190,6 @@ out_free_dentry: allow_write_access(interpreter); if (interpreter) fput(interpreter); -out_free_interp: - kfree(elf_interpreter); out_free_ph: kfree(elf_phdata); goto out; @@ -1456,8 +1468,6 @@ static void fill_elf_header(struct elfhdr *elf, int segs, elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize = sizeof(struct elf_phdr); elf->e_phnum = segs; - - return; } static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) @@ -1470,7 +1480,6 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_memsz = 0; phdr->p_flags = 0; phdr->p_align = 0; - return; } static void fill_note(struct memelfnote *note, const char *name, int type, @@ -1480,7 +1489,6 @@ static void fill_note(struct memelfnote *note, const char *name, int type, note->type = type; note->datasz = sz; note->data = data; - return; } /* diff --git a/fs/buffer.c b/fs/buffer.c index 0faa41fb4c88..2e088db43646 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -955,10 +955,20 @@ grow_dev_page(struct block_device *bdev, sector_t block, end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, size); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x01; +#endif goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x02; +#endif goto failed; + } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x04; +#endif } /* @@ -978,6 +988,9 @@ grow_dev_page(struct block_device *bdev, sector_t block, spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x08; +#endif failed: unlock_page(page); put_page(page); @@ -1033,6 +1046,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, return NULL; } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_stamp = jiffies; + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; +#endif for (;;) { struct buffer_head *bh; int ret; @@ -1044,6 +1063,24 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; + +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) + continue; + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n", + current->comm, current->pid, current->getblk_executed, + current->getblk_bh_count, current->getblk_bh_state, + IS_ERR_OR_NULL(bdev->bd_super) ? -1L : + bdev->bd_super->s_blocksize, size, + IS_ERR_OR_NULL(bdev->bd_super) ? -1 : + bdev->bd_super->s_blocksize_bits, + IS_ERR_OR_NULL(bdev->bd_inode) ? -1 : + bdev->bd_inode->i_blkbits); + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; + current->getblk_stamp = jiffies; +#endif } } @@ -3226,6 +3263,11 @@ EXPORT_SYMBOL(sync_dirty_buffer); */ static inline int buffer_busy(struct buffer_head *bh) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x80; + current->getblk_bh_count = atomic_read(&bh->b_count); + current->getblk_bh_state = bh->b_state; +#endif return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } @@ -3264,11 +3306,18 @@ int try_to_free_buffers(struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) + if (PageWriteback(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x10; +#endif return 0; + } if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(page, &buffers_to_free); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x20; +#endif goto out; } @@ -3292,6 +3341,9 @@ int try_to_free_buffers(struct page *page) if (ret) cancel_dirty_page(page); spin_unlock(&mapping->private_lock); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x40; +#endif out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; @@ -814,7 +814,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, goto unlock_pmd; flush_cache_page(vma, address, pfn); - pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmdp_invalidate(vma, address, pmdp); pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); diff --git a/fs/eventfd.c b/fs/eventfd.c index 08d3bd602f73..ce8fa15cebe5 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -21,6 +21,9 @@ #include <linux/eventfd.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/idr.h> + +DEFINE_IDA(eventfd_ida); struct eventfd_ctx { struct kref kref; @@ -35,6 +38,7 @@ struct eventfd_ctx { */ __u64 count; unsigned int flags; + int id; }; /** @@ -69,6 +73,8 @@ EXPORT_SYMBOL_GPL(eventfd_signal); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { + if (ctx->id >= 0) + ida_simple_remove(&eventfd_ida, ctx->id); kfree(ctx); } @@ -297,6 +303,7 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) seq_printf(m, "eventfd-count: %16llx\n", (unsigned long long)ctx->count); spin_unlock_irq(&ctx->wqh.lock); + seq_printf(m, "eventfd-id: %d\n", ctx->id); } #endif @@ -400,6 +407,7 @@ static int do_eventfd(unsigned int count, int flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; + ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); diff --git a/fs/exec.c b/fs/exec.c index 2e0033348d8e..d88584ebf07f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1652,11 +1652,13 @@ int search_binary_handler(struct linux_binprm *bprm) if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); + bprm->recursion_depth++; retval = fmt->load_binary(bprm); + bprm->recursion_depth--; + read_lock(&binfmt_lock); put_binfmt(fmt); - bprm->recursion_depth--; if (retval < 0 && !bprm->mm) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); diff --git a/fs/fat/file.c b/fs/fat/file.c index b3bed32946b1..0e3ed79fcc3f 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -193,12 +193,17 @@ static int fat_file_release(struct inode *inode, struct file *filp) int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; - int res, err; + int err; + + err = __generic_file_fsync(filp, start, end, datasync); + if (err) + return err; - res = generic_file_fsync(filp, start, end, datasync); err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + if (err) + return err; - return res ? res : err; + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c74ef4426282..1dcc57189382 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -440,9 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, current->mm, - &pseudo_vma, - mapping, index, 0); + hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -499,8 +497,15 @@ static void hugetlbfs_evict_inode(struct inode *inode) struct resv_map *resv_map; remove_inode_hugepages(inode, 0, LLONG_MAX); - resv_map = (struct resv_map *)inode->i_mapping->private_data; - /* root inode doesn't have the resv_map, so we should check it */ + + /* + * Get the resv_map from the address space embedded in the inode. + * This is the address space which points to any resv_map allocated + * at inode creation time. If this is a device special inode, + * i_mapping may not point to the original address space. + */ + resv_map = (struct resv_map *)(&inode->i_data)->private_data; + /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); clear_inode(inode); @@ -639,8 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, - index, addr); + hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ diff --git a/fs/io_uring.c b/fs/io_uring.c index c57eda9d0dc4..f55bbc1b934d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2698,8 +2698,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, - pages, vmas); + pret = get_user_pages(ubuf, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); if (pret == nr_pages) { /* don't support file backed memory */ for (j = 0; j < nr_pages; j++) { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 6f0999015a44..c89fa18c9b03 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6007,6 +6007,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct buffer_head *data_alloc_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; + struct ocfs2_journal *journal = osb->journal; BUG_ON(inode_trylock(tl_inode)); @@ -6027,6 +6028,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } + /* Appending truncate log(TA) and and flushing truncate log(TF) are + * two separated transactions. They can be both committed but not + * checkpointed. If crash occurs then, both two transaction will be + * replayed with several already released to global bitmap clusters. + * Then truncate log will be replayed resulting in cluster double free. + */ + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + mlog_errno(status); + goto out; + } + data_alloc_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 832c1759a09a..e8ea6f783f19 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2153,13 +2153,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, struct ocfs2_dio_write_ctxt *dwc = NULL; struct buffer_head *di_bh = NULL; u64 p_blkno; - loff_t pos = iblock << inode->i_sb->s_blocksize_bits; + unsigned int i_blkbits = inode->i_sb->s_blocksize_bits; + loff_t pos = iblock << i_blkbits; + sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits; unsigned len, total_len = bh_result->b_size; int ret = 0, first_get_block = 0; len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); len = min(total_len, len); + /* + * bh_result->b_size is count in get_more_blocks according to write + * "pos" and "end", we need map twice to return different buffer state: + * 1. area in file size, not set NEW; + * 2. area out file size, set NEW. + * + * iblock endblk + * |--------|---------|---------|--------- + * |<-------area in file------->| + */ + + if ((iblock <= endblk) && + ((iblock + ((len - 1) >> i_blkbits)) > endblk)) + len = (endblk - iblock + 1) << i_blkbits; + mlog(0, "get block of %lu at %llu:%u req %u\n", inode->i_ino, pos, len, total_len); @@ -2243,6 +2260,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, if (desc->c_needs_zero) set_buffer_new(bh_result); + if (iblock > endblk) + set_buffer_new(bh_result); + /* May sleep in end_io. It should not happen in a irq context. So defer * it to dio work queue. */ set_buffer_defer_completion(bh_result); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c121abbdfc7d..85f21caaa6ec 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -69,10 +69,6 @@ #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -static unsigned char ocfs2_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - static int ocfs2_do_extend_dir(struct super_block *sb, handle_t *handle, struct inode *dir, @@ -1718,7 +1714,7 @@ int __ocfs2_add_entry(handle_t *handle, de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); de = de1; } - de->file_type = OCFS2_FT_UNKNOWN; + de->file_type = FT_UNKNOWN; if (blkno) { de->inode = cpu_to_le64(blkno); ocfs2_set_de_type(de, inode->i_mode); @@ -1803,13 +1799,9 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, } offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { - unsigned char d_type = DT_UNKNOWN; - - if (de->file_type < OCFS2_FT_MAX) - d_type = ocfs2_filetype_table[de->file_type]; - if (!dir_emit(ctx, de->name, de->name_len, - le64_to_cpu(de->inode), d_type)) + le64_to_cpu(de->inode), + fs_ftype_to_dtype(de->file_type))) goto out; } ctx->pos += le16_to_cpu(de->rec_len); @@ -1900,14 +1892,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, break; } if (le64_to_cpu(de->inode)) { - unsigned char d_type = DT_UNKNOWN; - - if (de->file_type < OCFS2_FT_MAX) - d_type = ocfs2_filetype_table[de->file_type]; if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), - d_type)) { + fs_ftype_to_dtype(de->file_type))) { brelse(bh); return 0; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 63d701cd1e2e..c8e9b7031d9a 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -105,7 +105,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, enum dlm_status status; int actions = 0; int in_use; - u8 owner; + u8 owner; + int recovery_wait = 0; mlog(0, "master_node = %d, valblk = %d\n", master_node, flags & LKM_VALBLK); @@ -208,9 +209,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, } if (flags & LKM_CANCEL) lock->cancel_pending = 0; - else - lock->unlock_pending = 0; - + else { + if (!lock->unlock_pending) + recovery_wait = 1; + else + lock->unlock_pending = 0; + } } /* get an extra ref on lock. if we are just switching @@ -244,6 +248,17 @@ leave: spin_unlock(&res->spinlock); wake_up(&res->wq); + if (recovery_wait) { + spin_lock(&res->spinlock); + /* Unlock request will directly succeed after owner dies, + * and the lock is already removed from grant list. We have to + * wait for RECOVERING done or we miss the chance to purge it + * since the removement is much faster than RECOVERING proc. + */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING); + spin_unlock(&res->spinlock); + } + /* let the caller's final dlm_lock_put handle the actual kfree */ if (actions & DLM_UNLOCK_FREE_LOCK) { /* this should always be coupled with list removal */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index af405586c5b1..dccf4136f8c1 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -448,7 +448,7 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, struct ocfs2_mask_waiter *mw, int ret) { u32 usec; - ktime_t kt; + ktime_t last, kt; struct ocfs2_lock_stats *stats; if (level == LKM_PRMODE) @@ -458,7 +458,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, else return; - kt = ktime_sub(ktime_get(), mw->mw_lock_start); + last = ktime_get(); + kt = ktime_sub(last, mw->mw_lock_start); usec = ktime_to_us(kt); stats->ls_gets++; @@ -474,6 +475,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, if (ret) stats->ls_fail++; + + stats->ls_last = ktime_to_timespec(last).tv_sec; } static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) @@ -3093,8 +3096,10 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) * - Lock stats printed * New in version 3 * - Max time in lock stats is in usecs (instead of nsecs) + * New in version 4 + * - Add last pr/ex unlock times in secs */ -#define OCFS2_DLM_DEBUG_STR_VERSION 3 +#define OCFS2_DLM_DEBUG_STR_VERSION 4 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) { int i; @@ -3145,6 +3150,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max) # define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max) # define lock_refresh(_l) ((_l)->l_lock_refresh) +# define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last) +# define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last) #else # define lock_num_prmode(_l) (0) # define lock_num_exmode(_l) (0) @@ -3155,6 +3162,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_prmode(_l) (0) # define lock_max_exmode(_l) (0) # define lock_refresh(_l) (0) +# define lock_last_prmode(_l) (0) +# define lock_last_exmode(_l) (0) #endif /* The following seq_print was added in version 2 of this output */ seq_printf(m, "%u\t" @@ -3165,6 +3174,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) "%llu\t" "%u\t" "%u\t" + "%u\t" + "%u\t" "%u\t", lock_num_prmode(lockres), lock_num_exmode(lockres), @@ -3174,7 +3185,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) lock_total_exmode(lockres), lock_max_prmode(lockres), lock_max_exmode(lockres), - lock_refresh(lockres)); + lock_refresh(lockres), + lock_last_prmode(lockres), + lock_last_exmode(lockres)); /* End the line */ seq_printf(m, "\n"); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 4bf8d5854b27..af2888d23de3 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -148,16 +148,24 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) u64 blkno; struct dentry *parent; struct inode *dir = d_inode(child); + int set; trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); + status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + parent = ERR_PTR(status); + goto bail; + } + status = ocfs2_inode_lock(dir, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); parent = ERR_PTR(status); - goto bail; + goto unlock_nfs_sync; } status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); @@ -166,11 +174,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) goto bail_unlock; } + status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set); + if (status < 0) { + if (status == -EINVAL) { + status = -ESTALE; + } else + mlog(ML_ERROR, "test inode bit failed %d\n", status); + parent = ERR_PTR(status); + goto bail_unlock; + } + + trace_ocfs2_get_dentry_test_bit(status, set); + if (!set) { + status = -ESTALE; + parent = ERR_PTR(status); + goto bail_unlock; + } + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); bail_unlock: ocfs2_inode_unlock(dir, 0); +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1); + bail: trace_ocfs2_get_parent_end(parent); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 1f029fbe8b8d..8efa022684f4 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -164,6 +164,7 @@ struct ocfs2_lock_stats { /* Storing max wait in usecs saves 24 bytes per inode */ u32 ls_max; /* Max wait in USEC */ + u32 ls_last; /* Last unlock time in SEC */ }; #endif diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 7071ad0dec90..b86bf5e74348 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -392,21 +392,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { #define OCFS2_HB_GLOBAL "heartbeat=global" /* - * OCFS2 directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. - */ -#define OCFS2_FT_UNKNOWN 0 -#define OCFS2_FT_REG_FILE 1 -#define OCFS2_FT_DIR 2 -#define OCFS2_FT_CHRDEV 3 -#define OCFS2_FT_BLKDEV 4 -#define OCFS2_FT_FIFO 5 -#define OCFS2_FT_SOCK 6 -#define OCFS2_FT_SYMLINK 7 - -#define OCFS2_FT_MAX 8 - -/* * OCFS2_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 @@ -424,17 +409,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { #define OCFS2_LINKS_HI_SHIFT 16 #define OCFS2_DX_ENTRIES_MAX (0xffffffffU) -#define S_SHIFT 12 -static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, - [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, - [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, -}; - /* * Convenience casts @@ -1629,7 +1603,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, umode_t mode) { - de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + de->file_type = fs_umode_to_ftype(mode); } static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index d4811f981608..2bb916d68576 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, /* map the pages */ ret = get_user_pages_fast((unsigned long)user_desc->ptr, - bufmap->page_count, 1, bufmap->page_array); + bufmap->page_count, FOLL_WRITE, bufmap->page_array); if (ret < 0) return ret; diff --git a/fs/proc/base.c b/fs/proc/base.c index b6ccb6c57706..9c8ca6cd3ce4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -510,7 +510,7 @@ static ssize_t lstats_write(struct file *file, const char __user *buf, if (!task) return -ESRCH; - clear_all_latency_tracing(task); + clear_tsk_latency_tracing(task); put_task_struct(task); return count; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 568d90e17c17..465ea0153b2a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); - show_val_kb(m, "VmallocUsed: ", 0ul); + show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 95ca1fe7283c..a1c2ad9f960a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -58,7 +58,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) swap = get_mm_counter(mm, MM_SWAPENTS); SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); - SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); + SEQ_PUT_DEC(" kB\nVmLck:\t", atomic64_read(&mm->locked_vm)); SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); @@ -1169,7 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(&range, mm, 0, -1UL); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 11201b2d06b9..a26ae0b83f8f 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -145,6 +145,17 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * return error; } +static int ramfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOSPC; + d_tmpfile(dentry, inode); + return 0; +} + static const struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, @@ -155,6 +166,7 @@ static const struct inode_operations ramfs_dir_inode_operations = { .rmdir = simple_rmdir, .mknod = ramfs_mknod, .rename = simple_rename, + .tmpfile = ramfs_tmpfile, }; /* diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 32d8986c26fb..b5b26d8a192c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -450,6 +450,15 @@ fail: static inline __u32 xattr_hash(const char *msg, int len) { + /* + * csum_partial() gives different results for little-endian and + * big endian hosts. Images created on little-endian hosts and + * mounted on big-endian hosts(and vice versa) will see csum mismatches + * when trying to fetch xattrs. Treating the hash as __wsum_t would + * lower the frequency of mismatch. This is an endianness bug in + * reiserfs. The return statement would result in a sparse warning. Do + * not fix the sparse warning so as to not hide a reminder of the bug. + */ return csum_partial(msg, len, 0); } diff --git a/fs/sync.c b/fs/sync.c index 01e82170545a..39df8c2958fe 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -292,8 +292,14 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { + int sync_mode = WB_SYNC_NONE; + + if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == + SYNC_FILE_RANGE_WRITE_AND_WAIT) + sync_mode = WB_SYNC_ALL; + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + sync_mode); if (ret < 0) goto out; } @@ -306,9 +312,9 @@ out: } /* - * sys_sync_file_range() permits finely controlled syncing over a segment of + * ksys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is - * zero then sys_sync_file_range() will operate from offset out to EOF. + * zero then ksys_sync_file_range() will operate from offset out to EOF. * * The flag bits are: * @@ -325,7 +331,7 @@ out: * Useful combinations of the flag bits are: * * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages - * in the range which were dirty on entry to sys_sync_file_range() are placed + * in the range which were dirty on entry to ksys_sync_file_range() are placed * under writeout. This is a start-write-for-data-integrity operation. * * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which @@ -337,10 +343,13 @@ out: * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait * for that operation to complete and to return the result. * - * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER + * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT): * a traditional sync() operation. This is a write-for-data-integrity operation * which will ensure that all pages in the range which were dirty on entry to - * sys_sync_file_range() are committed to disk. + * ksys_sync_file_range() are committed to disk. It should be noted that disk + * caches are not flushed by this call, so there are no guarantees here that the + * data will be available on disk after a crash. * * * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f5de1e726356..3b30301c90ec 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,8 @@ #include <linux/security.h> #include <linux/hugetlb.h> +int sysctl_unprivileged_userfaultfd __read_mostly = 1; + static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; enum userfaultfd_state { @@ -1930,6 +1932,9 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) struct userfaultfd_ctx *ctx; int fd; + if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE)) + return -EPERM; + BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ diff --git a/include/asm-generic/dynamic_debug.h b/include/asm-generic/dynamic_debug.h new file mode 100644 index 000000000000..f1dd3019cd98 --- /dev/null +++ b/include/asm-generic/dynamic_debug.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_DYNAMIC_DEBUG_H +#define _ASM_GENERIC_DYNAMIC_DEBUG_H + +#ifndef _DYNAMIC_DEBUG_H +#error "don't include asm/dynamic_debug.h directly" +#endif + +#include <linux/build_bug.h> +#ifdef CONFIG_JUMP_LABEL +#include <linux/jump_label.h> +#endif + +/* + * We need to know the exact layout of struct _ddebug in order to + * initialize it in assembly. Check that all members are at expected + * offsets - if any of these fail, the arch cannot use this generic + * dynamic_debug.h. DYNAMIC_DEBUG_RELATIVE_POINTERS is pointless for + * !64BIT, so we expect the static_key to be at an 8-byte boundary + * since it contains stuff which is long-aligned. + */ + +static_assert(BITS_PER_LONG == 64); +static_assert(offsetof(struct _ddebug, modname_disp) == 0); +static_assert(offsetof(struct _ddebug, function_disp) == 4); +static_assert(offsetof(struct _ddebug, filename_disp) == 8); +static_assert(offsetof(struct _ddebug, format_disp) == 12); +static_assert(offsetof(struct _ddebug, flags_lineno) == 16); + +#ifdef CONFIG_JUMP_LABEL +static_assert(offsetof(struct _ddebug, key) == 24); +static_assert(offsetof(struct static_key, enabled) == 0); +static_assert(offsetof(struct static_key, type) == 8); + +/* <4 bytes padding>, .enabled, <4 bytes padding>, .type */ +# ifdef DEBUG +# define _DPRINTK_ASM_KEY_INIT \ + "\t.int 0\n" "\t.int 1\n" "\t.int 0\n" "\t.quad "__stringify(__JUMP_TYPE_TRUE)"\n" +# else +# define _DPRINTK_ASM_KEY_INIT \ + "\t.int 0\n" "\t.int 0\n" "\t.int 0\n" "\t.quad "__stringify(__JUMP_TYPE_FALSE)"\n" +# endif +#else /* !CONFIG_JUMP_LABEL */ +#define _DPRINTK_ASM_KEY_INIT "" +#endif + +/* + * There's a bit of magic involved here. + * + * First, unlike the bug table entries, we need to define an object in + * assembly which we can reference from C code (for use by the + * DYNAMIC_DEBUG_BRANCH macro), but we don't want 'name' to have + * external linkage (that would require use of globally unique + * identifiers, which we can't guarantee). Fortunately, the "extern" + * keyword just tells the compiler that _somebody_ provides that + * symbol - usually that somebody is the linker, but in this case it's + * the assembler, and since we do not do .globl name, the symbol gets + * internal linkage. + * + * So far so good. The next problem is that there's no scope in + * assembly, so the identifier 'name' has to be unique within each + * translation unit - otherwise all uses of that identifier end up + * referring to the same struct _ddebug instance. pr_debug and friends + * do this by use of indirection and __UNIQUE_ID(), and new users of + * DEFINE_DYNAMIC_DEBUG_METADATA() should do something similar. We + * need to catch cases where this is not done at build time. + * + * With assembly-level .ifndef we can ensure that we only define a + * given identifier once, preventing "symbol 'foo' already defined" + * errors. But we still need to detect and fail on multiple uses of + * the same identifer. The simplest, and wrong, solution to that is to + * add an .else .error branch to the .ifndef. The problem is that just + * because the DEFINE_DYNAMIC_DEBUG_METADATA() macro is only expanded + * once with a given identifier, the compiler may emit the assembly + * code multiple times, e.g. if the macro appears in an inline + * function. Now, in a normal case like + * + * static inline get_next_id(void) { static int v; return ++v; } + * + * all inlined copies of get_next_id are _supposed_ to refer to the + * same object 'v'. So we do need to allow this chunk of assembly to + * appear multiple times with the same 'name', as long as they all + * came from the same DEFINE_DYNAMIC_DEBUG_METADATA() instance. To do + * that, we pass __COUNTER__ to the asm(), and set an assembler symbol + * name.ddebug.once to that value when we first define 'name'. When we + * meet a second attempt at defining 'name', we compare + * name.ddebug.once to %6 and error out if they are different. + */ + +#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ + extern struct _ddebug name; \ + asm volatile(".ifndef " __stringify(name) "\n" \ + ".pushsection __verbose,\"aw\"\n" \ + ".type "__stringify(name)", STT_OBJECT\n" \ + ".size "__stringify(name)", %c5\n" \ + "1:\n" \ + __stringify(name) ":\n" \ + "\t.int %c0 - 1b /* _ddebug::modname_disp */\n" \ + "\t.int %c1 - 1b /* _ddebug::function_disp */\n" \ + "\t.int %c2 - 1b /* _ddebug::filename_disp */\n" \ + "\t.int %c3 - 1b /* _ddebug::format_disp */\n" \ + "\t.int %c4 /* _ddebug::flags_lineno */\n" \ + _DPRINTK_ASM_KEY_INIT \ + "\t.org 1b+%c5\n" \ + ".popsection\n" \ + ".set "__stringify(name)".ddebug.once, %c6\n" \ + ".elseif "__stringify(name)".ddebug.once - %c6\n" \ + ".line "__stringify(__LINE__) " - 1\n" \ + ".error \"'"__stringify(name)"' used as _ddebug identifier more than once\"\n" \ + ".endif\n" \ + : : "i" (KBUILD_MODNAME), "i" (__func__), \ + "i" (__FILE__), "i" (fmt), \ + "i" (_DPRINTK_FLAGS_LINENO_INIT), \ + "i" (sizeof(struct _ddebug)), "i" (__COUNTER__)) + +#endif /* _ASM_GENERIC_DYNAMIC_DEBUG_H */ diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 71d7b77eea50..822f433ac95c 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -126,4 +126,11 @@ static inline pte_t huge_ptep_get(pte_t *ptep) } #endif +#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +static inline bool gigantic_page_runtime_supported(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); +} +#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ + #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/include/asm-generic/shmparam.h b/include/asm-generic/shmparam.h index 8b78c0ba08b1..b8f9035ffc2c 100644 --- a/include/asm-generic/shmparam.h +++ b/include/asm-generic/shmparam.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_SHMPARAM_H #define __ASM_GENERIC_SHMPARAM_H diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f111c780ef1d..f31521dcb09a 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -151,21 +151,6 @@ static inline void balloon_page_delete(struct page *page) list_del(&page->lru); } -static inline bool __is_movable_balloon_page(struct page *page) -{ - return false; -} - -static inline bool balloon_page_movable(struct page *page) -{ - return false; -} - -static inline bool isolated_balloon_page(struct page *page) -{ - return false; -} - static inline bool balloon_page_isolate(struct page *page) { return false; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 688ab0de7810..b40fc633f3be 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -15,7 +15,6 @@ struct filename; * This structure is used to hold the arguments that are used when loading binaries. */ struct linux_binprm { - char buf[BINPRM_BUF_SIZE]; #ifdef CONFIG_MMU struct vm_area_struct *vma; unsigned long vma_pages; @@ -64,6 +63,8 @@ struct linux_binprm { unsigned long loader, exec; struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ + + char buf[BINPRM_BUF_SIZE]; } __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 602af23b98c7..cf074bce3eb3 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -60,7 +60,7 @@ static __always_inline unsigned long hweight_long(unsigned long w) */ static inline __u64 rol64(__u64 word, unsigned int shift) { - return (word << shift) | (word >> (64 - shift)); + return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** @@ -70,7 +70,7 @@ static inline __u64 rol64(__u64 word, unsigned int shift) */ static inline __u64 ror64(__u64 word, unsigned int shift) { - return (word >> shift) | (word << (64 - shift)); + return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** @@ -80,7 +80,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift) */ static inline __u32 rol32(__u32 word, unsigned int shift) { - return (word << shift) | (word >> ((-shift) & 31)); + return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** @@ -90,7 +90,7 @@ static inline __u32 rol32(__u32 word, unsigned int shift) */ static inline __u32 ror32(__u32 word, unsigned int shift) { - return (word >> shift) | (word << (32 - shift)); + return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** @@ -100,7 +100,7 @@ static inline __u32 ror32(__u32 word, unsigned int shift) */ static inline __u16 rol16(__u16 word, unsigned int shift) { - return (word << shift) | (word >> (16 - shift)); + return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** @@ -110,7 +110,7 @@ static inline __u16 rol16(__u16 word, unsigned int shift) */ static inline __u16 ror16(__u16 word, unsigned int shift) { - return (word >> shift) | (word << (16 - shift)); + return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** @@ -120,7 +120,7 @@ static inline __u16 ror16(__u16 word, unsigned int shift) */ static inline __u8 rol8(__u8 word, unsigned int shift) { - return (word << shift) | (word >> (8 - shift)); + return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** @@ -130,7 +130,7 @@ static inline __u8 rol8(__u8 word, unsigned int shift) */ static inline __u8 ror8(__u8 word, unsigned int shift) { - return (word >> shift) | (word << (8 - shift)); + return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index ba814f18cb4c..19e58b9138a0 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -140,8 +140,7 @@ struct ftrace_likely_data { * Do not use __always_inline here, since currently it expands to inline again * (which would break users of __always_inline). */ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ - !defined(CONFIG_OPTIMIZE_INLINING) +#if !defined(CONFIG_OPTIMIZE_INLINING) #define inline inline __attribute__((__always_inline__)) __gnu_inline \ __maybe_unused notrace #else diff --git a/include/linux/console.h b/include/linux/console.h index ec9bdb3d7bab..d09951d5a94e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -166,6 +166,11 @@ struct console { extern int console_set_on_cmdline; extern struct console *early_console; +enum con_flush_mode { + CONSOLE_FLUSH_PENDING, + CONSOLE_REPLAY_ALL, +}; + extern int add_preferred_console(char *name, int idx, char *options); extern void register_console(struct console *); extern int unregister_console(struct console *); @@ -175,7 +180,7 @@ extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); -extern void console_flush_on_panic(void); +extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_stop(struct console *); extern void console_start(struct console *); diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 147bdec42215..21755471b1c3 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -633,8 +633,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, */ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { - char *nl = strchr(buf, '\n'); - unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf); + unsigned int len = strchrnul(buf, '\n') - buf; return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } diff --git a/include/linux/device.h b/include/linux/device.h index e85264fb6616..72a6260f2b4d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1562,7 +1562,7 @@ do { \ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ -#define dev_dbg_ratelimited(dev, fmt, ...) \ +#define _dev_dbg_ratelimited(descriptor, dev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ @@ -1573,6 +1573,8 @@ do { \ __dynamic_dev_dbg(&descriptor, dev, dev_fmt(fmt), \ ##__VA_ARGS__); \ } while (0) +#define dev_dbg_ratelimited(dev, fmt, ...) \ + _dev_dbg_ratelimited(__UNIQUE_ID(ddebug), dev, fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define dev_dbg_ratelimited(dev, fmt, ...) \ do { \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 6c809440f319..f048ff06f521 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -16,11 +16,17 @@ struct _ddebug { * These fields are used to drive the user interface * for selecting and displaying debug callsites. */ +#ifdef CONFIG_DYNAMIC_DEBUG_RELATIVE_POINTERS + signed int modname_disp; + signed int function_disp; + signed int filename_disp; + signed int format_disp; +#else const char *modname; const char *function; const char *filename; const char *format; - unsigned int lineno:18; +#endif /* * The flags field controls the behaviour at the callsite. * The bits here are changed dynamically when the user @@ -37,7 +43,7 @@ struct _ddebug { #else #define _DPRINTK_FLAGS_DEFAULT 0 #endif - unsigned int flags:8; + unsigned int flags_lineno; /* flags in lower 8 bits, lineno in upper 24 */ #ifdef CONFIG_JUMP_LABEL union { struct static_key_true dd_key_true; @@ -46,7 +52,7 @@ struct _ddebug { #endif } __attribute__((aligned(8))); - +#define _DPRINTK_FLAGS_LINENO_INIT (((unsigned)__LINE__ << 8) | _DPRINTK_FLAGS_DEFAULT) #if defined(CONFIG_DYNAMIC_DEBUG) int ddebug_add_module(struct _ddebug *tab, unsigned int n, @@ -78,6 +84,13 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, const struct ib_device *ibdev, const char *fmt, ...); + +#ifdef CONFIG_DYNAMIC_DEBUG_RELATIVE_POINTERS +#include <asm/dynamic_debug.h> +#ifndef DEFINE_DYNAMIC_DEBUG_METADATA +# error "asm/dynamic_debug.h must provide definition of DEFINE_DYNAMIC_DEBUG_METADATA" +#endif +#else #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ static struct _ddebug __aligned(8) \ __attribute__((section("__verbose"))) name = { \ @@ -85,10 +98,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, .function = __func__, \ .filename = __FILE__, \ .format = (fmt), \ - .lineno = __LINE__, \ - .flags = _DPRINTK_FLAGS_DEFAULT, \ + .flags_lineno = _DPRINTK_FLAGS_LINENO_INIT, \ _DPRINTK_KEY_INIT \ } +#endif #ifdef CONFIG_JUMP_LABEL @@ -111,10 +124,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #ifdef DEBUG #define DYNAMIC_DEBUG_BRANCH(descriptor) \ - likely(descriptor.flags & _DPRINTK_FLAGS_PRINT) + likely(descriptor.flags_lineno & _DPRINTK_FLAGS_PRINT) #else #define DYNAMIC_DEBUG_BRANCH(descriptor) \ - unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) + unlikely(descriptor.flags_lineno & _DPRINTK_FLAGS_PRINT) #endif #endif diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index dd0a452373e7..c5e5bed82fbc 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -156,7 +156,7 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid, const char *name); extern struct gen_pool *gen_pool_get(struct device *dev, const char *name); -bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, +extern bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start, size_t size); #ifdef CONFIG_OF diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fdab7de7490d..fb07b503dc45 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -585,12 +585,12 @@ static inline bool pm_suspended_storage(void) } #endif /* CONFIG_PM_SLEEP */ -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); -extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif +void free_contig_range(unsigned long pfn, unsigned int nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ad50b7b4f141..51ec27a84668 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -77,8 +77,34 @@ #include <linux/migrate.h> #include <linux/memremap.h> #include <linux/completion.h> +#include <linux/mmu_notifier.h> -struct hmm; + +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list + * @ranges: list of range being snapshotted + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list + * @wq: wait queue for user waiting on a range invalidation + * @notifiers: count of active mmu notifiers + * @dead: is the mm dead ? + */ +struct hmm { + struct mm_struct *mm; + struct kref kref; + struct mutex lock; + struct list_head ranges; + struct list_head mirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; + wait_queue_head_t wq; + long notifiers; + bool dead; +}; /* * hmm_pfn_flag_e - HMM flag enums @@ -131,6 +157,7 @@ enum hmm_pfn_value_e { /* * struct hmm_range - track invalidation lock on virtual address range * + * @hmm: the core HMM structure this range is active against * @vma: the vm area struct for the range * @list: all range lock are on a list * @start: range virtual start address (inclusive) @@ -138,10 +165,13 @@ enum hmm_pfn_value_e { * @pfns: array of pfns (big enough for the range) * @flags: pfn flags to match device driver page table * @values: pfn value for some special case (none, special, error, ...) + * @default_flags: default flags for the range (write, read, ... see hmm doc) + * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ struct hmm_range { + struct hmm *hmm; struct vm_area_struct *vma; struct list_head list; unsigned long start; @@ -149,41 +179,96 @@ struct hmm_range { uint64_t *pfns; const uint64_t *flags; const uint64_t *values; + uint64_t default_flags; + uint64_t pfn_flags_mask; + uint8_t page_shift; uint8_t pfn_shift; bool valid; }; /* - * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn - * @range: range use to decode HMM pfn value - * @pfn: HMM pfn value to get corresponding struct page from - * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise + * hmm_range_page_shift() - return the page shift for the range + * @range: range being queried + * Returns: page shift (page size = 1 << page shift) for the range + */ +static inline unsigned hmm_range_page_shift(const struct hmm_range *range) +{ + return range->page_shift; +} + +/* + * hmm_range_page_size() - return the page size for the range + * @range: range being queried + * Returns: page size for the range in bytes + */ +static inline unsigned long hmm_range_page_size(const struct hmm_range *range) +{ + return 1UL << hmm_range_page_shift(range); +} + +/* + * hmm_range_wait_until_valid() - wait for range to be valid + * @range: range affected by invalidation to wait on + * @timeout: time out for wait in ms (ie abort wait after that period of time) + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_wait_until_valid(struct hmm_range *range, + unsigned long timeout) +{ + /* Check if mm is dead ? */ + if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { + range->valid = false; + return false; + } + if (range->valid) + return true; + wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, + msecs_to_jiffies(timeout)); + /* Return current valid status just in case we get lucky */ + return range->valid; +} + +/* + * hmm_range_valid() - test if a range is valid or not + * @range: range + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_valid(struct hmm_range *range) +{ + return range->valid; +} + +/* + * hmm_device_entry_to_page() - return struct page pointed to by a device entry + * @range: range use to decode device entry value + * @entry: device entry value to get corresponding struct page from + * Returns: struct page pointer if entry is a valid, NULL otherwise * - * If the HMM pfn is valid (ie valid flag set) then return the struct page - * matching the pfn value stored in the HMM pfn. Otherwise return NULL. + * If the device entry is valid (ie valid flag set) then return the struct page + * matching the entry value. Otherwise return NULL. */ -static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, - uint64_t pfn) +static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range, + uint64_t entry) { - if (pfn == range->values[HMM_PFN_NONE]) + if (entry == range->values[HMM_PFN_NONE]) return NULL; - if (pfn == range->values[HMM_PFN_ERROR]) + if (entry == range->values[HMM_PFN_ERROR]) return NULL; - if (pfn == range->values[HMM_PFN_SPECIAL]) + if (entry == range->values[HMM_PFN_SPECIAL]) return NULL; - if (!(pfn & range->flags[HMM_PFN_VALID])) + if (!(entry & range->flags[HMM_PFN_VALID])) return NULL; - return pfn_to_page(pfn >> range->pfn_shift); + return pfn_to_page(entry >> range->pfn_shift); } /* - * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn - * @range: range use to decode HMM pfn value - * @pfn: HMM pfn value to extract pfn from - * Returns: pfn value if HMM pfn is valid, -1UL otherwise + * hmm_device_entry_to_pfn() - return pfn value store in a device entry + * @range: range use to decode device entry value + * @entry: device entry to extract pfn from + * Returns: pfn value if device entry is valid, -1UL otherwise */ -static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, - uint64_t pfn) +static inline unsigned long +hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) { if (pfn == range->values[HMM_PFN_NONE]) return -1UL; @@ -197,31 +282,66 @@ static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, } /* - * hmm_pfn_from_page() - create a valid HMM pfn value from struct page + * hmm_device_entry_from_page() - create a valid device entry for a page * @range: range use to encode HMM pfn value - * @page: struct page pointer for which to create the HMM pfn - * Returns: valid HMM pfn for the page + * @page: page for which to create the device entry + * Returns: valid device entry for the page */ -static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, - struct page *page) +static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, + struct page *page) { return (page_to_pfn(page) << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } /* - * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn + * hmm_device_entry_from_pfn() - create a valid device entry value from pfn * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the HMM pfn - * Returns: valid HMM pfn for the pfn + * @pfn: pfn value for which to create the device entry + * Returns: valid device entry for the pfn */ -static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, - unsigned long pfn) +static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, + unsigned long pfn) { return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } +/* + * Old API: + * hmm_pfn_to_page() + * hmm_pfn_to_pfn() + * hmm_pfn_from_page() + * hmm_pfn_from_pfn() + * + * This are the OLD API please use new API, it is here to avoid cross-tree + * merge painfullness ie we convert things to new API in stages. + */ +static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, + uint64_t pfn) +{ + return hmm_device_entry_to_page(range, pfn); +} + +static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, + uint64_t pfn) +{ + return hmm_device_entry_to_pfn(range, pfn); +} + +static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, + struct page *page) +{ + return hmm_device_entry_from_page(range, page); +} + +static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, + unsigned long pfn) +{ + return hmm_device_entry_from_pfn(range, pfn); +} + + #if IS_ENABLED(CONFIG_HMM_MIRROR) /* @@ -353,43 +473,113 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); - /* - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device - * driver lock that serializes device page table updates, then call - * hmm_vma_range_done(), to check if the snapshot is still valid. The same - * device driver page table update lock must also be used in the - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page - * table invalidation serializes on it. - * - * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_vma_get_pfns() WITHOUT ERROR ! - * - * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! + * hmm_mirror_mm_is_alive() - test if mm is still alive + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem + * Returns: false if the mm is dead, true otherwise + * + * This is an optimization it will not accurately always return -EINVAL if the + * mm is dead ie there can be false negative (process is being kill but HMM is + * not yet inform of that). It is only intented to be use to optimize out case + * where driver is about to do something time consuming and it would be better + * to skip it if the mm is dead. */ -int hmm_vma_get_pfns(struct hmm_range *range); -bool hmm_vma_range_done(struct hmm_range *range); +static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) +{ + struct mm_struct *mm; + + if (!mirror || !mirror->hmm) + return false; + mm = READ_ONCE(mirror->hmm->mm); + if (mirror->hmm->dead || !mm) + return false; + + return true; +} /* - * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will - * not migrate any device memory back to system memory. The HMM pfn array will - * be updated with the fault result and current snapshot of the CPU page table - * for the range. - * - * The mmap_sem must be taken in read mode before entering and it might be - * dropped by the function if the block argument is false. In that case, the - * function returns -EAGAIN. - * - * Return value does not reflect if the fault was successful for every single - * address or not. Therefore, the caller must to inspect the HMM pfn array to - * determine fault status for each address. - * - * Trying to fault inside an invalid vma will result in -EINVAL. + * Please see Documentation/vm/hmm.rst for how to use the range API. + */ +int hmm_range_register(struct hmm_range *range, + struct mm_struct *mm, + unsigned long start, + unsigned long end, + unsigned page_shift); +void hmm_range_unregister(struct hmm_range *range); +long hmm_range_snapshot(struct hmm_range *range); +long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block); +long hmm_range_dma_unmap(struct hmm_range *range, + struct vm_area_struct *vma, + struct device *device, + dma_addr_t *daddrs, + bool dirty); + +/* + * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range * - * See the function description in mm/hmm.c for further documentation. + * When waiting for mmu notifiers we need some kind of time out otherwise we + * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to + * wait already. */ -int hmm_vma_fault(struct hmm_range *range, bool block); +#define HMM_RANGE_DEFAULT_TIMEOUT 1000 + +/* This is a temporary helper to avoid merge conflict between trees. */ +static inline bool hmm_vma_range_done(struct hmm_range *range) +{ + bool ret = hmm_range_valid(range); + + hmm_range_unregister(range); + return ret; +} + +/* This is a temporary helper to avoid merge conflict between trees. */ +static inline int hmm_vma_fault(struct hmm_range *range, bool block) +{ + long ret; + + /* + * With the old API the driver must set each individual entries with + * the requested flags (valid, write, ...). So here we set the mask to + * keep intact the entries provided by the driver and zero out the + * default_flags. + */ + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + + ret = hmm_range_register(range, range->vma->vm_mm, + range->start, range->end, + PAGE_SHIFT); + if (ret) + return (int)ret; + + if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { + /* + * The mmap_sem was taken by driver we release it here and + * returns -EAGAIN which correspond to mmap_sem have been + * drop in the old API. + */ + up_read(&range->vma->vm_mm->mmap_sem); + return -EAGAIN; + } + + ret = hmm_range_fault(range, block); + if (ret <= 0) { + if (ret == -EBUSY || !ret) { + /* Same as above drop mmap_sem to match old API. */ + up_read(&range->vma->vm_mm->mmap_sem); + ret = -EBUSY; + } else if (ret == -EAGAIN) + ret = -EBUSY; + hmm_range_unregister(range); + return ret; + } + return 0; +} /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 11943b60f208..edf476c8cfb9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6ab8c1bada3f..c309f43bde45 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,6 +19,7 @@ struct ipc_ids { struct rw_semaphore rwsem; struct idr ipcs_idr; int max_idx; + int last_idx; /* For wrap around detection */ #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 3e113a1fa0f1..2b027d2ef3d0 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -190,6 +190,8 @@ struct module; #ifdef CONFIG_JUMP_LABEL +#define __JUMP_TYPE_FALSE 0 +#define __JUMP_TYPE_TRUE 1 #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_LINKED 2UL diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a3b59d143afb..74b1ee9027f5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -484,6 +484,7 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); +u64 int_pow(u64 base, unsigned int exp); unsigned long int_sqrt(unsigned long); #if BITS_PER_LONG < 64 diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2c89e60bc752..0f9da966934e 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,7 +4,6 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include <linux/err.h> #include <linux/sched.h> -#include <linux/cgroup.h> __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -198,6 +197,8 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +struct cgroup_subsys_state; + #ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 7c560e0dc8f4..9022f0c2e2e4 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -36,7 +36,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) __account_scheduler_latency(task, usecs, inter); } -void clear_all_latency_tracing(struct task_struct *p); +void clear_tsk_latency_tracing(struct task_struct *p); extern int sysctl_latencytop(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -48,7 +48,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) { } -static inline void clear_all_latency_tracing(struct task_struct *p) +static inline void clear_tsk_latency_tracing(struct task_struct *p) { } diff --git a/include/linux/list.h b/include/linux/list.h index e137b98cd31a..e951228db4b2 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -151,6 +151,23 @@ static inline void list_replace_init(struct list_head *old, } /** + * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position + * @entry1: the location to place entry2 + * @entry2: the location to place entry1 + */ +static inline void list_swap(struct list_head *entry1, + struct list_head *entry2) +{ + struct list_head *pos = entry2->prev; + + list_del(entry2); + list_replace(entry1, entry2); + if (pos == entry1) + pos = entry2; + list_add(entry1, pos); +} + +/** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ @@ -271,6 +288,24 @@ static inline void list_rotate_left(struct list_head *head) } /** + * list_rotate_to_front() - Rotate list to specific item. + * @list: The desired new front of the list. + * @head: The head of the list. + * + * Rotates list so that @list becomes the new front of the list. + */ +static inline void list_rotate_to_front(struct list_head *list, + struct list_head *head) +{ + /* + * Deletes the list head from the list denoted by @head and + * places it as the tail of @list, this effectively rotates the + * list so that @list is at the front. + */ + list_move_tail(head, list); +} + +/** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h index ba79956e848d..20f178c24e9d 100644 --- a/include/linux/list_sort.h +++ b/include/linux/list_sort.h @@ -6,6 +6,7 @@ struct list_head; +__attribute__((nonnull(2,3))) void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 294d5d80e150..676d3900e1bd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -96,13 +96,14 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock +static inline void memblock_discard(void) {} #endif #define memblock_dbg(fmt, ...) \ @@ -240,6 +241,47 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, + unsigned long *out_epfn); +/** + * for_each_free_mem_range_in_zone - iterate through zone specific free + * memblock areas + * @i: u64 used as loop variable + * @zone: zone in which all of the memory blocks reside + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in a specific + * zone. Available once memblock and an empty zone is initialized. The main + * assumption is that the zone start, end, and pgdat have been associated. + * This way we can use the zone to determine NUMA node, and if a given part + * of the memblock is valid for the zone. + */ +#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \ + for (i = 0, \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ + i != U64_MAX; \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) + +/** + * for_each_free_mem_range_in_zone_from - iterate through zone specific + * free memblock areas from a given point + * @i: u64 used as loop variable + * @zone: zone in which all of the memory blocks reside + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in a specific + * zone, continuing from current position. Available as soon as memblock is + * initialized. + */ +#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ + for (; i != U64_MAX; \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + /** * for_each_free_mem_range - iterate through free memblock areas * @i: u64 used as loop variable diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dbb6118370c1..d2f644de7f9d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -333,6 +333,19 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) +{ + if (mem_cgroup_disabled()) + return 0; + + if (in_low_reclaim) + return READ_ONCE(memcg->memory.emin); + + return max(READ_ONCE(memcg->memory.emin), + READ_ONCE(memcg->memory.elow)); +} + enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); @@ -501,22 +514,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); -unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask); - -static inline -unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - struct mem_cgroup_per_node *mz; - unsigned long nr_pages = 0; - int zid; - - mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for (zid = 0; zid < MAX_NR_ZONES; zid++) - nr_pages += mz->lru_zone_size[zid][lru]; - return nr_pages; -} - static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -531,6 +528,8 @@ void mem_cgroup_handle_over_high(void); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); +unsigned long mem_cgroup_size(struct mem_cgroup *memcg); + void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -827,6 +826,12 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) +{ + return 0; +} + static inline enum mem_cgroup_protection mem_cgroup_protected( struct mem_cgroup *root, struct mem_cgroup *memcg) { @@ -960,11 +965,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return true; } -static inline unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - return 0; -} static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -972,14 +972,12 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return 0; } -static inline unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) +static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } -static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) +static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) { return 0; } @@ -1117,6 +1115,12 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, { } +static inline void __count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ +} + static inline void count_memcg_page_event(struct page *page, int idx) { diff --git a/include/linux/memory.h b/include/linux/memory.h index a6ddefc60517..e1dc1bb2b787 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int hotplug_memory_register(int nid, struct mem_section *section); #ifdef CONFIG_MEMORY_HOTREMOVE -extern int unregister_memory_section(struct mem_section *); +extern void unregister_memory_section(struct mem_section *); #endif extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8ade08c50d26..ae892eef8b82 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -54,6 +54,16 @@ enum { }; /* + * Restrictions for the memory hotplug: + * flags: MHP_ flags + * altmap: alternative allocator for memmap array + */ +struct mhp_restrictions { + unsigned long flags; + struct vmem_altmap *altmap; +}; + +/* * Zone resizing functions * * Note: any attempt to resize a zone should has pgdat_resize_lock() @@ -87,7 +97,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); extern int online_pages(unsigned long, unsigned long, int); extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); -extern void __offline_isolated_pages(unsigned long, unsigned long); +extern unsigned long __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); @@ -100,6 +111,8 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); +extern int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions); extern u64 max_mem_size; extern bool memhp_auto_online; @@ -111,26 +124,33 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE -extern int arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap); -extern int __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); +extern void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap); +extern void __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ +/* + * Do we want sysfs memblock files created. This will allow userspace to online + * and offline memory explicitly. Lack of this bit means that the caller has to + * call move_pfn_range_to_zone to finish the initialization. + */ + +#define MHP_MEMBLOCK_API (1<<0) + /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #ifndef CONFIG_ARCH_HAS_ADD_PAGES static inline int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #else /* ARCH_HAS_ADD_PAGES */ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #endif /* ARCH_HAS_ADD_PAGES */ #ifdef CONFIG_NUMA @@ -331,8 +351,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); -extern int arch_add_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern bool is_memblock_offlined(struct memory_block *mem); diff --git a/include/linux/mm.h b/include/linux/mm.h index 083d7b4863ed..0e8834ac32b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -124,10 +124,45 @@ extern int mmap_rnd_compat_bits __read_mostly; /* * On some architectures it is expensive to call memset() for small sizes. - * Those architectures should provide their own implementation of "struct page" - * zeroing by defining this macro in <asm/pgtable.h>. + * If an architecture decides to implement their own version of + * mm_zero_struct_page they should wrap the defines below in a #ifndef and + * define their own version of this macro in <asm/pgtable.h> */ -#ifndef mm_zero_struct_page +#if BITS_PER_LONG == 64 +/* This function must be updated when the size of struct page grows above 80 + * or reduces below 56. The idea that compiler optimizes out switch() + * statement, and only leaves move/store instructions. Also the compiler can + * combine write statments if they are both assignments and can be reordered, + * this can result in several of the writes here being dropped. + */ +#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) +static inline void __mm_zero_struct_page(struct page *page) +{ + unsigned long *_pp = (void *)page; + + /* Check that struct page is either 56, 64, 72, or 80 bytes */ + BUILD_BUG_ON(sizeof(struct page) & 7); + BUILD_BUG_ON(sizeof(struct page) < 56); + BUILD_BUG_ON(sizeof(struct page) > 80); + + switch (sizeof(struct page)) { + case 80: + _pp[9] = 0; /* fallthrough */ + case 72: + _pp[8] = 0; /* fallthrough */ + case 64: + _pp[7] = 0; /* fallthrough */ + case 56: + _pp[6] = 0; + _pp[5] = 0; + _pp[4] = 0; + _pp[3] = 0; + _pp[2] = 0; + _pp[1] = 0; + _pp[0] = 0; + } +} +#else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif @@ -501,9 +536,6 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) struct mmu_gather; struct inode; -#define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) - #if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { @@ -1007,6 +1039,30 @@ static inline void put_page(struct page *page) __put_page(page); } +/** + * put_user_page() - release a gup-pinned page + * @page: pointer to page to be released + * + * Pages that were pinned via get_user_pages*() must be released via + * either put_user_page(), or one of the put_user_pages*() routines + * below. This is so that eventually, pages that are pinned via + * get_user_pages*() can be separately tracked and uniquely handled. In + * particular, interactions with RDMA and filesystems need special + * handling. + * + * put_user_page() and put_page() are not interchangeable, despite this early + * implementation that makes them look the same. put_user_page() calls must + * be perfectly matched up with get_user_page() calls. + */ +static inline void put_user_page(struct page *page) +{ + put_page(page); +} + +void put_user_pages_dirty(struct page **pages, unsigned long npages); +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); +void put_user_pages(struct page **pages, unsigned long npages); + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1505,21 +1561,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); -#else -static inline long get_user_pages_longterm(unsigned long start, - unsigned long nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); -} -#endif /* CONFIG_FS_DAX */ - -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); /* Container for pinned pfns / pages */ struct frame_vector { @@ -2533,6 +2576,10 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num); +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, @@ -2583,6 +2630,34 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ + +/* + * NOTE on FOLL_LONGTERM: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is contrasted with + * iov_iter_get_pages() where usages which are transient. + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. + * Currently only get_user_pages() and get_user_pages_fast() support this flag + * and calls to get_user_pages_[un]locked are specifically not allowed. This + * is due to an incompatibility with the FS DAX check and + * FAULT_FLAG_ALLOW_RETRY + * + * In the CMA case: longterm pins in a CMA region would unnecessarily fragment + * that region. And so CMA attempts to migrate the page before pinning when + * FOLL_LONGTERM is specified. + */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 04ec454d44ce..6f2fef7b0784 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -29,7 +29,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, { struct pglist_data *pgdat = lruvec_pgdat(lruvec); - __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4ef4bbe78a1d..1815fbc40926 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -103,7 +103,7 @@ struct page { }; struct { /* slab, slob and slub */ union { - struct list_head slab_list; /* uses lru */ + struct list_head slab_list; struct { /* Partial pages */ struct page *next; #ifdef CONFIG_64BIT @@ -220,6 +220,9 @@ struct page { #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) +#define page_private(page) ((page)->private) +#define set_page_private(page, v) ((page)->private = (v)) + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) @@ -410,7 +413,7 @@ struct mm_struct { unsigned long hiwater_vm; /* High-water virtual memory usage */ unsigned long total_vm; /* Total pages mapped */ - unsigned long locked_vm; /* Pages that have PG_mlocked set */ + atomic64_t locked_vm; /* Pages that have PG_mlocked set */ atomic64_t pinned_vm; /* Refcount permanently increased */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..b6c004bd9f6a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* @@ -25,11 +55,15 @@ struct mmu_notifier_mm { spinlock_t lock; }; +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) + struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; - bool blockable; + unsigned flags; + enum mmu_notifier_event event; }; struct mmu_notifier_ops { @@ -225,6 +259,14 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); + +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); +} static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -269,7 +311,7 @@ static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = true; + range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } } @@ -278,7 +320,7 @@ static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = false; + range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; return __mmu_notifier_invalidate_range_start(range); } return 0; @@ -318,13 +360,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, + enum mmu_notifier_event event, + unsigned flags, + struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; + range->flags = flags; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ @@ -452,9 +500,14 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } -#define mmu_notifier_range_init(range, mm, start, end) \ +#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ _mmu_notifier_range_init(range, start, end) +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return true; +} static inline int mm_has_notifiers(struct mm_struct *mm) { @@ -517,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fba7741533be..70394cabaf4e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -18,6 +18,8 @@ #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> +#include <linux/mm_types.h> +#include <linux/page-flags.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ @@ -98,6 +100,62 @@ struct free_area { unsigned long nr_free; }; +/* Used for pages not on another list */ +static inline void add_to_free_area(struct page *page, struct free_area *area, + int migratetype) +{ + list_add(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages not on another list */ +static inline void add_to_free_area_tail(struct page *page, struct free_area *area, + int migratetype) +{ + list_add_tail(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR +/* Used to preserve page allocation order entropy */ +void add_to_free_area_random(struct page *page, struct free_area *area, + int migratetype); +#else +static inline void add_to_free_area_random(struct page *page, + struct free_area *area, int migratetype) +{ + add_to_free_area(page, area, migratetype); +} +#endif + +/* Used for pages which are on another list */ +static inline void move_to_free_area(struct page *page, struct free_area *area, + int migratetype) +{ + list_move(&page->lru, &area->free_list[migratetype]); +} + +static inline struct page *get_page_from_free_area(struct free_area *area, + int migratetype) +{ + return list_first_entry_or_null(&area->free_list[migratetype], + struct page, lru); +} + +static inline void del_page_from_free_area(struct page *page, + struct free_area *area) +{ + list_del(&page->lru); + __ClearPageBuddy(page); + set_page_private(page, 0); + area->nr_free--; +} + +static inline bool free_area_empty(struct free_area *area, int migratetype) +{ + return list_empty(&area->free_list[migratetype]); +} + struct pglist_data; /* @@ -247,11 +305,6 @@ struct lruvec { #endif }; -/* Mask used at gathering information at once (see memcontrol.c) */ -#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) -#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) -#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) - /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ @@ -1276,6 +1329,7 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define pfn_present pfn_valid #endif /* CONFIG_SPARSEMEM */ /* diff --git a/include/linux/net.h b/include/linux/net.h index 50bf5206ead6..60db423a2d94 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -268,7 +268,7 @@ do { \ #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) -#define net_dbg_ratelimited(fmt, ...) \ +#define _net_dbg_ratelimited(descriptor, fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \ @@ -276,6 +276,8 @@ do { \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), \ ##__VA_ARGS__); \ } while (0) +#define net_dbg_ratelimited(fmt, ...) \ + _net_dbg_ratelimited(__UNIQUE_ID(ddebug), fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define net_dbg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcf909d0de5f..2e8438a1216a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -333,6 +333,19 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +static inline struct page *find_subpage(struct page *page, pgoff_t offset) +{ + unsigned long mask; + + if (PageHuge(page)) + return page; + + VM_BUG_ON_PAGE(PageTail(page), page); + + mask = (1UL << compound_order(page)) - 1; + return page + (offset & mask); +} + struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, diff --git a/include/linux/pid.h b/include/linux/pid.h index 3c8ef5a199ca..0be5829ddd80 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -3,6 +3,7 @@ #define _LINUX_PID_H #include <linux/rculist.h> +#include <linux/refcount.h> enum pid_type { @@ -56,7 +57,7 @@ struct upid { struct pid { - atomic_t count; + refcount_t count; unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; @@ -71,7 +72,7 @@ extern const struct file_operations pidfd_fops; static inline struct pid *get_pid(struct pid *pid) { if (pid) - atomic_inc(&pid->count); + refcount_inc(&pid->count); return pid; } diff --git a/include/linux/plist.h b/include/linux/plist.h index 97883604a3c5..9365df5a823f 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -231,7 +231,7 @@ static inline int plist_node_empty(const struct plist_node *node) * @type: the type of the struct this is embedded in * @member: the name of the list_head within the struct */ -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST # define plist_first_entry(head, type, member) \ ({ \ WARN_ON(plist_head_empty(head)); \ @@ -248,7 +248,7 @@ static inline int plist_node_empty(const struct plist_node *node) * @type: the type of the struct this is embedded in * @member: the name of the list_head within the struct */ -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST # define plist_last_entry(head, type, member) \ ({ \ WARN_ON(plist_head_empty(head)); \ diff --git a/include/linux/poll.h b/include/linux/poll.h index 7e0fdcf905d2..1cdc32b1f1b0 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -16,7 +16,11 @@ extern struct ctl_table epoll_table[]; /* for sysctl */ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ +#ifdef __clang__ +#define MAX_STACK_ALLOC 768 +#else #define MAX_STACK_ALLOC 832 +#endif #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h index 56f35dd3d01d..44171e6b7197 100644 --- a/include/linux/pps-gpio.h +++ b/include/linux/pps-gpio.h @@ -23,10 +23,11 @@ #define _PPS_GPIO_H struct pps_gpio_platform_data { + struct gpio_desc *gpio_pin; + struct gpio_desc *echo_pin; bool assert_falling_edge; bool capture_clear; - unsigned int gpio_pin; - const char *gpio_label; + unsigned int echo_active_ms; }; #endif /* _PPS_GPIO_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 84ea4d094af3..54499c1a74fd 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -82,6 +82,8 @@ static inline void console_verbose(void) extern char devkmsg_log_str[]; struct ctl_table; +extern int suppress_printk; + struct va_format { const char *fmt; va_list *va; @@ -454,7 +456,7 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ -#define pr_debug_ratelimited(fmt, ...) \ +#define _pr_debug_ratelimited(descriptor, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ @@ -464,6 +466,8 @@ do { \ __ratelimit(&_rs)) \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) +#define pr_debug_ratelimited(fmt, ...) \ + _pr_debug_ratelimited(__UNIQUE_ID(ddebug), fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define pr_debug_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) diff --git a/include/linux/psi.h b/include/linux/psi.h index 7006008d5b72..af892c290116 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -4,6 +4,7 @@ #include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h> +#include <linux/poll.h> struct seq_file; struct css_set; @@ -26,6 +27,13 @@ int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); + +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res); +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); + +__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, + poll_table *wait); #endif #else /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 2cf422db5d18..07aaf9b82241 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -1,8 +1,11 @@ #ifndef _LINUX_PSI_TYPES_H #define _LINUX_PSI_TYPES_H +#include <linux/kthread.h> #include <linux/seqlock.h> #include <linux/types.h> +#include <linux/kref.h> +#include <linux/wait.h> #ifdef CONFIG_PSI @@ -11,7 +14,7 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - NR_PSI_TASK_COUNTS, + NR_PSI_TASK_COUNTS = 3, }; /* Task state bitmasks */ @@ -24,7 +27,7 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES, + NR_PSI_RESOURCES = 3, }; /* @@ -41,7 +44,13 @@ enum psi_states { PSI_CPU_SOME, /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES, + NR_PSI_STATES = 6, +}; + +enum psi_aggregators { + PSI_AVGS = 0, + PSI_POLL, + NR_PSI_AGGREGATORS, }; struct psi_group_cpu { @@ -53,6 +62,9 @@ struct psi_group_cpu { /* States of the tasks belonging to this group */ unsigned int tasks[NR_PSI_TASK_COUNTS]; + /* Aggregate pressure state derived from the tasks */ + u32 state_mask; + /* Period time sampling buckets for each state of interest (ns) */ u32 times[NR_PSI_STATES]; @@ -62,25 +74,94 @@ struct psi_group_cpu { /* 2nd cacheline updated by the aggregator */ /* Delta detection against the sampling buckets */ - u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; + u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES] + ____cacheline_aligned_in_smp; +}; + +/* PSI growth tracking window */ +struct psi_window { + /* Window size in ns */ + u64 size; + + /* Start time of the current window in ns */ + u64 start_time; + + /* Value at the start of the window */ + u64 start_value; + + /* Value growth in the previous window */ + u64 prev_growth; +}; + +struct psi_trigger { + /* PSI state being monitored by the trigger */ + enum psi_states state; + + /* User-spacified threshold in ns */ + u64 threshold; + + /* List node inside triggers list */ + struct list_head node; + + /* Backpointer needed during trigger destruction */ + struct psi_group *group; + + /* Wait queue for polling */ + wait_queue_head_t event_wait; + + /* Pending event flag */ + int event; + + /* Tracking window */ + struct psi_window win; + + /* + * Time last event was generated. Used for rate-limiting + * events to one per window + */ + u64 last_event_time; + + /* Refcounting to prevent premature destruction */ + struct kref refcount; }; struct psi_group { - /* Protects data updated during an aggregation */ - struct mutex stat_lock; + /* Protects data used by the aggregator */ + struct mutex avgs_lock; /* Per-cpu task state & time tracking */ struct psi_group_cpu __percpu *pcpu; - /* Periodic aggregation state */ - u64 total_prev[NR_PSI_STATES - 1]; - u64 last_update; - u64 next_update; - struct delayed_work clock_work; + /* Running pressure averages */ + u64 avg_total[NR_PSI_STATES - 1]; + u64 avg_last_update; + u64 avg_next_update; + + /* Aggregator work control */ + struct delayed_work avgs_work; /* Total stall times and sampled pressure averages */ - u64 total[NR_PSI_STATES - 1]; + u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1]; unsigned long avg[NR_PSI_STATES - 1][3]; + + /* Monitor work control */ + atomic_t poll_scheduled; + struct kthread_worker __rcu *poll_kworker; + struct kthread_delayed_work poll_work; + + /* Protects data used by the monitor */ + struct mutex trigger_lock; + + /* Configured polling triggers */ + struct list_head triggers; + u32 nr_triggers[NR_PSI_STATES - 1]; + u32 poll_states; + u64 poll_min_period; + + /* Total stall times at the start of monitor activation */ + u64 polling_total[NR_PSI_STATES - 1]; + u64 polling_next_update; + u64 polling_until; }; #else /* CONFIG_PSI */ diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index 3bcd67fd5548..dd464943f717 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved. */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index e63799a6e895..3734cd8f38a8 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -14,6 +14,7 @@ struct device; #define SYS_POWER_OFF 0x0003 /* Notify of system power off */ enum reboot_mode { + REBOOT_UNDEFINED = -1, REBOOT_COLD = 0, REBOOT_WARM, REBOOT_HARD, @@ -21,6 +22,7 @@ enum reboot_mode { REBOOT_GPIO, }; extern enum reboot_mode reboot_mode; +extern enum reboot_mode panic_reboot_mode; enum reboot_type { BOOT_TRIPLE = 't', diff --git a/include/linux/sched.h b/include/linux/sched.h index a2cd15855bad..9bc2ba94382b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -26,7 +26,6 @@ #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/signal_types.h> -#include <linux/psi_types.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/rseq.h> @@ -1161,6 +1160,9 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; + + /* Used by memcontrol for high relcaim: */ + struct mem_cgroup *memcg_high_reclaim; #endif #ifdef CONFIG_BLK_CGROUP @@ -1201,6 +1203,13 @@ struct task_struct { unsigned long prev_lowest_stack; #endif +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + unsigned long getblk_stamp; + unsigned int getblk_executed; + unsigned int getblk_bh_count; + unsigned long getblk_bh_state; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2b70130af585..de29cca9c297 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -263,6 +263,40 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #define probe_kernel_address(addr, retval) \ probe_kernel_read(&retval, addr, sizeof(retval)) +/** + * probe_user_read(): safely attempt to read from a user location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from + * @size: size of the data chunk + * + * Safely read from address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + * + * We ensure that the copy_from_user is executed in atomic context so that + * do_page_fault() doesn't attempt to take mmap_sem. This makes + * probe_user_read() suitable for use within regions where the caller + * already holds mmap_sem, or other locks which nest inside mmap_sem. + * + * Returns 0 on success, -EFAULT on error. + */ + +#ifndef probe_user_read +static __always_inline long probe_user_read(void *dst, const void __user *src, + size_t size) +{ + long ret; + + if (!access_ok(src, size)) + return -EFAULT; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} +#endif + #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 37c9eba75c98..ac9d71e24b81 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -28,6 +28,8 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +extern int sysctl_unprivileged_userfaultfd; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c6eebb839552..9b21d0047710 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,12 +50,16 @@ struct vm_struct { struct vmap_area { unsigned long va_start; unsigned long va_end; + + /* + * Largest available free size in subtree. + */ + unsigned long subtree_max_size; unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ struct llist_node purge_list; /* "lazy purge" list */ struct vm_struct *vm; - struct rcu_head rcu_head; }; /* @@ -68,10 +72,12 @@ extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU extern void __init vmalloc_init(void); +extern unsigned long vmalloc_nr_pages(void); #else static inline void vmalloc_init(void) { } +static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif extern void *vmalloc(unsigned long size); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 2db8d60981fe..bdeda4b079fe 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -26,7 +26,7 @@ struct reclaim_stat { unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; - unsigned nr_activate; + unsigned nr_activate[2]; unsigned nr_ref_keep; unsigned nr_unmap_fail; }; diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 6074eff3d766..e5bf6ee4e814 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -64,6 +64,7 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); +#ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned long nr_all, @@ -132,7 +133,6 @@ TRACE_EVENT(mm_compaction_begin, __entry->sync ? "sync" : "async") ); -#ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_end, TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, unsigned long free_pfn, unsigned long zone_end, bool sync, @@ -166,7 +166,6 @@ TRACE_EVENT(mm_compaction_end, __entry->sync ? "sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); -#endif TRACE_EVENT(mm_compaction_try_to_compact_pages, @@ -189,13 +188,12 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, __entry->prio = prio; ), - TP_printk("order=%d gfp_mask=0x%x priority=%d", + TP_printk("order=%d gfp_mask=%s priority=%d", __entry->order, - __entry->gfp_mask, + show_gfp_flags(__entry->gfp_mask), __entry->prio) ); -#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, @@ -296,7 +294,6 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, TP_ARGS(zone, order) ); -#endif TRACE_EVENT(mm_compaction_kcompactd_sleep, @@ -352,6 +349,7 @@ DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, TP_ARGS(nid, order, classzone_idx) ); +#endif #endif /* _TRACE_COMPACTION_H */ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 252327dbfa51..a5ab2973e8dc 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -27,17 +27,11 @@ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" -#define trace_reclaim_flags(page) ( \ - (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ +#define trace_reclaim_flags(file) ( \ + (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ ) -#define trace_shrink_flags(file) \ - ( \ - (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (RECLAIM_WB_ASYNC) \ - ) - TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), @@ -73,7 +67,9 @@ TRACE_EVENT(mm_vmscan_kswapd_wake, __entry->order = order; ), - TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order) + TP_printk("nid=%d order=%d", + __entry->nid, + __entry->order) ); TRACE_EVENT(mm_vmscan_wakeup_kswapd, @@ -96,60 +92,53 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, __entry->gfp_flags = gfp_flags; ), - TP_printk("nid=%d zid=%d order=%d gfp_flags=%s", + TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, - __entry->zid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), + TP_PROTO(int order, gfp_t gfp_flags), - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx), + TP_ARGS(order, gfp_flags), TP_STRUCT__entry( __field( int, order ) - __field( int, may_writepage ) __field( gfp_t, gfp_flags ) - __field( int, classzone_idx ) ), TP_fast_assign( __entry->order = order; - __entry->may_writepage = may_writepage; __entry->gfp_flags = gfp_flags; - __entry->classzone_idx = classzone_idx; ), - TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d", + TP_printk("order=%d gfp_flags=%s", __entry->order, - __entry->may_writepage, - show_gfp_flags(__entry->gfp_flags), - __entry->classzone_idx) + show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), + TP_PROTO(int order, gfp_t gfp_flags), - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) + TP_ARGS(order, gfp_flags) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), + TP_PROTO(int order, gfp_t gfp_flags), - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) + TP_ARGS(order, gfp_flags) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), + TP_PROTO(int order, gfp_t gfp_flags), - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) + TP_ARGS(order, gfp_flags) ); #endif /* CONFIG_MEMCG */ @@ -333,7 +322,8 @@ TRACE_EVENT(mm_vmscan_writepage, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->reclaim_flags = trace_reclaim_flags(page); + __entry->reclaim_flags = trace_reclaim_flags( + page_is_file_cache(page)); ), TP_printk("page=%p pfn=%lu flags=%s", @@ -358,7 +348,8 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __field(unsigned long, nr_writeback) __field(unsigned long, nr_congested) __field(unsigned long, nr_immediate) - __field(unsigned long, nr_activate) + __field(unsigned int, nr_activate0) + __field(unsigned int, nr_activate1) __field(unsigned long, nr_ref_keep) __field(unsigned long, nr_unmap_fail) __field(int, priority) @@ -373,20 +364,22 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nr_writeback = stat->nr_writeback; __entry->nr_congested = stat->nr_congested; __entry->nr_immediate = stat->nr_immediate; - __entry->nr_activate = stat->nr_activate; + __entry->nr_activate0 = stat->nr_activate[0]; + __entry->nr_activate1 = stat->nr_activate[1]; __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; - __entry->reclaim_flags = trace_shrink_flags(file); + __entry->reclaim_flags = trace_reclaim_flags(file); ), - TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_dirty, __entry->nr_writeback, __entry->nr_congested, __entry->nr_immediate, - __entry->nr_activate, __entry->nr_ref_keep, - __entry->nr_unmap_fail, __entry->priority, + __entry->nr_activate0, __entry->nr_activate1, + __entry->nr_ref_keep, __entry->nr_unmap_fail, + __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); @@ -415,7 +408,7 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active, __entry->nr_deactivated = nr_deactivated; __entry->nr_referenced = nr_referenced; __entry->priority = priority; - __entry->reclaim_flags = trace_shrink_flags(file); + __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", @@ -454,7 +447,8 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low, __entry->total_active = total_active; __entry->active = active; __entry->ratio = ratio; - __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU; + __entry->reclaim_flags = trace_reclaim_flags(file) & + RECLAIM_WB_LRU; ), TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s", @@ -465,6 +459,38 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low, __entry->ratio, show_reclaim_flags(__entry->reclaim_flags)) ); + +TRACE_EVENT(mm_vmscan_node_reclaim_begin, + + TP_PROTO(int nid, int order, gfp_t gfp_flags), + + TP_ARGS(nid, order, gfp_flags), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, order) + __field(gfp_t, gfp_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->order = order; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("nid=%d order=%d gfp_flags=%s", + __entry->nid, + __entry->order, + show_gfp_flags(__entry->gfp_flags)) +); + +DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, + + TP_PROTO(unsigned long nr_reclaimed), + + TP_ARGS(nr_reclaimed) +); + #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/byteorder/big_endian.h b/include/uapi/linux/byteorder/big_endian.h index 2199adc6a6c2..3e58b717fb5a 100644 --- a/include/uapi/linux/byteorder/big_endian.h +++ b/include/uapi/linux/byteorder/big_endian.h @@ -2,6 +2,10 @@ #ifndef _UAPI_LINUX_BYTEORDER_BIG_ENDIAN_H #define _UAPI_LINUX_BYTEORDER_BIG_ENDIAN_H +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ +#error "Unsupported endianness, check your toolchain" +#endif + #ifndef __BIG_ENDIAN #define __BIG_ENDIAN 4321 #endif diff --git a/include/uapi/linux/byteorder/little_endian.h b/include/uapi/linux/byteorder/little_endian.h index 601c904fd5cd..c9ee440716a5 100644 --- a/include/uapi/linux/byteorder/little_endian.h +++ b/include/uapi/linux/byteorder/little_endian.h @@ -2,6 +2,10 @@ #ifndef _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H #define _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +#error "Unsupported endianness, check your toolchain" +#endif + #ifndef __LITTLE_ENDIAN #define __LITTLE_ENDIAN 1234 #endif diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 121e82ce296b..59c71fa8c553 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -320,6 +320,9 @@ struct fscrypt_key { #define SYNC_FILE_RANGE_WAIT_BEFORE 1 #define SYNC_FILE_RANGE_WRITE 2 #define SYNC_FILE_RANGE_WAIT_AFTER 4 +#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \ + SYNC_FILE_RANGE_WAIT_BEFORE | \ + SYNC_FILE_RANGE_WAIT_AFTER) /* * Flags for preadv2/pwritev2: diff --git a/init/Kconfig b/init/Kconfig index 82b84e5ee30d..8b9ffe236e4f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1752,6 +1752,30 @@ config SLAB_FREELIST_HARDENED sacrifies to harden the kernel slab allocator against common freelist exploit methods. +config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA + help + Randomization of the page allocator improves the average + utilization of a direct-mapped memory-side-cache. See section + 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI + 6.2a specification for an example of how a platform advertises + the presence of a memory-side-cache. There are also incidental + security benefits as it reduces the predictability of page + allocations to compliment SLAB_FREELIST_RANDOM, but the + default granularity of shuffling on the "MAX_ORDER - 1" i.e, + 10th order of pages is selected based on cache utilization + benefits on x86. + + While the randomization improves cache utilization it may + negatively impact workloads on platforms without a cache. For + this reason, by default, the randomization is enabled only + after runtime detection of a direct-mapped memory-side-cache. + Otherwise, the randomization may be force enabled with the + 'page_alloc.shuffle' kernel command line parameter. + + Say Y if unsure. + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/init/initramfs.c b/init/initramfs.c index 4749e1115eef..435a428c2af1 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -513,42 +513,55 @@ static int __init retain_initrd_param(char *str) } __setup("retain_initrd", retain_initrd_param); +#ifdef CONFIG_ARCH_HAS_KEEPINITRD +static int __init keepinitrd_setup(char *__unused) +{ + do_retain_initrd = 1; + return 1; +} +__setup("keepinitrd", keepinitrd_setup); +#endif + extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include <linux/initrd.h> #include <linux/kexec.h> -static void __init free_initrd(void) +void __weak free_initrd_mem(unsigned long start, unsigned long end) { + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); +} + #ifdef CONFIG_KEXEC_CORE +static bool kexec_free_initrd(void) +{ unsigned long crashk_start = (unsigned long)__va(crashk_res.start); unsigned long crashk_end = (unsigned long)__va(crashk_res.end); -#endif - if (do_retain_initrd) - goto skip; -#ifdef CONFIG_KEXEC_CORE /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ - if (initrd_start < crashk_end && initrd_end > crashk_start) { - /* - * Initialize initrd memory region since the kexec boot does - * not do. - */ - memset((void *)initrd_start, 0, initrd_end - initrd_start); - if (initrd_start < crashk_start) - free_initrd_mem(initrd_start, crashk_start); - if (initrd_end > crashk_end) - free_initrd_mem(crashk_end, initrd_end); - } else -#endif - free_initrd_mem(initrd_start, initrd_end); -skip: - initrd_start = 0; - initrd_end = 0; + if (initrd_start >= crashk_end || initrd_end <= crashk_start) + return false; + + /* + * Initialize initrd memory region since the kexec boot does not do. + */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + return true; +} +#else +static inline bool kexec_free_initrd(void) +{ + return false; } +#endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_BLK_DEV_RAM #define BUF_SIZE 1024 @@ -597,7 +610,38 @@ static void __init clean_rootfs(void) ksys_close(fd); kfree(buf); } -#endif +#else +static inline void clean_rootfs(void) +{ +} +#endif /* CONFIG_BLK_DEV_RAM */ + +#ifdef CONFIG_BLK_DEV_RAM +static void populate_initrd_image(char *err) +{ + ssize_t written; + int fd; + + unpack_to_rootfs(__initramfs_start, __initramfs_size); + + printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", + err); + fd = ksys_open("/initrd.image", O_WRONLY | O_CREAT, 0700); + if (fd < 0) + return; + + written = xwrite(fd, (char *)initrd_start, initrd_end - initrd_start); + if (written != initrd_end - initrd_start) + pr_err("/initrd.image: incomplete write (%zd != %ld)\n", + written, initrd_end - initrd_start); + ksys_close(fd); +} +#else +static void populate_initrd_image(char *err) +{ + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); +} +#endif /* CONFIG_BLK_DEV_RAM */ static int __init populate_rootfs(void) { @@ -605,46 +649,31 @@ static int __init populate_rootfs(void) char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic("%s", err); /* Failed to decompress INTERNAL initramfs */ - /* If available load the bootloader supplied initrd */ - if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { -#ifdef CONFIG_BLK_DEV_RAM - int fd; + + if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) + goto done; + + if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); - err = unpack_to_rootfs((char *)initrd_start, - initrd_end - initrd_start); - if (!err) { - free_initrd(); - goto done; - } else { - clean_rootfs(); - unpack_to_rootfs(__initramfs_start, __initramfs_size); - } - printk(KERN_INFO "rootfs image is not initramfs (%s)" - "; looks like an initrd\n", err); - fd = ksys_open("/initrd.image", - O_WRONLY|O_CREAT, 0700); - if (fd >= 0) { - ssize_t written = xwrite(fd, (char *)initrd_start, - initrd_end - initrd_start); - - if (written != initrd_end - initrd_start) - pr_err("/initrd.image: incomplete write (%zd != %ld)\n", - written, initrd_end - initrd_start); - - ksys_close(fd); - free_initrd(); - } - done: - /* empty statement */; -#else + else printk(KERN_INFO "Unpacking initramfs...\n"); - err = unpack_to_rootfs((char *)initrd_start, - initrd_end - initrd_start); - if (err) - printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); - free_initrd(); -#endif + + err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); + if (err) { + clean_rootfs(); + populate_initrd_image(err); } + +done: + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (!do_retain_initrd && !kexec_free_initrd()) + free_initrd_mem(initrd_start, initrd_end); + initrd_start = 0; + initrd_end = 0; + flush_delayed_fput(); return 0; } diff --git a/init/main.c b/init/main.c index 33c87e91dc37..5a2c69b4d7b3 100644 --- a/init/main.c +++ b/init/main.c @@ -1074,6 +1074,11 @@ static inline void mark_readonly(void) } #endif +void __weak free_initmem(void) +{ + free_initmem_default(POISON_FREE_INITMEM); +} + static int __ref kernel_init(void *unused) { int ret; diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 49f9bf4ffc7f..bfaae457810c 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -120,7 +120,9 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, static int zero; static int one = 1; static int int_max = INT_MAX; -static int ipc_mni = IPCMNI; +int ipc_mni = IPCMNI; +int ipc_mni_shift = IPCMNI_SHIFT; +int ipc_min_cycle = RADIX_TREE_MAP_SIZE; static struct ctl_table ipc_kern_table[] = { { @@ -246,3 +248,13 @@ static int __init ipc_sysctl_init(void) } device_initcall(ipc_sysctl_init); + +static int __init ipc_mni_extend(char *str) +{ + ipc_mni = IPCMNI_EXTEND; + ipc_mni_shift = IPCMNI_EXTEND_SHIFT; + ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE; + pr_info("IPCMNI extended to %d.\n", ipc_mni); + return 0; +} +early_param("ipcmni_extend", ipc_mni_extend); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index ba44164ea1f9..216cad1ff0d0 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -76,6 +76,7 @@ struct mqueue_inode_info { wait_queue_head_t wait_q; struct rb_root msg_tree; + struct rb_node *msg_tree_rightmost; struct posix_msg_tree_node *node_cache; struct mq_attr attr; @@ -131,6 +132,7 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) { struct rb_node **p, *parent = NULL; struct posix_msg_tree_node *leaf; + bool rightmost = true; p = &info->msg_tree.rb_node; while (*p) { @@ -139,9 +141,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) if (likely(leaf->priority == msg->m_type)) goto insert_msg; - else if (msg->m_type < leaf->priority) + else if (msg->m_type < leaf->priority) { p = &(*p)->rb_left; - else + rightmost = false; + } else p = &(*p)->rb_right; } if (info->node_cache) { @@ -154,6 +157,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) INIT_LIST_HEAD(&leaf->msg_list); } leaf->priority = msg->m_type; + + if (rightmost) + info->msg_tree_rightmost = &leaf->rb_node; + rb_link_node(&leaf->rb_node, parent, p); rb_insert_color(&leaf->rb_node, &info->msg_tree); insert_msg: @@ -163,23 +170,35 @@ insert_msg: return 0; } +static inline void msg_tree_erase(struct posix_msg_tree_node *leaf, + struct mqueue_inode_info *info) +{ + struct rb_node *node = &leaf->rb_node; + + if (info->msg_tree_rightmost == node) + info->msg_tree_rightmost = rb_prev(node); + + rb_erase(node, &info->msg_tree); + if (info->node_cache) { + kfree(leaf); + } else { + info->node_cache = leaf; + } +} + static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) { - struct rb_node **p, *parent = NULL; + struct rb_node *parent = NULL; struct posix_msg_tree_node *leaf; struct msg_msg *msg; try_again: - p = &info->msg_tree.rb_node; - while (*p) { - parent = *p; - /* - * During insert, low priorities go to the left and high to the - * right. On receive, we want the highest priorities first, so - * walk all the way to the right. - */ - p = &(*p)->rb_right; - } + /* + * During insert, low priorities go to the left and high to the + * right. On receive, we want the highest priorities first, so + * walk all the way to the right. + */ + parent = info->msg_tree_rightmost; if (!parent) { if (info->attr.mq_curmsgs) { pr_warn_once("Inconsistency in POSIX message queue, " @@ -194,24 +213,14 @@ try_again: pr_warn_once("Inconsistency in POSIX message queue, " "empty leaf node but we haven't implemented " "lazy leaf delete!\n"); - rb_erase(&leaf->rb_node, &info->msg_tree); - if (info->node_cache) { - kfree(leaf); - } else { - info->node_cache = leaf; - } + msg_tree_erase(leaf, info); goto try_again; } else { msg = list_first_entry(&leaf->msg_list, struct msg_msg, m_list); list_del(&msg->m_list); if (list_empty(&leaf->msg_list)) { - rb_erase(&leaf->rb_node, &info->msg_tree); - if (info->node_cache) { - kfree(leaf); - } else { - info->node_cache = leaf; - } + msg_tree_erase(leaf, info); } } info->attr.mq_curmsgs--; @@ -254,6 +263,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, info->qsize = 0; info->user = NULL; /* set when all is ok */ info->msg_tree = RB_ROOT; + info->msg_tree_rightmost = NULL; info->node_cache = NULL; memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max, @@ -430,7 +440,8 @@ static void mqueue_evict_inode(struct inode *inode) struct user_struct *user; unsigned long mq_bytes, mq_treesize; struct ipc_namespace *ipc_ns; - struct msg_msg *msg; + struct msg_msg *msg, *nmsg; + LIST_HEAD(tmp_msg); clear_inode(inode); @@ -441,10 +452,15 @@ static void mqueue_evict_inode(struct inode *inode) info = MQUEUE_I(inode); spin_lock(&info->lock); while ((msg = msg_get(info)) != NULL) - free_msg(msg); + list_add_tail(&msg->m_list, &tmp_msg); kfree(info->node_cache); spin_unlock(&info->lock); + list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) { + list_del(&msg->m_list); + free_msg(msg); + } + /* Total amount of bytes accounted for the mqueue */ mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * @@ -605,8 +621,6 @@ static void wq_add(struct mqueue_inode_info *info, int sr, { struct ext_wait_queue *walk; - ewp->task = current; - list_for_each_entry(walk, &info->e_wait_q[sr].list, list) { if (walk->task->prio <= current->prio) { list_add_tail(&ewp->list, &walk->list); diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 84598025a6ad..e65593742e2b 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -18,6 +18,7 @@ #include <linux/utsname.h> #include <linux/proc_ns.h> #include <linux/uaccess.h> +#include <linux/sched.h> #include "util.h" @@ -64,6 +65,9 @@ static struct msg_msg *alloc_msg(size_t len) pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; + + cond_resched(); + alen = min(len, DATALEN_SEG); seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT); if (seg == NULL) @@ -176,6 +180,8 @@ void free_msg(struct msg_msg *msg) kfree(msg); while (seg != NULL) { struct msg_msgseg *tmp = seg->next; + + cond_resched(); kfree(seg); seg = tmp; } diff --git a/ipc/util.c b/ipc/util.c index 095274a871f8..d126d156efc6 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -109,7 +109,7 @@ static const struct rhashtable_params ipc_kht_params = { * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited - * below IPCMNI) then initialise the keys hashtable and ids idr. + * below ipc_mni) then initialise the keys hashtable and ids idr. */ void ipc_init_ids(struct ipc_ids *ids) { @@ -119,6 +119,7 @@ void ipc_init_ids(struct ipc_ids *ids) rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); ids->max_idx = -1; + ids->last_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif @@ -192,6 +193,10 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * * The caller must own kern_ipc_perm.lock.of the new object. * On error, the function returns a (negative) error code. + * + * To conserve sequence number space, especially with extended ipc_mni, + * the sequence number is incremented only when the returned ID is less than + * the last one. */ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { @@ -215,17 +220,42 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) */ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT); + int max_idx; + + max_idx = max(ids->in_use*3/2, ipc_min_cycle); + max_idx = min(max_idx, ipc_mni); + + /* allocate the idx, with a NULL struct kern_ipc_perm */ + idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx, + GFP_NOWAIT); + + if (idx >= 0) { + /* + * idx got allocated successfully. + * Now calculate the sequence number and set the + * pointer for real. + */ + if (idx <= ids->last_idx) { + ids->seq++; + if (ids->seq >= ipcid_seq_max()) + ids->seq = 0; + } + ids->last_idx = idx; + + new->seq = ids->seq; + /* no need for smp_wmb(), this is done + * inside idr_replace, as part of + * rcu_assign_pointer + */ + idr_replace(&ids->ipcs_idr, new, idx); + } } else { new->seq = ipcid_to_seqx(next_id); idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), 0, GFP_NOWAIT); } if (idx >= 0) - new->id = SEQ_MULTIPLIER * new->seq + idx; + new->id = (new->seq << ipcmni_seq_shift()) + idx; return idx; } @@ -253,8 +283,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); - if (limit > IPCMNI) - limit = IPCMNI; + if (limit > ipc_mni) + limit = ipc_mni; if (ids->in_use >= limit) return -ENOSPC; @@ -737,7 +767,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, if (total >= ids->in_use) return NULL; - for (; pos < IPCMNI; pos++) { + for (; pos < ipc_mni; pos++) { ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { *new_pos = pos + 1; diff --git a/ipc/util.h b/ipc/util.h index e272be622ae7..0fcf8e719b76 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -15,8 +15,37 @@ #include <linux/err.h> #include <linux/ipc_namespace.h> -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ -#define SEQ_MULTIPLIER (IPCMNI) +/* + * The IPC ID contains 2 separate numbers - index and sequence number. + * By default, + * bits 0-14: index (32k, 15 bits) + * bits 15-30: sequence number (64k, 16 bits) + * + * When IPCMNI extension mode is turned on, the composition changes: + * bits 0-23: index (16M, 24 bits) + * bits 24-30: sequence number (128, 7 bits) + */ +#define IPCMNI_SHIFT 15 +#define IPCMNI_EXTEND_SHIFT 24 +#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE) +#define IPCMNI (1 << IPCMNI_SHIFT) +#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT) + +#ifdef CONFIG_SYSVIPC_SYSCTL +extern int ipc_mni; +extern int ipc_mni_shift; +extern int ipc_min_cycle; + +#define ipcmni_seq_shift() ipc_mni_shift +#define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1) + +#else /* CONFIG_SYSVIPC_SYSCTL */ + +#define ipc_mni IPCMNI +#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE) +#define ipcmni_seq_shift() IPCMNI_SHIFT +#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1) +#endif /* CONFIG_SYSVIPC_SYSCTL */ void sem_init(void); void msg_init(void); @@ -96,9 +125,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *); #define IPC_MSG_IDS 1 #define IPC_SHM_IDS 2 -#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) -#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX) +#define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK) +#define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift()) +#define ipcid_seq_max() (INT_MAX >> ipcmni_seq_shift()) /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); @@ -123,8 +152,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids) if (ids->in_use == 0) return -1; - if (ids->in_use == IPCMNI) - return IPCMNI - 1; + if (ids->in_use == ipc_mni) + return ipc_mni - 1; return ids->max_idx; } @@ -216,10 +245,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static inline int sem_check_semmni(struct ipc_namespace *ns) { /* - * Check semmni range [0, IPCMNI] + * Check semmni range [0, ipc_mni] * semmni is the last element of sem_ctls[4] array */ - return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI)) + return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni)) ? -ERANGE : 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 327f37c9fdfa..1140357d46f4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3550,7 +3550,65 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); } -#endif + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { @@ -4745,18 +4803,27 @@ static struct cftype cgroup_base_files[] = { .name = "io.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 7a723194ecbe..2b750f13bc8f 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -158,7 +158,7 @@ out: bool dma_in_atomic_pool(void *start, size_t size) { - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4ca7364c956d..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..8361a560cd1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,7 +422,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -462,7 +462,7 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: @@ -483,7 +483,7 @@ assign_new_owner: put_task_struct(c); goto retry; } - mm->owner = c; + WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } diff --git a/kernel/fork.c b/kernel/fork.c index 737db1828437..b17c5224569c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -921,6 +921,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; + tsk->memcg_high_reclaim = NULL; #endif return tsk; @@ -955,6 +956,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -982,7 +992,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->core_state = NULL; mm_pgtables_bytes_init(mm); mm->map_count = 0; - mm->locked_vm = 0; + atomic64_set(&mm->locked_vm, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); @@ -1343,6 +1353,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1726,6 +1737,24 @@ static int pidfd_create(struct pid *pid) return fd; } +#ifdef CONFIG_MEMCG +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} +#endif /* CONFIG_MEMCG */ + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ +#ifdef CONFIG_MEMCG + call_rcu(&tsk->rcu, __delayed_free_task); +#else /* CONFIG_MEMCG */ + free_task(tsk); +#endif /* CONFIG_MEMCG */ +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2068,7 +2097,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif - clear_all_latency_tracing(p); + clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); @@ -2233,8 +2262,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -2265,7 +2296,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..f71c1adcff31 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -53,6 +53,7 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL + depends on CC_IS_GCC ---help--- The gcov format is usually determined by the GCC version, and the default is chosen according to your GCC version. However, there are @@ -62,7 +63,7 @@ choice config GCOV_FORMAT_3_4 bool "GCC 3.4 format" - depends on CC_IS_GCC && GCC_VERSION < 40700 + depends on GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,5 +2,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -22,88 +22,8 @@ #include <linux/sched.h> #include "gcov.h" -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ - static unsigned int gcov_version; - - mutex_lock(&gcov_lock); - if (gcov_version == 0) { - gcov_version = gcov_info_version(info); - /* - * Printing gcc's version magic may prove useful for debugging - * incompatibility reports. - */ - pr_info("version magic: 0x%x\n", gcov_version); - } - /* - * Add new profiling data structure to list and inform event - * listener. - */ - gcov_info_link(info); - if (gcov_events_enabled) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - -void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_time_profile); - -void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_icall_topn); - -void __gcov_exit(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_exit); +int gcov_events_enabled; +DEFINE_MUTEX(gcov_lock); /** * gcov_enable_events - enable event reporting through gcov_event() @@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within_module((unsigned long)info, mod)) { + if (gcov_info_within_module(info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. + * modified from kernel/gcov/gcc_4_7.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * LLVM uses profiling data that's deliberately similar to GCC, but has a + * very different way of exporting that data. LLVM calls llvm_gcov_init() once + * per module, and provides a couple of callbacks that we can use to ask for + * more data. + * + * We care about the "writeout" callback, which in turn calls back into + * compiler-rt/this module to dump all the gathered coverage data to disk: + * + * llvm_gcda_start_file() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * [... repeats for each function ...] + * llvm_gcda_summary_info() + * llvm_gcda_end_file() + * + * This design is much more stateless and unstructured than gcc's, and is + * intended to run at process exit. This forces us to keep some local state + * about which module we're dealing with at the moment. On the other hand, it + * also means we don't depend as much on how LLVM represents profiling data + * internally. + * + * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more + * details on how this works, particularly GCOVProfiler::emitProfileArcs(), + * GCOVProfiler::insertCounterWriteout(), and + * GCOVProfiler::insertFlush(). + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/printk.h> +#include <linux/ratelimit.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "gcov.h" + +typedef void (*llvm_gcov_callback)(void); + +struct gcov_info { + struct list_head head; + + const char *filename; + unsigned int version; + u32 checksum; + + struct list_head functions; +}; + +struct gcov_fn_info { + struct list_head head; + + u32 ident; + u32 checksum; + u8 use_extra_checksum; + u32 cfg_checksum; + + u32 num_counters; + u64 *counters; + const char *function_name; +}; + +static struct gcov_info *current_info; + +static LIST_HEAD(clang_gcov_list); + +void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) +{ + struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + INIT_LIST_HEAD(&info->functions); + + mutex_lock(&gcov_lock); + + list_add_tail(&info->head, &clang_gcov_list); + current_info = info; + writeout(); + current_info = NULL; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(llvm_gcov_init); + +void llvm_gcda_start_file(const char *orig_filename, const char version[4], + u32 checksum) +{ + current_info->filename = orig_filename; + memcpy(¤t_info->version, version, sizeof(current_info->version)); + current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); + +void llvm_gcda_emit_function(u32 ident, const char *function_name, + u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) +{ + struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + info->ident = ident; + info->checksum = func_checksum; + info->use_extra_checksum = use_extra_checksum; + info->cfg_checksum = cfg_checksum; + if (function_name) + info->function_name = kstrdup(function_name, GFP_KERNEL); + + list_add_tail(&info->head, ¤t_info->functions); +} +EXPORT_SYMBOL(llvm_gcda_emit_function); + +void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) +{ + struct gcov_fn_info *info = list_last_entry(¤t_info->functions, + struct gcov_fn_info, head); + + info->num_counters = num_counters; + info->counters = counters; +} +EXPORT_SYMBOL(llvm_gcda_emit_arcs); + +void llvm_gcda_summary_info(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_summary_info); + +void llvm_gcda_end_file(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_end_file); + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return list_first_entry_or_null(&clang_gcov_list, + struct gcov_info, head); + if (list_is_last(&info->head, &clang_gcov_list)) + return NULL; + return list_next_entry(info, head); +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + list_add_tail(&info->head, &clang_gcov_list); +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + /* Generic code unlinks while iterating. */ + __list_del_entry(&info->head); +} + +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info->filename, mod); +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_fn_info *fn; + + list_for_each_entry(fn, &info->functions, head) + memset(fn->counters, 0, + sizeof(fn->counters[0]) * fn->num_counters); +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( + &info1->functions, struct gcov_fn_info, head); + struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( + &info2->functions, struct gcov_fn_info, head); + + if (info1->checksum != info2->checksum) + return false; + if (!fn_ptr1) + return fn_ptr1 == fn_ptr2; + while (!list_is_last(&fn_ptr1->head, &info1->functions) && + !list_is_last(&fn_ptr2->head, &info2->functions)) { + if (fn_ptr1->checksum != fn_ptr2->checksum) + return false; + if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) + return false; + if (fn_ptr1->use_extra_checksum && + fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) + return false; + fn_ptr1 = list_next_entry(fn_ptr1, head); + fn_ptr2 = list_next_entry(fn_ptr2, head); + } + return list_is_last(&fn_ptr1->head, &info1->functions) && + list_is_last(&fn_ptr2->head, &info2->functions); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_fn_info *dfn_ptr; + struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, + struct gcov_fn_info, head); + + list_for_each_entry(dfn_ptr, &dst->functions, head) { + u32 i; + + for (i = 0; i < sfn_ptr->num_counters; i++) + dfn_ptr->counters[i] += sfn_ptr->counters[i]; + } +} + +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ + size_t cv_size; /* counter values size */ + struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), + GFP_KERNEL); + if (!fn_dup) + return NULL; + INIT_LIST_HEAD(&fn_dup->head); + + fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); + if (!fn_dup->function_name) + goto err_name; + + cv_size = fn->num_counters * sizeof(fn->counters[0]); + fn_dup->counters = vmalloc(cv_size); + if (!fn_dup->counters) + goto err_counters; + memcpy(fn_dup->counters, fn->counters, cv_size); + + return fn_dup; + +err_counters: + kfree(fn_dup->function_name); +err_name: + kfree(fn_dup); + return NULL; +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. + */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_fn_info *fn; + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + INIT_LIST_HEAD(&dup->head); + INIT_LIST_HEAD(&dup->functions); + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err; + + list_for_each_entry(fn, &info->functions, head) { + struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); + + if (!fn_dup) + goto err; + list_add_tail(&fn_dup->head, &dup->functions); + } + + return dup; + +err: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + struct gcov_fn_info *fn, *tmp; + + list_for_each_entry_safe(fn, tmp, &info->functions, head) { + kfree(fn->function_name); + vfree(fn->counters); + list_del(&fn->head); + kfree(fn); + } + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + size_t pos = 0; + + /* File header. */ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->checksum); + + list_for_each_entry(fi_ptr, &info->functions, head) { + u32 i; + u32 len = 2; + + if (fi_ptr->use_extra_checksum) + len++; + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, len); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); + if (fi_ptr->use_extra_checksum) + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); + pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); + for (i = 0; i < fi_ptr->num_counters; i++) + pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 2dddecbdbe6e..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include "gcov.h" + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = gcov_info_version(info); + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + gcov_info_link(info); + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -15,6 +15,7 @@ #ifndef GCOV_H #define GCOV_H GCOV_H +#include <linux/module.h> #include <linux/types.h> /* @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info); struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +bool gcov_info_within_module(struct gcov_info *info, struct module *mod); /* Base interface. */ enum gcov_action { @@ -83,4 +85,7 @@ struct gcov_link { }; extern const struct gcov_link gcov_link[]; +extern int gcov_events_enabled; +extern struct mutex gcov_lock; + #endif /* GCOV_H */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..be4e8795561a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,6 +11,7 @@ #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> +#include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 99a5b5f46dc5..871734ea2f04 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -67,13 +67,10 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; -void clear_all_latency_tracing(struct task_struct *p) +void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; - if (!latencytop_enabled) - return; - raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; @@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk, int firstnonnull = MAXLR + 1; int i; - if (!latencytop_enabled) - return; - /* skip kernel threads for now */ if (!tsk->mm) return; diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..bfc95b3e4235 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -22,6 +22,7 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { + WARN_ONCE(((*nl) == n), "double register detected"); if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); diff --git a/kernel/panic.c b/kernel/panic.c index c1fcaad337b7..b4543a31a495 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 +#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -134,6 +135,9 @@ EXPORT_SYMBOL(nmi_panic); static void panic_print_sys_info(void) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -277,7 +281,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_flush_on_panic(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); panic_print_sys_info(); @@ -306,6 +310,8 @@ void panic(const char *fmt, ...) * shutting down. But if there is a chance of * rebooting the system it will be rebooted. */ + if (panic_reboot_mode != REBOOT_UNDEFINED) + reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ @@ -321,6 +327,9 @@ void panic(const char *fmt, ...) disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); + + /* Do not scroll important messages printed above */ + suppress_printk = 1; local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..5b063f0556ba 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -37,12 +37,12 @@ #include <linux/init_task.h> #include <linux/syscalls.h> #include <linux/proc_ns.h> -#include <linux/proc_fs.h> +#include <linux/refcount.h> #include <linux/sched/task.h> #include <linux/idr.h> struct pid init_struct_pid = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, @@ -106,8 +106,8 @@ void put_pid(struct pid *pid) return; ns = pid->numbers[pid->level].ns; - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { + if ((refcount_read(&pid->count) == 1) || + refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } @@ -210,7 +210,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) } get_pid_ns(ns); - atomic_set(&pid->count, 1); + refcount_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02ca827b8fac..e1e82506b2dd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +/* + * System may need to suppress printk message under certain + * circumstances, like after kernel panic happens. + */ +int __read_mostly suppress_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -1943,6 +1949,10 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; u64 curr_log_seq; + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; @@ -2525,10 +2535,11 @@ void console_unblank(void) /** * console_flush_on_panic - flush console content on panic + * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ -void console_flush_on_panic(void) +void console_flush_on_panic(enum con_flush_mode mode) { /* * If someone else is holding the console lock, trylock will fail @@ -2539,6 +2550,11 @@ void console_flush_on_panic(void) */ console_trylock(); console_may_schedule = 0; + + if (mode == CONSOLE_REPLAY_ALL) { + console_seq = log_first_seq; + console_idx = log_first_idx; + } console_unlock(); } diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..b9e79e8c7226 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(cad_pid); #define DEFAULT_REBOOT_MODE #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; /* * This variable is used privately to keep track of whether or not @@ -519,6 +520,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); static int __init reboot_setup(char *str) { for (;;) { + enum reboot_mode *mode; + /* * Having anything passed on the command line via * reboot= will cause us to disable DMI checking @@ -526,17 +529,24 @@ static int __init reboot_setup(char *str) */ reboot_default = 0; + if (!strncmp(str, "panic_", 6)) { + mode = &panic_reboot_mode; + str += 6; + } else { + mode = &reboot_mode; + } + switch (*str) { case 'w': - reboot_mode = REBOOT_WARM; + *mode = REBOOT_WARM; break; case 'c': - reboot_mode = REBOOT_COLD; + *mode = REBOOT_COLD; break; case 'h': - reboot_mode = REBOOT_HARD; + *mode = REBOOT_HARD; break; case 's': @@ -553,11 +563,11 @@ static int __init reboot_setup(char *str) if (rc) return rc; } else - reboot_mode = REBOOT_SOFT; + *mode = REBOOT_SOFT; break; } case 'g': - reboot_mode = REBOOT_GPIO; + *mode = REBOOT_GPIO; break; case 'b': diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..e88918e0bb6d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner <hannes@cmpxchg.org> * + * Polling support by Suren Baghdasaryan <surenb@google.com> + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/seqlock.h> +#include <linux/uaccess.h> #include <linux/cgroup.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/poll.h> #include <linux/psi.h> #include "sched.h" @@ -140,9 +147,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { @@ -156,6 +163,11 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; @@ -165,7 +177,7 @@ static struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +185,20 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); } void __init psi_init(void) @@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; + + *pchanged_states = 0; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; + u32 changed_states = 0; int cpu; int s; - mutex_lock(&group->stat_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, aggregator, times, + &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); + + if (pchanged_states) + *pchanged_states = changed_states; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; /* avgX= */ - now = sched_clock(); - expires = group->next_update; - if (now < expires) - goto out; + expires = group->avg_next_update; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); @@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->stat_lock); - return nonidle_total; + + return avg_next_update; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; + u64 now; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); + + mutex_lock(&group->avgs_lock); + now = sched_clock(); + + collect_percpu_times(group, PSI_AVGS, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); + } + + mutex_unlock(&group->avgs_lock); +} + +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. + */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); - now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; - schedule_delayed_work(dwork, delay); + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. + */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); } /** @@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; @@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. + */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) diff --git a/kernel/signal.c b/kernel/signal.c index 62f9aea4a15a..c4dd66436fc5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -840,6 +840,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; + /* fall through */ default: return -EPERM; } diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..bdbfe8d37418 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 599510a3355e..943c89178e3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include <linux/kexec.h> #include <linux/bpf.h> #include <linux/mount.h> +#include <linux/userfaultfd_k.h> #include "../lib/kstrtox.h" @@ -1720,6 +1721,17 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; @@ -2874,8 +2886,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; @@ -3158,17 +3172,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (write) { char *kbuf, *p; + size_t skipped = 0; - if (left > PAGE_SIZE - 1) + if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } p = kbuf = memdup_user_nul(buffer, left); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), - sizeof(unsigned long), - GFP_KERNEL); + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); return -ENOMEM; @@ -3177,9 +3193,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, while (!err && left) { unsigned long val_a, val_b; bool neg; + size_t saved_left; + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_a >= bitmap_len || neg) { @@ -3197,6 +3226,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. + */ + if (!left && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_b >= bitmap_len || neg || @@ -3215,6 +3253,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, proc_skip_char(&p, &left, '\n'); } kfree(kbuf); + left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -3259,7 +3298,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, *ppos += *lenp; } - kfree(tmp_bitmap); + bitmap_free(tmp_bitmap); return err; } diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..88b834f0eebc 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -185,7 +185,7 @@ struct user_struct *alloc_uid(kuid_t uid) if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) - goto out_unlock; + return NULL; new->uid = uid; refcount_set(&new->__count, 1); @@ -199,8 +199,6 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - key_put(new->uid_keyring); - key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -210,9 +208,6 @@ struct user_struct *alloc_uid(kuid_t uid) } return up; - -out_unlock: - return NULL; } static int __init uid_cache_init(void) diff --git a/lib/Kconfig b/lib/Kconfig index f871282e0612..8d9239a4156c 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -46,9 +46,6 @@ config HAVE_ARCH_BITREVERSE This option enables the use of hardware bit-reversal instructions on architectures which support such operations. -config RATIONAL - bool - config GENERIC_STRNCPY_FROM_USER bool @@ -61,6 +58,8 @@ config GENERIC_NET_UTILS config GENERIC_FIND_FIRST_BIT bool +source "lib/math/Kconfig" + config NO_GENERIC_PCI_IOPORT_MAP bool @@ -531,12 +530,6 @@ config LRU_CACHE config CLZ_TAB bool -config CORDIC - tristate "CORDIC algorithm" - help - This option provides an implementation of the CORDIC algorithm; - calculations are in fixed point. Module will be called cordic. - config DDR bool "JEDEC DDR data" help @@ -632,9 +625,6 @@ config SBITMAP config PARMAN tristate "parman" if COMPILE_TEST -config PRIME_NUMBERS - tristate - config STRING_SELFTEST tristate "Test string functions" diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d695ec1477f3..2a75cfef2e20 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -163,6 +163,9 @@ config DYNAMIC_DEBUG See Documentation/admin-guide/dynamic-debug-howto.rst for additional information. +config DYNAMIC_DEBUG_RELATIVE_POINTERS + bool + endmenu # "printk and dmesg options" menu "Compile-time checks and compiler options" @@ -318,6 +321,20 @@ config HEADERS_CHECK exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in your build tree), to make sure they're suitable. +config OPTIMIZE_INLINING + bool "Allow compiler to uninline functions marked 'inline'" + help + This option determines if the kernel forces gcc to inline the functions + developers have marked 'inline'. Doing so takes away freedom from gcc to + do what it thinks is best, which is desirable for the gcc 3.x series of + compilers. The gcc 4.x series have a rewritten inlining algorithm and + enabling this option will generate a smaller kernel there. Hopefully + this algorithm is so good that allowing gcc 4.x and above to make the + decision will become the default in the future. Until then this option + is there to test gcc for this. + + If unsure, say N. + config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" help @@ -446,6 +463,15 @@ config DEBUG_KERNEL Say Y here if you are developing drivers or trying to debug and identify kernel problems. +config DEBUG_MISC + bool "Miscellaneous debug code" + default DEBUG_KERNEL + depends on DEBUG_KERNEL + help + Say Y here if you need to enable miscellaneous debug code that should + be under a more specific debug option but isn't. + + menu "Memory Debugging" source "mm/Kconfig.debug" @@ -1358,7 +1384,7 @@ config DEBUG_LIST If unsure, say N. -config DEBUG_PI_LIST +config DEBUG_PLIST bool "Debug priority linked list manipulation" depends on DEBUG_KERNEL help @@ -2088,6 +2114,12 @@ config IO_STRICT_DEVMEM If in doubt, say Y. +config DEBUG_AID_FOR_SYZBOT + bool "Additional debug code for syzbot" + default n + help + This option is intended for testing by syzbot. + source "arch/$(SRCARCH)/Kconfig.debug" endmenu # Kernel hacking diff --git a/lib/Makefile b/lib/Makefile index 83d7df2661ff..fb7697031a79 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -30,7 +30,7 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o timerqueue.o xarray.o \ - idr.o int_sqrt.o extable.o \ + idr.o extable.o \ sha1.o chacha.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ @@ -44,11 +44,11 @@ lib-$(CONFIG_SMP) += cpumask.o lib-y += kobject.o klist.o obj-y += lockref.o -obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ +obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o iov_iter.o clz_ctz.o \ + list_sort.o uuid.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o rhashtable.o reciprocal_div.o \ + percpu-refcount.o rhashtable.o \ once.o refcount.o usercopy.o errseq.o bucket_locks.o \ generic-radix-tree.o obj-$(CONFIG_STRING_SELFTEST) += test_string.o @@ -102,6 +102,8 @@ endif obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any) +obj-y += math/ + obj-$(CONFIG_GENERIC_IOMAP) += iomap.o obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o @@ -121,7 +123,6 @@ obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o obj-$(CONFIG_BITREVERSE) += bitrev.o obj-$(CONFIG_PACKING) += packing.o -obj-$(CONFIG_RATIONAL) += rational.o obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o obj-$(CONFIG_CRC16) += crc16.o obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o @@ -195,8 +196,6 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o -obj-$(CONFIG_CORDIC) += cordic.o - obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_GLOB) += glob.o @@ -238,8 +237,6 @@ obj-$(CONFIG_ASN1) += asn1_decoder.o obj-$(CONFIG_FONT_SUPPORT) += fonts/ -obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o - hostprogs-y := gen_crc32table hostprogs-y += gen_crc64table clean-files := crc32table.h diff --git a/lib/bitmap.c b/lib/bitmap.c index 98872e9025da..f235434df87b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -20,6 +20,8 @@ #include <asm/page.h> +#include "kstrtox.h" + /** * DOC: bitmap introduction * @@ -477,12 +479,128 @@ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, } EXPORT_SYMBOL(bitmap_print_to_pagebuf); +/* + * Region 9-38:4/10 describes the following bitmap structure: + * 0 9 12 18 38 + * .........****......****......****...... + * ^ ^ ^ ^ + * start off group_len end + */ +struct region { + unsigned int start; + unsigned int off; + unsigned int group_len; + unsigned int end; +}; + +static int bitmap_set_region(const struct region *r, + unsigned long *bitmap, int nbits) +{ + unsigned int start; + + if (r->end >= nbits) + return -ERANGE; + + for (start = r->start; start <= r->end; start += r->group_len) + bitmap_set(bitmap, start, min(r->end - start + 1, r->off)); + + return 0; +} + +static int bitmap_check_region(const struct region *r) +{ + if (r->start > r->end || r->group_len == 0 || r->off > r->group_len) + return -EINVAL; + + return 0; +} + +static const char *bitmap_getnum(const char *str, unsigned int *num) +{ + unsigned long long n; + unsigned int len; + + len = _parse_integer(str, 10, &n); + if (!len) + return ERR_PTR(-EINVAL); + if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n) + return ERR_PTR(-EOVERFLOW); + + *num = n; + return str + len; +} + +static inline bool end_of_str(char c) +{ + return c == '\0' || c == '\n'; +} + +static inline bool __end_of_region(char c) +{ + return isspace(c) || c == ','; +} + +static inline bool end_of_region(char c) +{ + return __end_of_region(c) || end_of_str(c); +} + +/* + * The format allows commas and whitespases at the beginning + * of the region. + */ +static const char *bitmap_find_region(const char *str) +{ + while (__end_of_region(*str)) + str++; + + return end_of_str(*str) ? NULL : str; +} + +static const char *bitmap_parse_region(const char *str, struct region *r) +{ + str = bitmap_getnum(str, &r->start); + if (IS_ERR(str)) + return str; + + if (end_of_region(*str)) + goto no_end; + + if (*str != '-') + return ERR_PTR(-EINVAL); + + str = bitmap_getnum(str + 1, &r->end); + if (IS_ERR(str)) + return str; + + if (end_of_region(*str)) + goto no_pattern; + + if (*str != ':') + return ERR_PTR(-EINVAL); + + str = bitmap_getnum(str + 1, &r->off); + if (IS_ERR(str)) + return str; + + if (*str != '/') + return ERR_PTR(-EINVAL); + + return bitmap_getnum(str + 1, &r->group_len); + +no_end: + r->end = r->start; +no_pattern: + r->off = r->end + 1; + r->group_len = r->end + 1; + + return end_of_str(*str) ? NULL : str; +} + /** - * __bitmap_parselist - convert list format ASCII string to bitmap - * @buf: read nul-terminated user string from this buffer - * @buflen: buffer size in bytes. If string is smaller than this - * then it must be terminated with a \0. - * @is_user: location of buffer, 0 indicates kernel space + * bitmap_parselist - convert list format ASCII string to bitmap + * @buf: read user string from this buffer; must be terminated + * with a \0 or \n. * @maskp: write resulting mask here * @nmaskbits: number of bits in mask to be written * @@ -498,127 +616,38 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf); * * Returns: 0 on success, -errno on invalid input strings. Error values: * - * - ``-EINVAL``: second number in range smaller than first + * - ``-EINVAL``: wrong region format * - ``-EINVAL``: invalid character in string * - ``-ERANGE``: bit number specified too large for mask + * - ``-EOVERFLOW``: integer overflow in the input parameters */ -static int __bitmap_parselist(const char *buf, unsigned int buflen, - int is_user, unsigned long *maskp, - int nmaskbits) +int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) { - unsigned int a, b, old_a, old_b; - unsigned int group_size, used_size, off; - int c, old_c, totaldigits, ndigits; - const char __user __force *ubuf = (const char __user __force *)buf; - int at_start, in_range, in_partial_range; + struct region r; + long ret; - totaldigits = c = 0; - old_a = old_b = 0; - group_size = used_size = 0; bitmap_zero(maskp, nmaskbits); - do { - at_start = 1; - in_range = 0; - in_partial_range = 0; - a = b = 0; - ndigits = totaldigits; - - /* Get the next cpu# or a range of cpu#'s */ - while (buflen) { - old_c = c; - if (is_user) { - if (__get_user(c, ubuf++)) - return -EFAULT; - } else - c = *buf++; - buflen--; - if (isspace(c)) - continue; - - /* A '\0' or a ',' signal the end of a cpu# or range */ - if (c == '\0' || c == ',') - break; - /* - * whitespaces between digits are not allowed, - * but it's ok if whitespaces are on head or tail. - * when old_c is whilespace, - * if totaldigits == ndigits, whitespace is on head. - * if whitespace is on tail, it should not run here. - * as c was ',' or '\0', - * the last code line has broken the current loop. - */ - if ((totaldigits != ndigits) && isspace(old_c)) - return -EINVAL; - if (c == '/') { - used_size = a; - at_start = 1; - in_range = 0; - a = b = 0; - continue; - } + while (buf) { + buf = bitmap_find_region(buf); + if (buf == NULL) + return 0; - if (c == ':') { - old_a = a; - old_b = b; - at_start = 1; - in_range = 0; - in_partial_range = 1; - a = b = 0; - continue; - } + buf = bitmap_parse_region(buf, &r); + if (IS_ERR(buf)) + return PTR_ERR(buf); - if (c == '-') { - if (at_start || in_range) - return -EINVAL; - b = 0; - in_range = 1; - at_start = 1; - continue; - } + ret = bitmap_check_region(&r); + if (ret) + return ret; - if (!isdigit(c)) - return -EINVAL; + ret = bitmap_set_region(&r, maskp, nmaskbits); + if (ret) + return ret; + } - b = b * 10 + (c - '0'); - if (!in_range) - a = b; - at_start = 0; - totaldigits++; - } - if (ndigits == totaldigits) - continue; - if (in_partial_range) { - group_size = a; - a = old_a; - b = old_b; - old_a = old_b = 0; - } else { - used_size = group_size = b - a + 1; - } - /* if no digit is after '-', it's wrong*/ - if (at_start && in_range) - return -EINVAL; - if (!(a <= b) || group_size == 0 || !(used_size <= group_size)) - return -EINVAL; - if (b >= nmaskbits) - return -ERANGE; - while (a <= b) { - off = min(b - a + 1, used_size); - bitmap_set(maskp, a, off); - a += group_size; - } - } while (buflen && c == ','); return 0; } - -int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) -{ - char *nl = strchrnul(bp, '\n'); - int len = nl - bp; - - return __bitmap_parselist(bp, len, 0, maskp, nmaskbits); -} EXPORT_SYMBOL(bitmap_parselist); @@ -632,23 +661,27 @@ EXPORT_SYMBOL(bitmap_parselist); * @nmaskbits: size of bitmap, in bits. * * Wrapper for bitmap_parselist(), providing it with user buffer. - * - * We cannot have this as an inline function in bitmap.h because it needs - * linux/uaccess.h to get the access_ok() declaration and this causes - * cyclic dependencies. */ int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { - if (!access_ok(ubuf, ulen)) - return -EFAULT; - return __bitmap_parselist((const char __force *)ubuf, - ulen, 1, maskp, nmaskbits); + char *buf; + int ret; + + buf = memdup_user_nul(ubuf, ulen); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + ret = bitmap_parselist(buf, maskp, nmaskbits); + + kfree(buf); + return ret; } EXPORT_SYMBOL(bitmap_parselist_user); +#ifdef CONFIG_NUMA /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap @@ -757,7 +790,6 @@ void bitmap_remap(unsigned long *dst, const unsigned long *src, set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst); } } -EXPORT_SYMBOL(bitmap_remap); /** * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit @@ -795,7 +827,6 @@ int bitmap_bitremap(int oldbit, const unsigned long *old, else return bitmap_ord_to_pos(new, n % w, bits); } -EXPORT_SYMBOL(bitmap_bitremap); /** * bitmap_onto - translate one bitmap relative to another @@ -930,7 +961,6 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig, m++; } } -EXPORT_SYMBOL(bitmap_onto); /** * bitmap_fold - fold larger bitmap into smaller, modulo specified size @@ -955,7 +985,7 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig, for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } -EXPORT_SYMBOL(bitmap_fold); +#endif /* CONFIG_NUMA */ /* * Common code for bitmap_*_region() routines. diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 55437fd5128b..1546a3c00709 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -375,6 +375,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; + bool check_object_on_stack = false; fill_pool(); @@ -391,7 +392,7 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) debug_objects_oom(); return; } - debug_object_is_on_stack(addr, onstack); + check_object_on_stack = true; } switch (obj->state) { @@ -402,20 +403,24 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) break; case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "init"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "init"); debug_object_fixup(descr->fixup_init, addr, state); return; case ODEBUG_STATE_DESTROYED: + raw_spin_unlock_irqrestore(&db->lock, flags); debug_print_object(obj, "init"); - break; + return; default: break; } raw_spin_unlock_irqrestore(&db->lock, flags); + if (check_object_on_stack) + debug_object_is_on_stack(addr, onstack); + } /** @@ -473,33 +478,34 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) obj = lookup_object(addr, db); if (obj) { + ret = 0; switch (obj->state) { case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: obj->state = ODEBUG_STATE_ACTIVE; - ret = 0; break; case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "activate"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "activate"); ret = debug_object_fixup(descr->fixup_activate, addr, state); return ret ? 0 : -EINVAL; case ODEBUG_STATE_DESTROYED: - debug_print_object(obj, "activate"); ret = -EINVAL; break; default: - ret = 0; break; } raw_spin_unlock_irqrestore(&db->lock, flags); + if (ret) + debug_print_object(obj, "activate"); return ret; } raw_spin_unlock_irqrestore(&db->lock, flags); + /* * We are here when a static object is activated. We * let the type specific code confirm whether this is @@ -548,24 +554,30 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) if (!obj->astate) obj->state = ODEBUG_STATE_INACTIVE; else - debug_print_object(obj, "deactivate"); + goto out_unlock_print; break; case ODEBUG_STATE_DESTROYED: - debug_print_object(obj, "deactivate"); - break; + goto out_unlock_print; + default: break; } - } else { + } + + raw_spin_unlock_irqrestore(&db->lock, flags); + if (!obj) { struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; debug_print_object(&o, "deactivate"); } + return; +out_unlock_print: raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "deactivate"); } EXPORT_SYMBOL_GPL(debug_object_deactivate); @@ -599,15 +611,16 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) obj->state = ODEBUG_STATE_DESTROYED; break; case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "destroy"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "destroy"); debug_object_fixup(descr->fixup_destroy, addr, state); return; case ODEBUG_STATE_DESTROYED: + raw_spin_unlock_irqrestore(&db->lock, flags); debug_print_object(obj, "destroy"); - break; + return; default: break; } @@ -641,9 +654,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr) switch (obj->state) { case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "free"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "free"); debug_object_fixup(descr->fixup_free, addr, state); return; default: @@ -731,22 +744,27 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr, if (obj->astate == expect) obj->astate = next; else - debug_print_object(obj, "active_state"); + goto out_unlock_print; break; default: - debug_print_object(obj, "active_state"); - break; + goto out_unlock_print; } - } else { + } + + raw_spin_unlock_irqrestore(&db->lock, flags); + if (!obj) { struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; debug_print_object(&o, "active_state"); } + return; +out_unlock_print: raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "active_state"); } EXPORT_SYMBOL_GPL(debug_object_active_state); @@ -782,10 +800,10 @@ repeat: switch (obj->state) { case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "free"); descr = obj->descr; state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "free"); debug_object_fixup(descr->fixup_free, (void *) oaddr, state); goto repeat; @@ -978,19 +996,24 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings) struct debug_obj *obj; unsigned long flags; int res = -EINVAL; + enum debug_obj_state obj_state; db = get_bucket((unsigned long) addr); raw_spin_lock_irqsave(&db->lock, flags); obj = lookup_object(addr, db); + obj_state = obj ? obj->state : state; + + raw_spin_unlock_irqrestore(&db->lock, flags); + if (!obj && state != ODEBUG_STATE_NONE) { WARN(1, KERN_ERR "ODEBUG: selftest object not found\n"); goto out; } - if (obj && obj->state != state) { + if (obj_state != state) { WARN(1, KERN_ERR "ODEBUG: selftest wrong state: %d != %d\n", - obj->state, state); + obj_state, state); goto out; } if (fixups != debug_objects_fixups) { @@ -1005,7 +1028,6 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings) } res = 0; out: - raw_spin_unlock_irqrestore(&db->lock, flags); if (res) debug_objects_enabled = 0; return res; diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8a16c2d498e9..f1646795692c 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -39,6 +39,55 @@ #include <rdma/ib_verbs.h> +#ifdef CONFIG_DYNAMIC_DEBUG_RELATIVE_POINTERS +static inline const char *dd_modname(const struct _ddebug *dd) +{ + return (const char *)dd + dd->modname_disp; +} +static inline const char *dd_function(const struct _ddebug *dd) +{ + return (const char *)dd + dd->function_disp; +} +static inline const char *dd_filename(const struct _ddebug *dd) +{ + return (const char *)dd + dd->filename_disp; +} +static inline const char *dd_format(const struct _ddebug *dd) +{ + return (const char *)dd + dd->format_disp; +} +#else +static inline const char *dd_modname(const struct _ddebug *dd) +{ + return dd->modname; +} +static inline const char *dd_function(const struct _ddebug *dd) +{ + return dd->function; +} +static inline const char *dd_filename(const struct _ddebug *dd) +{ + return dd->filename; +} +static inline const char *dd_format(const struct _ddebug *dd) +{ + return dd->format; +} +#endif + +static inline unsigned dd_lineno(const struct _ddebug *dd) +{ + return dd->flags_lineno >> 8; +} +static inline unsigned dd_flags(const struct _ddebug *dd) +{ + return dd->flags_lineno & 0xff; +} +static inline void dd_set_flags(struct _ddebug *dd, unsigned newflags) +{ + dd->flags_lineno = (dd_lineno(dd) << 8) | newflags; +} + extern struct _ddebug __start___verbose[]; extern struct _ddebug __stop___verbose[]; @@ -96,7 +145,7 @@ static char *ddebug_describe_flags(struct _ddebug *dp, char *buf, BUG_ON(maxlen < 6); for (i = 0; i < ARRAY_SIZE(opt_array); ++i) - if (dp->flags & opt_array[i].flag) + if (dd_flags(dp) & opt_array[i].flag) *p++ = opt_array[i].opt_char; if (p == buf) *p++ = '_'; @@ -142,7 +191,7 @@ static int ddebug_change(const struct ddebug_query *query, { int i; struct ddebug_table *dt; - unsigned int newflags; + unsigned int newflags, oldflags; unsigned int nfound = 0; char flagbuf[10]; @@ -160,47 +209,48 @@ static int ddebug_change(const struct ddebug_query *query, /* match against the source filename */ if (query->filename && - !match_wildcard(query->filename, dp->filename) && + !match_wildcard(query->filename, dd_filename(dp)) && !match_wildcard(query->filename, - kbasename(dp->filename)) && + kbasename(dd_filename(dp))) && !match_wildcard(query->filename, - trim_prefix(dp->filename))) + trim_prefix(dd_filename(dp)))) continue; /* match against the function */ if (query->function && - !match_wildcard(query->function, dp->function)) + !match_wildcard(query->function, dd_function(dp))) continue; /* match against the format */ if (query->format && - !strstr(dp->format, query->format)) + !strstr(dd_format(dp), query->format)) continue; /* match against the line number range */ if (query->first_lineno && - dp->lineno < query->first_lineno) + dd_lineno(dp) < query->first_lineno) continue; if (query->last_lineno && - dp->lineno > query->last_lineno) + dd_lineno(dp) > query->last_lineno) continue; nfound++; - newflags = (dp->flags & mask) | flags; - if (newflags == dp->flags) + oldflags = dd_flags(dp); + newflags = (oldflags & mask) | flags; + if (newflags == oldflags) continue; #ifdef CONFIG_JUMP_LABEL - if (dp->flags & _DPRINTK_FLAGS_PRINT) { + if (oldflags & _DPRINTK_FLAGS_PRINT) { if (!(flags & _DPRINTK_FLAGS_PRINT)) static_branch_disable(&dp->key.dd_key_true); } else if (flags & _DPRINTK_FLAGS_PRINT) static_branch_enable(&dp->key.dd_key_true); #endif - dp->flags = newflags; + dd_set_flags(dp, newflags); vpr_info("changed %s:%d [%s]%s =%s\n", - trim_prefix(dp->filename), dp->lineno, - dt->mod_name, dp->function, + trim_prefix(dd_filename(dp)), dd_lineno(dp), + dt->mod_name, dd_function(dp), ddebug_describe_flags(dp, flagbuf, sizeof(flagbuf))); } @@ -522,10 +572,11 @@ static char *dynamic_emit_prefix(const struct _ddebug *desc, char *buf) { int pos_after_tid; int pos = 0; + unsigned flags = dd_flags(desc); *buf = '\0'; - if (desc->flags & _DPRINTK_FLAGS_INCL_TID) { + if (flags & _DPRINTK_FLAGS_INCL_TID) { if (in_interrupt()) pos += snprintf(buf + pos, remaining(pos), "<intr> "); else @@ -533,15 +584,15 @@ static char *dynamic_emit_prefix(const struct _ddebug *desc, char *buf) task_pid_vnr(current)); } pos_after_tid = pos; - if (desc->flags & _DPRINTK_FLAGS_INCL_MODNAME) + if (flags & _DPRINTK_FLAGS_INCL_MODNAME) pos += snprintf(buf + pos, remaining(pos), "%s:", - desc->modname); - if (desc->flags & _DPRINTK_FLAGS_INCL_FUNCNAME) + dd_modname(desc)); + if (flags & _DPRINTK_FLAGS_INCL_FUNCNAME) pos += snprintf(buf + pos, remaining(pos), "%s:", - desc->function); - if (desc->flags & _DPRINTK_FLAGS_INCL_LINENO) + dd_function(desc)); + if (flags & _DPRINTK_FLAGS_INCL_LINENO) pos += snprintf(buf + pos, remaining(pos), "%d:", - desc->lineno); + dd_lineno(desc)); if (pos - pos_after_tid) pos += snprintf(buf + pos, remaining(pos), " "); if (pos >= PREFIX_SIZE) @@ -827,10 +878,10 @@ static int ddebug_proc_show(struct seq_file *m, void *p) } seq_printf(m, "%s:%u [%s]%s =%s \"", - trim_prefix(dp->filename), dp->lineno, - iter->table->mod_name, dp->function, + trim_prefix(dd_filename(dp)), dd_lineno(dp), + iter->table->mod_name, dd_function(dp), ddebug_describe_flags(dp, flagsbuf, sizeof(flagsbuf))); - seq_escape(m, dp->format, "\t\r\n\""); + seq_escape(m, dd_format(dp), "\t\r\n\""); seq_puts(m, "\"\n"); return 0; @@ -1024,20 +1075,20 @@ static int __init dynamic_debug_init(void) return 1; } iter = __start___verbose; - modname = iter->modname; + modname = dd_modname(iter); iter_start = iter; for (; iter < __stop___verbose; iter++) { entries++; - verbose_bytes += strlen(iter->modname) + strlen(iter->function) - + strlen(iter->filename) + strlen(iter->format); + verbose_bytes += strlen(dd_modname(iter)) + strlen(dd_function(iter)) + + strlen(dd_filename(iter)) + strlen(dd_format(iter)); - if (strcmp(modname, iter->modname)) { + if (strcmp(modname, dd_modname(iter))) { modct++; ret = ddebug_add_module(iter_start, n, modname); if (ret) goto out_err; n = 0; - modname = iter->modname; + modname = dd_modname(iter); iter_start = iter; } n++; diff --git a/lib/genalloc.c b/lib/genalloc.c index 7e85d1e37a6e..2e45b3a94313 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -424,7 +424,7 @@ void gen_pool_for_each_chunk(struct gen_pool *pool, EXPORT_SYMBOL(gen_pool_for_each_chunk); /** - * addr_in_gen_pool - checks if an address falls within the range of a pool + * gen_pool_has_addr - checks if an address falls within the range of a pool * @pool: the generic memory pool * @start: start address * @size: size of the region @@ -432,7 +432,7 @@ EXPORT_SYMBOL(gen_pool_for_each_chunk); * Check if the range of addresses falls within the specified pool. Returns * true if the entire range is contained in the pool and false otherwise. */ -bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, +bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start, size_t size) { bool found = false; @@ -451,6 +451,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, rcu_read_unlock(); return found; } +EXPORT_SYMBOL(gen_pool_has_addr); /** * gen_pool_avail - get available free space of the pool diff --git a/lib/iov_iter.c b/lib/iov_iter.c index b396d328a764..f74fa832f3aa 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1293,7 +1293,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, + pages); if (unlikely(res < 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; @@ -1374,7 +1376,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); if (unlikely(res < 0)) { kvfree(p); return res; diff --git a/lib/list_sort.c b/lib/list_sort.c index 85759928215b..06e900c5587b 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -7,33 +7,41 @@ #include <linux/list_sort.h> #include <linux/list.h> -#define MAX_LIST_LENGTH_BITS 20 +typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *, + struct list_head const *, struct list_head const *); /* * Returns a list organized in an intermediate format suited * to chaining of merge() calls: null-terminated, no reserved or * sentinel head node, "prev" links not maintained. */ -static struct list_head *merge(void *priv, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b), +__attribute__((nonnull(2,3,4))) +static struct list_head *merge(void *priv, cmp_func cmp, struct list_head *a, struct list_head *b) { - struct list_head head, *tail = &head; + struct list_head *head, **tail = &head; - while (a && b) { + for (;;) { /* if equal, take 'a' -- important for sort stability */ - if ((*cmp)(priv, a, b) <= 0) { - tail->next = a; + if (cmp(priv, a, b) <= 0) { + *tail = a; + tail = &a->next; a = a->next; + if (!a) { + *tail = b; + break; + } } else { - tail->next = b; + *tail = b; + tail = &b->next; b = b->next; + if (!b) { + *tail = a; + break; + } } - tail = tail->next; } - tail->next = a?:b; - return head.next; + return head; } /* @@ -43,44 +51,52 @@ static struct list_head *merge(void *priv, * prev-link restoration pass, or maintaining the prev links * throughout. */ -static void merge_and_restore_back_links(void *priv, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b), - struct list_head *head, - struct list_head *a, struct list_head *b) +__attribute__((nonnull(2,3,4,5))) +static void merge_final(void *priv, cmp_func cmp, struct list_head *head, + struct list_head *a, struct list_head *b) { struct list_head *tail = head; u8 count = 0; - while (a && b) { + for (;;) { /* if equal, take 'a' -- important for sort stability */ - if ((*cmp)(priv, a, b) <= 0) { + if (cmp(priv, a, b) <= 0) { tail->next = a; a->prev = tail; + tail = a; a = a->next; + if (!a) + break; } else { tail->next = b; b->prev = tail; + tail = b; b = b->next; + if (!b) { + b = a; + break; + } } - tail = tail->next; } - tail->next = a ? : b; + /* Finish linking remainder of list b on to tail */ + tail->next = b; do { /* - * In worst cases this loop may run many iterations. + * If the merge is highly unbalanced (e.g. the input is + * already sorted), this loop may run many iterations. * Continue callbacks to the client even though no * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ - if (unlikely(!(++count))) - (*cmp)(priv, tail->next, tail->next); - - tail->next->prev = tail; - tail = tail->next; - } while (tail->next); - + if (unlikely(!++count)) + cmp(priv, b, b); + b->prev = tail; + tail = b; + b = b->next; + } while (b); + + /* And the final links to make a circular doubly-linked list */ tail->next = head; head->prev = tail; } @@ -91,55 +107,149 @@ static void merge_and_restore_back_links(void *priv, * @head: the list to sort * @cmp: the elements comparison function * - * This function implements "merge sort", which has O(nlog(n)) - * complexity. + * The comparison funtion @cmp must return > 0 if @a should sort after + * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should + * sort before @b *or* their original order should be preserved. It is + * always called with the element that came first in the input in @a, + * and list_sort is a stable sort, so it is not necessary to distinguish + * the @a < @b and @a == @b cases. + * + * This is compatible with two styles of @cmp function: + * - The traditional style which returns <0 / =0 / >0, or + * - Returning a boolean 0/1. + * The latter offers a chance to save a few cycles in the comparison + * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c). + * + * A good way to write a multi-word comparison is + * if (a->high != b->high) + * return a->high > b->high; + * if (a->middle != b->middle) + * return a->middle > b->middle; + * return a->low > b->low; + * + * + * This mergesort is as eager as possible while always performing at least + * 2:1 balanced merges. Given two pending sublists of size 2^k, they are + * merged to a size-2^(k+1) list as soon as we have 2^k following elements. + * + * Thus, it will avoid cache thrashing as long as 3*2^k elements can + * fit into the cache. Not quite as good as a fully-eager bottom-up + * mergesort, but it does use 0.2*n fewer comparisons, so is faster in + * the common case that everything fits into L1. + * + * + * The merging is controlled by "count", the number of elements in the + * pending lists. This is beautiully simple code, but rather subtle. * - * The comparison function @cmp must return a negative value if @a - * should sort before @b, and a positive value if @a should sort after - * @b. If @a and @b are equivalent, and their original relative - * ordering is to be preserved, @cmp must return 0. + * Each time we increment "count", we set one bit (bit k) and clear + * bits k-1 .. 0. Each time this happens (except the very first time + * for each bit, when count increments to 2^k), we merge two lists of + * size 2^k into one list of size 2^(k+1). + * + * This merge happens exactly when the count reaches an odd multiple of + * 2^k, which is when we have 2^k elements pending in smaller lists, + * so it's safe to merge away two lists of size 2^k. + * + * After this happens twice, we have created two lists of size 2^(k+1), + * which will be merged into a list of size 2^(k+2) before we create + * a third list of size 2^(k+1), so there are never more than two pending. + * + * The number of pending lists of size 2^k is determined by the + * state of bit k of "count" plus two extra pieces of information: + * - The state of bit k-1 (when k == 0, consider bit -1 always set), and + * - Whether the higher-order bits are zero or non-zero (i.e. + * is count >= 2^(k+1)). + * There are six states we distinguish. "x" represents some arbitrary + * bits, and "y" represents some arbitrary non-zero bits: + * 0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k + * 1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k + * 2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k + * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k + * 4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k + * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k + * (merge and loop back to state 2) + * + * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because + * bit k-1 is set while the more significant bits are non-zero) and + * merge them away in the 5->2 transition. Note in particular that just + * before the 5->2 transition, all lower-order bits are 11 (state 3), + * so there is one list of each smaller size. + * + * When we reach the end of the input, we merge all the pending + * lists, from smallest to largest. If you work through cases 2 to + * 5 above, you can see that the number of elements we merge with a list + * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to + * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1). */ +__attribute__((nonnull(2,3))) void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)) { - struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists - -- last slot is a sentinel */ - int lev; /* index into part[] */ - int max_lev = 0; - struct list_head *list; + struct list_head *list = head->next, *pending = NULL; + size_t count = 0; /* Count of pending */ - if (list_empty(head)) + if (list == head->prev) /* Zero or one elements */ return; - memset(part, 0, sizeof(part)); - + /* Convert to a null-terminated singly-linked list. */ head->prev->next = NULL; - list = head->next; - - while (list) { - struct list_head *cur = list; - list = list->next; - cur->next = NULL; - for (lev = 0; part[lev]; lev++) { - cur = merge(priv, cmp, part[lev], cur); - part[lev] = NULL; - } - if (lev > max_lev) { - if (unlikely(lev >= ARRAY_SIZE(part)-1)) { - printk_once(KERN_DEBUG "list too long for efficiency\n"); - lev--; - } - max_lev = lev; + /* + * Data structure invariants: + * - All lists are singly linked and null-terminated; prev + * pointers are not maintained. + * - pending is a prev-linked "list of lists" of sorted + * sublists awaiting further merging. + * - Each of the sorted sublists is power-of-two in size. + * - Sublists are sorted by size and age, smallest & newest at front. + * - There are zero to two sublists of each size. + * - A pair of pending sublists are merged as soon as the number + * of following pending elements equals their size (i.e. + * each time count reaches an odd multiple of that size). + * That ensures each later final merge will be at worst 2:1. + * - Each round consists of: + * - Merging the two sublists selected by the highest bit + * which flips when count is incremented, and + * - Adding an element from the input as a size-1 sublist. + */ + do { + size_t bits; + struct list_head **tail = &pending; + + /* Find the least-significant clear bit in count */ + for (bits = count; bits & 1; bits >>= 1) + tail = &(*tail)->prev; + /* Do the indicated merge */ + if (likely(bits)) { + struct list_head *a = *tail, *b = a->prev; + + a = merge(priv, (cmp_func)cmp, b, a); + /* Install the merged result in place of the inputs */ + a->prev = b->prev; + *tail = a; } - part[lev] = cur; - } - for (lev = 0; lev < max_lev; lev++) - if (part[lev]) - list = merge(priv, cmp, part[lev], list); - - merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); + /* Move one element from input list to pending */ + list->prev = pending; + pending = list; + list = list->next; + pending->next = NULL; + count++; + } while (list); + + /* End of input; merge together all the pending lists. */ + list = pending; + pending = pending->prev; + for (;;) { + struct list_head *next = pending->prev; + + if (!next) + break; + list = merge(priv, (cmp_func)cmp, pending, list); + pending = next; + } + /* The final merge, rebuilding prev links */ + merge_final(priv, (cmp_func)cmp, head, pending, list); } EXPORT_SYMBOL(list_sort); diff --git a/lib/math/Kconfig b/lib/math/Kconfig new file mode 100644 index 000000000000..73bdf37178d1 --- /dev/null +++ b/lib/math/Kconfig @@ -0,0 +1,11 @@ +config CORDIC + tristate "CORDIC algorithm" + help + This option provides an implementation of the CORDIC algorithm; + calculations are in fixed point. Module will be called cordic. + +config PRIME_NUMBERS + tristate + +config RATIONAL + bool diff --git a/lib/math/Makefile b/lib/math/Makefile new file mode 100644 index 000000000000..583bbfebfc09 --- /dev/null +++ b/lib/math/Makefile @@ -0,0 +1,5 @@ +obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o + +obj-$(CONFIG_CORDIC) += cordic.o +obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o +obj-$(CONFIG_RATIONAL) += rational.o diff --git a/lib/cordic.c b/lib/math/cordic.c index 8ef27c12956f..8ef27c12956f 100644 --- a/lib/cordic.c +++ b/lib/math/cordic.c diff --git a/lib/div64.c b/lib/math/div64.c index ee146bb4c558..368ca7fd0d82 100644 --- a/lib/div64.c +++ b/lib/math/div64.c @@ -10,7 +10,7 @@ * Generic C version of 64bit/32bit division and modulo, with * 64bit result and 32bit remainder. * - * The fast case for (n>>32 == 0) is handled inline by do_div(). + * The fast case for (n>>32 == 0) is handled inline by do_div(). * * Code generated for this function might be very inefficient * for some CPUs. __div64_32() can be overridden by linking arch-specific diff --git a/lib/gcd.c b/lib/math/gcd.c index 7948ab27f0a4..7948ab27f0a4 100644 --- a/lib/gcd.c +++ b/lib/math/gcd.c diff --git a/lib/math/int_pow.c b/lib/math/int_pow.c new file mode 100644 index 000000000000..622fc1ab3c74 --- /dev/null +++ b/lib/math/int_pow.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * An integer based power function + * + * Derived from drivers/video/backlight/pwm_bl.c + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/types.h> + +/** + * int_pow - computes the exponentiation of the given base and exponent + * @base: base which will be raised to the given power + * @exp: power to be raised to + * + * Computes: pow(base, exp), i.e. @base raised to the @exp power + */ +u64 int_pow(u64 base, unsigned int exp) +{ + u64 result = 1; + + while (exp) { + if (exp & 1) + result *= base; + exp >>= 1; + base *= base; + } + + return result; +} +EXPORT_SYMBOL_GPL(int_pow); diff --git a/lib/int_sqrt.c b/lib/math/int_sqrt.c index 30e0f9770f88..30e0f9770f88 100644 --- a/lib/int_sqrt.c +++ b/lib/math/int_sqrt.c diff --git a/lib/lcm.c b/lib/math/lcm.c index 03d7fcb420b5..03d7fcb420b5 100644 --- a/lib/lcm.c +++ b/lib/math/lcm.c diff --git a/lib/prime_numbers.c b/lib/math/prime_numbers.c index 550eec457c2e..550eec457c2e 100644 --- a/lib/prime_numbers.c +++ b/lib/math/prime_numbers.c diff --git a/lib/math/rational.c b/lib/math/rational.c new file mode 100644 index 000000000000..31fb27db2deb --- /dev/null +++ b/lib/math/rational.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * rational fractions + * + * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <oskar@scara.com> + * Copyright (C) 2019 Trent Piepho <tpiepho@gmail.com> + * + * helper functions when coping with rational numbers + */ + +#include <linux/rational.h> +#include <linux/compiler.h> +#include <linux/export.h> +#include <linux/kernel.h> + +/* + * calculate best rational approximation for a given fraction + * taking into account restricted register size, e.g. to find + * appropriate values for a pll with 5 bit denominator and + * 8 bit numerator register fields, trying to set up with a + * frequency ratio of 3.1415, one would say: + * + * rational_best_approximation(31415, 10000, + * (1 << 8) - 1, (1 << 5) - 1, &n, &d); + * + * you may look at given_numerator as a fixed point number, + * with the fractional part size described in given_denominator. + * + * for theoretical background, see: + * http://en.wikipedia.org/wiki/Continued_fraction + */ + +void rational_best_approximation( + unsigned long given_numerator, unsigned long given_denominator, + unsigned long max_numerator, unsigned long max_denominator, + unsigned long *best_numerator, unsigned long *best_denominator) +{ + /* n/d is the starting rational, which is continually + * decreased each iteration using the Euclidean algorithm. + * + * dp is the value of d from the prior iteration. + * + * n2/d2, n1/d1, and n0/d0 are our successively more accurate + * approximations of the rational. They are, respectively, + * the current, previous, and two prior iterations of it. + * + * a is current term of the continued fraction. + */ + unsigned long n, d, n0, d0, n1, d1, n2, d2; + n = given_numerator; + d = given_denominator; + n0 = d1 = 0; + n1 = d0 = 1; + + for (;;) { + unsigned long dp, a; + + if (d == 0) + break; + /* Find next term in continued fraction, 'a', via + * Euclidean algorithm. + */ + dp = d; + a = n / d; + d = n % d; + n = dp; + + /* Calculate the current rational approximation (aka + * convergent), n2/d2, using the term just found and + * the two prior approximations. + */ + n2 = n0 + a * n1; + d2 = d0 + a * d1; + + /* If the current convergent exceeds the maxes, then + * return either the previous convergent or the + * largest semi-convergent, the final term of which is + * found below as 't'. + */ + if ((n2 > max_numerator) || (d2 > max_denominator)) { + unsigned long t = min((max_numerator - n0) / n1, + (max_denominator - d0) / d1); + + /* This tests if the semi-convergent is closer + * than the previous convergent. + */ + if (2u * t > a || (2u * t == a && d0 * dp > d1 * d)) { + n1 = n0 + t * n1; + d1 = d0 + t * d1; + } + break; + } + n0 = n1; + n1 = n2; + d0 = d1; + d1 = d2; + } + *best_numerator = n1; + *best_denominator = d1; +} + +EXPORT_SYMBOL(rational_best_approximation); diff --git a/lib/reciprocal_div.c b/lib/math/reciprocal_div.c index bf043258fa00..bf043258fa00 100644 --- a/lib/reciprocal_div.c +++ b/lib/math/reciprocal_div.c diff --git a/lib/plist.c b/lib/plist.c index 199408f91057..d3bd8827186f 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -26,7 +26,7 @@ #include <linux/bug.h> #include <linux/plist.h> -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST static struct plist_head test_head; @@ -173,7 +173,7 @@ void plist_requeue(struct plist_node *node, struct plist_head *head) plist_check_head(head); } -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/module.h> diff --git a/lib/rational.c b/lib/rational.c deleted file mode 100644 index ba7443677c90..000000000000 --- a/lib/rational.c +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * rational fractions - * - * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <oskar@scara.com> - * - * helper functions when coping with rational numbers - */ - -#include <linux/rational.h> -#include <linux/compiler.h> -#include <linux/export.h> - -/* - * calculate best rational approximation for a given fraction - * taking into account restricted register size, e.g. to find - * appropriate values for a pll with 5 bit denominator and - * 8 bit numerator register fields, trying to set up with a - * frequency ratio of 3.1415, one would say: - * - * rational_best_approximation(31415, 10000, - * (1 << 8) - 1, (1 << 5) - 1, &n, &d); - * - * you may look at given_numerator as a fixed point number, - * with the fractional part size described in given_denominator. - * - * for theoretical background, see: - * http://en.wikipedia.org/wiki/Continued_fraction - */ - -void rational_best_approximation( - unsigned long given_numerator, unsigned long given_denominator, - unsigned long max_numerator, unsigned long max_denominator, - unsigned long *best_numerator, unsigned long *best_denominator) -{ - unsigned long n, d, n0, d0, n1, d1; - n = given_numerator; - d = given_denominator; - n0 = d1 = 0; - n1 = d0 = 1; - for (;;) { - unsigned long t, a; - if ((n1 > max_numerator) || (d1 > max_denominator)) { - n1 = n0; - d1 = d0; - break; - } - if (d == 0) - break; - t = d; - a = n / d; - d = n % d; - n = t; - t = n0 + a * n1; - n0 = n1; - n1 = t; - t = d0 + a * d1; - d0 = d1; - d1 = t; - } - *best_numerator = n1; - *best_denominator = d1; -} - -EXPORT_SYMBOL(rational_best_approximation); diff --git a/lib/sort.c b/lib/sort.c index d6b7a202b0b6..50855ea8c262 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -1,8 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* - * A fast, small, non-recursive O(nlog n) sort for the Linux kernel + * A fast, small, non-recursive O(n log n) sort for the Linux kernel * - * Jan 23 2005 Matt Mackall <mpm@selenic.com> + * This performs n*log2(n) + 0.37*n + o(n) comparisons on average, + * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case. + * + * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n + * better) at the expense of stack usage and much larger code to avoid + * quicksort's O(n^2) worst case. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -11,35 +16,155 @@ #include <linux/export.h> #include <linux/sort.h> -static int alignment_ok(const void *base, int align) +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) { - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; } -static void u32_swap(void *a, void *b, int size) +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a, @b: pointers to the elements + * @size: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is stil valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static void swap_words_32(void *a, void *b, size_t n) { - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); } -static void u64_swap(void *a, void *b, int size) +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a, @b: pointers to the elements + * @size: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static void swap_words_64(void *a, void *b, size_t n) { - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); } -static void generic_swap(void *a, void *b, int size) +/** + * swap_bytes - swap two elements a byte at a time + * @a, @b: pointers to the elements + * @size: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static void swap_bytes(void *a, void *b, size_t n) { - char t; - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +typedef void (*swap_func_t)(void *a, void *b, int size); + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 (swap_func_t)0 +#define SWAP_WORDS_32 (swap_func_t)1 +#define SWAP_BYTES (swap_func_t)2 + +/* + * The function pointer is last to make tail calls most efficient if the + * compiler decides not to inline this function. + */ +static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func) +{ + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, (int)size); +} + +/** + * parent - given the offset of the child, find the offset of the parent. + * @i: the offset of the heap element whose parent is sought. Non-zero. + * @lsbit: a precomputed 1-bit mask, equal to "size & -size" + * @size: size of each element + * + * In terms of array indexes, the parent of element j = @i/@size is simply + * (j-1)/2. But when working in byte offsets, we can't use implicit + * truncation of integer divides. + * + * Fortunately, we only need one bit of the quotient, not the full divide. + * @size has a least significant bit. That bit will be clear if @i is + * an even multiple of @size, and set if it's an odd multiple. + * + * Logically, we're doing "if (i & lsbit) i -= size;", but since the + * branch is unpredictable, it's done with a bit of clever branch-free + * code instead. + */ +__attribute_const__ __always_inline +static size_t parent(size_t i, unsigned int lsbit, size_t size) +{ + i -= size; + i -= size & -(i & lsbit); + return i / 2; } /** @@ -50,57 +175,78 @@ static void generic_swap(void *a, void *b, int size) * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * - * This function does a heapsort on the given array. You may provide a - * swap_func function optimized to your element type. + * This function does a heapsort on the given array. You may provide + * a swap_func function if you need to do something more than a memory + * copy (e.g. fix up pointers or auxiliary data), but the built-in swap + * avoids a slow retpoline and so is significantly faster. * * Sorting time is O(n log n) both on average and worst-case. While - * qsort is about 20% faster on average, it suffers from exploitable + * quicksort is slightly faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make * it less suitable for kernel use. */ - void sort(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *), void (*swap_func)(void *, void *, int size)) { /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; + size_t n = num * size, a = (num/2) * size; + const unsigned int lsbit = size & -size; /* Used to find parent */ + + if (!a) /* num < 2 || size == 0 */ + return; if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; else - swap_func = generic_swap; + swap_func = SWAP_BYTES; } - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size) < 0) - c += size; - if (cmp_func(base + r, base + c) >= 0) - break; - swap_func(base + r, base + c, size); - } - } + /* + * Loop invariants: + * 1. elements [a,n) satisfy the heap property (compare greater than + * all of their children), + * 2. elements [n,num*size) are sorted, and + * 3. a <= b <= c <= d <= n (whenever they are valid). + */ + for (;;) { + size_t b, c, d; + + if (a) /* Building heap: sift down --a */ + a -= size; + else if (n -= size) /* Sorting: Extract root to --n */ + do_swap(base, base + n, size, swap_func); + else /* Sort complete */ + break; - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size) < 0) - c += size; - if (cmp_func(base + r, base + c) >= 0) - break; - swap_func(base + r, base + c, size); + /* + * Sift element at "a" down into heap. This is the + * "bottom-up" variant, which significantly reduces + * calls to cmp_func(): we find the sift-down path all + * the way to the leaves (one compare per level), then + * backtrack to find where to insert the target element. + * + * Because elements tend to sift down close to the leaves, + * this uses fewer compares than doing two per level + * on the way down. (A bit more than half as many on + * average, 3/4 worst-case.) + */ + for (b = a; c = 2*b + size, (d = c + size) < n;) + b = cmp_func(base + c, base + d) >= 0 ? c : d; + if (d == n) /* Special case last leaf with no sibling */ + b = c; + + /* Now backtrack from "b" to the correct location for "a" */ + while (b != a && cmp_func(base + a, base + b) >= 0) + b = parent(b, lsbit, size); + c = b; /* Where "a" belongs */ + while (b != a) { /* Shift it into place */ + b = parent(b, lsbit, size); + do_swap(base + b, base + c, size, swap_func); } } } - EXPORT_SYMBOL(sort); diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 792d90608052..d3a501f2a81a 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -11,6 +11,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/uaccess.h> #include "../tools/testing/selftests/kselftest_module.h" @@ -226,7 +227,8 @@ static const unsigned long exp[] __initconst = { BITMAP_FROM_U64(0xffffffff), BITMAP_FROM_U64(0xfffffffe), BITMAP_FROM_U64(0x3333333311111111ULL), - BITMAP_FROM_U64(0xffffffff77777777ULL) + BITMAP_FROM_U64(0xffffffff77777777ULL), + BITMAP_FROM_U64(0), }; static const unsigned long exp2[] __initconst = { @@ -249,55 +251,93 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = { {0, "1-31:4/4", &exp[9 * step], 32, 0}, {0, "0-31:1/4,32-63:2/4", &exp[10 * step], 64, 0}, {0, "0-31:3/4,32-63:4/4", &exp[11 * step], 64, 0}, + {0, " ,, 0-31:3/4 ,, 32-63:4/4 ,, ", &exp[11 * step], 64, 0}, {0, "0-31:1/4,32-63:2/4,64-95:3/4,96-127:4/4", exp2, 128, 0}, {0, "0-2047:128/256", NULL, 2048, PARSE_TIME}, + {0, "", &exp[12 * step], 8, 0}, + {0, "\n", &exp[12 * step], 8, 0}, + {0, ",, ,, , , ,", &exp[12 * step], 8, 0}, + {0, " , ,, , , ", &exp[12 * step], 8, 0}, + {0, " , ,, , , \n", &exp[12 * step], 8, 0}, + {-EINVAL, "-1", NULL, 8, 0}, {-EINVAL, "-0", NULL, 8, 0}, {-EINVAL, "10-1", NULL, 8, 0}, {-EINVAL, "0-31:", NULL, 8, 0}, {-EINVAL, "0-31:0", NULL, 8, 0}, + {-EINVAL, "0-31:0/", NULL, 8, 0}, {-EINVAL, "0-31:0/0", NULL, 8, 0}, {-EINVAL, "0-31:1/0", NULL, 8, 0}, {-EINVAL, "0-31:10/1", NULL, 8, 0}, + {-EOVERFLOW, "0-98765432123456789:10/1", NULL, 8, 0}, + + {-EINVAL, "a-31", NULL, 8, 0}, + {-EINVAL, "0-a1", NULL, 8, 0}, + {-EINVAL, "a-31:10/1", NULL, 8, 0}, + {-EINVAL, "0-31:a/1", NULL, 8, 0}, + {-EINVAL, "0-\n", NULL, 8, 0}, }; -static void __init test_bitmap_parselist(void) +static void __init __test_bitmap_parselist(int is_user) { int i; int err; - cycles_t cycles; + ktime_t time; DECLARE_BITMAP(bmap, 2048); + char *mode = is_user ? "_user" : ""; for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) { #define ptest parselist_tests[i] - cycles = get_cycles(); - err = bitmap_parselist(ptest.in, bmap, ptest.nbits); - cycles = get_cycles() - cycles; + if (is_user) { + mm_segment_t orig_fs = get_fs(); + size_t len = strlen(ptest.in); + + set_fs(KERNEL_DS); + time = ktime_get(); + err = bitmap_parselist_user(ptest.in, len, + bmap, ptest.nbits); + time = ktime_get() - time; + set_fs(orig_fs); + } else { + time = ktime_get(); + err = bitmap_parselist(ptest.in, bmap, ptest.nbits); + time = ktime_get() - time; + } if (err != ptest.errno) { - pr_err("test %d: input is %s, errno is %d, expected %d\n", - i, ptest.in, err, ptest.errno); + pr_err("parselist%s: %d: input is %s, errno is %d, expected %d\n", + mode, i, ptest.in, err, ptest.errno); continue; } if (!err && ptest.expected && !__bitmap_equal(bmap, ptest.expected, ptest.nbits)) { - pr_err("test %d: input is %s, result is 0x%lx, expected 0x%lx\n", - i, ptest.in, bmap[0], *ptest.expected); + pr_err("parselist%s: %d: input is %s, result is 0x%lx, expected 0x%lx\n", + mode, i, ptest.in, bmap[0], + *ptest.expected); continue; } if (ptest.flags & PARSE_TIME) - pr_err("test %d: input is '%s' OK, Time: %llu\n", - i, ptest.in, - (unsigned long long)cycles); + pr_err("parselist%s: %d: input is '%s' OK, Time: %llu\n", + mode, i, ptest.in, time); } } +static void __init test_bitmap_parselist(void) +{ + __test_bitmap_parselist(0); +} + +static void __init test_bitmap_parselist_user(void) +{ + __test_bitmap_parselist(1); +} + #define EXP_BYTES (sizeof(exp) * 8) static void __init test_bitmap_arr32(void) @@ -370,6 +410,7 @@ static void __init selftest(void) test_copy(); test_bitmap_arr32(); test_bitmap_parselist(); + test_bitmap_parselist_user(); test_mem_optimisations(); } diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 3dd801c1c85b..566dad3f4196 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -47,6 +47,9 @@ struct test_sysctl_data { unsigned int uint_0001; char string_0001[65]; + +#define SYSCTL_TEST_BITMAP_SIZE 65536 + unsigned long *bitmap_0001; }; static struct test_sysctl_data test_data = { @@ -102,6 +105,13 @@ static struct ctl_table test_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, + { + .procname = "bitmap_0001", + .data = &test_data.bitmap_0001, + .maxlen = SYSCTL_TEST_BITMAP_SIZE, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { } }; @@ -129,15 +139,21 @@ static struct ctl_table_header *test_sysctl_header; static int __init test_sysctl_init(void) { + test_data.bitmap_0001 = kzalloc(SYSCTL_TEST_BITMAP_SIZE/8, GFP_KERNEL); + if (!test_data.bitmap_0001) + return -ENOMEM; test_sysctl_header = register_sysctl_table(test_sysctl_root_table); - if (!test_sysctl_header) + if (!test_sysctl_header) { + kfree(test_data.bitmap_0001); return -ENOMEM; + } return 0; } late_initcall(test_sysctl_init); static void __exit test_sysctl_exit(void) { + kfree(test_data.bitmap_0001); if (test_sysctl_header) unregister_sysctl_table(test_sysctl_header); } diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index f832b095afba..8bbefcaddfe8 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -384,12 +384,11 @@ static int test_func(void *private) { struct test_driver *t = private; int random_array[ARRAY_SIZE(test_case_array)]; - int index, i, j, ret; + int index, i, j; ktime_t kt; u64 delta; - ret = set_cpus_allowed_ptr(current, cpumask_of(t->cpu)); - if (ret < 0) + if (set_cpus_allowed_ptr(current, cpumask_of(t->cpu)) < 0) pr_err("Failed to set affinity to %d CPU\n", t->cpu); for (i = 0; i < ARRAY_SIZE(test_case_array); i++) @@ -415,8 +414,7 @@ static int test_func(void *private) kt = ktime_get(); for (j = 0; j < test_repeat_count; j++) { - ret = test_case_array[index].test_func(); - if (!ret) + if (!test_case_array[index].test_func()) per_cpu_test_data[t->cpu][index].test_passed++; else per_cpu_test_data[t->cpu][index].test_failed++; diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..ee8d1f311858 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -11,23 +11,24 @@ choice default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT default FLATMEM_MANUAL + help + This option allows you to change some of the ways that + Linux manages its memory internally. Most users will + only have one option here selected by the architecture + configuration. This is normal. config FLATMEM_MANUAL bool "Flat Memory" depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE help - This option allows you to change some of the ways that - Linux manages its memory internally. Most users will - only have one option here: FLATMEM. This is normal - and a correct option. - - Some users of more advanced features like NUMA and - memory hotplug may have different options here. - DISCONTIGMEM is a more mature, better tested system, - but is incompatible with memory hotplug and may suffer - decreased performance over SPARSEMEM. If unsure between - "Sparse Memory" and "Discontiguous Memory", choose - "Discontiguous Memory". + This option is best suited for non-NUMA systems with + flat address space. The FLATMEM is the most efficient + system in terms of performance and resource consumption + and it is the best option for smaller systems. + + For systems that have holes in their physical address + spaces and for features like NUMA and memory hotplug, + choose "Sparse Memory" If unsure, choose this option (Flat Memory) over any other. @@ -38,29 +39,26 @@ config DISCONTIGMEM_MANUAL This option provides enhanced support for discontiguous memory systems, over FLATMEM. These systems have holes in their physical address spaces, and this option provides - more efficient handling of these holes. However, the vast - majority of hardware has quite flat address spaces, and - can have degraded performance from the extra overhead that - this option imposes. + more efficient handling of these holes. - Many NUMA configurations will have this as the only option. + Although "Discontiguous Memory" is still used by several + architectures, it is considered deprecated in favor of + "Sparse Memory". - If unsure, choose "Flat Memory" over this option. + If unsure, choose "Sparse Memory" over this option. config SPARSEMEM_MANUAL bool "Sparse Memory" depends on ARCH_SPARSEMEM_ENABLE help This will be the only option for some systems, including - memory hotplug systems. This is normal. + memory hot-plug systems. This is normal. - For many other systems, this will be an alternative to - "Discontiguous Memory". This option provides some potential - performance benefits, along with decreased code complexity, - but it is newer, and more experimental. + This option provides efficient support for systems with + holes is their physical address space and allows memory + hot-plug and hot-remove. - If unsure, choose "Discontiguous Memory" or "Flat Memory" - over this option. + If unsure, choose "Flat Memory" over this option. endchoice @@ -136,7 +134,7 @@ config HAVE_MEMBLOCK_PHYS_MAP config HAVE_GENERIC_GUP bool -config ARCH_DISCARD_MEMBLOCK +config ARCH_KEEP_MEMBLOCK bool config MEMORY_ISOLATION @@ -161,7 +159,6 @@ config MEMORY_HOTPLUG_SPARSE config MEMORY_HOTPLUG_DEFAULT_ONLINE bool "Online the newly added memory blocks by default" - default n depends on MEMORY_HOTPLUG help This option sets the default policy setting for memory hotplug @@ -258,6 +255,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config CONTIG_ALLOC + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + config PHYS_ADDR_T_64BIT def_bool 64BIT @@ -436,7 +436,6 @@ config NEED_PER_CPU_KM config CLEANCACHE bool "Enable cleancache driver to cache clean pages if tmem is present" - default n help Cleancache can be thought of as a page-granularity victim cache for clean pages that the kernel's pageframe replacement algorithm @@ -460,7 +459,6 @@ config CLEANCACHE config FRONTSWAP bool "Enable frontswap to cache swap pages if tmem is present" depends on SWAP - default n help Frontswap is so named because it can be thought of as the opposite of a "backing" store for a swap device. The data is stored into @@ -532,7 +530,6 @@ config ZSWAP depends on FRONTSWAP && CRYPTO=y select CRYPTO_LZO select ZPOOL - default n help A lightweight compressed cache for swap pages. It takes pages that are in the process of being swapped out and attempts to @@ -549,14 +546,12 @@ config ZSWAP config ZPOOL tristate "Common API for compressed memory storage" - default n help Compressed memory storage API. This allows using either zbud or zsmalloc. config ZBUD tristate "Low (Up to 2x) density storage for compressed pages" - default n help A special purpose allocator for storing compressed pages. It is designed to store up to two compressed pages per physical @@ -567,7 +562,6 @@ config ZBUD config Z3FOLD tristate "Up to 3x density storage for compressed pages" depends on ZPOOL - default n help A special purpose allocator for storing compressed pages. It is designed to store up to three compressed pages per physical @@ -577,7 +571,6 @@ config Z3FOLD config ZSMALLOC tristate "Memory allocator for compressed pages" depends on MMU - default n help zsmalloc is a slab-based memory allocator designed to store compressed RAM pages. zsmalloc uses virtual memory mapping @@ -628,7 +621,6 @@ config MAX_STACK_SIZE_MB config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" - default n depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT @@ -676,6 +668,22 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM_MIRROR + bool + default y + depends on (X86_64 || PPC64) + depends on MMU && 64BIT + +config ARCH_HAS_HMM_DEVICE + bool + default y + depends on (X86_64 || PPC64) + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + depends on ARCH_HAS_ZONE_DEVICE + select XARRAY_MULTI + config ARCH_HAS_HMM bool default y @@ -694,12 +702,12 @@ config DEV_PAGEMAP_OPS config HMM bool + select MMU_NOTIFIER select MIGRATE_VMA_HELPER config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" depends on ARCH_HAS_HMM - select MMU_NOTIFIER select HMM help Select HMM_MIRROR if you want to mirror range of the CPU page table of a @@ -740,7 +748,6 @@ config ARCH_HAS_PKEYS config PERCPU_STATS bool "Collect percpu memory statistics" - default n help This feature collects and exposes statistics via debugfs. The information includes global and per chunk statistics, which can @@ -748,7 +755,6 @@ config PERCPU_STATS config GUP_BENCHMARK bool "Enable infrastructure for get_user_pages_fast() benchmarking" - default n help Provides /sys/kernel/debug/gup_benchmark that helps with testing performance of get_user_pages_fast(). diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index e3df921208c0..e980ceb775a4 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -33,7 +33,6 @@ config DEBUG_PAGEALLOC config DEBUG_PAGEALLOC_ENABLE_DEFAULT bool "Enable debug page memory allocations by default?" - default n depends on DEBUG_PAGEALLOC ---help--- Enable debug page memory allocations by default? This value diff --git a/mm/Makefile b/mm/Makefile index d210cc9d6f80..ac5e5ba78874 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o \ + maccess.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ @@ -41,6 +41,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) +# Give 'page_alloc' its own module-parameter namespace +page-alloc-y := page_alloc.o +page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o + +obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o @@ -106,8 +106,10 @@ static int __init cma_activate_area(struct cma *cma) cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!cma->bitmap) + if (!cma->bitmap) { + cma->count = 0; return -ENOMEM; + } WARN_ON_ONCE(!pfn_valid(pfn)); zone = page_zone(pfn_to_page(pfn)); @@ -367,23 +369,26 @@ err: #ifdef CONFIG_CMA_DEBUG static void cma_debug_show_areas(struct cma *cma) { - unsigned long next_zero_bit, next_set_bit; + unsigned long next_zero_bit, next_set_bit, nr_zero; unsigned long start = 0; - unsigned int nr_zero, nr_total = 0; + unsigned long nr_part, nr_total = 0; + unsigned long nbits = cma_bitmap_maxno(cma); mutex_lock(&cma->lock); pr_info("number of available pages: "); for (;;) { - next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start); - if (next_zero_bit >= cma->count) + next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); + if (next_zero_bit >= nbits) break; - next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit); + next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); nr_zero = next_set_bit - next_zero_bit; - pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit); - nr_total += nr_zero; + nr_part = nr_zero << cma->order_per_bit; + pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, + next_zero_bit); + nr_total += nr_part; start = next_zero_bit + nr_zero; } - pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count); + pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); mutex_unlock(&cma->lock); } #else diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 8d7b2fd52225..a7dd9e8e10d5 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -56,7 +56,7 @@ static int cma_maxchunk_get(void *data, u64 *val) mutex_lock(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); - if (start >= cma->count) + if (start >= bitmap_maxno) break; end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); diff --git a/mm/compaction.c b/mm/compaction.c index 3319e0872d01..cbac7277978a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1164,7 +1164,9 @@ static bool suitable_migration_target(struct compact_control *cc, static inline unsigned int freelist_scan_limit(struct compact_control *cc) { - return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; + unsigned short shift = BITS_PER_LONG - 1; + + return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1; } /* @@ -1886,13 +1888,13 @@ static enum compact_result __compact_finished(struct compact_control *cc) bool can_steal; /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[migratetype])) + if (!free_area_empty(area, migratetype)) return COMPACT_SUCCESS; #ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && - !list_empty(&area->free_list[MIGRATE_CMA])) + !free_area_empty(area, MIGRATE_CMA)) return COMPACT_SUCCESS; #endif /* diff --git a/mm/debug.c b/mm/debug.c index eee9c221280c..b9cd71927d3c 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm) #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" - "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" + "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %llx\n" "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" @@ -167,7 +167,8 @@ void dump_mm(const struct mm_struct *mm) atomic_read(&mm->mm_count), mm_pgtables_bytes(mm), mm->map_count, - mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, + mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, + (u64)atomic64_read(&mm->locked_vm), (u64)atomic64_read(&mm->pinned_vm), mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, diff --git a/mm/filemap.c b/mm/filemap.c index d78f577baef2..3ad18fa56057 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -24,6 +24,7 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/uio.h> +#include <linux/error-injection.h> #include <linux/hash.h> #include <linux/writeback.h> #include <linux/backing-dev.h> @@ -279,11 +280,11 @@ EXPORT_SYMBOL(delete_from_page_cache); * @pvec: pagevec with pages to delete * * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index. + * from the mapping. The function expects @pvec to be sorted by page index + * and is optimised for it to be dense. * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the - * mapping as well. + * @pvec. * * The function expects the i_pages lock to be held. */ @@ -292,40 +293,44 @@ static void page_cache_delete_batch(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; - int i = 0, tail_pages = 0; + int i = 0; struct page *page; mapping_set_update(&xas, mapping); xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec) && !tail_pages) + if (i >= pagevec_count(pvec)) break; + + /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(page)) continue; - if (!tail_pages) { - /* - * Some page got inserted in our range? Skip it. We - * have our pages locked so they are protected from - * being removed. - */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > - pvec->pages[i]->index, page); - continue; - } - WARN_ON_ONCE(!PageLocked(page)); - if (PageTransHuge(page) && !PageHuge(page)) - tail_pages = HPAGE_PMD_NR - 1; + /* + * A page got inserted in our range? Skip it. We have our + * pages locked so they are protected from being removed. + * If we see a page whose index is higher than ours, it + * means our page has been removed, which shouldn't be + * possible because we're holding the PageLock. + */ + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, + page); + continue; + } + + WARN_ON_ONCE(!PageLocked(page)); + + if (page->index == xas.xa_index) page->mapping = NULL; - /* - * Leave page->index set: truncation lookup relies - * upon it - */ + /* Leave page->index set: truncation lookup relies on it */ + + /* + * Move to the next page in the vector if this is a regular + * page or the index is of the last sub-page of this compound + * page. + */ + if (page->index + (1UL << compound_order(page)) - 1 == + xas.xa_index) i++; - } else { - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages - != pvec->pages[i]->index, page); - tail_pages--; - } xas_store(&xas, NULL); total_pages++; } @@ -878,6 +883,7 @@ error: put_page(page); return xas_error(&xas); } +ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); /** * add_to_page_cache_locked - add a locked page to the pagecache @@ -1440,7 +1446,7 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_miss() - Find the next gap in the page cache. + * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. @@ -1491,7 +1497,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { XA_STATE(xas, &mapping->i_pages, offset); - struct page *head, *page; + struct page *page; rcu_read_lock(); repeat: @@ -1506,25 +1512,19 @@ repeat: if (!page || xa_is_value(page)) goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } - /* - * Has the page moved? + * Has the page moved or been split? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != xas_reload(&xas))) { - put_page(head); + put_page(page); goto repeat; } + page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1706,7 +1706,6 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, ULONG_MAX) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1717,17 +1716,13 @@ unsigned find_get_entries(struct address_space *mapping, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -1736,7 +1731,7 @@ export: break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1778,33 +1773,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); xas_for_each(&xas, page, end) { - struct page *head; if (xas_retry(&xas, page)) continue; /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1849,7 +1838,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1859,24 +1847,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (xa_is_value(page)) break; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1912,7 +1895,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); xas_for_each_marked(&xas, page, end, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1923,26 +1905,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *index = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1991,7 +1968,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, rcu_read_lock(); xas_for_each_marked(&xas, page, ULONG_MAX, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -2002,17 +1978,13 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -2021,7 +1993,7 @@ export: break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -2691,7 +2663,7 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct page *page; rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2700,24 +2672,19 @@ void filemap_map_pages(struct vm_fault *vmf, if (xa_is_value(page)) goto next; - head = compound_head(page); - /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(head)) + if (PageLocked(page)) goto next; - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto next; - /* The page was split under us? */ - if (compound_head(page) != head) - goto skip; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto skip; + page = find_subpage(page, xas.xa_index); if (!PageUptodate(page) || PageReadahead(page) || @@ -28,6 +28,111 @@ struct follow_page_context { unsigned int page_mask; }; +typedef int (*set_dirty_func_t)(struct page *page); + +static void __put_user_pages_dirty(struct page **pages, + unsigned long npages, + set_dirty_func_t sdf) +{ + unsigned long index; + + for (index = 0; index < npages; index++) { + struct page *page = compound_head(pages[index]); + + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key cases: + * + * 1) This code sees the page as already dirty, so it skips + * the call to sdf(). That could happen because + * clear_page_dirty_for_io() called page_mkclean(), + * followed by set_page_dirty(). However, now the page is + * going to get written back, which meets the original + * intention of setting it dirty, so all is well: + * clear_page_dirty_for_io() goes on to call + * TestClearPageDirty(), and write the page back. + * + * 2) This code sees the page as clean, so it calls sdf(). + * The page stays dirty, despite being written back, so it + * gets written back again in the next writeback cycle. + * This is harmless. + */ + if (!PageDirty(page)) + sdf(page); + + put_user_page(page); + } +} + +/** + * put_user_pages_dirty() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * "gup-pinned page" refers to a page that has had one of the get_user_pages() + * variants called on that page. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * set_page_dirty(), which does not lock the page, is used here. + * Therefore, it is the caller's responsibility to ensure that this is + * safe. If not, then put_user_pages_dirty_lock() should be called instead. + * + */ +void put_user_pages_dirty(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty); +} +EXPORT_SYMBOL(put_user_pages_dirty); + +/** + * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * This is just like put_user_pages_dirty(), except that it invokes + * set_page_dirty_lock(), instead of set_page_dirty(). + * + */ +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty_lock); +} +EXPORT_SYMBOL(put_user_pages_dirty_lock); + +/** + * put_user_pages() - release an array of gup-pinned pages. + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, release the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + */ +void put_user_pages(struct page **pages, unsigned long npages) +{ + unsigned long index; + + /* + * TODO: this can be optimized for huge pages: if a series of pages is + * physically contiguous and part of the same compound page, then a + * single operation to the head page should suffice. + */ + for (index = 0; index < npages; index++) + put_user_page(pages[index]); +} +EXPORT_SYMBOL(put_user_pages); + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -1018,6 +1123,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); @@ -1046,6 +1160,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + down_read(&mm->mmap_sem); ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); @@ -1116,32 +1239,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); -/* - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. - */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages); - #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) - -#ifdef CONFIG_FS_DAX static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { long i; @@ -1160,12 +1273,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } return false; } -#else -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - return false; -} -#endif #ifdef CONFIG_CMA static struct page *new_non_cma_page(struct page *page, unsigned long private) @@ -1219,10 +1326,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) return __alloc_pages_node(nid, gfp_mask, 0); } -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + unsigned int gup_flags) { long i; bool drain_allow = true; @@ -1278,10 +1388,14 @@ check_again: putback_movable_pages(&cma_page_list); } /* - * We did migrate all the pages, Try to get the page references again - * migrating any new CMA pages which we failed to isolate earlier. + * We did migrate all the pages, Try to get the page references + * again migrating any new CMA pages which we failed to isolate + * earlier. */ - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if ((nr_pages > 0) && migrate_allow) { drain_allow = true; goto check_again; @@ -1291,66 +1405,101 @@ check_again: return nr_pages; } #else -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, - struct page **pages, - struct vm_area_struct **vmas) +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } #endif /* - * This is the same as get_user_pages() in that it assumes we are - * operating on the current task's mm, but it goes further to validate - * that the vmas associated with the address range are suitable for - * longterm elevated page reference counts. For example, filesystem-dax - * mappings are subject to the lifetime enforced by the filesystem and - * we need guarantees that longterm users like RDMA and V4L2 only - * establish mappings that have a kernel enforced revocation mechanism. - * - * "longterm" == userspace controlled elevated page count lifetime. - * Contrast this to iov_iter_get_pages() usages which are transient. + * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which + * allows us to process the FOLL_LONGTERM flag. */ -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) +static long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { - struct vm_area_struct **vmas = vmas_arg; - unsigned long flags; + struct vm_area_struct **vmas_tmp = vmas; + unsigned long flags = 0; long rc, i; - if (!pages) - return -EINVAL; - - if (!vmas) { - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - return -ENOMEM; + if (gup_flags & FOLL_LONGTERM) { + if (!pages) + return -EINVAL; + + if (!vmas_tmp) { + vmas_tmp = kcalloc(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas_tmp) + return -ENOMEM; + } + flags = memalloc_nocma_save(); } - flags = memalloc_nocma_save(); - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - memalloc_nocma_restore(flags); - if (rc < 0) - goto out; + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + vmas_tmp, NULL, gup_flags); - if (check_dax_vmas(vmas, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; + if (gup_flags & FOLL_LONGTERM) { + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + vmas_tmp, gup_flags); } - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: - if (vmas != vmas_arg) - kfree(vmas); + if (vmas_tmp != vmas) + kfree(vmas_tmp); return rc; } -EXPORT_SYMBOL(get_user_pages_longterm); -#endif /* CONFIG_FS_DAX */ +#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, flags); +} +#endif /* CONFIG_FS_DAX || CONFIG_CMA */ + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); /** * populate_vma_page_range() - populate a range of pages in the vma. @@ -1571,7 +1720,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; @@ -1589,10 +1738,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (pte_protnone(pte)) goto pte_unmap; - if (!pte_access_permitted(pte, write)) + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { + if (unlikely(flags & FOLL_LONGTERM)) + goto pte_unmap; + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, pages); @@ -1641,7 +1793,7 @@ pte_unmap: * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { return 0; } @@ -1724,16 +1876,19 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, #endif static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (!pmd_access_permitted(orig, write)) + if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pmd_devmap(orig)) + if (pmd_devmap(orig)) { + if (unlikely(flags & FOLL_LONGTERM)) + return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); + } refs = 0; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1762,16 +1917,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (!pud_access_permitted(orig, write)) + if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pud_devmap(orig)) + if (pud_devmap(orig)) { + if (unlikely(flags & FOLL_LONGTERM)) + return 0; return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); + } refs = 0; page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); @@ -1800,13 +1958,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, int write, + unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *head, *page; - if (!pgd_access_permitted(orig, write)) + if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); @@ -1837,7 +1995,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, } static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; @@ -1860,7 +2018,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (pmd_protnone(pmd)) return 0; - if (!gup_huge_pmd(pmd, pmdp, addr, next, write, + if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; @@ -1870,9 +2028,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, - PMD_SHIFT, next, write, pages, nr)) + PMD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -1880,7 +2038,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, } static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; @@ -1893,14 +2051,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, if (pud_none(pud)) return 0; if (unlikely(pud_huge(pud))) { - if (!gup_huge_pud(pud, pudp, addr, next, write, + if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, - PUD_SHIFT, next, write, pages, nr)) + PUD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); @@ -1908,7 +2066,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, } static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; @@ -1923,9 +2081,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, - P4D_SHIFT, next, write, pages, nr)) + P4D_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -1933,7 +2091,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, } static void gup_pgd_range(unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; @@ -1946,14 +2104,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { - if (!gup_huge_pgd(pgd, pgdp, addr, next, write, + if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, - PGDIR_SHIFT, next, write, pages, nr)) + PGDIR_SHIFT, next, flags, pages, nr)) return; - } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr)) + } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -2007,18 +2165,41 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_save(flags); - gup_pgd_range(start, end, write, pages, &nr); + gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); } return nr; } +static int __gup_longterm_unlocked(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int ret; + + /* + * FIXME: FOLL_LONGTERM does not work with + * get_user_pages_unlocked() (see comments in that function) + */ + if (gup_flags & FOLL_LONGTERM) { + down_read(¤t->mm->mmap_sem); + ret = __gup_longterm_locked(current, current->mm, + start, nr_pages, + pages, NULL, gup_flags); + up_read(¤t->mm->mmap_sem); + } else { + ret = get_user_pages_unlocked(start, nr_pages, + pages, gup_flags); + } + + return ret; +} + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -2030,8 +2211,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; @@ -2049,7 +2230,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); - gup_pgd_range(addr, end, write, pages, &nr); + gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); ret = nr; } @@ -2059,8 +2240,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); + ret = __gup_longterm_unlocked(start, nr_pages - nr, + gup_flags, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6c0279e70cc4..7dd602d7f8db 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, pages + i); break; case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages_longterm(addr, nr, gup->flags & 1, - pages + i, NULL); + nr = get_user_pages(addr, nr, + (gup->flags & 1) | FOLL_LONGTERM, + pages + i, NULL); break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, @@ -30,6 +30,7 @@ #include <linux/hugetlb.h> #include <linux/memremap.h> #include <linux/jump_label.h> +#include <linux/dma-mapping.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> @@ -38,54 +39,48 @@ #if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -/* - * struct hmm - HMM per mm struct - * - * @mm: mm struct this HMM struct is bound to - * @lock: lock protecting ranges list - * @ranges: list of range being snapshotted - * @mirrors: list of mirrors for this mm - * @mmu_notifier: mmu notifier to track updates to CPU page table - * @mirrors_sem: read/write semaphore protecting the mirrors list - */ -struct hmm { - struct mm_struct *mm; - spinlock_t lock; - struct list_head ranges; - struct list_head mirrors; - struct mmu_notifier mmu_notifier; - struct rw_semaphore mirrors_sem; -}; +static inline struct hmm *mm_get_hmm(struct mm_struct *mm) +{ + struct hmm *hmm = READ_ONCE(mm->hmm); -/* - * hmm_register - register HMM against an mm (HMM internal) + if (hmm && kref_get_unless_zero(&hmm->kref)) + return hmm; + + return NULL; +} + +/** + * hmm_get_or_create - register HMM against an mm (HMM internal) * * @mm: mm struct to attach to + * Returns: returns an HMM object, either by referencing the existing + * (per-process) object, or by creating a new one. * - * This is not intended to be used directly by device drivers. It allocates an - * HMM struct if mm does not have one, and initializes it. + * This is not intended to be used directly by device drivers. If mm already + * has an HMM struct then it get a reference on it and returns it. Otherwise + * it allocates an HMM struct, initializes it, associate it with the mm and + * returns it. */ -static struct hmm *hmm_register(struct mm_struct *mm) +static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = READ_ONCE(mm->hmm); + struct hmm *hmm = mm_get_hmm(mm); bool cleanup = false; - /* - * The hmm struct can only be freed once the mm_struct goes away, - * hence we should always have pre-allocated an new hmm struct - * above. - */ if (hmm) return hmm; hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) return NULL; + init_waitqueue_head(&hmm->wq); INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); - spin_lock_init(&hmm->lock); + mutex_init(&hmm->lock); + kref_init(&hmm->kref); + hmm->notifiers = 0; + hmm->dead = false; hmm->mm = mm; spin_lock(&mm->page_table_lock); @@ -106,7 +101,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) goto error_mm; - return mm->hmm; + return hmm; error_mm: spin_lock(&mm->page_table_lock); @@ -118,54 +113,60 @@ error: return NULL; } -void hmm_mm_destroy(struct mm_struct *mm) +static void hmm_free(struct kref *kref) { - kfree(mm->hmm); -} + struct hmm *hmm = container_of(kref, struct hmm, kref); + struct mm_struct *mm = hmm->mm; -static int hmm_invalidate_range(struct hmm *hmm, bool device, - const struct hmm_update *update) -{ - struct hmm_mirror *mirror; - struct hmm_range *range; - - spin_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) { - unsigned long addr, idx, npages; + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); - if (update->end < range->start || update->start >= range->end) - continue; + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); - range->valid = false; - addr = max(update->start, range->start); - idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; - memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); - } - spin_unlock(&hmm->lock); + kfree(hmm); +} - if (!device) - return 0; +static inline void hmm_put(struct hmm *hmm) +{ + kref_put(&hmm->kref, hmm_free); +} - down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) { - int ret; +void hmm_mm_destroy(struct mm_struct *mm) +{ + struct hmm *hmm; - ret = mirror->ops->sync_cpu_device_pagetables(mirror, update); - if (!update->blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); - return -EAGAIN; - } + spin_lock(&mm->page_table_lock); + hmm = mm_get_hmm(mm); + mm->hmm = NULL; + if (hmm) { + hmm->mm = NULL; + hmm->dead = true; + spin_unlock(&mm->page_table_lock); + hmm_put(hmm); + return; } - up_read(&hmm->mirrors_sem); - return 0; + spin_unlock(&mm->page_table_lock); } static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { + struct hmm *hmm = mm_get_hmm(mm); struct hmm_mirror *mirror; - struct hmm *hmm = mm->hmm; + struct hmm_range *range; + + /* Report this HMM as dying. */ + hmm->dead = true; + + /* Wake-up everyone waiting on any range. */ + mutex_lock(&hmm->lock); + list_for_each_entry(range, &hmm->ranges, list) { + range->valid = false; + } + wake_up_all(&hmm->wq); + mutex_unlock(&hmm->lock); down_write(&hmm->mirrors_sem); mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, @@ -186,36 +187,86 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) struct hmm_mirror, list); } up_write(&hmm->mirrors_sem); + + hmm_put(hmm); } static int hmm_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) + const struct mmu_notifier_range *nrange) { + struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm_mirror *mirror; struct hmm_update update; - struct hmm *hmm = range->mm->hmm; + struct hmm_range *range; + int ret = 0; VM_BUG_ON(!hmm); - update.start = range->start; - update.end = range->end; + update.start = nrange->start; + update.end = nrange->end; update.event = HMM_UPDATE_INVALIDATE; - update.blockable = range->blockable; - return hmm_invalidate_range(hmm, true, &update); + update.blockable = mmu_notifier_range_blockable(nrange); + + if (mmu_notifier_range_blockable(nrange)) + mutex_lock(&hmm->lock); + else if (!mutex_trylock(&hmm->lock)) { + ret = -EAGAIN; + goto out; + } + hmm->notifiers++; + list_for_each_entry(range, &hmm->ranges, list) { + if (update.end < range->start || update.start >= range->end) + continue; + + range->valid = false; + } + mutex_unlock(&hmm->lock); + + if (mmu_notifier_range_blockable(nrange)) + down_read(&hmm->mirrors_sem); + else if (!down_read_trylock(&hmm->mirrors_sem)) { + ret = -EAGAIN; + goto out; + } + list_for_each_entry(mirror, &hmm->mirrors, list) { + int ret; + + ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + if (!update.blockable && ret == -EAGAIN) { + up_read(&hmm->mirrors_sem); + ret = -EAGAIN; + goto out; + } + } + up_read(&hmm->mirrors_sem); + +out: + hmm_put(hmm); + return ret; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) + const struct mmu_notifier_range *nrange) { - struct hmm_update update; - struct hmm *hmm = range->mm->hmm; + struct hmm *hmm = mm_get_hmm(nrange->mm); VM_BUG_ON(!hmm); - update.start = range->start; - update.end = range->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = true; - hmm_invalidate_range(hmm, false, &update); + mutex_lock(&hmm->lock); + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; + + list_for_each_entry(range, &hmm->ranges, list) { + if (range->valid) + continue; + range->valid = true; + } + wake_up_all(&hmm->wq); + } + mutex_unlock(&hmm->lock); + + hmm_put(hmm); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { @@ -241,24 +292,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; -again: - mirror->hmm = hmm_register(mm); + mirror->hmm = hmm_get_or_create(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - if (mirror->hmm->mm == NULL) { - /* - * A racing hmm_mirror_unregister() is about to destroy the hmm - * struct. Try again to allocate a new one. - */ - up_write(&mirror->hmm->mirrors_sem); - mirror->hmm = NULL; - goto again; - } else { - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); - } + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); return 0; } @@ -273,38 +313,24 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - bool should_unregister = false; - struct mm_struct *mm; - struct hmm *hmm; + struct hmm *hmm = READ_ONCE(mirror->hmm); - if (mirror->hmm == NULL) + if (hmm == NULL) return; - hmm = mirror->hmm; down_write(&hmm->mirrors_sem); list_del_init(&mirror->list); - should_unregister = list_empty(&hmm->mirrors); + /* To protect us against double unregister ... */ mirror->hmm = NULL; - mm = hmm->mm; - hmm->mm = NULL; up_write(&hmm->mirrors_sem); - if (!should_unregister || mm == NULL) - return; - - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); - - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); - - kfree(hmm); + hmm_put(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); struct hmm_vma_walk { struct hmm_range *range; + struct dev_pagemap *pgmap; unsigned long last; bool fault; bool block; @@ -323,13 +349,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) - return -EBUSY; + return -EAGAIN; if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } - return -EAGAIN; + return -EBUSY; } static int hmm_pfns_bad(unsigned long addr, @@ -355,7 +381,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * Returns: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -367,23 +393,25 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i, page_size; hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { + page_size = hmm_range_page_size(range); + i = (addr - range->start) >> range->page_shift; + + for (; addr < end; addr += page_size, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; ret = hmm_vma_do_fault(walk, addr, write_fault, &pfns[i]); - if (ret != -EAGAIN) + if (ret != -EBUSY) return ret; } } - return (fault || write_fault) ? -EAGAIN : 0; + return (fault || write_fault) ? -EBUSY : 0; } static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, @@ -392,10 +420,21 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { struct hmm_range *range = hmm_vma_walk->range; - *fault = *write_fault = false; if (!hmm_vma_walk->fault) return; + /* + * So we not only consider the individual per page request we also + * consider the default flags requested for the range. The API can + * be use in 2 fashions. The first one where the HMM user coalesce + * multiple page fault into one request and set flags per pfns for + * of those faults. The second one where the HMM user want to pre- + * fault a range with specific flags. For the latter one it is a + * waste to have the user pre-fill the pfn arrays with a default + * flags value. + */ + pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; @@ -431,10 +470,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, return; } + *fault = *write_fault = false; for (i = 0; i < npages; ++i) { hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, fault, write_fault); - if ((*fault) || (*write_fault)) + if ((*write_fault)) return; } } @@ -465,12 +505,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return pud_write(pud) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, uint64_t *pfns, pmd_t pmd) { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; @@ -486,10 +536,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + if (pmd_devmap(pmd)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } hmm_vma_walk->last = end; return 0; +#else + /* If THP is not enabled then we should never reach that code ! */ + return -EINVAL; +#endif } static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) @@ -514,11 +579,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t orig_pfn = *pfn; *pfn = range->values[HMM_PFN_NONE]; - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); + fault = write_fault = false; if (pte_none(pte)) { + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, + &fault, &write_fault); if (fault || write_fault) goto fault; return 0; @@ -546,7 +611,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, &fault, &write_fault); if (fault || write_fault) goto fault; - *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn = hmm_device_entry_from_pfn(range, + swp_offset(entry)); *pfn |= cpu_flags; return 0; } @@ -557,7 +623,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm, pmdp, addr); - return -EAGAIN; + return -EBUSY; } return 0; } @@ -565,15 +631,33 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, /* Report error for everything else */ *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; + } else { + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); } if (fault || write_fault) goto fault; - *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; + if (pte_devmap(pte)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + + *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); @@ -615,7 +699,7 @@ again: if (fault || write_fault) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(vma->vm_mm, pmdp); - return -EAGAIN; + return -EBUSY; } return 0; } else if (!pmd_present(pmd)) @@ -661,12 +745,158 @@ again: return r; } } + if (hmm_vma_walk->pgmap) { + /* + * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() + * so that we can leverage get_dev_pagemap() optimization which + * will not re-take a reference on a pgmap if we already have + * one. + */ + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; return 0; } +static int hmm_vma_walk_pud(pud_t *pudp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long addr = start, next; + pmd_t *pmdp; + pud_t pud; + int ret; + +again: + pud = READ_ONCE(*pudp); + if (pud_none(pud)) + return hmm_vma_walk_hole(start, end, walk); + + if (pud_huge(pud) && pud_devmap(pud)) { + unsigned long i, npages, pfn; + uint64_t *pfns, cpu_flags; + bool fault, write_fault; + + if (!pud_present(pud)) + return hmm_vma_walk_hole(start, end, walk); + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + + cpu_flags = pud_to_hmm_pfn_flags(range, pud); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + cpu_flags, &fault, &write_fault); + if (fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, + write_fault, walk); + +#ifdef CONFIG_HUGETLB_PAGE + pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + for (i = 0; i < npages; ++i, ++pfn) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | + cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } + hmm_vma_walk->last = end; + return 0; +#else + return -EINVAL; +#endif + } + + split_huge_pud(walk->vma, pudp, addr); + if (pud_none(*pudp)) + goto again; + + pmdp = pmd_offset(pudp, addr); + do { + next = pmd_addr_end(addr, end); + ret = hmm_vma_walk_pmd(pmdp, addr, next, walk); + if (ret) + return ret; + } while (pmdp++, addr = next, addr != end); + + return 0; +} + +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned long addr = start, i, pfn, mask, size, pfn_inc; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + uint64_t orig_pfn, cpu_flags; + bool fault, write_fault; + spinlock_t *ptl; + pte_t entry; + int ret = 0; + + size = 1UL << huge_page_shift(h); + mask = size - 1; + if (range->page_shift != PAGE_SHIFT) { + /* Make sure we are looking at full page. */ + if (start & mask) + return -EINVAL; + if (end < (start + size)) + return -EINVAL; + pfn_inc = size >> PAGE_SHIFT; + } else { + pfn_inc = 1; + size = PAGE_SIZE; + } + + + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + entry = huge_ptep_get(pte); + + i = (start - range->start) >> range->page_shift; + orig_pfn = range->pfns[i]; + range->pfns[i] = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, entry); + fault = write_fault = false; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) { + ret = -ENOENT; + goto unlock; + } + + pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); + for (; addr < end; addr += size, i++, pfn += pfn_inc) + range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | + cpu_flags; + hmm_vma_walk->last = end; + +unlock: + spin_unlock(ptl); + + if (ret == -ENOENT) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + + return ret; +#else /* CONFIG_HUGETLB_PAGE */ + return -EINVAL; +#endif +} + static void hmm_pfns_clear(struct hmm_range *range, uint64_t *pfns, unsigned long addr, @@ -676,279 +906,437 @@ static void hmm_pfns_clear(struct hmm_range *range, *pfns = range->values[HMM_PFN_NONE]; } -static void hmm_pfns_special(struct hmm_range *range) -{ - unsigned long addr = range->start, i = 0; - - for (; addr < range->end; addr += PAGE_SIZE, i++) - range->pfns[i] = range->values[HMM_PFN_SPECIAL]; -} - /* - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @range: range being snapshotted - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * vma permission, 0 success - * - * This snapshots the CPU page table for a range of virtual addresses. Snapshot - * validity is tracked by range struct. See hmm_vma_range_done() for further - * information. - * - * The range struct is initialized here. It tracks the CPU page table, but only - * if the function returns success (0), in which case the caller must then call - * hmm_vma_range_done() to stop CPU page table update tracking on this range. + * hmm_range_register() - start tracking change to CPU page table over a range + * @range: range + * @mm: the mm struct for the range of virtual address + * @start: start virtual address (inclusive) + * @end: end virtual address (exclusive) + * @page_shift: expect page shift for the range + * Returns 0 on success, -EFAULT if the address space is no longer valid * - * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS - * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! + * Track updates to the CPU page table see include/linux/hmm.h */ -int hmm_vma_get_pfns(struct hmm_range *range) +int hmm_range_register(struct hmm_range *range, + struct mm_struct *mm, + unsigned long start, + unsigned long end, + unsigned page_shift) { - struct vm_area_struct *vma = range->vma; - struct hmm_vma_walk hmm_vma_walk; - struct mm_walk mm_walk; - struct hmm *hmm; + unsigned long mask = ((1UL << page_shift) - 1UL); + + range->valid = false; + range->hmm = NULL; - /* Sanity check, this really should not happen ! */ - if (range->start < vma->vm_start || range->start >= vma->vm_end) + if ((start & mask) || (end & mask)) return -EINVAL; - if (range->end < vma->vm_start || range->end > vma->vm_end) + if (start >= end) return -EINVAL; - hmm = hmm_register(vma->vm_mm); - if (!hmm) - return -ENOMEM; - /* Caller must have registered a mirror, via hmm_mirror_register() ! */ - if (!hmm->mmu_notifier.ops) - return -EINVAL; + range->page_shift = page_shift; + range->start = start; + range->end = end; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) { - hmm_pfns_special(range); - return -EINVAL; - } + range->hmm = hmm_get_or_create(mm); + if (!range->hmm) + return -EFAULT; - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it does - * not allow write access, either. Architecture that allow - * write without read access are not supported by HMM, because - * operations such has atomic access would not work. - */ - hmm_pfns_clear(range, range->pfns, range->start, range->end); - return -EPERM; + /* Check if hmm_mm_destroy() was call. */ + if (range->hmm->mm == NULL || range->hmm->dead) { + hmm_put(range->hmm); + return -EFAULT; } /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); - - hmm_vma_walk.fault = false; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - - walk_page_range(range->start, range->end, &mm_walk); + mutex_lock(&range->hmm->lock); + + list_add_rcu(&range->list, &range->hmm->ranges); + + /* + * If there are any concurrent notifiers we have to wait for them for + * the range to be valid (see hmm_range_wait_until_valid()). + */ + if (!range->hmm->notifiers) + range->valid = true; + mutex_unlock(&range->hmm->lock); + return 0; } -EXPORT_SYMBOL(hmm_vma_get_pfns); +EXPORT_SYMBOL(hmm_range_register); /* - * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @range: range being tracked - * Returns: false if range data has been invalidated, true otherwise + * hmm_range_unregister() - stop tracking change to CPU page table over a range + * @range: range * * Range struct is used to track updates to the CPU page table after a call to - * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done - * using the data, or wants to lock updates to the data it got from those - * functions, it must call the hmm_vma_range_done() function, which will then - * stop tracking CPU page table updates. - * - * Note that device driver must still implement general CPU page table update - * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using - * the mmu_notifier API directly. - * - * CPU page table update tracking done through hmm_range is only temporary and - * to be used while trying to duplicate CPU page table contents for a range of - * virtual addresses. - * - * There are two ways to use this : - * again: - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); - * trans = device_build_page_table_update_transaction(pfns); - * device_page_table_lock(); - * if (!hmm_vma_range_done(range)) { - * device_page_table_unlock(); - * goto again; - * } - * device_commit_transaction(trans); - * device_page_table_unlock(); + * hmm_range_register(). See include/linux/hmm.h for how to use it. + */ +void hmm_range_unregister(struct hmm_range *range) +{ + /* Sanity check this really should not happen. */ + if (range->hmm == NULL || range->end <= range->start) + return; + + mutex_lock(&range->hmm->lock); + list_del_rcu(&range->list); + mutex_unlock(&range->hmm->lock); + + /* Drop reference taken by hmm_range_register() */ + range->valid = false; + hmm_put(range->hmm); + range->hmm = NULL; +} +EXPORT_SYMBOL(hmm_range_unregister); + +/* + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * permission (for instance asking for write and range is read only), + * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid + * vma or it is illegal to access that range), number of valid pages + * in range->pfns[] (from range start address). * - * Or: - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); - * device_page_table_lock(); - * hmm_vma_range_done(range); - * device_update_page_table(range->pfns); - * device_page_table_unlock(); + * This snapshots the CPU page table for a range of virtual addresses. Snapshot + * validity is tracked by range struct. See in include/linux/hmm.h for example + * on how to use. */ -bool hmm_vma_range_done(struct hmm_range *range) +long hmm_range_snapshot(struct hmm_range *range) { - unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; - struct hmm *hmm; + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; + unsigned long start = range->start, end; + struct hmm_vma_walk hmm_vma_walk; + struct hmm *hmm = range->hmm; + struct vm_area_struct *vma; + struct mm_walk mm_walk; - if (range->end <= range->start) { - BUG(); - return false; - } + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL || hmm->dead) + return -EFAULT; - hmm = hmm_register(range->vma->vm_mm); - if (!hmm) { - memset(range->pfns, 0, sizeof(*range->pfns) * npages); - return false; - } + do { + /* If range is no longer valid force retry. */ + if (!range->valid) + return -EAGAIN; - spin_lock(&hmm->lock); - list_del_rcu(&range->list); - spin_unlock(&hmm->lock); + vma = find_vma(hmm->mm, start); + if (vma == NULL || (vma->vm_flags & device_vma)) + return -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + struct hstate *h = hstate_vma(vma); - return range->valid; + if (huge_page_shift(h) != range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it + * does not allow write access, either. HMM does not + * support architecture that allow write without read. + */ + hmm_pfns_clear(range, range->pfns, + range->start, range->end); + return -EPERM; + } + + range->vma = vma; + hmm_vma_walk.pgmap = NULL; + hmm_vma_walk.last = start; + hmm_vma_walk.fault = false; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + end = min(range->end, vma->vm_end); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pud_entry = hmm_vma_walk_pud; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; + + walk_page_range(start, end, &mm_walk); + start = end; + } while (start < range->end); + + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -EXPORT_SYMBOL(hmm_vma_range_done); +EXPORT_SYMBOL(hmm_range_snapshot); /* - * hmm_vma_fault() - try to fault some address in a virtual address range + * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) + * Returns: number of valid pages in range->pfns[] (from range start + * address). This may be zero. If the return value is negative, + * then one of the following values may be returned: + * + * -EINVAL invalid arguments or mm or virtual address are in an + * invalid vma (for instance device file vma). + * -ENOMEM: Out of memory. + * -EPERM: Invalid permission (for instance asking for write and + * range is read only). + * -EAGAIN: If you need to retry and mmap_sem was drop. This can only + * happens if block argument is false. + * -EBUSY: If the the range is being invalidated and you should wait + * for invalidation to finish. + * -EFAULT: Invalid (ie either no valid vma or it is illegal to access + * that range), number of valid pages in range->pfns[] (from + * range start address). * * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs. + * any memory migration if the memory being faulted is not accessible by CPUs + * and caller does not ask for migration. * * On error, for one virtual address in the range, the function will mark the * corresponding HMM pfn entry with an error flag. - * - * Expected use pattern: - * retry: - * down_read(&mm->mmap_sem); - * // Find vma and address device wants to fault, initialize hmm_pfn_t - * // array accordingly - * ret = hmm_vma_fault(range, write, block); - * switch (ret) { - * case -EAGAIN: - * hmm_vma_range_done(range); - * // You might want to rate limit or yield to play nicely, you may - * // also commit any valid pfn in the array assuming that you are - * // getting true from hmm_vma_range_monitor_end() - * goto retry; - * case 0: - * break; - * case -ENOMEM: - * case -EINVAL: - * case -EPERM: - * default: - * // Handle error ! - * up_read(&mm->mmap_sem) - * return; - * } - * // Take device driver lock that serialize device page table update - * driver_lock_device_page_table_update(); - * hmm_vma_range_done(range); - * // Commit pfns we got from hmm_vma_fault() - * driver_unlock_device_page_table_update(); - * up_read(&mm->mmap_sem) - * - * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) - * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! - * - * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct hmm_range *range, bool block) +long hmm_range_fault(struct hmm_range *range, bool block) { - struct vm_area_struct *vma = range->vma; - unsigned long start = range->start; + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; + unsigned long start = range->start, end; struct hmm_vma_walk hmm_vma_walk; + struct hmm *hmm = range->hmm; + struct vm_area_struct *vma; struct mm_walk mm_walk; - struct hmm *hmm; int ret; - /* Sanity check, this really should not happen ! */ - if (range->start < vma->vm_start || range->start >= vma->vm_end) - return -EINVAL; - if (range->end < vma->vm_start || range->end > vma->vm_end) - return -EINVAL; + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL || hmm->dead) + return -EFAULT; - hmm = hmm_register(vma->vm_mm); - if (!hmm) { - hmm_pfns_clear(range, range->pfns, range->start, range->end); - return -ENOMEM; - } - /* Caller must have registered a mirror using hmm_mirror_register() */ - if (!hmm->mmu_notifier.ops) - return -EINVAL; + do { + /* If range is no longer valid force retry. */ + if (!range->valid) { + up_read(&hmm->mm->mmap_sem); + return -EAGAIN; + } - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) { - hmm_pfns_special(range); - return -EINVAL; - } + vma = find_vma(hmm->mm, start); + if (vma == NULL || (vma->vm_flags & device_vma)) + return -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + if (huge_page_shift(hstate_vma(vma)) != + range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it + * does not allow write access, either. HMM does not + * support architecture that allow write without read. + */ + hmm_pfns_clear(range, range->pfns, + range->start, range->end); + return -EPERM; + } + + range->vma = vma; + hmm_vma_walk.pgmap = NULL; + hmm_vma_walk.last = start; + hmm_vma_walk.fault = true; + hmm_vma_walk.block = block; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + end = min(range->end, vma->vm_end); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pud_entry = hmm_vma_walk_pud; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; + + do { + ret = walk_page_range(start, end, &mm_walk); + start = hmm_vma_walk.last; + + /* Keep trying while the range is valid. */ + } while (ret == -EBUSY && range->valid); + + if (ret) { + unsigned long i; + + i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + hmm_pfns_clear(range, &range->pfns[i], + hmm_vma_walk.last, range->end); + return ret; + } + start = end; + + } while (start < range->end); + + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; +} +EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. + * @range: range being faulted + * @device: device against to dma map page to + * @daddrs: dma address of mapped pages + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * drop and you need to try again, some other error value otherwise + * + * Note same usage pattern as hmm_range_fault(). + */ +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block) +{ + unsigned long i, npages, mapped; + long ret; + + ret = hmm_range_fault(range, block); + if (ret <= 0) + return ret ? ret : -EBUSY; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0, mapped = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; - if (!(vma->vm_flags & VM_READ)) { /* - * If vma do not allow read access, then assume that it does - * not allow write access, either. Architecture that allow - * write without read access are not supported by HMM, because - * operations such has atomic access would not work. + * FIXME need to update DMA API to provide invalid DMA address + * value instead of a function to test dma address value. This + * would remove lot of dumb code duplicated accross many arch. + * + * For now setting it to 0 here is good enough as the pfns[] + * value is what is use to check what is valid and what isn't. */ - hmm_pfns_clear(range, range->pfns, range->start, range->end); - return -EPERM; + daddrs[i] = 0; + + page = hmm_device_entry_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* Check if range is being invalidated */ + if (!range->valid) { + ret = -EBUSY; + goto unmap; + } + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); + if (dma_mapping_error(device, daddrs[i])) { + ret = -EFAULT; + goto unmap; + } + + mapped++; } - /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); - - hmm_vma_walk.fault = true; - hmm_vma_walk.block = block; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - hmm_vma_walk.last = range->start; - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; + return mapped; - do { - ret = walk_page_range(start, range->end, &mm_walk); - start = hmm_vma_walk.last; - } while (ret == -EAGAIN); +unmap: + for (npages = i, i = 0; (i < npages) && mapped; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; - if (ret) { - unsigned long i; + page = hmm_device_entry_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + if (dma_mapping_error(device, daddrs[i])) + continue; - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, - range->end); - hmm_vma_range_done(range); + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + mapped--; } + return ret; } -EXPORT_SYMBOL(hmm_vma_fault); +EXPORT_SYMBOL(hmm_range_dma_map); + +/** + * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() + * @range: range being unmapped + * @vma: the vma against which the range (optional) + * @device: device against which dma map was done + * @daddrs: dma address of mapped pages + * @dirty: dirty page if it had the write flag set + * Returns: number of page unmapped on success, -EINVAL otherwise + * + * Note that caller MUST abide by mmu notifier or use HMM mirror and abide + * to the sync_cpu_device_pagetables() callback so that it is safe here to + * call set_page_dirty(). Caller must also take appropriate locks to avoid + * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. + */ +long hmm_range_dma_unmap(struct hmm_range *range, + struct vm_area_struct *vma, + struct device *device, + dma_addr_t *daddrs, + bool dirty) +{ + unsigned long i, npages; + long cpages = 0; + + /* Sanity check. */ + if (range->end <= range->start) + return -EINVAL; + if (!daddrs) + return -EINVAL; + if (!range->pfns) + return -EINVAL; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; + + page = hmm_device_entry_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { + dir = DMA_BIDIRECTIONAL; + + /* + * See comments in function description on why it is + * safe here to call set_page_dirty() + */ + if (dirty) + set_page_dirty(page); + } + + /* Unmap and clear pfns/dma address */ + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + range->pfns[i] = range->values[HMM_PFN_NONE]; + /* FIXME see comments in hmm_vma_dma_map() */ + daddrs[i] = 0; + cpages++; + } + + return cpages; +} +EXPORT_SYMBOL(hmm_range_dma_unmap); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6a34b32d8ac..093507983dd2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1220,8 +1220,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(&range, vma->vm_mm, haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1384,8 +1384,8 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, vma->vm_mm, haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); spin_lock(vmf->ptl); @@ -2060,7 +2060,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); @@ -2278,7 +2279,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); @@ -2492,6 +2494,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 641cedfc8c0f..bf58cee30f65 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -740,7 +740,15 @@ void resv_map_release(struct kref *ref) static inline struct resv_map *inode_resv_map(struct inode *inode) { - return inode->i_mapping->private_data; + /* + * At inode evict time, i_mapping may not point to the original + * address space within the inode. This original address space + * contains the pointer to the resv_map. So, always use the + * address space embedded within the inode. + * The VERY common case is inode->mapping == &inode->i_data but, + * this may not be true for device special inodes. + */ + return (struct resv_map *)(&inode->i_data)->private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) @@ -1059,6 +1067,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) free_contig_range(page_to_pfn(page), 1 << order); } +#ifdef CONFIG_CONTIG_ALLOC static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { @@ -1143,11 +1152,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); +#else /* !CONFIG_CONTIG_ALLOC */ +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + return NULL; +} +#endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static inline bool gigantic_page_supported(void) { return false; } static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) { return NULL; } + int nid, nodemask_t *nodemask) +{ + return NULL; +} static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } @@ -1157,7 +1175,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; h->nr_huge_pages--; @@ -1258,12 +1276,23 @@ void free_huge_page(struct page *page) ClearPagePrivate(page); /* - * A return code of zero implies that the subpool will be under its - * minimum size if the reservation is not restored after page is free. - * Therefore, force restore_reserve operation. + * If PagePrivate() was set on page, page allocation consumed a + * reservation. If the page was associated with a subpool, there + * would have been a page reserved in the subpool before allocation + * via hugepage_subpool_get_pages(). Since we are 'restoring' the + * reservtion, do not call hugepage_subpool_put_pages() as this will + * remove the reserved page from the subpool. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) - restore_reserve = true; + if (!restore_reserve) { + /* + * A return code of zero implies that the subpool will be + * under its minimum size if the reservation is not restored + * after page is free. Therefore, force restore_reserve + * operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + } spin_lock(&hugetlb_lock); clear_page_huge_active(page); @@ -2277,13 +2306,47 @@ found: } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, - nodemask_t *nodes_allowed) +static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) - return h->max_huge_pages; + spin_lock(&hugetlb_lock); + + /* + * Check for a node specific request. + * Changing node specific huge page count may require a corresponding + * change to the global count. In any case, the passed node mask + * (nodes_allowed) will restrict alloc/free to the specified node. + */ + if (nid != NUMA_NO_NODE) { + unsigned long old_count = count; + + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + /* + * User may have specified a large count value which caused the + * above calculation to overflow. In this case, they wanted + * to allocate as many huge pages as possible. Set count to + * largest possible value to align with their intention. + */ + if (count < old_count) + count = ULONG_MAX; + } + + /* + * Gigantic pages runtime allocation depend on the capability for large + * page range allocation. + * If the system does not provide this feature, return an error when + * the user tries to allocate gigantic pages but let the user free the + * boottime allocated gigantic pages. + */ + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { + if (count > persistent_huge_pages(h)) { + spin_unlock(&hugetlb_lock); + return -EINVAL; + } + /* Fall through to decrease pool */ + } /* * Increase the pool size @@ -2296,7 +2359,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. */ - spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; @@ -2351,9 +2413,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, break; } out: - ret = persistent_huge_pages(h); + h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); - return ret; + + return 0; } #define HSTATE_ATTR_RO(_name) \ @@ -2403,41 +2466,32 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, unsigned long count, size_t len) { int err; - NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); + nodemask_t nodes_allowed, *n_mask; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) { - err = -EINVAL; - goto out; - } + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return -EINVAL; if (nid == NUMA_NO_NODE) { /* * global hstate attribute */ if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - } else if (nodes_allowed) { + init_nodemask_of_mempolicy(&nodes_allowed))) + n_mask = &node_states[N_MEMORY]; + else + n_mask = &nodes_allowed; + } else { /* - * per node hstate attribute: adjust count to global, - * but restrict alloc/free to the specified node. + * Node specific request. count adjustment happens in + * set_max_huge_pages() after acquiring hugetlb_lock. */ - count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; - init_nodemask_of_node(nodes_allowed, nid); - } else - nodes_allowed = &node_states[N_MEMORY]; - - h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; + } - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); + err = set_max_huge_pages(h, count, nid, n_mask); - return len; -out: - NODEMASK_FREE(nodes_allowed); - return err; + return err ? err : len; } static ssize_t nr_hugepages_store_common(bool obey_mempolicy, @@ -3247,7 +3301,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(&range, src, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, + vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); } @@ -3359,7 +3414,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /* * If sharing possible, alert mmu notifiers of worst case. */ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); address = start; @@ -3626,7 +3682,8 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, + haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* @@ -3777,8 +3834,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3886,21 +3942,14 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { unsigned long key[2]; u32 hash; - if (vma->vm_flags & VM_SHARED) { - key[0] = (unsigned long) mapping; - key[1] = idx; - } else { - key[0] = (unsigned long) mm; - key[1] = address >> huge_page_shift(h); - } + key[0] = (unsigned long) mapping; + key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); @@ -3911,9 +3960,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { return 0; @@ -3958,7 +4005,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -4371,7 +4418,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, + 0, vma, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); @@ -4477,6 +4525,11 @@ int hugetlb_reserve_pages(struct inode *inode, * called to make the mapping read-write. Assume !vma is a shm mapping */ if (!vma || vma->vm_flags & VM_MAYSHARE) { + /* + * resv_map can not be NULL as hugetlb_reserve_pages is only + * called for inodes for which resv_maps were created (see + * hugetlbfs_get_inode). + */ resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to); @@ -4568,6 +4621,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; + /* + * Since this routine can be called in the evict inode path for all + * hugetlbfs inodes, resv_map could be NULL. + */ if (resv_map) { chg = region_del(resv_map, start, end); /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 449044378782..a335f7c1fac4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, + address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* @@ -1374,7 +1375,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); nr_none++; continue; } @@ -1450,7 +1451,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); continue; out_unlock: unlock_page(page); @@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, mm, pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); diff --git a/mm/madvise.c b/mm/madvise.c index bb3a4554d5d5..628022e674a7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, mm, range.start, range.end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm, range.start, range.end); diff --git a/mm/memblock.c b/mm/memblock.c index a48f520c2d01..6bbad46f4d2c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ * :c:func:`mem_init` function frees all the memory to the buddy page * allocator. * - * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures will be discarded after the system * initialization compltes. */ @@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK /** * memblock_discard - discard memory and reserved arrays if they were allocated */ @@ -1255,6 +1255,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return 0; } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/** + * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() + * + * @idx: pointer to u64 loop variable + * @zone: zone in which all of the memory blocks reside + * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL + * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL + * + * This function is meant to be a zone/pfn specific wrapper for the + * for_each_mem_range type iterators. Specifically they are used in the + * deferred memory init routines and as such we were duplicating much of + * this logic throughout the code. So instead of having it in multiple + * locations it seemed like it would make more sense to centralize this to + * one new iterator that does everything they need. + */ +void __init_memblock +__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, unsigned long *out_epfn) +{ + int zone_nid = zone_to_nid(zone); + phys_addr_t spa, epa; + int nid; + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + + while (*idx != U64_MAX) { + unsigned long epfn = PFN_DOWN(epa); + unsigned long spfn = PFN_UP(spa); + + /* + * Verify the end is at least past the start of the zone and + * that we have at least one PFN to initialize. + */ + if (zone->zone_start_pfn < epfn && spfn < epfn) { + /* if we went too far just stop searching */ + if (zone_end_pfn(zone) <= spfn) { + *idx = U64_MAX; + break; + } + + if (out_spfn) + *out_spfn = max(zone->zone_start_pfn, spfn); + if (out_epfn) + *out_epfn = min(zone_end_pfn(zone), epfn); + + return; + } + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + } + + /* signal end of iteration */ + if (out_spfn) + *out_spfn = ULONG_MAX; + if (out_epfn) + *out_epfn = 0; +} + +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** * memblock_alloc_range_nid - allocate boot memory block @@ -1923,7 +1987,7 @@ unsigned long __init memblock_free_all(void) return pages; } -#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81a0d3914ec9..2bfe21f3e5ec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -725,34 +725,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } -unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); - unsigned long nr = 0; - enum lru_list lru; - - VM_BUG_ON((unsigned)nid >= nr_node_ids); - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - nr += mem_cgroup_get_lru_size(lruvec, lru); - } - return nr; -} - -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) -{ - unsigned long nr = 0; - int nid; - - for_each_node_state(nid, N_MEMORY) - nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); - return nr; -} - static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, enum mem_cgroup_events_target target) { @@ -1358,7 +1330,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) for (i = 0; i < NR_LRU_LISTS; i++) pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); + K(memcg_page_state(iter, NR_LRU_BASE + i))); pr_cont("\n"); } @@ -1384,6 +1356,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } +unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory); +} + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { @@ -1422,11 +1399,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, int nid, bool noswap) { - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + + if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || + lruvec_page_state(lruvec, NR_ACTIVE_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) + if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || + lruvec_page_state(lruvec, NR_ACTIVE_ANON)) return true; return false; @@ -2168,14 +2149,17 @@ static void high_work_func(struct work_struct *work) void mem_cgroup_handle_over_high(void) { unsigned int nr_pages = current->memcg_nr_pages_over_high; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = current->memcg_high_reclaim; if (likely(!nr_pages)) return; - memcg = get_mem_cgroup_from_mm(current->mm); + if (!memcg) + memcg = get_mem_cgroup_from_mm(current->mm); + reclaim_high(memcg, nr_pages, GFP_KERNEL); css_put(&memcg->css); + current->memcg_high_reclaim = NULL; current->memcg_nr_pages_over_high = 0; } @@ -2327,10 +2311,10 @@ done_restock: * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here * if __GFP_RECLAIM but let's always punt for simplicity and so that - * GFP_KERNEL can consistently be used during reclaim. @memcg is - * not recorded as it most likely matches current's and won't - * change in the meantime. As high limit is checked again before - * reclaim, the cost of mismatch is negligible. + * GFP_KERNEL can consistently be used during reclaim. Record the memcg + * for the return-to-userland high reclaim. If the memcg is already + * recorded and the recorded memcg is not the descendant of the memcg + * needing high reclaim, punt the high reclaim to the work queue. */ do { if (page_counter_read(&memcg->memory) > memcg->high) { @@ -2338,6 +2322,13 @@ done_restock: if (in_interrupt()) { schedule_work(&memcg->high_work); break; + } else if (!current->memcg_high_reclaim) { + css_get(&memcg->css); + current->memcg_high_reclaim = memcg; + } else if (!mem_cgroup_is_descendant( + current->memcg_high_reclaim, memcg)) { + schedule_work(&memcg->high_work); + break; } current->memcg_nr_pages_over_high += batch; set_notify_resume(current); @@ -2990,8 +2981,8 @@ static void accumulate_memcg_tree(struct mem_cgroup *memcg, acc->events_array ? acc->events_array[i] : i); for (i = 0; i < NR_LRU_LISTS; i++) - acc->lru_pages[i] += - mem_cgroup_nr_lru_pages(mi, BIT(i)); + acc->lru_pages[i] += memcg_page_state(mi, + NR_LRU_BASE + i); } } @@ -3331,6 +3322,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif #ifdef CONFIG_NUMA + +#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) +#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) +#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + unsigned long nr = 0; + enum lru_list lru; + + VM_BUG_ON((unsigned)nid >= nr_node_ids); + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + } + return nr; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, + unsigned int lru_mask) +{ + unsigned long nr = 0; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + } + return nr; +} + static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { @@ -3421,7 +3448,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], - mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); + memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; @@ -3927,8 +3955,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); - *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | - (1 << LRU_ACTIVE_FILE)); + *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { diff --git a/mm/memfd.c b/mm/memfd.c index 650e65a46b9c..2647c898990c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; + page = find_subpage(page, xas->xa_index); if (page_count(page) - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); @@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; + page = find_subpage(page, xas.xa_index); if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags diff --git a/mm/memory.c b/mm/memory.c index f7d962d7de19..0d0711a912de 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1010,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow = is_cow_mapping(vma->vm_flags); if (is_cow) { - mmu_notifier_range_init(&range, src_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1334,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb, { struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); @@ -1356,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + start, start + size); tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -1382,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address, address + size); tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -1523,6 +1527,87 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +/* + * __vm_map_pages - maps range of kernel pages into user vma + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * @offset: user's requested vm_pgoff + * + * This allows drivers to map range of kernel pages into a user vma. + * + * Return: 0 on success and error code otherwise. + */ +static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num, unsigned long offset) +{ + unsigned long count = vma_pages(vma); + unsigned long uaddr = vma->vm_start; + int ret, i; + + /* Fail if the user requested offset is beyond the end of the object */ + if (offset > num) + return -ENXIO; + + /* Fail if the user requested size exceeds available object size */ + if (count > num - offset) + return -ENXIO; + + for (i = 0; i < count; i++) { + ret = vm_insert_page(vma, uaddr, pages[offset + i]); + if (ret < 0) + return ret; + uaddr += PAGE_SIZE; + } + + return 0; +} + +/** + * vm_map_pages - maps range of kernel pages starts with non zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Maps an object consisting of @num pages, catering for the user's + * requested vm_pgoff + * + * If we fail to insert any page into the vma, the function will return + * immediately leaving any previously inserted pages present. Callers + * from the mmap handler may immediately return the error as their caller + * will destroy the vma, removing any successfully inserted pages. Other + * callers should make their own arrangements for calling unmap_region(). + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, vma->vm_pgoff); +} +EXPORT_SYMBOL(vm_map_pages); + +/** + * vm_map_pages_zero - map range of kernel pages starts with zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Similar to vm_map_pages(), except that it explicitly sets the offset + * to 0. This function is intended for the drivers that did not consider + * vm_pgoff. + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, 0); +} +EXPORT_SYMBOL(vm_map_pages_zero); + static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { @@ -2279,7 +2364,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2830,7 +2916,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; @@ -2844,6 +2929,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); swap_free(entry); if (mem_cgroup_swap_full(page) || @@ -4104,8 +4190,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PMD_MASK, - (address & PMD_MASK) + PMD_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, + NULL, mm, address & PMD_MASK, + (address & PMD_MASK) + PMD_SIZE); mmu_notifier_invalidate_range_start(range); } *ptlp = pmd_lock(mm, pmd); @@ -4122,8 +4209,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PAGE_MASK, - (address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm, + address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b236069ff0d8..328878b6799d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -39,6 +39,7 @@ #include <asm/tlbflush.h> #include "internal.h" +#include "shuffle.h" /* * online_page_callback contains pointer to current page onlining function. @@ -273,12 +274,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, * add the new pages. */ int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap = restrictions->altmap; /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); @@ -299,7 +300,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, section_nr_to_pfn(i), altmap, - want_memblock); + restrictions->flags & MHP_MEMBLOCK_API); /* * EEXIST is finally dealt with by ioresource collision @@ -516,26 +517,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, struct vmem_altmap *altmap) +static void __remove_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset, + struct vmem_altmap *altmap) { unsigned long start_pfn; int scn_nr; - int ret = -EINVAL; - if (!valid_section(ms)) - return ret; + if (WARN_ON_ONCE(!valid_section(ms))) + return; - ret = unregister_memory_section(ms); - if (ret) - return ret; + unregister_memory_section(ms); scn_nr = __section_nr(ms); start_pfn = section_nr_to_pfn((unsigned long)scn_nr); __remove_zone(zone, start_pfn); sparse_remove_one_section(zone, ms, map_offset, altmap); - return 0; } /** @@ -550,31 +548,17 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) +void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; unsigned long map_offset = 0; - int sections_to_remove, ret = 0; + int sections_to_remove; /* In the ZONE_DEVICE case device driver owns the memory region */ if (is_dev_zone(zone)) { if (altmap) map_offset = vmem_altmap_offset(altmap); - } else { - resource_size_t start, size; - - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - ret = release_mem_region_adjustable(&iomem_resource, start, - size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } } clear_zone_contiguous(zone); @@ -590,16 +574,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; cond_resched(); - ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, - altmap); + __remove_section(zone, __pfn_to_section(pfn), map_offset, + altmap); map_offset = 0; - if (ret) - break; } set_zone_contiguous(zone); - - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -714,7 +694,7 @@ static void node_states_check_changes_online(unsigned long nr_pages, if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; #ifdef CONFIG_HIGHMEM - if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) + if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY)) arg->status_change_nid_high = nid; #endif } @@ -912,6 +892,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone->zone_pgdat->node_present_pages += onlined_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); + shuffle_zone(zone); + if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1097,6 +1079,9 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { + struct mhp_restrictions restrictions = { + .flags = MHP_MEMBLOCK_API, + }; u64 start, size; bool new_node = false; int ret; @@ -1124,7 +1109,7 @@ int __ref add_memory_resource(int nid, struct resource *res) new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, NULL, true); + ret = arch_add_memory(nid, start, size, &restrictions); if (ret < 0) goto error; @@ -1341,8 +1326,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) if (!PageHuge(page)) continue; head = compound_head(page); - if (hugepage_migration_supported(page_hstate(head)) && - page_huge_active(head)) + if (page_huge_active(head)) return pfn; skip = (1 << compound_order(head)) - (page - head); pfn += skip - 1; @@ -1382,10 +1366,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { struct page *head = compound_head(page); - if (compound_order(head) > PFN_SECTION_SHIFT) { - ret = -EBUSY; - break; - } pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; isolate_huge_page(head, &source); continue; @@ -1454,15 +1434,10 @@ static int offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, void *data) { - __offline_isolated_pages(start, start + nr_pages); - return 0; -} + unsigned long *offlined_pages = (unsigned long *)data; -static void -offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) -{ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, - offline_isolated_pages_cb); + *offlined_pages += __offline_isolated_pages(start, start + nr_pages); + return 0; } /* @@ -1472,26 +1447,7 @@ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { - int ret; - long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); - offlined = nr_pages; - if (!ret) - *(long *)data += offlined; - return ret; -} - -static long -check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) -{ - long offlined = 0; - int ret; - - ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, - check_pages_isolated_cb); - if (ret < 0) - offlined = (long)ret; - return offlined; + return test_pages_isolated(start_pfn, start_pfn + nr_pages, true); } static int __init cmdline_parse_movable_node(char *p) @@ -1576,7 +1532,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, nr_pages; - long offlined_pages; + unsigned long offlined_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; unsigned long valid_start, valid_end; @@ -1652,14 +1608,15 @@ static int __ref __offline_pages(unsigned long start_pfn, goto failed_removal_isolated; } /* check again */ - offlined_pages = check_pages_isolated(start_pfn, end_pfn); - } while (offlined_pages < 0); + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + NULL, check_pages_isolated_cb); + } while (ret); - pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ - offline_isolated_pages(start_pfn, end_pfn); - + walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &offlined_pages, offline_isolated_pages_cb); + pr_info("Offlined Pages %ld\n", offlined_pages); /* * Onlining will reset pagetype flags and makes migrate type * MOVABLE, so just need to decrease the number of isolated @@ -1843,6 +1800,26 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); +static void __release_memory_resource(resource_size_t start, + resource_size_t size) +{ + int ret; + + /* + * When removing memory in the same granularity as it was added, + * this function never fails. It might only fail if resources + * have to be adjusted or split. We'll ignore the error, as + * removing of memory cannot fail. + */ + ret = release_mem_region_adjustable(&iomem_resource, start, size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } +} + /** * remove_memory * @nid: the node ID @@ -1877,6 +1854,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size) memblock_remove(start, size); arch_remove_memory(nid, start, size, NULL); + __release_memory_resource(start, size); try_offline_node(nid); diff --git a/mm/migrate.c b/mm/migrate.c index 663a5449367a..f2ecc2855a12 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -463,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping, for (i = 1; i < HPAGE_PMD_NR; i++) { xas_next(&xas); - xas_store(&xas, newpage + i); + xas_store(&xas, newpage); } } @@ -2356,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, + migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->start, migrate->end, &mm_walk); @@ -2764,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) notified = true; mmu_notifier_range_init(&range, + MMU_NOTIFY_CLEAR, 0, + NULL, migrate->vma->vm_mm, addr, migrate->end); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mincore.c b/mm/mincore.c index 218099b5ed31..c3f058bd0faf 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -169,6 +169,22 @@ out: return 0; } +static inline bool can_do_mincore(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * Reveal pagecache information only for non-anonymous mappings that + * correspond to the files the calling process could (if tried) open + * for writing; otherwise we'd be including shared non-exclusive + * mappings, which opens a side channel. + */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; - mincore_walk.mm = vma->vm_mm; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + if (!can_do_mincore(vma)) { + unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE); + memset(vec, 1, pages); + return pages; + } + mincore_walk.mm = vma->vm_mm; err = walk_page_range(addr, end, &mincore_walk); if (err < 0) return err; diff --git a/mm/mlock.c b/mm/mlock.c index 080f3b36415b..e492a155c51a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -562,7 +562,7 @@ success: nr_pages = -nr_pages; else if (old_flags & VM_LOCKED) nr_pages = 0; - mm->locked_vm += nr_pages; + atomic64_add(nr_pages, &mm->locked_vm); /* * vm_flags is protected by the mmap_sem held in write mode. @@ -687,7 +687,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla if (down_write_killable(¤t->mm->mmap_sem)) return -EINTR; - locked += current->mm->locked_vm; + locked += atomic64_read(¤t->mm->locked_vm); if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { /* * It is possible that the regions requested intersect with diff --git a/mm/mmap.c b/mm/mmap.c index bd7b9f293b39..9cf52bdb22a8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1340,7 +1340,7 @@ static inline int mlock_future_check(struct mm_struct *mm, /* mlock MCL_FUTURE? */ if (flags & VM_LOCKED) { locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; + locked += atomic64_read(&mm->locked_vm); lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) @@ -1826,7 +1826,7 @@ out: vma == get_gate_vma(current->mm)) vma->vm_flags &= VM_LOCKED_CLEAR_MASK; else - mm->locked_vm += (len >> PAGE_SHIFT); + atomic64_add(len >> PAGE_SHIFT, &mm->locked_vm); } if (file) @@ -2302,7 +2302,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { unsigned long locked; unsigned long limit; - locked = mm->locked_vm + grow; + locked = atomic64_read(&mm->locked_vm) + grow; limit = rlimit(RLIMIT_MEMLOCK); limit >>= PAGE_SHIFT; if (locked > limit && !capable(CAP_IPC_LOCK)) @@ -2396,7 +2396,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; + atomic64_add(grow, &mm->locked_vm); vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; @@ -2476,7 +2476,7 @@ int expand_downwards(struct vm_area_struct *vma, */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; + atomic64_add(grow, &mm->locked_vm); vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; @@ -2801,11 +2801,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, /* * unlock any mlock()ed ranges before detaching vmas */ - if (mm->locked_vm) { + if (atomic64_read(&mm->locked_vm)) { struct vm_area_struct *tmp = vma; while (tmp && tmp->vm_start < end) { if (tmp->vm_flags & VM_LOCKED) { - mm->locked_vm -= vma_pages(tmp); + atomic64_sub(vma_pages(tmp), &mm->locked_vm); munlock_vma_pages_all(tmp); } @@ -3048,7 +3048,7 @@ out: mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) - mm->locked_vm += (len >> PAGE_SHIFT); + atomic64_add(len >> PAGE_SHIFT, &mm->locked_vm); vma->vm_flags |= VM_SOFTDIRTY; return 0; } @@ -3120,7 +3120,7 @@ void exit_mmap(struct mm_struct *mm) up_write(&mm->mmap_sem); } - if (mm->locked_vm) { + if (atomic64_read(&mm->locked_vm)) { vma = mm->mmap; while (vma) { if (vma->vm_flags & VM_LOCKED) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9c884abc7850..ee36068077b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -180,7 +180,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) if (_ret) { pr_info("%pS callback failed with %d in %sblockable context.\n", mn->ops->invalidate_range_start, _ret, - !range->blockable ? "non-" : ""); + !mmu_notifier_range_blockable(range) ? "non-" : ""); ret = _ret; } } @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); diff --git a/mm/mprotect.c b/mm/mprotect.c index 028c724dcb1a..65242f1e4457 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -185,7 +185,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* invoke the mmu notifier if the pmd is populated */ if (!range.start) { - mmu_notifier_range_init(&range, vma->vm_mm, addr, end); + mmu_notifier_range_init(&range, + MMU_NOTIFY_PROTECTION_VMA, 0, + vma, vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mremap.c b/mm/mremap.c index e3edef6b7a12..37b5b2ad91be 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + old_addr, old_end); mmu_notifier_invalidate_range_start(&range); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { @@ -422,7 +423,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } if (vm_flags & VM_LOCKED) { - mm->locked_vm += new_len >> PAGE_SHIFT; + atomic64_add(new_len >> PAGE_SHIFT, &mm->locked_vm); *locked = true; } @@ -473,7 +474,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; + locked = atomic64_read(&mm->locked_vm) << PAGE_SHIFT; lock_limit = rlimit(RLIMIT_MEMLOCK); locked += new_len - old_len; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) @@ -679,7 +680,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vm_stat_account(mm, vma->vm_flags, pages); if (vma->vm_flags & VM_LOCKED) { - mm->locked_vm += pages; + atomic64_add(pages, &mm->locked_vm); locked = true; new_addr = addr; } diff --git a/mm/nommu.c b/mm/nommu.c index 749276beb109..b492fd1fcf9f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -473,6 +473,20 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages); + +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages_zero); + /* * sys_brk() for the most part doesn't need the global kernel * lock, except when an application is doing something nasty diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3a2484884cfd..539c91d0b26a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -531,7 +531,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, mm, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm, range.start, range.end); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59661106da16..1f99db76b1ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@ #include <linux/mempolicy.h> #include <linux/memremap.h> #include <linux/stop_machine.h> +#include <linux/random.h> #include <linux/sort.h> #include <linux/pfn.h> #include <linux/backing-dev.h> @@ -72,6 +73,7 @@ #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" +#include "shuffle.h" /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); @@ -755,12 +757,6 @@ static inline void set_page_order(struct page *page, unsigned int order) __SetPageBuddy(page); } -static inline void rmv_page_order(struct page *page) -{ - __ClearPageBuddy(page); - set_page_private(page, 0); -} - /* * This function checks whether a page is free && is the buddy * we can coalesce a page and its buddy if @@ -918,13 +914,10 @@ continue_merging: * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. */ - if (page_is_guard(buddy)) { + if (page_is_guard(buddy)) clear_page_guard(zone, buddy, order, migratetype); - } else { - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); - } + else + del_page_from_free_area(buddy, &zone->free_area[order]); combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -966,7 +959,8 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn) + && !is_shuffle_order(order)) { struct page *higher_page, *higher_buddy; combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); @@ -974,15 +968,18 @@ done_merging: higher_buddy = higher_page + (buddy_pfn - combined_pfn); if (pfn_valid_within(buddy_pfn) && page_is_buddy(higher_page, higher_buddy, order + 1)) { - list_add_tail(&page->lru, - &zone->free_area[order].free_list[migratetype]); - goto out; + add_to_free_area_tail(page, &zone->free_area[order], + migratetype); + return; } } - list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); -out: - zone->free_area[order].nr_free++; + if (is_shuffle_order(order)) + add_to_free_area_random(page, &zone->free_area[order], + migratetype); + else + add_to_free_area(page, &zone->free_area[order], migratetype); + } /* @@ -1416,36 +1413,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { int nid; - nid = __early_pfn_to_nid(pfn, state); + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid >= 0 && nid != node) return false; return true; } -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); -} - #else - static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) -{ - return true; -} #endif @@ -1574,21 +1557,13 @@ static inline void __init pgdat_init_report_one_done(void) * * Then, we check if a current large page is valid by only checking the validity * of the head pfn. - * - * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not belong - * to this memory node. */ -static inline bool __init -deferred_pfn_valid(int nid, unsigned long pfn, - struct mminit_pfnnid_cache *nid_init_state) +static inline bool __init deferred_pfn_valid(unsigned long pfn) { if (!pfn_valid_within(pfn)) return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; - if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) - return false; return true; } @@ -1596,15 +1571,14 @@ deferred_pfn_valid(int nid, unsigned long pfn, * Free pages to buddy allocator. Try to free aligned pages in * pageblock_nr_pages sizes. */ -static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, +static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; } else if (!(pfn & nr_pgmask)) { @@ -1624,17 +1598,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, * by performing it only once every pageblock_nr_pages. * Return number of pages initialized. */ -static unsigned long __init deferred_init_pages(int nid, int zid, +static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; + int nid = zone_to_nid(zone); unsigned long nr_pages = 0; + int zid = zone_idx(zone); struct page *page = NULL; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { page = NULL; continue; } else if (!page || !(pfn & nr_pgmask)) { @@ -1649,18 +1624,100 @@ static unsigned long __init deferred_init_pages(int nid, int zid, return (nr_pages); } +/* + * This function is meant to pre-load the iterator for the zone init. + * Specifically it walks through the ranges until we are caught up to the + * first_init_pfn value and exits there. If we never encounter the value we + * return false indicating there are no valid ranges left. + */ +static bool __init +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, + unsigned long *spfn, unsigned long *epfn, + unsigned long first_init_pfn) +{ + u64 j; + + /* + * Start out by walking through the ranges in this zone that have + * already been initialized. We don't need to do anything with them + * so we just need to flush them out of the system. + */ + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { + if (*epfn <= first_init_pfn) + continue; + if (*spfn < first_init_pfn) + *spfn = first_init_pfn; + *i = j; + return true; + } + + return false; +} + +/* + * Initialize and free pages. We do it in two loops: first we initialize + * struct page, then free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + * + * In order to try and keep some memory in the cache we have the loop + * broken along max page order boundaries. This way we will not cause + * any issues with the buddy page computation. + */ +static unsigned long __init +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); + unsigned long spfn = *start_pfn, epfn = *end_pfn; + unsigned long nr_pages = 0; + u64 j = *i; + + /* First we loop through and initialize the page values */ + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { + unsigned long t; + + if (mo_pfn <= *start_pfn) + break; + + t = min(mo_pfn, *end_pfn); + nr_pages += deferred_init_pages(zone, *start_pfn, t); + + if (mo_pfn < *end_pfn) { + *start_pfn = mo_pfn; + break; + } + } + + /* Reset values and now loop through freeing pages as needed */ + swap(j, *i); + + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { + unsigned long t; + + if (mo_pfn <= spfn) + break; + + t = min(mo_pfn, epfn); + deferred_free_pages(spfn, t); + + if (mo_pfn <= epfn) + break; + } + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; - int nid = pgdat->node_id; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long first_init_pfn, flags; unsigned long start = jiffies; - unsigned long nr_pages = 0; - unsigned long spfn, epfn, first_init_pfn, flags; - phys_addr_t spa, epa; - int zid; struct zone *zone; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + int zid; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1686,31 +1743,27 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_init_pfn)) + goto zone_empty; /* - * Initialize and free pages. We do it in two loops: first we initialize - * struct page, than free to buddy allocator, because while we are - * freeing pages we can access pages that are ahead (computing buddy - * page in __free_one_page()). + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. */ - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_pages(nid, zid, spfn, epfn); - } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); - } + while (spfn < epfn) + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); +zone_empty: pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, - jiffies_to_msecs(jiffies - start)); + pr_info("node %d initialised, %lu pages in %ums\n", + pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1734,14 +1787,11 @@ static int __init deferred_init_memmap(void *data) static noinline bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - int zid = zone_idx(zone); - int nid = zone_to_nid(zone); - pg_data_t *pgdat = NODE_DATA(nid); unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); - unsigned long nr_pages = 0; - unsigned long first_init_pfn, spfn, epfn, t, flags; + pg_data_t *pgdat = zone->zone_pgdat; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; - phys_addr_t spa, epa; + unsigned long spfn, epfn, flags; + unsigned long nr_pages = 0; u64 i; /* Only the last zone may have deferred pages */ @@ -1770,38 +1820,35 @@ deferred_grow_zone(struct zone *zone, unsigned int order) return true; } - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); - - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_deferred_pfn)) { + pgdat->first_deferred_pfn = ULONG_MAX; pgdat_resize_unlock(pgdat, &flags); - return false; + return true; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + /* + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. + */ + while (spfn < epfn) { + /* update our first deferred PFN for this section */ + first_deferred_pfn = spfn; - while (spfn < epfn && nr_pages < nr_pages_needed) { - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); - first_deferred_pfn = min(t, epfn); - nr_pages += deferred_init_pages(nid, zid, spfn, - first_deferred_pfn); - spfn = first_deferred_pfn; - } + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + + /* We should only stop along section boundaries */ + if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) + continue; + /* If our quota has been met we can stop here */ if (nr_pages >= nr_pages_needed) break; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); - - if (first_deferred_pfn == epfn) - break; - } - pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat->first_deferred_pfn = spfn; pgdat_resize_unlock(pgdat, &flags); return nr_pages > 0; @@ -1824,9 +1871,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order) void __init page_alloc_init_late(void) { struct zone *zone; + int nid; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - int nid; /* There will be num_node_state(N_MEMORY) threads */ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); @@ -1846,10 +1893,12 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ memblock_discard(); -#endif + + for_each_node_state(nid, N_MEMORY) + shuffle_free_memory(NODE_DATA(nid)); for_each_populated_zone(zone) set_zone_contiguous(zone); @@ -1921,8 +1970,7 @@ static inline void expand(struct zone *zone, struct page *page, if (set_page_guard(zone, &page[size], high, migratetype)) continue; - list_add(&page[size].lru, &area->free_list[migratetype]); - area->nr_free++; + add_to_free_area(&page[size], area, migratetype); set_page_order(&page[size], high); } } @@ -2064,13 +2112,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); - page = list_first_entry_or_null(&area->free_list[migratetype], - struct page, lru); + page = get_page_from_free_area(area, migratetype); if (!page) continue; - list_del(&page->lru); - rmv_page_order(page); - area->nr_free--; + del_page_from_free_area(page, area); expand(zone, page, order, current_order, area, migratetype); set_pcppage_migratetype(page, migratetype); return page; @@ -2156,8 +2201,7 @@ static int move_freepages(struct zone *zone, } order = page_order(page); - list_move(&page->lru, - &zone->free_area[order].free_list[migratetype]); + move_to_free_area(page, &zone->free_area[order], migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -2345,7 +2389,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, single_page: area = &zone->free_area[current_order]; - list_move(&page->lru, &area->free_list[start_type]); + move_to_free_area(page, area, start_type); } /* @@ -2369,7 +2413,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, if (fallback_mt == MIGRATE_TYPES) break; - if (list_empty(&area->free_list[fallback_mt])) + if (free_area_empty(area, fallback_mt)) continue; if (can_steal_fallback(order, migratetype)) @@ -2456,9 +2500,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, for (order = 0; order < MAX_ORDER; order++) { struct free_area *area = &(zone->free_area[order]); - page = list_first_entry_or_null( - &area->free_list[MIGRATE_HIGHATOMIC], - struct page, lru); + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; @@ -2581,8 +2623,7 @@ find_smallest: VM_BUG_ON(current_order == MAX_ORDER); do_steal: - page = list_first_entry(&area->free_list[fallback_mt], - struct page, lru); + page = get_page_from_free_area(area, fallback_mt); steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, can_steal); @@ -3019,6 +3060,7 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { + struct free_area *area = &page_zone(page)->free_area[order]; unsigned long watermark; struct zone *zone; int mt; @@ -3043,9 +3085,8 @@ int __isolate_free_page(struct page *page, unsigned int order) } /* Remove page from free list */ - list_del(&page->lru); - zone->free_area[order].nr_free--; - rmv_page_order(page); + + del_page_from_free_area(page, area); /* * Set the pageblock if the isolated page is at least half of a @@ -3120,9 +3161,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, - struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype, - unsigned int alloc_flags) + struct zone *zone, gfp_t gfp_flags, + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3134,7 +3174,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } local_irq_restore(flags); @@ -3154,8 +3194,8 @@ struct page *rmqueue(struct zone *preferred_zone, struct page *page; if (likely(order == 0)) { - page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, + migratetype, alloc_flags); goto out; } @@ -3343,13 +3383,13 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, continue; for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { - if (!list_empty(&area->free_list[mt])) + if (!free_area_empty(area, mt)) return true; } #ifdef CONFIG_CMA if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + !free_area_empty(area, MIGRATE_CMA)) { return true; } #endif @@ -4821,7 +4861,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * This function is similar to alloc_pages(), except that it allocates the * minimum number of pages to satisfy the request. alloc_pages() can only @@ -4838,6 +4878,9 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); unsigned long addr; + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) + gfp_mask &= ~__GFP_COMP; + addr = __get_free_pages(gfp_mask, order); return make_alloc_exact(addr, order, size); } @@ -4848,7 +4891,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * pages on a node. * @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. @@ -4858,7 +4901,12 @@ EXPORT_SYMBOL(alloc_pages_exact); void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); - struct page *p = alloc_pages_node(nid, gfp_mask, order); + struct page *p; + + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) + gfp_mask &= ~__GFP_COMP; + + p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); @@ -5268,7 +5316,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) types[order] = 0; for (type = 0; type < MIGRATE_TYPES; type++) { - if (!list_empty(&area->free_list[type])) + if (!free_area_empty(area, type)) types[order] |= 1 << type; } } @@ -5486,6 +5534,8 @@ static void build_zonelists(pg_data_t *pgdat) int node, load, nr_nodes = 0; nodemask_t used_mask; int local_node, prev_node; + struct zone *zone; + struct zoneref *z; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; @@ -5511,6 +5561,11 @@ static void build_zonelists(pg_data_t *pgdat) build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); + + pr_info("node[%d] zonelist: ", pgdat->node_id); + for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1) + pr_cont("%d:%s ", zone_to_nid(zone), zone->name); + pr_cont("\n"); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES @@ -5613,10 +5668,11 @@ static void __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); } else { - for_each_online_node(nid) { + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - build_zonelists(pgdat); + if (pgdat) + build_zonelists(pgdat); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES @@ -6247,13 +6303,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long *zone_end_pfn, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, zone_start_pfn, zone_end_pfn); @@ -6897,10 +6955,8 @@ static unsigned long __init find_min_pfn_for_node(int nid) for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) min_pfn = min(min_pfn, start_pfn); - if (min_pfn == ULONG_MAX) { - pr_warn("Could not find start_pfn for node %d\n", nid); + if (min_pfn == ULONG_MAX) return 0; - } return min_pfn; } @@ -7244,8 +7300,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) mminit_verify_pageflags_layout(); setup_nr_node_ids(); zero_resv_unavail(); - for_each_online_node(nid) { + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); + + if (!pgdat) + continue; + free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL); @@ -8129,8 +8189,7 @@ unmovable: return true; } -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) - +#ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, @@ -8339,8 +8398,9 @@ done: pfn_max_align_up(end), migratetype); return ret; } +#endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned nr_pages) +void free_contig_range(unsigned long pfn, unsigned int nr_pages) { unsigned int count = 0; @@ -8352,7 +8412,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } WARN(count != 0, "%d pages are still in use!\n", count); } -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* @@ -8394,7 +8453,7 @@ void zone_pcp_reset(struct zone *zone) * All pages in the range must be in a single zone and isolated * before calling this. */ -void +unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; @@ -8402,12 +8461,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned int order, i; unsigned long pfn; unsigned long flags; + unsigned long offlined_pages = 0; + /* find the first valid pfn */ for (pfn = start_pfn; pfn < end_pfn; pfn++) if (pfn_valid(pfn)) break; if (pfn == end_pfn) - return; + return offlined_pages; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); @@ -8425,24 +8487,26 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; SetPageReserved(page); + offlined_pages++; continue; } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); order = page_order(page); + offlined_pages += 1 << order; #ifdef CONFIG_DEBUG_VM pr_info("remove from free list %lx %d %lx\n", pfn, 1 << order, end_pfn); #endif - list_del(&page->lru); - rmv_page_order(page); - zone->free_area[order].nr_free--; + del_page_from_free_area(page, &zone->free_area[order]); for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); + + return offlined_pages; } #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 019280712e1b..e3638a5bafff 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -151,8 +151,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) for (i = 0; i < nr_pages; i++) { struct page *page; - if (!pfn_valid_within(pfn + i)) - continue; page = pfn_to_online_page(pfn + i); if (!page) continue; diff --git a/mm/rmap.c b/mm/rmap.c index b30c7c71d1d9..e5dfe2ae6b0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -850,7 +850,7 @@ int page_referenced(struct page *page, }; *vm_flags = 0; - if (!page_mapped(page)) + if (!pra.mapcount) return 0; if (!page_rmapping(page)) @@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, vma->vm_mm, address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); mmu_notifier_invalidate_range_start(&range); @@ -928,7 +929,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, continue; flush_cache_page(vma, address, page_to_pfn(page)); - entry = pmdp_huge_clear_flush(vma, address, pmd); + entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); @@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { diff --git a/mm/shmem.c b/mm/shmem.c index f4dce9c8670d..1bb3b8dc8bb2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page, if (xas_error(&xas)) goto unlock; next: - xas_store(&xas, page + i); + xas_store(&xas, page); if (++i < nr) { xas_next(&xas); goto next; diff --git a/mm/shuffle.c b/mm/shuffle.c new file mode 100644 index 000000000000..3ce12481b1dc --- /dev/null +++ b/mm/shuffle.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/mmzone.h> +#include <linux/random.h> +#include <linux/moduleparam.h> +#include "internal.h" +#include "shuffle.h" + +DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +static unsigned long shuffle_state __ro_after_init; + +/* + * Depending on the architecture, module parameter parsing may run + * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents, + * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE + * attempts to turn on the implementation, but aborts if it finds + * SHUFFLE_FORCE_DISABLE already set. + */ +__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl) +{ + if (ctl == SHUFFLE_FORCE_DISABLE) + set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state); + + if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) { + if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state)) + static_branch_disable(&page_alloc_shuffle_key); + } else if (ctl == SHUFFLE_ENABLE + && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state)) + static_branch_enable(&page_alloc_shuffle_key); +} + +static bool shuffle_param; +extern int shuffle_show(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state) + ? 'Y' : 'N'); +} + +static __meminit int shuffle_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + if (shuffle_param) + page_alloc_shuffle(SHUFFLE_ENABLE); + else + page_alloc_shuffle(SHUFFLE_FORCE_DISABLE); + return 0; +} +module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); + +/* + * For two pages to be swapped in the shuffle, they must be free (on a + * 'free_area' lru), have the same order, and have the same migratetype. + */ +static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order) +{ + struct page *page; + + /* + * Given we're dealing with randomly selected pfns in a zone we + * need to ask questions like... + */ + + /* ...is the pfn even in the memmap? */ + if (!pfn_valid_within(pfn)) + return NULL; + + /* ...is the pfn in a present section or a hole? */ + if (!pfn_present(pfn)) + return NULL; + + /* ...is the page free and currently on a free_area list? */ + page = pfn_to_page(pfn); + if (!PageBuddy(page)) + return NULL; + + /* + * ...is the page on the same list as the page we will + * shuffle it with? + */ + if (page_order(page) != order) + return NULL; + + return page; +} + +/* + * Fisher-Yates shuffle the freelist which prescribes iterating through an + * array, pfns in this case, and randomly swapping each entry with another in + * the span, end_pfn - start_pfn. + * + * To keep the implementation simple it does not attempt to correct for sources + * of bias in the distribution, like modulo bias or pseudo-random number + * generator bias. I.e. the expectation is that this shuffling raises the bar + * for attacks that exploit the predictability of page allocations, but need not + * be a perfect shuffle. + */ +#define SHUFFLE_RETRY 10 +void __meminit __shuffle_zone(struct zone *z) +{ + unsigned long i, flags; + unsigned long start_pfn = z->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(z); + const int order = SHUFFLE_ORDER; + const int order_pages = 1 << order; + + spin_lock_irqsave(&z->lock, flags); + start_pfn = ALIGN(start_pfn, order_pages); + for (i = start_pfn; i < end_pfn; i += order_pages) { + unsigned long j; + int migratetype, retry; + struct page *page_i, *page_j; + + /* + * We expect page_i, in the sub-range of a zone being added + * (@start_pfn to @end_pfn), to more likely be valid compared to + * page_j randomly selected in the span @zone_start_pfn to + * @spanned_pages. + */ + page_i = shuffle_valid_page(i, order); + if (!page_i) + continue; + + for (retry = 0; retry < SHUFFLE_RETRY; retry++) { + /* + * Pick a random order aligned page in the zone span as + * a swap target. If the selected pfn is a hole, retry + * up to SHUFFLE_RETRY attempts find a random valid pfn + * in the zone. + */ + j = z->zone_start_pfn + + ALIGN_DOWN(get_random_long() % z->spanned_pages, + order_pages); + page_j = shuffle_valid_page(j, order); + if (page_j && page_j != page_i) + break; + } + if (retry >= SHUFFLE_RETRY) { + pr_debug("%s: failed to swap %#lx\n", __func__, i); + continue; + } + + /* + * Each migratetype corresponds to its own list, make sure the + * types match otherwise we're moving pages to lists where they + * do not belong. + */ + migratetype = get_pageblock_migratetype(page_i); + if (get_pageblock_migratetype(page_j) != migratetype) { + pr_debug("%s: migratetype mismatch %#lx\n", __func__, i); + continue; + } + + list_swap(&page_i->lru, &page_j->lru); + + pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j); + + /* take it easy on the zone lock */ + if ((i % (100 * order_pages)) == 0) { + spin_unlock_irqrestore(&z->lock, flags); + cond_resched(); + spin_lock_irqsave(&z->lock, flags); + } + } + spin_unlock_irqrestore(&z->lock, flags); +} + +/** + * shuffle_free_memory - reduce the predictability of the page allocator + * @pgdat: node page data + */ +void __meminit __shuffle_free_memory(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + shuffle_zone(z); +} + +void add_to_free_area_random(struct page *page, struct free_area *area, + int migratetype) +{ + static u64 rand; + static u8 rand_bits; + + /* + * The lack of locking is deliberate. If 2 threads race to + * update the rand state it just adds to the entropy. + */ + if (rand_bits == 0) { + rand_bits = 64; + rand = get_random_u64(); + } + + if (rand & 1) + add_to_free_area(page, area, migratetype); + else + add_to_free_area_tail(page, area, migratetype); + rand_bits--; + rand >>= 1; +} diff --git a/mm/shuffle.h b/mm/shuffle.h new file mode 100644 index 000000000000..777a257a0d2f --- /dev/null +++ b/mm/shuffle.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. All rights reserved. +#ifndef _MM_SHUFFLE_H +#define _MM_SHUFFLE_H +#include <linux/jump_label.h> + +/* + * SHUFFLE_ENABLE is called from the command line enabling path, or by + * platform-firmware enabling that indicates the presence of a + * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from + * the command line path and overrides any previous or future + * SHUFFLE_ENABLE. + */ +enum mm_shuffle_ctl { + SHUFFLE_ENABLE, + SHUFFLE_FORCE_DISABLE, +}; + +#define SHUFFLE_ORDER (MAX_ORDER-1) + +#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR +DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl); +extern void __shuffle_free_memory(pg_data_t *pgdat); +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_free_memory(pgdat); +} + +extern void __shuffle_zone(struct zone *z); +static inline void shuffle_zone(struct zone *z) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_zone(z); +} + +static inline bool is_shuffle_order(int order) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return false; + return order >= SHUFFLE_ORDER; +} +#else +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ +} + +static inline void shuffle_zone(struct zone *z) +{ +} + +static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl) +{ +} + +static inline bool is_shuffle_order(int order) +{ + return false; +} +#endif +#endif /* _MM_SHUFFLE_H */ diff --git a/mm/slab.c b/mm/slab.c index 284ab737faee..2915d912e89a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -990,10 +990,8 @@ static void cpuup_canceled(long cpu) /* cpu is dead; no one can alloc from it. */ nc = per_cpu_ptr(cachep->cpu_cache, cpu); - if (nc) { - free_block(cachep, nc->entry, nc->avail, node, &list); - nc->avail = 0; - } + free_block(cachep, nc->entry, nc->avail, node, &list); + nc->avail = 0; if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); @@ -1674,8 +1672,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) { struct page *page, *n; - list_for_each_entry_safe(page, n, list, lru) { - list_del(&page->lru); + list_for_each_entry_safe(page, n, list, slab_list) { + list_del(&page->slab_list); slab_destroy(cachep, page); } } @@ -2231,8 +2229,8 @@ static int drain_freelist(struct kmem_cache *cache, goto out; } - page = list_entry(p, struct page, lru); - list_del(&page->lru); + page = list_entry(p, struct page, slab_list); + list_del(&page->slab_list); n->free_slabs--; n->total_slabs--; /* @@ -2691,13 +2689,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) if (!page) return; - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&page->slab_list); n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); n->total_slabs++; if (!page->active) { - list_add_tail(&page->lru, &(n->slabs_free)); + list_add_tail(&page->slab_list, &n->slabs_free); n->free_slabs++; } else fixup_slab_list(cachep, n, page, &list); @@ -2806,9 +2804,9 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, void **list) { /* move slabp to correct slabp list: */ - list_del(&page->lru); + list_del(&page->slab_list); if (page->active == cachep->num) { - list_add(&page->lru, &n->slabs_full); + list_add(&page->slab_list, &n->slabs_full); if (OBJFREELIST_SLAB(cachep)) { #if DEBUG /* Poisoning will be done without holding the lock */ @@ -2822,7 +2820,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, page->freelist = NULL; } } else - list_add(&page->lru, &n->slabs_partial); + list_add(&page->slab_list, &n->slabs_partial); } /* Try to find non-pfmemalloc slab if needed */ @@ -2845,20 +2843,20 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, } /* Move pfmemalloc slab to the end of list to speed up next search */ - list_del(&page->lru); + list_del(&page->slab_list); if (!page->active) { - list_add_tail(&page->lru, &n->slabs_free); + list_add_tail(&page->slab_list, &n->slabs_free); n->free_slabs++; } else - list_add_tail(&page->lru, &n->slabs_partial); + list_add_tail(&page->slab_list, &n->slabs_partial); - list_for_each_entry(page, &n->slabs_partial, lru) { + list_for_each_entry(page, &n->slabs_partial, slab_list) { if (!PageSlabPfmemalloc(page)) return page; } n->free_touched = 1; - list_for_each_entry(page, &n->slabs_free, lru) { + list_for_each_entry(page, &n->slabs_free, slab_list) { if (!PageSlabPfmemalloc(page)) { n->free_slabs--; return page; @@ -2873,11 +2871,12 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) struct page *page; assert_spin_locked(&n->list_lock); - page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); + page = list_first_entry_or_null(&n->slabs_partial, struct page, + slab_list); if (!page) { n->free_touched = 1; page = list_first_entry_or_null(&n->slabs_free, struct page, - lru); + slab_list); if (page) n->free_slabs--; } @@ -3378,29 +3377,29 @@ static void free_block(struct kmem_cache *cachep, void **objpp, objp = objpp[i]; page = virt_to_head_page(objp); - list_del(&page->lru); + list_del(&page->slab_list); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ if (page->active == 0) { - list_add(&page->lru, &n->slabs_free); + list_add(&page->slab_list, &n->slabs_free); n->free_slabs++; } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. */ - list_add_tail(&page->lru, &n->slabs_partial); + list_add_tail(&page->slab_list, &n->slabs_partial); } } while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { n->free_objects -= cachep->num; - page = list_last_entry(&n->slabs_free, struct page, lru); - list_move(&page->lru, list); + page = list_last_entry(&n->slabs_free, struct page, slab_list); + list_move(&page->slab_list, list); n->free_slabs--; n->total_slabs--; } @@ -3438,7 +3437,7 @@ free_done: int i = 0; struct page *page; - list_for_each_entry(page, &n->slabs_free, lru) { + list_for_each_entry(page, &n->slabs_free, slab_list) { BUG_ON(page->active); i++; @@ -4292,8 +4291,12 @@ static int leaks_show(struct seq_file *m, void *p) * whole processing. */ do { - set_store_user_clean(cachep); drain_cpu_caches(cachep); + /* + * drain_cpu_caches() could make kmemleak_object and + * debug_objects_cache dirty, so reset afterwards. + */ + set_store_user_clean(cachep); x[1] = 0; @@ -4302,9 +4305,9 @@ static int leaks_show(struct seq_file *m, void *p) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(page, &n->slabs_full, lru) + list_for_each_entry(page, &n->slabs_full, slab_list) handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) + list_for_each_entry(page, &n->slabs_partial, slab_list) handle_slab(x, cachep, page); spin_unlock_irq(&n->list_lock); } diff --git a/mm/slob.c b/mm/slob.c index 307c2c9feb44..84aefd9b91ee 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -112,13 +112,13 @@ static inline int slob_page_free(struct page *sp) static void set_slob_page_free(struct page *sp, struct list_head *list) { - list_add(&sp->lru, list); + list_add(&sp->slab_list, list); __SetPageSlobFree(sp); } static inline void clear_slob_page_free(struct page *sp) { - list_del(&sp->lru); + list_del(&sp->slab_list); __ClearPageSlobFree(sp); } @@ -213,13 +213,26 @@ static void slob_free_pages(void *b, int order) } /* - * Allocate a slob block within a given slob_page sp. + * slob_page_alloc() - Allocate a slob block within a given slob_page sp. + * @sp: Page to look in. + * @size: Size of the allocation. + * @align: Allocation alignment. + * @page_removed_from_list: Return parameter. + * + * Tries to find a chunk of memory at least @size bytes big within @page. + * + * Return: Pointer to memory if allocated, %NULL otherwise. If the + * allocation fills up @page then the page is removed from the + * freelist, in this case @page_removed_from_list will be set to + * true (set to false otherwise). */ -static void *slob_page_alloc(struct page *sp, size_t size, int align) +static void *slob_page_alloc(struct page *sp, size_t size, int align, + bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); + *page_removed_from_list = false; for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); @@ -254,8 +267,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align) } sp->units -= units; - if (!sp->units) + if (!sp->units) { clear_slob_page_free(sp); + *page_removed_from_list = true; + } return cur; } if (slob_last(cur)) @@ -269,10 +284,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align) static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) { struct page *sp; - struct list_head *prev; struct list_head *slob_list; slob_t *b = NULL; unsigned long flags; + bool _unused; if (size < SLOB_BREAK1) slob_list = &free_slob_small; @@ -283,7 +298,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, lru) { + list_for_each_entry(sp, slob_list, slab_list) { + bool page_removed_from_list = false; #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -296,18 +312,25 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (sp->units < SLOB_UNITS(size)) continue; - /* Attempt to alloc */ - prev = sp->lru.prev; - b = slob_page_alloc(sp, size, align); + b = slob_page_alloc(sp, size, align, &page_removed_from_list); if (!b) continue; - /* Improve fragment distribution and reduce our average - * search time by starting our next search here. (see - * Knuth vol 1, sec 2.5, pg 449) */ - if (prev != slob_list->prev && - slob_list->next != prev->next) - list_move_tail(slob_list, prev->next); + /* + * If slob_page_alloc() removed sp from the list then we + * cannot call list functions on sp. If so allocation + * did not fragment the page anyway so optimisation is + * unnecessary. + */ + if (!page_removed_from_list) { + /* + * Improve fragment distribution and reduce our average + * search time by starting our next search here. (see + * Knuth vol 1, sec 2.5, pg 449) + */ + if (!list_is_first(&sp->slab_list, slob_list)) + list_rotate_to_front(&sp->slab_list, slob_list); + } break; } spin_unlock_irqrestore(&slob_lock, flags); @@ -323,10 +346,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); sp->freelist = b; - INIT_LIST_HEAD(&sp->lru); + INIT_LIST_HEAD(&sp->slab_list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align); + b = slob_page_alloc(sp, size, align, &_unused); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } diff --git a/mm/slub.c b/mm/slub.c index 6b28cd2b5a58..e6ce13c54cb0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -58,10 +58,11 @@ * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not - * on any list. The processor that froze the slab is the one who can - * perform list operations on the page. Other processors may put objects - * onto the freelist but the processor that froze the slab is the only - * one that can retrieve the objects from the page's freelist. + * on any list except per cpu partial list. The processor that froze the + * slab is the one who can perform list operations on the page. Other + * processors may put objects onto the freelist but the processor that + * froze the slab is the only one that can retrieve the objects from the + * page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -1014,7 +1015,7 @@ static void add_full(struct kmem_cache *s, return; lockdep_assert_held(&n->list_lock); - list_add(&page->lru, &n->full); + list_add(&page->slab_list, &n->full); } static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) @@ -1023,7 +1024,7 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct return; lockdep_assert_held(&n->list_lock); - list_del(&page->lru); + list_del(&page->slab_list); } /* Tracking of the number of slabs for debugging purposes */ @@ -1764,9 +1765,9 @@ __add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) - list_add_tail(&page->lru, &n->partial); + list_add_tail(&page->slab_list, &n->partial); else - list_add(&page->lru, &n->partial); + list_add(&page->slab_list, &n->partial); } static inline void add_partial(struct kmem_cache_node *n, @@ -1780,7 +1781,7 @@ static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { lockdep_assert_held(&n->list_lock); - list_del(&page->lru); + list_del(&page->slab_list); n->nr_partial--; } @@ -1854,7 +1855,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, return NULL; spin_lock(&n->list_lock); - list_for_each_entry_safe(page, page2, &n->partial, lru) { + list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; if (!pfmemalloc_match(page, flags)) @@ -1942,7 +1943,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, } } } while (read_mems_allowed_retry(cpuset_mems_cookie)); -#endif +#endif /* CONFIG_NUMA */ return NULL; } @@ -2240,7 +2241,7 @@ static void unfreeze_partials(struct kmem_cache *s, discard_slab(s, page); stat(s, FREE_SLAB); } -#endif +#endif /* CONFIG_SLUB_CPU_PARTIAL */ } /* @@ -2299,7 +2300,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) local_irq_restore(flags); } preempt_enable(); -#endif +#endif /* CONFIG_SLUB_CPU_PARTIAL */ } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -2398,7 +2399,7 @@ static unsigned long count_partial(struct kmem_cache_node *n, struct page *page; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) + list_for_each_entry(page, &n->partial, slab_list) x += get_count(page); spin_unlock_irqrestore(&n->list_lock, flags); return x; @@ -2804,7 +2805,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif -#endif +#endif /* CONFIG_NUMA */ /* * Slow path handling. This may still be called frequently since objects @@ -2903,8 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * then add it. */ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - if (kmem_cache_debug(s)) - remove_full(s, n, page); + remove_full(s, n, page); add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -3696,10 +3696,10 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &n->partial, lru) { + list_for_each_entry_safe(page, h, &n->partial, slab_list) { if (!page->inuse) { remove_partial(n, page); - list_add(&page->lru, &discard); + list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, "Objects remaining in %s on __kmem_cache_shutdown()"); @@ -3707,7 +3707,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) } spin_unlock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &discard, lru) + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } @@ -3839,7 +3839,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return ret; } EXPORT_SYMBOL(__kmalloc_node); -#endif +#endif /* CONFIG_NUMA */ #ifdef CONFIG_HARDENED_USERCOPY /* @@ -3987,7 +3987,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) * Note that concurrent frees may occur while we hold the * list_lock. page->inuse here is the upper limit. */ - list_for_each_entry_safe(page, t, &n->partial, lru) { + list_for_each_entry_safe(page, t, &n->partial, slab_list) { int free = page->objects - page->inuse; /* Do not reread page->inuse */ @@ -3997,10 +3997,10 @@ int __kmem_cache_shrink(struct kmem_cache *s) BUG_ON(free <= 0); if (free == page->objects) { - list_move(&page->lru, &discard); + list_move(&page->slab_list, &discard); n->nr_partial--; } else if (free <= SHRINK_PROMOTE_MAX) - list_move(&page->lru, promote + free - 1); + list_move(&page->slab_list, promote + free - 1); } /* @@ -4013,7 +4013,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, &discard, lru) + list_for_each_entry_safe(page, t, &discard, slab_list) discard_slab(s, page); if (slabs_node(s, node)) @@ -4057,7 +4057,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) */ slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); } -#endif +#endif /* CONFIG_MEMCG */ static int slab_mem_going_offline_callback(void *arg) { @@ -4205,11 +4205,11 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) for_each_kmem_cache_node(s, node, n) { struct page *p; - list_for_each_entry(p, &n->partial, lru) + list_for_each_entry(p, &n->partial, slab_list) p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) + list_for_each_entry(p, &n->full, slab_list) p->slab_cache = s; #endif } @@ -4426,7 +4426,7 @@ static int validate_slab_node(struct kmem_cache *s, spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) { + list_for_each_entry(page, &n->partial, slab_list) { validate_slab_slab(s, page, map); count++; } @@ -4437,7 +4437,7 @@ static int validate_slab_node(struct kmem_cache *s, if (!(s->flags & SLAB_STORE_USER)) goto out; - list_for_each_entry(page, &n->full, lru) { + list_for_each_entry(page, &n->full, slab_list) { validate_slab_slab(s, page, map); count++; } @@ -4633,9 +4633,9 @@ static int list_locations(struct kmem_cache *s, char *buf, continue; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) + list_for_each_entry(page, &n->partial, slab_list) process_slab(&t, s, page, alloc, map); - list_for_each_entry(page, &n->full, lru) + list_for_each_entry(page, &n->full, slab_list) process_slab(&t, s, page, alloc, map); spin_unlock_irqrestore(&n->list_lock, flags); } @@ -4690,7 +4690,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf, "No data\n"); return len; } -#endif +#endif /* CONFIG_SLUB_DEBUG */ #ifdef SLUB_RESILIENCY_TEST static void __init resiliency_test(void) @@ -4750,7 +4750,7 @@ static void __init resiliency_test(void) #ifdef CONFIG_SYSFS static void resiliency_test(void) {}; #endif -#endif +#endif /* SLUB_RESILIENCY_TEST */ #ifdef CONFIG_SYSFS enum slab_stat_type { @@ -5103,6 +5103,14 @@ static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) SLAB_ATTR_RO(cache_dma); #endif +#ifdef CONFIG_ZONE_DMA32 +static ssize_t cache_dma32_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA32)); +} +SLAB_ATTR_RO(cache_dma32); +#endif + static ssize_t usersize_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%u\n", s->usersize); @@ -5407,7 +5415,7 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); -#endif +#endif /* CONFIG_SLUB_STATS */ static struct attribute *slab_attrs[] = { &slab_size_attr.attr, @@ -5443,6 +5451,9 @@ static struct attribute *slab_attrs[] = { #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif +#ifdef CONFIG_ZONE_DMA32 + &cache_dma32_attr.attr, +#endif #ifdef CONFIG_NUMA &remote_node_defrag_ratio_attr.attr, #endif @@ -5608,7 +5619,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) if (buffer) free_page((unsigned long)buffer); -#endif +#endif /* CONFIG_MEMCG */ } static void kmem_cache_release(struct kobject *k) diff --git a/mm/sparse.c b/mm/sparse.c index 56e057c432f9..fd13166949b5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -684,10 +684,18 @@ static void free_map_bootmem(struct page *memmap) #endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -/* - * returns the number of sections whose mem_maps were properly - * set. If this is <=0, then that means that the passed-in - * map was not consumed and must be freed. +/** + * sparse_add_one_section - add a memory section + * @nid: The node to add section on + * @start_pfn: start pfn of the memory range + * @altmap: device page map + * + * This is only intended for hotplug. + * + * Return: + * * 0 - On success. + * * -EEXIST - Section has been present. + * * -ENOMEM - Out of memory. */ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, struct vmem_altmap *altmap) diff --git a/mm/swap.c b/mm/swap.c index 301ed4e04320..3a75722e68a9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -867,7 +867,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); /* * Page becomes evictable in two ways: - * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()]. + * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. * 2) Before acquiring LRU lock to put the page to correct LRU and then * a) do PageLRU check with lock [check_move_unevictable_pages] * b) do PageLRU check before lock [clear_page_mlock] diff --git a/mm/swap_state.c b/mm/swap_state.c index 85245fdec8d9..eb714165afd2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); set_page_private(page + i, entry.val + i); - xas_store(&xas, page + i); + xas_store(&xas, page); xas_next(&xas); } address_space->nrpages += nr; @@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, NULL); - VM_BUG_ON_PAGE(entry != page + i, entry); + VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } diff --git a/mm/swapfile.c b/mm/swapfile.c index cf63b5f01adf..be36f6fe2f8c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1783,8 +1783,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); mem_cgroup_commit_charge(page, memcg, true, false); @@ -1793,6 +1791,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); /* * Move the page to the active list so it is not diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index d59b5a73dfb3..9932d5755e4c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -271,8 +271,7 @@ retry: */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; - hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, - idx, dst_addr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; diff --git a/mm/util.c b/mm/util.c index 43a2984bccaa..e2e4f8c3fa12 100644 --- a/mm/util.c +++ b/mm/util.c @@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) + int nr_pages, unsigned int gup_flags, + struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, pages, - write ? FOLL_WRITE : 0); + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -652,7 +652,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - long free, allowed, reserve; + long allowed; VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < -(s64)vm_committed_as_batch * num_online_cpus(), @@ -667,51 +667,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_zone_page_state(NR_FREE_PAGES); - free += global_node_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_node_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_node_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Part of the kernel memory, which can be released - * under memory pressure. - */ - free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) + if (pages > totalram_pages() + total_swap_pages) goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; + return 0; } allowed = vm_commit_limit(); @@ -725,7 +683,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Don't let a single process grow so big a user can't recover */ if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e5e9e1fcac01..22caff913f7d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -32,6 +32,7 @@ #include <linux/compiler.h> #include <linux/llist.h> #include <linux/bitops.h> +#include <linux/rbtree_augmented.h> #include <linux/uaccess.h> #include <asm/tlbflush.h> @@ -324,6 +325,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ +#define DEBUG_AUGMENT_PROPAGATE_CHECK 0 +#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 + #define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 @@ -332,14 +336,74 @@ static DEFINE_SPINLOCK(vmap_area_lock); LIST_HEAD(vmap_area_list); static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; +static bool vmap_initialized __read_mostly; + +/* + * This kmem_cache is used for vmap_area objects. Instead of + * allocating from slab we reuse an object from this cache to + * make things faster. Especially in "no edge" splitting of + * free block. + */ +static struct kmem_cache *vmap_area_cachep; + +/* + * This linked list is used in pair with free_vmap_area_root. + * It gives O(1) access to prev/next to perform fast coalescing. + */ +static LIST_HEAD(free_vmap_area_list); + +/* + * This augment red-black tree represents the free vmap space. + * All vmap_area objects in this tree are sorted by va->va_start + * address. It is used for allocation and merging when a vmap + * object is released. + * + * Each vmap_area node contains a maximum available free block + * of its sub-tree, right or left. Therefore it is possible to + * find a lowest match of free area. + */ +static struct rb_root free_vmap_area_root = RB_ROOT; + +static __always_inline unsigned long +va_size(struct vmap_area *va) +{ + return (va->va_end - va->va_start); +} + +static __always_inline unsigned long +get_subtree_max_size(struct rb_node *node) +{ + struct vmap_area *va; + + va = rb_entry_safe(node, struct vmap_area, rb_node); + return va ? va->subtree_max_size : 0; +} + +/* + * Gets called when remove the node and rotate. + */ +static __always_inline unsigned long +compute_subtree_max_size(struct vmap_area *va) +{ + return max3(va_size(va), + get_subtree_max_size(va->rb_node.rb_left), + get_subtree_max_size(va->rb_node.rb_right)); +} + +RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, + compute_subtree_max_size) -/* The vmap cache globals are protected by vmap_area_lock */ -static struct rb_node *free_vmap_cache; -static unsigned long cached_hole_size; -static unsigned long cached_vstart; -static unsigned long cached_align; +static void purge_vmap_area_lazy(void); +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); +static unsigned long lazy_max_pages(void); + +static atomic_long_t nr_vmalloc_pages; -static unsigned long vmap_area_pcpu_hole; +unsigned long vmalloc_nr_pages(void) +{ + return atomic_long_read(&nr_vmalloc_pages); +} static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -360,41 +424,610 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) return NULL; } -static void __insert_vmap_area(struct vmap_area *va) -{ - struct rb_node **p = &vmap_area_root.rb_node; - struct rb_node *parent = NULL; - struct rb_node *tmp; +/* + * This function returns back addresses of parent node + * and its left or right link for further processing. + */ +static __always_inline struct rb_node ** +find_va_links(struct vmap_area *va, + struct rb_root *root, struct rb_node *from, + struct rb_node **parent) +{ + struct vmap_area *tmp_va; + struct rb_node **link; + + if (root) { + link = &root->rb_node; + if (unlikely(!*link)) { + *parent = NULL; + return link; + } + } else { + link = &from; + } - while (*p) { - struct vmap_area *tmp_va; + /* + * Go to the bottom of the tree. When we hit the last point + * we end up with parent rb_node and correct direction, i name + * it link, where the new va->rb_node will be attached to. + */ + do { + tmp_va = rb_entry(*link, struct vmap_area, rb_node); - parent = *p; - tmp_va = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp_va->va_end) - p = &(*p)->rb_left; - else if (va->va_end > tmp_va->va_start) - p = &(*p)->rb_right; + /* + * During the traversal we also do some sanity check. + * Trigger the BUG() if there are sides(left/right) + * or full overlaps. + */ + if (va->va_start < tmp_va->va_end && + va->va_end <= tmp_va->va_start) + link = &(*link)->rb_left; + else if (va->va_end > tmp_va->va_start && + va->va_start >= tmp_va->va_end) + link = &(*link)->rb_right; else BUG(); + } while (*link); + + *parent = &tmp_va->rb_node; + return link; +} + +static __always_inline struct list_head * +get_va_next_sibling(struct rb_node *parent, struct rb_node **link) +{ + struct list_head *list; + + if (unlikely(!parent)) + /* + * The red-black tree where we try to find VA neighbors + * before merging or inserting is empty, i.e. it means + * there is no free vmap space. Normally it does not + * happen but we handle this case anyway. + */ + return NULL; + + list = &rb_entry(parent, struct vmap_area, rb_node)->list; + return (&parent->rb_right == link ? list->next : list); +} + +static __always_inline void +link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, struct list_head *head) +{ + /* + * VA is still not in the list, but we can + * identify its future previous list_head node. + */ + if (likely(parent)) { + head = &rb_entry(parent, struct vmap_area, rb_node)->list; + if (&parent->rb_right != link) + head = head->prev; } - rb_link_node(&va->rb_node, parent, p); - rb_insert_color(&va->rb_node, &vmap_area_root); + /* Insert to the rb-tree */ + rb_link_node(&va->rb_node, parent, link); + if (root == &free_vmap_area_root) { + /* + * Some explanation here. Just perform simple insertion + * to the tree. We do not set va->subtree_max_size to + * its current size before calling rb_insert_augmented(). + * It is because of we populate the tree from the bottom + * to parent levels when the node _is_ in the tree. + * + * Therefore we set subtree_max_size to zero after insertion, + * to let __augment_tree_propagate_from() puts everything to + * the correct order later on. + */ + rb_insert_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + va->subtree_max_size = 0; + } else { + rb_insert_color(&va->rb_node, root); + } - /* address-sort this list */ - tmp = rb_prev(&va->rb_node); - if (tmp) { - struct vmap_area *prev; - prev = rb_entry(tmp, struct vmap_area, rb_node); - list_add_rcu(&va->list, &prev->list); - } else - list_add_rcu(&va->list, &vmap_area_list); + /* Address-sort this list */ + list_add(&va->list, head); } -static void purge_vmap_area_lazy(void); +static __always_inline void +unlink_va(struct vmap_area *va, struct rb_root *root) +{ + /* + * During merging a VA node can be empty, therefore + * not linked with the tree nor list. Just check it. + */ + if (!RB_EMPTY_NODE(&va->rb_node)) { + if (root == &free_vmap_area_root) + rb_erase_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + else + rb_erase(&va->rb_node, root); -static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); + list_del(&va->list); + RB_CLEAR_NODE(&va->rb_node); + } +} + +#if DEBUG_AUGMENT_PROPAGATE_CHECK +static void +augment_tree_propagate_check(struct rb_node *n) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long size; + bool found = false; + + if (n == NULL) + return; + + va = rb_entry(n, struct vmap_area, rb_node); + size = va->subtree_max_size; + node = n; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) == size) { + node = node->rb_left; + } else { + if (va_size(va) == size) { + found = true; + break; + } + + node = node->rb_right; + } + } + + if (!found) { + va = rb_entry(n, struct vmap_area, rb_node); + pr_emerg("tree is corrupted: %lu, %lu\n", + va_size(va), va->subtree_max_size); + } + + augment_tree_propagate_check(n->rb_left); + augment_tree_propagate_check(n->rb_right); +} +#endif + +/* + * This function populates subtree_max_size from bottom to upper + * levels starting from VA point. The propagation must be done + * when VA size is modified by changing its va_start/va_end. Or + * in case of newly inserting of VA to the tree. + * + * It means that __augment_tree_propagate_from() must be called: + * - After VA has been inserted to the tree(free path); + * - After VA has been shrunk(allocation path); + * - After VA has been increased(merging path). + * + * Please note that, it does not mean that upper parent nodes + * and their subtree_max_size are recalculated all the time up + * to the root node. + * + * 4--8 + * /\ + * / \ + * / \ + * 2--2 8--8 + * + * For example if we modify the node 4, shrinking it to 2, then + * no any modification is required. If we shrink the node 2 to 1 + * its subtree_max_size is updated only, and set to 1. If we shrink + * the node 8 to 6, then its subtree_max_size is set to 6 and parent + * node becomes 4--6. + */ +static __always_inline void +augment_tree_propagate_from(struct vmap_area *va) +{ + struct rb_node *node = &va->rb_node; + unsigned long new_va_sub_max_size; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + new_va_sub_max_size = compute_subtree_max_size(va); + + /* + * If the newly calculated maximum available size of the + * subtree is equal to the current one, then it means that + * the tree is propagated correctly. So we have to stop at + * this point to save cycles. + */ + if (va->subtree_max_size == new_va_sub_max_size) + break; + + va->subtree_max_size = new_va_sub_max_size; + node = rb_parent(&va->rb_node); + } + +#if DEBUG_AUGMENT_PROPAGATE_CHECK + augment_tree_propagate_check(free_vmap_area_root.rb_node); +#endif +} + +static void +insert_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + link = find_va_links(va, root, NULL, &parent); + link_va(va, root, parent, link, head); +} + +static void +insert_vmap_area_augment(struct vmap_area *va, + struct rb_node *from, struct rb_root *root, + struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + if (from) + link = find_va_links(va, NULL, from, &parent); + else + link = find_va_links(va, root, NULL, &parent); + + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); +} + +/* + * Merge de-allocated chunk of VA memory with previous + * and next free blocks. If coalesce is not done a new + * free area is inserted. If VA has been merged, it is + * freed. + */ +static __always_inline void +merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct vmap_area *sibling; + struct list_head *next; + struct rb_node **link; + struct rb_node *parent; + bool merged = false; + + /* + * Find a place in the tree where VA potentially will be + * inserted, unless it is merged with its sibling/siblings. + */ + link = find_va_links(va, root, NULL, &parent); + + /* + * Get next node of VA to check if merging can be done. + */ + next = get_va_next_sibling(parent, link); + if (unlikely(next == NULL)) + goto insert; + + /* + * start end + * | | + * |<------VA------>|<-----Next----->| + * | | + * start end + */ + if (next != head) { + sibling = list_entry(next, struct vmap_area, list); + if (sibling->va_start == va->va_end) { + sibling->va_start = va->va_start; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + /* Point to the new merged area. */ + va = sibling; + merged = true; + } + } + + /* + * start end + * | | + * |<-----Prev----->|<------VA------>| + * | | + * start end + */ + if (next->prev != head) { + sibling = list_entry(next->prev, struct vmap_area, list); + if (sibling->va_end == va->va_start) { + sibling->va_end = va->va_end; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + return; + } + } + +insert: + if (!merged) { + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); + } +} + +static __always_inline bool +is_within_this_va(struct vmap_area *va, unsigned long size, + unsigned long align, unsigned long vstart) +{ + unsigned long nva_start_addr; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Can be overflowed due to big size or alignment. */ + if (nva_start_addr + size < nva_start_addr || + nva_start_addr < vstart) + return false; + + return (nva_start_addr + size <= va->va_end); +} + +/* + * Find the first free block(lowest start address) in the tree, + * that will accomplish the request corresponding to passing + * parameters. + */ +static __always_inline struct vmap_area * +find_vmap_lowest_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long length; + + /* Start from the root. */ + node = free_vmap_area_root.rb_node; + + /* Adjust the search size for alignment overhead. */ + length = size + align - 1; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) >= length && + vstart < va->va_start) { + node = node->rb_left; + } else { + if (is_within_this_va(va, size, align, vstart)) + return va; + + /* + * Does not make sense to go deeper towards the right + * sub-tree if it does not have a free block that is + * equal or bigger to the requested search length. + */ + if (get_subtree_max_size(node->rb_right) >= length) { + node = node->rb_right; + continue; + } + + /* + * OK. We roll back and find the fist right sub-tree, + * that will satisfy the search criteria. It can happen + * only once due to "vstart" restriction. + */ + while ((node = rb_parent(node))) { + va = rb_entry(node, struct vmap_area, rb_node); + if (is_within_this_va(va, size, align, vstart)) + return va; + + if (get_subtree_max_size(node->rb_right) >= length && + vstart <= va->va_start) { + node = node->rb_right; + break; + } + } + } + } + + return NULL; +} + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK +#include <linux/random.h> + +static struct vmap_area * +find_vmap_lowest_linear_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + + list_for_each_entry(va, &free_vmap_area_list, list) { + if (!is_within_this_va(va, size, align, vstart)) + continue; + + return va; + } + + return NULL; +} + +static void +find_vmap_lowest_match_check(unsigned long size) +{ + struct vmap_area *va_1, *va_2; + unsigned long vstart; + unsigned int rnd; + + get_random_bytes(&rnd, sizeof(rnd)); + vstart = VMALLOC_START + rnd; + + va_1 = find_vmap_lowest_match(size, 1, vstart); + va_2 = find_vmap_lowest_linear_match(size, 1, vstart); + + if (va_1 != va_2) + pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", + va_1, va_2, vstart); +} +#endif + +enum fit_type { + NOTHING_FIT = 0, + FL_FIT_TYPE = 1, /* full fit */ + LE_FIT_TYPE = 2, /* left edge fit */ + RE_FIT_TYPE = 3, /* right edge fit */ + NE_FIT_TYPE = 4 /* no edge fit */ +}; + +static __always_inline enum fit_type +classify_va_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size) +{ + enum fit_type type; + + /* Check if it is within VA. */ + if (nva_start_addr < va->va_start || + nva_start_addr + size > va->va_end) + return NOTHING_FIT; + + /* Now classify. */ + if (va->va_start == nva_start_addr) { + if (va->va_end == nva_start_addr + size) + type = FL_FIT_TYPE; + else + type = LE_FIT_TYPE; + } else if (va->va_end == nva_start_addr + size) { + type = RE_FIT_TYPE; + } else { + type = NE_FIT_TYPE; + } + + return type; +} + +static __always_inline int +adjust_va_to_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size, + enum fit_type type) +{ + struct vmap_area *lva; + + if (type == FL_FIT_TYPE) { + /* + * No need to split VA, it fully fits. + * + * | | + * V NVA V + * |---------------| + */ + unlink_va(va, &free_vmap_area_root); + kmem_cache_free(vmap_area_cachep, va); + } else if (type == LE_FIT_TYPE) { + /* + * Split left edge of fit VA. + * + * | | + * V NVA V R + * |-------|-------| + */ + va->va_start += size; + } else if (type == RE_FIT_TYPE) { + /* + * Split right edge of fit VA. + * + * | | + * L V NVA V + * |-------|-------| + */ + va->va_end = nva_start_addr; + } else if (type == NE_FIT_TYPE) { + /* + * Split no edge of fit VA. + * + * | | + * L V NVA V R + * |---|-------|---| + */ + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); + if (unlikely(!lva)) + return -1; + + /* + * Build the remainder. + */ + lva->va_start = va->va_start; + lva->va_end = nva_start_addr; + + /* + * Shrink this VA to remaining size. + */ + va->va_start = nva_start_addr + size; + } else { + return -1; + } + + if (type != FL_FIT_TYPE) { + augment_tree_propagate_from(va); + + if (type == NE_FIT_TYPE) + insert_vmap_area_augment(lva, &va->rb_node, + &free_vmap_area_root, &free_vmap_area_list); + } + + return 0; +} + +/* + * Returns a start address of the newly allocated area, if success. + * Otherwise a vend is returned that indicates failure. + */ +static __always_inline unsigned long +__alloc_vmap_area(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int node) +{ + unsigned long nva_start_addr; + struct vmap_area *va; + enum fit_type type; + int ret; + + va = find_vmap_lowest_match(size, align, vstart); + if (unlikely(!va)) + return vend; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Classify what we have found. */ + type = classify_va_fit_type(va, nva_start_addr, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + return vend; + + /* Update the free vmap_area. */ + ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); + if (ret) + return vend; + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK + find_vmap_lowest_match_check(size); +#endif + + return nva_start_addr; +} /* * Allocate a region of KVA of the specified size and alignment, within the @@ -406,18 +1039,19 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask) { struct vmap_area *va; - struct rb_node *n; unsigned long addr; int purged = 0; - struct vmap_area *first; BUG_ON(!size); BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + if (unlikely(!vmap_initialized)) + return ERR_PTR(-EBUSY); + might_sleep(); - va = kmalloc_node(sizeof(struct vmap_area), + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -430,87 +1064,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, retry: spin_lock(&vmap_area_lock); - /* - * Invalidate cache if we have more permissive parameters. - * cached_hole_size notes the largest hole noticed _below_ - * the vmap_area cached in free_vmap_cache: if size fits - * into that hole, we want to scan from vstart to reuse - * the hole instead of allocating above free_vmap_cache. - * Note that __free_vmap_area may update free_vmap_cache - * without updating cached_hole_size or cached_align. - */ - if (!free_vmap_cache || - size < cached_hole_size || - vstart < cached_vstart || - align < cached_align) { -nocache: - cached_hole_size = 0; - free_vmap_cache = NULL; - } - /* record if we encounter less permissive parameters */ - cached_vstart = vstart; - cached_align = align; - - /* find starting point for our search */ - if (free_vmap_cache) { - first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - addr = ALIGN(first->va_end, align); - if (addr < vstart) - goto nocache; - if (addr + size < addr) - goto overflow; - - } else { - addr = ALIGN(vstart, align); - if (addr + size < addr) - goto overflow; - - n = vmap_area_root.rb_node; - first = NULL; - - while (n) { - struct vmap_area *tmp; - tmp = rb_entry(n, struct vmap_area, rb_node); - if (tmp->va_end >= addr) { - first = tmp; - if (tmp->va_start <= addr) - break; - n = n->rb_left; - } else - n = n->rb_right; - } - - if (!first) - goto found; - } - - /* from the starting point, walk areas until a suitable hole is found */ - while (addr + size > first->va_start && addr + size <= vend) { - if (addr + cached_hole_size < first->va_start) - cached_hole_size = first->va_start - addr; - addr = ALIGN(first->va_end, align); - if (addr + size < addr) - goto overflow; - - if (list_is_last(&first->list, &vmap_area_list)) - goto found; - first = list_next_entry(first, list); - } - -found: /* - * Check also calculated address against the vstart, - * because it can be 0 because of big align request. + * If an allocation fails, the "vend" address is + * returned. Therefore trigger the overflow path. */ - if (addr + size > vend || addr < vstart) + addr = __alloc_vmap_area(size, align, vstart, vend, node); + if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->flags = 0; - __insert_vmap_area(va); - free_vmap_cache = &va->rb_node; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); @@ -539,7 +1106,8 @@ overflow: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", size); - kfree(va); + + kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } @@ -559,35 +1127,16 @@ static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); - if (free_vmap_cache) { - if (va->va_end < cached_vstart) { - free_vmap_cache = NULL; - } else { - struct vmap_area *cache; - cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - if (va->va_start <= cache->va_start) { - free_vmap_cache = rb_prev(&va->rb_node); - /* - * We don't try to update cached_hole_size or - * cached_align, but it won't go very wrong. - */ - } - } - } - rb_erase(&va->rb_node, &vmap_area_root); - RB_CLEAR_NODE(&va->rb_node); - list_del_rcu(&va->list); - /* - * Track the highest possible candidate for pcpu area - * allocation. Areas outside of vmalloc area can be returned - * here too, consider only end addresses which fall inside - * vmalloc area proper. + * Remove from the busy tree/list. */ - if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) - vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + unlink_va(va, &vmap_area_root); - kfree_rcu(va, rcu_head); + /* + * Merge VA with its neighbors, otherwise just add it. + */ + merge_or_add_vmap_area(va, + &free_vmap_area_root, &free_vmap_area_list); } /* @@ -633,7 +1182,7 @@ static unsigned long lazy_max_pages(void) return log * (32UL * 1024 * 1024 / PAGE_SIZE); } -static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual criticial section protected @@ -651,7 +1200,7 @@ static void purge_fragmented_blocks_allcpus(void); */ void set_iounmap_nonlazy(void) { - atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); + atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); } /* @@ -659,34 +1208,40 @@ void set_iounmap_nonlazy(void) */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { + unsigned long resched_threshold; struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - bool do_free = false; lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); + if (unlikely(valist == NULL)) + return false; + + /* + * TODO: to calculate a flush range without looping. + * The list can be up to lazy_max_pages() elements. + */ llist_for_each_entry(va, valist, purge_list) { if (va->va_start < start) start = va->va_start; if (va->va_end > end) end = va->va_end; - do_free = true; } - if (!do_free) - return false; - flush_tlb_kernel_range(start, end); + resched_threshold = lazy_max_pages() << 1; spin_lock(&vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { - int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; __free_vmap_area(va); - atomic_sub(nr, &vmap_lazy_nr); - cond_resched_lock(&vmap_area_lock); + atomic_long_sub(nr, &vmap_lazy_nr); + + if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) + cond_resched_lock(&vmap_area_lock); } spin_unlock(&vmap_area_lock); return true; @@ -722,10 +1277,10 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - int nr_lazy; + unsigned long nr_lazy; - nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, - &vmap_lazy_nr); + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> + PAGE_SHIFT, &vmap_lazy_nr); /* After this point, we may free va at any time */ llist_add(&va->purge_list, &vmap_purge_list); @@ -788,8 +1343,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) -static bool vmap_initialized __read_mostly = false; - struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1250,12 +1803,58 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) vm_area_add_early(vm); } +static void vmap_init_free_space(void) +{ + unsigned long vmap_start = 1; + const unsigned long vmap_end = ULONG_MAX; + struct vmap_area *busy, *free; + + /* + * B F B B B F + * -|-----|.....|-----|-----|-----|.....|- + * | The KVA space | + * |<--------------------------------->| + */ + list_for_each_entry(busy, &vmap_area_list, list) { + if (busy->va_start - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = busy->va_start; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } + + vmap_start = busy->va_end; + } + + if (vmap_end - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = vmap_end; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } +} + void __init vmalloc_init(void) { struct vmap_area *va; struct vm_struct *tmp; int i; + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; @@ -1270,16 +1869,21 @@ void __init vmalloc_init(void) /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; - __insert_vmap_area(va); + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - vmap_area_pcpu_hole = VMALLOC_END; - + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); vmap_initialized = true; } @@ -1478,6 +2082,22 @@ struct vm_struct *find_vm_area(const void *addr) return NULL; } +static struct vm_struct *__remove_vm_area(struct vmap_area *va) +{ + struct vm_struct *vm = va->vm; + + spin_lock(&vmap_area_lock); + va->vm = NULL; + va->flags &= ~VM_VM_AREA; + va->flags |= VM_LAZY_FREE; + spin_unlock(&vmap_area_lock); + + kasan_free_shadow(vm); + free_unmap_vmap_area(va); + + return vm; +} + /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address @@ -1490,26 +2110,14 @@ struct vm_struct *find_vm_area(const void *addr) */ struct vm_struct *remove_vm_area(const void *addr) { + struct vm_struct *vm = NULL; struct vmap_area *va; - might_sleep(); - va = find_vmap_area((unsigned long)addr); - if (va && va->flags & VM_VM_AREA) { - struct vm_struct *vm = va->vm; - - spin_lock(&vmap_area_lock); - va->vm = NULL; - va->flags &= ~VM_VM_AREA; - va->flags |= VM_LAZY_FREE; - spin_unlock(&vmap_area_lock); - - kasan_free_shadow(vm); - free_unmap_vmap_area(va); + if (va && va->flags & VM_VM_AREA) + vm = __remove_vm_area(va); - return vm; - } - return NULL; + return vm; } static inline void set_area_direct_map(const struct vm_struct *area, @@ -1523,8 +2131,9 @@ static inline void set_area_direct_map(const struct vm_struct *area, } /* Handle removing and resetting vm mappings related to the vm_struct. */ -static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +static void vm_remove_mappings(struct vmap_area *va, int deallocate_pages) { + struct vm_struct *area = va->vm; unsigned long addr = (unsigned long)area->addr; unsigned long start = ULONG_MAX, end = 0; int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; @@ -1541,7 +2150,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) set_memory_rw(addr, area->nr_pages); } - remove_vm_area(area->addr); + __remove_vm_area(va); /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ if (!flush_reset) @@ -1581,6 +2190,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; + struct vmap_area *va; if (!addr) return; @@ -1589,17 +2199,18 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = find_vm_area(addr); - if (unlikely(!area)) { + va = find_vmap_area((unsigned long)addr); + if (unlikely(!va || !(va->flags & VM_VM_AREA))) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } + area = va->vm; debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - vm_remove_mappings(area, deallocate_pages); + vm_remove_mappings(va, deallocate_pages); if (deallocate_pages) { int i; @@ -1610,12 +2221,12 @@ static void __vunmap(const void *addr, int deallocate_pages) BUG_ON(!page); __free_pages(page, 0); } + atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); kvfree(area->pages); } kfree(area); - return; } static inline void __vfree_deferred(const void *addr) @@ -1787,12 +2398,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; } area->pages[i] = page; if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (map_vm_area(area, prot, pages)) goto fail; @@ -2471,81 +3084,64 @@ static struct vmap_area *node_to_va(struct rb_node *n) } /** - * pvm_find_next_prev - find the next and prev vmap_area surrounding @end - * @end: target address - * @pnext: out arg for the next vmap_area - * @pprev: out arg for the previous vmap_area + * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to + * @addr: target address * - * Returns: %true if either or both of next and prev are found, - * %false if no vmap_area exists - * - * Find vmap_areas end addresses of which enclose @end. ie. if not - * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + * Returns: vmap_area if it is found. If there is no such area + * the first highest(reverse order) vmap_area is returned + * i.e. va->va_start < addr && va->va_end < addr or NULL + * if there are no any areas before @addr. */ -static bool pvm_find_next_prev(unsigned long end, - struct vmap_area **pnext, - struct vmap_area **pprev) +static struct vmap_area * +pvm_find_va_enclose_addr(unsigned long addr) { - struct rb_node *n = vmap_area_root.rb_node; - struct vmap_area *va = NULL; + struct vmap_area *va, *tmp; + struct rb_node *n; + + n = free_vmap_area_root.rb_node; + va = NULL; while (n) { - va = rb_entry(n, struct vmap_area, rb_node); - if (end < va->va_end) - n = n->rb_left; - else if (end > va->va_end) + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_start <= addr) { + va = tmp; + if (tmp->va_end >= addr) + break; + n = n->rb_right; - else - break; + } else { + n = n->rb_left; + } } - if (!va) - return false; - - if (va->va_end > end) { - *pnext = va; - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); - } else { - *pprev = va; - *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); - } - return true; + return va; } /** - * pvm_determine_end - find the highest aligned address between two vmap_areas - * @pnext: in/out arg for the next vmap_area - * @pprev: in/out arg for the previous vmap_area - * @align: alignment + * pvm_determine_end_from_reverse - find the highest aligned address + * of free block below VMALLOC_END + * @va: + * in - the VA we start the search(reverse order); + * out - the VA with the highest aligned end address. * - * Returns: determined end address - * - * Find the highest aligned address between *@pnext and *@pprev below - * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned - * down address is between the end addresses of the two vmap_areas. - * - * Please note that the address returned by this function may fall - * inside *@pnext vmap_area. The caller is responsible for checking - * that. + * Returns: determined end address within vmap_area */ -static unsigned long pvm_determine_end(struct vmap_area **pnext, - struct vmap_area **pprev, - unsigned long align) +static unsigned long +pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { - const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; - if (*pnext) - addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); - else - addr = vmalloc_end; - - while (*pprev && (*pprev)->va_end > addr) { - *pnext = *pprev; - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + if (likely(*va)) { + list_for_each_entry_from_reverse((*va), + &free_vmap_area_list, list) { + addr = min((*va)->va_end & ~(align - 1), vmalloc_end); + if ((*va)->va_start < addr) + return addr; + } } - return addr; + return 0; } /** @@ -2565,12 +3161,12 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * - * Despite its complicated look, this allocator is rather simple. It - * does everything top-down and scans areas from the end looking for - * matching slot. While scanning, if any of the areas overlaps with - * existing vmap_area, the base address is pulled down to fit the - * area. Scanning is repeated till all the areas fit and then all - * necessary data structures are inserted and the result is returned. + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans free blocks from the end looking + * for matching base. While scanning, if any of the areas do not fit the + * base address is pulled down to fit the area. Scanning is repeated till + * all the areas fit and then all necessary data structures are inserted + * and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, @@ -2578,11 +3174,12 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); - struct vmap_area **vas, *prev, *next; + struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; - unsigned long base, start, end, last_end; + unsigned long base, start, size, end, last_end; bool purged = false; + enum fit_type type; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); @@ -2618,7 +3215,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, goto err_free2; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; @@ -2631,49 +3228,29 @@ retry: start = offsets[area]; end = start + sizes[area]; - if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { - base = vmalloc_end - last_end; - goto found; - } - base = pvm_determine_end(&next, &prev, align) - end; + va = pvm_find_va_enclose_addr(vmalloc_end); + base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { - BUG_ON(next && next->va_end <= base + end); - BUG_ON(prev && prev->va_end > base + end); - /* * base might have underflowed, add last_end before * comparing. */ - if (base + last_end < vmalloc_start + last_end) { - spin_unlock(&vmap_area_lock); - if (!purged) { - purge_vmap_area_lazy(); - purged = true; - goto retry; - } - goto err_free; - } + if (base + last_end < vmalloc_start + last_end) + goto overflow; /* - * If next overlaps, move base downwards so that it's - * right below next and then recheck. + * Fitting base has not been found. */ - if (next && next->va_start < base + end) { - base = pvm_determine_end(&next, &prev, align) - end; - term_area = area; - continue; - } + if (va == NULL) + goto overflow; /* - * If prev overlaps, shift down next and prev and move - * base so that it's right below new next and then - * recheck. + * If this VA does not fit, move base downwards and recheck. */ - if (prev && prev->va_end > base + start) { - next = prev; - prev = node_to_va(rb_prev(&next->rb_node)); - base = pvm_determine_end(&next, &prev, align) - end; + if (base + start < va->va_start || base + end > va->va_end) { + va = node_to_va(rb_prev(&va->rb_node)); + base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } @@ -2685,21 +3262,40 @@ retry: area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; + start = offsets[area]; end = start + sizes[area]; - pvm_find_next_prev(base + end, &next, &prev); + va = pvm_find_va_enclose_addr(base + end); } -found: + /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { - struct vmap_area *va = vas[area]; + int ret; - va->va_start = base + offsets[area]; - va->va_end = va->va_start + sizes[area]; - __insert_vmap_area(va); - } + start = base + offsets[area]; + size = sizes[area]; + + va = pvm_find_va_enclose_addr(start); + if (WARN_ON_ONCE(va == NULL)) + /* It is a BUG(), but trigger recovery instead. */ + goto recovery; + + type = classify_va_fit_type(va, start, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + /* It is a BUG(), but trigger recovery instead. */ + goto recovery; - vmap_area_pcpu_hole = base + offsets[last_area]; + ret = adjust_va_to_fit_type(va, start, size, type); + if (unlikely(ret)) + goto recovery; + + /* Allocated area. */ + va = vas[area]; + va->va_start = start; + va->va_end = start + size; + + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + } spin_unlock(&vmap_area_lock); @@ -2711,9 +3307,38 @@ found: kfree(vas); return vms; +recovery: + /* Remove previously inserted areas. */ + while (area--) { + __free_vmap_area(vas[area]); + vas[area] = NULL; + } + +overflow: + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + + /* Before "retry", check if we recover. */ + for (area = 0; area < nr_vms; area++) { + if (vas[area]) + continue; + + vas[area] = kmem_cache_zalloc( + vmap_area_cachep, GFP_KERNEL); + if (!vas[area]) + goto err_free; + } + + goto retry; + } + err_free: for (area = 0; area < nr_vms; area++) { - kfree(vas[area]); + if (vas[area]) + kmem_cache_free(vmap_area_cachep, vas[area]); + kfree(vms[area]); } err_free2: diff --git a/mm/vmscan.c b/mm/vmscan.c index fd9de504e516..df9941179fc5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone int zid; if (!mem_cgroup_disabled()) - lru_size = mem_cgroup_get_lru_size(lruvec, lru); + lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru); else lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); @@ -1107,6 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, LIST_HEAD(ret_pages); LIST_HEAD(free_pages); unsigned nr_reclaimed = 0; + unsigned pgactivate = 0; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1466,8 +1467,10 @@ activate_locked: try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { + int type = page_is_file_cache(page); SetPageActive(page); - stat->nr_activate++; + pgactivate++; + stat->nr_activate[type] += hpage_nr_pages(page); count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1482,7 +1485,7 @@ keep: free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); - count_vm_events(PGACTIVATE, stat->nr_activate); + count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -1804,40 +1807,54 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, return isolated > inactive; } -static noinline_for_stack void -putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) +/* + * This moves pages from @list to corresponding LRU list. + * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold zone_lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop zone_lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_refcount against each page. + * But we had to alter page->flags anyway. + * + * Returns the number of pages moved to the given lruvec. + */ + +static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, + struct list_head *list) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; struct pglist_data *pgdat = lruvec_pgdat(lruvec); + int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); + struct page *page; + enum lru_list lru; - /* - * Put back any unfreeable pages. - */ - while (!list_empty(page_list)) { - struct page *page = lru_to_page(page_list); - int lru; - + while (!list_empty(list)) { + page = lru_to_page(list); VM_BUG_ON_PAGE(PageLRU(page), page); - list_del(&page->lru); if (unlikely(!page_evictable(page))) { + list_del(&page->lru); spin_unlock_irq(&pgdat->lru_lock); putback_lru_page(page); spin_lock_irq(&pgdat->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, pgdat); SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(page, lruvec, lru); - if (is_active_lru(lru)) { - int file = is_file_lru(lru); - int numpages = hpage_nr_pages(page); - reclaim_stat->recent_rotated[file] += numpages; - } + nr_pages = hpage_nr_pages(page); + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); + list_move(&page->lru, &lruvec->lists[lru]); + if (put_page_testzero(page)) { __ClearPageLRU(page); __ClearPageActive(page); @@ -1850,13 +1867,17 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); + } else { + nr_moved += nr_pages; } } /* * To save our caller's stack, now use input list for pages to free. */ - list_splice(&pages_to_free, page_list); + list_splice(&pages_to_free, list); + + return nr_moved; } /* @@ -1886,6 +1907,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_taken; struct reclaim_stat stat; int file = is_file_lru(lru); + enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; bool stalled = false; @@ -1913,17 +1935,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (current_is_kswapd()) { - if (global_reclaim(sc)) - __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, - nr_scanned); - } else { - if (global_reclaim(sc)) - __count_vm_events(PGSCAN_DIRECT, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, - nr_scanned); - } + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (global_reclaim(sc)) + __count_vm_events(item, nr_scanned); + __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) @@ -1934,19 +1949,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (current_is_kswapd()) { - if (global_reclaim(sc)) - __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, - nr_reclaimed); - } else { - if (global_reclaim(sc)) - __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, - nr_reclaimed); - } + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (global_reclaim(sc)) + __count_vm_events(item, nr_reclaimed); + __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + reclaim_stat->recent_rotated[0] = stat.nr_activate[0]; + reclaim_stat->recent_rotated[1] = stat.nr_activate[1]; - putback_inactive_pages(lruvec, &page_list); + move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); @@ -1983,73 +1993,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return nr_reclaimed; } -/* - * This moves pages from the active list to the inactive list. - * - * We move them the other way if the page is referenced by one or more - * processes, from rmap. - * - * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold pgdat->lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop pgdat->lru_lock around each page. It's impossible to balance - * this, so instead we remove the pages from the LRU while processing them. - * It is safe to rely on PG_active against the non-LRU pages in here because - * nobody will play with that bit on a non-LRU page. - * - * The downside is that we have to touch page->_refcount against each page. - * But we had to alter page->flags anyway. - * - * Returns the number of pages moved to the given lru. - */ - -static unsigned move_active_pages_to_lru(struct lruvec *lruvec, - struct list_head *list, - struct list_head *pages_to_free, - enum lru_list lru) -{ - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - struct page *page; - int nr_pages; - int nr_moved = 0; - - while (!list_empty(list)) { - page = lru_to_page(list); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - - nr_pages = hpage_nr_pages(page); - update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); - list_move(&page->lru, &lruvec->lists[lru]); - - if (put_page_testzero(page)) { - __ClearPageLRU(page); - __ClearPageActive(page); - del_page_from_lru_list(page, lruvec, lru); - - if (unlikely(PageCompound(page))) { - spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge(page); - (*get_compound_page_dtor(page))(page); - spin_lock_irq(&pgdat->lru_lock); - } else - list_add(&page->lru, pages_to_free); - } else { - nr_moved += nr_pages; - } - } - - if (!is_active_lru(lru)) { - __count_vm_events(PGDEACTIVATE, nr_moved); - count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, - nr_moved); - } - - return nr_moved; -} - static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, @@ -2079,7 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2136,13 +2079,19 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); - nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + nr_activate = move_pages_to_lru(lruvec, &l_active); + nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); + /* Keep all free pages in l_active list */ + list_splice(&l_inactive, &l_active); + + __count_vm_events(PGDEACTIVATE, nr_deactivate); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge_list(&l_hold); - free_unref_page_list(&l_hold); + mem_cgroup_uncharge_list(&l_active); + free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2250,8 +2199,7 @@ enum scan_balance { * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, - struct scan_control *sc, unsigned long *nr, - unsigned long *lru_pages) + struct scan_control *sc, unsigned long *nr) { int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; @@ -2402,20 +2350,72 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); - unsigned long size; + unsigned long lruvec_size; unsigned long scan; + unsigned long protection; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + protection = mem_cgroup_protection(memcg, + sc->memcg_low_reclaim); + + if (protection) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan = lruvec_size - lruvec_size * protection / + cgroup_size; + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decremeting + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } else { + scan = lruvec_size; + } + + scan >>= sc->priority; - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) - scan = min(size, SWAP_CLUSTER_MAX); + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -2435,7 +2435,7 @@ out: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) { - size = 0; + lruvec_size = 0; scan = 0; } break; @@ -2444,7 +2444,6 @@ out: BUG(); } - *lru_pages += size; nr[lru] = scan; } } @@ -2453,7 +2452,7 @@ out: * This is a basic per-node page freer. Used by both kswapd and direct reclaim. */ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, - struct scan_control *sc, unsigned long *lru_pages) + struct scan_control *sc) { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); unsigned long nr[NR_LRU_LISTS]; @@ -2465,7 +2464,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, memcg, sc, nr, lru_pages); + get_scan_count(lruvec, memcg, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2670,7 +2669,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) .pgdat = pgdat, .priority = sc->priority, }; - unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; memset(&sc->nr, 0, sizeof(sc->nr)); @@ -2680,7 +2678,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { - unsigned long lru_pages; unsigned long reclaimed; unsigned long scanned; @@ -2705,13 +2702,19 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ break; } reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc, &lru_pages); - node_lru_pages += lru_pages; + shrink_node_memcg(pgdat, memcg, sc); if (sc->may_shrinkslab) { shrink_slab(sc->gfp_mask, pgdat->node_id, @@ -3212,10 +3215,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; - trace_mm_vmscan_direct_reclaim_begin(order, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3240,15 +3240,12 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .may_swap = !noswap, .may_shrinkslab = 1, }; - unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + sc.gfp_mask); /* * NOTE: Although we can get the priority field, using it @@ -3257,7 +3254,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); + shrink_node_memcg(pgdat, memcg, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -3297,10 +3294,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; - trace_mm_vmscan_memcg_reclaim_begin(0, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); @@ -4149,6 +4143,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in .reclaim_idx = gfp_zone(gfp_mask), }; + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, + sc.gfp_mask); + cond_resched(); fs_reclaim_acquire(sc.gfp_mask); /* @@ -4175,6 +4172,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); + + trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); + return sc.nr_reclaimed >= nr_pages; } diff --git a/mm/workingset.c b/mm/workingset.c index 0bedf67502d5..6419baebd306 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -426,10 +426,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; + int i; - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL); lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) + pages += lruvec_page_state(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); } else diff --git a/mm/z3fold.c b/mm/z3fold.c index aee9b0b8d907..1ffecd6333e5 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -24,16 +24,47 @@ #include <linux/atomic.h> #include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/dcache.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/page-flags.h> +#include <linux/migrate.h> +#include <linux/node.h> +#include <linux/compaction.h> #include <linux/percpu.h> +#include <linux/mount.h> +#include <linux/fs.h> #include <linux/preempt.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zpool.h> +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks + * in the beginning of an allocated page are occupied by z3fold header, so + * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), + * which shows the max number of free chunks in z3fold page, also there will + * be 63, or 62, respectively, freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) +#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) +#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +#define BUDDY_MASK (0x3) +#define BUDDY_SHIFT 2 +#define SLOTS_ALIGN (0x40) + /***************** * Structures *****************/ @@ -47,8 +78,18 @@ enum buddy { FIRST, MIDDLE, LAST, - BUDDIES_MAX + BUDDIES_MAX = LAST +}; + +struct z3fold_buddy_slots { + /* + * we are using BUDDY_MASK in handle_to_buddy etc. so there should + * be enough slots to hold all possible variants + */ + unsigned long slot[BUDDY_MASK + 1]; + unsigned long pool; /* back link + flags */ }; +#define HANDLE_FLAG_MASK (0x03) /* * struct z3fold_header - z3fold page metadata occupying first chunks of each @@ -58,49 +99,29 @@ enum buddy { * @page_lock: per-page lock * @refcount: reference count for the z3fold page * @work: work_struct for page layout optimization - * @pool: pointer to the pool which this page belongs to + * @slots: pointer to the structure holding buddy slots * @cpu: CPU which this page "belongs" to * @first_chunks: the size of the first buddy in chunks, 0 if free * @middle_chunks: the size of the middle buddy in chunks, 0 if free * @last_chunks: the size of the last buddy in chunks, 0 if free * @first_num: the starting number (for the first handle) + * @mapped_count: the number of objects currently mapped */ struct z3fold_header { struct list_head buddy; spinlock_t page_lock; struct kref refcount; struct work_struct work; - struct z3fold_pool *pool; + struct z3fold_buddy_slots *slots; short cpu; unsigned short first_chunks; unsigned short middle_chunks; unsigned short last_chunks; unsigned short start_middle; unsigned short first_num:2; + unsigned short mapped_count:2; }; -/* - * NCHUNKS_ORDER determines the internal allocation granularity, effectively - * adjusting internal fragmentation. It also determines the number of - * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks - * in the beginning of an allocated page are occupied by z3fold header, so - * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), - * which shows the max number of free chunks in z3fold page, also there will - * be 63, or 62, respectively, freelists per pool. - */ -#define NCHUNKS_ORDER 6 - -#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) -#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) -#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) -#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) - -#define BUDDY_MASK (0x3) -#define BUDDY_SHIFT 2 - /** * struct z3fold_pool - stores metadata for each z3fold pool * @name: pool name @@ -113,11 +134,13 @@ struct z3fold_header { * added buddy. * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. + * @c_handle: cache for z3fold_buddy_slots allocation * @ops: pointer to a structure of user defined operations specified at * pool creation time. * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release + * @inode: inode for z3fold pseudo filesystem * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -130,12 +153,14 @@ struct z3fold_pool { struct list_head lru; struct list_head stale; atomic64_t pages_nr; + struct kmem_cache *c_handle; const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; + struct inode *inode; }; /* @@ -164,11 +189,118 @@ static int size_to_chunks(size_t size) static void compact_page_work(struct work_struct *w); +static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool) +{ + struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle, + GFP_KERNEL); + + if (slots) { + memset(slots->slot, 0, sizeof(slots->slot)); + slots->pool = (unsigned long)pool; + } + + return slots; +} + +static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s) +{ + return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK); +} + +static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) +{ + return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); +} + +static inline void free_handle(unsigned long handle) +{ + struct z3fold_buddy_slots *slots; + int i; + bool is_free; + + if (handle & (1 << PAGE_HEADLESS)) + return; + + WARN_ON(*(unsigned long *)handle == 0); + *(unsigned long *)handle = 0; + slots = handle_to_slots(handle); + is_free = true; + for (i = 0; i <= BUDDY_MASK; i++) { + if (slots->slot[i]) { + is_free = false; + break; + } + } + + if (is_free) { + struct z3fold_pool *pool = slots_to_pool(slots); + + kmem_cache_free(pool->c_handle, slots); + } +} + +static struct dentry *z3fold_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + + return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33); +} + +static struct file_system_type z3fold_fs = { + .name = "z3fold", + .mount = z3fold_do_mount, + .kill_sb = kill_anon_super, +}; + +static struct vfsmount *z3fold_mnt; +static int z3fold_mount(void) +{ + int ret = 0; + + z3fold_mnt = kern_mount(&z3fold_fs); + if (IS_ERR(z3fold_mnt)) + ret = PTR_ERR(z3fold_mnt); + + return ret; +} + +static void z3fold_unmount(void) +{ + kern_unmount(z3fold_mnt); +} + +static const struct address_space_operations z3fold_aops; +static int z3fold_register_migration(struct z3fold_pool *pool) +{ + pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb); + if (IS_ERR(pool->inode)) { + pool->inode = NULL; + return 1; + } + + pool->inode->i_mapping->private_data = pool; + pool->inode->i_mapping->a_ops = &z3fold_aops; + return 0; +} + +static void z3fold_unregister_migration(struct z3fold_pool *pool) +{ + if (pool->inode) + iput(pool->inode); + } + /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, struct z3fold_pool *pool) { struct z3fold_header *zhdr = page_address(page); + struct z3fold_buddy_slots *slots = alloc_slots(pool); + + if (!slots) + return NULL; INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); @@ -185,15 +317,21 @@ static struct z3fold_header *init_z3fold_page(struct page *page, zhdr->first_num = 0; zhdr->start_middle = 0; zhdr->cpu = -1; - zhdr->pool = pool; + zhdr->slots = slots; INIT_LIST_HEAD(&zhdr->buddy); INIT_WORK(&zhdr->work, compact_page_work); return zhdr; } /* Resets the struct page fields and frees the page */ -static void free_z3fold_page(struct page *page) +static void free_z3fold_page(struct page *page, bool headless) { + if (!headless) { + lock_page(page); + __ClearPageMovable(page); + unlock_page(page); + } + ClearPagePrivate(page); __free_page(page); } @@ -215,33 +353,57 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr) spin_unlock(&zhdr->page_lock); } +/* Helper function to build the index */ +static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) +{ + return (bud + zhdr->first_num) & BUDDY_MASK; +} + /* * Encodes the handle of a particular buddy within a z3fold page * Pool lock should be held as this function accesses first_num */ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) { - unsigned long handle; + struct z3fold_buddy_slots *slots; + unsigned long h = (unsigned long)zhdr; + int idx = 0; - handle = (unsigned long)zhdr; - if (bud != HEADLESS) { - handle |= (bud + zhdr->first_num) & BUDDY_MASK; - if (bud == LAST) - handle |= (zhdr->last_chunks << BUDDY_SHIFT); - } - return handle; + /* + * For a headless page, its handle is its pointer with the extra + * PAGE_HEADLESS bit set + */ + if (bud == HEADLESS) + return h | (1 << PAGE_HEADLESS); + + /* otherwise, return pointer to encoded handle */ + idx = __idx(zhdr, bud); + h += idx; + if (bud == LAST) + h |= (zhdr->last_chunks << BUDDY_SHIFT); + + slots = zhdr->slots; + slots->slot[idx] = h; + return (unsigned long)&slots->slot[idx]; } /* Returns the z3fold page where a given handle is stored */ -static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) +static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) { - return (struct z3fold_header *)(handle & PAGE_MASK); + unsigned long addr = h; + + if (!(addr & (1 << PAGE_HEADLESS))) + addr = *(unsigned long *)h; + + return (struct z3fold_header *)(addr & PAGE_MASK); } /* only for LAST bud, returns zero otherwise */ static unsigned short handle_to_chunks(unsigned long handle) { - return (handle & ~PAGE_MASK) >> BUDDY_SHIFT; + unsigned long addr = *(unsigned long *)handle; + + return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; } /* @@ -251,21 +413,31 @@ static unsigned short handle_to_chunks(unsigned long handle) */ static enum buddy handle_to_buddy(unsigned long handle) { - struct z3fold_header *zhdr = handle_to_z3fold_header(handle); - return (handle - zhdr->first_num) & BUDDY_MASK; + struct z3fold_header *zhdr; + unsigned long addr; + + WARN_ON(handle & (1 << PAGE_HEADLESS)); + addr = *(unsigned long *)handle; + zhdr = (struct z3fold_header *)(addr & PAGE_MASK); + return (addr - zhdr->first_num) & BUDDY_MASK; +} + +static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) +{ + return slots_to_pool(zhdr->slots); } static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) { struct page *page = virt_to_page(zhdr); - struct z3fold_pool *pool = zhdr->pool; + struct z3fold_pool *pool = zhdr_to_pool(zhdr); WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); spin_lock(&pool->lock); if (!list_empty(&page->lru)) - list_del(&page->lru); + list_del_init(&page->lru); spin_unlock(&pool->lock); if (locked) z3fold_page_unlock(zhdr); @@ -295,9 +467,10 @@ static void release_z3fold_page_locked_list(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); - spin_lock(&zhdr->pool->lock); + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + spin_lock(&pool->lock); list_del_init(&zhdr->buddy); - spin_unlock(&zhdr->pool->lock); + spin_unlock(&pool->lock); WARN_ON(z3fold_page_trylock(zhdr)); __release_z3fold_page(zhdr, true); @@ -318,7 +491,7 @@ static void free_pages_work(struct work_struct *w) continue; spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); - free_z3fold_page(page); + free_z3fold_page(page, false); cond_resched(); spin_lock(&pool->stale_lock); } @@ -349,6 +522,23 @@ static int num_free_chunks(struct z3fold_header *zhdr) return nfree; } +/* Add to the appropriate unbuddied list */ +static inline void add_to_unbuddied(struct z3fold_pool *pool, + struct z3fold_header *zhdr) +{ + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || + zhdr->middle_chunks == 0) { + struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); + + int freechunks = num_free_chunks(zhdr); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + put_cpu_ptr(pool->unbuddied); + } +} + static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { @@ -367,6 +557,9 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) return 0; /* can't move middle chunk, it's used */ + if (unlikely(PageIsolated(page))) + return 0; + if (zhdr->middle_chunks == 0) return 0; /* nothing to compact */ @@ -406,10 +599,8 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) static void do_compact_page(struct z3fold_header *zhdr, bool locked) { - struct z3fold_pool *pool = zhdr->pool; + struct z3fold_pool *pool = zhdr_to_pool(zhdr); struct page *page; - struct list_head *unbuddied; - int fchunks; page = virt_to_page(zhdr); if (locked) @@ -429,19 +620,14 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) return; } - z3fold_compact_page(zhdr); - unbuddied = get_cpu_ptr(pool->unbuddied); - fchunks = num_free_chunks(zhdr); - if (fchunks < NCHUNKS && - (!zhdr->first_chunks || !zhdr->middle_chunks || - !zhdr->last_chunks)) { - /* the page's not completely free and it's unbuddied */ - spin_lock(&pool->lock); - list_add(&zhdr->buddy, &unbuddied[fchunks]); - spin_unlock(&pool->lock); - zhdr->cpu = smp_processor_id(); + if (unlikely(PageIsolated(page) || + test_bit(PAGE_STALE, &page->private))) { + z3fold_page_unlock(zhdr); + return; } - put_cpu_ptr(pool->unbuddied); + + z3fold_compact_page(zhdr); + add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); } @@ -453,6 +639,103 @@ static void compact_page_work(struct work_struct *w) do_compact_page(zhdr, false); } +/* returns _locked_ z3fold page header or NULL */ +static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + size_t size, bool can_sleep) +{ + struct z3fold_header *zhdr = NULL; + struct page *page; + struct list_head *unbuddied; + int chunks = size_to_chunks(size), i; + +lookup: + /* First, try to find an unbuddied z3fold page. */ + unbuddied = get_cpu_ptr(pool->unbuddied); + for_each_unbuddied_list(i, chunks) { + struct list_head *l = &unbuddied[i]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), + struct z3fold_header, buddy); + + if (!zhdr) + continue; + + /* Re-check under lock. */ + spin_lock(&pool->lock); + l = &unbuddied[i]; + if (unlikely(zhdr != list_first_entry(READ_ONCE(l), + struct z3fold_header, buddy)) || + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + + /* + * this page could not be removed from its unbuddied + * list while pool lock was held, and then we've taken + * page lock so kref_put could not be called before + * we got here, so it's safe to just call kref_get() + */ + kref_get(&zhdr->refcount); + break; + } + put_cpu_ptr(pool->unbuddied); + + if (!zhdr) { + int cpu; + + /* look for _exact_ match on other cpus' lists */ + for_each_online_cpu(cpu) { + struct list_head *l; + + unbuddied = per_cpu_ptr(pool->unbuddied, cpu); + spin_lock(&pool->lock); + l = &unbuddied[chunks]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), + struct z3fold_header, buddy); + + if (!zhdr || !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; + continue; + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + if (can_sleep) + cond_resched(); + continue; + } + kref_get(&zhdr->refcount); + break; + } + } + + return zhdr; +} /* * API Functions @@ -476,6 +759,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool = kzalloc(sizeof(struct z3fold_pool), gfp); if (!pool) goto out; + pool->c_handle = kmem_cache_create("z3fold_handle", + sizeof(struct z3fold_buddy_slots), + SLOTS_ALIGN, 0, NULL); + if (!pool->c_handle) + goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); @@ -497,15 +785,21 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; + if (z3fold_register_migration(pool)) + goto out_rwq; INIT_WORK(&pool->work, free_pages_work); pool->ops = ops; return pool; +out_rwq: + destroy_workqueue(pool->release_wq); out_wq: destroy_workqueue(pool->compact_wq); out_unbuddied: free_percpu(pool->unbuddied); out_pool: + kmem_cache_destroy(pool->c_handle); +out_c: kfree(pool); out: return NULL; @@ -519,6 +813,8 @@ out: */ static void z3fold_destroy_pool(struct z3fold_pool *pool) { + kmem_cache_destroy(pool->c_handle); + z3fold_unregister_migration(pool); destroy_workqueue(pool->release_wq); destroy_workqueue(pool->compact_wq); kfree(pool); @@ -546,7 +842,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { - int chunks = 0, i, freechunks; + int chunks = size_to_chunks(size); struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; @@ -561,56 +857,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { - struct list_head *unbuddied; - chunks = size_to_chunks(size); - -lookup: - /* First, try to find an unbuddied z3fold page. */ - unbuddied = get_cpu_ptr(pool->unbuddied); - for_each_unbuddied_list(i, chunks) { - struct list_head *l = &unbuddied[i]; - - zhdr = list_first_entry_or_null(READ_ONCE(l), - struct z3fold_header, buddy); - - if (!zhdr) - continue; - - /* Re-check under lock. */ - spin_lock(&pool->lock); - l = &unbuddied[i]; - if (unlikely(zhdr != list_first_entry(READ_ONCE(l), - struct z3fold_header, buddy)) || - !z3fold_page_trylock(zhdr)) { - spin_unlock(&pool->lock); - put_cpu_ptr(pool->unbuddied); - goto lookup; - } - list_del_init(&zhdr->buddy); - zhdr->cpu = -1; - spin_unlock(&pool->lock); - - page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private)) { - z3fold_page_unlock(zhdr); - zhdr = NULL; - put_cpu_ptr(pool->unbuddied); - if (can_sleep) - cond_resched(); - goto lookup; - } - - /* - * this page could not be removed from its unbuddied - * list while pool lock was held, and then we've taken - * page lock so kref_put could not be called before - * we got here, so it's safe to just call kref_get() - */ - kref_get(&zhdr->refcount); - break; - } - put_cpu_ptr(pool->unbuddied); - +retry: + zhdr = __z3fold_alloc(pool, size, can_sleep); if (zhdr) { if (zhdr->first_chunks == 0) { if (zhdr->middle_chunks != 0 && @@ -630,8 +878,9 @@ lookup: z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); - goto lookup; + goto retry; } + page = virt_to_page(zhdr); goto found; } bud = FIRST; @@ -662,13 +911,18 @@ lookup: if (!page) return -ENOMEM; - atomic64_inc(&pool->pages_nr); zhdr = init_z3fold_page(page, pool); + if (!zhdr) { + __free_page(page); + return -ENOMEM; + } + atomic64_inc(&pool->pages_nr); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); goto headless; } + __SetPageMovable(page, pool->inode->i_mapping); z3fold_page_lock(zhdr); found: @@ -680,19 +934,7 @@ found: zhdr->middle_chunks = chunks; zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } - - if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || - zhdr->middle_chunks == 0) { - struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); - - /* Add to unbuddied list */ - freechunks = num_free_chunks(zhdr); - spin_lock(&pool->lock); - list_add(&zhdr->buddy, &unbuddied[freechunks]); - spin_unlock(&pool->lock); - zhdr->cpu = smp_processor_id(); - put_cpu_ptr(pool->unbuddied); - } + add_to_unbuddied(pool, zhdr); headless: spin_lock(&pool->lock); @@ -739,7 +981,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_lock(&pool->lock); list_del(&page->lru); spin_unlock(&pool->lock); - free_z3fold_page(page); + free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); } return; @@ -766,6 +1008,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) return; } + free_handle(handle); if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { atomic64_dec(&pool->pages_nr); return; @@ -774,7 +1017,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) z3fold_page_unlock(zhdr); return; } - if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { + if (unlikely(PageIsolated(page)) || + test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); return; } @@ -855,10 +1099,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) if (test_and_set_bit(PAGE_CLAIMED, &page->private)) continue; - zhdr = page_address(page); + if (unlikely(PageIsolated(page))) + continue; if (test_bit(PAGE_HEADLESS, &page->private)) break; + zhdr = page_address(page); if (!z3fold_page_trylock(zhdr)) { zhdr = NULL; continue; /* can't evict at this point */ @@ -919,7 +1165,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) next: if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { - free_z3fold_page(page); + free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); return 0; } @@ -996,6 +1242,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) break; } + if (addr) + zhdr->mapped_count++; z3fold_page_unlock(zhdr); out: return addr; @@ -1022,6 +1270,7 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) buddy = handle_to_buddy(handle); if (buddy == MIDDLE) clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + zhdr->mapped_count--; z3fold_page_unlock(zhdr); } @@ -1036,6 +1285,128 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) return atomic64_read(&pool->pages_nr); } +static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) +{ + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + if (test_bit(PAGE_HEADLESS, &page->private)) + return false; + + zhdr = page_address(page); + z3fold_page_lock(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_STALE, &page->private)) + goto out; + + pool = zhdr_to_pool(zhdr); + + if (zhdr->mapped_count == 0) { + kref_get(&zhdr->refcount); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + spin_lock(&pool->lock); + if (!list_empty(&page->lru)) + list_del(&page->lru); + spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); + return true; + } +out: + z3fold_page_unlock(zhdr); + return false; +} + +static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct z3fold_header *zhdr, *new_zhdr; + struct z3fold_pool *pool; + struct address_space *new_mapping; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zhdr = page_address(page); + pool = zhdr_to_pool(zhdr); + + if (!trylock_page(page)) + return -EAGAIN; + + if (!z3fold_page_trylock(zhdr)) { + unlock_page(page); + return -EAGAIN; + } + if (zhdr->mapped_count != 0) { + z3fold_page_unlock(zhdr); + unlock_page(page); + return -EBUSY; + } + new_zhdr = page_address(newpage); + memcpy(new_zhdr, zhdr, PAGE_SIZE); + newpage->private = page->private; + page->private = 0; + z3fold_page_unlock(zhdr); + spin_lock_init(&new_zhdr->page_lock); + new_mapping = page_mapping(page); + __ClearPageMovable(page); + ClearPagePrivate(page); + + get_page(newpage); + z3fold_page_lock(new_zhdr); + if (new_zhdr->first_chunks) + encode_handle(new_zhdr, FIRST); + if (new_zhdr->last_chunks) + encode_handle(new_zhdr, LAST); + if (new_zhdr->middle_chunks) + encode_handle(new_zhdr, MIDDLE); + set_bit(NEEDS_COMPACTING, &newpage->private); + new_zhdr->cpu = smp_processor_id(); + spin_lock(&pool->lock); + list_add(&newpage->lru, &pool->lru); + spin_unlock(&pool->lock); + __SetPageMovable(newpage, new_mapping); + z3fold_page_unlock(new_zhdr); + + queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); + + page_mapcount_reset(page); + unlock_page(page); + put_page(page); + return 0; +} + +static void z3fold_page_putback(struct page *page) +{ + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + + zhdr = page_address(page); + pool = zhdr_to_pool(zhdr); + + z3fold_page_lock(zhdr); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + INIT_LIST_HEAD(&page->lru); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); + return; + } + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); +} + +static const struct address_space_operations z3fold_aops = { + .isolate_page = z3fold_page_isolate, + .migratepage = z3fold_page_migrate, + .putback_page = z3fold_page_putback, +}; + /***************** * zpool ****************/ @@ -1133,8 +1504,14 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { + int ret; + /* Make sure the z3fold header is not larger than the page size */ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE); + ret = z3fold_mount(); + if (ret) + return ret; + zpool_register_driver(&z3fold_zpool_driver); return 0; @@ -1142,6 +1519,7 @@ static int __init init_z3fold(void) static void __exit exit_z3fold(void) { + z3fold_unmount(); zpool_unregister_driver(&z3fold_zpool_driver); } diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index d3736f5bffec..74cafc0142ea 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, pages + got); + num_pages - got, write_page ? FOLL_WRITE : 0, pages + got); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 71f06900473e..b96fd3f54705 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -163,7 +163,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, static void hooks_validate(const struct nf_hook_entries *hooks) { -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC struct nf_hook_ops **orig_ops; int prio = INT_MIN; size_t i = 0; diff --git a/net/rds/info.c b/net/rds/info.c index e367a97a18c8..03f6fd56d237 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, 1, pages); + ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 182ab8430594..b340ed4fc43a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, { int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, + pages); if (ret >= 0 && ret < nr_pages) { while (ret--) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 989e52386c35..2b18223e7eb8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -253,8 +253,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(¤t->mm->mmap_sem); - npgs = get_user_pages_longterm(umem->address, umem->npgs, - gup_flags, &umem->pgs[0], NULL); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(¤t->mm->mmap_sem); if (npgs != umem->npgs) { diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bb28b178d929..1c421ac42b07 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2725,8 +2725,10 @@ sub process { ($line =~ /^\s*(?:WARNING:|BUG:)/ || $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ || # timestamp - $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) { - # stack dump address + $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/) || + $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ || + $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/) { + # stack dump address styles $commit_log_possible_stack_dump = 1; } @@ -3069,21 +3071,21 @@ sub process { # check SPDX comment style for .[chsS] files if ($realfile =~ /\.[chsS]$/ && $rawline =~ /SPDX-License-Identifier:/ && - $rawline !~ /^\+\s*\Q$comment\E\s*/) { + $rawline !~ m@^\+\s*\Q$comment\E\s*@) { WARN("SPDX_LICENSE_TAG", "Improper SPDX comment style for '$realfile', please use '$comment' instead\n" . $herecurr); } if ($comment !~ /^$/ && - $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { - WARN("SPDX_LICENSE_TAG", - "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + $rawline !~ m@^\+\Q$comment\E SPDX-License-Identifier: @) { + WARN("SPDX_LICENSE_TAG", + "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); } elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) { - my $spdx_license = $1; - if (!is_SPDX_License_valid($spdx_license)) { - WARN("SPDX_LICENSE_TAG", - "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); - } + my $spdx_license = $1; + if (!is_SPDX_License_valid($spdx_license)) { + WARN("SPDX_LICENSE_TAG", + "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); + } } } } diff --git a/scripts/gdb/linux/clk.py b/scripts/gdb/linux/clk.py new file mode 100644 index 000000000000..6bf71976b55d --- /dev/null +++ b/scripts/gdb/linux/clk.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) NXP 2019 + +import gdb +import sys + +from linux import utils, lists + +clk_core_type = utils.CachedType("struct clk_core") + + +def clk_core_for_each_child(hlist_head): + return lists.hlist_for_each_entry(hlist_head, + clk_core_type.get_type().pointer(), "child_node") + + +class LxClkSummary(gdb.Command): + """Print Linux kernel log buffer.""" + + def __init__(self): + super(LxClkSummary, self).__init__("lx-clk-summary", gdb.COMMAND_DATA) + + def show_subtree(self, clk, level): + gdb.write("%*s%-*s %7d %8d %8d\n" % ( + level * 3 + 1, "", + 30 - level * 3, + clk['name'].string(), + clk['enable_count'], + clk['prepare_count'], + clk['protect_count'])) + + for child in clk_core_for_each_child(clk['children']): + self.show_subtree(child, level + 1) + + def invoke(self, arg, from_tty): + gdb.write(" enable prepare protect\n") + gdb.write(" clock count count count\n") + gdb.write("---------------------------------------------------------\n") + for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_root_list")): + self.show_subtree(clk, 0) + for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_orphan_list")): + self.show_subtree(clk, 0) + + +LxClkSummary() + + +class LxClkCoreLookup(gdb.Function): + """Find struct clk_core by name""" + + def __init__(self): + super(LxClkCoreLookup, self).__init__("lx_clk_core_lookup") + + def lookup_hlist(self, hlist_head, name): + for child in clk_core_for_each_child(hlist_head): + if child['name'].string() == name: + return child + result = self.lookup_hlist(child['children'], name) + if result: + return result + + def invoke(self, name): + name = name.string() + return (self.lookup_hlist(gdb.parse_and_eval("clk_root_list"), name) or + self.lookup_hlist(gdb.parse_and_eval("clk_orphan_list"), name)) + + +LxClkCoreLookup() diff --git a/scripts/gdb/linux/config.py b/scripts/gdb/linux/config.py new file mode 100644 index 000000000000..90e1565b1967 --- /dev/null +++ b/scripts/gdb/linux/config.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2019 Google LLC. + +import gdb +import zlib + +from linux import utils + + +class LxConfigDump(gdb.Command): + """Output kernel config to the filename specified as the command + argument. Equivalent to 'zcat /proc/config.gz > config.txt' on + a running target""" + + def __init__(self): + super(LxConfigDump, self).__init__("lx-configdump", gdb.COMMAND_DATA, + gdb.COMPLETE_FILENAME) + + def invoke(self, arg, from_tty): + if len(arg) == 0: + filename = "config.txt" + else: + filename = arg + + try: + py_config_ptr = gdb.parse_and_eval("kernel_config_data + 8") + py_config_size = gdb.parse_and_eval( + "sizeof(kernel_config_data) - 1 - 8 * 2") + except gdb.error as e: + raise gdb.GdbError("Can't find config, enable CONFIG_IKCONFIG?") + + inf = gdb.inferiors()[0] + zconfig_buf = utils.read_memoryview(inf, py_config_ptr, + py_config_size).tobytes() + + config_buf = zlib.decompress(zconfig_buf, 16) + with open(filename, 'wb') as f: + f.write(config_buf) + + gdb.write("Dumped config to " + filename + "\n") + + +LxConfigDump() diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index d3319a80788a..76f46f427b96 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -13,8 +13,10 @@ */ #include <linux/fs.h> +#include <linux/hrtimer.h> #include <linux/mount.h> #include <linux/of_fdt.h> +#include <linux/threads.h> /* We need to stringify expanded macros so that they can be parsed */ @@ -44,6 +46,9 @@ LX_VALUE(SB_DIRSYNC) LX_VALUE(SB_NOATIME) LX_VALUE(SB_NODIRATIME) +/* linux/htimer.h */ +LX_GDBPARSED(hrtimer_resolution) + /* linux/mount.h */ LX_VALUE(MNT_NOSUID) LX_VALUE(MNT_NODEV) @@ -52,8 +57,16 @@ LX_VALUE(MNT_NOATIME) LX_VALUE(MNT_NODIRATIME) LX_VALUE(MNT_RELATIME) +/* linux/threads.h */ +LX_VALUE(NR_CPUS) + /* linux/of_fdt.h> */ LX_VALUE(OF_DT_HEADER) /* Kernel Configs */ +LX_CONFIG(CONFIG_GENERIC_CLOCKEVENTS) +LX_CONFIG(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) +LX_CONFIG(CONFIG_HIGH_RES_TIMERS) +LX_CONFIG(CONFIG_NR_CPUS) LX_CONFIG(CONFIG_OF) +LX_CONFIG(CONFIG_TICK_ONESHOT) diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index ca11e8df31b6..008e62f3190d 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -135,6 +135,7 @@ and can help identify the state of hotplugged CPUs""" gdb.write("Online CPUs : {}\n".format(list(each_online_cpu()))) gdb.write("Active CPUs : {}\n".format(list(each_active_cpu()))) + LxCpus() diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py index 2f335fbd86fd..55356b66f8ea 100644 --- a/scripts/gdb/linux/lists.py +++ b/scripts/gdb/linux/lists.py @@ -16,6 +16,8 @@ import gdb from linux import utils list_head = utils.CachedType("struct list_head") +hlist_head = utils.CachedType("struct hlist_head") +hlist_node = utils.CachedType("struct hlist_node") def list_for_each(head): @@ -39,6 +41,27 @@ def list_for_each_entry(head, gdbtype, member): yield utils.container_of(node, gdbtype, member) +def hlist_for_each(head): + if head.type == hlist_head.get_type().pointer(): + head = head.dereference() + elif head.type != hlist_head.get_type(): + raise gdb.GdbError("Must be struct hlist_head not {}" + .format(head.type)) + + node = head['first'].dereference() + while node.address: + yield node.address + node = node['next'].dereference() + + +def hlist_for_each_entry(head, gdbtype, member): + for node in hlist_for_each(head): + if node.type != hlist_node.get_type().pointer(): + raise TypeError("Type {} found. Expected struct hlist_head *." + .format(node.type)) + yield utils.container_of(node, gdbtype, member) + + def list_check(head): nb = 0 if (head.type == list_head.get_type().pointer()): @@ -110,4 +133,5 @@ class LxListChk(gdb.Command): raise gdb.GdbError("lx-list-check takes one argument") list_check(gdb.parse_and_eval(argv[0])) + LxListChk() diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 2f01a958eb22..6a56bba233a9 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -29,6 +29,7 @@ class LxCmdLine(gdb.Command): def invoke(self, arg, from_tty): gdb.write(gdb.parse_and_eval("saved_command_line").string() + "\n") + LxCmdLine() @@ -43,6 +44,7 @@ class LxVersion(gdb.Command): # linux_banner should contain a newline gdb.write(gdb.parse_and_eval("(char *)linux_banner").string()) + LxVersion() @@ -86,6 +88,7 @@ Equivalent to cat /proc/iomem on a running target""" def invoke(self, arg, from_tty): return show_lx_resources("iomem_resource") + LxIOMem() @@ -100,6 +103,7 @@ Equivalent to cat /proc/ioports on a running target""" def invoke(self, arg, from_tty): return show_lx_resources("ioport_resource") + LxIOPorts() @@ -149,7 +153,7 @@ values of that process namespace""" if len(argv) >= 1: try: pid = int(argv[0]) - except: + except gdb.error: raise gdb.GdbError("Provide a PID as integer value") else: pid = 1 @@ -195,6 +199,7 @@ values of that process namespace""" info_opts(FS_INFO, s_flags), info_opts(MNT_INFO, m_flags))) + LxMounts() @@ -259,7 +264,7 @@ class LxFdtDump(gdb.Command): try: f = open(filename, 'wb') - except: + except gdb.error: raise gdb.GdbError("Could not open file to dump fdt") f.write(fdt_buf) @@ -267,4 +272,5 @@ class LxFdtDump(gdb.Command): gdb.write("Dumped fdt blob to " + filename + "\n") + LxFdtDump() diff --git a/scripts/gdb/linux/rbtree.py b/scripts/gdb/linux/rbtree.py new file mode 100644 index 000000000000..39db889b874c --- /dev/null +++ b/scripts/gdb/linux/rbtree.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2019 Google LLC. + +import gdb + +from linux import utils + +rb_root_type = utils.CachedType("struct rb_root") +rb_node_type = utils.CachedType("struct rb_node") + + +def rb_first(root): + if root.type == rb_root_type.get_type(): + node = node.address.cast(rb_root_type.get_type().pointer()) + elif root.type != rb_root_type.get_type().pointer(): + raise gdb.GdbError("Must be struct rb_root not {}".format(root.type)) + + node = root['rb_node'] + if node is 0: + return None + + while node['rb_left']: + node = node['rb_left'] + + return node + + +def rb_last(root): + if root.type == rb_root_type.get_type(): + node = node.address.cast(rb_root_type.get_type().pointer()) + elif root.type != rb_root_type.get_type().pointer(): + raise gdb.GdbError("Must be struct rb_root not {}".format(root.type)) + + node = root['rb_node'] + if node is 0: + return None + + while node['rb_right']: + node = node['rb_right'] + + return node + + +def rb_parent(node): + parent = gdb.Value(node['__rb_parent_color'] & ~3) + return parent.cast(rb_node_type.get_type().pointer()) + + +def rb_empty_node(node): + return node['__rb_parent_color'] == node.address + + +def rb_next(node): + if node.type == rb_node_type.get_type(): + node = node.address.cast(rb_node_type.get_type().pointer()) + elif node.type != rb_node_type.get_type().pointer(): + raise gdb.GdbError("Must be struct rb_node not {}".format(node.type)) + + if rb_empty_node(node): + return None + + if node['rb_right']: + node = node['rb_right'] + while node['rb_left']: + node = node['rb_left'] + return node + + parent = rb_parent(node) + while parent and node == parent['rb_right']: + node = parent + parent = rb_parent(node) + + return parent + + +def rb_prev(node): + if node.type == rb_node_type.get_type(): + node = node.address.cast(rb_node_type.get_type().pointer()) + elif node.type != rb_node_type.get_type().pointer(): + raise gdb.GdbError("Must be struct rb_node not {}".format(node.type)) + + if rb_empty_node(node): + return None + + if node['rb_left']: + node = node['rb_left'] + while node['rb_right']: + node = node['rb_right'] + return node.dereference() + + parent = rb_parent(node) + while parent and node == parent['rb_left'].dereference(): + node = parent + parent = rb_parent(node) + + return parent + + +class LxRbFirst(gdb.Function): + """Lookup and return a node from an RBTree + +$lx_rb_first(root): Return the node at the given index. +If index is omitted, the root node is dereferenced and returned.""" + + def __init__(self): + super(LxRbFirst, self).__init__("lx_rb_first") + + def invoke(self, root): + result = rb_first(root) + if result is None: + raise gdb.GdbError("No entry in tree") + + return result + + +LxRbFirst() + + +class LxRbLast(gdb.Function): + """Lookup and return a node from an RBTree. + +$lx_rb_last(root): Return the node at the given index. +If index is omitted, the root node is dereferenced and returned.""" + + def __init__(self): + super(LxRbLast, self).__init__("lx_rb_last") + + def invoke(self, root): + result = rb_last(root) + if result is None: + raise gdb.GdbError("No entry in tree") + + return result + + +LxRbLast() + + +class LxRbNext(gdb.Function): + """Lookup and return a node from an RBTree. + +$lx_rb_next(node): Return the node at the given index. +If index is omitted, the root node is dereferenced and returned.""" + + def __init__(self): + super(LxRbNext, self).__init__("lx_rb_next") + + def invoke(self, node): + result = rb_next(node) + if result is None: + raise gdb.GdbError("No entry in tree") + + return result + + +LxRbNext() + + +class LxRbPrev(gdb.Function): + """Lookup and return a node from an RBTree. + +$lx_rb_prev(node): Return the node at the given index. +If index is omitted, the root node is dereferenced and returned.""" + + def __init__(self): + super(LxRbPrev, self).__init__("lx_rb_prev") + + def invoke(self, node): + result = rb_prev(node) + if result is None: + raise gdb.GdbError("No entry in tree") + + return result + + +LxRbPrev() diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 004b0ac7fa72..2f5b95f09fa0 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -139,8 +139,12 @@ lx-symbols command.""" saved_states.append({'breakpoint': bp, 'enabled': bp.enabled}) # drop all current symbols and reload vmlinux + orig_vmlinux = 'vmlinux' + for obj in gdb.objfiles(): + if obj.filename.endswith('vmlinux'): + orig_vmlinux = obj.filename gdb.execute("symbol-file", to_string=True) - gdb.execute("symbol-file vmlinux") + gdb.execute("symbol-file {0}".format(orig_vmlinux)) self.loaded_modules = [] module_list = modules.module_list() diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py index f6ab3ccf698f..0301dc1e0138 100644 --- a/scripts/gdb/linux/tasks.py +++ b/scripts/gdb/linux/tasks.py @@ -79,6 +79,7 @@ class LxPs(gdb.Command): pid=task["pid"], comm=task["comm"].string())) + LxPs() @@ -134,4 +135,5 @@ variable.""" else: raise gdb.GdbError("No task of PID " + str(pid)) + LxThreadInfoByPidFunc() diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py new file mode 100644 index 000000000000..071d0dd5a634 --- /dev/null +++ b/scripts/gdb/linux/timerlist.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2019 Google LLC. + +import binascii +import gdb + +from linux import constants +from linux import cpus +from linux import rbtree +from linux import utils + +timerqueue_node_type = utils.CachedType("struct timerqueue_node").get_type() +hrtimer_type = utils.CachedType("struct hrtimer").get_type() + + +def ktime_get(): + """Returns the current time, but not very accurately + + We can't read the hardware timer itself to add any nanoseconds + that need to be added since we last stored the time in the + timekeeper. But this is probably good enough for debug purposes.""" + tk_core = gdb.parse_and_eval("&tk_core") + + return tk_core['timekeeper']['tkr_mono']['base'] + + +def print_timer(rb_node, idx): + timerqueue = utils.container_of(rb_node, timerqueue_node_type.pointer(), + "node") + timer = utils.container_of(timerqueue, hrtimer_type.pointer(), "node") + + function = str(timer['function']).split(" ")[1].strip("<>") + softexpires = timer['_softexpires'] + expires = timer['node']['expires'] + now = ktime_get() + + text = " #{}: <{}>, {}, ".format(idx, timer, function) + text += "S:{:02x}\n".format(int(timer['state'])) + text += " # expires at {}-{} nsecs [in {} to {} nsecs]\n".format( + softexpires, expires, softexpires - now, expires - now) + return text + + +def print_active_timers(base): + curr = base['active']['next']['node'] + curr = curr.address.cast(rbtree.rb_node_type.get_type().pointer()) + idx = 0 + while curr: + yield print_timer(curr, idx) + curr = rbtree.rb_next(curr) + idx += 1 + + +def print_base(base): + text = " .base: {}\n".format(base.address) + text += " .index: {}\n".format(base['index']) + + text += " .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution) + + text += " .get_time: {}\n".format(base['get_time']) + if constants.LX_CONFIG_HIGH_RES_TIMERS: + text += " .offset: {} nsecs\n".format(base['offset']) + text += "active timers:\n" + text += "".join([x for x in print_active_timers(base)]) + return text + + +def print_cpu(hrtimer_bases, cpu, max_clock_bases): + cpu_base = cpus.per_cpu(hrtimer_bases, cpu) + jiffies = gdb.parse_and_eval("jiffies_64") + tick_sched_ptr = gdb.parse_and_eval("&tick_cpu_sched") + ts = cpus.per_cpu(tick_sched_ptr, cpu) + + text = "cpu: {}\n".format(cpu) + for i in xrange(max_clock_bases): + text += " clock {}:\n".format(i) + text += print_base(cpu_base['clock_base'][i]) + + if constants.LX_CONFIG_HIGH_RES_TIMERS: + fmts = [(" .{} : {} nsecs", 'expires_next'), + (" .{} : {}", 'hres_active'), + (" .{} : {}", 'nr_events'), + (" .{} : {}", 'nr_retries'), + (" .{} : {}", 'nr_hangs'), + (" .{} : {}", 'max_hang_time')] + text += "\n".join([s.format(f, cpu_base[f]) for s, f in fmts]) + text += "\n" + + if constants.LX_CONFIG_TICK_ONESHOT: + fmts = [(" .{} : {}", 'nohz_mode'), + (" .{} : {} nsecs", 'last_tick'), + (" .{} : {}", 'tick_stopped'), + (" .{} : {}", 'idle_jiffies'), + (" .{} : {}", 'idle_calls'), + (" .{} : {}", 'idle_sleeps'), + (" .{} : {} nsecs", 'idle_entrytime'), + (" .{} : {} nsecs", 'idle_waketime'), + (" .{} : {} nsecs", 'idle_exittime'), + (" .{} : {} nsecs", 'idle_sleeptime'), + (" .{}: {} nsecs", 'iowait_sleeptime'), + (" .{} : {}", 'last_jiffies'), + (" .{} : {}", 'next_timer'), + (" .{} : {} nsecs", 'idle_expires')] + text += "\n".join([s.format(f, ts[f]) for s, f in fmts]) + text += "\njiffies: {}\n".format(jiffies) + + text += "\n" + + return text + + +def print_tickdevice(td, cpu): + dev = td['evtdev'] + text = "Tick Device: mode: {}\n".format(td['mode']) + if cpu < 0: + text += "Broadcast device\n" + else: + text += "Per CPU device: {}\n".format(cpu) + + text += "Clock Event Device: " + if dev == 0: + text += "<NULL>\n" + return text + + text += "{}\n".format(dev['name']) + text += " max_delta_ns: {}\n".format(dev['max_delta_ns']) + text += " min_delta_ns: {}\n".format(dev['min_delta_ns']) + text += " mult: {}\n".format(dev['mult']) + text += " shift: {}\n".format(dev['shift']) + text += " mode: {}\n".format(dev['state_use_accessors']) + text += " next_event: {} nsecs\n".format(dev['next_event']) + + text += " set_next_event: {}\n".format(dev['set_next_event']) + + members = [('set_state_shutdown', " shutdown: {}\n"), + ('set_state_periodic', " periodic: {}\n"), + ('set_state_oneshot', " oneshot: {}\n"), + ('set_state_oneshot_stopped', " oneshot stopped: {}\n"), + ('tick_resume', " resume: {}\n")] + for member, fmt in members: + if dev[member]: + text += fmt.format(dev[member]) + + text += " event_handler: {}\n".format(dev['event_handler']) + text += " retries: {}\n".format(dev['retries']) + + return text + + +def pr_cpumask(mask): + nr_cpu_ids = 1 + if constants.LX_NR_CPUS > 1: + nr_cpu_ids = gdb.parse_and_eval("nr_cpu_ids") + + inf = gdb.inferiors()[0] + bits = mask['bits'] + num_bytes = (nr_cpu_ids + 7) / 8 + buf = utils.read_memoryview(inf, bits, num_bytes).tobytes() + buf = binascii.b2a_hex(buf) + + chunks = [] + i = num_bytes + while i > 0: + i -= 1 + start = i * 2 + end = start + 2 + chunks.append(buf[start:end]) + if i != 0 and i % 4 == 0: + chunks.append(',') + + extra = nr_cpu_ids % 8 + if 0 < extra <= 4: + chunks[0] = chunks[0][0] # Cut off the first 0 + + return "".join(chunks) + + +class LxTimerList(gdb.Command): + """Print /proc/timer_list""" + + def __init__(self): + super(LxTimerList, self).__init__("lx-timerlist", gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + hrtimer_bases = gdb.parse_and_eval("&hrtimer_bases") + max_clock_bases = gdb.parse_and_eval("HRTIMER_MAX_CLOCK_BASES") + + text = "Timer List Version: gdb scripts\n" + text += "HRTIMER_MAX_CLOCK_BASES: {}\n".format(max_clock_bases) + text += "now at {} nsecs\n".format(ktime_get()) + + for cpu in cpus.each_online_cpu(): + text += print_cpu(hrtimer_bases, cpu, max_clock_bases) + + if constants.LX_CONFIG_GENERIC_CLOCKEVENTS: + if constants.LX_CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: + bc_dev = gdb.parse_and_eval("&tick_broadcast_device") + text += print_tickdevice(bc_dev, -1) + text += "\n" + mask = gdb.parse_and_eval("tick_broadcast_mask") + mask = pr_cpumask(mask) + text += "tick_broadcast_mask: {}\n".format(mask) + if constants.LX_CONFIG_TICK_ONESHOT: + mask = gdb.parse_and_eval("tick_broadcast_oneshot_mask") + mask = pr_cpumask(mask) + text += "tick_broadcast_oneshot_mask: {}\n".format(mask) + text += "\n" + + tick_cpu_devices = gdb.parse_and_eval("&tick_cpu_device") + for cpu in cpus.each_online_cpu(): + tick_dev = cpus.per_cpu(tick_cpu_devices, cpu) + text += print_tickdevice(tick_dev, cpu) + text += "\n" + + gdb.write(text) + + +LxTimerList() diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py index 50805874cfc3..bc67126118c4 100644 --- a/scripts/gdb/linux/utils.py +++ b/scripts/gdb/linux/utils.py @@ -66,6 +66,7 @@ Note that TYPE and ELEMENT have to be quoted as strings.""" return container_of(ptr, gdb.lookup_type(typename.string()).pointer(), elementname.string()) + ContainerOf() @@ -148,14 +149,14 @@ def get_gdbserver_type(): def probe_qemu(): try: return gdb.execute("monitor info version", to_string=True) != "" - except: + except gdb.error: return False def probe_kgdb(): try: thread_info = gdb.execute("info thread 2", to_string=True) return "shadowCPU0" in thread_info - except: + except gdb.error: return False global gdbserver_type @@ -172,7 +173,7 @@ def get_gdbserver_type(): def gdb_eval_or_none(expresssion): try: return gdb.parse_and_eval(expresssion) - except: + except gdb.error: return None diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py index 6e0b0afd888a..eff5a48ac026 100644 --- a/scripts/gdb/vmlinux-gdb.py +++ b/scripts/gdb/vmlinux-gdb.py @@ -27,7 +27,11 @@ else: import linux.modules import linux.dmesg import linux.tasks + import linux.config import linux.cpus import linux.lists + import linux.rbtree import linux.proc import linux.constants + import linux.timerlist + import linux.clk diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 86b87332b9e5..dcd25a31f9f5 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -21,11 +21,14 @@ absolut||absolute absoulte||absolute acccess||access acceess||access +accelaration||acceleration acceleratoin||acceleration +acceleratrion||acceleration accelleration||acceleration accesing||accessing accesnt||accent accessable||accessible +accessiable||accessible accesss||access accidentaly||accidentally accidentually||accidentally @@ -39,45 +42,58 @@ accout||account accquire||acquire accquired||acquired accross||across +accss||access acessable||accessible acess||access +acheived||achieved achitecture||architecture acient||ancient acitions||actions acitve||active acknowldegement||acknowledgment acknowledgement||acknowledgment +acknowleding||acknowledging ackowledge||acknowledge ackowledged||acknowledged +ackowledgement||acknowledgement +ackowledges||acknowledges acording||according -activete||activate actived||activated +activete||activate actualy||actually acumulating||accumulating acumulator||accumulator +acutally||actually adapater||adapter +adddress||address +adderss||address addional||additional additionaly||additionally +additionnally||additionally +additoinal||additional additonal||additional addres||address -adddress||address addreses||addresses addresss||address addrress||address aditional||additional aditionally||additionally aditionaly||additionally +adjusment||adjustment adminstrative||administrative adress||address adresses||addresses adrresses||addresses advertisment||advertisement adviced||advised +advnace||advance afecting||affecting +affvecs||affects againt||against agaist||against aggreataon||aggregation aggreation||aggregation +agruments||arguments albumns||albums alegorical||allegorical algined||aligned @@ -87,12 +103,15 @@ algoritm||algorithm algoritms||algorithms algorrithm||algorithm algorritm||algorithm +algrthm||algorithm aligment||alignment alignement||alignment allign||align alligned||aligned alllocate||allocate alloated||allocated +allocagtor||allocator +allocas||allices allocatote||allocate allocatrd||allocated allocte||allocate @@ -103,23 +122,29 @@ alogrithm||algorithm alot||a lot alow||allow alows||allows +alreaady||already altough||although alue||value +amacout||mahout ambigious||ambiguous +ammount||amount +ammounts||amounts amoung||among amout||amount amplifer||amplifier amplifyer||amplifier -an union||a union -an user||a user -an userspace||a userspace -an one||a one +analagous||analogous +analogous||analogous analysator||analyzer ang||and anniversery||anniversary annoucement||announcement anomolies||anomalies anomoly||anomaly +an one||a one +an union||a union +an user||a user +an userspace||a userspace anway||anyway aplication||application appearence||appearance @@ -134,48 +159,68 @@ approriate||appropriate approriately||appropriately apropriate||appropriate aquainted||acquainted +aquaris||Aquarius +aquire||acquire aquired||acquired aquisition||acquisition arbitary||arbitrary +arbitralrily||arbitrarily +arbitray||arbitrary architechture||architecture arguement||argument arguements||arguments +argumetns||arguments aritmetic||arithmetic arne't||aren't arraival||arrival artifical||artificial artillary||artillery asign||assign +assebmly||assembly asser||assert assertation||assertion assertting||asserting +assgined||assigned assiged||assigned assigment||assignment assigments||assignments assistent||assistant +assmebly||assembly +assocaition||association assocation||association +assocativity||associativity associcated||associated +assoicatied||associated assotiated||associated asssert||assert assum||assume +assumming||assuming assumtpion||assumption +asume||assume asuming||assuming asycronous||asynchronous -asynchnous||asynchronous -asynchromous||asynchronous asymetric||asymmetric asymmeric||asymmetric +asynchnous||asynchronous +asynchromous||asynchronous +asynchrounous||asynchronous +atempt||attempt atomatically||automatically atomicly||atomically -atempt||attempt +atribute||attribute attachement||attachment attched||attached -attemps||attempts attemping||attempting +attemppt||attempt +attemps||attempts attepmpt||attempt +attirbutes||attributes attnetion||attention +attribut||attribute attruibutes||attributes authentification||authentication +autherr||author +automagicaly||automatically automaticaly||automatically automaticly||automatically automatize||automate @@ -185,9 +230,11 @@ autonymous||autonomous auxillary||auxiliary auxilliary||auxiliary avaiable||available +avaialable||available avaible||available availabe||available availabled||available +availablility||availability availablity||availability availaible||available availale||available @@ -205,6 +252,7 @@ bahavior||behavior bakup||backup baloon||balloon baloons||balloons +bandwitdh||bandwidth bandwith||bandwidth banlance||balance batery||battery @@ -215,17 +263,24 @@ becuase||because beeing||being befor||before begining||beginning +beginnning||beginning +beiginning||beginning beter||better betweeen||between bianries||binaries +bidirectionnal||bidirectional bitmast||bitmask boardcast||broadcast +bondary||boundary +boook||book borad||board boundry||boundary brievely||briefly broadcase||broadcast broadcat||broadcast bufufer||buffer +byteen||between +cachces||caches cacluated||calculated caculate||calculate caculation||calculation @@ -238,20 +293,27 @@ calucate||calculate calulate||calculate cancelation||cancellation cancle||cancel +capabale||capable +capabiliities||capabilities capabilites||capabilities +capabilitie||capabilities +capabilitites||capabilities +capabilitiy||capability +capabilties||capabilities capabilty||capability capabitilies||capabilities capablity||capability -capatibilities||capabilities capapbilities||capabilities +capatibilities||capabilities caputure||capture carefuly||carefully cariage||carriage catagory||category +cconcentrator||concentrator cehck||check +chache||cache challange||challenge challanges||challenges -chache||cache chanell||channel changable||changeable chanined||chained @@ -259,6 +321,7 @@ channle||channel channnel||channel charachter||character charachters||characters +characteristcs||characteristics charactor||character charater||character charaters||characters @@ -268,6 +331,7 @@ chck||check checksumed||checksummed checksuming||checksumming childern||children +child'ren||children childs||children chiled||child chked||checked @@ -282,138 +346,212 @@ clared||cleared closeing||closing clustred||clustered coexistance||coexistence +cofigured||configured colescing||coalescing collapsable||collapsible +collpased||collapsed colorfull||colorful +comamnd||command comand||command +combatibility||compatibility comit||commit commerical||commercial comming||coming +comminicate||communicate comminucation||communication commited||committed commiting||committing committ||commit commoditiy||commodity -comsume||consume -comsumer||consumer -comsuming||consuming +communation||commination +communicaion||communication compability||compatibility compaibility||compatibility +comparation||compilation comparsion||comparison compatability||compatibility compatable||compatible +compatbility||compatibility +compatibile||compatible compatibiliy||compatibility compatibilty||compatibility +compatibl||compatible compatiblity||compatibility +compatile||compatible competion||completion compilant||compliant compleatly||completely +completey||completely +completiom||completion +completited||completed +completition||competition completition||completion completly||completely complient||compliant -componnents||components compoment||component +componnents||components +componse||compose comppatible||compatible compres||compress compresion||compression comression||compression +comsume||consume +comsumer||consumer +comsuming||consuming comunication||communication conbination||combination +conditinal||conditional conditionaly||conditionally conected||connected conector||connector -connecetd||connected +conerter||converter +configuared||configured configuartion||configuration configuation||configuration +configued||configured +configuraions||configurations configuratoin||configuration configuraton||configuration +configuredi||configured configuretion||configuration +configurtion||configuration configutation||configuration +congifured||configured conider||consider conjuction||conjunction +connecetd||connected +connecteded||connected connectinos||connections connnection||connection connnections||connections +consecutivity||consecutively +consel||console consistancy||consistency consistant||consistent +consitional||conditional +constitue||constitute containes||contains containts||contains contaisn||contains contant||contact contence||contents +contigious||contagious continious||continuous continous||continuous continously||continuously continueing||continuing -contraints||constraints -contruct||construct +continuosuly||continuously +continuying||continuing contol||control contoller||controller +contraints||constraints controled||controlled controler||controller +controlers||controllers controll||control +controlller||controller +controllor||controller +contruct||construct contruction||construction contry||country conuntry||country +converstion||conversation convertion||conversion convertor||converter +conveter||converter convienient||convenient convinient||convenient +convnetional||conventional +convrate||concrete +convrates||concretes +coommon||common +coonnected||connected +coordiante||coordinate corected||corrected +correcly||correctly correponding||corresponding correponds||corresponds +corresonding||corresponding correspoding||corresponding cotrol||control cound||could couter||counter coutner||counter +coutry||country +creater||creator +criticial||critical cryptocraphic||cryptographic cunter||counter +curcuit||circuit +cureent||current curently||currently cylic||cyclic +daemonise||demonize dafault||default deafult||default deamon||daemon +debufs||debugs +decapsulates||encapsulates +decendents||decedents decompres||decompress -decsribed||described +decomression||decompression +decrementation||defragmentation +decremeter||decremented +decribed||described decription||description +decriptor||descriptor +decsribed||described dectected||detected defailt||default +defaiult||default deferal||deferral deffered||deferred defferred||deferred definate||definite definately||definitely +defin||define +definetly||definitely +definision||definition +defintiion||definition defintion||definition defintions||definitions +defragmentaion||defragmentation defualt||default defult||default -deintializing||deinitializing -deintialize||deinitialize deintialized||deinitialized +deintialize||deinitialize +deintializing||deinitializing deivce||device delared||declared delare||declare delares||declares delaring||declaring delemiter||delimiter -demodualtor||demodulator +delievers||delivers demension||dimension +demodualtor||demodulator +depdendency||dependency dependancies||dependencies dependancy||dependency dependant||dependent +depenedent||dependent +depenencies||dependencies depreacted||deprecated depreacte||deprecate desactivate||deactivate +descibe||describe desciptor||descriptor desciptors||descriptors +descpriptor||descriptor +descripition||description descripton||description descrition||description descritptor||descriptor desctiptor||descriptor +desination||destination +desribe||describe desriptor||descriptor desriptors||descriptors -desination||destination destionation||destination destoried||destroyed destory||destroy @@ -422,6 +560,8 @@ destorys||destroys destroied||destroyed detabase||database deteced||detected +detemine||determine +detroyed||destroyed develope||develop developement||development developped||developed @@ -435,79 +575,102 @@ diable||disable dictionnary||dictionary didnt||didn't diferent||different +differenciate||differentiate differrence||difference diffrent||different -differenciate||differentiate diffrentiate||differentiate difinition||definition +digigram||decigram dimention||dimension dimesions||dimensions -dispalying||displaying diplay||display +direcly||directly directon||direction direectly||directly diregard||disregard -disassocation||disassociation disapear||disappear disapeared||disappeared disappared||disappeared -disbale||disable +disassocation||disassociation disbaled||disabled -disble||disable +disbale||disable disbled||disabled +disble||disable disconnet||disconnect discontinous||discontinuous +discpline||discipline +discription||description +discsignr||discoing disharge||discharge +disinguish||distinguish disnabled||disabled +dispalying||displaying dispertion||dispersion dissapears||disappears distiction||distinction +distinquish||distinguish +distributon||distribution divisable||divisible divsiors||divisors docuentation||documentation documantation||documentation +documenation||documentation documentaion||documentation documment||document +documnetation||documentation doesnt||doesn't +dont't||don't dorp||drop dosen||doesn downlad||download downlads||downloads droped||dropped druing||during +dyanamically||dynamically dynmaic||dynamic eanable||enable +eariler||earlier easilly||easily ecspecially||especially edditable||editable editting||editing efective||effective +effectiv||effective efficently||efficiently ehther||ether eigth||eight +ejectcd||ejected elementry||elementary eletronic||electronic embeded||embedded +emergerncy||emergency +emualted||emulated +emultor||emulator enabledi||enabled enble||enable enchanced||enhanced +encomapssing||encompassing encorporating||incorporating encrupted||encrypted encrypiton||encryption encryptio||encryption endianess||endianness enhaced||enhanced +enlightment||enlighten enlightnment||enlightenment +enocded||encoded enqueing||enqueuing +enterily||entirely +entimate||intimate entires||entries entites||entities +entrancy||entrance entrys||entries -enocded||encoded -enterily||entirely enviroiment||environment enviroment||environment environement||environment environent||environment +epected||expected eqivalent||equivalent equiped||equipped equivelant||equivalent @@ -515,63 +678,78 @@ equivilant||equivalent eror||error errorr||error estbalishment||establishment +ethernel||eternal etsablishment||establishment etsbalishment||establishment +evntse||vents +exacltly||exactly excecutable||executable exceded||exceeded excellant||excellent +excluevt||exclave execeeded||exceeded execeeds||exceeds exeed||exceed +exiample||example existance||existence existant||existent exixt||exist exlcude||exclude exlcusive||exclusive exmaple||example +exmaples||examples expecially||especially experies||expires +explainig||explaining explicite||explicit explicitely||explicitly -explict||explicit +explicity||explicitly explictely||explicitly +explict||explicit explictly||explicitly expresion||expression exprimental||experimental extened||extended extensability||extensibility -extention||extension extenstion||extension +extention||extension extracter||extractor faield||failed -falied||failed -faild||failed failded||failed +faild||failed failer||failure -faill||fail failied||failed +faill||fail faillure||failure +failng||failing failue||failure failuer||failure -failng||failing faireness||fairness falied||failed faliure||failure fallbck||fallback familar||familiar +familiy||family fatser||faster +faulure||failure +favorit||favorite +featues||features feauture||feature feautures||features +ferequencies||frequencies +feroceon||ferrocene fetaure||feature fetaures||features fileystem||filesystem fimware||firmware -firmare||firmware -firware||firmware finanize||finalize findn||find finilizes||finalizes finsih||finish +firmare||firmware +firware||firmware +firwmware||firmware +flexsible||flexible flusing||flushing folloing||following followign||following @@ -582,19 +760,27 @@ forseeable||foreseeable forse||force fortan||fortran forwardig||forwarding +fotmat||format frambuffer||framebuffer +framewor||framework framming||framing framwork||framework frequncy||frequency frome||from +frramework||framework fucntion||function fuction||function fuctions||functions funcation||function +funcional||functional funcion||function functionallity||functionality +functionalty||functionality functionaly||functionally +functionlity||functionality functionnality||functionality +functionnals||functional +functiuon||function functonality||functionality funtion||function funtions||functions @@ -603,104 +789,143 @@ futhermore||furthermore futrue||future gauage||gauge gaurenteed||guaranteed -generiously||generously genereate||generate genereted||generated +genericization||generalization +generiously||generously genric||generic globel||global grabing||grabbing grahical||graphical grahpical||graphical +grammathical||grammatical grapic||graphic grranted||granted guage||gauge +guaratees||guarantees guarenteed||guaranteed guarentee||guarantee +guarentees||guarantees +gurantee||guarantee halfs||halves hander||handler handfull||handful hanlde||handle hanled||handled happend||happened +hardocde||hardcode harware||hardware heirarchically||hierarchically helpfull||helpful -hybernate||hibernate +hertergenous||heterogeneous +hieararchy||hierarchy +hiearchy||hierarchy hierachy||hierarchy hierarchie||hierarchy homogenous||homogeneous howver||however hsould||should +hybernate||hibernate hypervior||hypervisor hypter||hyper +idential||identical identidier||identifier +identificator||identification +iinclude||include iligal||illegal -illigal||illegal illgal||illegal -iomaped||iomapped +illigal||illegal imblance||imbalance +imeediate||immediate +imlemented||implemented immeadiately||immediately immedaite||immediate immediatelly||immediately immediatly||immediately immidiate||immediate +impatients||impatient impelentation||implementation impementated||implemented implemantation||implementation implemenation||implementation implementaiton||implementation implementated||implemented -implemention||implementation implementd||implemented +implemention||implementation implemetation||implementation implemntation||implementation implentation||implementation implmentation||implementation implmenting||implementing +inavlid||invalid incative||inactive +incluing||including incomming||incoming incompatabilities||incompatibilities incompatable||incompatible +incompleted||incomplete inconsistant||inconsistent +incoporate||incorporate increas||increase +incrementes||increments incremeted||incremented +incresed||increased incrment||increment +indecated||indicated indendation||indentation indended||intended +indentifier||identifier +indentifies||identifies +indentify||identify independant||independent independantly||independently independed||independent +independend||independent indiate||indicate indicat||indicate +indicies||indices +indix||index +indle||idle +inefficint||inefficient inexpect||inexpected inferface||interface infomation||information +informaion||information informatiom||information informations||information informtion||information infromation||information ingore||ignore +inialized||animalized inital||initial -initalized||initialized initalised||initialized initalise||initialize +initalized||initialized initalize||initialize initation||initiation initators||initiators +initerface||interface initialiazation||initialization +initializatiion||initialization initializiation||initialization -initialze||initialize initialzed||initialized +initialze||initialize initialzing||initializing initilization||initialization initilize||initialize +inititalizing||initializing +initted||ignited +inmediately||immediately inofficial||unofficial inrerface||interface insititute||institute instace||instance instal||install -instanciate||instantiate instanciated||instantiated +instanciate||instantiate +instantanous||instantaneous +instituto||institute +instuction||instruction insufficent||insufficient inteface||interface integreated||integrated @@ -710,14 +935,14 @@ intendet||intended intented||intended interanl||internal interchangable||interchangeable +intercs||inters interferring||interfering interger||integer intermittant||intermittent +internallly||internally internel||internal interoprability||interoperability -interuupt||interrupt -interupt||interrupt -interupts||interrupts +interprete||interpreted interrface||interface interrrupt||interrupt interrup||interrupt @@ -725,6 +950,8 @@ interrups||interrupts interruptted||interrupted interupted||interrupted interupt||interrupt +interupts||interrupts +interuupt||interrupt intial||initial intialisation||initialisation intialised||initialised @@ -732,23 +959,26 @@ intialise||initialise intialization||initialization intialized||initialized intialize||initialize +intializes||initializes intregral||integral intrerrupt||interrupt intrrupt||interrupt +intruction||instruction +intstruments||instruments intterrupt||interrupt intuative||intuitive -inavlid||invalid invaid||invalid invaild||invalid invailid||invalid -invald||invalid invalde||invalid +invald||invalid invalide||invalid invalidiate||invalidate invalud||invalid invididual||individual invokation||invocation invokations||invocations +iomaped||iomapped ireelevant||irrelevant irrelevent||irrelevant isnt||isn't @@ -757,16 +987,20 @@ iternations||iterations itertation||iteration itslef||itself jave||java +jection||ejection jeffies||jiffies +journalid||journaled juse||just jus||just kown||known +kthreadd's||thread's langage||language langauage||language langauge||language langugage||language lauch||launch layed||laid +leftoever||leftover legnth||length leightweight||lightweight lengh||length @@ -778,6 +1012,7 @@ libary||library librairies||libraries libraris||libraries licenceing||licencing +limitiaion||limitation loggging||logging loggin||login logile||logfile @@ -786,23 +1021,29 @@ loosing||losing losted||lost machinary||machinery maibox||mailbox +mailformed||malformed maintainance||maintenance maintainence||maintenance +maintanence||maintenance maintan||maintain makeing||making -mailformed||malformed malplaced||misplaced malplace||misplace managable||manageable +managament||management +managememt||management managment||management mangement||management manoeuvering||maneuvering +manufactor||manufactory manufaucturing||manufacturing +mappigns||mappings mappping||mapping matchs||matches mathimatical||mathematical mathimatic||mathematic mathimatics||mathematics +maximim||maximum maximium||maximum maxium||maximum mechamism||mechanism @@ -810,45 +1051,60 @@ meetign||meeting memeory||memory memmber||member memoery||memory +memomry||memory ment||meant +menual||manual mergable||mergeable mesage||message +messagges||messages messags||messages +messgae||message messgaes||messages messsage||message messsages||messages micropone||microphone microprocesspr||microprocessor +migarion||migration migrateable||migratable +migratecd||migrated +miliseconds||milliseconds +millenium||millennium milliseonds||milliseconds -minium||minimum minimam||minimum +minimim||minimum +minium||minimum miniumum||minimum minumum||minimum misalinged||misaligned miscelleneous||miscellaneous misformed||malformed -mispelled||misspelled -mispelt||misspelt mising||missing mismactch||mismatch +mispelled||misspelled +mispelt||misspelt missmanaged||mismanaged missmatch||mismatch +misterious||mysterious miximum||maximum mmnemonic||mnemonic mnay||many modfiy||modify modulues||modules +modyfying||modifying momery||memory -memomry||memory monochorome||monochrome monochromo||monochrome monocrome||monochrome mopdule||module +mordern||modern mroe||more +muliptle||multiple mulitplied||multiplied multidimensionnal||multidimensional +multipe||multiple +multipy||multiply multple||multiple +muma||numa mumber||number muticast||multicast mutilcast||multicast @@ -860,19 +1116,27 @@ nead||need neccecary||necessary neccesary||necessary neccessary||necessary +nececssary||necessary necesary||necessary neded||needed negaive||negative +negitive||negative +neglibigle||negligible +negligeable||negligible negoitation||negotiation negotation||negotiation +negtive||negative nerver||never nescessary||necessary nessessary||necessary noticable||noticeable notications||notifications notifed||notified +notifer||rotifer +notifys||notifies numebr||number numner||number +numnodes||nimrods obtaion||obtain obusing||abusing occassionally||occasionally @@ -881,12 +1145,14 @@ occurance||occurrence occurances||occurrences occured||occurred occurence||occurrence +occurences||occurrences occure||occurred -occured||occurred +occures||occurs occuring||occurring -offser||offset offet||offset offloded||offloaded +offseet||offset +offser||offset omited||omitted omiting||omitting omitt||omit @@ -894,26 +1160,35 @@ ommiting||omitting ommitted||omitted onself||oneself ony||only +openattr||operator operatione||operation opertaions||operations +optimzation||optimization optionnal||optional optmizations||optimizations +orginally||originally orientatied||orientated orientied||oriented -orignal||original originial||original +orignal||original otherise||otherwise +othrewise||otherwise ouput||output oustanding||outstanding overaall||overall +overcommited||overcommitted overhread||overhead -overlaping||overlapping +overidden||overridden overide||override +overlaping||overlapping +overriddenp||overridden overrided||overridden overriden||overridden overun||overrun -overwritting||overwriting overwriten||overwritten +overwritting||overwriting +ovewrite||overwrite +ovrie||ovaries pacakge||package pachage||package packacge||package @@ -922,11 +1197,12 @@ packge||package packtes||packets pakage||package paket||packet +palaeontologic||paleontological pallette||palette paln||plan paramameters||parameters -paramaters||parameters paramater||parameter +paramaters||parameters parametes||parameters parametised||parametrised paramter||parameter @@ -936,6 +1212,7 @@ particuarly||particularly particularily||particularly partion||partition partions||partitions +partititon||partition partiton||partition pased||passed passin||passing @@ -947,12 +1224,19 @@ peice||piece pendantic||pedantic peprocessor||preprocessor perfoming||performing +periperal||peripheral peripherial||peripheral +permision||permission permissons||permissions peroid||period persistance||persistence persistant||persistent +pessemistic||pessimistic phoneticly||phonetically +phyical||physical +physcial||physical +physial||physical +pipleline||pipeline plalform||platform platfoem||platform platfrom||platform @@ -961,11 +1245,17 @@ pleaes||please ploting||plotting plugable||pluggable poinnter||pointer +pointee||pointer pointeur||pointer poiter||pointer +polluate||pollute +poroperties||properties posible||possible +posioned||poisoned positon||position +positve||positive possibilites||possibilities +posssible||possible powerfull||powerful pramater||parameter preamle||preamble @@ -976,26 +1266,36 @@ preceeding||preceding preceed||precede precendence||precedence precission||precision +predictible||predictable preemptable||preemptible prefered||preferred +prefetcht||prefect prefferably||preferably premption||preemption prepaired||prepared +prepapre||prepare preperation||preparation +presentes||presents pressre||pressure +pretetch||protect +previosuly||previously +prileged||pillaged primative||primitive princliple||principle +princples||principles priorty||priority +privatep||private privilaged||privileged privilage||privilege priviledge||privilege priviledges||privileges +privilged||privileged probaly||probably procceed||proceed proccesors||processors procesed||processed -proces||process procesing||processing +proces||process processessing||processing processess||processes processpr||processor @@ -1006,30 +1306,39 @@ prodecure||procedure progams||programs progess||progress programers||programmers +programmble||programmable programm||program programms||programs progresss||progress +promiscouos||promiscuous promiscous||promiscuous +promixity||proximity promps||prompts +promsicuous||promiscuous pronnounced||pronounced prononciation||pronunciation pronouce||pronounce pronunce||pronounce +properies||properties +properites||properties +properities||prosperities propery||property propigate||propagate propigation||propagation propogate||propagate +proptery||property prosess||process protable||portable protcol||protocol protecion||protection protocoll||protocol -promixity||proximity psudo||pseudo psuedo||pseudo psychadelic||psychedelic +publishhed||published pwoer||power queing||queuing +quently||queenly quering||querying randomally||randomly raoming||roaming @@ -1045,6 +1354,7 @@ recieves||receives recogniced||recognised recognizeable||recognizable recommanded||recommended +recurive||recursive recyle||recycle redircet||redirect redirectrion||redirection @@ -1054,23 +1364,33 @@ refcounf||refcount refence||reference refered||referred referenace||reference +refererence||reference refering||referring refernces||references refernnce||reference refrence||reference registed||registered -registerd||registered registeration||registration +registerd||registered +registe||register registeresd||registered registerred||registered registes||registers registraration||registration regsiter||register regster||register +regsters||registers +regstiers||registers regualar||regular +regualtor||regulator +regualtors||regulators reguator||regulator +regulaltor||regulator regulamentations||regulations +regularily||regularly reigstration||registration +reinititalise||reinitialize +relatime||realtime releated||related relevent||relevant reloade||reload @@ -1086,60 +1406,78 @@ representaion||representation reqeust||request requestied||requested requiere||require +requiremenst||requirements requirment||requirement requred||required requried||required +requsted||requested requst||request reregisteration||reregistration +reselient||resilient +reservd||reserved reseting||resetting reseved||reserved reseverd||reserved resizeable||resizable +resoltuion||resolution resouce||resource resouces||resources resoures||resources +resovle||resolve responce||response resrouce||resource ressizes||resizes ressource||resource ressources||resources restesting||retesting +restrct||restrict resumbmitting||resubmitting +resvered||revered retransmited||retransmitted retreived||retrieved retreive||retrieve retreiving||retrieving retrive||retrieve +retuerned||returned retuned||returned reudce||reduce reuest||request reuqest||request reutnred||returned revsion||revision +rgister||register rmeoved||removed rmeove||remove rmeoves||removes +robustneess||robustness rountine||routine routins||routines rquest||request runing||running runned||ran +runnig||running runnning||running runtine||runtime sacrifying||sacrificing safly||safely safty||safety +sasitfy||salsify savable||saveable +scalabilty||scalability scaleing||scaling scaned||scanned scaning||scanning scarch||search +scehduled||scheduled +schd||scud +schduler||scheduler seach||search searchs||searches secquence||sequence secund||second segement||segment semaphone||semaphore +sempahore||semaphore senario||scenario senarios||scenarios sentivite||sensitive @@ -1152,15 +1490,21 @@ seperate||separate seperatly||separately seperator||separator sepperate||separate -seqeunce||sequence -seqeuncer||sequencer seqeuencer||sequencer +seqeuncer||sequencer +seqeunce||sequence sequece||sequence sequencial||sequential +serivced||serviced +serliazed||sterilized serveral||several servive||service setts||sets settting||setting +severly||severely +shapping||shaping +sharability||salability +shimphy||simply shotdown||shutdown shoud||should shouldnt||shouldn't @@ -1168,44 +1512,64 @@ shoule||should shrinked||shrunk siginificantly||significantly signabl||signal +signifcant||significant similary||similarly similiar||similar simlar||similar simliar||similar simpified||simplified +simplely||simplify singaled||signaled singal||signal singed||signed +singnalled||signaled sleeped||slept softwares||software +solidum||solidus +spcified||specified speach||speech specfic||specific +specfied||specified specfield||specified +specfiers||specifiers +speciall||special speciefied||specified +specifcally||specifically specifc||specific specifed||specified +specifer||specifier specificatin||specification specificaton||specification +specifiction||specification specifing||specifying specifiying||specifying +specifiy||specify +specyfing||specifying speficied||specified +speicific||specific speicify||specify speling||spelling +spiint||splint spinlcok||spinlock spinock||spinlock +splited||spited splitted||split spreaded||spread +spuriuous||spurious +spurriously||spuriously spurrious||spurious sructure||structure stablilization||stabilization staically||statically staion||station standardss||standards +standarized||standardized standartization||standardization standart||standard standy||standby stardard||standard staticly||statically +statistcs||statistics statuss||status stoped||stopped stoping||stopping @@ -1213,56 +1577,70 @@ stoppped||stopped straming||streaming struc||struct structres||structures -stuct||struct strucuture||structure +stuct||struct stucture||structure sturcture||structure +subcribed||subscribed subdirectoires||subdirectories suble||subtle -substract||subtract submition||submission -suceed||succeed +submmitted||submitted +substract||subtract +subsytem||subsystem +subystem||subsystem +succecss||success succesfully||successfully succesful||successful successed||succeeded successfull||successful successfuly||successfully +successully||successfully +succsful||successful +suceed||succeed sucessfully||successfully +sucessful||successful sucess||success superflous||superfluous superseeded||superseded suplied||supplied suported||supported suport||support -supportet||supported suppored||supported +supportet||supported supportin||supporting suppoted||supported suppported||supported suppport||support +supressed||suppressed supress||suppress surpressed||suppressed surpresses||suppresses susbsystem||subsystem +suseconds||seconds +susepnd||suspend suspeneded||suspended -suspsend||suspend suspicously||suspiciously +suspsend||suspend swaping||swapping switchs||switches -swith||switch swithable||switchable -swithc||switch swithced||switched swithcing||switching +swithc||switch swithed||switched swithing||switching +swith||switch swtich||switch +sychronize||synchronize symetric||symmetric synax||syntax +synchonization||synchronization synchonized||synchronized +synchornization||synchronization synchronuously||synchronously -syncronize||synchronize syncronized||synchronized +syncronize||synchronize syncronizing||synchronizing syncronus||synchronous syste||system @@ -1273,23 +1651,30 @@ tansmit||transmit targetted||targeted targetting||targeting taskelt||tasklet +technologie||technologies +technologt||technology teh||the temorary||temporary temproarily||temporarily +terminage||terminate +terminted||terminated thead||thread +theorical||theatrical therfore||therefore +theroretical||theoretical thier||their +thourgh||though threds||threads threshhold||threshold thresold||threshold throught||through -troughput||throughput thses||these -tiggers||triggers tiggered||triggered -tipically||typically +tiggers||triggers timeing||timing +timestampns||timestamps timout||timeout +tipically||typically tmis||this toogle||toggle torerable||tolerable @@ -1298,19 +1683,31 @@ tramsmitted||transmitted tramsmit||transmit tranasction||transaction tranfer||transfer +tranlsate||translate +transceive||transceiver transcevier||transceiver transciever||transceiver transferd||transferred transfered||transferred transfering||transferring +transferrs||transfers transision||transition +transistions||transitions +translarion||translation transmittd||transmitted +transofrmed||transformed transormed||transformed +transparantly||transparently +transportt||transport +transpport||transport trasfer||transfer trasmission||transmission treshold||threshold trigerred||triggered trigerring||triggering +trivialy||trivially +troughput||throughput +trsfr||turfs trun||turn tunning||tuning ture||true @@ -1318,11 +1715,13 @@ tyep||type udpate||update uesd||used uknown||unknown -usupported||unsupported +ulility||utility +unamed||unnamed uncommited||uncommitted unconditionaly||unconditionally underun||underrun unecessary||unnecessary +uneeded||unneeded unexecpted||unexpected unexepected||unexpected unexpcted||unexpected @@ -1330,37 +1729,40 @@ unexpectd||unexpected unexpeted||unexpected unexpexted||unexpected unfortunatelly||unfortunately +unhandeable||unchangeable unifiy||unify -uniterrupted||uninterrupted +uninitialzed||uninitialized unintialized||uninitialized +uniterrupted||uninterrupted unitialized||uninitialized unkmown||unknown unknonw||unknown unknow||unknown unkown||unknown -unamed||unnamed -uneeded||unneeded -unneded||unneeded +unmached||unmatched unneccecary||unnecessary unneccesary||unnecessary unneccessary||unnecessary unnecesary||unnecessary +unneded||unneeded unneedingly||unnecessarily unnsupported||unsupported -unmached||unmatched unregester||unregister unresgister||unregister unrgesiter||unregister unsinged||unsigned -unstabel||unstable unsolicitied||unsolicited +unstabel||unstable unsuccessfull||unsuccessful unsuported||unsupported +unsychronized||unsynchronized untill||until unuseful||useless unvalid||invalid +upadted||updated upate||update upsupported||unsupported +uptodate||up-to-date usefule||useful usefull||useful usege||usage @@ -1383,24 +1785,27 @@ vaule||value verbse||verbose verisons||versions verison||version +veriton||version verson||version vicefersa||vice-versa virtal||virtual virtaul||virtual virtiual||virtual +virulization||virtualization visiters||visitors vitual||virtual +vmext||vmexit vunerable||vulnerable wakeus||wakeups wathdog||watchdog wating||waiting -wiat||wait wether||whether whataver||whatever whcih||which whenver||whenever wheter||whether whe||when +wiat||wait wierd||weird wiil||will wirte||write diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index 64073e050c6a..b02279da6fa1 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -6,4 +6,5 @@ execveat.moved execveat.path.ephemeral execveat.ephemeral execveat.denatured -xxxxxxxx*
\ No newline at end of file +/recursion-depth +xxxxxxxx* diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 427c41ba5151..33339e31e365 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -1,11 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS = -Wall +CFLAGS += -Wno-nonnull +CFLAGS += -D_GNU_SOURCE TEST_GEN_PROGS := execveat TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile +TEST_GEN_PROGS += recursion-depth + EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* include ../lib.mk diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c new file mode 100644 index 000000000000..2dbd5bc45b3e --- /dev/null +++ b/tools/testing/selftests/exec/recursion-depth.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test that pointing #! script interpreter to self doesn't recurse. */ +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mount.h> +#include <unistd.h> + +int main(void) +{ + if (unshare(CLONE_NEWNS) == -1) { + if (errno == ENOSYS || errno == EPERM) { + fprintf(stderr, "error: unshare, errno %d\n", errno); + return 4; + } + fprintf(stderr, "error: unshare, errno %d\n", errno); + return 1; + } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { + fprintf(stderr, "error: mount '/', errno %d\n", errno); + return 1; + } + /* Require "exec" filesystem. */ + if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) { + fprintf(stderr, "error: mount ramfs, errno %d\n", errno); + return 1; + } + +#define FILENAME "/tmp/1" + + int fd = creat(FILENAME, 0700); + if (fd == -1) { + fprintf(stderr, "error: creat, errno %d\n", errno); + return 1; + } +#define S "#!" FILENAME "\n" + if (write(fd, S, strlen(S)) != strlen(S)) { + fprintf(stderr, "error: write, errno %d\n", errno); + return 1; + } + close(fd); + + int rv = execve(FILENAME, NULL, NULL); + if (rv == -1 && errno == ELOOP) { + return 0; + } + fprintf(stderr, "error: execve, rv %d, errno %d\n", rv, errno); + return 1; +} diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 780ce7123374..6a970b127c9b 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -24,19 +24,21 @@ TEST_FILE=$(mktemp) # This represents # -# TEST_ID:TEST_COUNT:ENABLED +# TEST_ID:TEST_COUNT:ENABLED:TARGET # # TEST_ID: is the test id number # TEST_COUNT: number of times we should run the test # ENABLED: 1 if enabled, 0 otherwise +# TARGET: test target file required on the test_sysctl module # # Once these are enabled please leave them as-is. Write your own test, # we have tons of space. -ALL_TESTS="0001:1:1" -ALL_TESTS="$ALL_TESTS 0002:1:1" -ALL_TESTS="$ALL_TESTS 0003:1:1" -ALL_TESTS="$ALL_TESTS 0004:1:1" -ALL_TESTS="$ALL_TESTS 0005:3:1" +ALL_TESTS="0001:1:1:int_0001" +ALL_TESTS="$ALL_TESTS 0002:1:1:string_0001" +ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002" +ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" +ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" +ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" test_modprobe() { @@ -149,6 +151,9 @@ reset_vals() string_0001) VAL="(none)" ;; + bitmap_0001) + VAL="" + ;; *) ;; esac @@ -157,8 +162,10 @@ reset_vals() set_orig() { - if [ ! -z $TARGET ]; then - echo "${ORIG}" > "${TARGET}" + if [ ! -z $TARGET ] && [ ! -z $ORIG ]; then + if [ -f ${TARGET} ]; then + echo "${ORIG}" > "${TARGET}" + fi fi } @@ -177,9 +184,25 @@ verify() return 0 } +# proc files get read a page at a time, which can confuse diff, +# and get you incorrect results on proc files with long data. To use +# diff against them you must first extract the output to a file, and +# then compare against that file. +verify_diff_proc_file() +{ + TMP_DUMP_FILE=$(mktemp) + cat $1 > $TMP_DUMP_FILE + + if ! diff -w -q $TMP_DUMP_FILE $2; then + return 1 + else + return 0 + fi +} + verify_diff_w() { - echo "$TEST_STR" | diff -q -w -u - $1 + echo "$TEST_STR" | diff -q -w -u - $1 > /dev/null return $? } @@ -600,9 +623,70 @@ run_stringtests() test_rc } +target_exists() +{ + TARGET="${SYSCTL}/$1" + TEST_ID="$2" + + if [ ! -f ${TARGET} ] ; then + echo "Target for test $TEST_ID: $TARGET not exist, skipping test ..." + return 0 + fi + return 1 +} + +run_bitmaptest() { + # Total length of bitmaps string to use, a bit under + # the maximum input size of the test node + LENGTH=$((RANDOM % 65000)) + + # First bit to set + BIT=$((RANDOM % 1024)) + + # String containing our list of bits to set + TEST_STR=$BIT + + # build up the string + while [ "${#TEST_STR}" -le "$LENGTH" ]; do + # Make sure next entry is discontiguous, + # skip ahead at least 2 + BIT=$((BIT + $((2 + RANDOM % 10)))) + + # Add new bit to the list + TEST_STR="${TEST_STR},${BIT}" + + # Randomly make it a range + if [ "$((RANDOM % 2))" -eq "1" ]; then + RANGE_END=$((BIT + $((1 + RANDOM % 10)))) + TEST_STR="${TEST_STR}-${RANGE_END}" + BIT=$RANGE_END + fi + done + + echo -n "Checking bitmap handler... " + TEST_FILE=$(mktemp) + echo -n "$TEST_STR" > $TEST_FILE + + cat $TEST_FILE > $TARGET 2> /dev/null + if [ $? -ne 0 ]; then + echo "FAIL" >&2 + rc=1 + test_rc + fi + + if ! verify_diff_proc_file "$TARGET" "$TEST_FILE"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + rc=0 + fi + test_rc +} + sysctl_test_0001() { - TARGET="${SYSCTL}/int_0001" + TARGET="${SYSCTL}/$(get_test_target 0001)" reset_vals ORIG=$(cat "${TARGET}") TEST_STR=$(( $ORIG + 1 )) @@ -614,7 +698,7 @@ sysctl_test_0001() sysctl_test_0002() { - TARGET="${SYSCTL}/string_0001" + TARGET="${SYSCTL}/$(get_test_target 0002)" reset_vals ORIG=$(cat "${TARGET}") TEST_STR="Testing sysctl" @@ -627,7 +711,7 @@ sysctl_test_0002() sysctl_test_0003() { - TARGET="${SYSCTL}/int_0002" + TARGET="${SYSCTL}/$(get_test_target 0003)" reset_vals ORIG=$(cat "${TARGET}") TEST_STR=$(( $ORIG + 1 )) @@ -640,7 +724,7 @@ sysctl_test_0003() sysctl_test_0004() { - TARGET="${SYSCTL}/uint_0001" + TARGET="${SYSCTL}/$(get_test_target 0004)" reset_vals ORIG=$(cat "${TARGET}") TEST_STR=$(( $ORIG + 1 )) @@ -653,13 +737,21 @@ sysctl_test_0004() sysctl_test_0005() { - TARGET="${SYSCTL}/int_0003" + TARGET="${SYSCTL}/$(get_test_target 0005)" reset_vals ORIG=$(cat "${TARGET}") run_limit_digit_int_array } +sysctl_test_0006() +{ + TARGET="${SYSCTL}/bitmap_0001" + reset_vals + ORIG="" + run_bitmaptest +} + list_tests() { echo "Test ID list:" @@ -673,10 +765,9 @@ list_tests() echo "0003 x $(get_test_count 0003) - tests proc_dointvec()" echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" + echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()" } -test_reqs - usage() { NUM_TESTS=$(grep -o ' ' <<<"$ALL_TESTS" | grep -c .) @@ -724,25 +815,35 @@ function get_test_count() { test_num $1 TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') - LAST_TWO=${TEST_DATA#*:*} - echo ${LAST_TWO%:*} + echo ${TEST_DATA} | awk -F":" '{print $2}' } function get_test_enabled() { test_num $1 TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') - echo ${TEST_DATA#*:*:} + echo ${TEST_DATA} | awk -F":" '{print $3}' +} + +function get_test_target() +{ + test_num $1 + TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + echo ${TEST_DATA} | awk -F":" '{print $4}' } function run_all_tests() { for i in $ALL_TESTS ; do - TEST_ID=${i%:*:*} + TEST_ID=${i%:*:*:*} ENABLED=$(get_test_enabled $TEST_ID) TEST_COUNT=$(get_test_count $TEST_ID) + TEST_TARGET=$(get_test_target $TEST_ID) + if target_exists $TEST_TARGET $TEST_ID; then + continue + fi if [[ $ENABLED -eq "1" ]]; then - test_case $TEST_ID $TEST_COUNT + test_case $TEST_ID $TEST_COUNT $TEST_TARGET fi done } @@ -775,12 +876,14 @@ function watch_case() function test_case() { - NUM_TESTS=$DEFAULT_NUM_TESTS - if [ $# -eq 2 ]; then - NUM_TESTS=$2 - fi + NUM_TESTS=$2 i=0 + + if target_exists $3 $1; then + continue + fi + while [ $i -lt $NUM_TESTS ]; do test_num $1 watch_log $i ${TEST_NAME}_test_$1 noclear @@ -803,15 +906,15 @@ function parse_args() elif [[ "$1" = "-t" ]]; then shift test_num $1 - test_case $1 $(get_test_count $1) + test_case $1 $(get_test_count $1) $(get_test_target $1) elif [[ "$1" = "-c" ]]; then shift test_num $1 test_num $2 - test_case $1 $2 + test_case $1 $2 $(get_test_target $1) elif [[ "$1" = "-s" ]]; then shift - test_case $1 1 + test_case $1 1 $(get_test_target $1) elif [[ "$1" = "-l" ]]; then list_tests elif [[ "$1" = "-h" || "$1" = "--help" ]]; then @@ -825,8 +928,8 @@ function parse_args() test_reqs allow_user_defaults check_production_sysctl_writes_strict -test_modprobe load_req_mod +test_modprobe trap "test_finish" EXIT diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 73818f1b2ef8..3bf2a1fef8ed 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -29,7 +29,7 @@ struct slabinfo { char *name; int alias; int refs; - int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; + int aliases, align, cache_dma, cache_dma32, cpu_slabs, destroy_by_rcu; unsigned int hwcache_align, object_size, objs_per_slab; unsigned int sanity_checks, slab_size, store_user, trace; int order, poison, reclaim_account, red_zone; @@ -534,6 +534,8 @@ static void report(struct slabinfo *s) printf("** Hardware cacheline aligned\n"); if (s->cache_dma) printf("** Memory is allocated in a special DMA zone\n"); + if (s->cache_dma32) + printf("** Memory is allocated in a special DMA32 zone\n"); if (s->destroy_by_rcu) printf("** Slabs are destroyed via RCU\n"); if (s->reclaim_account) @@ -602,6 +604,8 @@ static void slabcache(struct slabinfo *s) *p++ = '*'; if (s->cache_dma) *p++ = 'd'; + if (s->cache_dma32) + *p++ = 'D'; if (s->hwcache_align) *p++ = 'A'; if (s->poison) @@ -1208,6 +1212,7 @@ static void read_slab_dir(void) slab->aliases = get_obj("aliases"); slab->align = get_obj("align"); slab->cache_dma = get_obj("cache_dma"); + slab->cache_dma32 = get_obj("cache_dma32"); slab->cpu_slabs = get_obj("cpu_slabs"); slab->destroy_by_rcu = get_obj("destroy_by_rcu"); slab->hwcache_align = get_obj("hwcache_align"); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2b18a923ab7a..4bf728e3d857 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -391,7 +391,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, spin_unlock(&kvm->mmu_lock); ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, - range->end, range->blockable); + range->end, + mmu_notifier_range_blockable(range)); srcu_read_unlock(&kvm->srcu, idx); |